sched: Do not account irq time to current task
[linux-2.6-block.git] / kernel / sched.c
CommitLineData
1da177e4
LT
1/*
2 * kernel/sched.c
3 *
4 * Kernel scheduler and related syscalls
5 *
6 * Copyright (C) 1991-2002 Linus Torvalds
7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
c31f2e8a
IM
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
b9131769
IM
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz
1da177e4
LT
27 */
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
dff06c15 33#include <linux/uaccess.h>
1da177e4
LT
34#include <linux/highmem.h>
35#include <linux/smp_lock.h>
36#include <asm/mmu_context.h>
37#include <linux/interrupt.h>
c59ede7b 38#include <linux/capability.h>
1da177e4
LT
39#include <linux/completion.h>
40#include <linux/kernel_stat.h>
9a11b49a 41#include <linux/debug_locks.h>
cdd6c482 42#include <linux/perf_event.h>
1da177e4
LT
43#include <linux/security.h>
44#include <linux/notifier.h>
45#include <linux/profile.h>
7dfb7103 46#include <linux/freezer.h>
198e2f18 47#include <linux/vmalloc.h>
1da177e4
LT
48#include <linux/blkdev.h>
49#include <linux/delay.h>
b488893a 50#include <linux/pid_namespace.h>
1da177e4
LT
51#include <linux/smp.h>
52#include <linux/threads.h>
53#include <linux/timer.h>
54#include <linux/rcupdate.h>
55#include <linux/cpu.h>
56#include <linux/cpuset.h>
57#include <linux/percpu.h>
b5aadf7f 58#include <linux/proc_fs.h>
1da177e4 59#include <linux/seq_file.h>
969c7921 60#include <linux/stop_machine.h>
e692ab53 61#include <linux/sysctl.h>
1da177e4
LT
62#include <linux/syscalls.h>
63#include <linux/times.h>
8f0ab514 64#include <linux/tsacct_kern.h>
c6fd91f0 65#include <linux/kprobes.h>
0ff92245 66#include <linux/delayacct.h>
dff06c15 67#include <linux/unistd.h>
f5ff8422 68#include <linux/pagemap.h>
8f4d37ec 69#include <linux/hrtimer.h>
30914a58 70#include <linux/tick.h>
f00b45c1
PZ
71#include <linux/debugfs.h>
72#include <linux/ctype.h>
6cd8a4bb 73#include <linux/ftrace.h>
5a0e3ad6 74#include <linux/slab.h>
1da177e4 75
5517d86b 76#include <asm/tlb.h>
838225b4 77#include <asm/irq_regs.h>
1da177e4 78
6e0534f2 79#include "sched_cpupri.h"
21aa9af0 80#include "workqueue_sched.h"
6e0534f2 81
a8d154b0 82#define CREATE_TRACE_POINTS
ad8d75ff 83#include <trace/events/sched.h>
a8d154b0 84
1da177e4
LT
85/*
86 * Convert user-nice values [ -20 ... 0 ... 19 ]
87 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
88 * and back.
89 */
90#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
91#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
92#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
93
94/*
95 * 'User priority' is the nice value converted to something we
96 * can work with better when scaling various scheduler parameters,
97 * it's a [ 0 ... 39 ] range.
98 */
99#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
100#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
101#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
102
103/*
d7876a08 104 * Helpers for converting nanosecond timing to jiffy resolution
1da177e4 105 */
d6322faf 106#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
1da177e4 107
6aa645ea
IM
108#define NICE_0_LOAD SCHED_LOAD_SCALE
109#define NICE_0_SHIFT SCHED_LOAD_SHIFT
110
1da177e4
LT
111/*
112 * These are the 'tuning knobs' of the scheduler:
113 *
a4ec24b4 114 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
1da177e4
LT
115 * Timeslices get refilled after they expire.
116 */
1da177e4 117#define DEF_TIMESLICE (100 * HZ / 1000)
2dd73a4f 118
d0b27fa7
PZ
119/*
120 * single value that denotes runtime == period, ie unlimited time.
121 */
122#define RUNTIME_INF ((u64)~0ULL)
123
e05606d3
IM
124static inline int rt_policy(int policy)
125{
3f33a7ce 126 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
e05606d3
IM
127 return 1;
128 return 0;
129}
130
131static inline int task_has_rt_policy(struct task_struct *p)
132{
133 return rt_policy(p->policy);
134}
135
1da177e4 136/*
6aa645ea 137 * This is the priority-queue data structure of the RT scheduling class:
1da177e4 138 */
6aa645ea
IM
139struct rt_prio_array {
140 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
141 struct list_head queue[MAX_RT_PRIO];
142};
143
d0b27fa7 144struct rt_bandwidth {
ea736ed5 145 /* nests inside the rq lock: */
0986b11b 146 raw_spinlock_t rt_runtime_lock;
ea736ed5
IM
147 ktime_t rt_period;
148 u64 rt_runtime;
149 struct hrtimer rt_period_timer;
d0b27fa7
PZ
150};
151
152static struct rt_bandwidth def_rt_bandwidth;
153
154static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
155
156static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
157{
158 struct rt_bandwidth *rt_b =
159 container_of(timer, struct rt_bandwidth, rt_period_timer);
160 ktime_t now;
161 int overrun;
162 int idle = 0;
163
164 for (;;) {
165 now = hrtimer_cb_get_time(timer);
166 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
167
168 if (!overrun)
169 break;
170
171 idle = do_sched_rt_period_timer(rt_b, overrun);
172 }
173
174 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
175}
176
177static
178void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
179{
180 rt_b->rt_period = ns_to_ktime(period);
181 rt_b->rt_runtime = runtime;
182
0986b11b 183 raw_spin_lock_init(&rt_b->rt_runtime_lock);
ac086bc2 184
d0b27fa7
PZ
185 hrtimer_init(&rt_b->rt_period_timer,
186 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
187 rt_b->rt_period_timer.function = sched_rt_period_timer;
d0b27fa7
PZ
188}
189
c8bfff6d
KH
190static inline int rt_bandwidth_enabled(void)
191{
192 return sysctl_sched_rt_runtime >= 0;
d0b27fa7
PZ
193}
194
195static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
196{
197 ktime_t now;
198
cac64d00 199 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
d0b27fa7
PZ
200 return;
201
202 if (hrtimer_active(&rt_b->rt_period_timer))
203 return;
204
0986b11b 205 raw_spin_lock(&rt_b->rt_runtime_lock);
d0b27fa7 206 for (;;) {
7f1e2ca9
PZ
207 unsigned long delta;
208 ktime_t soft, hard;
209
d0b27fa7
PZ
210 if (hrtimer_active(&rt_b->rt_period_timer))
211 break;
212
213 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
214 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
7f1e2ca9
PZ
215
216 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
217 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
218 delta = ktime_to_ns(ktime_sub(hard, soft));
219 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
5c333864 220 HRTIMER_MODE_ABS_PINNED, 0);
d0b27fa7 221 }
0986b11b 222 raw_spin_unlock(&rt_b->rt_runtime_lock);
d0b27fa7
PZ
223}
224
225#ifdef CONFIG_RT_GROUP_SCHED
226static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
227{
228 hrtimer_cancel(&rt_b->rt_period_timer);
229}
230#endif
231
712555ee
HC
232/*
233 * sched_domains_mutex serializes calls to arch_init_sched_domains,
234 * detach_destroy_domains and partition_sched_domains.
235 */
236static DEFINE_MUTEX(sched_domains_mutex);
237
7c941438 238#ifdef CONFIG_CGROUP_SCHED
29f59db3 239
68318b8e
SV
240#include <linux/cgroup.h>
241
29f59db3
SV
242struct cfs_rq;
243
6f505b16
PZ
244static LIST_HEAD(task_groups);
245
29f59db3 246/* task group related information */
4cf86d77 247struct task_group {
68318b8e 248 struct cgroup_subsys_state css;
6c415b92 249
052f1dc7 250#ifdef CONFIG_FAIR_GROUP_SCHED
29f59db3
SV
251 /* schedulable entities of this group on each cpu */
252 struct sched_entity **se;
253 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq;
255 unsigned long shares;
052f1dc7
PZ
256#endif
257
258#ifdef CONFIG_RT_GROUP_SCHED
259 struct sched_rt_entity **rt_se;
260 struct rt_rq **rt_rq;
261
d0b27fa7 262 struct rt_bandwidth rt_bandwidth;
052f1dc7 263#endif
6b2d7700 264
ae8393e5 265 struct rcu_head rcu;
6f505b16 266 struct list_head list;
f473aa5e
PZ
267
268 struct task_group *parent;
269 struct list_head siblings;
270 struct list_head children;
29f59db3
SV
271};
272
eff766a6 273#define root_task_group init_task_group
6f505b16 274
8ed36996 275/* task_group_lock serializes add/remove of task groups and also changes to
ec2c507f
SV
276 * a task group's cpu shares.
277 */
8ed36996 278static DEFINE_SPINLOCK(task_group_lock);
ec2c507f 279
e9036b36
CG
280#ifdef CONFIG_FAIR_GROUP_SCHED
281
57310a98
PZ
282#ifdef CONFIG_SMP
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
052f1dc7 289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
052f1dc7 290
cb4ad1ff 291/*
2e084786
LJ
292 * A weight of 0 or 1 can cause arithmetics problems.
293 * A weight of a cfs_rq is the sum of weights of which entities
294 * are queued on this cfs_rq, so a weight of a entity should not be
295 * too large, so as the shares value of a task group.
cb4ad1ff
MX
296 * (The default weight is 1024 - so there's no practical
297 * limitation from this.)
298 */
18d95a28 299#define MIN_SHARES 2
2e084786 300#define MAX_SHARES (1UL << 18)
18d95a28 301
052f1dc7
PZ
302static int init_task_group_load = INIT_TASK_GROUP_LOAD;
303#endif
304
29f59db3 305/* Default task group.
3a252015 306 * Every task in system belong to this group at bootup.
29f59db3 307 */
434d53b0 308struct task_group init_task_group;
29f59db3 309
7c941438 310#endif /* CONFIG_CGROUP_SCHED */
29f59db3 311
6aa645ea
IM
312/* CFS-related fields in a runqueue */
313struct cfs_rq {
314 struct load_weight load;
315 unsigned long nr_running;
316
6aa645ea 317 u64 exec_clock;
e9acbff6 318 u64 min_vruntime;
6aa645ea
IM
319
320 struct rb_root tasks_timeline;
321 struct rb_node *rb_leftmost;
4a55bd5e
PZ
322
323 struct list_head tasks;
324 struct list_head *balance_iterator;
325
326 /*
327 * 'curr' points to currently running entity on this cfs_rq.
6aa645ea
IM
328 * It is set to NULL otherwise (i.e when none are currently running).
329 */
4793241b 330 struct sched_entity *curr, *next, *last;
ddc97297 331
5ac5c4d6 332 unsigned int nr_spread_over;
ddc97297 333
62160e3f 334#ifdef CONFIG_FAIR_GROUP_SCHED
6aa645ea
IM
335 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
336
41a2d6cf
IM
337 /*
338 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
6aa645ea
IM
339 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
340 * (like users, containers etc.)
341 *
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance.
344 */
41a2d6cf
IM
345 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */
c09595f6
PZ
347
348#ifdef CONFIG_SMP
c09595f6 349 /*
c8cba857 350 * the part of load.weight contributed by tasks
c09595f6 351 */
c8cba857 352 unsigned long task_weight;
c09595f6 353
c8cba857
PZ
354 /*
355 * h_load = weight * f(tg)
356 *
357 * Where f(tg) is the recursive weight fraction assigned to
358 * this group.
359 */
360 unsigned long h_load;
c09595f6 361
c8cba857
PZ
362 /*
363 * this cpu's part of tg->shares
364 */
365 unsigned long shares;
f1d239f7
PZ
366
367 /*
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
c09595f6 371#endif
6aa645ea
IM
372#endif
373};
1da177e4 374
6aa645ea
IM
375/* Real-Time classes' related field in a runqueue: */
376struct rt_rq {
377 struct rt_prio_array active;
63489e45 378 unsigned long rt_nr_running;
052f1dc7 379#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
e864c499
GH
380 struct {
381 int curr; /* highest queued rt task prio */
398a153b 382#ifdef CONFIG_SMP
e864c499 383 int next; /* next highest */
398a153b 384#endif
e864c499 385 } highest_prio;
6f505b16 386#endif
fa85ae24 387#ifdef CONFIG_SMP
73fe6aae 388 unsigned long rt_nr_migratory;
a1ba4d8b 389 unsigned long rt_nr_total;
a22d7fc1 390 int overloaded;
917b627d 391 struct plist_head pushable_tasks;
fa85ae24 392#endif
6f505b16 393 int rt_throttled;
fa85ae24 394 u64 rt_time;
ac086bc2 395 u64 rt_runtime;
ea736ed5 396 /* Nests inside the rq lock: */
0986b11b 397 raw_spinlock_t rt_runtime_lock;
6f505b16 398
052f1dc7 399#ifdef CONFIG_RT_GROUP_SCHED
23b0fdfc
PZ
400 unsigned long rt_nr_boosted;
401
6f505b16
PZ
402 struct rq *rq;
403 struct list_head leaf_rt_rq_list;
404 struct task_group *tg;
6f505b16 405#endif
6aa645ea
IM
406};
407
57d885fe
GH
408#ifdef CONFIG_SMP
409
410/*
411 * We add the notion of a root-domain which will be used to define per-domain
0eab9146
IM
412 * variables. Each exclusive cpuset essentially defines an island domain by
413 * fully partitioning the member cpus from any other cpuset. Whenever a new
57d885fe
GH
414 * exclusive cpuset is created, we also create and attach a new root-domain
415 * object.
416 *
57d885fe
GH
417 */
418struct root_domain {
419 atomic_t refcount;
c6c4927b
RR
420 cpumask_var_t span;
421 cpumask_var_t online;
637f5085 422
0eab9146 423 /*
637f5085
GH
424 * The "RT overload" flag: it gets set if a CPU has more than
425 * one runnable RT task.
426 */
c6c4927b 427 cpumask_var_t rto_mask;
0eab9146 428 atomic_t rto_count;
6e0534f2 429 struct cpupri cpupri;
57d885fe
GH
430};
431
dc938520
GH
432/*
433 * By default the system creates a single root-domain with all cpus as
434 * members (mimicking the global state we have today).
435 */
57d885fe
GH
436static struct root_domain def_root_domain;
437
ed2d372c 438#endif /* CONFIG_SMP */
57d885fe 439
1da177e4
LT
440/*
441 * This is the main, per-CPU runqueue data structure.
442 *
443 * Locking rule: those places that want to lock multiple runqueues
444 * (such as the load balancing or the thread migration code), lock
445 * acquire operations must be ordered by ascending &runqueue.
446 */
70b97a7f 447struct rq {
d8016491 448 /* runqueue lock: */
05fa785c 449 raw_spinlock_t lock;
1da177e4
LT
450
451 /*
452 * nr_running and cpu_load should be in the same cacheline because
453 * remote CPUs use both these fields when doing load calculation.
454 */
455 unsigned long nr_running;
6aa645ea
IM
456 #define CPU_LOAD_IDX_MAX 5
457 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
fdf3e95d 458 unsigned long last_load_update_tick;
46cb4b7c 459#ifdef CONFIG_NO_HZ
39c0cbe2 460 u64 nohz_stamp;
83cd4fe2 461 unsigned char nohz_balance_kick;
46cb4b7c 462#endif
a64692a3
MG
463 unsigned int skip_clock_update;
464
d8016491
IM
465 /* capture load from *all* tasks on this cpu: */
466 struct load_weight load;
6aa645ea
IM
467 unsigned long nr_load_updates;
468 u64 nr_switches;
469
470 struct cfs_rq cfs;
6f505b16 471 struct rt_rq rt;
6f505b16 472
6aa645ea 473#ifdef CONFIG_FAIR_GROUP_SCHED
d8016491
IM
474 /* list of leaf cfs_rq on this cpu: */
475 struct list_head leaf_cfs_rq_list;
052f1dc7
PZ
476#endif
477#ifdef CONFIG_RT_GROUP_SCHED
6f505b16 478 struct list_head leaf_rt_rq_list;
1da177e4 479#endif
1da177e4
LT
480
481 /*
482 * This is part of a global counter where only the total sum
483 * over all CPUs matters. A task can increase this counter on
484 * one CPU and if it got migrated afterwards it may decrease
485 * it on another CPU. Always updated under the runqueue lock:
486 */
487 unsigned long nr_uninterruptible;
488
34f971f6 489 struct task_struct *curr, *idle, *stop;
c9819f45 490 unsigned long next_balance;
1da177e4 491 struct mm_struct *prev_mm;
6aa645ea 492
3e51f33f 493 u64 clock;
305e6835 494 u64 clock_task;
6aa645ea 495
1da177e4
LT
496 atomic_t nr_iowait;
497
498#ifdef CONFIG_SMP
0eab9146 499 struct root_domain *rd;
1da177e4
LT
500 struct sched_domain *sd;
501
e51fd5e2
PZ
502 unsigned long cpu_power;
503
a0a522ce 504 unsigned char idle_at_tick;
1da177e4 505 /* For active balancing */
3f029d3c 506 int post_schedule;
1da177e4
LT
507 int active_balance;
508 int push_cpu;
969c7921 509 struct cpu_stop_work active_balance_work;
d8016491
IM
510 /* cpu of this runqueue: */
511 int cpu;
1f11eb6a 512 int online;
1da177e4 513
a8a51d5e 514 unsigned long avg_load_per_task;
1da177e4 515
e9e9250b
PZ
516 u64 rt_avg;
517 u64 age_stamp;
1b9508f6
MG
518 u64 idle_stamp;
519 u64 avg_idle;
1da177e4
LT
520#endif
521
dce48a84
TG
522 /* calc_load related fields */
523 unsigned long calc_load_update;
524 long calc_load_active;
525
8f4d37ec 526#ifdef CONFIG_SCHED_HRTICK
31656519
PZ
527#ifdef CONFIG_SMP
528 int hrtick_csd_pending;
529 struct call_single_data hrtick_csd;
530#endif
8f4d37ec
PZ
531 struct hrtimer hrtick_timer;
532#endif
533
1da177e4
LT
534#ifdef CONFIG_SCHEDSTATS
535 /* latency stats */
536 struct sched_info rq_sched_info;
9c2c4802
KC
537 unsigned long long rq_cpu_time;
538 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
1da177e4
LT
539
540 /* sys_sched_yield() stats */
480b9434 541 unsigned int yld_count;
1da177e4
LT
542
543 /* schedule() stats */
480b9434
KC
544 unsigned int sched_switch;
545 unsigned int sched_count;
546 unsigned int sched_goidle;
1da177e4
LT
547
548 /* try_to_wake_up() stats */
480b9434
KC
549 unsigned int ttwu_count;
550 unsigned int ttwu_local;
b8efb561
IM
551
552 /* BKL stats */
480b9434 553 unsigned int bkl_count;
1da177e4
LT
554#endif
555};
556
f34e3b61 557static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
1da177e4 558
7d478721
PZ
559static inline
560void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
dd41f596 561{
7d478721 562 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
a64692a3
MG
563
564 /*
565 * A queue event has occurred, and we're going to schedule. In
566 * this case, we can save a useless back to back clock update.
567 */
568 if (test_tsk_need_resched(p))
569 rq->skip_clock_update = 1;
dd41f596
IM
570}
571
0a2966b4
CL
572static inline int cpu_of(struct rq *rq)
573{
574#ifdef CONFIG_SMP
575 return rq->cpu;
576#else
577 return 0;
578#endif
579}
580
497f0ab3 581#define rcu_dereference_check_sched_domain(p) \
d11c563d
PM
582 rcu_dereference_check((p), \
583 rcu_read_lock_sched_held() || \
584 lockdep_is_held(&sched_domains_mutex))
585
674311d5
NP
586/*
587 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
1a20ff27 588 * See detach_destroy_domains: synchronize_sched for details.
674311d5
NP
589 *
590 * The domain tree of any CPU may only be accessed from within
591 * preempt-disabled sections.
592 */
48f24c4d 593#define for_each_domain(cpu, __sd) \
497f0ab3 594 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
1da177e4
LT
595
596#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
597#define this_rq() (&__get_cpu_var(runqueues))
598#define task_rq(p) cpu_rq(task_cpu(p))
599#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
54d35f29 600#define raw_rq() (&__raw_get_cpu_var(runqueues))
1da177e4 601
dc61b1d6
PZ
602#ifdef CONFIG_CGROUP_SCHED
603
604/*
605 * Return the group to which this tasks belongs.
606 *
607 * We use task_subsys_state_check() and extend the RCU verification
608 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
609 * holds that lock for each task it moves into the cgroup. Therefore
610 * by holding that lock, we pin the task to the current cgroup.
611 */
612static inline struct task_group *task_group(struct task_struct *p)
613{
614 struct cgroup_subsys_state *css;
615
616 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
617 lockdep_is_held(&task_rq(p)->lock));
618 return container_of(css, struct task_group, css);
619}
620
621/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
622static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
623{
624#ifdef CONFIG_FAIR_GROUP_SCHED
625 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
626 p->se.parent = task_group(p)->se[cpu];
627#endif
628
629#ifdef CONFIG_RT_GROUP_SCHED
630 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
631 p->rt.parent = task_group(p)->rt_se[cpu];
632#endif
633}
634
635#else /* CONFIG_CGROUP_SCHED */
636
637static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
638static inline struct task_group *task_group(struct task_struct *p)
639{
640 return NULL;
641}
642
643#endif /* CONFIG_CGROUP_SCHED */
644
305e6835
VP
645static u64 irq_time_cpu(int cpu);
646
aa9c4c0f 647inline void update_rq_clock(struct rq *rq)
3e51f33f 648{
305e6835
VP
649 if (!rq->skip_clock_update) {
650 int cpu = cpu_of(rq);
651 u64 irq_time;
652
653 rq->clock = sched_clock_cpu(cpu);
654 irq_time = irq_time_cpu(cpu);
655 if (rq->clock - irq_time > rq->clock_task)
656 rq->clock_task = rq->clock - irq_time;
657 }
3e51f33f
PZ
658}
659
bf5c91ba
IM
660/*
661 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
662 */
663#ifdef CONFIG_SCHED_DEBUG
664# define const_debug __read_mostly
665#else
666# define const_debug static const
667#endif
668
017730c1
IM
669/**
670 * runqueue_is_locked
e17b38bf 671 * @cpu: the processor in question.
017730c1
IM
672 *
673 * Returns true if the current cpu runqueue is locked.
674 * This interface allows printk to be called with the runqueue lock
675 * held and know whether or not it is OK to wake up the klogd.
676 */
89f19f04 677int runqueue_is_locked(int cpu)
017730c1 678{
05fa785c 679 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
017730c1
IM
680}
681
bf5c91ba
IM
682/*
683 * Debugging: various feature bits
684 */
f00b45c1
PZ
685
686#define SCHED_FEAT(name, enabled) \
687 __SCHED_FEAT_##name ,
688
bf5c91ba 689enum {
f00b45c1 690#include "sched_features.h"
bf5c91ba
IM
691};
692
f00b45c1
PZ
693#undef SCHED_FEAT
694
695#define SCHED_FEAT(name, enabled) \
696 (1UL << __SCHED_FEAT_##name) * enabled |
697
bf5c91ba 698const_debug unsigned int sysctl_sched_features =
f00b45c1
PZ
699#include "sched_features.h"
700 0;
701
702#undef SCHED_FEAT
703
704#ifdef CONFIG_SCHED_DEBUG
705#define SCHED_FEAT(name, enabled) \
706 #name ,
707
983ed7a6 708static __read_mostly char *sched_feat_names[] = {
f00b45c1
PZ
709#include "sched_features.h"
710 NULL
711};
712
713#undef SCHED_FEAT
714
34f3a814 715static int sched_feat_show(struct seq_file *m, void *v)
f00b45c1 716{
f00b45c1
PZ
717 int i;
718
719 for (i = 0; sched_feat_names[i]; i++) {
34f3a814
LZ
720 if (!(sysctl_sched_features & (1UL << i)))
721 seq_puts(m, "NO_");
722 seq_printf(m, "%s ", sched_feat_names[i]);
f00b45c1 723 }
34f3a814 724 seq_puts(m, "\n");
f00b45c1 725
34f3a814 726 return 0;
f00b45c1
PZ
727}
728
729static ssize_t
730sched_feat_write(struct file *filp, const char __user *ubuf,
731 size_t cnt, loff_t *ppos)
732{
733 char buf[64];
7740191c 734 char *cmp;
f00b45c1
PZ
735 int neg = 0;
736 int i;
737
738 if (cnt > 63)
739 cnt = 63;
740
741 if (copy_from_user(&buf, ubuf, cnt))
742 return -EFAULT;
743
744 buf[cnt] = 0;
7740191c 745 cmp = strstrip(buf);
f00b45c1 746
c24b7c52 747 if (strncmp(buf, "NO_", 3) == 0) {
f00b45c1
PZ
748 neg = 1;
749 cmp += 3;
750 }
751
752 for (i = 0; sched_feat_names[i]; i++) {
7740191c 753 if (strcmp(cmp, sched_feat_names[i]) == 0) {
f00b45c1
PZ
754 if (neg)
755 sysctl_sched_features &= ~(1UL << i);
756 else
757 sysctl_sched_features |= (1UL << i);
758 break;
759 }
760 }
761
762 if (!sched_feat_names[i])
763 return -EINVAL;
764
42994724 765 *ppos += cnt;
f00b45c1
PZ
766
767 return cnt;
768}
769
34f3a814
LZ
770static int sched_feat_open(struct inode *inode, struct file *filp)
771{
772 return single_open(filp, sched_feat_show, NULL);
773}
774
828c0950 775static const struct file_operations sched_feat_fops = {
34f3a814
LZ
776 .open = sched_feat_open,
777 .write = sched_feat_write,
778 .read = seq_read,
779 .llseek = seq_lseek,
780 .release = single_release,
f00b45c1
PZ
781};
782
783static __init int sched_init_debug(void)
784{
f00b45c1
PZ
785 debugfs_create_file("sched_features", 0644, NULL, NULL,
786 &sched_feat_fops);
787
788 return 0;
789}
790late_initcall(sched_init_debug);
791
792#endif
793
794#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
bf5c91ba 795
b82d9fdd
PZ
796/*
797 * Number of tasks to iterate in a single balance run.
798 * Limited because this is done with IRQs disabled.
799 */
800const_debug unsigned int sysctl_sched_nr_migrate = 32;
801
2398f2c6
PZ
802/*
803 * ratelimit for updating the group shares.
55cd5340 804 * default: 0.25ms
2398f2c6 805 */
55cd5340 806unsigned int sysctl_sched_shares_ratelimit = 250000;
0bcdcf28 807unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
2398f2c6 808
ffda12a1
PZ
809/*
810 * Inject some fuzzyness into changing the per-cpu group shares
811 * this avoids remote rq-locks at the expense of fairness.
812 * default: 4
813 */
814unsigned int sysctl_sched_shares_thresh = 4;
815
e9e9250b
PZ
816/*
817 * period over which we average the RT time consumption, measured
818 * in ms.
819 *
820 * default: 1s
821 */
822const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
823
fa85ae24 824/*
9f0c1e56 825 * period over which we measure -rt task cpu usage in us.
fa85ae24
PZ
826 * default: 1s
827 */
9f0c1e56 828unsigned int sysctl_sched_rt_period = 1000000;
fa85ae24 829
6892b75e
IM
830static __read_mostly int scheduler_running;
831
9f0c1e56
PZ
832/*
833 * part of the period that we allow rt tasks to run in us.
834 * default: 0.95s
835 */
836int sysctl_sched_rt_runtime = 950000;
fa85ae24 837
d0b27fa7
PZ
838static inline u64 global_rt_period(void)
839{
840 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
841}
842
843static inline u64 global_rt_runtime(void)
844{
e26873bb 845 if (sysctl_sched_rt_runtime < 0)
d0b27fa7
PZ
846 return RUNTIME_INF;
847
848 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
849}
fa85ae24 850
1da177e4 851#ifndef prepare_arch_switch
4866cde0
NP
852# define prepare_arch_switch(next) do { } while (0)
853#endif
854#ifndef finish_arch_switch
855# define finish_arch_switch(prev) do { } while (0)
856#endif
857
051a1d1a
DA
858static inline int task_current(struct rq *rq, struct task_struct *p)
859{
860 return rq->curr == p;
861}
862
4866cde0 863#ifndef __ARCH_WANT_UNLOCKED_CTXSW
70b97a7f 864static inline int task_running(struct rq *rq, struct task_struct *p)
4866cde0 865{
051a1d1a 866 return task_current(rq, p);
4866cde0
NP
867}
868
70b97a7f 869static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
4866cde0
NP
870{
871}
872
70b97a7f 873static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
4866cde0 874{
da04c035
IM
875#ifdef CONFIG_DEBUG_SPINLOCK
876 /* this is a valid case when another task releases the spinlock */
877 rq->lock.owner = current;
878#endif
8a25d5de
IM
879 /*
880 * If we are tracking spinlock dependencies then we have to
881 * fix up the runqueue lock - which gets 'carried over' from
882 * prev into current:
883 */
884 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
885
05fa785c 886 raw_spin_unlock_irq(&rq->lock);
4866cde0
NP
887}
888
889#else /* __ARCH_WANT_UNLOCKED_CTXSW */
70b97a7f 890static inline int task_running(struct rq *rq, struct task_struct *p)
4866cde0
NP
891{
892#ifdef CONFIG_SMP
893 return p->oncpu;
894#else
051a1d1a 895 return task_current(rq, p);
4866cde0
NP
896#endif
897}
898
70b97a7f 899static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
4866cde0
NP
900{
901#ifdef CONFIG_SMP
902 /*
903 * We can optimise this out completely for !SMP, because the
904 * SMP rebalancing from interrupt is the only thing that cares
905 * here.
906 */
907 next->oncpu = 1;
908#endif
909#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
05fa785c 910 raw_spin_unlock_irq(&rq->lock);
4866cde0 911#else
05fa785c 912 raw_spin_unlock(&rq->lock);
4866cde0
NP
913#endif
914}
915
70b97a7f 916static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
4866cde0
NP
917{
918#ifdef CONFIG_SMP
919 /*
920 * After ->oncpu is cleared, the task can be moved to a different CPU.
921 * We must ensure this doesn't happen until the switch is completely
922 * finished.
923 */
924 smp_wmb();
925 prev->oncpu = 0;
926#endif
927#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
928 local_irq_enable();
1da177e4 929#endif
4866cde0
NP
930}
931#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1da177e4 932
0970d299 933/*
65cc8e48
PZ
934 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
935 * against ttwu().
0970d299
PZ
936 */
937static inline int task_is_waking(struct task_struct *p)
938{
0017d735 939 return unlikely(p->state == TASK_WAKING);
0970d299
PZ
940}
941
b29739f9
IM
942/*
943 * __task_rq_lock - lock the runqueue a given task resides on.
944 * Must be called interrupts disabled.
945 */
70b97a7f 946static inline struct rq *__task_rq_lock(struct task_struct *p)
b29739f9
IM
947 __acquires(rq->lock)
948{
0970d299
PZ
949 struct rq *rq;
950
3a5c359a 951 for (;;) {
0970d299 952 rq = task_rq(p);
05fa785c 953 raw_spin_lock(&rq->lock);
65cc8e48 954 if (likely(rq == task_rq(p)))
3a5c359a 955 return rq;
05fa785c 956 raw_spin_unlock(&rq->lock);
b29739f9 957 }
b29739f9
IM
958}
959
1da177e4
LT
960/*
961 * task_rq_lock - lock the runqueue a given task resides on and disable
41a2d6cf 962 * interrupts. Note the ordering: we can safely lookup the task_rq without
1da177e4
LT
963 * explicitly disabling preemption.
964 */
70b97a7f 965static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
1da177e4
LT
966 __acquires(rq->lock)
967{
70b97a7f 968 struct rq *rq;
1da177e4 969
3a5c359a
AK
970 for (;;) {
971 local_irq_save(*flags);
972 rq = task_rq(p);
05fa785c 973 raw_spin_lock(&rq->lock);
65cc8e48 974 if (likely(rq == task_rq(p)))
3a5c359a 975 return rq;
05fa785c 976 raw_spin_unlock_irqrestore(&rq->lock, *flags);
1da177e4 977 }
1da177e4
LT
978}
979
a9957449 980static void __task_rq_unlock(struct rq *rq)
b29739f9
IM
981 __releases(rq->lock)
982{
05fa785c 983 raw_spin_unlock(&rq->lock);
b29739f9
IM
984}
985
70b97a7f 986static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
1da177e4
LT
987 __releases(rq->lock)
988{
05fa785c 989 raw_spin_unlock_irqrestore(&rq->lock, *flags);
1da177e4
LT
990}
991
1da177e4 992/*
cc2a73b5 993 * this_rq_lock - lock this runqueue and disable interrupts.
1da177e4 994 */
a9957449 995static struct rq *this_rq_lock(void)
1da177e4
LT
996 __acquires(rq->lock)
997{
70b97a7f 998 struct rq *rq;
1da177e4
LT
999
1000 local_irq_disable();
1001 rq = this_rq();
05fa785c 1002 raw_spin_lock(&rq->lock);
1da177e4
LT
1003
1004 return rq;
1005}
1006
8f4d37ec
PZ
1007#ifdef CONFIG_SCHED_HRTICK
1008/*
1009 * Use HR-timers to deliver accurate preemption points.
1010 *
1011 * Its all a bit involved since we cannot program an hrt while holding the
1012 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
1013 * reschedule event.
1014 *
1015 * When we get rescheduled we reprogram the hrtick_timer outside of the
1016 * rq->lock.
1017 */
8f4d37ec
PZ
1018
1019/*
1020 * Use hrtick when:
1021 * - enabled by features
1022 * - hrtimer is actually high res
1023 */
1024static inline int hrtick_enabled(struct rq *rq)
1025{
1026 if (!sched_feat(HRTICK))
1027 return 0;
ba42059f 1028 if (!cpu_active(cpu_of(rq)))
b328ca18 1029 return 0;
8f4d37ec
PZ
1030 return hrtimer_is_hres_active(&rq->hrtick_timer);
1031}
1032
8f4d37ec
PZ
1033static void hrtick_clear(struct rq *rq)
1034{
1035 if (hrtimer_active(&rq->hrtick_timer))
1036 hrtimer_cancel(&rq->hrtick_timer);
1037}
1038
8f4d37ec
PZ
1039/*
1040 * High-resolution timer tick.
1041 * Runs from hardirq context with interrupts disabled.
1042 */
1043static enum hrtimer_restart hrtick(struct hrtimer *timer)
1044{
1045 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1046
1047 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1048
05fa785c 1049 raw_spin_lock(&rq->lock);
3e51f33f 1050 update_rq_clock(rq);
8f4d37ec 1051 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
05fa785c 1052 raw_spin_unlock(&rq->lock);
8f4d37ec
PZ
1053
1054 return HRTIMER_NORESTART;
1055}
1056
95e904c7 1057#ifdef CONFIG_SMP
31656519
PZ
1058/*
1059 * called from hardirq (IPI) context
1060 */
1061static void __hrtick_start(void *arg)
b328ca18 1062{
31656519 1063 struct rq *rq = arg;
b328ca18 1064
05fa785c 1065 raw_spin_lock(&rq->lock);
31656519
PZ
1066 hrtimer_restart(&rq->hrtick_timer);
1067 rq->hrtick_csd_pending = 0;
05fa785c 1068 raw_spin_unlock(&rq->lock);
b328ca18
PZ
1069}
1070
31656519
PZ
1071/*
1072 * Called to set the hrtick timer state.
1073 *
1074 * called with rq->lock held and irqs disabled
1075 */
1076static void hrtick_start(struct rq *rq, u64 delay)
b328ca18 1077{
31656519
PZ
1078 struct hrtimer *timer = &rq->hrtick_timer;
1079 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
b328ca18 1080
cc584b21 1081 hrtimer_set_expires(timer, time);
31656519
PZ
1082
1083 if (rq == this_rq()) {
1084 hrtimer_restart(timer);
1085 } else if (!rq->hrtick_csd_pending) {
6e275637 1086 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
31656519
PZ
1087 rq->hrtick_csd_pending = 1;
1088 }
b328ca18
PZ
1089}
1090
1091static int
1092hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1093{
1094 int cpu = (int)(long)hcpu;
1095
1096 switch (action) {
1097 case CPU_UP_CANCELED:
1098 case CPU_UP_CANCELED_FROZEN:
1099 case CPU_DOWN_PREPARE:
1100 case CPU_DOWN_PREPARE_FROZEN:
1101 case CPU_DEAD:
1102 case CPU_DEAD_FROZEN:
31656519 1103 hrtick_clear(cpu_rq(cpu));
b328ca18
PZ
1104 return NOTIFY_OK;
1105 }
1106
1107 return NOTIFY_DONE;
1108}
1109
fa748203 1110static __init void init_hrtick(void)
b328ca18
PZ
1111{
1112 hotcpu_notifier(hotplug_hrtick, 0);
1113}
31656519
PZ
1114#else
1115/*
1116 * Called to set the hrtick timer state.
1117 *
1118 * called with rq->lock held and irqs disabled
1119 */
1120static void hrtick_start(struct rq *rq, u64 delay)
1121{
7f1e2ca9 1122 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
5c333864 1123 HRTIMER_MODE_REL_PINNED, 0);
31656519 1124}
b328ca18 1125
006c75f1 1126static inline void init_hrtick(void)
8f4d37ec 1127{
8f4d37ec 1128}
31656519 1129#endif /* CONFIG_SMP */
8f4d37ec 1130
31656519 1131static void init_rq_hrtick(struct rq *rq)
8f4d37ec 1132{
31656519
PZ
1133#ifdef CONFIG_SMP
1134 rq->hrtick_csd_pending = 0;
8f4d37ec 1135
31656519
PZ
1136 rq->hrtick_csd.flags = 0;
1137 rq->hrtick_csd.func = __hrtick_start;
1138 rq->hrtick_csd.info = rq;
1139#endif
8f4d37ec 1140
31656519
PZ
1141 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1142 rq->hrtick_timer.function = hrtick;
8f4d37ec 1143}
006c75f1 1144#else /* CONFIG_SCHED_HRTICK */
8f4d37ec
PZ
1145static inline void hrtick_clear(struct rq *rq)
1146{
1147}
1148
8f4d37ec
PZ
1149static inline void init_rq_hrtick(struct rq *rq)
1150{
1151}
1152
b328ca18
PZ
1153static inline void init_hrtick(void)
1154{
1155}
006c75f1 1156#endif /* CONFIG_SCHED_HRTICK */
8f4d37ec 1157
c24d20db
IM
1158/*
1159 * resched_task - mark a task 'to be rescheduled now'.
1160 *
1161 * On UP this means the setting of the need_resched flag, on SMP it
1162 * might also involve a cross-CPU call to trigger the scheduler on
1163 * the target CPU.
1164 */
1165#ifdef CONFIG_SMP
1166
1167#ifndef tsk_is_polling
1168#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1169#endif
1170
31656519 1171static void resched_task(struct task_struct *p)
c24d20db
IM
1172{
1173 int cpu;
1174
05fa785c 1175 assert_raw_spin_locked(&task_rq(p)->lock);
c24d20db 1176
5ed0cec0 1177 if (test_tsk_need_resched(p))
c24d20db
IM
1178 return;
1179
5ed0cec0 1180 set_tsk_need_resched(p);
c24d20db
IM
1181
1182 cpu = task_cpu(p);
1183 if (cpu == smp_processor_id())
1184 return;
1185
1186 /* NEED_RESCHED must be visible before we test polling */
1187 smp_mb();
1188 if (!tsk_is_polling(p))
1189 smp_send_reschedule(cpu);
1190}
1191
1192static void resched_cpu(int cpu)
1193{
1194 struct rq *rq = cpu_rq(cpu);
1195 unsigned long flags;
1196
05fa785c 1197 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
c24d20db
IM
1198 return;
1199 resched_task(cpu_curr(cpu));
05fa785c 1200 raw_spin_unlock_irqrestore(&rq->lock, flags);
c24d20db 1201}
06d8308c
TG
1202
1203#ifdef CONFIG_NO_HZ
83cd4fe2
VP
1204/*
1205 * In the semi idle case, use the nearest busy cpu for migrating timers
1206 * from an idle cpu. This is good for power-savings.
1207 *
1208 * We don't do similar optimization for completely idle system, as
1209 * selecting an idle cpu will add more delays to the timers than intended
1210 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
1211 */
1212int get_nohz_timer_target(void)
1213{
1214 int cpu = smp_processor_id();
1215 int i;
1216 struct sched_domain *sd;
1217
1218 for_each_domain(cpu, sd) {
1219 for_each_cpu(i, sched_domain_span(sd))
1220 if (!idle_cpu(i))
1221 return i;
1222 }
1223 return cpu;
1224}
06d8308c
TG
1225/*
1226 * When add_timer_on() enqueues a timer into the timer wheel of an
1227 * idle CPU then this timer might expire before the next timer event
1228 * which is scheduled to wake up that CPU. In case of a completely
1229 * idle system the next event might even be infinite time into the
1230 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1231 * leaves the inner idle loop so the newly added timer is taken into
1232 * account when the CPU goes back to idle and evaluates the timer
1233 * wheel for the next timer event.
1234 */
1235void wake_up_idle_cpu(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238
1239 if (cpu == smp_processor_id())
1240 return;
1241
1242 /*
1243 * This is safe, as this function is called with the timer
1244 * wheel base lock of (cpu) held. When the CPU is on the way
1245 * to idle and has not yet set rq->curr to idle then it will
1246 * be serialized on the timer wheel base lock and take the new
1247 * timer into account automatically.
1248 */
1249 if (rq->curr != rq->idle)
1250 return;
1251
1252 /*
1253 * We can set TIF_RESCHED on the idle task of the other CPU
1254 * lockless. The worst case is that the other CPU runs the
1255 * idle task through an additional NOOP schedule()
1256 */
5ed0cec0 1257 set_tsk_need_resched(rq->idle);
06d8308c
TG
1258
1259 /* NEED_RESCHED must be visible before we test polling */
1260 smp_mb();
1261 if (!tsk_is_polling(rq->idle))
1262 smp_send_reschedule(cpu);
1263}
39c0cbe2 1264
6d6bc0ad 1265#endif /* CONFIG_NO_HZ */
06d8308c 1266
e9e9250b
PZ
1267static u64 sched_avg_period(void)
1268{
1269 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1270}
1271
1272static void sched_avg_update(struct rq *rq)
1273{
1274 s64 period = sched_avg_period();
1275
1276 while ((s64)(rq->clock - rq->age_stamp) > period) {
0d98bb26
WD
1277 /*
1278 * Inline assembly required to prevent the compiler
1279 * optimising this loop into a divmod call.
1280 * See __iter_div_u64_rem() for another example of this.
1281 */
1282 asm("" : "+rm" (rq->age_stamp));
e9e9250b
PZ
1283 rq->age_stamp += period;
1284 rq->rt_avg /= 2;
1285 }
1286}
1287
1288static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1289{
1290 rq->rt_avg += rt_delta;
1291 sched_avg_update(rq);
1292}
1293
6d6bc0ad 1294#else /* !CONFIG_SMP */
31656519 1295static void resched_task(struct task_struct *p)
c24d20db 1296{
05fa785c 1297 assert_raw_spin_locked(&task_rq(p)->lock);
31656519 1298 set_tsk_need_resched(p);
c24d20db 1299}
e9e9250b
PZ
1300
1301static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1302{
1303}
da2b71ed
SS
1304
1305static void sched_avg_update(struct rq *rq)
1306{
1307}
6d6bc0ad 1308#endif /* CONFIG_SMP */
c24d20db 1309
45bf76df
IM
1310#if BITS_PER_LONG == 32
1311# define WMULT_CONST (~0UL)
1312#else
1313# define WMULT_CONST (1UL << 32)
1314#endif
1315
1316#define WMULT_SHIFT 32
1317
194081eb
IM
1318/*
1319 * Shift right and round:
1320 */
cf2ab469 1321#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
194081eb 1322
a7be37ac
PZ
1323/*
1324 * delta *= weight / lw
1325 */
cb1c4fc9 1326static unsigned long
45bf76df
IM
1327calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1328 struct load_weight *lw)
1329{
1330 u64 tmp;
1331
7a232e03
LJ
1332 if (!lw->inv_weight) {
1333 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1334 lw->inv_weight = 1;
1335 else
1336 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1337 / (lw->weight+1);
1338 }
45bf76df
IM
1339
1340 tmp = (u64)delta_exec * weight;
1341 /*
1342 * Check whether we'd overflow the 64-bit multiplication:
1343 */
194081eb 1344 if (unlikely(tmp > WMULT_CONST))
cf2ab469 1345 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
194081eb
IM
1346 WMULT_SHIFT/2);
1347 else
cf2ab469 1348 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
45bf76df 1349
ecf691da 1350 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
45bf76df
IM
1351}
1352
1091985b 1353static inline void update_load_add(struct load_weight *lw, unsigned long inc)
45bf76df
IM
1354{
1355 lw->weight += inc;
e89996ae 1356 lw->inv_weight = 0;
45bf76df
IM
1357}
1358
1091985b 1359static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
45bf76df
IM
1360{
1361 lw->weight -= dec;
e89996ae 1362 lw->inv_weight = 0;
45bf76df
IM
1363}
1364
2dd73a4f
PW
1365/*
1366 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1367 * of tasks with abnormal "nice" values across CPUs the contribution that
1368 * each task makes to its run queue's load is weighted according to its
41a2d6cf 1369 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
2dd73a4f
PW
1370 * scaled version of the new time slice allocation that they receive on time
1371 * slice expiry etc.
1372 */
1373
cce7ade8
PZ
1374#define WEIGHT_IDLEPRIO 3
1375#define WMULT_IDLEPRIO 1431655765
dd41f596
IM
1376
1377/*
1378 * Nice levels are multiplicative, with a gentle 10% change for every
1379 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1380 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1381 * that remained on nice 0.
1382 *
1383 * The "10% effect" is relative and cumulative: from _any_ nice level,
1384 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
f9153ee6
IM
1385 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1386 * If a task goes up by ~10% and another task goes down by ~10% then
1387 * the relative distance between them is ~25%.)
dd41f596
IM
1388 */
1389static const int prio_to_weight[40] = {
254753dc
IM
1390 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1391 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1392 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1393 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1394 /* 0 */ 1024, 820, 655, 526, 423,
1395 /* 5 */ 335, 272, 215, 172, 137,
1396 /* 10 */ 110, 87, 70, 56, 45,
1397 /* 15 */ 36, 29, 23, 18, 15,
dd41f596
IM
1398};
1399
5714d2de
IM
1400/*
1401 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1402 *
1403 * In cases where the weight does not change often, we can use the
1404 * precalculated inverse to speed up arithmetics by turning divisions
1405 * into multiplications:
1406 */
dd41f596 1407static const u32 prio_to_wmult[40] = {
254753dc
IM
1408 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1409 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1410 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1411 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1412 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1413 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1414 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1415 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
dd41f596 1416};
2dd73a4f 1417
ef12fefa
BR
1418/* Time spent by the tasks of the cpu accounting group executing in ... */
1419enum cpuacct_stat_index {
1420 CPUACCT_STAT_USER, /* ... user mode */
1421 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1422
1423 CPUACCT_STAT_NSTATS,
1424};
1425
d842de87
SV
1426#ifdef CONFIG_CGROUP_CPUACCT
1427static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
ef12fefa
BR
1428static void cpuacct_update_stats(struct task_struct *tsk,
1429 enum cpuacct_stat_index idx, cputime_t val);
d842de87
SV
1430#else
1431static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
ef12fefa
BR
1432static inline void cpuacct_update_stats(struct task_struct *tsk,
1433 enum cpuacct_stat_index idx, cputime_t val) {}
d842de87
SV
1434#endif
1435
18d95a28
PZ
1436static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1437{
1438 update_load_add(&rq->load, load);
1439}
1440
1441static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1442{
1443 update_load_sub(&rq->load, load);
1444}
1445
7940ca36 1446#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
eb755805 1447typedef int (*tg_visitor)(struct task_group *, void *);
c09595f6
PZ
1448
1449/*
1450 * Iterate the full tree, calling @down when first entering a node and @up when
1451 * leaving it for the final time.
1452 */
eb755805 1453static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
c09595f6
PZ
1454{
1455 struct task_group *parent, *child;
eb755805 1456 int ret;
c09595f6
PZ
1457
1458 rcu_read_lock();
1459 parent = &root_task_group;
1460down:
eb755805
PZ
1461 ret = (*down)(parent, data);
1462 if (ret)
1463 goto out_unlock;
c09595f6
PZ
1464 list_for_each_entry_rcu(child, &parent->children, siblings) {
1465 parent = child;
1466 goto down;
1467
1468up:
1469 continue;
1470 }
eb755805
PZ
1471 ret = (*up)(parent, data);
1472 if (ret)
1473 goto out_unlock;
c09595f6
PZ
1474
1475 child = parent;
1476 parent = parent->parent;
1477 if (parent)
1478 goto up;
eb755805 1479out_unlock:
c09595f6 1480 rcu_read_unlock();
eb755805
PZ
1481
1482 return ret;
c09595f6
PZ
1483}
1484
eb755805
PZ
1485static int tg_nop(struct task_group *tg, void *data)
1486{
1487 return 0;
c09595f6 1488}
eb755805
PZ
1489#endif
1490
1491#ifdef CONFIG_SMP
f5f08f39
PZ
1492/* Used instead of source_load when we know the type == 0 */
1493static unsigned long weighted_cpuload(const int cpu)
1494{
1495 return cpu_rq(cpu)->load.weight;
1496}
1497
1498/*
1499 * Return a low guess at the load of a migration-source cpu weighted
1500 * according to the scheduling class and "nice" value.
1501 *
1502 * We want to under-estimate the load of migration sources, to
1503 * balance conservatively.
1504 */
1505static unsigned long source_load(int cpu, int type)
1506{
1507 struct rq *rq = cpu_rq(cpu);
1508 unsigned long total = weighted_cpuload(cpu);
1509
1510 if (type == 0 || !sched_feat(LB_BIAS))
1511 return total;
1512
1513 return min(rq->cpu_load[type-1], total);
1514}
1515
1516/*
1517 * Return a high guess at the load of a migration-target cpu weighted
1518 * according to the scheduling class and "nice" value.
1519 */
1520static unsigned long target_load(int cpu, int type)
1521{
1522 struct rq *rq = cpu_rq(cpu);
1523 unsigned long total = weighted_cpuload(cpu);
1524
1525 if (type == 0 || !sched_feat(LB_BIAS))
1526 return total;
1527
1528 return max(rq->cpu_load[type-1], total);
1529}
1530
ae154be1
PZ
1531static unsigned long power_of(int cpu)
1532{
e51fd5e2 1533 return cpu_rq(cpu)->cpu_power;
ae154be1
PZ
1534}
1535
eb755805
PZ
1536static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1537
1538static unsigned long cpu_avg_load_per_task(int cpu)
1539{
1540 struct rq *rq = cpu_rq(cpu);
af6d596f 1541 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
eb755805 1542
4cd42620
SR
1543 if (nr_running)
1544 rq->avg_load_per_task = rq->load.weight / nr_running;
a2d47777
BS
1545 else
1546 rq->avg_load_per_task = 0;
eb755805
PZ
1547
1548 return rq->avg_load_per_task;
1549}
1550
1551#ifdef CONFIG_FAIR_GROUP_SCHED
c09595f6 1552
43cf38eb 1553static __read_mostly unsigned long __percpu *update_shares_data;
34d76c41 1554
c09595f6
PZ
1555static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1556
1557/*
1558 * Calculate and set the cpu's group shares.
1559 */
34d76c41
PZ
1560static void update_group_shares_cpu(struct task_group *tg, int cpu,
1561 unsigned long sd_shares,
1562 unsigned long sd_rq_weight,
4a6cc4bd 1563 unsigned long *usd_rq_weight)
18d95a28 1564{
34d76c41 1565 unsigned long shares, rq_weight;
a5004278 1566 int boost = 0;
c09595f6 1567
4a6cc4bd 1568 rq_weight = usd_rq_weight[cpu];
a5004278
PZ
1569 if (!rq_weight) {
1570 boost = 1;
1571 rq_weight = NICE_0_LOAD;
1572 }
c8cba857 1573
c09595f6 1574 /*
a8af7246
PZ
1575 * \Sum_j shares_j * rq_weight_i
1576 * shares_i = -----------------------------
1577 * \Sum_j rq_weight_j
c09595f6 1578 */
ec4e0e2f 1579 shares = (sd_shares * rq_weight) / sd_rq_weight;
ffda12a1 1580 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
c09595f6 1581
ffda12a1
PZ
1582 if (abs(shares - tg->se[cpu]->load.weight) >
1583 sysctl_sched_shares_thresh) {
1584 struct rq *rq = cpu_rq(cpu);
1585 unsigned long flags;
c09595f6 1586
05fa785c 1587 raw_spin_lock_irqsave(&rq->lock, flags);
34d76c41 1588 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
a5004278 1589 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
ffda12a1 1590 __set_se_shares(tg->se[cpu], shares);
05fa785c 1591 raw_spin_unlock_irqrestore(&rq->lock, flags);
ffda12a1 1592 }
18d95a28 1593}
c09595f6
PZ
1594
1595/*
c8cba857
PZ
1596 * Re-compute the task group their per cpu shares over the given domain.
1597 * This needs to be done in a bottom-up fashion because the rq weight of a
1598 * parent group depends on the shares of its child groups.
c09595f6 1599 */
eb755805 1600static int tg_shares_up(struct task_group *tg, void *data)
c09595f6 1601{
cd8ad40d 1602 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
4a6cc4bd 1603 unsigned long *usd_rq_weight;
eb755805 1604 struct sched_domain *sd = data;
34d76c41 1605 unsigned long flags;
c8cba857 1606 int i;
c09595f6 1607
34d76c41
PZ
1608 if (!tg->se[0])
1609 return 0;
1610
1611 local_irq_save(flags);
4a6cc4bd 1612 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
34d76c41 1613
758b2cdc 1614 for_each_cpu(i, sched_domain_span(sd)) {
34d76c41 1615 weight = tg->cfs_rq[i]->load.weight;
4a6cc4bd 1616 usd_rq_weight[i] = weight;
34d76c41 1617
cd8ad40d 1618 rq_weight += weight;
ec4e0e2f
KC
1619 /*
1620 * If there are currently no tasks on the cpu pretend there
1621 * is one of average load so that when a new task gets to
1622 * run here it will not get delayed by group starvation.
1623 */
ec4e0e2f
KC
1624 if (!weight)
1625 weight = NICE_0_LOAD;
1626
cd8ad40d 1627 sum_weight += weight;
c8cba857 1628 shares += tg->cfs_rq[i]->shares;
c09595f6 1629 }
c09595f6 1630
cd8ad40d
PZ
1631 if (!rq_weight)
1632 rq_weight = sum_weight;
1633
c8cba857
PZ
1634 if ((!shares && rq_weight) || shares > tg->shares)
1635 shares = tg->shares;
1636
1637 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1638 shares = tg->shares;
c09595f6 1639
758b2cdc 1640 for_each_cpu(i, sched_domain_span(sd))
4a6cc4bd 1641 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
34d76c41
PZ
1642
1643 local_irq_restore(flags);
eb755805
PZ
1644
1645 return 0;
c09595f6
PZ
1646}
1647
1648/*
c8cba857
PZ
1649 * Compute the cpu's hierarchical load factor for each task group.
1650 * This needs to be done in a top-down fashion because the load of a child
1651 * group is a fraction of its parents load.
c09595f6 1652 */
eb755805 1653static int tg_load_down(struct task_group *tg, void *data)
c09595f6 1654{
c8cba857 1655 unsigned long load;
eb755805 1656 long cpu = (long)data;
c09595f6 1657
c8cba857
PZ
1658 if (!tg->parent) {
1659 load = cpu_rq(cpu)->load.weight;
1660 } else {
1661 load = tg->parent->cfs_rq[cpu]->h_load;
1662 load *= tg->cfs_rq[cpu]->shares;
1663 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1664 }
c09595f6 1665
c8cba857 1666 tg->cfs_rq[cpu]->h_load = load;
c09595f6 1667
eb755805 1668 return 0;
c09595f6
PZ
1669}
1670
c8cba857 1671static void update_shares(struct sched_domain *sd)
4d8d595d 1672{
e7097159
PZ
1673 s64 elapsed;
1674 u64 now;
1675
1676 if (root_task_group_empty())
1677 return;
1678
c676329a 1679 now = local_clock();
e7097159 1680 elapsed = now - sd->last_update;
2398f2c6
PZ
1681
1682 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1683 sd->last_update = now;
eb755805 1684 walk_tg_tree(tg_nop, tg_shares_up, sd);
2398f2c6 1685 }
4d8d595d
PZ
1686}
1687
eb755805 1688static void update_h_load(long cpu)
c09595f6 1689{
eb755805 1690 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
c09595f6
PZ
1691}
1692
c09595f6
PZ
1693#else
1694
c8cba857 1695static inline void update_shares(struct sched_domain *sd)
4d8d595d
PZ
1696{
1697}
1698
18d95a28
PZ
1699#endif
1700
8f45e2b5
GH
1701#ifdef CONFIG_PREEMPT
1702
b78bb868
PZ
1703static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1704
70574a99 1705/*
8f45e2b5
GH
1706 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1707 * way at the expense of forcing extra atomic operations in all
1708 * invocations. This assures that the double_lock is acquired using the
1709 * same underlying policy as the spinlock_t on this architecture, which
1710 * reduces latency compared to the unfair variant below. However, it
1711 * also adds more overhead and therefore may reduce throughput.
70574a99 1712 */
8f45e2b5
GH
1713static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1714 __releases(this_rq->lock)
1715 __acquires(busiest->lock)
1716 __acquires(this_rq->lock)
1717{
05fa785c 1718 raw_spin_unlock(&this_rq->lock);
8f45e2b5
GH
1719 double_rq_lock(this_rq, busiest);
1720
1721 return 1;
1722}
1723
1724#else
1725/*
1726 * Unfair double_lock_balance: Optimizes throughput at the expense of
1727 * latency by eliminating extra atomic operations when the locks are
1728 * already in proper order on entry. This favors lower cpu-ids and will
1729 * grant the double lock to lower cpus over higher ids under contention,
1730 * regardless of entry order into the function.
1731 */
1732static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
70574a99
AD
1733 __releases(this_rq->lock)
1734 __acquires(busiest->lock)
1735 __acquires(this_rq->lock)
1736{
1737 int ret = 0;
1738
05fa785c 1739 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
70574a99 1740 if (busiest < this_rq) {
05fa785c
TG
1741 raw_spin_unlock(&this_rq->lock);
1742 raw_spin_lock(&busiest->lock);
1743 raw_spin_lock_nested(&this_rq->lock,
1744 SINGLE_DEPTH_NESTING);
70574a99
AD
1745 ret = 1;
1746 } else
05fa785c
TG
1747 raw_spin_lock_nested(&busiest->lock,
1748 SINGLE_DEPTH_NESTING);
70574a99
AD
1749 }
1750 return ret;
1751}
1752
8f45e2b5
GH
1753#endif /* CONFIG_PREEMPT */
1754
1755/*
1756 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1757 */
1758static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1759{
1760 if (unlikely(!irqs_disabled())) {
1761 /* printk() doesn't work good under rq->lock */
05fa785c 1762 raw_spin_unlock(&this_rq->lock);
8f45e2b5
GH
1763 BUG_ON(1);
1764 }
1765
1766 return _double_lock_balance(this_rq, busiest);
1767}
1768
70574a99
AD
1769static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1770 __releases(busiest->lock)
1771{
05fa785c 1772 raw_spin_unlock(&busiest->lock);
70574a99
AD
1773 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1774}
1e3c88bd
PZ
1775
1776/*
1777 * double_rq_lock - safely lock two runqueues
1778 *
1779 * Note this does not disable interrupts like task_rq_lock,
1780 * you need to do so manually before calling.
1781 */
1782static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1783 __acquires(rq1->lock)
1784 __acquires(rq2->lock)
1785{
1786 BUG_ON(!irqs_disabled());
1787 if (rq1 == rq2) {
1788 raw_spin_lock(&rq1->lock);
1789 __acquire(rq2->lock); /* Fake it out ;) */
1790 } else {
1791 if (rq1 < rq2) {
1792 raw_spin_lock(&rq1->lock);
1793 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1794 } else {
1795 raw_spin_lock(&rq2->lock);
1796 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1797 }
1798 }
1e3c88bd
PZ
1799}
1800
1801/*
1802 * double_rq_unlock - safely unlock two runqueues
1803 *
1804 * Note this does not restore interrupts like task_rq_unlock,
1805 * you need to do so manually after calling.
1806 */
1807static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1808 __releases(rq1->lock)
1809 __releases(rq2->lock)
1810{
1811 raw_spin_unlock(&rq1->lock);
1812 if (rq1 != rq2)
1813 raw_spin_unlock(&rq2->lock);
1814 else
1815 __release(rq2->lock);
1816}
1817
18d95a28
PZ
1818#endif
1819
30432094 1820#ifdef CONFIG_FAIR_GROUP_SCHED
34e83e85
IM
1821static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1822{
30432094 1823#ifdef CONFIG_SMP
34e83e85
IM
1824 cfs_rq->shares = shares;
1825#endif
1826}
30432094 1827#endif
e7693a36 1828
74f5187a 1829static void calc_load_account_idle(struct rq *this_rq);
0bcdcf28 1830static void update_sysctl(void);
acb4a848 1831static int get_update_sysctl_factor(void);
fdf3e95d 1832static void update_cpu_load(struct rq *this_rq);
dce48a84 1833
cd29fe6f
PZ
1834static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1835{
1836 set_task_rq(p, cpu);
1837#ifdef CONFIG_SMP
1838 /*
1839 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1840 * successfuly executed on another CPU. We must ensure that updates of
1841 * per-task data have been completed by this moment.
1842 */
1843 smp_wmb();
1844 task_thread_info(p)->cpu = cpu;
1845#endif
1846}
dce48a84 1847
1e3c88bd 1848static const struct sched_class rt_sched_class;
dd41f596 1849
34f971f6 1850#define sched_class_highest (&stop_sched_class)
1f11eb6a
GH
1851#define for_each_class(class) \
1852 for (class = sched_class_highest; class; class = class->next)
dd41f596 1853
1e3c88bd
PZ
1854#include "sched_stats.h"
1855
c09595f6 1856static void inc_nr_running(struct rq *rq)
9c217245
IM
1857{
1858 rq->nr_running++;
9c217245
IM
1859}
1860
c09595f6 1861static void dec_nr_running(struct rq *rq)
9c217245
IM
1862{
1863 rq->nr_running--;
9c217245
IM
1864}
1865
45bf76df
IM
1866static void set_load_weight(struct task_struct *p)
1867{
dd41f596
IM
1868 /*
1869 * SCHED_IDLE tasks get minimal weight:
1870 */
1871 if (p->policy == SCHED_IDLE) {
1872 p->se.load.weight = WEIGHT_IDLEPRIO;
1873 p->se.load.inv_weight = WMULT_IDLEPRIO;
1874 return;
1875 }
71f8bd46 1876
dd41f596
IM
1877 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1878 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
71f8bd46
IM
1879}
1880
371fd7e7 1881static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
2087a1ad 1882{
a64692a3 1883 update_rq_clock(rq);
dd41f596 1884 sched_info_queued(p);
371fd7e7 1885 p->sched_class->enqueue_task(rq, p, flags);
dd41f596 1886 p->se.on_rq = 1;
71f8bd46
IM
1887}
1888
371fd7e7 1889static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
71f8bd46 1890{
a64692a3 1891 update_rq_clock(rq);
46ac22ba 1892 sched_info_dequeued(p);
371fd7e7 1893 p->sched_class->dequeue_task(rq, p, flags);
dd41f596 1894 p->se.on_rq = 0;
71f8bd46
IM
1895}
1896
1e3c88bd
PZ
1897/*
1898 * activate_task - move a task to the runqueue.
1899 */
371fd7e7 1900static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1e3c88bd
PZ
1901{
1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--;
1904
371fd7e7 1905 enqueue_task(rq, p, flags);
1e3c88bd
PZ
1906 inc_nr_running(rq);
1907}
1908
1909/*
1910 * deactivate_task - remove a task from the runqueue.
1911 */
371fd7e7 1912static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1e3c88bd
PZ
1913{
1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++;
1916
371fd7e7 1917 dequeue_task(rq, p, flags);
1e3c88bd
PZ
1918 dec_nr_running(rq);
1919}
1920
b52bfee4
VP
1921#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1922
305e6835
VP
1923/*
1924 * There are no locks covering percpu hardirq/softirq time.
1925 * They are only modified in account_system_vtime, on corresponding CPU
1926 * with interrupts disabled. So, writes are safe.
1927 * They are read and saved off onto struct rq in update_rq_clock().
1928 * This may result in other CPU reading this CPU's irq time and can
1929 * race with irq/account_system_vtime on this CPU. We would either get old
1930 * or new value (or semi updated value on 32 bit) with a side effect of
1931 * accounting a slice of irq time to wrong task when irq is in progress
1932 * while we read rq->clock. That is a worthy compromise in place of having
1933 * locks on each irq in account_system_time.
1934 */
b52bfee4
VP
1935static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1936static DEFINE_PER_CPU(u64, cpu_softirq_time);
1937
1938static DEFINE_PER_CPU(u64, irq_start_time);
1939static int sched_clock_irqtime;
1940
1941void enable_sched_clock_irqtime(void)
1942{
1943 sched_clock_irqtime = 1;
1944}
1945
1946void disable_sched_clock_irqtime(void)
1947{
1948 sched_clock_irqtime = 0;
1949}
1950
305e6835
VP
1951static u64 irq_time_cpu(int cpu)
1952{
1953 if (!sched_clock_irqtime)
1954 return 0;
1955
1956 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1957}
1958
b52bfee4
VP
1959void account_system_vtime(struct task_struct *curr)
1960{
1961 unsigned long flags;
1962 int cpu;
1963 u64 now, delta;
1964
1965 if (!sched_clock_irqtime)
1966 return;
1967
1968 local_irq_save(flags);
1969
1970 now = sched_clock();
1971 cpu = smp_processor_id();
1972 delta = now - per_cpu(irq_start_time, cpu);
1973 per_cpu(irq_start_time, cpu) = now;
1974 /*
1975 * We do not account for softirq time from ksoftirqd here.
1976 * We want to continue accounting softirq time to ksoftirqd thread
1977 * in that case, so as not to confuse scheduler with a special task
1978 * that do not consume any time, but still wants to run.
1979 */
1980 if (hardirq_count())
1981 per_cpu(cpu_hardirq_time, cpu) += delta;
1982 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1983 per_cpu(cpu_softirq_time, cpu) += delta;
1984
1985 local_irq_restore(flags);
1986}
1987
305e6835
VP
1988#else
1989
1990static u64 irq_time_cpu(int cpu)
1991{
1992 return 0;
1993}
1994
b52bfee4
VP
1995#endif
1996
1e3c88bd
PZ
1997#include "sched_idletask.c"
1998#include "sched_fair.c"
1999#include "sched_rt.c"
34f971f6 2000#include "sched_stoptask.c"
1e3c88bd
PZ
2001#ifdef CONFIG_SCHED_DEBUG
2002# include "sched_debug.c"
2003#endif
2004
34f971f6
PZ
2005void sched_set_stop_task(int cpu, struct task_struct *stop)
2006{
2007 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2008 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2009
2010 if (stop) {
2011 /*
2012 * Make it appear like a SCHED_FIFO task, its something
2013 * userspace knows about and won't get confused about.
2014 *
2015 * Also, it will make PI more or less work without too
2016 * much confusion -- but then, stop work should not
2017 * rely on PI working anyway.
2018 */
2019 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2020
2021 stop->sched_class = &stop_sched_class;
2022 }
2023
2024 cpu_rq(cpu)->stop = stop;
2025
2026 if (old_stop) {
2027 /*
2028 * Reset it back to a normal scheduling class so that
2029 * it can die in pieces.
2030 */
2031 old_stop->sched_class = &rt_sched_class;
2032 }
2033}
2034
14531189 2035/*
dd41f596 2036 * __normal_prio - return the priority that is based on the static prio
14531189 2037 */
14531189
IM
2038static inline int __normal_prio(struct task_struct *p)
2039{
dd41f596 2040 return p->static_prio;
14531189
IM
2041}
2042
b29739f9
IM
2043/*
2044 * Calculate the expected normal priority: i.e. priority
2045 * without taking RT-inheritance into account. Might be
2046 * boosted by interactivity modifiers. Changes upon fork,
2047 * setprio syscalls, and whenever the interactivity
2048 * estimator recalculates.
2049 */
36c8b586 2050static inline int normal_prio(struct task_struct *p)
b29739f9
IM
2051{
2052 int prio;
2053
e05606d3 2054 if (task_has_rt_policy(p))
b29739f9
IM
2055 prio = MAX_RT_PRIO-1 - p->rt_priority;
2056 else
2057 prio = __normal_prio(p);
2058 return prio;
2059}
2060
2061/*
2062 * Calculate the current priority, i.e. the priority
2063 * taken into account by the scheduler. This value might
2064 * be boosted by RT tasks, or might be boosted by
2065 * interactivity modifiers. Will be RT if the task got
2066 * RT-boosted. If not then it returns p->normal_prio.
2067 */
36c8b586 2068static int effective_prio(struct task_struct *p)
b29739f9
IM
2069{
2070 p->normal_prio = normal_prio(p);
2071 /*
2072 * If we are RT tasks or we were boosted to RT priority,
2073 * keep the priority unchanged. Otherwise, update priority
2074 * to the normal priority:
2075 */
2076 if (!rt_prio(p->prio))
2077 return p->normal_prio;
2078 return p->prio;
2079}
2080
1da177e4
LT
2081/**
2082 * task_curr - is this task currently executing on a CPU?
2083 * @p: the task in question.
2084 */
36c8b586 2085inline int task_curr(const struct task_struct *p)
1da177e4
LT
2086{
2087 return cpu_curr(task_cpu(p)) == p;
2088}
2089
cb469845
SR
2090static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2091 const struct sched_class *prev_class,
2092 int oldprio, int running)
2093{
2094 if (prev_class != p->sched_class) {
2095 if (prev_class->switched_from)
2096 prev_class->switched_from(rq, p, running);
2097 p->sched_class->switched_to(rq, p, running);
2098 } else
2099 p->sched_class->prio_changed(rq, p, oldprio, running);
2100}
2101
1da177e4 2102#ifdef CONFIG_SMP
cc367732
IM
2103/*
2104 * Is this task likely cache-hot:
2105 */
e7693a36 2106static int
cc367732
IM
2107task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2108{
2109 s64 delta;
2110
e6c8fba7
PZ
2111 if (p->sched_class != &fair_sched_class)
2112 return 0;
2113
ef8002f6
NR
2114 if (unlikely(p->policy == SCHED_IDLE))
2115 return 0;
2116
f540a608
IM
2117 /*
2118 * Buddy candidates are cache hot:
2119 */
f685ceac 2120 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
4793241b
PZ
2121 (&p->se == cfs_rq_of(&p->se)->next ||
2122 &p->se == cfs_rq_of(&p->se)->last))
f540a608
IM
2123 return 1;
2124
6bc1665b
IM
2125 if (sysctl_sched_migration_cost == -1)
2126 return 1;
2127 if (sysctl_sched_migration_cost == 0)
2128 return 0;
2129
cc367732
IM
2130 delta = now - p->se.exec_start;
2131
2132 return delta < (s64)sysctl_sched_migration_cost;
2133}
2134
dd41f596 2135void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
c65cc870 2136{
e2912009
PZ
2137#ifdef CONFIG_SCHED_DEBUG
2138 /*
2139 * We should never call set_task_cpu() on a blocked task,
2140 * ttwu() will sort out the placement.
2141 */
077614ee
PZ
2142 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2143 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
e2912009
PZ
2144#endif
2145
de1d7286 2146 trace_sched_migrate_task(p, new_cpu);
cbc34ed1 2147
0c69774e
PZ
2148 if (task_cpu(p) != new_cpu) {
2149 p->se.nr_migrations++;
2150 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2151 }
dd41f596
IM
2152
2153 __set_task_cpu(p, new_cpu);
c65cc870
IM
2154}
2155
969c7921 2156struct migration_arg {
36c8b586 2157 struct task_struct *task;
1da177e4 2158 int dest_cpu;
70b97a7f 2159};
1da177e4 2160
969c7921
TH
2161static int migration_cpu_stop(void *data);
2162
1da177e4
LT
2163/*
2164 * The task's runqueue lock must be held.
2165 * Returns true if you have to wait for migration thread.
2166 */
969c7921 2167static bool migrate_task(struct task_struct *p, int dest_cpu)
1da177e4 2168{
70b97a7f 2169 struct rq *rq = task_rq(p);
1da177e4
LT
2170
2171 /*
2172 * If the task is not on a runqueue (and not running), then
e2912009 2173 * the next wake-up will properly place the task.
1da177e4 2174 */
969c7921 2175 return p->se.on_rq || task_running(rq, p);
1da177e4
LT
2176}
2177
2178/*
2179 * wait_task_inactive - wait for a thread to unschedule.
2180 *
85ba2d86
RM
2181 * If @match_state is nonzero, it's the @p->state value just checked and
2182 * not expected to change. If it changes, i.e. @p might have woken up,
2183 * then return zero. When we succeed in waiting for @p to be off its CPU,
2184 * we return a positive number (its total switch count). If a second call
2185 * a short while later returns the same number, the caller can be sure that
2186 * @p has remained unscheduled the whole time.
2187 *
1da177e4
LT
2188 * The caller must ensure that the task *will* unschedule sometime soon,
2189 * else this function might spin for a *long* time. This function can't
2190 * be called with interrupts off, or it may introduce deadlock with
2191 * smp_call_function() if an IPI is sent by the same process we are
2192 * waiting to become inactive.
2193 */
85ba2d86 2194unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1da177e4
LT
2195{
2196 unsigned long flags;
dd41f596 2197 int running, on_rq;
85ba2d86 2198 unsigned long ncsw;
70b97a7f 2199 struct rq *rq;
1da177e4 2200
3a5c359a
AK
2201 for (;;) {
2202 /*
2203 * We do the initial early heuristics without holding
2204 * any task-queue locks at all. We'll only try to get
2205 * the runqueue lock when things look like they will
2206 * work out!
2207 */
2208 rq = task_rq(p);
fa490cfd 2209
3a5c359a
AK
2210 /*
2211 * If the task is actively running on another CPU
2212 * still, just relax and busy-wait without holding
2213 * any locks.
2214 *
2215 * NOTE! Since we don't hold any locks, it's not
2216 * even sure that "rq" stays as the right runqueue!
2217 * But we don't care, since "task_running()" will
2218 * return false if the runqueue has changed and p
2219 * is actually now running somewhere else!
2220 */
85ba2d86
RM
2221 while (task_running(rq, p)) {
2222 if (match_state && unlikely(p->state != match_state))
2223 return 0;
3a5c359a 2224 cpu_relax();
85ba2d86 2225 }
fa490cfd 2226
3a5c359a
AK
2227 /*
2228 * Ok, time to look more closely! We need the rq
2229 * lock now, to be *sure*. If we're wrong, we'll
2230 * just go back and repeat.
2231 */
2232 rq = task_rq_lock(p, &flags);
27a9da65 2233 trace_sched_wait_task(p);
3a5c359a
AK
2234 running = task_running(rq, p);
2235 on_rq = p->se.on_rq;
85ba2d86 2236 ncsw = 0;
f31e11d8 2237 if (!match_state || p->state == match_state)
93dcf55f 2238 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
3a5c359a 2239 task_rq_unlock(rq, &flags);
fa490cfd 2240
85ba2d86
RM
2241 /*
2242 * If it changed from the expected state, bail out now.
2243 */
2244 if (unlikely(!ncsw))
2245 break;
2246
3a5c359a
AK
2247 /*
2248 * Was it really running after all now that we
2249 * checked with the proper locks actually held?
2250 *
2251 * Oops. Go back and try again..
2252 */
2253 if (unlikely(running)) {
2254 cpu_relax();
2255 continue;
2256 }
fa490cfd 2257
3a5c359a
AK
2258 /*
2259 * It's not enough that it's not actively running,
2260 * it must be off the runqueue _entirely_, and not
2261 * preempted!
2262 *
80dd99b3 2263 * So if it was still runnable (but just not actively
3a5c359a
AK
2264 * running right now), it's preempted, and we should
2265 * yield - it could be a while.
2266 */
2267 if (unlikely(on_rq)) {
2268 schedule_timeout_uninterruptible(1);
2269 continue;
2270 }
fa490cfd 2271
3a5c359a
AK
2272 /*
2273 * Ahh, all good. It wasn't running, and it wasn't
2274 * runnable, which means that it will never become
2275 * running in the future either. We're all done!
2276 */
2277 break;
2278 }
85ba2d86
RM
2279
2280 return ncsw;
1da177e4
LT
2281}
2282
2283/***
2284 * kick_process - kick a running thread to enter/exit the kernel
2285 * @p: the to-be-kicked thread
2286 *
2287 * Cause a process which is running on another CPU to enter
2288 * kernel-mode, without any delay. (to get signals handled.)
2289 *
2290 * NOTE: this function doesnt have to take the runqueue lock,
2291 * because all it wants to ensure is that the remote task enters
2292 * the kernel. If the IPI races and the task has been migrated
2293 * to another CPU then no harm is done and the purpose has been
2294 * achieved as well.
2295 */
36c8b586 2296void kick_process(struct task_struct *p)
1da177e4
LT
2297{
2298 int cpu;
2299
2300 preempt_disable();
2301 cpu = task_cpu(p);
2302 if ((cpu != smp_processor_id()) && task_curr(p))
2303 smp_send_reschedule(cpu);
2304 preempt_enable();
2305}
b43e3521 2306EXPORT_SYMBOL_GPL(kick_process);
476d139c 2307#endif /* CONFIG_SMP */
1da177e4 2308
0793a61d
TG
2309/**
2310 * task_oncpu_function_call - call a function on the cpu on which a task runs
2311 * @p: the task to evaluate
2312 * @func: the function to be called
2313 * @info: the function call argument
2314 *
2315 * Calls the function @func when the task is currently running. This might
2316 * be on the current CPU, which just calls the function directly
2317 */
2318void task_oncpu_function_call(struct task_struct *p,
2319 void (*func) (void *info), void *info)
2320{
2321 int cpu;
2322
2323 preempt_disable();
2324 cpu = task_cpu(p);
2325 if (task_curr(p))
2326 smp_call_function_single(cpu, func, info, 1);
2327 preempt_enable();
2328}
2329
970b13ba 2330#ifdef CONFIG_SMP
30da688e
ON
2331/*
2332 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2333 */
5da9a0fb
PZ
2334static int select_fallback_rq(int cpu, struct task_struct *p)
2335{
2336 int dest_cpu;
2337 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2338
2339 /* Look for allowed, online CPU in same node. */
2340 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2341 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2342 return dest_cpu;
2343
2344 /* Any allowed, online CPU? */
2345 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2346 if (dest_cpu < nr_cpu_ids)
2347 return dest_cpu;
2348
2349 /* No more Mr. Nice Guy. */
897f0b3c 2350 if (unlikely(dest_cpu >= nr_cpu_ids)) {
9084bb82 2351 dest_cpu = cpuset_cpus_allowed_fallback(p);
5da9a0fb
PZ
2352 /*
2353 * Don't tell them about moving exiting tasks or
2354 * kernel threads (both mm NULL), since they never
2355 * leave kernel.
2356 */
2357 if (p->mm && printk_ratelimit()) {
2358 printk(KERN_INFO "process %d (%s) no "
2359 "longer affine to cpu%d\n",
2360 task_pid_nr(p), p->comm, cpu);
2361 }
2362 }
2363
2364 return dest_cpu;
2365}
2366
e2912009 2367/*
30da688e 2368 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
e2912009 2369 */
970b13ba 2370static inline
0017d735 2371int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
970b13ba 2372{
0017d735 2373 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
e2912009
PZ
2374
2375 /*
2376 * In order not to call set_task_cpu() on a blocking task we need
2377 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2378 * cpu.
2379 *
2380 * Since this is common to all placement strategies, this lives here.
2381 *
2382 * [ this allows ->select_task() to simply return task_cpu(p) and
2383 * not worry about this generic constraint ]
2384 */
2385 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
70f11205 2386 !cpu_online(cpu)))
5da9a0fb 2387 cpu = select_fallback_rq(task_cpu(p), p);
e2912009
PZ
2388
2389 return cpu;
970b13ba 2390}
09a40af5
MG
2391
2392static void update_avg(u64 *avg, u64 sample)
2393{
2394 s64 diff = sample - *avg;
2395 *avg += diff >> 3;
2396}
970b13ba
PZ
2397#endif
2398
9ed3811a
TH
2399static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2400 bool is_sync, bool is_migrate, bool is_local,
2401 unsigned long en_flags)
2402{
2403 schedstat_inc(p, se.statistics.nr_wakeups);
2404 if (is_sync)
2405 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2406 if (is_migrate)
2407 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2408 if (is_local)
2409 schedstat_inc(p, se.statistics.nr_wakeups_local);
2410 else
2411 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2412
2413 activate_task(rq, p, en_flags);
2414}
2415
2416static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2417 int wake_flags, bool success)
2418{
2419 trace_sched_wakeup(p, success);
2420 check_preempt_curr(rq, p, wake_flags);
2421
2422 p->state = TASK_RUNNING;
2423#ifdef CONFIG_SMP
2424 if (p->sched_class->task_woken)
2425 p->sched_class->task_woken(rq, p);
2426
2427 if (unlikely(rq->idle_stamp)) {
2428 u64 delta = rq->clock - rq->idle_stamp;
2429 u64 max = 2*sysctl_sched_migration_cost;
2430
2431 if (delta > max)
2432 rq->avg_idle = max;
2433 else
2434 update_avg(&rq->avg_idle, delta);
2435 rq->idle_stamp = 0;
2436 }
2437#endif
21aa9af0
TH
2438 /* if a worker is waking up, notify workqueue */
2439 if ((p->flags & PF_WQ_WORKER) && success)
2440 wq_worker_waking_up(p, cpu_of(rq));
9ed3811a
TH
2441}
2442
2443/**
1da177e4 2444 * try_to_wake_up - wake up a thread
9ed3811a 2445 * @p: the thread to be awakened
1da177e4 2446 * @state: the mask of task states that can be woken
9ed3811a 2447 * @wake_flags: wake modifier flags (WF_*)
1da177e4
LT
2448 *
2449 * Put it on the run-queue if it's not already there. The "current"
2450 * thread is always on the run-queue (except when the actual
2451 * re-schedule is in progress), and as such you're allowed to do
2452 * the simpler "current->state = TASK_RUNNING" to mark yourself
2453 * runnable without the overhead of this.
2454 *
9ed3811a
TH
2455 * Returns %true if @p was woken up, %false if it was already running
2456 * or @state didn't match @p's state.
1da177e4 2457 */
7d478721
PZ
2458static int try_to_wake_up(struct task_struct *p, unsigned int state,
2459 int wake_flags)
1da177e4 2460{
cc367732 2461 int cpu, orig_cpu, this_cpu, success = 0;
1da177e4 2462 unsigned long flags;
371fd7e7 2463 unsigned long en_flags = ENQUEUE_WAKEUP;
ab3b3aa5 2464 struct rq *rq;
1da177e4 2465
e9c84311 2466 this_cpu = get_cpu();
2398f2c6 2467
04e2f174 2468 smp_wmb();
ab3b3aa5 2469 rq = task_rq_lock(p, &flags);
e9c84311 2470 if (!(p->state & state))
1da177e4
LT
2471 goto out;
2472
dd41f596 2473 if (p->se.on_rq)
1da177e4
LT
2474 goto out_running;
2475
2476 cpu = task_cpu(p);
cc367732 2477 orig_cpu = cpu;
1da177e4
LT
2478
2479#ifdef CONFIG_SMP
2480 if (unlikely(task_running(rq, p)))
2481 goto out_activate;
2482
e9c84311
PZ
2483 /*
2484 * In order to handle concurrent wakeups and release the rq->lock
2485 * we put the task in TASK_WAKING state.
eb24073b
IM
2486 *
2487 * First fix up the nr_uninterruptible count:
e9c84311 2488 */
cc87f76a
PZ
2489 if (task_contributes_to_load(p)) {
2490 if (likely(cpu_online(orig_cpu)))
2491 rq->nr_uninterruptible--;
2492 else
2493 this_rq()->nr_uninterruptible--;
2494 }
e9c84311 2495 p->state = TASK_WAKING;
efbbd05a 2496
371fd7e7 2497 if (p->sched_class->task_waking) {
efbbd05a 2498 p->sched_class->task_waking(rq, p);
371fd7e7
PZ
2499 en_flags |= ENQUEUE_WAKING;
2500 }
efbbd05a 2501
0017d735
PZ
2502 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2503 if (cpu != orig_cpu)
5d2f5a61 2504 set_task_cpu(p, cpu);
0017d735 2505 __task_rq_unlock(rq);
ab19cb23 2506
0970d299
PZ
2507 rq = cpu_rq(cpu);
2508 raw_spin_lock(&rq->lock);
f5dc3753 2509
0970d299
PZ
2510 /*
2511 * We migrated the task without holding either rq->lock, however
2512 * since the task is not on the task list itself, nobody else
2513 * will try and migrate the task, hence the rq should match the
2514 * cpu we just moved it to.
2515 */
2516 WARN_ON(task_cpu(p) != cpu);
e9c84311 2517 WARN_ON(p->state != TASK_WAKING);
1da177e4 2518
e7693a36
GH
2519#ifdef CONFIG_SCHEDSTATS
2520 schedstat_inc(rq, ttwu_count);
2521 if (cpu == this_cpu)
2522 schedstat_inc(rq, ttwu_local);
2523 else {
2524 struct sched_domain *sd;
2525 for_each_domain(this_cpu, sd) {
758b2cdc 2526 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
e7693a36
GH
2527 schedstat_inc(sd, ttwu_wake_remote);
2528 break;
2529 }
2530 }
2531 }
6d6bc0ad 2532#endif /* CONFIG_SCHEDSTATS */
e7693a36 2533
1da177e4
LT
2534out_activate:
2535#endif /* CONFIG_SMP */
9ed3811a
TH
2536 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2537 cpu == this_cpu, en_flags);
1da177e4 2538 success = 1;
1da177e4 2539out_running:
9ed3811a 2540 ttwu_post_activation(p, rq, wake_flags, success);
1da177e4
LT
2541out:
2542 task_rq_unlock(rq, &flags);
e9c84311 2543 put_cpu();
1da177e4
LT
2544
2545 return success;
2546}
2547
21aa9af0
TH
2548/**
2549 * try_to_wake_up_local - try to wake up a local task with rq lock held
2550 * @p: the thread to be awakened
2551 *
2552 * Put @p on the run-queue if it's not alredy there. The caller must
2553 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2554 * the current task. this_rq() stays locked over invocation.
2555 */
2556static void try_to_wake_up_local(struct task_struct *p)
2557{
2558 struct rq *rq = task_rq(p);
2559 bool success = false;
2560
2561 BUG_ON(rq != this_rq());
2562 BUG_ON(p == current);
2563 lockdep_assert_held(&rq->lock);
2564
2565 if (!(p->state & TASK_NORMAL))
2566 return;
2567
2568 if (!p->se.on_rq) {
2569 if (likely(!task_running(rq, p))) {
2570 schedstat_inc(rq, ttwu_count);
2571 schedstat_inc(rq, ttwu_local);
2572 }
2573 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2574 success = true;
2575 }
2576 ttwu_post_activation(p, rq, 0, success);
2577}
2578
50fa610a
DH
2579/**
2580 * wake_up_process - Wake up a specific process
2581 * @p: The process to be woken up.
2582 *
2583 * Attempt to wake up the nominated process and move it to the set of runnable
2584 * processes. Returns 1 if the process was woken up, 0 if it was already
2585 * running.
2586 *
2587 * It may be assumed that this function implies a write memory barrier before
2588 * changing the task state if and only if any tasks are woken up.
2589 */
7ad5b3a5 2590int wake_up_process(struct task_struct *p)
1da177e4 2591{
d9514f6c 2592 return try_to_wake_up(p, TASK_ALL, 0);
1da177e4 2593}
1da177e4
LT
2594EXPORT_SYMBOL(wake_up_process);
2595
7ad5b3a5 2596int wake_up_state(struct task_struct *p, unsigned int state)
1da177e4
LT
2597{
2598 return try_to_wake_up(p, state, 0);
2599}
2600
1da177e4
LT
2601/*
2602 * Perform scheduler related setup for a newly forked process p.
2603 * p is forked by current.
dd41f596
IM
2604 *
2605 * __sched_fork() is basic setup used by init_idle() too:
2606 */
2607static void __sched_fork(struct task_struct *p)
2608{
dd41f596
IM
2609 p->se.exec_start = 0;
2610 p->se.sum_exec_runtime = 0;
f6cf891c 2611 p->se.prev_sum_exec_runtime = 0;
6c594c21 2612 p->se.nr_migrations = 0;
6cfb0d5d
IM
2613
2614#ifdef CONFIG_SCHEDSTATS
41acab88 2615 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
6cfb0d5d 2616#endif
476d139c 2617
fa717060 2618 INIT_LIST_HEAD(&p->rt.run_list);
dd41f596 2619 p->se.on_rq = 0;
4a55bd5e 2620 INIT_LIST_HEAD(&p->se.group_node);
476d139c 2621
e107be36
AK
2622#ifdef CONFIG_PREEMPT_NOTIFIERS
2623 INIT_HLIST_HEAD(&p->preempt_notifiers);
2624#endif
dd41f596
IM
2625}
2626
2627/*
2628 * fork()/clone()-time setup:
2629 */
2630void sched_fork(struct task_struct *p, int clone_flags)
2631{
2632 int cpu = get_cpu();
2633
2634 __sched_fork(p);
06b83b5f 2635 /*
0017d735 2636 * We mark the process as running here. This guarantees that
06b83b5f
PZ
2637 * nobody will actually run it, and a signal or other external
2638 * event cannot wake it up and insert it on the runqueue either.
2639 */
0017d735 2640 p->state = TASK_RUNNING;
dd41f596 2641
b9dc29e7
MG
2642 /*
2643 * Revert to default priority/policy on fork if requested.
2644 */
2645 if (unlikely(p->sched_reset_on_fork)) {
f83f9ac2 2646 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
b9dc29e7 2647 p->policy = SCHED_NORMAL;
f83f9ac2
PW
2648 p->normal_prio = p->static_prio;
2649 }
b9dc29e7 2650
6c697bdf
MG
2651 if (PRIO_TO_NICE(p->static_prio) < 0) {
2652 p->static_prio = NICE_TO_PRIO(0);
f83f9ac2 2653 p->normal_prio = p->static_prio;
6c697bdf
MG
2654 set_load_weight(p);
2655 }
2656
b9dc29e7
MG
2657 /*
2658 * We don't need the reset flag anymore after the fork. It has
2659 * fulfilled its duty:
2660 */
2661 p->sched_reset_on_fork = 0;
2662 }
ca94c442 2663
f83f9ac2
PW
2664 /*
2665 * Make sure we do not leak PI boosting priority to the child.
2666 */
2667 p->prio = current->normal_prio;
2668
2ddbf952
HS
2669 if (!rt_prio(p->prio))
2670 p->sched_class = &fair_sched_class;
b29739f9 2671
cd29fe6f
PZ
2672 if (p->sched_class->task_fork)
2673 p->sched_class->task_fork(p);
2674
86951599
PZ
2675 /*
2676 * The child is not yet in the pid-hash so no cgroup attach races,
2677 * and the cgroup is pinned to this child due to cgroup_fork()
2678 * is ran before sched_fork().
2679 *
2680 * Silence PROVE_RCU.
2681 */
2682 rcu_read_lock();
5f3edc1b 2683 set_task_cpu(p, cpu);
86951599 2684 rcu_read_unlock();
5f3edc1b 2685
52f17b6c 2686#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
dd41f596 2687 if (likely(sched_info_on()))
52f17b6c 2688 memset(&p->sched_info, 0, sizeof(p->sched_info));
1da177e4 2689#endif
d6077cb8 2690#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4866cde0
NP
2691 p->oncpu = 0;
2692#endif
1da177e4 2693#ifdef CONFIG_PREEMPT
4866cde0 2694 /* Want to start with kernel preemption disabled. */
a1261f54 2695 task_thread_info(p)->preempt_count = 1;
1da177e4 2696#endif
917b627d
GH
2697 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2698
476d139c 2699 put_cpu();
1da177e4
LT
2700}
2701
2702/*
2703 * wake_up_new_task - wake up a newly created task for the first time.
2704 *
2705 * This function will do some initial scheduler statistics housekeeping
2706 * that must be done for every newly created context, then puts the task
2707 * on the runqueue and wakes it.
2708 */
7ad5b3a5 2709void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1da177e4
LT
2710{
2711 unsigned long flags;
dd41f596 2712 struct rq *rq;
c890692b 2713 int cpu __maybe_unused = get_cpu();
fabf318e
PZ
2714
2715#ifdef CONFIG_SMP
0017d735
PZ
2716 rq = task_rq_lock(p, &flags);
2717 p->state = TASK_WAKING;
2718
fabf318e
PZ
2719 /*
2720 * Fork balancing, do it here and not earlier because:
2721 * - cpus_allowed can change in the fork path
2722 * - any previously selected cpu might disappear through hotplug
2723 *
0017d735
PZ
2724 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2725 * without people poking at ->cpus_allowed.
fabf318e 2726 */
0017d735 2727 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
fabf318e 2728 set_task_cpu(p, cpu);
1da177e4 2729
06b83b5f 2730 p->state = TASK_RUNNING;
0017d735
PZ
2731 task_rq_unlock(rq, &flags);
2732#endif
2733
2734 rq = task_rq_lock(p, &flags);
cd29fe6f 2735 activate_task(rq, p, 0);
27a9da65 2736 trace_sched_wakeup_new(p, 1);
a7558e01 2737 check_preempt_curr(rq, p, WF_FORK);
9a897c5a 2738#ifdef CONFIG_SMP
efbbd05a
PZ
2739 if (p->sched_class->task_woken)
2740 p->sched_class->task_woken(rq, p);
9a897c5a 2741#endif
dd41f596 2742 task_rq_unlock(rq, &flags);
fabf318e 2743 put_cpu();
1da177e4
LT
2744}
2745
e107be36
AK
2746#ifdef CONFIG_PREEMPT_NOTIFIERS
2747
2748/**
80dd99b3 2749 * preempt_notifier_register - tell me when current is being preempted & rescheduled
421cee29 2750 * @notifier: notifier struct to register
e107be36
AK
2751 */
2752void preempt_notifier_register(struct preempt_notifier *notifier)
2753{
2754 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2755}
2756EXPORT_SYMBOL_GPL(preempt_notifier_register);
2757
2758/**
2759 * preempt_notifier_unregister - no longer interested in preemption notifications
421cee29 2760 * @notifier: notifier struct to unregister
e107be36
AK
2761 *
2762 * This is safe to call from within a preemption notifier.
2763 */
2764void preempt_notifier_unregister(struct preempt_notifier *notifier)
2765{
2766 hlist_del(&notifier->link);
2767}
2768EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2769
2770static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2771{
2772 struct preempt_notifier *notifier;
2773 struct hlist_node *node;
2774
2775 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2776 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2777}
2778
2779static void
2780fire_sched_out_preempt_notifiers(struct task_struct *curr,
2781 struct task_struct *next)
2782{
2783 struct preempt_notifier *notifier;
2784 struct hlist_node *node;
2785
2786 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2787 notifier->ops->sched_out(notifier, next);
2788}
2789
6d6bc0ad 2790#else /* !CONFIG_PREEMPT_NOTIFIERS */
e107be36
AK
2791
2792static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2793{
2794}
2795
2796static void
2797fire_sched_out_preempt_notifiers(struct task_struct *curr,
2798 struct task_struct *next)
2799{
2800}
2801
6d6bc0ad 2802#endif /* CONFIG_PREEMPT_NOTIFIERS */
e107be36 2803
4866cde0
NP
2804/**
2805 * prepare_task_switch - prepare to switch tasks
2806 * @rq: the runqueue preparing to switch
421cee29 2807 * @prev: the current task that is being switched out
4866cde0
NP
2808 * @next: the task we are going to switch to.
2809 *
2810 * This is called with the rq lock held and interrupts off. It must
2811 * be paired with a subsequent finish_task_switch after the context
2812 * switch.
2813 *
2814 * prepare_task_switch sets up locking and calls architecture specific
2815 * hooks.
2816 */
e107be36
AK
2817static inline void
2818prepare_task_switch(struct rq *rq, struct task_struct *prev,
2819 struct task_struct *next)
4866cde0 2820{
e107be36 2821 fire_sched_out_preempt_notifiers(prev, next);
4866cde0
NP
2822 prepare_lock_switch(rq, next);
2823 prepare_arch_switch(next);
2824}
2825
1da177e4
LT
2826/**
2827 * finish_task_switch - clean up after a task-switch
344babaa 2828 * @rq: runqueue associated with task-switch
1da177e4
LT
2829 * @prev: the thread we just switched away from.
2830 *
4866cde0
NP
2831 * finish_task_switch must be called after the context switch, paired
2832 * with a prepare_task_switch call before the context switch.
2833 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2834 * and do any other architecture-specific cleanup actions.
1da177e4
LT
2835 *
2836 * Note that we may have delayed dropping an mm in context_switch(). If
41a2d6cf 2837 * so, we finish that here outside of the runqueue lock. (Doing it
1da177e4
LT
2838 * with the lock held can cause deadlocks; see schedule() for
2839 * details.)
2840 */
a9957449 2841static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1da177e4
LT
2842 __releases(rq->lock)
2843{
1da177e4 2844 struct mm_struct *mm = rq->prev_mm;
55a101f8 2845 long prev_state;
1da177e4
LT
2846
2847 rq->prev_mm = NULL;
2848
2849 /*
2850 * A task struct has one reference for the use as "current".
c394cc9f 2851 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
55a101f8
ON
2852 * schedule one last time. The schedule call will never return, and
2853 * the scheduled task must drop that reference.
c394cc9f 2854 * The test for TASK_DEAD must occur while the runqueue locks are
1da177e4
LT
2855 * still held, otherwise prev could be scheduled on another cpu, die
2856 * there before we look at prev->state, and then the reference would
2857 * be dropped twice.
2858 * Manfred Spraul <manfred@colorfullife.com>
2859 */
55a101f8 2860 prev_state = prev->state;
4866cde0 2861 finish_arch_switch(prev);
8381f65d
JI
2862#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2863 local_irq_disable();
2864#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
49f47433 2865 perf_event_task_sched_in(current);
8381f65d
JI
2866#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2867 local_irq_enable();
2868#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
4866cde0 2869 finish_lock_switch(rq, prev);
e8fa1362 2870
e107be36 2871 fire_sched_in_preempt_notifiers(current);
1da177e4
LT
2872 if (mm)
2873 mmdrop(mm);
c394cc9f 2874 if (unlikely(prev_state == TASK_DEAD)) {
c6fd91f0 2875 /*
2876 * Remove function-return probe instances associated with this
2877 * task and put them back on the free list.
9761eea8 2878 */
c6fd91f0 2879 kprobe_flush_task(prev);
1da177e4 2880 put_task_struct(prev);
c6fd91f0 2881 }
1da177e4
LT
2882}
2883
3f029d3c
GH
2884#ifdef CONFIG_SMP
2885
2886/* assumes rq->lock is held */
2887static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2888{
2889 if (prev->sched_class->pre_schedule)
2890 prev->sched_class->pre_schedule(rq, prev);
2891}
2892
2893/* rq->lock is NOT held, but preemption is disabled */
2894static inline void post_schedule(struct rq *rq)
2895{
2896 if (rq->post_schedule) {
2897 unsigned long flags;
2898
05fa785c 2899 raw_spin_lock_irqsave(&rq->lock, flags);
3f029d3c
GH
2900 if (rq->curr->sched_class->post_schedule)
2901 rq->curr->sched_class->post_schedule(rq);
05fa785c 2902 raw_spin_unlock_irqrestore(&rq->lock, flags);
3f029d3c
GH
2903
2904 rq->post_schedule = 0;
2905 }
2906}
2907
2908#else
da19ab51 2909
3f029d3c
GH
2910static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2911{
2912}
2913
2914static inline void post_schedule(struct rq *rq)
2915{
1da177e4
LT
2916}
2917
3f029d3c
GH
2918#endif
2919
1da177e4
LT
2920/**
2921 * schedule_tail - first thing a freshly forked thread must call.
2922 * @prev: the thread we just switched away from.
2923 */
36c8b586 2924asmlinkage void schedule_tail(struct task_struct *prev)
1da177e4
LT
2925 __releases(rq->lock)
2926{
70b97a7f
IM
2927 struct rq *rq = this_rq();
2928
4866cde0 2929 finish_task_switch(rq, prev);
da19ab51 2930
3f029d3c
GH
2931 /*
2932 * FIXME: do we need to worry about rq being invalidated by the
2933 * task_switch?
2934 */
2935 post_schedule(rq);
70b97a7f 2936
4866cde0
NP
2937#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2938 /* In this case, finish_task_switch does not reenable preemption */
2939 preempt_enable();
2940#endif
1da177e4 2941 if (current->set_child_tid)
b488893a 2942 put_user(task_pid_vnr(current), current->set_child_tid);
1da177e4
LT
2943}
2944
2945/*
2946 * context_switch - switch to the new MM and the new
2947 * thread's register state.
2948 */
dd41f596 2949static inline void
70b97a7f 2950context_switch(struct rq *rq, struct task_struct *prev,
36c8b586 2951 struct task_struct *next)
1da177e4 2952{
dd41f596 2953 struct mm_struct *mm, *oldmm;
1da177e4 2954
e107be36 2955 prepare_task_switch(rq, prev, next);
27a9da65 2956 trace_sched_switch(prev, next);
dd41f596
IM
2957 mm = next->mm;
2958 oldmm = prev->active_mm;
9226d125
ZA
2959 /*
2960 * For paravirt, this is coupled with an exit in switch_to to
2961 * combine the page table reload and the switch backend into
2962 * one hypercall.
2963 */
224101ed 2964 arch_start_context_switch(prev);
9226d125 2965
31915ab4 2966 if (!mm) {
1da177e4
LT
2967 next->active_mm = oldmm;
2968 atomic_inc(&oldmm->mm_count);
2969 enter_lazy_tlb(oldmm, next);
2970 } else
2971 switch_mm(oldmm, mm, next);
2972
31915ab4 2973 if (!prev->mm) {
1da177e4 2974 prev->active_mm = NULL;
1da177e4
LT
2975 rq->prev_mm = oldmm;
2976 }
3a5f5e48
IM
2977 /*
2978 * Since the runqueue lock will be released by the next
2979 * task (which is an invalid locking op but in the case
2980 * of the scheduler it's an obvious special-case), so we
2981 * do an early lockdep release here:
2982 */
2983#ifndef __ARCH_WANT_UNLOCKED_CTXSW
8a25d5de 2984 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3a5f5e48 2985#endif
1da177e4
LT
2986
2987 /* Here we just switch the register state and the stack. */
2988 switch_to(prev, next, prev);
2989
dd41f596
IM
2990 barrier();
2991 /*
2992 * this_rq must be evaluated again because prev may have moved
2993 * CPUs since it called schedule(), thus the 'rq' on its stack
2994 * frame will be invalid.
2995 */
2996 finish_task_switch(this_rq(), prev);
1da177e4
LT
2997}
2998
2999/*
3000 * nr_running, nr_uninterruptible and nr_context_switches:
3001 *
3002 * externally visible scheduler statistics: current number of runnable
3003 * threads, current number of uninterruptible-sleeping threads, total
3004 * number of context switches performed since bootup.
3005 */
3006unsigned long nr_running(void)
3007{
3008 unsigned long i, sum = 0;
3009
3010 for_each_online_cpu(i)
3011 sum += cpu_rq(i)->nr_running;
3012
3013 return sum;
f711f609 3014}
1da177e4
LT
3015
3016unsigned long nr_uninterruptible(void)
f711f609 3017{
1da177e4 3018 unsigned long i, sum = 0;
f711f609 3019
0a945022 3020 for_each_possible_cpu(i)
1da177e4 3021 sum += cpu_rq(i)->nr_uninterruptible;
f711f609
GS
3022
3023 /*
1da177e4
LT
3024 * Since we read the counters lockless, it might be slightly
3025 * inaccurate. Do not allow it to go below zero though:
f711f609 3026 */
1da177e4
LT
3027 if (unlikely((long)sum < 0))
3028 sum = 0;
f711f609 3029
1da177e4 3030 return sum;
f711f609 3031}
f711f609 3032
1da177e4 3033unsigned long long nr_context_switches(void)
46cb4b7c 3034{
cc94abfc
SR
3035 int i;
3036 unsigned long long sum = 0;
46cb4b7c 3037
0a945022 3038 for_each_possible_cpu(i)
1da177e4 3039 sum += cpu_rq(i)->nr_switches;
46cb4b7c 3040
1da177e4
LT
3041 return sum;
3042}
483b4ee6 3043
1da177e4
LT
3044unsigned long nr_iowait(void)
3045{
3046 unsigned long i, sum = 0;
483b4ee6 3047
0a945022 3048 for_each_possible_cpu(i)
1da177e4 3049 sum += atomic_read(&cpu_rq(i)->nr_iowait);
46cb4b7c 3050
1da177e4
LT
3051 return sum;
3052}
483b4ee6 3053
8c215bd3 3054unsigned long nr_iowait_cpu(int cpu)
69d25870 3055{
8c215bd3 3056 struct rq *this = cpu_rq(cpu);
69d25870
AV
3057 return atomic_read(&this->nr_iowait);
3058}
46cb4b7c 3059
69d25870
AV
3060unsigned long this_cpu_load(void)
3061{
3062 struct rq *this = this_rq();
3063 return this->cpu_load[0];
3064}
e790fb0b 3065
46cb4b7c 3066
dce48a84
TG
3067/* Variables and functions for calc_load */
3068static atomic_long_t calc_load_tasks;
3069static unsigned long calc_load_update;
3070unsigned long avenrun[3];
3071EXPORT_SYMBOL(avenrun);
46cb4b7c 3072
74f5187a
PZ
3073static long calc_load_fold_active(struct rq *this_rq)
3074{
3075 long nr_active, delta = 0;
3076
3077 nr_active = this_rq->nr_running;
3078 nr_active += (long) this_rq->nr_uninterruptible;
3079
3080 if (nr_active != this_rq->calc_load_active) {
3081 delta = nr_active - this_rq->calc_load_active;
3082 this_rq->calc_load_active = nr_active;
3083 }
3084
3085 return delta;
3086}
3087
3088#ifdef CONFIG_NO_HZ
3089/*
3090 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
3091 *
3092 * When making the ILB scale, we should try to pull this in as well.
3093 */
3094static atomic_long_t calc_load_tasks_idle;
3095
3096static void calc_load_account_idle(struct rq *this_rq)
3097{
3098 long delta;
3099
3100 delta = calc_load_fold_active(this_rq);
3101 if (delta)
3102 atomic_long_add(delta, &calc_load_tasks_idle);
3103}
3104
3105static long calc_load_fold_idle(void)
3106{
3107 long delta = 0;
3108
3109 /*
3110 * Its got a race, we don't care...
3111 */
3112 if (atomic_long_read(&calc_load_tasks_idle))
3113 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3114
3115 return delta;
3116}
3117#else
3118static void calc_load_account_idle(struct rq *this_rq)
3119{
3120}
3121
3122static inline long calc_load_fold_idle(void)
3123{
3124 return 0;
3125}
3126#endif
3127
2d02494f
TG
3128/**
3129 * get_avenrun - get the load average array
3130 * @loads: pointer to dest load array
3131 * @offset: offset to add
3132 * @shift: shift count to shift the result left
3133 *
3134 * These values are estimates at best, so no need for locking.
3135 */
3136void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3137{
3138 loads[0] = (avenrun[0] + offset) << shift;
3139 loads[1] = (avenrun[1] + offset) << shift;
3140 loads[2] = (avenrun[2] + offset) << shift;
46cb4b7c 3141}
46cb4b7c 3142
dce48a84
TG
3143static unsigned long
3144calc_load(unsigned long load, unsigned long exp, unsigned long active)
db1b1fef 3145{
dce48a84
TG
3146 load *= exp;
3147 load += active * (FIXED_1 - exp);
3148 return load >> FSHIFT;
3149}
46cb4b7c
SS
3150
3151/*
dce48a84
TG
3152 * calc_load - update the avenrun load estimates 10 ticks after the
3153 * CPUs have updated calc_load_tasks.
7835b98b 3154 */
dce48a84 3155void calc_global_load(void)
7835b98b 3156{
dce48a84
TG
3157 unsigned long upd = calc_load_update + 10;
3158 long active;
1da177e4 3159
dce48a84
TG
3160 if (time_before(jiffies, upd))
3161 return;
1da177e4 3162
dce48a84
TG
3163 active = atomic_long_read(&calc_load_tasks);
3164 active = active > 0 ? active * FIXED_1 : 0;
1da177e4 3165
dce48a84
TG
3166 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3167 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3168 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
dd41f596 3169
dce48a84
TG
3170 calc_load_update += LOAD_FREQ;
3171}
1da177e4 3172
dce48a84 3173/*
74f5187a
PZ
3174 * Called from update_cpu_load() to periodically update this CPU's
3175 * active count.
dce48a84
TG
3176 */
3177static void calc_load_account_active(struct rq *this_rq)
3178{
74f5187a 3179 long delta;
08c183f3 3180
74f5187a
PZ
3181 if (time_before(jiffies, this_rq->calc_load_update))
3182 return;
783609c6 3183
74f5187a
PZ
3184 delta = calc_load_fold_active(this_rq);
3185 delta += calc_load_fold_idle();
3186 if (delta)
dce48a84 3187 atomic_long_add(delta, &calc_load_tasks);
74f5187a
PZ
3188
3189 this_rq->calc_load_update += LOAD_FREQ;
46cb4b7c
SS
3190}
3191
fdf3e95d
VP
3192/*
3193 * The exact cpuload at various idx values, calculated at every tick would be
3194 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3195 *
3196 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3197 * on nth tick when cpu may be busy, then we have:
3198 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3199 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3200 *
3201 * decay_load_missed() below does efficient calculation of
3202 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3203 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3204 *
3205 * The calculation is approximated on a 128 point scale.
3206 * degrade_zero_ticks is the number of ticks after which load at any
3207 * particular idx is approximated to be zero.
3208 * degrade_factor is a precomputed table, a row for each load idx.
3209 * Each column corresponds to degradation factor for a power of two ticks,
3210 * based on 128 point scale.
3211 * Example:
3212 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3213 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3214 *
3215 * With this power of 2 load factors, we can degrade the load n times
3216 * by looking at 1 bits in n and doing as many mult/shift instead of
3217 * n mult/shifts needed by the exact degradation.
3218 */
3219#define DEGRADE_SHIFT 7
3220static const unsigned char
3221 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3222static const unsigned char
3223 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3224 {0, 0, 0, 0, 0, 0, 0, 0},
3225 {64, 32, 8, 0, 0, 0, 0, 0},
3226 {96, 72, 40, 12, 1, 0, 0},
3227 {112, 98, 75, 43, 15, 1, 0},
3228 {120, 112, 98, 76, 45, 16, 2} };
3229
3230/*
3231 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3232 * would be when CPU is idle and so we just decay the old load without
3233 * adding any new load.
3234 */
3235static unsigned long
3236decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3237{
3238 int j = 0;
3239
3240 if (!missed_updates)
3241 return load;
3242
3243 if (missed_updates >= degrade_zero_ticks[idx])
3244 return 0;
3245
3246 if (idx == 1)
3247 return load >> missed_updates;
3248
3249 while (missed_updates) {
3250 if (missed_updates % 2)
3251 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3252
3253 missed_updates >>= 1;
3254 j++;
3255 }
3256 return load;
3257}
3258
46cb4b7c 3259/*
dd41f596 3260 * Update rq->cpu_load[] statistics. This function is usually called every
fdf3e95d
VP
3261 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3262 * every tick. We fix it up based on jiffies.
46cb4b7c 3263 */
dd41f596 3264static void update_cpu_load(struct rq *this_rq)
46cb4b7c 3265{
495eca49 3266 unsigned long this_load = this_rq->load.weight;
fdf3e95d
VP
3267 unsigned long curr_jiffies = jiffies;
3268 unsigned long pending_updates;
dd41f596 3269 int i, scale;
46cb4b7c 3270
dd41f596 3271 this_rq->nr_load_updates++;
46cb4b7c 3272
fdf3e95d
VP
3273 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3274 if (curr_jiffies == this_rq->last_load_update_tick)
3275 return;
3276
3277 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3278 this_rq->last_load_update_tick = curr_jiffies;
3279
dd41f596 3280 /* Update our load: */
fdf3e95d
VP
3281 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3282 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
dd41f596 3283 unsigned long old_load, new_load;
7d1e6a9b 3284
dd41f596 3285 /* scale is effectively 1 << i now, and >> i divides by scale */
46cb4b7c 3286
dd41f596 3287 old_load = this_rq->cpu_load[i];
fdf3e95d 3288 old_load = decay_load_missed(old_load, pending_updates - 1, i);
dd41f596 3289 new_load = this_load;
a25707f3
IM
3290 /*
3291 * Round up the averaging division if load is increasing. This
3292 * prevents us from getting stuck on 9 if the load is 10, for
3293 * example.
3294 */
3295 if (new_load > old_load)
fdf3e95d
VP
3296 new_load += scale - 1;
3297
3298 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
dd41f596 3299 }
da2b71ed
SS
3300
3301 sched_avg_update(this_rq);
fdf3e95d
VP
3302}
3303
3304static void update_cpu_load_active(struct rq *this_rq)
3305{
3306 update_cpu_load(this_rq);
46cb4b7c 3307
74f5187a 3308 calc_load_account_active(this_rq);
46cb4b7c
SS
3309}
3310
dd41f596 3311#ifdef CONFIG_SMP
8a0be9ef 3312
46cb4b7c 3313/*
38022906
PZ
3314 * sched_exec - execve() is a valuable balancing opportunity, because at
3315 * this point the task has the smallest effective memory and cache footprint.
46cb4b7c 3316 */
38022906 3317void sched_exec(void)
46cb4b7c 3318{
38022906 3319 struct task_struct *p = current;
1da177e4 3320 unsigned long flags;
70b97a7f 3321 struct rq *rq;
0017d735 3322 int dest_cpu;
46cb4b7c 3323
1da177e4 3324 rq = task_rq_lock(p, &flags);
0017d735
PZ
3325 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3326 if (dest_cpu == smp_processor_id())
3327 goto unlock;
38022906 3328
46cb4b7c 3329 /*
38022906 3330 * select_task_rq() can race against ->cpus_allowed
46cb4b7c 3331 */
30da688e 3332 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
969c7921
TH
3333 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3334 struct migration_arg arg = { p, dest_cpu };
46cb4b7c 3335
1da177e4 3336 task_rq_unlock(rq, &flags);
969c7921 3337 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1da177e4
LT
3338 return;
3339 }
0017d735 3340unlock:
1da177e4 3341 task_rq_unlock(rq, &flags);
1da177e4 3342}
dd41f596 3343
1da177e4
LT
3344#endif
3345
1da177e4
LT
3346DEFINE_PER_CPU(struct kernel_stat, kstat);
3347
3348EXPORT_PER_CPU_SYMBOL(kstat);
3349
3350/*
c5f8d995 3351 * Return any ns on the sched_clock that have not yet been accounted in
f06febc9 3352 * @p in case that task is currently running.
c5f8d995
HS
3353 *
3354 * Called with task_rq_lock() held on @rq.
1da177e4 3355 */
c5f8d995
HS
3356static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3357{
3358 u64 ns = 0;
3359
3360 if (task_current(rq, p)) {
3361 update_rq_clock(rq);
305e6835 3362 ns = rq->clock_task - p->se.exec_start;
c5f8d995
HS
3363 if ((s64)ns < 0)
3364 ns = 0;
3365 }
3366
3367 return ns;
3368}
3369
bb34d92f 3370unsigned long long task_delta_exec(struct task_struct *p)
1da177e4 3371{
1da177e4 3372 unsigned long flags;
41b86e9c 3373 struct rq *rq;
bb34d92f 3374 u64 ns = 0;
48f24c4d 3375
41b86e9c 3376 rq = task_rq_lock(p, &flags);
c5f8d995
HS
3377 ns = do_task_delta_exec(p, rq);
3378 task_rq_unlock(rq, &flags);
1508487e 3379
c5f8d995
HS
3380 return ns;
3381}
f06febc9 3382
c5f8d995
HS
3383/*
3384 * Return accounted runtime for the task.
3385 * In case the task is currently running, return the runtime plus current's
3386 * pending runtime that have not been accounted yet.
3387 */
3388unsigned long long task_sched_runtime(struct task_struct *p)
3389{
3390 unsigned long flags;
3391 struct rq *rq;
3392 u64 ns = 0;
3393
3394 rq = task_rq_lock(p, &flags);
3395 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3396 task_rq_unlock(rq, &flags);
3397
3398 return ns;
3399}
48f24c4d 3400
c5f8d995
HS
3401/*
3402 * Return sum_exec_runtime for the thread group.
3403 * In case the task is currently running, return the sum plus current's
3404 * pending runtime that have not been accounted yet.
3405 *
3406 * Note that the thread group might have other running tasks as well,
3407 * so the return value not includes other pending runtime that other
3408 * running tasks might have.
3409 */
3410unsigned long long thread_group_sched_runtime(struct task_struct *p)
3411{
3412 struct task_cputime totals;
3413 unsigned long flags;
3414 struct rq *rq;
3415 u64 ns;
3416
3417 rq = task_rq_lock(p, &flags);
3418 thread_group_cputime(p, &totals);
3419 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
41b86e9c 3420 task_rq_unlock(rq, &flags);
48f24c4d 3421
1da177e4
LT
3422 return ns;
3423}
3424
1da177e4
LT
3425/*
3426 * Account user cpu time to a process.
3427 * @p: the process that the cpu time gets accounted to
1da177e4 3428 * @cputime: the cpu time spent in user space since the last update
457533a7 3429 * @cputime_scaled: cputime scaled by cpu frequency
1da177e4 3430 */
457533a7
MS
3431void account_user_time(struct task_struct *p, cputime_t cputime,
3432 cputime_t cputime_scaled)
1da177e4
LT
3433{
3434 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3435 cputime64_t tmp;
3436
457533a7 3437 /* Add user time to process. */
1da177e4 3438 p->utime = cputime_add(p->utime, cputime);
457533a7 3439 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
f06febc9 3440 account_group_user_time(p, cputime);
1da177e4
LT
3441
3442 /* Add user time to cpustat. */
3443 tmp = cputime_to_cputime64(cputime);
3444 if (TASK_NICE(p) > 0)
3445 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3446 else
3447 cpustat->user = cputime64_add(cpustat->user, tmp);
ef12fefa
BR
3448
3449 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
49b5cf34
JL
3450 /* Account for user time used */
3451 acct_update_integrals(p);
1da177e4
LT
3452}
3453
94886b84
LV
3454/*
3455 * Account guest cpu time to a process.
3456 * @p: the process that the cpu time gets accounted to
3457 * @cputime: the cpu time spent in virtual machine since the last update
457533a7 3458 * @cputime_scaled: cputime scaled by cpu frequency
94886b84 3459 */
457533a7
MS
3460static void account_guest_time(struct task_struct *p, cputime_t cputime,
3461 cputime_t cputime_scaled)
94886b84
LV
3462{
3463 cputime64_t tmp;
3464 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3465
3466 tmp = cputime_to_cputime64(cputime);
3467
457533a7 3468 /* Add guest time to process. */
94886b84 3469 p->utime = cputime_add(p->utime, cputime);
457533a7 3470 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
f06febc9 3471 account_group_user_time(p, cputime);
94886b84
LV
3472 p->gtime = cputime_add(p->gtime, cputime);
3473
457533a7 3474 /* Add guest time to cpustat. */
ce0e7b28
RO
3475 if (TASK_NICE(p) > 0) {
3476 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3477 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3478 } else {
3479 cpustat->user = cputime64_add(cpustat->user, tmp);
3480 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3481 }
94886b84
LV
3482}
3483
1da177e4
LT
3484/*
3485 * Account system cpu time to a process.
3486 * @p: the process that the cpu time gets accounted to
3487 * @hardirq_offset: the offset to subtract from hardirq_count()
3488 * @cputime: the cpu time spent in kernel space since the last update
457533a7 3489 * @cputime_scaled: cputime scaled by cpu frequency
1da177e4
LT
3490 */
3491void account_system_time(struct task_struct *p, int hardirq_offset,
457533a7 3492 cputime_t cputime, cputime_t cputime_scaled)
1da177e4
LT
3493{
3494 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1da177e4
LT
3495 cputime64_t tmp;
3496
983ed7a6 3497 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
457533a7 3498 account_guest_time(p, cputime, cputime_scaled);
983ed7a6
HH
3499 return;
3500 }
94886b84 3501
457533a7 3502 /* Add system time to process. */
1da177e4 3503 p->stime = cputime_add(p->stime, cputime);
457533a7 3504 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
f06febc9 3505 account_group_system_time(p, cputime);
1da177e4
LT
3506
3507 /* Add system time to cpustat. */
3508 tmp = cputime_to_cputime64(cputime);
3509 if (hardirq_count() - hardirq_offset)
3510 cpustat->irq = cputime64_add(cpustat->irq, tmp);
75e1056f 3511 else if (in_serving_softirq())
1da177e4 3512 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
1da177e4 3513 else
79741dd3
MS
3514 cpustat->system = cputime64_add(cpustat->system, tmp);
3515
ef12fefa
BR
3516 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3517
1da177e4
LT
3518 /* Account for system time used */
3519 acct_update_integrals(p);
1da177e4
LT
3520}
3521
c66f08be 3522/*
1da177e4 3523 * Account for involuntary wait time.
1da177e4 3524 * @steal: the cpu time spent in involuntary wait
c66f08be 3525 */
79741dd3 3526void account_steal_time(cputime_t cputime)
c66f08be 3527{
79741dd3
MS
3528 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3529 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3530
3531 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
c66f08be
MN
3532}
3533
1da177e4 3534/*
79741dd3
MS
3535 * Account for idle time.
3536 * @cputime: the cpu time spent in idle wait
1da177e4 3537 */
79741dd3 3538void account_idle_time(cputime_t cputime)
1da177e4
LT
3539{
3540 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
79741dd3 3541 cputime64_t cputime64 = cputime_to_cputime64(cputime);
70b97a7f 3542 struct rq *rq = this_rq();
1da177e4 3543
79741dd3
MS
3544 if (atomic_read(&rq->nr_iowait) > 0)
3545 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3546 else
3547 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
1da177e4
LT
3548}
3549
79741dd3
MS
3550#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3551
3552/*
3553 * Account a single tick of cpu time.
3554 * @p: the process that the cpu time gets accounted to
3555 * @user_tick: indicates if the tick is a user or a system tick
3556 */
3557void account_process_tick(struct task_struct *p, int user_tick)
3558{
a42548a1 3559 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
79741dd3
MS
3560 struct rq *rq = this_rq();
3561
3562 if (user_tick)
a42548a1 3563 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
f5f293a4 3564 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
a42548a1 3565 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
79741dd3
MS
3566 one_jiffy_scaled);
3567 else
a42548a1 3568 account_idle_time(cputime_one_jiffy);
79741dd3
MS
3569}
3570
3571/*
3572 * Account multiple ticks of steal time.
3573 * @p: the process from which the cpu time has been stolen
3574 * @ticks: number of stolen ticks
3575 */
3576void account_steal_ticks(unsigned long ticks)
3577{
3578 account_steal_time(jiffies_to_cputime(ticks));
3579}
3580
3581/*
3582 * Account multiple ticks of idle time.
3583 * @ticks: number of stolen ticks
3584 */
3585void account_idle_ticks(unsigned long ticks)
3586{
3587 account_idle_time(jiffies_to_cputime(ticks));
1da177e4
LT
3588}
3589
79741dd3
MS
3590#endif
3591
49048622
BS
3592/*
3593 * Use precise platform statistics if available:
3594 */
3595#ifdef CONFIG_VIRT_CPU_ACCOUNTING
d180c5bc 3596void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
49048622 3597{
d99ca3b9
HS
3598 *ut = p->utime;
3599 *st = p->stime;
49048622
BS
3600}
3601
0cf55e1e 3602void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
49048622 3603{
0cf55e1e
HS
3604 struct task_cputime cputime;
3605
3606 thread_group_cputime(p, &cputime);
3607
3608 *ut = cputime.utime;
3609 *st = cputime.stime;
49048622
BS
3610}
3611#else
761b1d26
HS
3612
3613#ifndef nsecs_to_cputime
b7b20df9 3614# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
761b1d26
HS
3615#endif
3616
d180c5bc 3617void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
49048622 3618{
d99ca3b9 3619 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
49048622
BS
3620
3621 /*
3622 * Use CFS's precise accounting:
3623 */
d180c5bc 3624 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
49048622
BS
3625
3626 if (total) {
e75e863d 3627 u64 temp = rtime;
d180c5bc 3628
e75e863d 3629 temp *= utime;
49048622 3630 do_div(temp, total);
d180c5bc
HS
3631 utime = (cputime_t)temp;
3632 } else
3633 utime = rtime;
49048622 3634
d180c5bc
HS
3635 /*
3636 * Compare with previous values, to keep monotonicity:
3637 */
761b1d26 3638 p->prev_utime = max(p->prev_utime, utime);
d99ca3b9 3639 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
49048622 3640
d99ca3b9
HS
3641 *ut = p->prev_utime;
3642 *st = p->prev_stime;
49048622
BS
3643}
3644
0cf55e1e
HS
3645/*
3646 * Must be called with siglock held.
3647 */
3648void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
49048622 3649{
0cf55e1e
HS
3650 struct signal_struct *sig = p->signal;
3651 struct task_cputime cputime;
3652 cputime_t rtime, utime, total;
49048622 3653
0cf55e1e 3654 thread_group_cputime(p, &cputime);
49048622 3655
0cf55e1e
HS
3656 total = cputime_add(cputime.utime, cputime.stime);
3657 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
49048622 3658
0cf55e1e 3659 if (total) {
e75e863d 3660 u64 temp = rtime;
49048622 3661
e75e863d 3662 temp *= cputime.utime;
0cf55e1e
HS
3663 do_div(temp, total);
3664 utime = (cputime_t)temp;
3665 } else
3666 utime = rtime;
3667
3668 sig->prev_utime = max(sig->prev_utime, utime);
3669 sig->prev_stime = max(sig->prev_stime,
3670 cputime_sub(rtime, sig->prev_utime));
3671
3672 *ut = sig->prev_utime;
3673 *st = sig->prev_stime;
49048622 3674}
49048622 3675#endif
49048622 3676
7835b98b
CL
3677/*
3678 * This function gets called by the timer code, with HZ frequency.
3679 * We call it with interrupts disabled.
3680 *
3681 * It also gets called by the fork code, when changing the parent's
3682 * timeslices.
3683 */
3684void scheduler_tick(void)
3685{
7835b98b
CL
3686 int cpu = smp_processor_id();
3687 struct rq *rq = cpu_rq(cpu);
dd41f596 3688 struct task_struct *curr = rq->curr;
3e51f33f
PZ
3689
3690 sched_clock_tick();
dd41f596 3691
05fa785c 3692 raw_spin_lock(&rq->lock);
3e51f33f 3693 update_rq_clock(rq);
fdf3e95d 3694 update_cpu_load_active(rq);
fa85ae24 3695 curr->sched_class->task_tick(rq, curr, 0);
05fa785c 3696 raw_spin_unlock(&rq->lock);
7835b98b 3697
49f47433 3698 perf_event_task_tick(curr);
e220d2dc 3699
e418e1c2 3700#ifdef CONFIG_SMP
dd41f596
IM
3701 rq->idle_at_tick = idle_cpu(cpu);
3702 trigger_load_balance(rq, cpu);
e418e1c2 3703#endif
1da177e4
LT
3704}
3705
132380a0 3706notrace unsigned long get_parent_ip(unsigned long addr)
6cd8a4bb
SR
3707{
3708 if (in_lock_functions(addr)) {
3709 addr = CALLER_ADDR2;
3710 if (in_lock_functions(addr))
3711 addr = CALLER_ADDR3;
3712 }
3713 return addr;
3714}
1da177e4 3715
7e49fcce
SR
3716#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3717 defined(CONFIG_PREEMPT_TRACER))
3718
43627582 3719void __kprobes add_preempt_count(int val)
1da177e4 3720{
6cd8a4bb 3721#ifdef CONFIG_DEBUG_PREEMPT
1da177e4
LT
3722 /*
3723 * Underflow?
3724 */
9a11b49a
IM
3725 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3726 return;
6cd8a4bb 3727#endif
1da177e4 3728 preempt_count() += val;
6cd8a4bb 3729#ifdef CONFIG_DEBUG_PREEMPT
1da177e4
LT
3730 /*
3731 * Spinlock count overflowing soon?
3732 */
33859f7f
MOS
3733 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3734 PREEMPT_MASK - 10);
6cd8a4bb
SR
3735#endif
3736 if (preempt_count() == val)
3737 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
1da177e4
LT
3738}
3739EXPORT_SYMBOL(add_preempt_count);
3740
43627582 3741void __kprobes sub_preempt_count(int val)
1da177e4 3742{
6cd8a4bb 3743#ifdef CONFIG_DEBUG_PREEMPT
1da177e4
LT
3744 /*
3745 * Underflow?
3746 */
01e3eb82 3747 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
9a11b49a 3748 return;
1da177e4
LT
3749 /*
3750 * Is the spinlock portion underflowing?
3751 */
9a11b49a
IM
3752 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3753 !(preempt_count() & PREEMPT_MASK)))
3754 return;
6cd8a4bb 3755#endif
9a11b49a 3756
6cd8a4bb
SR
3757 if (preempt_count() == val)
3758 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
1da177e4
LT
3759 preempt_count() -= val;
3760}
3761EXPORT_SYMBOL(sub_preempt_count);
3762
3763#endif
3764
3765/*
dd41f596 3766 * Print scheduling while atomic bug:
1da177e4 3767 */
dd41f596 3768static noinline void __schedule_bug(struct task_struct *prev)
1da177e4 3769{
838225b4
SS
3770 struct pt_regs *regs = get_irq_regs();
3771
3df0fc5b
PZ
3772 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3773 prev->comm, prev->pid, preempt_count());
838225b4 3774
dd41f596 3775 debug_show_held_locks(prev);
e21f5b15 3776 print_modules();
dd41f596
IM
3777 if (irqs_disabled())
3778 print_irqtrace_events(prev);
838225b4
SS
3779
3780 if (regs)
3781 show_regs(regs);
3782 else
3783 dump_stack();
dd41f596 3784}
1da177e4 3785
dd41f596
IM
3786/*
3787 * Various schedule()-time debugging checks and statistics:
3788 */
3789static inline void schedule_debug(struct task_struct *prev)
3790{
1da177e4 3791 /*
41a2d6cf 3792 * Test if we are atomic. Since do_exit() needs to call into
1da177e4
LT
3793 * schedule() atomically, we ignore that path for now.
3794 * Otherwise, whine if we are scheduling when we should not be.
3795 */
3f33a7ce 3796 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
dd41f596
IM
3797 __schedule_bug(prev);
3798
1da177e4
LT
3799 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3800
2d72376b 3801 schedstat_inc(this_rq(), sched_count);
b8efb561
IM
3802#ifdef CONFIG_SCHEDSTATS
3803 if (unlikely(prev->lock_depth >= 0)) {
2d72376b
IM
3804 schedstat_inc(this_rq(), bkl_count);
3805 schedstat_inc(prev, sched_info.bkl_count);
b8efb561
IM
3806 }
3807#endif
dd41f596
IM
3808}
3809
6cecd084 3810static void put_prev_task(struct rq *rq, struct task_struct *prev)
df1c99d4 3811{
a64692a3
MG
3812 if (prev->se.on_rq)
3813 update_rq_clock(rq);
3814 rq->skip_clock_update = 0;
6cecd084 3815 prev->sched_class->put_prev_task(rq, prev);
df1c99d4
MG
3816}
3817
dd41f596
IM
3818/*
3819 * Pick up the highest-prio task:
3820 */
3821static inline struct task_struct *
b67802ea 3822pick_next_task(struct rq *rq)
dd41f596 3823{
5522d5d5 3824 const struct sched_class *class;
dd41f596 3825 struct task_struct *p;
1da177e4
LT
3826
3827 /*
dd41f596
IM
3828 * Optimization: we know that if all tasks are in
3829 * the fair class we can call that function directly:
1da177e4 3830 */
dd41f596 3831 if (likely(rq->nr_running == rq->cfs.nr_running)) {
fb8d4724 3832 p = fair_sched_class.pick_next_task(rq);
dd41f596
IM
3833 if (likely(p))
3834 return p;
1da177e4
LT
3835 }
3836
34f971f6 3837 for_each_class(class) {
fb8d4724 3838 p = class->pick_next_task(rq);
dd41f596
IM
3839 if (p)
3840 return p;
dd41f596 3841 }
34f971f6
PZ
3842
3843 BUG(); /* the idle class will always have a runnable task */
dd41f596 3844}
1da177e4 3845
dd41f596
IM
3846/*
3847 * schedule() is the main scheduler function.
3848 */
ff743345 3849asmlinkage void __sched schedule(void)
dd41f596
IM
3850{
3851 struct task_struct *prev, *next;
67ca7bde 3852 unsigned long *switch_count;
dd41f596 3853 struct rq *rq;
31656519 3854 int cpu;
dd41f596 3855
ff743345
PZ
3856need_resched:
3857 preempt_disable();
dd41f596
IM
3858 cpu = smp_processor_id();
3859 rq = cpu_rq(cpu);
25502a6c 3860 rcu_note_context_switch(cpu);
dd41f596 3861 prev = rq->curr;
dd41f596
IM
3862
3863 release_kernel_lock(prev);
3864need_resched_nonpreemptible:
3865
3866 schedule_debug(prev);
1da177e4 3867
31656519 3868 if (sched_feat(HRTICK))
f333fdc9 3869 hrtick_clear(rq);
8f4d37ec 3870
05fa785c 3871 raw_spin_lock_irq(&rq->lock);
1e819950 3872 clear_tsk_need_resched(prev);
1da177e4 3873
246d86b5 3874 switch_count = &prev->nivcsw;
1da177e4 3875 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
21aa9af0 3876 if (unlikely(signal_pending_state(prev->state, prev))) {
1da177e4 3877 prev->state = TASK_RUNNING;
21aa9af0
TH
3878 } else {
3879 /*
3880 * If a worker is going to sleep, notify and
3881 * ask workqueue whether it wants to wake up a
3882 * task to maintain concurrency. If so, wake
3883 * up the task.
3884 */
3885 if (prev->flags & PF_WQ_WORKER) {
3886 struct task_struct *to_wakeup;
3887
3888 to_wakeup = wq_worker_sleeping(prev, cpu);
3889 if (to_wakeup)
3890 try_to_wake_up_local(to_wakeup);
3891 }
371fd7e7 3892 deactivate_task(rq, prev, DEQUEUE_SLEEP);
21aa9af0 3893 }
dd41f596 3894 switch_count = &prev->nvcsw;
1da177e4
LT
3895 }
3896
3f029d3c 3897 pre_schedule(rq, prev);
f65eda4f 3898
dd41f596 3899 if (unlikely(!rq->nr_running))
1da177e4 3900 idle_balance(cpu, rq);
1da177e4 3901
df1c99d4 3902 put_prev_task(rq, prev);
b67802ea 3903 next = pick_next_task(rq);
1da177e4 3904
1da177e4 3905 if (likely(prev != next)) {
673a90a1 3906 sched_info_switch(prev, next);
49f47433 3907 perf_event_task_sched_out(prev, next);
673a90a1 3908
1da177e4
LT
3909 rq->nr_switches++;
3910 rq->curr = next;
3911 ++*switch_count;
3912
dd41f596 3913 context_switch(rq, prev, next); /* unlocks the rq */
8f4d37ec 3914 /*
246d86b5
ON
3915 * The context switch have flipped the stack from under us
3916 * and restored the local variables which were saved when
3917 * this task called schedule() in the past. prev == current
3918 * is still correct, but it can be moved to another cpu/rq.
8f4d37ec
PZ
3919 */
3920 cpu = smp_processor_id();
3921 rq = cpu_rq(cpu);
1da177e4 3922 } else
05fa785c 3923 raw_spin_unlock_irq(&rq->lock);
1da177e4 3924
3f029d3c 3925 post_schedule(rq);
1da177e4 3926
246d86b5 3927 if (unlikely(reacquire_kernel_lock(prev)))
1da177e4 3928 goto need_resched_nonpreemptible;
8f4d37ec 3929
1da177e4 3930 preempt_enable_no_resched();
ff743345 3931 if (need_resched())
1da177e4
LT
3932 goto need_resched;
3933}
1da177e4
LT
3934EXPORT_SYMBOL(schedule);
3935
c08f7829 3936#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
0d66bf6d
PZ
3937/*
3938 * Look out! "owner" is an entirely speculative pointer
3939 * access and not reliable.
3940 */
3941int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3942{
3943 unsigned int cpu;
3944 struct rq *rq;
3945
3946 if (!sched_feat(OWNER_SPIN))
3947 return 0;
3948
3949#ifdef CONFIG_DEBUG_PAGEALLOC
3950 /*
3951 * Need to access the cpu field knowing that
3952 * DEBUG_PAGEALLOC could have unmapped it if
3953 * the mutex owner just released it and exited.
3954 */
3955 if (probe_kernel_address(&owner->cpu, cpu))
4b402210 3956 return 0;
0d66bf6d
PZ
3957#else
3958 cpu = owner->cpu;
3959#endif
3960
3961 /*
3962 * Even if the access succeeded (likely case),
3963 * the cpu field may no longer be valid.
3964 */
3965 if (cpu >= nr_cpumask_bits)
4b402210 3966 return 0;
0d66bf6d
PZ
3967
3968 /*
3969 * We need to validate that we can do a
3970 * get_cpu() and that we have the percpu area.
3971 */
3972 if (!cpu_online(cpu))
4b402210 3973 return 0;
0d66bf6d
PZ
3974
3975 rq = cpu_rq(cpu);
3976
3977 for (;;) {
3978 /*
3979 * Owner changed, break to re-assess state.
3980 */
9d0f4dcc
TC
3981 if (lock->owner != owner) {
3982 /*
3983 * If the lock has switched to a different owner,
3984 * we likely have heavy contention. Return 0 to quit
3985 * optimistic spinning and not contend further:
3986 */
3987 if (lock->owner)
3988 return 0;
0d66bf6d 3989 break;
9d0f4dcc 3990 }
0d66bf6d
PZ
3991
3992 /*
3993 * Is that owner really running on that cpu?
3994 */
3995 if (task_thread_info(rq->curr) != owner || need_resched())
3996 return 0;
3997
3998 cpu_relax();
3999 }
4b402210 4000
0d66bf6d
PZ
4001 return 1;
4002}
4003#endif
4004
1da177e4
LT
4005#ifdef CONFIG_PREEMPT
4006/*
2ed6e34f 4007 * this is the entry point to schedule() from in-kernel preemption
41a2d6cf 4008 * off of preempt_enable. Kernel preemptions off return from interrupt
1da177e4
LT
4009 * occur there and call schedule directly.
4010 */
d1f74e20 4011asmlinkage void __sched notrace preempt_schedule(void)
1da177e4
LT
4012{
4013 struct thread_info *ti = current_thread_info();
6478d880 4014
1da177e4
LT
4015 /*
4016 * If there is a non-zero preempt_count or interrupts are disabled,
41a2d6cf 4017 * we do not want to preempt the current task. Just return..
1da177e4 4018 */
beed33a8 4019 if (likely(ti->preempt_count || irqs_disabled()))
1da177e4
LT
4020 return;
4021
3a5c359a 4022 do {
d1f74e20 4023 add_preempt_count_notrace(PREEMPT_ACTIVE);
3a5c359a 4024 schedule();
d1f74e20 4025 sub_preempt_count_notrace(PREEMPT_ACTIVE);
1da177e4 4026
3a5c359a
AK
4027 /*
4028 * Check again in case we missed a preemption opportunity
4029 * between schedule and now.
4030 */
4031 barrier();
5ed0cec0 4032 } while (need_resched());
1da177e4 4033}
1da177e4
LT
4034EXPORT_SYMBOL(preempt_schedule);
4035
4036/*
2ed6e34f 4037 * this is the entry point to schedule() from kernel preemption
1da177e4
LT
4038 * off of irq context.
4039 * Note, that this is called and return with irqs disabled. This will
4040 * protect us against recursive calling from irq.
4041 */
4042asmlinkage void __sched preempt_schedule_irq(void)
4043{
4044 struct thread_info *ti = current_thread_info();
6478d880 4045
2ed6e34f 4046 /* Catch callers which need to be fixed */
1da177e4
LT
4047 BUG_ON(ti->preempt_count || !irqs_disabled());
4048
3a5c359a
AK
4049 do {
4050 add_preempt_count(PREEMPT_ACTIVE);
3a5c359a
AK
4051 local_irq_enable();
4052 schedule();
4053 local_irq_disable();
3a5c359a 4054 sub_preempt_count(PREEMPT_ACTIVE);
1da177e4 4055
3a5c359a
AK
4056 /*
4057 * Check again in case we missed a preemption opportunity
4058 * between schedule and now.
4059 */
4060 barrier();
5ed0cec0 4061 } while (need_resched());
1da177e4
LT
4062}
4063
4064#endif /* CONFIG_PREEMPT */
4065
63859d4f 4066int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
95cdf3b7 4067 void *key)
1da177e4 4068{
63859d4f 4069 return try_to_wake_up(curr->private, mode, wake_flags);
1da177e4 4070}
1da177e4
LT
4071EXPORT_SYMBOL(default_wake_function);
4072
4073/*
41a2d6cf
IM
4074 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
4075 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
1da177e4
LT
4076 * number) then we wake all the non-exclusive tasks and one exclusive task.
4077 *
4078 * There are circumstances in which we can try to wake a task which has already
41a2d6cf 4079 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
1da177e4
LT
4080 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4081 */
78ddb08f 4082static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
63859d4f 4083 int nr_exclusive, int wake_flags, void *key)
1da177e4 4084{
2e45874c 4085 wait_queue_t *curr, *next;
1da177e4 4086
2e45874c 4087 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
48f24c4d
IM
4088 unsigned flags = curr->flags;
4089
63859d4f 4090 if (curr->func(curr, mode, wake_flags, key) &&
48f24c4d 4091 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
1da177e4
LT
4092 break;
4093 }
4094}
4095
4096/**
4097 * __wake_up - wake up threads blocked on a waitqueue.
4098 * @q: the waitqueue
4099 * @mode: which threads
4100 * @nr_exclusive: how many wake-one or wake-many threads to wake up
67be2dd1 4101 * @key: is directly passed to the wakeup function
50fa610a
DH
4102 *
4103 * It may be assumed that this function implies a write memory barrier before
4104 * changing the task state if and only if any tasks are woken up.
1da177e4 4105 */
7ad5b3a5 4106void __wake_up(wait_queue_head_t *q, unsigned int mode,
95cdf3b7 4107 int nr_exclusive, void *key)
1da177e4
LT
4108{
4109 unsigned long flags;
4110
4111 spin_lock_irqsave(&q->lock, flags);
4112 __wake_up_common(q, mode, nr_exclusive, 0, key);
4113 spin_unlock_irqrestore(&q->lock, flags);
4114}
1da177e4
LT
4115EXPORT_SYMBOL(__wake_up);
4116
4117/*
4118 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4119 */
7ad5b3a5 4120void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
1da177e4
LT
4121{
4122 __wake_up_common(q, mode, 1, 0, NULL);
4123}
22c43c81 4124EXPORT_SYMBOL_GPL(__wake_up_locked);
1da177e4 4125
4ede816a
DL
4126void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4127{
4128 __wake_up_common(q, mode, 1, 0, key);
4129}
4130
1da177e4 4131/**
4ede816a 4132 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
1da177e4
LT
4133 * @q: the waitqueue
4134 * @mode: which threads
4135 * @nr_exclusive: how many wake-one or wake-many threads to wake up
4ede816a 4136 * @key: opaque value to be passed to wakeup targets
1da177e4
LT
4137 *
4138 * The sync wakeup differs that the waker knows that it will schedule
4139 * away soon, so while the target thread will be woken up, it will not
4140 * be migrated to another CPU - ie. the two threads are 'synchronized'
4141 * with each other. This can prevent needless bouncing between CPUs.
4142 *
4143 * On UP it can prevent extra preemption.
50fa610a
DH
4144 *
4145 * It may be assumed that this function implies a write memory barrier before
4146 * changing the task state if and only if any tasks are woken up.
1da177e4 4147 */
4ede816a
DL
4148void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4149 int nr_exclusive, void *key)
1da177e4
LT
4150{
4151 unsigned long flags;
7d478721 4152 int wake_flags = WF_SYNC;
1da177e4
LT
4153
4154 if (unlikely(!q))
4155 return;
4156
4157 if (unlikely(!nr_exclusive))
7d478721 4158 wake_flags = 0;
1da177e4
LT
4159
4160 spin_lock_irqsave(&q->lock, flags);
7d478721 4161 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
1da177e4
LT
4162 spin_unlock_irqrestore(&q->lock, flags);
4163}
4ede816a
DL
4164EXPORT_SYMBOL_GPL(__wake_up_sync_key);
4165
4166/*
4167 * __wake_up_sync - see __wake_up_sync_key()
4168 */
4169void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4170{
4171 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
4172}
1da177e4
LT
4173EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4174
65eb3dc6
KD
4175/**
4176 * complete: - signals a single thread waiting on this completion
4177 * @x: holds the state of this particular completion
4178 *
4179 * This will wake up a single thread waiting on this completion. Threads will be
4180 * awakened in the same order in which they were queued.
4181 *
4182 * See also complete_all(), wait_for_completion() and related routines.
50fa610a
DH
4183 *
4184 * It may be assumed that this function implies a write memory barrier before
4185 * changing the task state if and only if any tasks are woken up.
65eb3dc6 4186 */
b15136e9 4187void complete(struct completion *x)
1da177e4
LT
4188{
4189 unsigned long flags;
4190
4191 spin_lock_irqsave(&x->wait.lock, flags);
4192 x->done++;
d9514f6c 4193 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
1da177e4
LT
4194 spin_unlock_irqrestore(&x->wait.lock, flags);
4195}
4196EXPORT_SYMBOL(complete);
4197
65eb3dc6
KD
4198/**
4199 * complete_all: - signals all threads waiting on this completion
4200 * @x: holds the state of this particular completion
4201 *
4202 * This will wake up all threads waiting on this particular completion event.
50fa610a
DH
4203 *
4204 * It may be assumed that this function implies a write memory barrier before
4205 * changing the task state if and only if any tasks are woken up.
65eb3dc6 4206 */
b15136e9 4207void complete_all(struct completion *x)
1da177e4
LT
4208{
4209 unsigned long flags;
4210
4211 spin_lock_irqsave(&x->wait.lock, flags);
4212 x->done += UINT_MAX/2;
d9514f6c 4213 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
1da177e4
LT
4214 spin_unlock_irqrestore(&x->wait.lock, flags);
4215}
4216EXPORT_SYMBOL(complete_all);
4217
8cbbe86d
AK
4218static inline long __sched
4219do_wait_for_common(struct completion *x, long timeout, int state)
1da177e4 4220{
1da177e4
LT
4221 if (!x->done) {
4222 DECLARE_WAITQUEUE(wait, current);
4223
a93d2f17 4224 __add_wait_queue_tail_exclusive(&x->wait, &wait);
1da177e4 4225 do {
94d3d824 4226 if (signal_pending_state(state, current)) {
ea71a546
ON
4227 timeout = -ERESTARTSYS;
4228 break;
8cbbe86d
AK
4229 }
4230 __set_current_state(state);
1da177e4
LT
4231 spin_unlock_irq(&x->wait.lock);
4232 timeout = schedule_timeout(timeout);
4233 spin_lock_irq(&x->wait.lock);
ea71a546 4234 } while (!x->done && timeout);
1da177e4 4235 __remove_wait_queue(&x->wait, &wait);
ea71a546
ON
4236 if (!x->done)
4237 return timeout;
1da177e4
LT
4238 }
4239 x->done--;
ea71a546 4240 return timeout ?: 1;
1da177e4 4241}
1da177e4 4242
8cbbe86d
AK
4243static long __sched
4244wait_for_common(struct completion *x, long timeout, int state)
1da177e4 4245{
1da177e4
LT
4246 might_sleep();
4247
4248 spin_lock_irq(&x->wait.lock);
8cbbe86d 4249 timeout = do_wait_for_common(x, timeout, state);
1da177e4 4250 spin_unlock_irq(&x->wait.lock);
8cbbe86d
AK
4251 return timeout;
4252}
1da177e4 4253
65eb3dc6
KD
4254/**
4255 * wait_for_completion: - waits for completion of a task
4256 * @x: holds the state of this particular completion
4257 *
4258 * This waits to be signaled for completion of a specific task. It is NOT
4259 * interruptible and there is no timeout.
4260 *
4261 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4262 * and interrupt capability. Also see complete().
4263 */
b15136e9 4264void __sched wait_for_completion(struct completion *x)
8cbbe86d
AK
4265{
4266 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
1da177e4 4267}
8cbbe86d 4268EXPORT_SYMBOL(wait_for_completion);
1da177e4 4269
65eb3dc6
KD
4270/**
4271 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4272 * @x: holds the state of this particular completion
4273 * @timeout: timeout value in jiffies
4274 *
4275 * This waits for either a completion of a specific task to be signaled or for a
4276 * specified timeout to expire. The timeout is in jiffies. It is not
4277 * interruptible.
4278 */
b15136e9 4279unsigned long __sched
8cbbe86d 4280wait_for_completion_timeout(struct completion *x, unsigned long timeout)
1da177e4 4281{
8cbbe86d 4282 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
1da177e4 4283}
8cbbe86d 4284EXPORT_SYMBOL(wait_for_completion_timeout);
1da177e4 4285
65eb3dc6
KD
4286/**
4287 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4288 * @x: holds the state of this particular completion
4289 *
4290 * This waits for completion of a specific task to be signaled. It is
4291 * interruptible.
4292 */
8cbbe86d 4293int __sched wait_for_completion_interruptible(struct completion *x)
0fec171c 4294{
51e97990
AK
4295 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4296 if (t == -ERESTARTSYS)
4297 return t;
4298 return 0;
0fec171c 4299}
8cbbe86d 4300EXPORT_SYMBOL(wait_for_completion_interruptible);
1da177e4 4301
65eb3dc6
KD
4302/**
4303 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4304 * @x: holds the state of this particular completion
4305 * @timeout: timeout value in jiffies
4306 *
4307 * This waits for either a completion of a specific task to be signaled or for a
4308 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4309 */
b15136e9 4310unsigned long __sched
8cbbe86d
AK
4311wait_for_completion_interruptible_timeout(struct completion *x,
4312 unsigned long timeout)
0fec171c 4313{
8cbbe86d 4314 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
0fec171c 4315}
8cbbe86d 4316EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
1da177e4 4317
65eb3dc6
KD
4318/**
4319 * wait_for_completion_killable: - waits for completion of a task (killable)
4320 * @x: holds the state of this particular completion
4321 *
4322 * This waits to be signaled for completion of a specific task. It can be
4323 * interrupted by a kill signal.
4324 */
009e577e
MW
4325int __sched wait_for_completion_killable(struct completion *x)
4326{
4327 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4328 if (t == -ERESTARTSYS)
4329 return t;
4330 return 0;
4331}
4332EXPORT_SYMBOL(wait_for_completion_killable);
4333
0aa12fb4
SW
4334/**
4335 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4336 * @x: holds the state of this particular completion
4337 * @timeout: timeout value in jiffies
4338 *
4339 * This waits for either a completion of a specific task to be
4340 * signaled or for a specified timeout to expire. It can be
4341 * interrupted by a kill signal. The timeout is in jiffies.
4342 */
4343unsigned long __sched
4344wait_for_completion_killable_timeout(struct completion *x,
4345 unsigned long timeout)
4346{
4347 return wait_for_common(x, timeout, TASK_KILLABLE);
4348}
4349EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4350
be4de352
DC
4351/**
4352 * try_wait_for_completion - try to decrement a completion without blocking
4353 * @x: completion structure
4354 *
4355 * Returns: 0 if a decrement cannot be done without blocking
4356 * 1 if a decrement succeeded.
4357 *
4358 * If a completion is being used as a counting completion,
4359 * attempt to decrement the counter without blocking. This
4360 * enables us to avoid waiting if the resource the completion
4361 * is protecting is not available.
4362 */
4363bool try_wait_for_completion(struct completion *x)
4364{
7539a3b3 4365 unsigned long flags;
be4de352
DC
4366 int ret = 1;
4367
7539a3b3 4368 spin_lock_irqsave(&x->wait.lock, flags);
be4de352
DC
4369 if (!x->done)
4370 ret = 0;
4371 else
4372 x->done--;
7539a3b3 4373 spin_unlock_irqrestore(&x->wait.lock, flags);
be4de352
DC
4374 return ret;
4375}
4376EXPORT_SYMBOL(try_wait_for_completion);
4377
4378/**
4379 * completion_done - Test to see if a completion has any waiters
4380 * @x: completion structure
4381 *
4382 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4383 * 1 if there are no waiters.
4384 *
4385 */
4386bool completion_done(struct completion *x)
4387{
7539a3b3 4388 unsigned long flags;
be4de352
DC
4389 int ret = 1;
4390
7539a3b3 4391 spin_lock_irqsave(&x->wait.lock, flags);
be4de352
DC
4392 if (!x->done)
4393 ret = 0;
7539a3b3 4394 spin_unlock_irqrestore(&x->wait.lock, flags);
be4de352
DC
4395 return ret;
4396}
4397EXPORT_SYMBOL(completion_done);
4398
8cbbe86d
AK
4399static long __sched
4400sleep_on_common(wait_queue_head_t *q, int state, long timeout)
1da177e4 4401{
0fec171c
IM
4402 unsigned long flags;
4403 wait_queue_t wait;
4404
4405 init_waitqueue_entry(&wait, current);
1da177e4 4406
8cbbe86d 4407 __set_current_state(state);
1da177e4 4408
8cbbe86d
AK
4409 spin_lock_irqsave(&q->lock, flags);
4410 __add_wait_queue(q, &wait);
4411 spin_unlock(&q->lock);
4412 timeout = schedule_timeout(timeout);
4413 spin_lock_irq(&q->lock);
4414 __remove_wait_queue(q, &wait);
4415 spin_unlock_irqrestore(&q->lock, flags);
4416
4417 return timeout;
4418}
4419
4420void __sched interruptible_sleep_on(wait_queue_head_t *q)
4421{
4422 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
1da177e4 4423}
1da177e4
LT
4424EXPORT_SYMBOL(interruptible_sleep_on);
4425
0fec171c 4426long __sched
95cdf3b7 4427interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
1da177e4 4428{
8cbbe86d 4429 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
1da177e4 4430}
1da177e4
LT
4431EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4432
0fec171c 4433void __sched sleep_on(wait_queue_head_t *q)
1da177e4 4434{
8cbbe86d 4435 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
1da177e4 4436}
1da177e4
LT
4437EXPORT_SYMBOL(sleep_on);
4438
0fec171c 4439long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
1da177e4 4440{
8cbbe86d 4441 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
1da177e4 4442}
1da177e4
LT
4443EXPORT_SYMBOL(sleep_on_timeout);
4444
b29739f9
IM
4445#ifdef CONFIG_RT_MUTEXES
4446
4447/*
4448 * rt_mutex_setprio - set the current priority of a task
4449 * @p: task
4450 * @prio: prio value (kernel-internal form)
4451 *
4452 * This function changes the 'effective' priority of a task. It does
4453 * not touch ->normal_prio like __setscheduler().
4454 *
4455 * Used by the rt_mutex code to implement priority inheritance logic.
4456 */
36c8b586 4457void rt_mutex_setprio(struct task_struct *p, int prio)
b29739f9
IM
4458{
4459 unsigned long flags;
83b699ed 4460 int oldprio, on_rq, running;
70b97a7f 4461 struct rq *rq;
83ab0aa0 4462 const struct sched_class *prev_class;
b29739f9
IM
4463
4464 BUG_ON(prio < 0 || prio > MAX_PRIO);
4465
4466 rq = task_rq_lock(p, &flags);
4467
a8027073 4468 trace_sched_pi_setprio(p, prio);
d5f9f942 4469 oldprio = p->prio;
83ab0aa0 4470 prev_class = p->sched_class;
dd41f596 4471 on_rq = p->se.on_rq;
051a1d1a 4472 running = task_current(rq, p);
0e1f3483 4473 if (on_rq)
69be72c1 4474 dequeue_task(rq, p, 0);
0e1f3483
HS
4475 if (running)
4476 p->sched_class->put_prev_task(rq, p);
dd41f596
IM
4477
4478 if (rt_prio(prio))
4479 p->sched_class = &rt_sched_class;
4480 else
4481 p->sched_class = &fair_sched_class;
4482
b29739f9
IM
4483 p->prio = prio;
4484
0e1f3483
HS
4485 if (running)
4486 p->sched_class->set_curr_task(rq);
dd41f596 4487 if (on_rq) {
371fd7e7 4488 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
cb469845
SR
4489
4490 check_class_changed(rq, p, prev_class, oldprio, running);
b29739f9
IM
4491 }
4492 task_rq_unlock(rq, &flags);
4493}
4494
4495#endif
4496
36c8b586 4497void set_user_nice(struct task_struct *p, long nice)
1da177e4 4498{
dd41f596 4499 int old_prio, delta, on_rq;
1da177e4 4500 unsigned long flags;
70b97a7f 4501 struct rq *rq;
1da177e4
LT
4502
4503 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4504 return;
4505 /*
4506 * We have to be careful, if called from sys_setpriority(),
4507 * the task might be in the middle of scheduling on another CPU.
4508 */
4509 rq = task_rq_lock(p, &flags);
4510 /*
4511 * The RT priorities are set via sched_setscheduler(), but we still
4512 * allow the 'normal' nice value to be set - but as expected
4513 * it wont have any effect on scheduling until the task is
dd41f596 4514 * SCHED_FIFO/SCHED_RR:
1da177e4 4515 */
e05606d3 4516 if (task_has_rt_policy(p)) {
1da177e4
LT
4517 p->static_prio = NICE_TO_PRIO(nice);
4518 goto out_unlock;
4519 }
dd41f596 4520 on_rq = p->se.on_rq;
c09595f6 4521 if (on_rq)
69be72c1 4522 dequeue_task(rq, p, 0);
1da177e4 4523
1da177e4 4524 p->static_prio = NICE_TO_PRIO(nice);
2dd73a4f 4525 set_load_weight(p);
b29739f9
IM
4526 old_prio = p->prio;
4527 p->prio = effective_prio(p);
4528 delta = p->prio - old_prio;
1da177e4 4529
dd41f596 4530 if (on_rq) {
371fd7e7 4531 enqueue_task(rq, p, 0);
1da177e4 4532 /*
d5f9f942
AM
4533 * If the task increased its priority or is running and
4534 * lowered its priority, then reschedule its CPU:
1da177e4 4535 */
d5f9f942 4536 if (delta < 0 || (delta > 0 && task_running(rq, p)))
1da177e4
LT
4537 resched_task(rq->curr);
4538 }
4539out_unlock:
4540 task_rq_unlock(rq, &flags);
4541}
1da177e4
LT
4542EXPORT_SYMBOL(set_user_nice);
4543
e43379f1
MM
4544/*
4545 * can_nice - check if a task can reduce its nice value
4546 * @p: task
4547 * @nice: nice value
4548 */
36c8b586 4549int can_nice(const struct task_struct *p, const int nice)
e43379f1 4550{
024f4747
MM
4551 /* convert nice value [19,-20] to rlimit style value [1,40] */
4552 int nice_rlim = 20 - nice;
48f24c4d 4553
78d7d407 4554 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
e43379f1
MM
4555 capable(CAP_SYS_NICE));
4556}
4557
1da177e4
LT
4558#ifdef __ARCH_WANT_SYS_NICE
4559
4560/*
4561 * sys_nice - change the priority of the current process.
4562 * @increment: priority increment
4563 *
4564 * sys_setpriority is a more generic, but much slower function that
4565 * does similar things.
4566 */
5add95d4 4567SYSCALL_DEFINE1(nice, int, increment)
1da177e4 4568{
48f24c4d 4569 long nice, retval;
1da177e4
LT
4570
4571 /*
4572 * Setpriority might change our priority at the same moment.
4573 * We don't have to worry. Conceptually one call occurs first
4574 * and we have a single winner.
4575 */
e43379f1
MM
4576 if (increment < -40)
4577 increment = -40;
1da177e4
LT
4578 if (increment > 40)
4579 increment = 40;
4580
2b8f836f 4581 nice = TASK_NICE(current) + increment;
1da177e4
LT
4582 if (nice < -20)
4583 nice = -20;
4584 if (nice > 19)
4585 nice = 19;
4586
e43379f1
MM
4587 if (increment < 0 && !can_nice(current, nice))
4588 return -EPERM;
4589
1da177e4
LT
4590 retval = security_task_setnice(current, nice);
4591 if (retval)
4592 return retval;
4593
4594 set_user_nice(current, nice);
4595 return 0;
4596}
4597
4598#endif
4599
4600/**
4601 * task_prio - return the priority value of a given task.
4602 * @p: the task in question.
4603 *
4604 * This is the priority value as seen by users in /proc.
4605 * RT tasks are offset by -200. Normal tasks are centered
4606 * around 0, value goes from -16 to +15.
4607 */
36c8b586 4608int task_prio(const struct task_struct *p)
1da177e4
LT
4609{
4610 return p->prio - MAX_RT_PRIO;
4611}
4612
4613/**
4614 * task_nice - return the nice value of a given task.
4615 * @p: the task in question.
4616 */
36c8b586 4617int task_nice(const struct task_struct *p)
1da177e4
LT
4618{
4619 return TASK_NICE(p);
4620}
150d8bed 4621EXPORT_SYMBOL(task_nice);
1da177e4
LT
4622
4623/**
4624 * idle_cpu - is a given cpu idle currently?
4625 * @cpu: the processor in question.
4626 */
4627int idle_cpu(int cpu)
4628{
4629 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4630}
4631
1da177e4
LT
4632/**
4633 * idle_task - return the idle task for a given cpu.
4634 * @cpu: the processor in question.
4635 */
36c8b586 4636struct task_struct *idle_task(int cpu)
1da177e4
LT
4637{
4638 return cpu_rq(cpu)->idle;
4639}
4640
4641/**
4642 * find_process_by_pid - find a process with a matching PID value.
4643 * @pid: the pid in question.
4644 */
a9957449 4645static struct task_struct *find_process_by_pid(pid_t pid)
1da177e4 4646{
228ebcbe 4647 return pid ? find_task_by_vpid(pid) : current;
1da177e4
LT
4648}
4649
4650/* Actually do priority change: must hold rq lock. */
dd41f596
IM
4651static void
4652__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
1da177e4 4653{
dd41f596 4654 BUG_ON(p->se.on_rq);
48f24c4d 4655
1da177e4
LT
4656 p->policy = policy;
4657 p->rt_priority = prio;
b29739f9
IM
4658 p->normal_prio = normal_prio(p);
4659 /* we are holding p->pi_lock already */
4660 p->prio = rt_mutex_getprio(p);
ffd44db5
PZ
4661 if (rt_prio(p->prio))
4662 p->sched_class = &rt_sched_class;
4663 else
4664 p->sched_class = &fair_sched_class;
2dd73a4f 4665 set_load_weight(p);
1da177e4
LT
4666}
4667
c69e8d9c
DH
4668/*
4669 * check the target process has a UID that matches the current process's
4670 */
4671static bool check_same_owner(struct task_struct *p)
4672{
4673 const struct cred *cred = current_cred(), *pcred;
4674 bool match;
4675
4676 rcu_read_lock();
4677 pcred = __task_cred(p);
4678 match = (cred->euid == pcred->euid ||
4679 cred->euid == pcred->uid);
4680 rcu_read_unlock();
4681 return match;
4682}
4683
961ccddd
RR
4684static int __sched_setscheduler(struct task_struct *p, int policy,
4685 struct sched_param *param, bool user)
1da177e4 4686{
83b699ed 4687 int retval, oldprio, oldpolicy = -1, on_rq, running;
1da177e4 4688 unsigned long flags;
83ab0aa0 4689 const struct sched_class *prev_class;
70b97a7f 4690 struct rq *rq;
ca94c442 4691 int reset_on_fork;
1da177e4 4692
66e5393a
SR
4693 /* may grab non-irq protected spin_locks */
4694 BUG_ON(in_interrupt());
1da177e4
LT
4695recheck:
4696 /* double check policy once rq lock held */
ca94c442
LP
4697 if (policy < 0) {
4698 reset_on_fork = p->sched_reset_on_fork;
1da177e4 4699 policy = oldpolicy = p->policy;
ca94c442
LP
4700 } else {
4701 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4702 policy &= ~SCHED_RESET_ON_FORK;
4703
4704 if (policy != SCHED_FIFO && policy != SCHED_RR &&
4705 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4706 policy != SCHED_IDLE)
4707 return -EINVAL;
4708 }
4709
1da177e4
LT
4710 /*
4711 * Valid priorities for SCHED_FIFO and SCHED_RR are
dd41f596
IM
4712 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4713 * SCHED_BATCH and SCHED_IDLE is 0.
1da177e4
LT
4714 */
4715 if (param->sched_priority < 0 ||
95cdf3b7 4716 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
d46523ea 4717 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
1da177e4 4718 return -EINVAL;
e05606d3 4719 if (rt_policy(policy) != (param->sched_priority != 0))
1da177e4
LT
4720 return -EINVAL;
4721
37e4ab3f
OC
4722 /*
4723 * Allow unprivileged RT tasks to decrease priority:
4724 */
961ccddd 4725 if (user && !capable(CAP_SYS_NICE)) {
e05606d3 4726 if (rt_policy(policy)) {
a44702e8
ON
4727 unsigned long rlim_rtprio =
4728 task_rlimit(p, RLIMIT_RTPRIO);
8dc3e909
ON
4729
4730 /* can't set/change the rt policy */
4731 if (policy != p->policy && !rlim_rtprio)
4732 return -EPERM;
4733
4734 /* can't increase priority */
4735 if (param->sched_priority > p->rt_priority &&
4736 param->sched_priority > rlim_rtprio)
4737 return -EPERM;
4738 }
dd41f596
IM
4739 /*
4740 * Like positive nice levels, dont allow tasks to
4741 * move out of SCHED_IDLE either:
4742 */
4743 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4744 return -EPERM;
5fe1d75f 4745
37e4ab3f 4746 /* can't change other user's priorities */
c69e8d9c 4747 if (!check_same_owner(p))
37e4ab3f 4748 return -EPERM;
ca94c442
LP
4749
4750 /* Normal users shall not reset the sched_reset_on_fork flag */
4751 if (p->sched_reset_on_fork && !reset_on_fork)
4752 return -EPERM;
37e4ab3f 4753 }
1da177e4 4754
725aad24 4755 if (user) {
725aad24
JF
4756 retval = security_task_setscheduler(p, policy, param);
4757 if (retval)
4758 return retval;
4759 }
4760
b29739f9
IM
4761 /*
4762 * make sure no PI-waiters arrive (or leave) while we are
4763 * changing the priority of the task:
4764 */
1d615482 4765 raw_spin_lock_irqsave(&p->pi_lock, flags);
1da177e4
LT
4766 /*
4767 * To be able to change p->policy safely, the apropriate
4768 * runqueue lock must be held.
4769 */
b29739f9 4770 rq = __task_rq_lock(p);
dc61b1d6 4771
34f971f6
PZ
4772 /*
4773 * Changing the policy of the stop threads its a very bad idea
4774 */
4775 if (p == rq->stop) {
4776 __task_rq_unlock(rq);
4777 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4778 return -EINVAL;
4779 }
4780
dc61b1d6
PZ
4781#ifdef CONFIG_RT_GROUP_SCHED
4782 if (user) {
4783 /*
4784 * Do not allow realtime tasks into groups that have no runtime
4785 * assigned.
4786 */
4787 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4788 task_group(p)->rt_bandwidth.rt_runtime == 0) {
4789 __task_rq_unlock(rq);
4790 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4791 return -EPERM;
4792 }
4793 }
4794#endif
4795
1da177e4
LT
4796 /* recheck policy now with rq lock held */
4797 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4798 policy = oldpolicy = -1;
b29739f9 4799 __task_rq_unlock(rq);
1d615482 4800 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1da177e4
LT
4801 goto recheck;
4802 }
dd41f596 4803 on_rq = p->se.on_rq;
051a1d1a 4804 running = task_current(rq, p);
0e1f3483 4805 if (on_rq)
2e1cb74a 4806 deactivate_task(rq, p, 0);
0e1f3483
HS
4807 if (running)
4808 p->sched_class->put_prev_task(rq, p);
f6b53205 4809
ca94c442
LP
4810 p->sched_reset_on_fork = reset_on_fork;
4811
1da177e4 4812 oldprio = p->prio;
83ab0aa0 4813 prev_class = p->sched_class;
dd41f596 4814 __setscheduler(rq, p, policy, param->sched_priority);
f6b53205 4815
0e1f3483
HS
4816 if (running)
4817 p->sched_class->set_curr_task(rq);
dd41f596
IM
4818 if (on_rq) {
4819 activate_task(rq, p, 0);
cb469845
SR
4820
4821 check_class_changed(rq, p, prev_class, oldprio, running);
1da177e4 4822 }
b29739f9 4823 __task_rq_unlock(rq);
1d615482 4824 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
b29739f9 4825
95e02ca9
TG
4826 rt_mutex_adjust_pi(p);
4827
1da177e4
LT
4828 return 0;
4829}
961ccddd
RR
4830
4831/**
4832 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4833 * @p: the task in question.
4834 * @policy: new policy.
4835 * @param: structure containing the new RT priority.
4836 *
4837 * NOTE that the task may be already dead.
4838 */
4839int sched_setscheduler(struct task_struct *p, int policy,
4840 struct sched_param *param)
4841{
4842 return __sched_setscheduler(p, policy, param, true);
4843}
1da177e4
LT
4844EXPORT_SYMBOL_GPL(sched_setscheduler);
4845
961ccddd
RR
4846/**
4847 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
4848 * @p: the task in question.
4849 * @policy: new policy.
4850 * @param: structure containing the new RT priority.
4851 *
4852 * Just like sched_setscheduler, only don't bother checking if the
4853 * current context has permission. For example, this is needed in
4854 * stop_machine(): we create temporary high priority worker threads,
4855 * but our caller might not have that capability.
4856 */
4857int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4858 struct sched_param *param)
4859{
4860 return __sched_setscheduler(p, policy, param, false);
4861}
4862
95cdf3b7
IM
4863static int
4864do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
1da177e4 4865{
1da177e4
LT
4866 struct sched_param lparam;
4867 struct task_struct *p;
36c8b586 4868 int retval;
1da177e4
LT
4869
4870 if (!param || pid < 0)
4871 return -EINVAL;
4872 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4873 return -EFAULT;
5fe1d75f
ON
4874
4875 rcu_read_lock();
4876 retval = -ESRCH;
1da177e4 4877 p = find_process_by_pid(pid);
5fe1d75f
ON
4878 if (p != NULL)
4879 retval = sched_setscheduler(p, policy, &lparam);
4880 rcu_read_unlock();
36c8b586 4881
1da177e4
LT
4882 return retval;
4883}
4884
4885/**
4886 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4887 * @pid: the pid in question.
4888 * @policy: new policy.
4889 * @param: structure containing the new RT priority.
4890 */
5add95d4
HC
4891SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4892 struct sched_param __user *, param)
1da177e4 4893{
c21761f1
JB
4894 /* negative values for policy are not valid */
4895 if (policy < 0)
4896 return -EINVAL;
4897
1da177e4
LT
4898 return do_sched_setscheduler(pid, policy, param);
4899}
4900
4901/**
4902 * sys_sched_setparam - set/change the RT priority of a thread
4903 * @pid: the pid in question.
4904 * @param: structure containing the new RT priority.
4905 */
5add95d4 4906SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
1da177e4
LT
4907{
4908 return do_sched_setscheduler(pid, -1, param);
4909}
4910
4911/**
4912 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4913 * @pid: the pid in question.
4914 */
5add95d4 4915SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
1da177e4 4916{
36c8b586 4917 struct task_struct *p;
3a5c359a 4918 int retval;
1da177e4
LT
4919
4920 if (pid < 0)
3a5c359a 4921 return -EINVAL;
1da177e4
LT
4922
4923 retval = -ESRCH;
5fe85be0 4924 rcu_read_lock();
1da177e4
LT
4925 p = find_process_by_pid(pid);
4926 if (p) {
4927 retval = security_task_getscheduler(p);
4928 if (!retval)
ca94c442
LP
4929 retval = p->policy
4930 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
1da177e4 4931 }
5fe85be0 4932 rcu_read_unlock();
1da177e4
LT
4933 return retval;
4934}
4935
4936/**
ca94c442 4937 * sys_sched_getparam - get the RT priority of a thread
1da177e4
LT
4938 * @pid: the pid in question.
4939 * @param: structure containing the RT priority.
4940 */
5add95d4 4941SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
1da177e4
LT
4942{
4943 struct sched_param lp;
36c8b586 4944 struct task_struct *p;
3a5c359a 4945 int retval;
1da177e4
LT
4946
4947 if (!param || pid < 0)
3a5c359a 4948 return -EINVAL;
1da177e4 4949
5fe85be0 4950 rcu_read_lock();
1da177e4
LT
4951 p = find_process_by_pid(pid);
4952 retval = -ESRCH;
4953 if (!p)
4954 goto out_unlock;
4955
4956 retval = security_task_getscheduler(p);
4957 if (retval)
4958 goto out_unlock;
4959
4960 lp.sched_priority = p->rt_priority;
5fe85be0 4961 rcu_read_unlock();
1da177e4
LT
4962
4963 /*
4964 * This one might sleep, we cannot do it with a spinlock held ...
4965 */
4966 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4967
1da177e4
LT
4968 return retval;
4969
4970out_unlock:
5fe85be0 4971 rcu_read_unlock();
1da177e4
LT
4972 return retval;
4973}
4974
96f874e2 4975long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
1da177e4 4976{
5a16f3d3 4977 cpumask_var_t cpus_allowed, new_mask;
36c8b586
IM
4978 struct task_struct *p;
4979 int retval;
1da177e4 4980
95402b38 4981 get_online_cpus();
23f5d142 4982 rcu_read_lock();
1da177e4
LT
4983
4984 p = find_process_by_pid(pid);
4985 if (!p) {
23f5d142 4986 rcu_read_unlock();
95402b38 4987 put_online_cpus();
1da177e4
LT
4988 return -ESRCH;
4989 }
4990
23f5d142 4991 /* Prevent p going away */
1da177e4 4992 get_task_struct(p);
23f5d142 4993 rcu_read_unlock();
1da177e4 4994
5a16f3d3
RR
4995 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4996 retval = -ENOMEM;
4997 goto out_put_task;
4998 }
4999 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5000 retval = -ENOMEM;
5001 goto out_free_cpus_allowed;
5002 }
1da177e4 5003 retval = -EPERM;
c69e8d9c 5004 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
1da177e4
LT
5005 goto out_unlock;
5006
e7834f8f
DQ
5007 retval = security_task_setscheduler(p, 0, NULL);
5008 if (retval)
5009 goto out_unlock;
5010
5a16f3d3
RR
5011 cpuset_cpus_allowed(p, cpus_allowed);
5012 cpumask_and(new_mask, in_mask, cpus_allowed);
49246274 5013again:
5a16f3d3 5014 retval = set_cpus_allowed_ptr(p, new_mask);
1da177e4 5015
8707d8b8 5016 if (!retval) {
5a16f3d3
RR
5017 cpuset_cpus_allowed(p, cpus_allowed);
5018 if (!cpumask_subset(new_mask, cpus_allowed)) {
8707d8b8
PM
5019 /*
5020 * We must have raced with a concurrent cpuset
5021 * update. Just reset the cpus_allowed to the
5022 * cpuset's cpus_allowed
5023 */
5a16f3d3 5024 cpumask_copy(new_mask, cpus_allowed);
8707d8b8
PM
5025 goto again;
5026 }
5027 }
1da177e4 5028out_unlock:
5a16f3d3
RR
5029 free_cpumask_var(new_mask);
5030out_free_cpus_allowed:
5031 free_cpumask_var(cpus_allowed);
5032out_put_task:
1da177e4 5033 put_task_struct(p);
95402b38 5034 put_online_cpus();
1da177e4
LT
5035 return retval;
5036}
5037
5038static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
96f874e2 5039 struct cpumask *new_mask)
1da177e4 5040{
96f874e2
RR
5041 if (len < cpumask_size())
5042 cpumask_clear(new_mask);
5043 else if (len > cpumask_size())
5044 len = cpumask_size();
5045
1da177e4
LT
5046 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5047}
5048
5049/**
5050 * sys_sched_setaffinity - set the cpu affinity of a process
5051 * @pid: pid of the process
5052 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5053 * @user_mask_ptr: user-space pointer to the new cpu mask
5054 */
5add95d4
HC
5055SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5056 unsigned long __user *, user_mask_ptr)
1da177e4 5057{
5a16f3d3 5058 cpumask_var_t new_mask;
1da177e4
LT
5059 int retval;
5060
5a16f3d3
RR
5061 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5062 return -ENOMEM;
1da177e4 5063
5a16f3d3
RR
5064 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5065 if (retval == 0)
5066 retval = sched_setaffinity(pid, new_mask);
5067 free_cpumask_var(new_mask);
5068 return retval;
1da177e4
LT
5069}
5070
96f874e2 5071long sched_getaffinity(pid_t pid, struct cpumask *mask)
1da177e4 5072{
36c8b586 5073 struct task_struct *p;
31605683
TG
5074 unsigned long flags;
5075 struct rq *rq;
1da177e4 5076 int retval;
1da177e4 5077
95402b38 5078 get_online_cpus();
23f5d142 5079 rcu_read_lock();
1da177e4
LT
5080
5081 retval = -ESRCH;
5082 p = find_process_by_pid(pid);
5083 if (!p)
5084 goto out_unlock;
5085
e7834f8f
DQ
5086 retval = security_task_getscheduler(p);
5087 if (retval)
5088 goto out_unlock;
5089
31605683 5090 rq = task_rq_lock(p, &flags);
96f874e2 5091 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
31605683 5092 task_rq_unlock(rq, &flags);
1da177e4
LT
5093
5094out_unlock:
23f5d142 5095 rcu_read_unlock();
95402b38 5096 put_online_cpus();
1da177e4 5097
9531b62f 5098 return retval;
1da177e4
LT
5099}
5100
5101/**
5102 * sys_sched_getaffinity - get the cpu affinity of a process
5103 * @pid: pid of the process
5104 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5105 * @user_mask_ptr: user-space pointer to hold the current cpu mask
5106 */
5add95d4
HC
5107SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5108 unsigned long __user *, user_mask_ptr)
1da177e4
LT
5109{
5110 int ret;
f17c8607 5111 cpumask_var_t mask;
1da177e4 5112
84fba5ec 5113 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
cd3d8031
KM
5114 return -EINVAL;
5115 if (len & (sizeof(unsigned long)-1))
1da177e4
LT
5116 return -EINVAL;
5117
f17c8607
RR
5118 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5119 return -ENOMEM;
1da177e4 5120
f17c8607
RR
5121 ret = sched_getaffinity(pid, mask);
5122 if (ret == 0) {
8bc037fb 5123 size_t retlen = min_t(size_t, len, cpumask_size());
cd3d8031
KM
5124
5125 if (copy_to_user(user_mask_ptr, mask, retlen))
f17c8607
RR
5126 ret = -EFAULT;
5127 else
cd3d8031 5128 ret = retlen;
f17c8607
RR
5129 }
5130 free_cpumask_var(mask);
1da177e4 5131
f17c8607 5132 return ret;
1da177e4
LT
5133}
5134
5135/**
5136 * sys_sched_yield - yield the current processor to other threads.
5137 *
dd41f596
IM
5138 * This function yields the current CPU to other tasks. If there are no
5139 * other threads running on this CPU then this function will return.
1da177e4 5140 */
5add95d4 5141SYSCALL_DEFINE0(sched_yield)
1da177e4 5142{
70b97a7f 5143 struct rq *rq = this_rq_lock();
1da177e4 5144
2d72376b 5145 schedstat_inc(rq, yld_count);
4530d7ab 5146 current->sched_class->yield_task(rq);
1da177e4
LT
5147
5148 /*
5149 * Since we are going to call schedule() anyway, there's
5150 * no need to preempt or enable interrupts:
5151 */
5152 __release(rq->lock);
8a25d5de 5153 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
9828ea9d 5154 do_raw_spin_unlock(&rq->lock);
1da177e4
LT
5155 preempt_enable_no_resched();
5156
5157 schedule();
5158
5159 return 0;
5160}
5161
d86ee480
PZ
5162static inline int should_resched(void)
5163{
5164 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
5165}
5166
e7b38404 5167static void __cond_resched(void)
1da177e4 5168{
e7aaaa69
FW
5169 add_preempt_count(PREEMPT_ACTIVE);
5170 schedule();
5171 sub_preempt_count(PREEMPT_ACTIVE);
1da177e4
LT
5172}
5173
02b67cc3 5174int __sched _cond_resched(void)
1da177e4 5175{
d86ee480 5176 if (should_resched()) {
1da177e4
LT
5177 __cond_resched();
5178 return 1;
5179 }
5180 return 0;
5181}
02b67cc3 5182EXPORT_SYMBOL(_cond_resched);
1da177e4
LT
5183
5184/*
613afbf8 5185 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
1da177e4
LT
5186 * call schedule, and on return reacquire the lock.
5187 *
41a2d6cf 5188 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
1da177e4
LT
5189 * operations here to prevent schedule() from being called twice (once via
5190 * spin_unlock(), once by hand).
5191 */
613afbf8 5192int __cond_resched_lock(spinlock_t *lock)
1da177e4 5193{
d86ee480 5194 int resched = should_resched();
6df3cecb
JK
5195 int ret = 0;
5196
f607c668
PZ
5197 lockdep_assert_held(lock);
5198
95c354fe 5199 if (spin_needbreak(lock) || resched) {
1da177e4 5200 spin_unlock(lock);
d86ee480 5201 if (resched)
95c354fe
NP
5202 __cond_resched();
5203 else
5204 cpu_relax();
6df3cecb 5205 ret = 1;
1da177e4 5206 spin_lock(lock);
1da177e4 5207 }
6df3cecb 5208 return ret;
1da177e4 5209}
613afbf8 5210EXPORT_SYMBOL(__cond_resched_lock);
1da177e4 5211
613afbf8 5212int __sched __cond_resched_softirq(void)
1da177e4
LT
5213{
5214 BUG_ON(!in_softirq());
5215
d86ee480 5216 if (should_resched()) {
98d82567 5217 local_bh_enable();
1da177e4
LT
5218 __cond_resched();
5219 local_bh_disable();
5220 return 1;
5221 }
5222 return 0;
5223}
613afbf8 5224EXPORT_SYMBOL(__cond_resched_softirq);
1da177e4 5225
1da177e4
LT
5226/**
5227 * yield - yield the current processor to other threads.
5228 *
72fd4a35 5229 * This is a shortcut for kernel-space yielding - it marks the
1da177e4
LT
5230 * thread runnable and calls sys_sched_yield().
5231 */
5232void __sched yield(void)
5233{
5234 set_current_state(TASK_RUNNING);
5235 sys_sched_yield();
5236}
1da177e4
LT
5237EXPORT_SYMBOL(yield);
5238
5239/*
41a2d6cf 5240 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
1da177e4 5241 * that process accounting knows that this is a task in IO wait state.
1da177e4
LT
5242 */
5243void __sched io_schedule(void)
5244{
54d35f29 5245 struct rq *rq = raw_rq();
1da177e4 5246
0ff92245 5247 delayacct_blkio_start();
1da177e4 5248 atomic_inc(&rq->nr_iowait);
8f0dfc34 5249 current->in_iowait = 1;
1da177e4 5250 schedule();
8f0dfc34 5251 current->in_iowait = 0;
1da177e4 5252 atomic_dec(&rq->nr_iowait);
0ff92245 5253 delayacct_blkio_end();
1da177e4 5254}
1da177e4
LT
5255EXPORT_SYMBOL(io_schedule);
5256
5257long __sched io_schedule_timeout(long timeout)
5258{
54d35f29 5259 struct rq *rq = raw_rq();
1da177e4
LT
5260 long ret;
5261
0ff92245 5262 delayacct_blkio_start();
1da177e4 5263 atomic_inc(&rq->nr_iowait);
8f0dfc34 5264 current->in_iowait = 1;
1da177e4 5265 ret = schedule_timeout(timeout);
8f0dfc34 5266 current->in_iowait = 0;
1da177e4 5267 atomic_dec(&rq->nr_iowait);
0ff92245 5268 delayacct_blkio_end();
1da177e4
LT
5269 return ret;
5270}
5271
5272/**
5273 * sys_sched_get_priority_max - return maximum RT priority.
5274 * @policy: scheduling class.
5275 *
5276 * this syscall returns the maximum rt_priority that can be used
5277 * by a given scheduling class.
5278 */
5add95d4 5279SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
1da177e4
LT
5280{
5281 int ret = -EINVAL;
5282
5283 switch (policy) {
5284 case SCHED_FIFO:
5285 case SCHED_RR:
5286 ret = MAX_USER_RT_PRIO-1;
5287 break;
5288 case SCHED_NORMAL:
b0a9499c 5289 case SCHED_BATCH:
dd41f596 5290 case SCHED_IDLE:
1da177e4
LT
5291 ret = 0;
5292 break;
5293 }
5294 return ret;
5295}
5296
5297/**
5298 * sys_sched_get_priority_min - return minimum RT priority.
5299 * @policy: scheduling class.
5300 *
5301 * this syscall returns the minimum rt_priority that can be used
5302 * by a given scheduling class.
5303 */
5add95d4 5304SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
1da177e4
LT
5305{
5306 int ret = -EINVAL;
5307
5308 switch (policy) {
5309 case SCHED_FIFO:
5310 case SCHED_RR:
5311 ret = 1;
5312 break;
5313 case SCHED_NORMAL:
b0a9499c 5314 case SCHED_BATCH:
dd41f596 5315 case SCHED_IDLE:
1da177e4
LT
5316 ret = 0;
5317 }
5318 return ret;
5319}
5320
5321/**
5322 * sys_sched_rr_get_interval - return the default timeslice of a process.
5323 * @pid: pid of the process.
5324 * @interval: userspace pointer to the timeslice value.
5325 *
5326 * this syscall writes the default timeslice value of a given process
5327 * into the user-space timespec buffer. A value of '0' means infinity.
5328 */
17da2bd9 5329SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
754fe8d2 5330 struct timespec __user *, interval)
1da177e4 5331{
36c8b586 5332 struct task_struct *p;
a4ec24b4 5333 unsigned int time_slice;
dba091b9
TG
5334 unsigned long flags;
5335 struct rq *rq;
3a5c359a 5336 int retval;
1da177e4 5337 struct timespec t;
1da177e4
LT
5338
5339 if (pid < 0)
3a5c359a 5340 return -EINVAL;
1da177e4
LT
5341
5342 retval = -ESRCH;
1a551ae7 5343 rcu_read_lock();
1da177e4
LT
5344 p = find_process_by_pid(pid);
5345 if (!p)
5346 goto out_unlock;
5347
5348 retval = security_task_getscheduler(p);
5349 if (retval)
5350 goto out_unlock;
5351
dba091b9
TG
5352 rq = task_rq_lock(p, &flags);
5353 time_slice = p->sched_class->get_rr_interval(rq, p);
5354 task_rq_unlock(rq, &flags);
a4ec24b4 5355
1a551ae7 5356 rcu_read_unlock();
a4ec24b4 5357 jiffies_to_timespec(time_slice, &t);
1da177e4 5358 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
1da177e4 5359 return retval;
3a5c359a 5360
1da177e4 5361out_unlock:
1a551ae7 5362 rcu_read_unlock();
1da177e4
LT
5363 return retval;
5364}
5365
7c731e0a 5366static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
36c8b586 5367
82a1fcb9 5368void sched_show_task(struct task_struct *p)
1da177e4 5369{
1da177e4 5370 unsigned long free = 0;
36c8b586 5371 unsigned state;
1da177e4 5372
1da177e4 5373 state = p->state ? __ffs(p->state) + 1 : 0;
3df0fc5b 5374 printk(KERN_INFO "%-13.13s %c", p->comm,
2ed6e34f 5375 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4bd77321 5376#if BITS_PER_LONG == 32
1da177e4 5377 if (state == TASK_RUNNING)
3df0fc5b 5378 printk(KERN_CONT " running ");
1da177e4 5379 else
3df0fc5b 5380 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
1da177e4
LT
5381#else
5382 if (state == TASK_RUNNING)
3df0fc5b 5383 printk(KERN_CONT " running task ");
1da177e4 5384 else
3df0fc5b 5385 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
1da177e4
LT
5386#endif
5387#ifdef CONFIG_DEBUG_STACK_USAGE
7c9f8861 5388 free = stack_not_used(p);
1da177e4 5389#endif
3df0fc5b 5390 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
aa47b7e0
DR
5391 task_pid_nr(p), task_pid_nr(p->real_parent),
5392 (unsigned long)task_thread_info(p)->flags);
1da177e4 5393
5fb5e6de 5394 show_stack(p, NULL);
1da177e4
LT
5395}
5396
e59e2ae2 5397void show_state_filter(unsigned long state_filter)
1da177e4 5398{
36c8b586 5399 struct task_struct *g, *p;
1da177e4 5400
4bd77321 5401#if BITS_PER_LONG == 32
3df0fc5b
PZ
5402 printk(KERN_INFO
5403 " task PC stack pid father\n");
1da177e4 5404#else
3df0fc5b
PZ
5405 printk(KERN_INFO
5406 " task PC stack pid father\n");
1da177e4
LT
5407#endif
5408 read_lock(&tasklist_lock);
5409 do_each_thread(g, p) {
5410 /*
5411 * reset the NMI-timeout, listing all files on a slow
5412 * console might take alot of time:
5413 */
5414 touch_nmi_watchdog();
39bc89fd 5415 if (!state_filter || (p->state & state_filter))
82a1fcb9 5416 sched_show_task(p);
1da177e4
LT
5417 } while_each_thread(g, p);
5418
04c9167f
JF
5419 touch_all_softlockup_watchdogs();
5420
dd41f596
IM
5421#ifdef CONFIG_SCHED_DEBUG
5422 sysrq_sched_debug_show();
5423#endif
1da177e4 5424 read_unlock(&tasklist_lock);
e59e2ae2
IM
5425 /*
5426 * Only show locks if all tasks are dumped:
5427 */
93335a21 5428 if (!state_filter)
e59e2ae2 5429 debug_show_all_locks();
1da177e4
LT
5430}
5431
1df21055
IM
5432void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5433{
dd41f596 5434 idle->sched_class = &idle_sched_class;
1df21055
IM
5435}
5436
f340c0d1
IM
5437/**
5438 * init_idle - set up an idle thread for a given CPU
5439 * @idle: task in question
5440 * @cpu: cpu the idle task belongs to
5441 *
5442 * NOTE: this function does not set the idle thread's NEED_RESCHED
5443 * flag, to make booting more robust.
5444 */
5c1e1767 5445void __cpuinit init_idle(struct task_struct *idle, int cpu)
1da177e4 5446{
70b97a7f 5447 struct rq *rq = cpu_rq(cpu);
1da177e4
LT
5448 unsigned long flags;
5449
05fa785c 5450 raw_spin_lock_irqsave(&rq->lock, flags);
5cbd54ef 5451
dd41f596 5452 __sched_fork(idle);
06b83b5f 5453 idle->state = TASK_RUNNING;
dd41f596
IM
5454 idle->se.exec_start = sched_clock();
5455
96f874e2 5456 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
dd41f596 5457 __set_task_cpu(idle, cpu);
1da177e4 5458
1da177e4 5459 rq->curr = rq->idle = idle;
4866cde0
NP
5460#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5461 idle->oncpu = 1;
5462#endif
05fa785c 5463 raw_spin_unlock_irqrestore(&rq->lock, flags);
1da177e4
LT
5464
5465 /* Set the preempt count _outside_ the spinlocks! */
8e3e076c
LT
5466#if defined(CONFIG_PREEMPT)
5467 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5468#else
a1261f54 5469 task_thread_info(idle)->preempt_count = 0;
8e3e076c 5470#endif
dd41f596
IM
5471 /*
5472 * The idle tasks have their own, simple scheduling class:
5473 */
5474 idle->sched_class = &idle_sched_class;
fb52607a 5475 ftrace_graph_init_task(idle);
1da177e4
LT
5476}
5477
5478/*
5479 * In a system that switches off the HZ timer nohz_cpu_mask
5480 * indicates which cpus entered this state. This is used
5481 * in the rcu update to wait only for active cpus. For system
5482 * which do not switch off the HZ timer nohz_cpu_mask should
6a7b3dc3 5483 * always be CPU_BITS_NONE.
1da177e4 5484 */
6a7b3dc3 5485cpumask_var_t nohz_cpu_mask;
1da177e4 5486
19978ca6
IM
5487/*
5488 * Increase the granularity value when there are more CPUs,
5489 * because with more CPUs the 'effective latency' as visible
5490 * to users decreases. But the relationship is not linear,
5491 * so pick a second-best guess by going with the log2 of the
5492 * number of CPUs.
5493 *
5494 * This idea comes from the SD scheduler of Con Kolivas:
5495 */
acb4a848 5496static int get_update_sysctl_factor(void)
19978ca6 5497{
4ca3ef71 5498 unsigned int cpus = min_t(int, num_online_cpus(), 8);
1983a922
CE
5499 unsigned int factor;
5500
5501 switch (sysctl_sched_tunable_scaling) {
5502 case SCHED_TUNABLESCALING_NONE:
5503 factor = 1;
5504 break;
5505 case SCHED_TUNABLESCALING_LINEAR:
5506 factor = cpus;
5507 break;
5508 case SCHED_TUNABLESCALING_LOG:
5509 default:
5510 factor = 1 + ilog2(cpus);
5511 break;
5512 }
19978ca6 5513
acb4a848
CE
5514 return factor;
5515}
19978ca6 5516
acb4a848
CE
5517static void update_sysctl(void)
5518{
5519 unsigned int factor = get_update_sysctl_factor();
19978ca6 5520
0bcdcf28
CE
5521#define SET_SYSCTL(name) \
5522 (sysctl_##name = (factor) * normalized_sysctl_##name)
5523 SET_SYSCTL(sched_min_granularity);
5524 SET_SYSCTL(sched_latency);
5525 SET_SYSCTL(sched_wakeup_granularity);
5526 SET_SYSCTL(sched_shares_ratelimit);
5527#undef SET_SYSCTL
5528}
55cd5340 5529
0bcdcf28
CE
5530static inline void sched_init_granularity(void)
5531{
5532 update_sysctl();
19978ca6
IM
5533}
5534
1da177e4
LT
5535#ifdef CONFIG_SMP
5536/*
5537 * This is how migration works:
5538 *
969c7921
TH
5539 * 1) we invoke migration_cpu_stop() on the target CPU using
5540 * stop_one_cpu().
5541 * 2) stopper starts to run (implicitly forcing the migrated thread
5542 * off the CPU)
5543 * 3) it checks whether the migrated task is still in the wrong runqueue.
5544 * 4) if it's in the wrong runqueue then the migration thread removes
1da177e4 5545 * it and puts it into the right queue.
969c7921
TH
5546 * 5) stopper completes and stop_one_cpu() returns and the migration
5547 * is done.
1da177e4
LT
5548 */
5549
5550/*
5551 * Change a given task's CPU affinity. Migrate the thread to a
5552 * proper CPU and schedule it away if the CPU it's executing on
5553 * is removed from the allowed bitmask.
5554 *
5555 * NOTE: the caller must have a valid reference to the task, the
41a2d6cf 5556 * task must not exit() & deallocate itself prematurely. The
1da177e4
LT
5557 * call is not atomic; no spinlocks may be held.
5558 */
96f874e2 5559int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1da177e4
LT
5560{
5561 unsigned long flags;
70b97a7f 5562 struct rq *rq;
969c7921 5563 unsigned int dest_cpu;
48f24c4d 5564 int ret = 0;
1da177e4 5565
65cc8e48
PZ
5566 /*
5567 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5568 * drop the rq->lock and still rely on ->cpus_allowed.
5569 */
5570again:
5571 while (task_is_waking(p))
5572 cpu_relax();
1da177e4 5573 rq = task_rq_lock(p, &flags);
65cc8e48
PZ
5574 if (task_is_waking(p)) {
5575 task_rq_unlock(rq, &flags);
5576 goto again;
5577 }
e2912009 5578
6ad4c188 5579 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
1da177e4
LT
5580 ret = -EINVAL;
5581 goto out;
5582 }
5583
9985b0ba 5584 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
96f874e2 5585 !cpumask_equal(&p->cpus_allowed, new_mask))) {
9985b0ba
DR
5586 ret = -EINVAL;
5587 goto out;
5588 }
5589
73fe6aae 5590 if (p->sched_class->set_cpus_allowed)
cd8ba7cd 5591 p->sched_class->set_cpus_allowed(p, new_mask);
73fe6aae 5592 else {
96f874e2
RR
5593 cpumask_copy(&p->cpus_allowed, new_mask);
5594 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
73fe6aae
GH
5595 }
5596
1da177e4 5597 /* Can the task run on the task's current CPU? If so, we're done */
96f874e2 5598 if (cpumask_test_cpu(task_cpu(p), new_mask))
1da177e4
LT
5599 goto out;
5600
969c7921
TH
5601 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5602 if (migrate_task(p, dest_cpu)) {
5603 struct migration_arg arg = { p, dest_cpu };
1da177e4
LT
5604 /* Need help from migration thread: drop lock and wait. */
5605 task_rq_unlock(rq, &flags);
969c7921 5606 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1da177e4
LT
5607 tlb_migrate_finish(p->mm);
5608 return 0;
5609 }
5610out:
5611 task_rq_unlock(rq, &flags);
48f24c4d 5612
1da177e4
LT
5613 return ret;
5614}
cd8ba7cd 5615EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1da177e4
LT
5616
5617/*
41a2d6cf 5618 * Move (not current) task off this cpu, onto dest cpu. We're doing
1da177e4
LT
5619 * this because either it can't run here any more (set_cpus_allowed()
5620 * away from this CPU, or CPU going down), or because we're
5621 * attempting to rebalance this task on exec (sched_exec).
5622 *
5623 * So we race with normal scheduler movements, but that's OK, as long
5624 * as the task is no longer on this CPU.
efc30814
KK
5625 *
5626 * Returns non-zero if task was successfully migrated.
1da177e4 5627 */
efc30814 5628static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
1da177e4 5629{
70b97a7f 5630 struct rq *rq_dest, *rq_src;
e2912009 5631 int ret = 0;
1da177e4 5632
e761b772 5633 if (unlikely(!cpu_active(dest_cpu)))
efc30814 5634 return ret;
1da177e4
LT
5635
5636 rq_src = cpu_rq(src_cpu);
5637 rq_dest = cpu_rq(dest_cpu);
5638
5639 double_rq_lock(rq_src, rq_dest);
5640 /* Already moved. */
5641 if (task_cpu(p) != src_cpu)
b1e38734 5642 goto done;
1da177e4 5643 /* Affinity changed (again). */
96f874e2 5644 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
b1e38734 5645 goto fail;
1da177e4 5646
e2912009
PZ
5647 /*
5648 * If we're not on a rq, the next wake-up will ensure we're
5649 * placed properly.
5650 */
5651 if (p->se.on_rq) {
2e1cb74a 5652 deactivate_task(rq_src, p, 0);
e2912009 5653 set_task_cpu(p, dest_cpu);
dd41f596 5654 activate_task(rq_dest, p, 0);
15afe09b 5655 check_preempt_curr(rq_dest, p, 0);
1da177e4 5656 }
b1e38734 5657done:
efc30814 5658 ret = 1;
b1e38734 5659fail:
1da177e4 5660 double_rq_unlock(rq_src, rq_dest);
efc30814 5661 return ret;
1da177e4
LT
5662}
5663
5664/*
969c7921
TH
5665 * migration_cpu_stop - this will be executed by a highprio stopper thread
5666 * and performs thread migration by bumping thread off CPU then
5667 * 'pushing' onto another runqueue.
1da177e4 5668 */
969c7921 5669static int migration_cpu_stop(void *data)
1da177e4 5670{
969c7921 5671 struct migration_arg *arg = data;
f7b4cddc 5672
969c7921
TH
5673 /*
5674 * The original target cpu might have gone down and we might
5675 * be on another cpu but it doesn't matter.
5676 */
f7b4cddc 5677 local_irq_disable();
969c7921 5678 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
f7b4cddc 5679 local_irq_enable();
1da177e4 5680 return 0;
f7b4cddc
ON
5681}
5682
1da177e4 5683#ifdef CONFIG_HOTPLUG_CPU
054b9108 5684/*
3a4fa0a2 5685 * Figure out where task on dead CPU should go, use force if necessary.
054b9108 5686 */
6a1bdc1b 5687void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
1da177e4 5688{
1445c08d
ON
5689 struct rq *rq = cpu_rq(dead_cpu);
5690 int needs_cpu, uninitialized_var(dest_cpu);
5691 unsigned long flags;
e76bd8d9 5692
1445c08d 5693 local_irq_save(flags);
e76bd8d9 5694
1445c08d
ON
5695 raw_spin_lock(&rq->lock);
5696 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5697 if (needs_cpu)
5698 dest_cpu = select_fallback_rq(dead_cpu, p);
5699 raw_spin_unlock(&rq->lock);
c1804d54
ON
5700 /*
5701 * It can only fail if we race with set_cpus_allowed(),
5702 * in the racer should migrate the task anyway.
5703 */
1445c08d 5704 if (needs_cpu)
c1804d54 5705 __migrate_task(p, dead_cpu, dest_cpu);
1445c08d 5706 local_irq_restore(flags);
1da177e4
LT
5707}
5708
5709/*
5710 * While a dead CPU has no uninterruptible tasks queued at this point,
5711 * it might still have a nonzero ->nr_uninterruptible counter, because
5712 * for performance reasons the counter is not stricly tracking tasks to
5713 * their home CPUs. So we just add the counter to another CPU's counter,
5714 * to keep the global sum constant after CPU-down:
5715 */
70b97a7f 5716static void migrate_nr_uninterruptible(struct rq *rq_src)
1da177e4 5717{
6ad4c188 5718 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
1da177e4
LT
5719 unsigned long flags;
5720
5721 local_irq_save(flags);
5722 double_rq_lock(rq_src, rq_dest);
5723 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5724 rq_src->nr_uninterruptible = 0;
5725 double_rq_unlock(rq_src, rq_dest);
5726 local_irq_restore(flags);
5727}
5728
5729/* Run through task list and migrate tasks from the dead cpu. */
5730static void migrate_live_tasks(int src_cpu)
5731{
48f24c4d 5732 struct task_struct *p, *t;
1da177e4 5733
f7b4cddc 5734 read_lock(&tasklist_lock);
1da177e4 5735
48f24c4d
IM
5736 do_each_thread(t, p) {
5737 if (p == current)
1da177e4
LT
5738 continue;
5739
48f24c4d
IM
5740 if (task_cpu(p) == src_cpu)
5741 move_task_off_dead_cpu(src_cpu, p);
5742 } while_each_thread(t, p);
1da177e4 5743
f7b4cddc 5744 read_unlock(&tasklist_lock);
1da177e4
LT
5745}
5746
dd41f596
IM
5747/*
5748 * Schedules idle task to be the next runnable task on current CPU.
94bc9a7b
DA
5749 * It does so by boosting its priority to highest possible.
5750 * Used by CPU offline code.
1da177e4
LT
5751 */
5752void sched_idle_next(void)
5753{
48f24c4d 5754 int this_cpu = smp_processor_id();
70b97a7f 5755 struct rq *rq = cpu_rq(this_cpu);
1da177e4
LT
5756 struct task_struct *p = rq->idle;
5757 unsigned long flags;
5758
5759 /* cpu has to be offline */
48f24c4d 5760 BUG_ON(cpu_online(this_cpu));
1da177e4 5761
48f24c4d
IM
5762 /*
5763 * Strictly not necessary since rest of the CPUs are stopped by now
5764 * and interrupts disabled on the current cpu.
1da177e4 5765 */
05fa785c 5766 raw_spin_lock_irqsave(&rq->lock, flags);
1da177e4 5767
dd41f596 5768 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
48f24c4d 5769
94bc9a7b 5770 activate_task(rq, p, 0);
1da177e4 5771
05fa785c 5772 raw_spin_unlock_irqrestore(&rq->lock, flags);
1da177e4
LT
5773}
5774
48f24c4d
IM
5775/*
5776 * Ensures that the idle task is using init_mm right before its cpu goes
1da177e4
LT
5777 * offline.
5778 */
5779void idle_task_exit(void)
5780{
5781 struct mm_struct *mm = current->active_mm;
5782
5783 BUG_ON(cpu_online(smp_processor_id()));
5784
5785 if (mm != &init_mm)
5786 switch_mm(mm, &init_mm, current);
5787 mmdrop(mm);
5788}
5789
054b9108 5790/* called under rq->lock with disabled interrupts */
36c8b586 5791static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
1da177e4 5792{
70b97a7f 5793 struct rq *rq = cpu_rq(dead_cpu);
1da177e4
LT
5794
5795 /* Must be exiting, otherwise would be on tasklist. */
270f722d 5796 BUG_ON(!p->exit_state);
1da177e4
LT
5797
5798 /* Cannot have done final schedule yet: would have vanished. */
c394cc9f 5799 BUG_ON(p->state == TASK_DEAD);
1da177e4 5800
48f24c4d 5801 get_task_struct(p);
1da177e4
LT
5802
5803 /*
5804 * Drop lock around migration; if someone else moves it,
41a2d6cf 5805 * that's OK. No task can be added to this CPU, so iteration is
1da177e4
LT
5806 * fine.
5807 */
05fa785c 5808 raw_spin_unlock_irq(&rq->lock);
48f24c4d 5809 move_task_off_dead_cpu(dead_cpu, p);
05fa785c 5810 raw_spin_lock_irq(&rq->lock);
1da177e4 5811
48f24c4d 5812 put_task_struct(p);
1da177e4
LT
5813}
5814
5815/* release_task() removes task from tasklist, so we won't find dead tasks. */
5816static void migrate_dead_tasks(unsigned int dead_cpu)
5817{
70b97a7f 5818 struct rq *rq = cpu_rq(dead_cpu);
dd41f596 5819 struct task_struct *next;
48f24c4d 5820
dd41f596
IM
5821 for ( ; ; ) {
5822 if (!rq->nr_running)
5823 break;
b67802ea 5824 next = pick_next_task(rq);
dd41f596
IM
5825 if (!next)
5826 break;
79c53799 5827 next->sched_class->put_prev_task(rq, next);
dd41f596 5828 migrate_dead(dead_cpu, next);
e692ab53 5829
1da177e4
LT
5830 }
5831}
dce48a84
TG
5832
5833/*
5834 * remove the tasks which were accounted by rq from calc_load_tasks.
5835 */
5836static void calc_global_load_remove(struct rq *rq)
5837{
5838 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
a468d389 5839 rq->calc_load_active = 0;
dce48a84 5840}
1da177e4
LT
5841#endif /* CONFIG_HOTPLUG_CPU */
5842
e692ab53
NP
5843#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5844
5845static struct ctl_table sd_ctl_dir[] = {
e0361851
AD
5846 {
5847 .procname = "sched_domain",
c57baf1e 5848 .mode = 0555,
e0361851 5849 },
56992309 5850 {}
e692ab53
NP
5851};
5852
5853static struct ctl_table sd_ctl_root[] = {
e0361851
AD
5854 {
5855 .procname = "kernel",
c57baf1e 5856 .mode = 0555,
e0361851
AD
5857 .child = sd_ctl_dir,
5858 },
56992309 5859 {}
e692ab53
NP
5860};
5861
5862static struct ctl_table *sd_alloc_ctl_entry(int n)
5863{
5864 struct ctl_table *entry =
5cf9f062 5865 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
e692ab53 5866
e692ab53
NP
5867 return entry;
5868}
5869
6382bc90
MM
5870static void sd_free_ctl_entry(struct ctl_table **tablep)
5871{
cd790076 5872 struct ctl_table *entry;
6382bc90 5873
cd790076
MM
5874 /*
5875 * In the intermediate directories, both the child directory and
5876 * procname are dynamically allocated and could fail but the mode
41a2d6cf 5877 * will always be set. In the lowest directory the names are
cd790076
MM
5878 * static strings and all have proc handlers.
5879 */
5880 for (entry = *tablep; entry->mode; entry++) {
6382bc90
MM
5881 if (entry->child)
5882 sd_free_ctl_entry(&entry->child);
cd790076
MM
5883 if (entry->proc_handler == NULL)
5884 kfree(entry->procname);
5885 }
6382bc90
MM
5886
5887 kfree(*tablep);
5888 *tablep = NULL;
5889}
5890
e692ab53 5891static void
e0361851 5892set_table_entry(struct ctl_table *entry,
e692ab53
NP
5893 const char *procname, void *data, int maxlen,
5894 mode_t mode, proc_handler *proc_handler)
5895{
e692ab53
NP
5896 entry->procname = procname;
5897 entry->data = data;
5898 entry->maxlen = maxlen;
5899 entry->mode = mode;
5900 entry->proc_handler = proc_handler;
5901}
5902
5903static struct ctl_table *
5904sd_alloc_ctl_domain_table(struct sched_domain *sd)
5905{
a5d8c348 5906 struct ctl_table *table = sd_alloc_ctl_entry(13);
e692ab53 5907
ad1cdc1d
MM
5908 if (table == NULL)
5909 return NULL;
5910
e0361851 5911 set_table_entry(&table[0], "min_interval", &sd->min_interval,
e692ab53 5912 sizeof(long), 0644, proc_doulongvec_minmax);
e0361851 5913 set_table_entry(&table[1], "max_interval", &sd->max_interval,
e692ab53 5914 sizeof(long), 0644, proc_doulongvec_minmax);
e0361851 5915 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
e692ab53 5916 sizeof(int), 0644, proc_dointvec_minmax);
e0361851 5917 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
e692ab53 5918 sizeof(int), 0644, proc_dointvec_minmax);
e0361851 5919 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
e692ab53 5920 sizeof(int), 0644, proc_dointvec_minmax);
e0361851 5921 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
e692ab53 5922 sizeof(int), 0644, proc_dointvec_minmax);
e0361851 5923 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
e692ab53 5924 sizeof(int), 0644, proc_dointvec_minmax);
e0361851 5925 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
e692ab53 5926 sizeof(int), 0644, proc_dointvec_minmax);
e0361851 5927 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
e692ab53 5928 sizeof(int), 0644, proc_dointvec_minmax);
ace8b3d6 5929 set_table_entry(&table[9], "cache_nice_tries",
e692ab53
NP
5930 &sd->cache_nice_tries,
5931 sizeof(int), 0644, proc_dointvec_minmax);
ace8b3d6 5932 set_table_entry(&table[10], "flags", &sd->flags,
e692ab53 5933 sizeof(int), 0644, proc_dointvec_minmax);
a5d8c348
IM
5934 set_table_entry(&table[11], "name", sd->name,
5935 CORENAME_MAX_SIZE, 0444, proc_dostring);
5936 /* &table[12] is terminator */
e692ab53
NP
5937
5938 return table;
5939}
5940
9a4e7159 5941static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
e692ab53
NP
5942{
5943 struct ctl_table *entry, *table;
5944 struct sched_domain *sd;
5945 int domain_num = 0, i;
5946 char buf[32];
5947
5948 for_each_domain(cpu, sd)
5949 domain_num++;
5950 entry = table = sd_alloc_ctl_entry(domain_num + 1);
ad1cdc1d
MM
5951 if (table == NULL)
5952 return NULL;
e692ab53
NP
5953
5954 i = 0;
5955 for_each_domain(cpu, sd) {
5956 snprintf(buf, 32, "domain%d", i);
e692ab53 5957 entry->procname = kstrdup(buf, GFP_KERNEL);
c57baf1e 5958 entry->mode = 0555;
e692ab53
NP
5959 entry->child = sd_alloc_ctl_domain_table(sd);
5960 entry++;
5961 i++;
5962 }
5963 return table;
5964}
5965
5966static struct ctl_table_header *sd_sysctl_header;
6382bc90 5967static void register_sched_domain_sysctl(void)
e692ab53 5968{
6ad4c188 5969 int i, cpu_num = num_possible_cpus();
e692ab53
NP
5970 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5971 char buf[32];
5972
7378547f
MM
5973 WARN_ON(sd_ctl_dir[0].child);
5974 sd_ctl_dir[0].child = entry;
5975
ad1cdc1d
MM
5976 if (entry == NULL)
5977 return;
5978
6ad4c188 5979 for_each_possible_cpu(i) {
e692ab53 5980 snprintf(buf, 32, "cpu%d", i);
e692ab53 5981 entry->procname = kstrdup(buf, GFP_KERNEL);
c57baf1e 5982 entry->mode = 0555;
e692ab53 5983 entry->child = sd_alloc_ctl_cpu_table(i);
97b6ea7b 5984 entry++;
e692ab53 5985 }
7378547f
MM
5986
5987 WARN_ON(sd_sysctl_header);
e692ab53
NP
5988 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5989}
6382bc90 5990
7378547f 5991/* may be called multiple times per register */
6382bc90
MM
5992static void unregister_sched_domain_sysctl(void)
5993{
7378547f
MM
5994 if (sd_sysctl_header)
5995 unregister_sysctl_table(sd_sysctl_header);
6382bc90 5996 sd_sysctl_header = NULL;
7378547f
MM
5997 if (sd_ctl_dir[0].child)
5998 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6382bc90 5999}
e692ab53 6000#else
6382bc90
MM
6001static void register_sched_domain_sysctl(void)
6002{
6003}
6004static void unregister_sched_domain_sysctl(void)
e692ab53
NP
6005{
6006}
6007#endif
6008
1f11eb6a
GH
6009static void set_rq_online(struct rq *rq)
6010{
6011 if (!rq->online) {
6012 const struct sched_class *class;
6013
c6c4927b 6014 cpumask_set_cpu(rq->cpu, rq->rd->online);
1f11eb6a
GH
6015 rq->online = 1;
6016
6017 for_each_class(class) {
6018 if (class->rq_online)
6019 class->rq_online(rq);
6020 }
6021 }
6022}
6023
6024static void set_rq_offline(struct rq *rq)
6025{
6026 if (rq->online) {
6027 const struct sched_class *class;
6028
6029 for_each_class(class) {
6030 if (class->rq_offline)
6031 class->rq_offline(rq);
6032 }
6033
c6c4927b 6034 cpumask_clear_cpu(rq->cpu, rq->rd->online);
1f11eb6a
GH
6035 rq->online = 0;
6036 }
6037}
6038
1da177e4
LT
6039/*
6040 * migration_call - callback that gets triggered when a CPU is added.
6041 * Here we can start up the necessary migration thread for the new CPU.
6042 */
48f24c4d
IM
6043static int __cpuinit
6044migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
1da177e4 6045{
48f24c4d 6046 int cpu = (long)hcpu;
1da177e4 6047 unsigned long flags;
969c7921 6048 struct rq *rq = cpu_rq(cpu);
1da177e4
LT
6049
6050 switch (action) {
5be9361c 6051
1da177e4 6052 case CPU_UP_PREPARE:
8bb78442 6053 case CPU_UP_PREPARE_FROZEN:
a468d389 6054 rq->calc_load_update = calc_load_update;
1da177e4 6055 break;
48f24c4d 6056
1da177e4 6057 case CPU_ONLINE:
8bb78442 6058 case CPU_ONLINE_FROZEN:
1f94ef59 6059 /* Update our root-domain */
05fa785c 6060 raw_spin_lock_irqsave(&rq->lock, flags);
1f94ef59 6061 if (rq->rd) {
c6c4927b 6062 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
1f11eb6a
GH
6063
6064 set_rq_online(rq);
1f94ef59 6065 }
05fa785c 6066 raw_spin_unlock_irqrestore(&rq->lock, flags);
1da177e4 6067 break;
48f24c4d 6068
1da177e4 6069#ifdef CONFIG_HOTPLUG_CPU
1da177e4 6070 case CPU_DEAD:
8bb78442 6071 case CPU_DEAD_FROZEN:
1da177e4 6072 migrate_live_tasks(cpu);
1da177e4 6073 /* Idle task back to normal (off runqueue, low prio) */
05fa785c 6074 raw_spin_lock_irq(&rq->lock);
2e1cb74a 6075 deactivate_task(rq, rq->idle, 0);
dd41f596
IM
6076 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6077 rq->idle->sched_class = &idle_sched_class;
1da177e4 6078 migrate_dead_tasks(cpu);
05fa785c 6079 raw_spin_unlock_irq(&rq->lock);
1da177e4
LT
6080 migrate_nr_uninterruptible(rq);
6081 BUG_ON(rq->nr_running != 0);
dce48a84 6082 calc_global_load_remove(rq);
1da177e4 6083 break;
57d885fe 6084
08f503b0
GH
6085 case CPU_DYING:
6086 case CPU_DYING_FROZEN:
57d885fe 6087 /* Update our root-domain */
05fa785c 6088 raw_spin_lock_irqsave(&rq->lock, flags);
57d885fe 6089 if (rq->rd) {
c6c4927b 6090 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
1f11eb6a 6091 set_rq_offline(rq);
57d885fe 6092 }
05fa785c 6093 raw_spin_unlock_irqrestore(&rq->lock, flags);
57d885fe 6094 break;
1da177e4
LT
6095#endif
6096 }
6097 return NOTIFY_OK;
6098}
6099
f38b0820
PM
6100/*
6101 * Register at high priority so that task migration (migrate_all_tasks)
6102 * happens before everything else. This has to be lower priority than
cdd6c482 6103 * the notifier in the perf_event subsystem, though.
1da177e4 6104 */
26c2143b 6105static struct notifier_block __cpuinitdata migration_notifier = {
1da177e4 6106 .notifier_call = migration_call,
50a323b7 6107 .priority = CPU_PRI_MIGRATION,
1da177e4
LT
6108};
6109
3a101d05
TH
6110static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6111 unsigned long action, void *hcpu)
6112{
6113 switch (action & ~CPU_TASKS_FROZEN) {
6114 case CPU_ONLINE:
6115 case CPU_DOWN_FAILED:
6116 set_cpu_active((long)hcpu, true);
6117 return NOTIFY_OK;
6118 default:
6119 return NOTIFY_DONE;
6120 }
6121}
6122
6123static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6124 unsigned long action, void *hcpu)
6125{
6126 switch (action & ~CPU_TASKS_FROZEN) {
6127 case CPU_DOWN_PREPARE:
6128 set_cpu_active((long)hcpu, false);
6129 return NOTIFY_OK;
6130 default:
6131 return NOTIFY_DONE;
6132 }
6133}
6134
7babe8db 6135static int __init migration_init(void)
1da177e4
LT
6136{
6137 void *cpu = (void *)(long)smp_processor_id();
07dccf33 6138 int err;
48f24c4d 6139
3a101d05 6140 /* Initialize migration for the boot CPU */
07dccf33
AM
6141 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6142 BUG_ON(err == NOTIFY_BAD);
1da177e4
LT
6143 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6144 register_cpu_notifier(&migration_notifier);
7babe8db 6145
3a101d05
TH
6146 /* Register cpu active notifiers */
6147 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6148 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6149
a004cd42 6150 return 0;
1da177e4 6151}
7babe8db 6152early_initcall(migration_init);
1da177e4
LT
6153#endif
6154
6155#ifdef CONFIG_SMP
476f3534 6156
3e9830dc 6157#ifdef CONFIG_SCHED_DEBUG
4dcf6aff 6158
f6630114
MT
6159static __read_mostly int sched_domain_debug_enabled;
6160
6161static int __init sched_domain_debug_setup(char *str)
6162{
6163 sched_domain_debug_enabled = 1;
6164
6165 return 0;
6166}
6167early_param("sched_debug", sched_domain_debug_setup);
6168
7c16ec58 6169static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
96f874e2 6170 struct cpumask *groupmask)
1da177e4 6171{
4dcf6aff 6172 struct sched_group *group = sd->groups;
434d53b0 6173 char str[256];
1da177e4 6174
968ea6d8 6175 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
96f874e2 6176 cpumask_clear(groupmask);
4dcf6aff
IM
6177
6178 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6179
6180 if (!(sd->flags & SD_LOAD_BALANCE)) {
3df0fc5b 6181 printk("does not load-balance\n");
4dcf6aff 6182 if (sd->parent)
3df0fc5b
PZ
6183 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6184 " has parent");
4dcf6aff 6185 return -1;
41c7ce9a
NP
6186 }
6187
3df0fc5b 6188 printk(KERN_CONT "span %s level %s\n", str, sd->name);
4dcf6aff 6189
758b2cdc 6190 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
3df0fc5b
PZ
6191 printk(KERN_ERR "ERROR: domain->span does not contain "
6192 "CPU%d\n", cpu);
4dcf6aff 6193 }
758b2cdc 6194 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
3df0fc5b
PZ
6195 printk(KERN_ERR "ERROR: domain->groups does not contain"
6196 " CPU%d\n", cpu);
4dcf6aff 6197 }
1da177e4 6198
4dcf6aff 6199 printk(KERN_DEBUG "%*s groups:", level + 1, "");
1da177e4 6200 do {
4dcf6aff 6201 if (!group) {
3df0fc5b
PZ
6202 printk("\n");
6203 printk(KERN_ERR "ERROR: group is NULL\n");
1da177e4
LT
6204 break;
6205 }
6206
18a3885f 6207 if (!group->cpu_power) {
3df0fc5b
PZ
6208 printk(KERN_CONT "\n");
6209 printk(KERN_ERR "ERROR: domain->cpu_power not "
6210 "set\n");
4dcf6aff
IM
6211 break;
6212 }
1da177e4 6213
758b2cdc 6214 if (!cpumask_weight(sched_group_cpus(group))) {
3df0fc5b
PZ
6215 printk(KERN_CONT "\n");
6216 printk(KERN_ERR "ERROR: empty group\n");
4dcf6aff
IM
6217 break;
6218 }
1da177e4 6219
758b2cdc 6220 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
3df0fc5b
PZ
6221 printk(KERN_CONT "\n");
6222 printk(KERN_ERR "ERROR: repeated CPUs\n");
4dcf6aff
IM
6223 break;
6224 }
1da177e4 6225
758b2cdc 6226 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
1da177e4 6227
968ea6d8 6228 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
381512cf 6229
3df0fc5b 6230 printk(KERN_CONT " %s", str);
18a3885f 6231 if (group->cpu_power != SCHED_LOAD_SCALE) {
3df0fc5b
PZ
6232 printk(KERN_CONT " (cpu_power = %d)",
6233 group->cpu_power);
381512cf 6234 }
1da177e4 6235
4dcf6aff
IM
6236 group = group->next;
6237 } while (group != sd->groups);
3df0fc5b 6238 printk(KERN_CONT "\n");
1da177e4 6239
758b2cdc 6240 if (!cpumask_equal(sched_domain_span(sd), groupmask))
3df0fc5b 6241 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
1da177e4 6242
758b2cdc
RR
6243 if (sd->parent &&
6244 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
3df0fc5b
PZ
6245 printk(KERN_ERR "ERROR: parent span is not a superset "
6246 "of domain->span\n");
4dcf6aff
IM
6247 return 0;
6248}
1da177e4 6249
4dcf6aff
IM
6250static void sched_domain_debug(struct sched_domain *sd, int cpu)
6251{
d5dd3db1 6252 cpumask_var_t groupmask;
4dcf6aff 6253 int level = 0;
1da177e4 6254
f6630114
MT
6255 if (!sched_domain_debug_enabled)
6256 return;
6257
4dcf6aff
IM
6258 if (!sd) {
6259 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6260 return;
6261 }
1da177e4 6262
4dcf6aff
IM
6263 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6264
d5dd3db1 6265 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
7c16ec58
MT
6266 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6267 return;
6268 }
6269
4dcf6aff 6270 for (;;) {
7c16ec58 6271 if (sched_domain_debug_one(sd, cpu, level, groupmask))
4dcf6aff 6272 break;
1da177e4
LT
6273 level++;
6274 sd = sd->parent;
33859f7f 6275 if (!sd)
4dcf6aff
IM
6276 break;
6277 }
d5dd3db1 6278 free_cpumask_var(groupmask);
1da177e4 6279}
6d6bc0ad 6280#else /* !CONFIG_SCHED_DEBUG */
48f24c4d 6281# define sched_domain_debug(sd, cpu) do { } while (0)
6d6bc0ad 6282#endif /* CONFIG_SCHED_DEBUG */
1da177e4 6283
1a20ff27 6284static int sd_degenerate(struct sched_domain *sd)
245af2c7 6285{
758b2cdc 6286 if (cpumask_weight(sched_domain_span(sd)) == 1)
245af2c7
SS
6287 return 1;
6288
6289 /* Following flags need at least 2 groups */
6290 if (sd->flags & (SD_LOAD_BALANCE |
6291 SD_BALANCE_NEWIDLE |
6292 SD_BALANCE_FORK |
89c4710e
SS
6293 SD_BALANCE_EXEC |
6294 SD_SHARE_CPUPOWER |
6295 SD_SHARE_PKG_RESOURCES)) {
245af2c7
SS
6296 if (sd->groups != sd->groups->next)
6297 return 0;
6298 }
6299
6300 /* Following flags don't use groups */
c88d5910 6301 if (sd->flags & (SD_WAKE_AFFINE))
245af2c7
SS
6302 return 0;
6303
6304 return 1;
6305}
6306
48f24c4d
IM
6307static int
6308sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
245af2c7
SS
6309{
6310 unsigned long cflags = sd->flags, pflags = parent->flags;
6311
6312 if (sd_degenerate(parent))
6313 return 1;
6314
758b2cdc 6315 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
245af2c7
SS
6316 return 0;
6317
245af2c7
SS
6318 /* Flags needing groups don't count if only 1 group in parent */
6319 if (parent->groups == parent->groups->next) {
6320 pflags &= ~(SD_LOAD_BALANCE |
6321 SD_BALANCE_NEWIDLE |
6322 SD_BALANCE_FORK |
89c4710e
SS
6323 SD_BALANCE_EXEC |
6324 SD_SHARE_CPUPOWER |
6325 SD_SHARE_PKG_RESOURCES);
5436499e
KC
6326 if (nr_node_ids == 1)
6327 pflags &= ~SD_SERIALIZE;
245af2c7
SS
6328 }
6329 if (~cflags & pflags)
6330 return 0;
6331
6332 return 1;
6333}
6334
c6c4927b
RR
6335static void free_rootdomain(struct root_domain *rd)
6336{
047106ad
PZ
6337 synchronize_sched();
6338
68e74568
RR
6339 cpupri_cleanup(&rd->cpupri);
6340
c6c4927b
RR
6341 free_cpumask_var(rd->rto_mask);
6342 free_cpumask_var(rd->online);
6343 free_cpumask_var(rd->span);
6344 kfree(rd);
6345}
6346
57d885fe
GH
6347static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6348{
a0490fa3 6349 struct root_domain *old_rd = NULL;
57d885fe 6350 unsigned long flags;
57d885fe 6351
05fa785c 6352 raw_spin_lock_irqsave(&rq->lock, flags);
57d885fe
GH
6353
6354 if (rq->rd) {
a0490fa3 6355 old_rd = rq->rd;
57d885fe 6356
c6c4927b 6357 if (cpumask_test_cpu(rq->cpu, old_rd->online))
1f11eb6a 6358 set_rq_offline(rq);
57d885fe 6359
c6c4927b 6360 cpumask_clear_cpu(rq->cpu, old_rd->span);
dc938520 6361
a0490fa3
IM
6362 /*
6363 * If we dont want to free the old_rt yet then
6364 * set old_rd to NULL to skip the freeing later
6365 * in this function:
6366 */
6367 if (!atomic_dec_and_test(&old_rd->refcount))
6368 old_rd = NULL;
57d885fe
GH
6369 }
6370
6371 atomic_inc(&rd->refcount);
6372 rq->rd = rd;
6373
c6c4927b 6374 cpumask_set_cpu(rq->cpu, rd->span);
00aec93d 6375 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
1f11eb6a 6376 set_rq_online(rq);
57d885fe 6377
05fa785c 6378 raw_spin_unlock_irqrestore(&rq->lock, flags);
a0490fa3
IM
6379
6380 if (old_rd)
6381 free_rootdomain(old_rd);
57d885fe
GH
6382}
6383
68c38fc3 6384static int init_rootdomain(struct root_domain *rd)
57d885fe
GH
6385{
6386 memset(rd, 0, sizeof(*rd));
6387
68c38fc3 6388 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
0c910d28 6389 goto out;
68c38fc3 6390 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
c6c4927b 6391 goto free_span;
68c38fc3 6392 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
c6c4927b 6393 goto free_online;
6e0534f2 6394
68c38fc3 6395 if (cpupri_init(&rd->cpupri) != 0)
68e74568 6396 goto free_rto_mask;
c6c4927b 6397 return 0;
6e0534f2 6398
68e74568
RR
6399free_rto_mask:
6400 free_cpumask_var(rd->rto_mask);
c6c4927b
RR
6401free_online:
6402 free_cpumask_var(rd->online);
6403free_span:
6404 free_cpumask_var(rd->span);
0c910d28 6405out:
c6c4927b 6406 return -ENOMEM;
57d885fe
GH
6407}
6408
6409static void init_defrootdomain(void)
6410{
68c38fc3 6411 init_rootdomain(&def_root_domain);
c6c4927b 6412
57d885fe
GH
6413 atomic_set(&def_root_domain.refcount, 1);
6414}
6415
dc938520 6416static struct root_domain *alloc_rootdomain(void)
57d885fe
GH
6417{
6418 struct root_domain *rd;
6419
6420 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6421 if (!rd)
6422 return NULL;
6423
68c38fc3 6424 if (init_rootdomain(rd) != 0) {
c6c4927b
RR
6425 kfree(rd);
6426 return NULL;
6427 }
57d885fe
GH
6428
6429 return rd;
6430}
6431
1da177e4 6432/*
0eab9146 6433 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
1da177e4
LT
6434 * hold the hotplug lock.
6435 */
0eab9146
IM
6436static void
6437cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
1da177e4 6438{
70b97a7f 6439 struct rq *rq = cpu_rq(cpu);
245af2c7
SS
6440 struct sched_domain *tmp;
6441
669c55e9
PZ
6442 for (tmp = sd; tmp; tmp = tmp->parent)
6443 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6444
245af2c7 6445 /* Remove the sched domains which do not contribute to scheduling. */
f29c9b1c 6446 for (tmp = sd; tmp; ) {
245af2c7
SS
6447 struct sched_domain *parent = tmp->parent;
6448 if (!parent)
6449 break;
f29c9b1c 6450
1a848870 6451 if (sd_parent_degenerate(tmp, parent)) {
245af2c7 6452 tmp->parent = parent->parent;
1a848870
SS
6453 if (parent->parent)
6454 parent->parent->child = tmp;
f29c9b1c
LZ
6455 } else
6456 tmp = tmp->parent;
245af2c7
SS
6457 }
6458
1a848870 6459 if (sd && sd_degenerate(sd)) {
245af2c7 6460 sd = sd->parent;
1a848870
SS
6461 if (sd)
6462 sd->child = NULL;
6463 }
1da177e4
LT
6464
6465 sched_domain_debug(sd, cpu);
6466
57d885fe 6467 rq_attach_root(rq, rd);
674311d5 6468 rcu_assign_pointer(rq->sd, sd);
1da177e4
LT
6469}
6470
6471/* cpus with isolated domains */
dcc30a35 6472static cpumask_var_t cpu_isolated_map;
1da177e4
LT
6473
6474/* Setup the mask of cpus configured for isolated domains */
6475static int __init isolated_cpu_setup(char *str)
6476{
bdddd296 6477 alloc_bootmem_cpumask_var(&cpu_isolated_map);
968ea6d8 6478 cpulist_parse(str, cpu_isolated_map);
1da177e4
LT
6479 return 1;
6480}
6481
8927f494 6482__setup("isolcpus=", isolated_cpu_setup);
1da177e4
LT
6483
6484/*
6711cab4
SS
6485 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6486 * to a function which identifies what group(along with sched group) a CPU
96f874e2
RR
6487 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6488 * (due to the fact that we keep track of groups covered with a struct cpumask).
1da177e4
LT
6489 *
6490 * init_sched_build_groups will build a circular linked list of the groups
6491 * covered by the given span, and will set each group's ->cpumask correctly,
6492 * and ->cpu_power to 0.
6493 */
a616058b 6494static void
96f874e2
RR
6495init_sched_build_groups(const struct cpumask *span,
6496 const struct cpumask *cpu_map,
6497 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
7c16ec58 6498 struct sched_group **sg,
96f874e2
RR
6499 struct cpumask *tmpmask),
6500 struct cpumask *covered, struct cpumask *tmpmask)
1da177e4
LT
6501{
6502 struct sched_group *first = NULL, *last = NULL;
1da177e4
LT
6503 int i;
6504
96f874e2 6505 cpumask_clear(covered);
7c16ec58 6506
abcd083a 6507 for_each_cpu(i, span) {
6711cab4 6508 struct sched_group *sg;
7c16ec58 6509 int group = group_fn(i, cpu_map, &sg, tmpmask);
1da177e4
LT
6510 int j;
6511
758b2cdc 6512 if (cpumask_test_cpu(i, covered))
1da177e4
LT
6513 continue;
6514
758b2cdc 6515 cpumask_clear(sched_group_cpus(sg));
18a3885f 6516 sg->cpu_power = 0;
1da177e4 6517
abcd083a 6518 for_each_cpu(j, span) {
7c16ec58 6519 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
1da177e4
LT
6520 continue;
6521
96f874e2 6522 cpumask_set_cpu(j, covered);
758b2cdc 6523 cpumask_set_cpu(j, sched_group_cpus(sg));
1da177e4
LT
6524 }
6525 if (!first)
6526 first = sg;
6527 if (last)
6528 last->next = sg;
6529 last = sg;
6530 }
6531 last->next = first;
6532}
6533
9c1cfda2 6534#define SD_NODES_PER_DOMAIN 16
1da177e4 6535
9c1cfda2 6536#ifdef CONFIG_NUMA
198e2f18 6537
9c1cfda2
JH
6538/**
6539 * find_next_best_node - find the next node to include in a sched_domain
6540 * @node: node whose sched_domain we're building
6541 * @used_nodes: nodes already in the sched_domain
6542 *
41a2d6cf 6543 * Find the next node to include in a given scheduling domain. Simply
9c1cfda2
JH
6544 * finds the closest node not already in the @used_nodes map.
6545 *
6546 * Should use nodemask_t.
6547 */
c5f59f08 6548static int find_next_best_node(int node, nodemask_t *used_nodes)
9c1cfda2
JH
6549{
6550 int i, n, val, min_val, best_node = 0;
6551
6552 min_val = INT_MAX;
6553
076ac2af 6554 for (i = 0; i < nr_node_ids; i++) {
9c1cfda2 6555 /* Start at @node */
076ac2af 6556 n = (node + i) % nr_node_ids;
9c1cfda2
JH
6557
6558 if (!nr_cpus_node(n))
6559 continue;
6560
6561 /* Skip already used nodes */
c5f59f08 6562 if (node_isset(n, *used_nodes))
9c1cfda2
JH
6563 continue;
6564
6565 /* Simple min distance search */
6566 val = node_distance(node, n);
6567
6568 if (val < min_val) {
6569 min_val = val;
6570 best_node = n;
6571 }
6572 }
6573
c5f59f08 6574 node_set(best_node, *used_nodes);
9c1cfda2
JH
6575 return best_node;
6576}
6577
6578/**
6579 * sched_domain_node_span - get a cpumask for a node's sched_domain
6580 * @node: node whose cpumask we're constructing
73486722 6581 * @span: resulting cpumask
9c1cfda2 6582 *
41a2d6cf 6583 * Given a node, construct a good cpumask for its sched_domain to span. It
9c1cfda2
JH
6584 * should be one that prevents unnecessary balancing, but also spreads tasks
6585 * out optimally.
6586 */
96f874e2 6587static void sched_domain_node_span(int node, struct cpumask *span)
9c1cfda2 6588{
c5f59f08 6589 nodemask_t used_nodes;
48f24c4d 6590 int i;
9c1cfda2 6591
6ca09dfc 6592 cpumask_clear(span);
c5f59f08 6593 nodes_clear(used_nodes);
9c1cfda2 6594
6ca09dfc 6595 cpumask_or(span, span, cpumask_of_node(node));
c5f59f08 6596 node_set(node, used_nodes);
9c1cfda2
JH
6597
6598 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
c5f59f08 6599 int next_node = find_next_best_node(node, &used_nodes);
48f24c4d 6600
6ca09dfc 6601 cpumask_or(span, span, cpumask_of_node(next_node));
9c1cfda2 6602 }
9c1cfda2 6603}
6d6bc0ad 6604#endif /* CONFIG_NUMA */
9c1cfda2 6605
5c45bf27 6606int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
48f24c4d 6607
6c99e9ad
RR
6608/*
6609 * The cpus mask in sched_group and sched_domain hangs off the end.
4200efd9
IM
6610 *
6611 * ( See the the comments in include/linux/sched.h:struct sched_group
6612 * and struct sched_domain. )
6c99e9ad
RR
6613 */
6614struct static_sched_group {
6615 struct sched_group sg;
6616 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6617};
6618
6619struct static_sched_domain {
6620 struct sched_domain sd;
6621 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
6622};
6623
49a02c51
AH
6624struct s_data {
6625#ifdef CONFIG_NUMA
6626 int sd_allnodes;
6627 cpumask_var_t domainspan;
6628 cpumask_var_t covered;
6629 cpumask_var_t notcovered;
6630#endif
6631 cpumask_var_t nodemask;
6632 cpumask_var_t this_sibling_map;
6633 cpumask_var_t this_core_map;
01a08546 6634 cpumask_var_t this_book_map;
49a02c51
AH
6635 cpumask_var_t send_covered;
6636 cpumask_var_t tmpmask;
6637 struct sched_group **sched_group_nodes;
6638 struct root_domain *rd;
6639};
6640
2109b99e
AH
6641enum s_alloc {
6642 sa_sched_groups = 0,
6643 sa_rootdomain,
6644 sa_tmpmask,
6645 sa_send_covered,
01a08546 6646 sa_this_book_map,
2109b99e
AH
6647 sa_this_core_map,
6648 sa_this_sibling_map,
6649 sa_nodemask,
6650 sa_sched_group_nodes,
6651#ifdef CONFIG_NUMA
6652 sa_notcovered,
6653 sa_covered,
6654 sa_domainspan,
6655#endif
6656 sa_none,
6657};
6658
9c1cfda2 6659/*
48f24c4d 6660 * SMT sched-domains:
9c1cfda2 6661 */
1da177e4 6662#ifdef CONFIG_SCHED_SMT
6c99e9ad 6663static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
1871e52c 6664static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
48f24c4d 6665
41a2d6cf 6666static int
96f874e2
RR
6667cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6668 struct sched_group **sg, struct cpumask *unused)
1da177e4 6669{
6711cab4 6670 if (sg)
1871e52c 6671 *sg = &per_cpu(sched_groups, cpu).sg;
1da177e4
LT
6672 return cpu;
6673}
6d6bc0ad 6674#endif /* CONFIG_SCHED_SMT */
1da177e4 6675
48f24c4d
IM
6676/*
6677 * multi-core sched-domains:
6678 */
1e9f28fa 6679#ifdef CONFIG_SCHED_MC
6c99e9ad
RR
6680static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6681static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
1e9f28fa 6682
41a2d6cf 6683static int
96f874e2
RR
6684cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6685 struct sched_group **sg, struct cpumask *mask)
1e9f28fa 6686{
6711cab4 6687 int group;
f269893c 6688#ifdef CONFIG_SCHED_SMT
c69fc56d 6689 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
96f874e2 6690 group = cpumask_first(mask);
f269893c
HC
6691#else
6692 group = cpu;
6693#endif
6711cab4 6694 if (sg)
6c99e9ad 6695 *sg = &per_cpu(sched_group_core, group).sg;
6711cab4 6696 return group;
1e9f28fa 6697}
f269893c 6698#endif /* CONFIG_SCHED_MC */
1e9f28fa 6699
01a08546
HC
6700/*
6701 * book sched-domains:
6702 */
6703#ifdef CONFIG_SCHED_BOOK
6704static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6705static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6706
6707static int
6708cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6709 struct sched_group **sg, struct cpumask *mask)
6710{
6711 int group = cpu;
6712#ifdef CONFIG_SCHED_MC
6713 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6714 group = cpumask_first(mask);
6715#elif defined(CONFIG_SCHED_SMT)
6716 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6717 group = cpumask_first(mask);
6718#endif
6719 if (sg)
6720 *sg = &per_cpu(sched_group_book, group).sg;
6721 return group;
6722}
6723#endif /* CONFIG_SCHED_BOOK */
6724
6c99e9ad
RR
6725static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6726static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
48f24c4d 6727
41a2d6cf 6728static int
96f874e2
RR
6729cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6730 struct sched_group **sg, struct cpumask *mask)
1da177e4 6731{
6711cab4 6732 int group;
01a08546
HC
6733#ifdef CONFIG_SCHED_BOOK
6734 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6735 group = cpumask_first(mask);
6736#elif defined(CONFIG_SCHED_MC)
6ca09dfc 6737 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
96f874e2 6738 group = cpumask_first(mask);
1e9f28fa 6739#elif defined(CONFIG_SCHED_SMT)
c69fc56d 6740 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
96f874e2 6741 group = cpumask_first(mask);
1da177e4 6742#else
6711cab4 6743 group = cpu;
1da177e4 6744#endif
6711cab4 6745 if (sg)
6c99e9ad 6746 *sg = &per_cpu(sched_group_phys, group).sg;
6711cab4 6747 return group;
1da177e4
LT
6748}
6749
6750#ifdef CONFIG_NUMA
1da177e4 6751/*
9c1cfda2
JH
6752 * The init_sched_build_groups can't handle what we want to do with node
6753 * groups, so roll our own. Now each node has its own list of groups which
6754 * gets dynamically allocated.
1da177e4 6755 */
62ea9ceb 6756static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
434d53b0 6757static struct sched_group ***sched_group_nodes_bycpu;
1da177e4 6758
62ea9ceb 6759static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
6c99e9ad 6760static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
9c1cfda2 6761
96f874e2
RR
6762static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
6763 struct sched_group **sg,
6764 struct cpumask *nodemask)
9c1cfda2 6765{
6711cab4
SS
6766 int group;
6767
6ca09dfc 6768 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
96f874e2 6769 group = cpumask_first(nodemask);
6711cab4
SS
6770
6771 if (sg)
6c99e9ad 6772 *sg = &per_cpu(sched_group_allnodes, group).sg;
6711cab4 6773 return group;
1da177e4 6774}
6711cab4 6775
08069033
SS
6776static void init_numa_sched_groups_power(struct sched_group *group_head)
6777{
6778 struct sched_group *sg = group_head;
6779 int j;
6780
6781 if (!sg)
6782 return;
3a5c359a 6783 do {
758b2cdc 6784 for_each_cpu(j, sched_group_cpus(sg)) {
3a5c359a 6785 struct sched_domain *sd;
08069033 6786
6c99e9ad 6787 sd = &per_cpu(phys_domains, j).sd;
13318a71 6788 if (j != group_first_cpu(sd->groups)) {
3a5c359a
AK
6789 /*
6790 * Only add "power" once for each
6791 * physical package.
6792 */
6793 continue;
6794 }
08069033 6795
18a3885f 6796 sg->cpu_power += sd->groups->cpu_power;
3a5c359a
AK
6797 }
6798 sg = sg->next;
6799 } while (sg != group_head);
08069033 6800}
0601a88d
AH
6801
6802static int build_numa_sched_groups(struct s_data *d,
6803 const struct cpumask *cpu_map, int num)
6804{
6805 struct sched_domain *sd;
6806 struct sched_group *sg, *prev;
6807 int n, j;
6808
6809 cpumask_clear(d->covered);
6810 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
6811 if (cpumask_empty(d->nodemask)) {
6812 d->sched_group_nodes[num] = NULL;
6813 goto out;
6814 }
6815
6816 sched_domain_node_span(num, d->domainspan);
6817 cpumask_and(d->domainspan, d->domainspan, cpu_map);
6818
6819 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6820 GFP_KERNEL, num);
6821 if (!sg) {
3df0fc5b
PZ
6822 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
6823 num);
0601a88d
AH
6824 return -ENOMEM;
6825 }
6826 d->sched_group_nodes[num] = sg;
6827
6828 for_each_cpu(j, d->nodemask) {
6829 sd = &per_cpu(node_domains, j).sd;
6830 sd->groups = sg;
6831 }
6832
18a3885f 6833 sg->cpu_power = 0;
0601a88d
AH
6834 cpumask_copy(sched_group_cpus(sg), d->nodemask);
6835 sg->next = sg;
6836 cpumask_or(d->covered, d->covered, d->nodemask);
6837
6838 prev = sg;
6839 for (j = 0; j < nr_node_ids; j++) {
6840 n = (num + j) % nr_node_ids;
6841 cpumask_complement(d->notcovered, d->covered);
6842 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
6843 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
6844 if (cpumask_empty(d->tmpmask))
6845 break;
6846 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
6847 if (cpumask_empty(d->tmpmask))
6848 continue;
6849 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6850 GFP_KERNEL, num);
6851 if (!sg) {
3df0fc5b
PZ
6852 printk(KERN_WARNING
6853 "Can not alloc domain group for node %d\n", j);
0601a88d
AH
6854 return -ENOMEM;
6855 }
18a3885f 6856 sg->cpu_power = 0;
0601a88d
AH
6857 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
6858 sg->next = prev->next;
6859 cpumask_or(d->covered, d->covered, d->tmpmask);
6860 prev->next = sg;
6861 prev = sg;
6862 }
6863out:
6864 return 0;
6865}
6d6bc0ad 6866#endif /* CONFIG_NUMA */
1da177e4 6867
a616058b 6868#ifdef CONFIG_NUMA
51888ca2 6869/* Free memory allocated for various sched_group structures */
96f874e2
RR
6870static void free_sched_groups(const struct cpumask *cpu_map,
6871 struct cpumask *nodemask)
51888ca2 6872{
a616058b 6873 int cpu, i;
51888ca2 6874
abcd083a 6875 for_each_cpu(cpu, cpu_map) {
51888ca2
SV
6876 struct sched_group **sched_group_nodes
6877 = sched_group_nodes_bycpu[cpu];
6878
51888ca2
SV
6879 if (!sched_group_nodes)
6880 continue;
6881
076ac2af 6882 for (i = 0; i < nr_node_ids; i++) {
51888ca2
SV
6883 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6884
6ca09dfc 6885 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
96f874e2 6886 if (cpumask_empty(nodemask))
51888ca2
SV
6887 continue;
6888
6889 if (sg == NULL)
6890 continue;
6891 sg = sg->next;
6892next_sg:
6893 oldsg = sg;
6894 sg = sg->next;
6895 kfree(oldsg);
6896 if (oldsg != sched_group_nodes[i])
6897 goto next_sg;
6898 }
6899 kfree(sched_group_nodes);
6900 sched_group_nodes_bycpu[cpu] = NULL;
6901 }
51888ca2 6902}
6d6bc0ad 6903#else /* !CONFIG_NUMA */
96f874e2
RR
6904static void free_sched_groups(const struct cpumask *cpu_map,
6905 struct cpumask *nodemask)
a616058b
SS
6906{
6907}
6d6bc0ad 6908#endif /* CONFIG_NUMA */
51888ca2 6909
89c4710e
SS
6910/*
6911 * Initialize sched groups cpu_power.
6912 *
6913 * cpu_power indicates the capacity of sched group, which is used while
6914 * distributing the load between different sched groups in a sched domain.
6915 * Typically cpu_power for all the groups in a sched domain will be same unless
6916 * there are asymmetries in the topology. If there are asymmetries, group
6917 * having more cpu_power will pickup more load compared to the group having
6918 * less cpu_power.
89c4710e
SS
6919 */
6920static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6921{
6922 struct sched_domain *child;
6923 struct sched_group *group;
f93e65c1
PZ
6924 long power;
6925 int weight;
89c4710e
SS
6926
6927 WARN_ON(!sd || !sd->groups);
6928
13318a71 6929 if (cpu != group_first_cpu(sd->groups))
89c4710e
SS
6930 return;
6931
6932 child = sd->child;
6933
18a3885f 6934 sd->groups->cpu_power = 0;
5517d86b 6935
f93e65c1
PZ
6936 if (!child) {
6937 power = SCHED_LOAD_SCALE;
6938 weight = cpumask_weight(sched_domain_span(sd));
6939 /*
6940 * SMT siblings share the power of a single core.
a52bfd73
PZ
6941 * Usually multiple threads get a better yield out of
6942 * that one core than a single thread would have,
6943 * reflect that in sd->smt_gain.
f93e65c1 6944 */
a52bfd73
PZ
6945 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6946 power *= sd->smt_gain;
f93e65c1 6947 power /= weight;
a52bfd73
PZ
6948 power >>= SCHED_LOAD_SHIFT;
6949 }
18a3885f 6950 sd->groups->cpu_power += power;
89c4710e
SS
6951 return;
6952 }
6953
89c4710e 6954 /*
f93e65c1 6955 * Add cpu_power of each child group to this groups cpu_power.
89c4710e
SS
6956 */
6957 group = child->groups;
6958 do {
18a3885f 6959 sd->groups->cpu_power += group->cpu_power;
89c4710e
SS
6960 group = group->next;
6961 } while (group != child->groups);
6962}
6963
7c16ec58
MT
6964/*
6965 * Initializers for schedule domains
6966 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6967 */
6968
a5d8c348
IM
6969#ifdef CONFIG_SCHED_DEBUG
6970# define SD_INIT_NAME(sd, type) sd->name = #type
6971#else
6972# define SD_INIT_NAME(sd, type) do { } while (0)
6973#endif
6974
7c16ec58 6975#define SD_INIT(sd, type) sd_init_##type(sd)
a5d8c348 6976
7c16ec58
MT
6977#define SD_INIT_FUNC(type) \
6978static noinline void sd_init_##type(struct sched_domain *sd) \
6979{ \
6980 memset(sd, 0, sizeof(*sd)); \
6981 *sd = SD_##type##_INIT; \
1d3504fc 6982 sd->level = SD_LV_##type; \
a5d8c348 6983 SD_INIT_NAME(sd, type); \
7c16ec58
MT
6984}
6985
6986SD_INIT_FUNC(CPU)
6987#ifdef CONFIG_NUMA
6988 SD_INIT_FUNC(ALLNODES)
6989 SD_INIT_FUNC(NODE)
6990#endif
6991#ifdef CONFIG_SCHED_SMT
6992 SD_INIT_FUNC(SIBLING)
6993#endif
6994#ifdef CONFIG_SCHED_MC
6995 SD_INIT_FUNC(MC)
6996#endif
01a08546
HC
6997#ifdef CONFIG_SCHED_BOOK
6998 SD_INIT_FUNC(BOOK)
6999#endif
7c16ec58 7000
1d3504fc
HS
7001static int default_relax_domain_level = -1;
7002
7003static int __init setup_relax_domain_level(char *str)
7004{
30e0e178
LZ
7005 unsigned long val;
7006
7007 val = simple_strtoul(str, NULL, 0);
7008 if (val < SD_LV_MAX)
7009 default_relax_domain_level = val;
7010
1d3504fc
HS
7011 return 1;
7012}
7013__setup("relax_domain_level=", setup_relax_domain_level);
7014
7015static void set_domain_attribute(struct sched_domain *sd,
7016 struct sched_domain_attr *attr)
7017{
7018 int request;
7019
7020 if (!attr || attr->relax_domain_level < 0) {
7021 if (default_relax_domain_level < 0)
7022 return;
7023 else
7024 request = default_relax_domain_level;
7025 } else
7026 request = attr->relax_domain_level;
7027 if (request < sd->level) {
7028 /* turn off idle balance on this domain */
c88d5910 7029 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
1d3504fc
HS
7030 } else {
7031 /* turn on idle balance on this domain */
c88d5910 7032 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
1d3504fc
HS
7033 }
7034}
7035
2109b99e
AH
7036static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7037 const struct cpumask *cpu_map)
7038{
7039 switch (what) {
7040 case sa_sched_groups:
7041 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7042 d->sched_group_nodes = NULL;
7043 case sa_rootdomain:
7044 free_rootdomain(d->rd); /* fall through */
7045 case sa_tmpmask:
7046 free_cpumask_var(d->tmpmask); /* fall through */
7047 case sa_send_covered:
7048 free_cpumask_var(d->send_covered); /* fall through */
01a08546
HC
7049 case sa_this_book_map:
7050 free_cpumask_var(d->this_book_map); /* fall through */
2109b99e
AH
7051 case sa_this_core_map:
7052 free_cpumask_var(d->this_core_map); /* fall through */
7053 case sa_this_sibling_map:
7054 free_cpumask_var(d->this_sibling_map); /* fall through */
7055 case sa_nodemask:
7056 free_cpumask_var(d->nodemask); /* fall through */
7057 case sa_sched_group_nodes:
d1b55138 7058#ifdef CONFIG_NUMA
2109b99e
AH
7059 kfree(d->sched_group_nodes); /* fall through */
7060 case sa_notcovered:
7061 free_cpumask_var(d->notcovered); /* fall through */
7062 case sa_covered:
7063 free_cpumask_var(d->covered); /* fall through */
7064 case sa_domainspan:
7065 free_cpumask_var(d->domainspan); /* fall through */
3404c8d9 7066#endif
2109b99e
AH
7067 case sa_none:
7068 break;
7069 }
7070}
3404c8d9 7071
2109b99e
AH
7072static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7073 const struct cpumask *cpu_map)
7074{
3404c8d9 7075#ifdef CONFIG_NUMA
2109b99e
AH
7076 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
7077 return sa_none;
7078 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
7079 return sa_domainspan;
7080 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
7081 return sa_covered;
7082 /* Allocate the per-node list of sched groups */
7083 d->sched_group_nodes = kcalloc(nr_node_ids,
7084 sizeof(struct sched_group *), GFP_KERNEL);
7085 if (!d->sched_group_nodes) {
3df0fc5b 7086 printk(KERN_WARNING "Can not alloc sched group node list\n");
2109b99e 7087 return sa_notcovered;
d1b55138 7088 }
2109b99e 7089 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
d1b55138 7090#endif
2109b99e
AH
7091 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7092 return sa_sched_group_nodes;
7093 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7094 return sa_nodemask;
7095 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7096 return sa_this_sibling_map;
01a08546 7097 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
2109b99e 7098 return sa_this_core_map;
01a08546
HC
7099 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7100 return sa_this_book_map;
2109b99e
AH
7101 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7102 return sa_send_covered;
7103 d->rd = alloc_rootdomain();
7104 if (!d->rd) {
3df0fc5b 7105 printk(KERN_WARNING "Cannot alloc root domain\n");
2109b99e 7106 return sa_tmpmask;
57d885fe 7107 }
2109b99e
AH
7108 return sa_rootdomain;
7109}
57d885fe 7110
7f4588f3
AH
7111static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
7112 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
7113{
7114 struct sched_domain *sd = NULL;
7c16ec58 7115#ifdef CONFIG_NUMA
7f4588f3 7116 struct sched_domain *parent;
1da177e4 7117
7f4588f3
AH
7118 d->sd_allnodes = 0;
7119 if (cpumask_weight(cpu_map) >
7120 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7121 sd = &per_cpu(allnodes_domains, i).sd;
7122 SD_INIT(sd, ALLNODES);
1d3504fc 7123 set_domain_attribute(sd, attr);
7f4588f3
AH
7124 cpumask_copy(sched_domain_span(sd), cpu_map);
7125 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7126 d->sd_allnodes = 1;
7127 }
7128 parent = sd;
7129
7130 sd = &per_cpu(node_domains, i).sd;
7131 SD_INIT(sd, NODE);
7132 set_domain_attribute(sd, attr);
7133 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7134 sd->parent = parent;
7135 if (parent)
7136 parent->child = sd;
7137 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
1da177e4 7138#endif
7f4588f3
AH
7139 return sd;
7140}
1da177e4 7141
87cce662
AH
7142static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
7143 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7144 struct sched_domain *parent, int i)
7145{
7146 struct sched_domain *sd;
7147 sd = &per_cpu(phys_domains, i).sd;
7148 SD_INIT(sd, CPU);
7149 set_domain_attribute(sd, attr);
7150 cpumask_copy(sched_domain_span(sd), d->nodemask);
7151 sd->parent = parent;
7152 if (parent)
7153 parent->child = sd;
7154 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7155 return sd;
7156}
1da177e4 7157
01a08546
HC
7158static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7159 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7160 struct sched_domain *parent, int i)
7161{
7162 struct sched_domain *sd = parent;
7163#ifdef CONFIG_SCHED_BOOK
7164 sd = &per_cpu(book_domains, i).sd;
7165 SD_INIT(sd, BOOK);
7166 set_domain_attribute(sd, attr);
7167 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7168 sd->parent = parent;
7169 parent->child = sd;
7170 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7171#endif
7172 return sd;
7173}
7174
410c4081
AH
7175static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
7176 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7177 struct sched_domain *parent, int i)
7178{
7179 struct sched_domain *sd = parent;
1e9f28fa 7180#ifdef CONFIG_SCHED_MC
410c4081
AH
7181 sd = &per_cpu(core_domains, i).sd;
7182 SD_INIT(sd, MC);
7183 set_domain_attribute(sd, attr);
7184 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7185 sd->parent = parent;
7186 parent->child = sd;
7187 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
1e9f28fa 7188#endif
410c4081
AH
7189 return sd;
7190}
1e9f28fa 7191
d8173535
AH
7192static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7193 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7194 struct sched_domain *parent, int i)
7195{
7196 struct sched_domain *sd = parent;
1da177e4 7197#ifdef CONFIG_SCHED_SMT
d8173535
AH
7198 sd = &per_cpu(cpu_domains, i).sd;
7199 SD_INIT(sd, SIBLING);
7200 set_domain_attribute(sd, attr);
7201 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7202 sd->parent = parent;
7203 parent->child = sd;
7204 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
1da177e4 7205#endif
d8173535
AH
7206 return sd;
7207}
1da177e4 7208
0e8e85c9
AH
7209static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
7210 const struct cpumask *cpu_map, int cpu)
7211{
7212 switch (l) {
1da177e4 7213#ifdef CONFIG_SCHED_SMT
0e8e85c9
AH
7214 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
7215 cpumask_and(d->this_sibling_map, cpu_map,
7216 topology_thread_cpumask(cpu));
7217 if (cpu == cpumask_first(d->this_sibling_map))
7218 init_sched_build_groups(d->this_sibling_map, cpu_map,
7219 &cpu_to_cpu_group,
7220 d->send_covered, d->tmpmask);
7221 break;
1da177e4 7222#endif
1e9f28fa 7223#ifdef CONFIG_SCHED_MC
a2af04cd
AH
7224 case SD_LV_MC: /* set up multi-core groups */
7225 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7226 if (cpu == cpumask_first(d->this_core_map))
7227 init_sched_build_groups(d->this_core_map, cpu_map,
7228 &cpu_to_core_group,
7229 d->send_covered, d->tmpmask);
7230 break;
01a08546
HC
7231#endif
7232#ifdef CONFIG_SCHED_BOOK
7233 case SD_LV_BOOK: /* set up book groups */
7234 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7235 if (cpu == cpumask_first(d->this_book_map))
7236 init_sched_build_groups(d->this_book_map, cpu_map,
7237 &cpu_to_book_group,
7238 d->send_covered, d->tmpmask);
7239 break;
1e9f28fa 7240#endif
86548096
AH
7241 case SD_LV_CPU: /* set up physical groups */
7242 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7243 if (!cpumask_empty(d->nodemask))
7244 init_sched_build_groups(d->nodemask, cpu_map,
7245 &cpu_to_phys_group,
7246 d->send_covered, d->tmpmask);
7247 break;
1da177e4 7248#ifdef CONFIG_NUMA
de616e36
AH
7249 case SD_LV_ALLNODES:
7250 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
7251 d->send_covered, d->tmpmask);
7252 break;
7253#endif
0e8e85c9
AH
7254 default:
7255 break;
7c16ec58 7256 }
0e8e85c9 7257}
9c1cfda2 7258
2109b99e
AH
7259/*
7260 * Build sched domains for a given set of cpus and attach the sched domains
7261 * to the individual cpus
7262 */
7263static int __build_sched_domains(const struct cpumask *cpu_map,
7264 struct sched_domain_attr *attr)
7265{
7266 enum s_alloc alloc_state = sa_none;
7267 struct s_data d;
294b0c96 7268 struct sched_domain *sd;
2109b99e 7269 int i;
7c16ec58 7270#ifdef CONFIG_NUMA
2109b99e 7271 d.sd_allnodes = 0;
7c16ec58 7272#endif
9c1cfda2 7273
2109b99e
AH
7274 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7275 if (alloc_state != sa_rootdomain)
7276 goto error;
7277 alloc_state = sa_sched_groups;
9c1cfda2 7278
1da177e4 7279 /*
1a20ff27 7280 * Set up domains for cpus specified by the cpu_map.
1da177e4 7281 */
abcd083a 7282 for_each_cpu(i, cpu_map) {
49a02c51
AH
7283 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
7284 cpu_map);
9761eea8 7285
7f4588f3 7286 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
87cce662 7287 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
01a08546 7288 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
410c4081 7289 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
d8173535 7290 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
1da177e4 7291 }
9c1cfda2 7292
abcd083a 7293 for_each_cpu(i, cpu_map) {
0e8e85c9 7294 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
01a08546 7295 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
a2af04cd 7296 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
1da177e4 7297 }
9c1cfda2 7298
1da177e4 7299 /* Set up physical groups */
86548096
AH
7300 for (i = 0; i < nr_node_ids; i++)
7301 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
9c1cfda2 7302
1da177e4
LT
7303#ifdef CONFIG_NUMA
7304 /* Set up node groups */
de616e36
AH
7305 if (d.sd_allnodes)
7306 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
9c1cfda2 7307
0601a88d
AH
7308 for (i = 0; i < nr_node_ids; i++)
7309 if (build_numa_sched_groups(&d, cpu_map, i))
51888ca2 7310 goto error;
1da177e4
LT
7311#endif
7312
7313 /* Calculate CPU power for physical packages and nodes */
5c45bf27 7314#ifdef CONFIG_SCHED_SMT
abcd083a 7315 for_each_cpu(i, cpu_map) {
294b0c96 7316 sd = &per_cpu(cpu_domains, i).sd;
89c4710e 7317 init_sched_groups_power(i, sd);
5c45bf27 7318 }
1da177e4 7319#endif
1e9f28fa 7320#ifdef CONFIG_SCHED_MC
abcd083a 7321 for_each_cpu(i, cpu_map) {
294b0c96 7322 sd = &per_cpu(core_domains, i).sd;
89c4710e 7323 init_sched_groups_power(i, sd);
5c45bf27
SS
7324 }
7325#endif
01a08546
HC
7326#ifdef CONFIG_SCHED_BOOK
7327 for_each_cpu(i, cpu_map) {
7328 sd = &per_cpu(book_domains, i).sd;
7329 init_sched_groups_power(i, sd);
7330 }
7331#endif
1e9f28fa 7332
abcd083a 7333 for_each_cpu(i, cpu_map) {
294b0c96 7334 sd = &per_cpu(phys_domains, i).sd;
89c4710e 7335 init_sched_groups_power(i, sd);
1da177e4
LT
7336 }
7337
9c1cfda2 7338#ifdef CONFIG_NUMA
076ac2af 7339 for (i = 0; i < nr_node_ids; i++)
49a02c51 7340 init_numa_sched_groups_power(d.sched_group_nodes[i]);
9c1cfda2 7341
49a02c51 7342 if (d.sd_allnodes) {
6711cab4 7343 struct sched_group *sg;
f712c0c7 7344
96f874e2 7345 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
49a02c51 7346 d.tmpmask);
f712c0c7
SS
7347 init_numa_sched_groups_power(sg);
7348 }
9c1cfda2
JH
7349#endif
7350
1da177e4 7351 /* Attach the domains */
abcd083a 7352 for_each_cpu(i, cpu_map) {
1da177e4 7353#ifdef CONFIG_SCHED_SMT
6c99e9ad 7354 sd = &per_cpu(cpu_domains, i).sd;
1e9f28fa 7355#elif defined(CONFIG_SCHED_MC)
6c99e9ad 7356 sd = &per_cpu(core_domains, i).sd;
01a08546
HC
7357#elif defined(CONFIG_SCHED_BOOK)
7358 sd = &per_cpu(book_domains, i).sd;
1da177e4 7359#else
6c99e9ad 7360 sd = &per_cpu(phys_domains, i).sd;
1da177e4 7361#endif
49a02c51 7362 cpu_attach_domain(sd, d.rd, i);
1da177e4 7363 }
51888ca2 7364
2109b99e
AH
7365 d.sched_group_nodes = NULL; /* don't free this we still need it */
7366 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7367 return 0;
51888ca2 7368
51888ca2 7369error:
2109b99e
AH
7370 __free_domain_allocs(&d, alloc_state, cpu_map);
7371 return -ENOMEM;
1da177e4 7372}
029190c5 7373
96f874e2 7374static int build_sched_domains(const struct cpumask *cpu_map)
1d3504fc
HS
7375{
7376 return __build_sched_domains(cpu_map, NULL);
7377}
7378
acc3f5d7 7379static cpumask_var_t *doms_cur; /* current sched domains */
029190c5 7380static int ndoms_cur; /* number of sched domains in 'doms_cur' */
4285f594
IM
7381static struct sched_domain_attr *dattr_cur;
7382 /* attribues of custom domains in 'doms_cur' */
029190c5
PJ
7383
7384/*
7385 * Special case: If a kmalloc of a doms_cur partition (array of
4212823f
RR
7386 * cpumask) fails, then fallback to a single sched domain,
7387 * as determined by the single cpumask fallback_doms.
029190c5 7388 */
4212823f 7389static cpumask_var_t fallback_doms;
029190c5 7390
ee79d1bd
HC
7391/*
7392 * arch_update_cpu_topology lets virtualized architectures update the
7393 * cpu core maps. It is supposed to return 1 if the topology changed
7394 * or 0 if it stayed the same.
7395 */
7396int __attribute__((weak)) arch_update_cpu_topology(void)
22e52b07 7397{
ee79d1bd 7398 return 0;
22e52b07
HC
7399}
7400
acc3f5d7
RR
7401cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7402{
7403 int i;
7404 cpumask_var_t *doms;
7405
7406 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7407 if (!doms)
7408 return NULL;
7409 for (i = 0; i < ndoms; i++) {
7410 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7411 free_sched_domains(doms, i);
7412 return NULL;
7413 }
7414 }
7415 return doms;
7416}
7417
7418void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7419{
7420 unsigned int i;
7421 for (i = 0; i < ndoms; i++)
7422 free_cpumask_var(doms[i]);
7423 kfree(doms);
7424}
7425
1a20ff27 7426/*
41a2d6cf 7427 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
029190c5
PJ
7428 * For now this just excludes isolated cpus, but could be used to
7429 * exclude other special cases in the future.
1a20ff27 7430 */
96f874e2 7431static int arch_init_sched_domains(const struct cpumask *cpu_map)
1a20ff27 7432{
7378547f
MM
7433 int err;
7434
22e52b07 7435 arch_update_cpu_topology();
029190c5 7436 ndoms_cur = 1;
acc3f5d7 7437 doms_cur = alloc_sched_domains(ndoms_cur);
029190c5 7438 if (!doms_cur)
acc3f5d7
RR
7439 doms_cur = &fallback_doms;
7440 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
1d3504fc 7441 dattr_cur = NULL;
acc3f5d7 7442 err = build_sched_domains(doms_cur[0]);
6382bc90 7443 register_sched_domain_sysctl();
7378547f
MM
7444
7445 return err;
1a20ff27
DG
7446}
7447
96f874e2
RR
7448static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7449 struct cpumask *tmpmask)
1da177e4 7450{
7c16ec58 7451 free_sched_groups(cpu_map, tmpmask);
9c1cfda2 7452}
1da177e4 7453
1a20ff27
DG
7454/*
7455 * Detach sched domains from a group of cpus specified in cpu_map
7456 * These cpus will now be attached to the NULL domain
7457 */
96f874e2 7458static void detach_destroy_domains(const struct cpumask *cpu_map)
1a20ff27 7459{
96f874e2
RR
7460 /* Save because hotplug lock held. */
7461 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
1a20ff27
DG
7462 int i;
7463
abcd083a 7464 for_each_cpu(i, cpu_map)
57d885fe 7465 cpu_attach_domain(NULL, &def_root_domain, i);
1a20ff27 7466 synchronize_sched();
96f874e2 7467 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
1a20ff27
DG
7468}
7469
1d3504fc
HS
7470/* handle null as "default" */
7471static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7472 struct sched_domain_attr *new, int idx_new)
7473{
7474 struct sched_domain_attr tmp;
7475
7476 /* fast path */
7477 if (!new && !cur)
7478 return 1;
7479
7480 tmp = SD_ATTR_INIT;
7481 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7482 new ? (new + idx_new) : &tmp,
7483 sizeof(struct sched_domain_attr));
7484}
7485
029190c5
PJ
7486/*
7487 * Partition sched domains as specified by the 'ndoms_new'
41a2d6cf 7488 * cpumasks in the array doms_new[] of cpumasks. This compares
029190c5
PJ
7489 * doms_new[] to the current sched domain partitioning, doms_cur[].
7490 * It destroys each deleted domain and builds each new domain.
7491 *
acc3f5d7 7492 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
41a2d6cf
IM
7493 * The masks don't intersect (don't overlap.) We should setup one
7494 * sched domain for each mask. CPUs not in any of the cpumasks will
7495 * not be load balanced. If the same cpumask appears both in the
029190c5
PJ
7496 * current 'doms_cur' domains and in the new 'doms_new', we can leave
7497 * it as it is.
7498 *
acc3f5d7
RR
7499 * The passed in 'doms_new' should be allocated using
7500 * alloc_sched_domains. This routine takes ownership of it and will
7501 * free_sched_domains it when done with it. If the caller failed the
7502 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
7503 * and partition_sched_domains() will fallback to the single partition
7504 * 'fallback_doms', it also forces the domains to be rebuilt.
029190c5 7505 *
96f874e2 7506 * If doms_new == NULL it will be replaced with cpu_online_mask.
700018e0
LZ
7507 * ndoms_new == 0 is a special case for destroying existing domains,
7508 * and it will not create the default domain.
dfb512ec 7509 *
029190c5
PJ
7510 * Call with hotplug lock held
7511 */
acc3f5d7 7512void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1d3504fc 7513 struct sched_domain_attr *dattr_new)
029190c5 7514{
dfb512ec 7515 int i, j, n;
d65bd5ec 7516 int new_topology;
029190c5 7517
712555ee 7518 mutex_lock(&sched_domains_mutex);
a1835615 7519
7378547f
MM
7520 /* always unregister in case we don't destroy any domains */
7521 unregister_sched_domain_sysctl();
7522
d65bd5ec
HC
7523 /* Let architecture update cpu core mappings. */
7524 new_topology = arch_update_cpu_topology();
7525
dfb512ec 7526 n = doms_new ? ndoms_new : 0;
029190c5
PJ
7527
7528 /* Destroy deleted domains */
7529 for (i = 0; i < ndoms_cur; i++) {
d65bd5ec 7530 for (j = 0; j < n && !new_topology; j++) {
acc3f5d7 7531 if (cpumask_equal(doms_cur[i], doms_new[j])
1d3504fc 7532 && dattrs_equal(dattr_cur, i, dattr_new, j))
029190c5
PJ
7533 goto match1;
7534 }
7535 /* no match - a current sched domain not in new doms_new[] */
acc3f5d7 7536 detach_destroy_domains(doms_cur[i]);
029190c5
PJ
7537match1:
7538 ;
7539 }
7540
e761b772
MK
7541 if (doms_new == NULL) {
7542 ndoms_cur = 0;
acc3f5d7 7543 doms_new = &fallback_doms;
6ad4c188 7544 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
faa2f98f 7545 WARN_ON_ONCE(dattr_new);
e761b772
MK
7546 }
7547
029190c5
PJ
7548 /* Build new domains */
7549 for (i = 0; i < ndoms_new; i++) {
d65bd5ec 7550 for (j = 0; j < ndoms_cur && !new_topology; j++) {
acc3f5d7 7551 if (cpumask_equal(doms_new[i], doms_cur[j])
1d3504fc 7552 && dattrs_equal(dattr_new, i, dattr_cur, j))
029190c5
PJ
7553 goto match2;
7554 }
7555 /* no match - add a new doms_new */
acc3f5d7 7556 __build_sched_domains(doms_new[i],
1d3504fc 7557 dattr_new ? dattr_new + i : NULL);
029190c5
PJ
7558match2:
7559 ;
7560 }
7561
7562 /* Remember the new sched domains */
acc3f5d7
RR
7563 if (doms_cur != &fallback_doms)
7564 free_sched_domains(doms_cur, ndoms_cur);
1d3504fc 7565 kfree(dattr_cur); /* kfree(NULL) is safe */
029190c5 7566 doms_cur = doms_new;
1d3504fc 7567 dattr_cur = dattr_new;
029190c5 7568 ndoms_cur = ndoms_new;
7378547f
MM
7569
7570 register_sched_domain_sysctl();
a1835615 7571
712555ee 7572 mutex_unlock(&sched_domains_mutex);
029190c5
PJ
7573}
7574
5c45bf27 7575#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
c70f22d2 7576static void arch_reinit_sched_domains(void)
5c45bf27 7577{
95402b38 7578 get_online_cpus();
dfb512ec
MK
7579
7580 /* Destroy domains first to force the rebuild */
7581 partition_sched_domains(0, NULL, NULL);
7582
e761b772 7583 rebuild_sched_domains();
95402b38 7584 put_online_cpus();
5c45bf27
SS
7585}
7586
7587static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7588{
afb8a9b7 7589 unsigned int level = 0;
5c45bf27 7590
afb8a9b7
GS
7591 if (sscanf(buf, "%u", &level) != 1)
7592 return -EINVAL;
7593
7594 /*
7595 * level is always be positive so don't check for
7596 * level < POWERSAVINGS_BALANCE_NONE which is 0
7597 * What happens on 0 or 1 byte write,
7598 * need to check for count as well?
7599 */
7600
7601 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
5c45bf27
SS
7602 return -EINVAL;
7603
7604 if (smt)
afb8a9b7 7605 sched_smt_power_savings = level;
5c45bf27 7606 else
afb8a9b7 7607 sched_mc_power_savings = level;
5c45bf27 7608
c70f22d2 7609 arch_reinit_sched_domains();
5c45bf27 7610
c70f22d2 7611 return count;
5c45bf27
SS
7612}
7613
5c45bf27 7614#ifdef CONFIG_SCHED_MC
f718cd4a 7615static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
c9be0a36 7616 struct sysdev_class_attribute *attr,
f718cd4a 7617 char *page)
5c45bf27
SS
7618{
7619 return sprintf(page, "%u\n", sched_mc_power_savings);
7620}
f718cd4a 7621static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
c9be0a36 7622 struct sysdev_class_attribute *attr,
48f24c4d 7623 const char *buf, size_t count)
5c45bf27
SS
7624{
7625 return sched_power_savings_store(buf, count, 0);
7626}
f718cd4a
AK
7627static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7628 sched_mc_power_savings_show,
7629 sched_mc_power_savings_store);
5c45bf27
SS
7630#endif
7631
7632#ifdef CONFIG_SCHED_SMT
f718cd4a 7633static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
c9be0a36 7634 struct sysdev_class_attribute *attr,
f718cd4a 7635 char *page)
5c45bf27
SS
7636{
7637 return sprintf(page, "%u\n", sched_smt_power_savings);
7638}
f718cd4a 7639static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
c9be0a36 7640 struct sysdev_class_attribute *attr,
48f24c4d 7641 const char *buf, size_t count)
5c45bf27
SS
7642{
7643 return sched_power_savings_store(buf, count, 1);
7644}
f718cd4a
AK
7645static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7646 sched_smt_power_savings_show,
6707de00
AB
7647 sched_smt_power_savings_store);
7648#endif
7649
39aac648 7650int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6707de00
AB
7651{
7652 int err = 0;
7653
7654#ifdef CONFIG_SCHED_SMT
7655 if (smt_capable())
7656 err = sysfs_create_file(&cls->kset.kobj,
7657 &attr_sched_smt_power_savings.attr);
7658#endif
7659#ifdef CONFIG_SCHED_MC
7660 if (!err && mc_capable())
7661 err = sysfs_create_file(&cls->kset.kobj,
7662 &attr_sched_mc_power_savings.attr);
7663#endif
7664 return err;
7665}
6d6bc0ad 7666#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
5c45bf27 7667
1da177e4 7668/*
3a101d05
TH
7669 * Update cpusets according to cpu_active mask. If cpusets are
7670 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7671 * around partition_sched_domains().
1da177e4 7672 */
0b2e918a
TH
7673static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7674 void *hcpu)
e761b772 7675{
3a101d05 7676 switch (action & ~CPU_TASKS_FROZEN) {
e761b772 7677 case CPU_ONLINE:
6ad4c188 7678 case CPU_DOWN_FAILED:
3a101d05 7679 cpuset_update_active_cpus();
e761b772 7680 return NOTIFY_OK;
3a101d05
TH
7681 default:
7682 return NOTIFY_DONE;
7683 }
7684}
e761b772 7685
0b2e918a
TH
7686static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7687 void *hcpu)
3a101d05
TH
7688{
7689 switch (action & ~CPU_TASKS_FROZEN) {
7690 case CPU_DOWN_PREPARE:
7691 cpuset_update_active_cpus();
7692 return NOTIFY_OK;
e761b772
MK
7693 default:
7694 return NOTIFY_DONE;
7695 }
7696}
e761b772
MK
7697
7698static int update_runtime(struct notifier_block *nfb,
7699 unsigned long action, void *hcpu)
1da177e4 7700{
7def2be1
PZ
7701 int cpu = (int)(long)hcpu;
7702
1da177e4 7703 switch (action) {
1da177e4 7704 case CPU_DOWN_PREPARE:
8bb78442 7705 case CPU_DOWN_PREPARE_FROZEN:
7def2be1 7706 disable_runtime(cpu_rq(cpu));
1da177e4
LT
7707 return NOTIFY_OK;
7708
1da177e4 7709 case CPU_DOWN_FAILED:
8bb78442 7710 case CPU_DOWN_FAILED_FROZEN:
1da177e4 7711 case CPU_ONLINE:
8bb78442 7712 case CPU_ONLINE_FROZEN:
7def2be1 7713 enable_runtime(cpu_rq(cpu));
e761b772
MK
7714 return NOTIFY_OK;
7715
1da177e4
LT
7716 default:
7717 return NOTIFY_DONE;
7718 }
1da177e4 7719}
1da177e4
LT
7720
7721void __init sched_init_smp(void)
7722{
dcc30a35
RR
7723 cpumask_var_t non_isolated_cpus;
7724
7725 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
cb5fd13f 7726 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
5c1e1767 7727
434d53b0
MT
7728#if defined(CONFIG_NUMA)
7729 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7730 GFP_KERNEL);
7731 BUG_ON(sched_group_nodes_bycpu == NULL);
7732#endif
95402b38 7733 get_online_cpus();
712555ee 7734 mutex_lock(&sched_domains_mutex);
6ad4c188 7735 arch_init_sched_domains(cpu_active_mask);
dcc30a35
RR
7736 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7737 if (cpumask_empty(non_isolated_cpus))
7738 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
712555ee 7739 mutex_unlock(&sched_domains_mutex);
95402b38 7740 put_online_cpus();
e761b772 7741
3a101d05
TH
7742 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7743 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
e761b772
MK
7744
7745 /* RT runtime code needs to handle some hotplug events */
7746 hotcpu_notifier(update_runtime, 0);
7747
b328ca18 7748 init_hrtick();
5c1e1767
NP
7749
7750 /* Move init over to a non-isolated CPU */
dcc30a35 7751 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
5c1e1767 7752 BUG();
19978ca6 7753 sched_init_granularity();
dcc30a35 7754 free_cpumask_var(non_isolated_cpus);
4212823f 7755
0e3900e6 7756 init_sched_rt_class();
1da177e4
LT
7757}
7758#else
7759void __init sched_init_smp(void)
7760{
19978ca6 7761 sched_init_granularity();
1da177e4
LT
7762}
7763#endif /* CONFIG_SMP */
7764
cd1bb94b
AB
7765const_debug unsigned int sysctl_timer_migration = 1;
7766
1da177e4
LT
7767int in_sched_functions(unsigned long addr)
7768{
1da177e4
LT
7769 return in_lock_functions(addr) ||
7770 (addr >= (unsigned long)__sched_text_start
7771 && addr < (unsigned long)__sched_text_end);
7772}
7773
a9957449 7774static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
dd41f596
IM
7775{
7776 cfs_rq->tasks_timeline = RB_ROOT;
4a55bd5e 7777 INIT_LIST_HEAD(&cfs_rq->tasks);
dd41f596
IM
7778#ifdef CONFIG_FAIR_GROUP_SCHED
7779 cfs_rq->rq = rq;
7780#endif
67e9fb2a 7781 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
dd41f596
IM
7782}
7783
fa85ae24
PZ
7784static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7785{
7786 struct rt_prio_array *array;
7787 int i;
7788
7789 array = &rt_rq->active;
7790 for (i = 0; i < MAX_RT_PRIO; i++) {
7791 INIT_LIST_HEAD(array->queue + i);
7792 __clear_bit(i, array->bitmap);
7793 }
7794 /* delimiter for bitsearch: */
7795 __set_bit(MAX_RT_PRIO, array->bitmap);
7796
052f1dc7 7797#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
e864c499 7798 rt_rq->highest_prio.curr = MAX_RT_PRIO;
398a153b 7799#ifdef CONFIG_SMP
e864c499 7800 rt_rq->highest_prio.next = MAX_RT_PRIO;
48d5e258 7801#endif
48d5e258 7802#endif
fa85ae24
PZ
7803#ifdef CONFIG_SMP
7804 rt_rq->rt_nr_migratory = 0;
fa85ae24 7805 rt_rq->overloaded = 0;
05fa785c 7806 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
fa85ae24
PZ
7807#endif
7808
7809 rt_rq->rt_time = 0;
7810 rt_rq->rt_throttled = 0;
ac086bc2 7811 rt_rq->rt_runtime = 0;
0986b11b 7812 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
6f505b16 7813
052f1dc7 7814#ifdef CONFIG_RT_GROUP_SCHED
23b0fdfc 7815 rt_rq->rt_nr_boosted = 0;
6f505b16
PZ
7816 rt_rq->rq = rq;
7817#endif
fa85ae24
PZ
7818}
7819
6f505b16 7820#ifdef CONFIG_FAIR_GROUP_SCHED
ec7dc8ac
DG
7821static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7822 struct sched_entity *se, int cpu, int add,
7823 struct sched_entity *parent)
6f505b16 7824{
ec7dc8ac 7825 struct rq *rq = cpu_rq(cpu);
6f505b16
PZ
7826 tg->cfs_rq[cpu] = cfs_rq;
7827 init_cfs_rq(cfs_rq, rq);
7828 cfs_rq->tg = tg;
7829 if (add)
7830 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7831
7832 tg->se[cpu] = se;
354d60c2
DG
7833 /* se could be NULL for init_task_group */
7834 if (!se)
7835 return;
7836
ec7dc8ac
DG
7837 if (!parent)
7838 se->cfs_rq = &rq->cfs;
7839 else
7840 se->cfs_rq = parent->my_q;
7841
6f505b16
PZ
7842 se->my_q = cfs_rq;
7843 se->load.weight = tg->shares;
e05510d0 7844 se->load.inv_weight = 0;
ec7dc8ac 7845 se->parent = parent;
6f505b16 7846}
052f1dc7 7847#endif
6f505b16 7848
052f1dc7 7849#ifdef CONFIG_RT_GROUP_SCHED
ec7dc8ac
DG
7850static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7851 struct sched_rt_entity *rt_se, int cpu, int add,
7852 struct sched_rt_entity *parent)
6f505b16 7853{
ec7dc8ac
DG
7854 struct rq *rq = cpu_rq(cpu);
7855
6f505b16
PZ
7856 tg->rt_rq[cpu] = rt_rq;
7857 init_rt_rq(rt_rq, rq);
7858 rt_rq->tg = tg;
ac086bc2 7859 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
6f505b16
PZ
7860 if (add)
7861 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7862
7863 tg->rt_se[cpu] = rt_se;
354d60c2
DG
7864 if (!rt_se)
7865 return;
7866
ec7dc8ac
DG
7867 if (!parent)
7868 rt_se->rt_rq = &rq->rt;
7869 else
7870 rt_se->rt_rq = parent->my_q;
7871
6f505b16 7872 rt_se->my_q = rt_rq;
ec7dc8ac 7873 rt_se->parent = parent;
6f505b16
PZ
7874 INIT_LIST_HEAD(&rt_se->run_list);
7875}
7876#endif
7877
1da177e4
LT
7878void __init sched_init(void)
7879{
dd41f596 7880 int i, j;
434d53b0
MT
7881 unsigned long alloc_size = 0, ptr;
7882
7883#ifdef CONFIG_FAIR_GROUP_SCHED
7884 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7885#endif
7886#ifdef CONFIG_RT_GROUP_SCHED
7887 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
eff766a6 7888#endif
df7c8e84 7889#ifdef CONFIG_CPUMASK_OFFSTACK
8c083f08 7890 alloc_size += num_possible_cpus() * cpumask_size();
434d53b0 7891#endif
434d53b0 7892 if (alloc_size) {
36b7b6d4 7893 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
434d53b0
MT
7894
7895#ifdef CONFIG_FAIR_GROUP_SCHED
7896 init_task_group.se = (struct sched_entity **)ptr;
7897 ptr += nr_cpu_ids * sizeof(void **);
7898
7899 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
7900 ptr += nr_cpu_ids * sizeof(void **);
eff766a6 7901
6d6bc0ad 7902#endif /* CONFIG_FAIR_GROUP_SCHED */
434d53b0
MT
7903#ifdef CONFIG_RT_GROUP_SCHED
7904 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7905 ptr += nr_cpu_ids * sizeof(void **);
7906
7907 init_task_group.rt_rq = (struct rt_rq **)ptr;
eff766a6
PZ
7908 ptr += nr_cpu_ids * sizeof(void **);
7909
6d6bc0ad 7910#endif /* CONFIG_RT_GROUP_SCHED */
df7c8e84
RR
7911#ifdef CONFIG_CPUMASK_OFFSTACK
7912 for_each_possible_cpu(i) {
7913 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
7914 ptr += cpumask_size();
7915 }
7916#endif /* CONFIG_CPUMASK_OFFSTACK */
434d53b0 7917 }
dd41f596 7918
57d885fe
GH
7919#ifdef CONFIG_SMP
7920 init_defrootdomain();
7921#endif
7922
d0b27fa7
PZ
7923 init_rt_bandwidth(&def_rt_bandwidth,
7924 global_rt_period(), global_rt_runtime());
7925
7926#ifdef CONFIG_RT_GROUP_SCHED
7927 init_rt_bandwidth(&init_task_group.rt_bandwidth,
7928 global_rt_period(), global_rt_runtime());
6d6bc0ad 7929#endif /* CONFIG_RT_GROUP_SCHED */
d0b27fa7 7930
7c941438 7931#ifdef CONFIG_CGROUP_SCHED
6f505b16 7932 list_add(&init_task_group.list, &task_groups);
f473aa5e
PZ
7933 INIT_LIST_HEAD(&init_task_group.children);
7934
7c941438 7935#endif /* CONFIG_CGROUP_SCHED */
6f505b16 7936
4a6cc4bd
JK
7937#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7938 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7939 __alignof__(unsigned long));
7940#endif
0a945022 7941 for_each_possible_cpu(i) {
70b97a7f 7942 struct rq *rq;
1da177e4
LT
7943
7944 rq = cpu_rq(i);
05fa785c 7945 raw_spin_lock_init(&rq->lock);
7897986b 7946 rq->nr_running = 0;
dce48a84
TG
7947 rq->calc_load_active = 0;
7948 rq->calc_load_update = jiffies + LOAD_FREQ;
dd41f596 7949 init_cfs_rq(&rq->cfs, rq);
6f505b16 7950 init_rt_rq(&rq->rt, rq);
dd41f596 7951#ifdef CONFIG_FAIR_GROUP_SCHED
4cf86d77 7952 init_task_group.shares = init_task_group_load;
6f505b16 7953 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
354d60c2
DG
7954#ifdef CONFIG_CGROUP_SCHED
7955 /*
7956 * How much cpu bandwidth does init_task_group get?
7957 *
7958 * In case of task-groups formed thr' the cgroup filesystem, it
7959 * gets 100% of the cpu resources in the system. This overall
7960 * system cpu resource is divided among the tasks of
7961 * init_task_group and its child task-groups in a fair manner,
7962 * based on each entity's (task or task-group's) weight
7963 * (se->load.weight).
7964 *
7965 * In other words, if init_task_group has 10 tasks of weight
7966 * 1024) and two child groups A0 and A1 (of weight 1024 each),
7967 * then A0's share of the cpu resource is:
7968 *
0d905bca 7969 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
354d60c2
DG
7970 *
7971 * We achieve this by letting init_task_group's tasks sit
7972 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
7973 */
ec7dc8ac 7974 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
052f1dc7 7975#endif
354d60c2
DG
7976#endif /* CONFIG_FAIR_GROUP_SCHED */
7977
7978 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
052f1dc7 7979#ifdef CONFIG_RT_GROUP_SCHED
6f505b16 7980 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
354d60c2 7981#ifdef CONFIG_CGROUP_SCHED
ec7dc8ac 7982 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
354d60c2 7983#endif
dd41f596 7984#endif
1da177e4 7985
dd41f596
IM
7986 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7987 rq->cpu_load[j] = 0;
fdf3e95d
VP
7988
7989 rq->last_load_update_tick = jiffies;
7990
1da177e4 7991#ifdef CONFIG_SMP
41c7ce9a 7992 rq->sd = NULL;
57d885fe 7993 rq->rd = NULL;
e51fd5e2 7994 rq->cpu_power = SCHED_LOAD_SCALE;
3f029d3c 7995 rq->post_schedule = 0;
1da177e4 7996 rq->active_balance = 0;
dd41f596 7997 rq->next_balance = jiffies;
1da177e4 7998 rq->push_cpu = 0;
0a2966b4 7999 rq->cpu = i;
1f11eb6a 8000 rq->online = 0;
eae0c9df
MG
8001 rq->idle_stamp = 0;
8002 rq->avg_idle = 2*sysctl_sched_migration_cost;
dc938520 8003 rq_attach_root(rq, &def_root_domain);
83cd4fe2
VP
8004#ifdef CONFIG_NO_HZ
8005 rq->nohz_balance_kick = 0;
8006 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8007#endif
1da177e4 8008#endif
8f4d37ec 8009 init_rq_hrtick(rq);
1da177e4 8010 atomic_set(&rq->nr_iowait, 0);
1da177e4
LT
8011 }
8012
2dd73a4f 8013 set_load_weight(&init_task);
b50f60ce 8014
e107be36
AK
8015#ifdef CONFIG_PREEMPT_NOTIFIERS
8016 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8017#endif
8018
c9819f45 8019#ifdef CONFIG_SMP
962cf36c 8020 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
c9819f45
CL
8021#endif
8022
b50f60ce 8023#ifdef CONFIG_RT_MUTEXES
1d615482 8024 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
b50f60ce
HC
8025#endif
8026
1da177e4
LT
8027 /*
8028 * The boot idle thread does lazy MMU switching as well:
8029 */
8030 atomic_inc(&init_mm.mm_count);
8031 enter_lazy_tlb(&init_mm, current);
8032
8033 /*
8034 * Make us the idle thread. Technically, schedule() should not be
8035 * called from this thread, however somewhere below it might be,
8036 * but because we are the idle thread, we just pick up running again
8037 * when this runqueue becomes "idle".
8038 */
8039 init_idle(current, smp_processor_id());
dce48a84
TG
8040
8041 calc_load_update = jiffies + LOAD_FREQ;
8042
dd41f596
IM
8043 /*
8044 * During early bootup we pretend to be a normal task:
8045 */
8046 current->sched_class = &fair_sched_class;
6892b75e 8047
6a7b3dc3 8048 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
49557e62 8049 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
bf4d83f6 8050#ifdef CONFIG_SMP
7d1e6a9b 8051#ifdef CONFIG_NO_HZ
83cd4fe2
VP
8052 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8053 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8054 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8055 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8056 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7d1e6a9b 8057#endif
bdddd296
RR
8058 /* May be allocated at isolcpus cmdline parse time */
8059 if (cpu_isolated_map == NULL)
8060 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
bf4d83f6 8061#endif /* SMP */
6a7b3dc3 8062
cdd6c482 8063 perf_event_init();
0d905bca 8064
6892b75e 8065 scheduler_running = 1;
1da177e4
LT
8066}
8067
8068#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
e4aafea2
FW
8069static inline int preempt_count_equals(int preempt_offset)
8070{
234da7bc 8071 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
e4aafea2
FW
8072
8073 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
8074}
8075
d894837f 8076void __might_sleep(const char *file, int line, int preempt_offset)
1da177e4 8077{
48f24c4d 8078#ifdef in_atomic
1da177e4
LT
8079 static unsigned long prev_jiffy; /* ratelimiting */
8080
e4aafea2
FW
8081 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8082 system_state != SYSTEM_RUNNING || oops_in_progress)
aef745fc
IM
8083 return;
8084 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8085 return;
8086 prev_jiffy = jiffies;
8087
3df0fc5b
PZ
8088 printk(KERN_ERR
8089 "BUG: sleeping function called from invalid context at %s:%d\n",
8090 file, line);
8091 printk(KERN_ERR
8092 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8093 in_atomic(), irqs_disabled(),
8094 current->pid, current->comm);
aef745fc
IM
8095
8096 debug_show_held_locks(current);
8097 if (irqs_disabled())
8098 print_irqtrace_events(current);
8099 dump_stack();
1da177e4
LT
8100#endif
8101}
8102EXPORT_SYMBOL(__might_sleep);
8103#endif
8104
8105#ifdef CONFIG_MAGIC_SYSRQ
3a5e4dc1
AK
8106static void normalize_task(struct rq *rq, struct task_struct *p)
8107{
8108 int on_rq;
3e51f33f 8109
3a5e4dc1
AK
8110 on_rq = p->se.on_rq;
8111 if (on_rq)
8112 deactivate_task(rq, p, 0);
8113 __setscheduler(rq, p, SCHED_NORMAL, 0);
8114 if (on_rq) {
8115 activate_task(rq, p, 0);
8116 resched_task(rq->curr);
8117 }
8118}
8119
1da177e4
LT
8120void normalize_rt_tasks(void)
8121{
a0f98a1c 8122 struct task_struct *g, *p;
1da177e4 8123 unsigned long flags;
70b97a7f 8124 struct rq *rq;
1da177e4 8125
4cf5d77a 8126 read_lock_irqsave(&tasklist_lock, flags);
a0f98a1c 8127 do_each_thread(g, p) {
178be793
IM
8128 /*
8129 * Only normalize user tasks:
8130 */
8131 if (!p->mm)
8132 continue;
8133
6cfb0d5d 8134 p->se.exec_start = 0;
6cfb0d5d 8135#ifdef CONFIG_SCHEDSTATS
41acab88
LDM
8136 p->se.statistics.wait_start = 0;
8137 p->se.statistics.sleep_start = 0;
8138 p->se.statistics.block_start = 0;
6cfb0d5d 8139#endif
dd41f596
IM
8140
8141 if (!rt_task(p)) {
8142 /*
8143 * Renice negative nice level userspace
8144 * tasks back to 0:
8145 */
8146 if (TASK_NICE(p) < 0 && p->mm)
8147 set_user_nice(p, 0);
1da177e4 8148 continue;
dd41f596 8149 }
1da177e4 8150
1d615482 8151 raw_spin_lock(&p->pi_lock);
b29739f9 8152 rq = __task_rq_lock(p);
1da177e4 8153
178be793 8154 normalize_task(rq, p);
3a5e4dc1 8155
b29739f9 8156 __task_rq_unlock(rq);
1d615482 8157 raw_spin_unlock(&p->pi_lock);
a0f98a1c
IM
8158 } while_each_thread(g, p);
8159
4cf5d77a 8160 read_unlock_irqrestore(&tasklist_lock, flags);
1da177e4
LT
8161}
8162
8163#endif /* CONFIG_MAGIC_SYSRQ */
1df5c10a 8164
67fc4e0c 8165#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
1df5c10a 8166/*
67fc4e0c 8167 * These functions are only useful for the IA64 MCA handling, or kdb.
1df5c10a
LT
8168 *
8169 * They can only be called when the whole system has been
8170 * stopped - every CPU needs to be quiescent, and no scheduling
8171 * activity can take place. Using them for anything else would
8172 * be a serious bug, and as a result, they aren't even visible
8173 * under any other configuration.
8174 */
8175
8176/**
8177 * curr_task - return the current task for a given cpu.
8178 * @cpu: the processor in question.
8179 *
8180 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8181 */
36c8b586 8182struct task_struct *curr_task(int cpu)
1df5c10a
LT
8183{
8184 return cpu_curr(cpu);
8185}
8186
67fc4e0c
JW
8187#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8188
8189#ifdef CONFIG_IA64
1df5c10a
LT
8190/**
8191 * set_curr_task - set the current task for a given cpu.
8192 * @cpu: the processor in question.
8193 * @p: the task pointer to set.
8194 *
8195 * Description: This function must only be used when non-maskable interrupts
41a2d6cf
IM
8196 * are serviced on a separate stack. It allows the architecture to switch the
8197 * notion of the current task on a cpu in a non-blocking manner. This function
1df5c10a
LT
8198 * must be called with all CPU's synchronized, and interrupts disabled, the
8199 * and caller must save the original value of the current task (see
8200 * curr_task() above) and restore that value before reenabling interrupts and
8201 * re-starting the system.
8202 *
8203 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8204 */
36c8b586 8205void set_curr_task(int cpu, struct task_struct *p)
1df5c10a
LT
8206{
8207 cpu_curr(cpu) = p;
8208}
8209
8210#endif
29f59db3 8211
bccbe08a
PZ
8212#ifdef CONFIG_FAIR_GROUP_SCHED
8213static void free_fair_sched_group(struct task_group *tg)
6f505b16
PZ
8214{
8215 int i;
8216
8217 for_each_possible_cpu(i) {
8218 if (tg->cfs_rq)
8219 kfree(tg->cfs_rq[i]);
8220 if (tg->se)
8221 kfree(tg->se[i]);
6f505b16
PZ
8222 }
8223
8224 kfree(tg->cfs_rq);
8225 kfree(tg->se);
6f505b16
PZ
8226}
8227
ec7dc8ac
DG
8228static
8229int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
29f59db3 8230{
29f59db3 8231 struct cfs_rq *cfs_rq;
eab17229 8232 struct sched_entity *se;
9b5b7751 8233 struct rq *rq;
29f59db3
SV
8234 int i;
8235
434d53b0 8236 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
29f59db3
SV
8237 if (!tg->cfs_rq)
8238 goto err;
434d53b0 8239 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
29f59db3
SV
8240 if (!tg->se)
8241 goto err;
052f1dc7
PZ
8242
8243 tg->shares = NICE_0_LOAD;
29f59db3
SV
8244
8245 for_each_possible_cpu(i) {
9b5b7751 8246 rq = cpu_rq(i);
29f59db3 8247
eab17229
LZ
8248 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8249 GFP_KERNEL, cpu_to_node(i));
29f59db3
SV
8250 if (!cfs_rq)
8251 goto err;
8252
eab17229
LZ
8253 se = kzalloc_node(sizeof(struct sched_entity),
8254 GFP_KERNEL, cpu_to_node(i));
29f59db3 8255 if (!se)
dfc12eb2 8256 goto err_free_rq;
29f59db3 8257
eab17229 8258 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
bccbe08a
PZ
8259 }
8260
8261 return 1;
8262
49246274 8263err_free_rq:
dfc12eb2 8264 kfree(cfs_rq);
49246274 8265err:
bccbe08a
PZ
8266 return 0;
8267}
8268
8269static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8270{
8271 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8272 &cpu_rq(cpu)->leaf_cfs_rq_list);
8273}
8274
8275static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8276{
8277 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8278}
6d6bc0ad 8279#else /* !CONFG_FAIR_GROUP_SCHED */
bccbe08a
PZ
8280static inline void free_fair_sched_group(struct task_group *tg)
8281{
8282}
8283
ec7dc8ac
DG
8284static inline
8285int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
bccbe08a
PZ
8286{
8287 return 1;
8288}
8289
8290static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8291{
8292}
8293
8294static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8295{
8296}
6d6bc0ad 8297#endif /* CONFIG_FAIR_GROUP_SCHED */
052f1dc7
PZ
8298
8299#ifdef CONFIG_RT_GROUP_SCHED
bccbe08a
PZ
8300static void free_rt_sched_group(struct task_group *tg)
8301{
8302 int i;
8303
d0b27fa7
PZ
8304 destroy_rt_bandwidth(&tg->rt_bandwidth);
8305
bccbe08a
PZ
8306 for_each_possible_cpu(i) {
8307 if (tg->rt_rq)
8308 kfree(tg->rt_rq[i]);
8309 if (tg->rt_se)
8310 kfree(tg->rt_se[i]);
8311 }
8312
8313 kfree(tg->rt_rq);
8314 kfree(tg->rt_se);
8315}
8316
ec7dc8ac
DG
8317static
8318int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
bccbe08a
PZ
8319{
8320 struct rt_rq *rt_rq;
eab17229 8321 struct sched_rt_entity *rt_se;
bccbe08a
PZ
8322 struct rq *rq;
8323 int i;
8324
434d53b0 8325 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
bccbe08a
PZ
8326 if (!tg->rt_rq)
8327 goto err;
434d53b0 8328 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
bccbe08a
PZ
8329 if (!tg->rt_se)
8330 goto err;
8331
d0b27fa7
PZ
8332 init_rt_bandwidth(&tg->rt_bandwidth,
8333 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
bccbe08a
PZ
8334
8335 for_each_possible_cpu(i) {
8336 rq = cpu_rq(i);
8337
eab17229
LZ
8338 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8339 GFP_KERNEL, cpu_to_node(i));
6f505b16
PZ
8340 if (!rt_rq)
8341 goto err;
29f59db3 8342
eab17229
LZ
8343 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8344 GFP_KERNEL, cpu_to_node(i));
6f505b16 8345 if (!rt_se)
dfc12eb2 8346 goto err_free_rq;
29f59db3 8347
eab17229 8348 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
29f59db3
SV
8349 }
8350
bccbe08a
PZ
8351 return 1;
8352
49246274 8353err_free_rq:
dfc12eb2 8354 kfree(rt_rq);
49246274 8355err:
bccbe08a
PZ
8356 return 0;
8357}
8358
8359static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8360{
8361 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8362 &cpu_rq(cpu)->leaf_rt_rq_list);
8363}
8364
8365static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8366{
8367 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8368}
6d6bc0ad 8369#else /* !CONFIG_RT_GROUP_SCHED */
bccbe08a
PZ
8370static inline void free_rt_sched_group(struct task_group *tg)
8371{
8372}
8373
ec7dc8ac
DG
8374static inline
8375int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
bccbe08a
PZ
8376{
8377 return 1;
8378}
8379
8380static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8381{
8382}
8383
8384static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8385{
8386}
6d6bc0ad 8387#endif /* CONFIG_RT_GROUP_SCHED */
bccbe08a 8388
7c941438 8389#ifdef CONFIG_CGROUP_SCHED
bccbe08a
PZ
8390static void free_sched_group(struct task_group *tg)
8391{
8392 free_fair_sched_group(tg);
8393 free_rt_sched_group(tg);
8394 kfree(tg);
8395}
8396
8397/* allocate runqueue etc for a new task group */
ec7dc8ac 8398struct task_group *sched_create_group(struct task_group *parent)
bccbe08a
PZ
8399{
8400 struct task_group *tg;
8401 unsigned long flags;
8402 int i;
8403
8404 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8405 if (!tg)
8406 return ERR_PTR(-ENOMEM);
8407
ec7dc8ac 8408 if (!alloc_fair_sched_group(tg, parent))
bccbe08a
PZ
8409 goto err;
8410
ec7dc8ac 8411 if (!alloc_rt_sched_group(tg, parent))
bccbe08a
PZ
8412 goto err;
8413
8ed36996 8414 spin_lock_irqsave(&task_group_lock, flags);
9b5b7751 8415 for_each_possible_cpu(i) {
bccbe08a
PZ
8416 register_fair_sched_group(tg, i);
8417 register_rt_sched_group(tg, i);
9b5b7751 8418 }
6f505b16 8419 list_add_rcu(&tg->list, &task_groups);
f473aa5e
PZ
8420
8421 WARN_ON(!parent); /* root should already exist */
8422
8423 tg->parent = parent;
f473aa5e 8424 INIT_LIST_HEAD(&tg->children);
09f2724a 8425 list_add_rcu(&tg->siblings, &parent->children);
8ed36996 8426 spin_unlock_irqrestore(&task_group_lock, flags);
29f59db3 8427
9b5b7751 8428 return tg;
29f59db3
SV
8429
8430err:
6f505b16 8431 free_sched_group(tg);
29f59db3
SV
8432 return ERR_PTR(-ENOMEM);
8433}
8434
9b5b7751 8435/* rcu callback to free various structures associated with a task group */
6f505b16 8436static void free_sched_group_rcu(struct rcu_head *rhp)
29f59db3 8437{
29f59db3 8438 /* now it should be safe to free those cfs_rqs */
6f505b16 8439 free_sched_group(container_of(rhp, struct task_group, rcu));
29f59db3
SV
8440}
8441
9b5b7751 8442/* Destroy runqueue etc associated with a task group */
4cf86d77 8443void sched_destroy_group(struct task_group *tg)
29f59db3 8444{
8ed36996 8445 unsigned long flags;
9b5b7751 8446 int i;
29f59db3 8447
8ed36996 8448 spin_lock_irqsave(&task_group_lock, flags);
9b5b7751 8449 for_each_possible_cpu(i) {
bccbe08a
PZ
8450 unregister_fair_sched_group(tg, i);
8451 unregister_rt_sched_group(tg, i);
9b5b7751 8452 }
6f505b16 8453 list_del_rcu(&tg->list);
f473aa5e 8454 list_del_rcu(&tg->siblings);
8ed36996 8455 spin_unlock_irqrestore(&task_group_lock, flags);
9b5b7751 8456
9b5b7751 8457 /* wait for possible concurrent references to cfs_rqs complete */
6f505b16 8458 call_rcu(&tg->rcu, free_sched_group_rcu);
29f59db3
SV
8459}
8460
9b5b7751 8461/* change task's runqueue when it moves between groups.
3a252015
IM
8462 * The caller of this function should have put the task in its new group
8463 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
8464 * reflect its new group.
9b5b7751
SV
8465 */
8466void sched_move_task(struct task_struct *tsk)
29f59db3
SV
8467{
8468 int on_rq, running;
8469 unsigned long flags;
8470 struct rq *rq;
8471
8472 rq = task_rq_lock(tsk, &flags);
8473
051a1d1a 8474 running = task_current(rq, tsk);
29f59db3
SV
8475 on_rq = tsk->se.on_rq;
8476
0e1f3483 8477 if (on_rq)
29f59db3 8478 dequeue_task(rq, tsk, 0);
0e1f3483
HS
8479 if (unlikely(running))
8480 tsk->sched_class->put_prev_task(rq, tsk);
29f59db3 8481
6f505b16 8482 set_task_rq(tsk, task_cpu(tsk));
29f59db3 8483
810b3817
PZ
8484#ifdef CONFIG_FAIR_GROUP_SCHED
8485 if (tsk->sched_class->moved_group)
88ec22d3 8486 tsk->sched_class->moved_group(tsk, on_rq);
810b3817
PZ
8487#endif
8488
0e1f3483
HS
8489 if (unlikely(running))
8490 tsk->sched_class->set_curr_task(rq);
8491 if (on_rq)
371fd7e7 8492 enqueue_task(rq, tsk, 0);
29f59db3 8493
29f59db3
SV
8494 task_rq_unlock(rq, &flags);
8495}
7c941438 8496#endif /* CONFIG_CGROUP_SCHED */
29f59db3 8497
052f1dc7 8498#ifdef CONFIG_FAIR_GROUP_SCHED
c09595f6 8499static void __set_se_shares(struct sched_entity *se, unsigned long shares)
29f59db3
SV
8500{
8501 struct cfs_rq *cfs_rq = se->cfs_rq;
29f59db3
SV
8502 int on_rq;
8503
29f59db3 8504 on_rq = se->on_rq;
62fb1851 8505 if (on_rq)
29f59db3
SV
8506 dequeue_entity(cfs_rq, se, 0);
8507
8508 se->load.weight = shares;
e05510d0 8509 se->load.inv_weight = 0;
29f59db3 8510
62fb1851 8511 if (on_rq)
29f59db3 8512 enqueue_entity(cfs_rq, se, 0);
c09595f6 8513}
62fb1851 8514
c09595f6
PZ
8515static void set_se_shares(struct sched_entity *se, unsigned long shares)
8516{
8517 struct cfs_rq *cfs_rq = se->cfs_rq;
8518 struct rq *rq = cfs_rq->rq;
8519 unsigned long flags;
8520
05fa785c 8521 raw_spin_lock_irqsave(&rq->lock, flags);
c09595f6 8522 __set_se_shares(se, shares);
05fa785c 8523 raw_spin_unlock_irqrestore(&rq->lock, flags);
29f59db3
SV
8524}
8525
8ed36996
PZ
8526static DEFINE_MUTEX(shares_mutex);
8527
4cf86d77 8528int sched_group_set_shares(struct task_group *tg, unsigned long shares)
29f59db3
SV
8529{
8530 int i;
8ed36996 8531 unsigned long flags;
c61935fd 8532
ec7dc8ac
DG
8533 /*
8534 * We can't change the weight of the root cgroup.
8535 */
8536 if (!tg->se[0])
8537 return -EINVAL;
8538
18d95a28
PZ
8539 if (shares < MIN_SHARES)
8540 shares = MIN_SHARES;
cb4ad1ff
MX
8541 else if (shares > MAX_SHARES)
8542 shares = MAX_SHARES;
62fb1851 8543
8ed36996 8544 mutex_lock(&shares_mutex);
9b5b7751 8545 if (tg->shares == shares)
5cb350ba 8546 goto done;
29f59db3 8547
8ed36996 8548 spin_lock_irqsave(&task_group_lock, flags);
bccbe08a
PZ
8549 for_each_possible_cpu(i)
8550 unregister_fair_sched_group(tg, i);
f473aa5e 8551 list_del_rcu(&tg->siblings);
8ed36996 8552 spin_unlock_irqrestore(&task_group_lock, flags);
6b2d7700
SV
8553
8554 /* wait for any ongoing reference to this group to finish */
8555 synchronize_sched();
8556
8557 /*
8558 * Now we are free to modify the group's share on each cpu
8559 * w/o tripping rebalance_share or load_balance_fair.
8560 */
9b5b7751 8561 tg->shares = shares;
c09595f6
PZ
8562 for_each_possible_cpu(i) {
8563 /*
8564 * force a rebalance
8565 */
8566 cfs_rq_set_shares(tg->cfs_rq[i], 0);
cb4ad1ff 8567 set_se_shares(tg->se[i], shares);
c09595f6 8568 }
29f59db3 8569
6b2d7700
SV
8570 /*
8571 * Enable load balance activity on this group, by inserting it back on
8572 * each cpu's rq->leaf_cfs_rq_list.
8573 */
8ed36996 8574 spin_lock_irqsave(&task_group_lock, flags);
bccbe08a
PZ
8575 for_each_possible_cpu(i)
8576 register_fair_sched_group(tg, i);
f473aa5e 8577 list_add_rcu(&tg->siblings, &tg->parent->children);
8ed36996 8578 spin_unlock_irqrestore(&task_group_lock, flags);
5cb350ba 8579done:
8ed36996 8580 mutex_unlock(&shares_mutex);
9b5b7751 8581 return 0;
29f59db3
SV
8582}
8583
5cb350ba
DG
8584unsigned long sched_group_shares(struct task_group *tg)
8585{
8586 return tg->shares;
8587}
052f1dc7 8588#endif
5cb350ba 8589
052f1dc7 8590#ifdef CONFIG_RT_GROUP_SCHED
6f505b16 8591/*
9f0c1e56 8592 * Ensure that the real time constraints are schedulable.
6f505b16 8593 */
9f0c1e56
PZ
8594static DEFINE_MUTEX(rt_constraints_mutex);
8595
8596static unsigned long to_ratio(u64 period, u64 runtime)
8597{
8598 if (runtime == RUNTIME_INF)
9a7e0b18 8599 return 1ULL << 20;
9f0c1e56 8600
9a7e0b18 8601 return div64_u64(runtime << 20, period);
9f0c1e56
PZ
8602}
8603
9a7e0b18
PZ
8604/* Must be called with tasklist_lock held */
8605static inline int tg_has_rt_tasks(struct task_group *tg)
b40b2e8e 8606{
9a7e0b18 8607 struct task_struct *g, *p;
b40b2e8e 8608
9a7e0b18
PZ
8609 do_each_thread(g, p) {
8610 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8611 return 1;
8612 } while_each_thread(g, p);
b40b2e8e 8613
9a7e0b18
PZ
8614 return 0;
8615}
b40b2e8e 8616
9a7e0b18
PZ
8617struct rt_schedulable_data {
8618 struct task_group *tg;
8619 u64 rt_period;
8620 u64 rt_runtime;
8621};
b40b2e8e 8622
9a7e0b18
PZ
8623static int tg_schedulable(struct task_group *tg, void *data)
8624{
8625 struct rt_schedulable_data *d = data;
8626 struct task_group *child;
8627 unsigned long total, sum = 0;
8628 u64 period, runtime;
b40b2e8e 8629
9a7e0b18
PZ
8630 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8631 runtime = tg->rt_bandwidth.rt_runtime;
b40b2e8e 8632
9a7e0b18
PZ
8633 if (tg == d->tg) {
8634 period = d->rt_period;
8635 runtime = d->rt_runtime;
b40b2e8e 8636 }
b40b2e8e 8637
4653f803
PZ
8638 /*
8639 * Cannot have more runtime than the period.
8640 */
8641 if (runtime > period && runtime != RUNTIME_INF)
8642 return -EINVAL;
6f505b16 8643
4653f803
PZ
8644 /*
8645 * Ensure we don't starve existing RT tasks.
8646 */
9a7e0b18
PZ
8647 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8648 return -EBUSY;
6f505b16 8649
9a7e0b18 8650 total = to_ratio(period, runtime);
6f505b16 8651
4653f803
PZ
8652 /*
8653 * Nobody can have more than the global setting allows.
8654 */
8655 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8656 return -EINVAL;
6f505b16 8657
4653f803
PZ
8658 /*
8659 * The sum of our children's runtime should not exceed our own.
8660 */
9a7e0b18
PZ
8661 list_for_each_entry_rcu(child, &tg->children, siblings) {
8662 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8663 runtime = child->rt_bandwidth.rt_runtime;
6f505b16 8664
9a7e0b18
PZ
8665 if (child == d->tg) {
8666 period = d->rt_period;
8667 runtime = d->rt_runtime;
8668 }
6f505b16 8669
9a7e0b18 8670 sum += to_ratio(period, runtime);
9f0c1e56 8671 }
6f505b16 8672
9a7e0b18
PZ
8673 if (sum > total)
8674 return -EINVAL;
8675
8676 return 0;
6f505b16
PZ
8677}
8678
9a7e0b18 8679static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
521f1a24 8680{
9a7e0b18
PZ
8681 struct rt_schedulable_data data = {
8682 .tg = tg,
8683 .rt_period = period,
8684 .rt_runtime = runtime,
8685 };
8686
8687 return walk_tg_tree(tg_schedulable, tg_nop, &data);
521f1a24
DG
8688}
8689
d0b27fa7
PZ
8690static int tg_set_bandwidth(struct task_group *tg,
8691 u64 rt_period, u64 rt_runtime)
6f505b16 8692{
ac086bc2 8693 int i, err = 0;
9f0c1e56 8694
9f0c1e56 8695 mutex_lock(&rt_constraints_mutex);
521f1a24 8696 read_lock(&tasklist_lock);
9a7e0b18
PZ
8697 err = __rt_schedulable(tg, rt_period, rt_runtime);
8698 if (err)
9f0c1e56 8699 goto unlock;
ac086bc2 8700
0986b11b 8701 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
d0b27fa7
PZ
8702 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8703 tg->rt_bandwidth.rt_runtime = rt_runtime;
ac086bc2
PZ
8704
8705 for_each_possible_cpu(i) {
8706 struct rt_rq *rt_rq = tg->rt_rq[i];
8707
0986b11b 8708 raw_spin_lock(&rt_rq->rt_runtime_lock);
ac086bc2 8709 rt_rq->rt_runtime = rt_runtime;
0986b11b 8710 raw_spin_unlock(&rt_rq->rt_runtime_lock);
ac086bc2 8711 }
0986b11b 8712 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
49246274 8713unlock:
521f1a24 8714 read_unlock(&tasklist_lock);
9f0c1e56
PZ
8715 mutex_unlock(&rt_constraints_mutex);
8716
8717 return err;
6f505b16
PZ
8718}
8719
d0b27fa7
PZ
8720int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8721{
8722 u64 rt_runtime, rt_period;
8723
8724 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8725 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8726 if (rt_runtime_us < 0)
8727 rt_runtime = RUNTIME_INF;
8728
8729 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8730}
8731
9f0c1e56
PZ
8732long sched_group_rt_runtime(struct task_group *tg)
8733{
8734 u64 rt_runtime_us;
8735
d0b27fa7 8736 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
9f0c1e56
PZ
8737 return -1;
8738
d0b27fa7 8739 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
9f0c1e56
PZ
8740 do_div(rt_runtime_us, NSEC_PER_USEC);
8741 return rt_runtime_us;
8742}
d0b27fa7
PZ
8743
8744int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8745{
8746 u64 rt_runtime, rt_period;
8747
8748 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8749 rt_runtime = tg->rt_bandwidth.rt_runtime;
8750
619b0488
R
8751 if (rt_period == 0)
8752 return -EINVAL;
8753
d0b27fa7
PZ
8754 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8755}
8756
8757long sched_group_rt_period(struct task_group *tg)
8758{
8759 u64 rt_period_us;
8760
8761 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8762 do_div(rt_period_us, NSEC_PER_USEC);
8763 return rt_period_us;
8764}
8765
8766static int sched_rt_global_constraints(void)
8767{
4653f803 8768 u64 runtime, period;
d0b27fa7
PZ
8769 int ret = 0;
8770
ec5d4989
HS
8771 if (sysctl_sched_rt_period <= 0)
8772 return -EINVAL;
8773
4653f803
PZ
8774 runtime = global_rt_runtime();
8775 period = global_rt_period();
8776
8777 /*
8778 * Sanity check on the sysctl variables.
8779 */
8780 if (runtime > period && runtime != RUNTIME_INF)
8781 return -EINVAL;
10b612f4 8782
d0b27fa7 8783 mutex_lock(&rt_constraints_mutex);
9a7e0b18 8784 read_lock(&tasklist_lock);
4653f803 8785 ret = __rt_schedulable(NULL, 0, 0);
9a7e0b18 8786 read_unlock(&tasklist_lock);
d0b27fa7
PZ
8787 mutex_unlock(&rt_constraints_mutex);
8788
8789 return ret;
8790}
54e99124
DG
8791
8792int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
8793{
8794 /* Don't accept realtime tasks when there is no way for them to run */
8795 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
8796 return 0;
8797
8798 return 1;
8799}
8800
6d6bc0ad 8801#else /* !CONFIG_RT_GROUP_SCHED */
d0b27fa7
PZ
8802static int sched_rt_global_constraints(void)
8803{
ac086bc2
PZ
8804 unsigned long flags;
8805 int i;
8806
ec5d4989
HS
8807 if (sysctl_sched_rt_period <= 0)
8808 return -EINVAL;
8809
60aa605d
PZ
8810 /*
8811 * There's always some RT tasks in the root group
8812 * -- migration, kstopmachine etc..
8813 */
8814 if (sysctl_sched_rt_runtime == 0)
8815 return -EBUSY;
8816
0986b11b 8817 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
ac086bc2
PZ
8818 for_each_possible_cpu(i) {
8819 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8820
0986b11b 8821 raw_spin_lock(&rt_rq->rt_runtime_lock);
ac086bc2 8822 rt_rq->rt_runtime = global_rt_runtime();
0986b11b 8823 raw_spin_unlock(&rt_rq->rt_runtime_lock);
ac086bc2 8824 }
0986b11b 8825 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
ac086bc2 8826
d0b27fa7
PZ
8827 return 0;
8828}
6d6bc0ad 8829#endif /* CONFIG_RT_GROUP_SCHED */
d0b27fa7
PZ
8830
8831int sched_rt_handler(struct ctl_table *table, int write,
8d65af78 8832 void __user *buffer, size_t *lenp,
d0b27fa7
PZ
8833 loff_t *ppos)
8834{
8835 int ret;
8836 int old_period, old_runtime;
8837 static DEFINE_MUTEX(mutex);
8838
8839 mutex_lock(&mutex);
8840 old_period = sysctl_sched_rt_period;
8841 old_runtime = sysctl_sched_rt_runtime;
8842
8d65af78 8843 ret = proc_dointvec(table, write, buffer, lenp, ppos);
d0b27fa7
PZ
8844
8845 if (!ret && write) {
8846 ret = sched_rt_global_constraints();
8847 if (ret) {
8848 sysctl_sched_rt_period = old_period;
8849 sysctl_sched_rt_runtime = old_runtime;
8850 } else {
8851 def_rt_bandwidth.rt_runtime = global_rt_runtime();
8852 def_rt_bandwidth.rt_period =
8853 ns_to_ktime(global_rt_period());
8854 }
8855 }
8856 mutex_unlock(&mutex);
8857
8858 return ret;
8859}
68318b8e 8860
052f1dc7 8861#ifdef CONFIG_CGROUP_SCHED
68318b8e
SV
8862
8863/* return corresponding task_group object of a cgroup */
2b01dfe3 8864static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
68318b8e 8865{
2b01dfe3
PM
8866 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
8867 struct task_group, css);
68318b8e
SV
8868}
8869
8870static struct cgroup_subsys_state *
2b01dfe3 8871cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
68318b8e 8872{
ec7dc8ac 8873 struct task_group *tg, *parent;
68318b8e 8874
2b01dfe3 8875 if (!cgrp->parent) {
68318b8e 8876 /* This is early initialization for the top cgroup */
68318b8e
SV
8877 return &init_task_group.css;
8878 }
8879
ec7dc8ac
DG
8880 parent = cgroup_tg(cgrp->parent);
8881 tg = sched_create_group(parent);
68318b8e
SV
8882 if (IS_ERR(tg))
8883 return ERR_PTR(-ENOMEM);
8884
68318b8e
SV
8885 return &tg->css;
8886}
8887
41a2d6cf
IM
8888static void
8889cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
68318b8e 8890{
2b01dfe3 8891 struct task_group *tg = cgroup_tg(cgrp);
68318b8e
SV
8892
8893 sched_destroy_group(tg);
8894}
8895
41a2d6cf 8896static int
be367d09 8897cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
68318b8e 8898{
b68aa230 8899#ifdef CONFIG_RT_GROUP_SCHED
54e99124 8900 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
b68aa230
PZ
8901 return -EINVAL;
8902#else
68318b8e
SV
8903 /* We don't support RT-tasks being in separate groups */
8904 if (tsk->sched_class != &fair_sched_class)
8905 return -EINVAL;
b68aa230 8906#endif
be367d09
BB
8907 return 0;
8908}
68318b8e 8909
be367d09
BB
8910static int
8911cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8912 struct task_struct *tsk, bool threadgroup)
8913{
8914 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
8915 if (retval)
8916 return retval;
8917 if (threadgroup) {
8918 struct task_struct *c;
8919 rcu_read_lock();
8920 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8921 retval = cpu_cgroup_can_attach_task(cgrp, c);
8922 if (retval) {
8923 rcu_read_unlock();
8924 return retval;
8925 }
8926 }
8927 rcu_read_unlock();
8928 }
68318b8e
SV
8929 return 0;
8930}
8931
8932static void
2b01dfe3 8933cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
be367d09
BB
8934 struct cgroup *old_cont, struct task_struct *tsk,
8935 bool threadgroup)
68318b8e
SV
8936{
8937 sched_move_task(tsk);
be367d09
BB
8938 if (threadgroup) {
8939 struct task_struct *c;
8940 rcu_read_lock();
8941 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8942 sched_move_task(c);
8943 }
8944 rcu_read_unlock();
8945 }
68318b8e
SV
8946}
8947
052f1dc7 8948#ifdef CONFIG_FAIR_GROUP_SCHED
f4c753b7 8949static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
2b01dfe3 8950 u64 shareval)
68318b8e 8951{
2b01dfe3 8952 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
68318b8e
SV
8953}
8954
f4c753b7 8955static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
68318b8e 8956{
2b01dfe3 8957 struct task_group *tg = cgroup_tg(cgrp);
68318b8e
SV
8958
8959 return (u64) tg->shares;
8960}
6d6bc0ad 8961#endif /* CONFIG_FAIR_GROUP_SCHED */
68318b8e 8962
052f1dc7 8963#ifdef CONFIG_RT_GROUP_SCHED
0c70814c 8964static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
06ecb27c 8965 s64 val)
6f505b16 8966{
06ecb27c 8967 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
6f505b16
PZ
8968}
8969
06ecb27c 8970static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
6f505b16 8971{
06ecb27c 8972 return sched_group_rt_runtime(cgroup_tg(cgrp));
6f505b16 8973}
d0b27fa7
PZ
8974
8975static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
8976 u64 rt_period_us)
8977{
8978 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
8979}
8980
8981static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8982{
8983 return sched_group_rt_period(cgroup_tg(cgrp));
8984}
6d6bc0ad 8985#endif /* CONFIG_RT_GROUP_SCHED */
6f505b16 8986
fe5c7cc2 8987static struct cftype cpu_files[] = {
052f1dc7 8988#ifdef CONFIG_FAIR_GROUP_SCHED
fe5c7cc2
PM
8989 {
8990 .name = "shares",
f4c753b7
PM
8991 .read_u64 = cpu_shares_read_u64,
8992 .write_u64 = cpu_shares_write_u64,
fe5c7cc2 8993 },
052f1dc7
PZ
8994#endif
8995#ifdef CONFIG_RT_GROUP_SCHED
6f505b16 8996 {
9f0c1e56 8997 .name = "rt_runtime_us",
06ecb27c
PM
8998 .read_s64 = cpu_rt_runtime_read,
8999 .write_s64 = cpu_rt_runtime_write,
6f505b16 9000 },
d0b27fa7
PZ
9001 {
9002 .name = "rt_period_us",
f4c753b7
PM
9003 .read_u64 = cpu_rt_period_read_uint,
9004 .write_u64 = cpu_rt_period_write_uint,
d0b27fa7 9005 },
052f1dc7 9006#endif
68318b8e
SV
9007};
9008
9009static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
9010{
fe5c7cc2 9011 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
68318b8e
SV
9012}
9013
9014struct cgroup_subsys cpu_cgroup_subsys = {
38605cae
IM
9015 .name = "cpu",
9016 .create = cpu_cgroup_create,
9017 .destroy = cpu_cgroup_destroy,
9018 .can_attach = cpu_cgroup_can_attach,
9019 .attach = cpu_cgroup_attach,
9020 .populate = cpu_cgroup_populate,
9021 .subsys_id = cpu_cgroup_subsys_id,
68318b8e
SV
9022 .early_init = 1,
9023};
9024
052f1dc7 9025#endif /* CONFIG_CGROUP_SCHED */
d842de87
SV
9026
9027#ifdef CONFIG_CGROUP_CPUACCT
9028
9029/*
9030 * CPU accounting code for task groups.
9031 *
9032 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
9033 * (balbir@in.ibm.com).
9034 */
9035
934352f2 9036/* track cpu usage of a group of tasks and its child groups */
d842de87
SV
9037struct cpuacct {
9038 struct cgroup_subsys_state css;
9039 /* cpuusage holds pointer to a u64-type object on every cpu */
43cf38eb 9040 u64 __percpu *cpuusage;
ef12fefa 9041 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
934352f2 9042 struct cpuacct *parent;
d842de87
SV
9043};
9044
9045struct cgroup_subsys cpuacct_subsys;
9046
9047/* return cpu accounting group corresponding to this container */
32cd756a 9048static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
d842de87 9049{
32cd756a 9050 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
d842de87
SV
9051 struct cpuacct, css);
9052}
9053
9054/* return cpu accounting group to which this task belongs */
9055static inline struct cpuacct *task_ca(struct task_struct *tsk)
9056{
9057 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9058 struct cpuacct, css);
9059}
9060
9061/* create a new cpu accounting group */
9062static struct cgroup_subsys_state *cpuacct_create(
32cd756a 9063 struct cgroup_subsys *ss, struct cgroup *cgrp)
d842de87
SV
9064{
9065 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
ef12fefa 9066 int i;
d842de87
SV
9067
9068 if (!ca)
ef12fefa 9069 goto out;
d842de87
SV
9070
9071 ca->cpuusage = alloc_percpu(u64);
ef12fefa
BR
9072 if (!ca->cpuusage)
9073 goto out_free_ca;
9074
9075 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
9076 if (percpu_counter_init(&ca->cpustat[i], 0))
9077 goto out_free_counters;
d842de87 9078
934352f2
BR
9079 if (cgrp->parent)
9080 ca->parent = cgroup_ca(cgrp->parent);
9081
d842de87 9082 return &ca->css;
ef12fefa
BR
9083
9084out_free_counters:
9085 while (--i >= 0)
9086 percpu_counter_destroy(&ca->cpustat[i]);
9087 free_percpu(ca->cpuusage);
9088out_free_ca:
9089 kfree(ca);
9090out:
9091 return ERR_PTR(-ENOMEM);
d842de87
SV
9092}
9093
9094/* destroy an existing cpu accounting group */
41a2d6cf 9095static void
32cd756a 9096cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
d842de87 9097{
32cd756a 9098 struct cpuacct *ca = cgroup_ca(cgrp);
ef12fefa 9099 int i;
d842de87 9100
ef12fefa
BR
9101 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
9102 percpu_counter_destroy(&ca->cpustat[i]);
d842de87
SV
9103 free_percpu(ca->cpuusage);
9104 kfree(ca);
9105}
9106
720f5498
KC
9107static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9108{
b36128c8 9109 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
720f5498
KC
9110 u64 data;
9111
9112#ifndef CONFIG_64BIT
9113 /*
9114 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
9115 */
05fa785c 9116 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
720f5498 9117 data = *cpuusage;
05fa785c 9118 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
720f5498
KC
9119#else
9120 data = *cpuusage;
9121#endif
9122
9123 return data;
9124}
9125
9126static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9127{
b36128c8 9128 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
720f5498
KC
9129
9130#ifndef CONFIG_64BIT
9131 /*
9132 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9133 */
05fa785c 9134 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
720f5498 9135 *cpuusage = val;
05fa785c 9136 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
720f5498
KC
9137#else
9138 *cpuusage = val;
9139#endif
9140}
9141
d842de87 9142/* return total cpu usage (in nanoseconds) of a group */
32cd756a 9143static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
d842de87 9144{
32cd756a 9145 struct cpuacct *ca = cgroup_ca(cgrp);
d842de87
SV
9146 u64 totalcpuusage = 0;
9147 int i;
9148
720f5498
KC
9149 for_each_present_cpu(i)
9150 totalcpuusage += cpuacct_cpuusage_read(ca, i);
d842de87
SV
9151
9152 return totalcpuusage;
9153}
9154
0297b803
DG
9155static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9156 u64 reset)
9157{
9158 struct cpuacct *ca = cgroup_ca(cgrp);
9159 int err = 0;
9160 int i;
9161
9162 if (reset) {
9163 err = -EINVAL;
9164 goto out;
9165 }
9166
720f5498
KC
9167 for_each_present_cpu(i)
9168 cpuacct_cpuusage_write(ca, i, 0);
0297b803 9169
0297b803
DG
9170out:
9171 return err;
9172}
9173
e9515c3c
KC
9174static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9175 struct seq_file *m)
9176{
9177 struct cpuacct *ca = cgroup_ca(cgroup);
9178 u64 percpu;
9179 int i;
9180
9181 for_each_present_cpu(i) {
9182 percpu = cpuacct_cpuusage_read(ca, i);
9183 seq_printf(m, "%llu ", (unsigned long long) percpu);
9184 }
9185 seq_printf(m, "\n");
9186 return 0;
9187}
9188
ef12fefa
BR
9189static const char *cpuacct_stat_desc[] = {
9190 [CPUACCT_STAT_USER] = "user",
9191 [CPUACCT_STAT_SYSTEM] = "system",
9192};
9193
9194static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9195 struct cgroup_map_cb *cb)
9196{
9197 struct cpuacct *ca = cgroup_ca(cgrp);
9198 int i;
9199
9200 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
9201 s64 val = percpu_counter_read(&ca->cpustat[i]);
9202 val = cputime64_to_clock_t(val);
9203 cb->fill(cb, cpuacct_stat_desc[i], val);
9204 }
9205 return 0;
9206}
9207
d842de87
SV
9208static struct cftype files[] = {
9209 {
9210 .name = "usage",
f4c753b7
PM
9211 .read_u64 = cpuusage_read,
9212 .write_u64 = cpuusage_write,
d842de87 9213 },
e9515c3c
KC
9214 {
9215 .name = "usage_percpu",
9216 .read_seq_string = cpuacct_percpu_seq_read,
9217 },
ef12fefa
BR
9218 {
9219 .name = "stat",
9220 .read_map = cpuacct_stats_show,
9221 },
d842de87
SV
9222};
9223
32cd756a 9224static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
d842de87 9225{
32cd756a 9226 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
d842de87
SV
9227}
9228
9229/*
9230 * charge this task's execution time to its accounting group.
9231 *
9232 * called with rq->lock held.
9233 */
9234static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9235{
9236 struct cpuacct *ca;
934352f2 9237 int cpu;
d842de87 9238
c40c6f85 9239 if (unlikely(!cpuacct_subsys.active))
d842de87
SV
9240 return;
9241
934352f2 9242 cpu = task_cpu(tsk);
a18b83b7
BR
9243
9244 rcu_read_lock();
9245
d842de87 9246 ca = task_ca(tsk);
d842de87 9247
934352f2 9248 for (; ca; ca = ca->parent) {
b36128c8 9249 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
d842de87
SV
9250 *cpuusage += cputime;
9251 }
a18b83b7
BR
9252
9253 rcu_read_unlock();
d842de87
SV
9254}
9255
fa535a77
AB
9256/*
9257 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9258 * in cputime_t units. As a result, cpuacct_update_stats calls
9259 * percpu_counter_add with values large enough to always overflow the
9260 * per cpu batch limit causing bad SMP scalability.
9261 *
9262 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9263 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9264 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9265 */
9266#ifdef CONFIG_SMP
9267#define CPUACCT_BATCH \
9268 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9269#else
9270#define CPUACCT_BATCH 0
9271#endif
9272
ef12fefa
BR
9273/*
9274 * Charge the system/user time to the task's accounting group.
9275 */
9276static void cpuacct_update_stats(struct task_struct *tsk,
9277 enum cpuacct_stat_index idx, cputime_t val)
9278{
9279 struct cpuacct *ca;
fa535a77 9280 int batch = CPUACCT_BATCH;
ef12fefa
BR
9281
9282 if (unlikely(!cpuacct_subsys.active))
9283 return;
9284
9285 rcu_read_lock();
9286 ca = task_ca(tsk);
9287
9288 do {
fa535a77 9289 __percpu_counter_add(&ca->cpustat[idx], val, batch);
ef12fefa
BR
9290 ca = ca->parent;
9291 } while (ca);
9292 rcu_read_unlock();
9293}
9294
d842de87
SV
9295struct cgroup_subsys cpuacct_subsys = {
9296 .name = "cpuacct",
9297 .create = cpuacct_create,
9298 .destroy = cpuacct_destroy,
9299 .populate = cpuacct_populate,
9300 .subsys_id = cpuacct_subsys_id,
9301};
9302#endif /* CONFIG_CGROUP_CPUACCT */
03b042bf
PM
9303
9304#ifndef CONFIG_SMP
9305
03b042bf
PM
9306void synchronize_sched_expedited(void)
9307{
fc390cde 9308 barrier();
03b042bf
PM
9309}
9310EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9311
9312#else /* #ifndef CONFIG_SMP */
9313
cc631fb7 9314static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
03b042bf 9315
cc631fb7 9316static int synchronize_sched_expedited_cpu_stop(void *data)
03b042bf 9317{
969c7921
TH
9318 /*
9319 * There must be a full memory barrier on each affected CPU
9320 * between the time that try_stop_cpus() is called and the
9321 * time that it returns.
9322 *
9323 * In the current initial implementation of cpu_stop, the
9324 * above condition is already met when the control reaches
9325 * this point and the following smp_mb() is not strictly
9326 * necessary. Do smp_mb() anyway for documentation and
9327 * robustness against future implementation changes.
9328 */
cc631fb7 9329 smp_mb(); /* See above comment block. */
969c7921 9330 return 0;
03b042bf 9331}
03b042bf
PM
9332
9333/*
9334 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9335 * approach to force grace period to end quickly. This consumes
9336 * significant time on all CPUs, and is thus not recommended for
9337 * any sort of common-case code.
9338 *
9339 * Note that it is illegal to call this function while holding any
9340 * lock that is acquired by a CPU-hotplug notifier. Failing to
9341 * observe this restriction will result in deadlock.
9342 */
9343void synchronize_sched_expedited(void)
9344{
969c7921 9345 int snap, trycount = 0;
03b042bf
PM
9346
9347 smp_mb(); /* ensure prior mod happens before capturing snap. */
969c7921 9348 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
03b042bf 9349 get_online_cpus();
969c7921
TH
9350 while (try_stop_cpus(cpu_online_mask,
9351 synchronize_sched_expedited_cpu_stop,
94458d5e 9352 NULL) == -EAGAIN) {
03b042bf
PM
9353 put_online_cpus();
9354 if (trycount++ < 10)
9355 udelay(trycount * num_online_cpus());
9356 else {
9357 synchronize_sched();
9358 return;
9359 }
969c7921 9360 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
03b042bf
PM
9361 smp_mb(); /* ensure test happens before caller kfree */
9362 return;
9363 }
9364 get_online_cpus();
9365 }
969c7921 9366 atomic_inc(&synchronize_sched_expedited_count);
cc631fb7 9367 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
03b042bf 9368 put_online_cpus();
03b042bf
PM
9369}
9370EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9371
9372#endif /* #else #ifndef CONFIG_SMP */