kernel/sched.c

   1 /*
   2  *  kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991-2002  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13  *              hybrid priority-list and round-robin design with
  14  *              an array-switch method of distributing timeslices
  15  *              and per-CPU runqueues.  Cleanups and useful suggestions
  16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17  *  2003-09-03  Interactivity tuning by Con Kolivas.
  18  *  2004-04-02  Scheduler domains code by Nick Piggin
  19  */
  20
  21 #include <linux/mm.h>
  22 #include <linux/module.h>
  23 #include <linux/nmi.h>
  24 #include <linux/init.h>
  25 #include <linux/uaccess.h>
  26 #include <linux/highmem.h>
  27 #include <linux/smp_lock.h>
  28 #include <asm/mmu_context.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/capability.h>
  31 #include <linux/completion.h>
  32 #include <linux/kernel_stat.h>
  33 #include <linux/debug_locks.h>
  34 #include <linux/security.h>
  35 #include <linux/notifier.h>
  36 #include <linux/profile.h>
  37 #include <linux/freezer.h>
  38 #include <linux/vmalloc.h>
  39 #include <linux/blkdev.h>
  40 #include <linux/delay.h>
  41 #include <linux/smp.h>
  42 #include <linux/threads.h>
  43 #include <linux/timer.h>
  44 #include <linux/rcupdate.h>
  45 #include <linux/cpu.h>
  46 #include <linux/cpuset.h>
  47 #include <linux/percpu.h>
  48 #include <linux/kthread.h>
  49 #include <linux/seq_file.h>
  50 #include <linux/syscalls.h>
  51 #include <linux/times.h>
  52 #include <linux/tsacct_kern.h>
  53 #include <linux/kprobes.h>
  54 #include <linux/delayacct.h>
  55 #include <linux/reciprocal_div.h>
  56 #include <linux/unistd.h>
  57
  58 #include <asm/tlb.h>
  59
  60 /*
  61  * Scheduler clock - returns current time in nanosec units.
  62  * This is default implementation.
  63  * Architectures and sub-architectures can override this.
  64  */
  65 unsigned long long __attribute__((weak)) sched_clock(void)
  66 {
  67         return (unsigned long long)jiffies * (1000000000 / HZ);
  68 }
  69
  70 /*
  71  * Convert user-nice values [ -20 ... 0 ... 19 ]
  72  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  73  * and back.
  74  */
  75 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
  76 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
  77 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
  78
  79 /*
  80  * 'User priority' is the nice value converted to something we
  81  * can work with better when scaling various scheduler parameters,
  82  * it's a [ 0 ... 39 ] range.
  83  */
  84 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
  85 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
  86 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
  87
  88 /*
  89  * Some helpers for converting nanosecond timing to jiffy resolution
  90  */
  91 #define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
  92 #define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
  93
  94 #define NICE_0_LOAD             SCHED_LOAD_SCALE
  95 #define NICE_0_SHIFT            SCHED_LOAD_SHIFT
  96
  97 /*
  98  * These are the 'tuning knobs' of the scheduler:
  99  *
 100  * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
 101  * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 102  * Timeslices get refilled after they expire.
 103  */
 104 #define MIN_TIMESLICE           max(5 * HZ / 1000, 1)
 105 #define DEF_TIMESLICE           (100 * HZ / 1000)
 106 #define ON_RUNQUEUE_WEIGHT       30
 107 #define CHILD_PENALTY            95
 108 #define PARENT_PENALTY          100
 109 #define EXIT_WEIGHT               3
 110 #define PRIO_BONUS_RATIO         25
 111 #define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
 112 #define INTERACTIVE_DELTA         2
 113 #define MAX_SLEEP_AVG           (DEF_TIMESLICE * MAX_BONUS)
 114 #define STARVATION_LIMIT        (MAX_SLEEP_AVG)
 115 #define NS_MAX_SLEEP_AVG        (JIFFIES_TO_NS(MAX_SLEEP_AVG))
 116
 117 /*
 118  * If a task is 'interactive' then we reinsert it in the active
 119  * array after it has expired its current timeslice. (it will not
 120  * continue to run immediately, it will still roundrobin with
 121  * other interactive tasks.)
 122  *
 123  * This part scales the interactivity limit depending on niceness.
 124  *
 125  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
 126  * Here are a few examples of different nice levels:
 127  *
 128  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
 129  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
 130  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
 131  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
 132  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
 133  *
 134  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
 135  *  priority range a task can explore, a value of '1' means the
 136  *  task is rated interactive.)
 137  *
 138  * Ie. nice +19 tasks can never get 'interactive' enough to be
 139  * reinserted into the active array. And only heavily CPU-hog nice -20
 140  * tasks will be expired. Default nice 0 tasks are somewhere between,
 141  * it takes some effort for them to get interactive, but it's not
 142  * too hard.
 143  */
 144
 145 #define CURRENT_BONUS(p) \
 146         (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
 147                 MAX_SLEEP_AVG)
 148
 149 #define GRANULARITY     (10 * HZ / 1000 ? : 1)
 150
 151 #ifdef CONFIG_SMP
 152 #define TIMESLICE_GRANULARITY(p)        (GRANULARITY * \
 153                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
 154                         num_online_cpus())
 155 #else
 156 #define TIMESLICE_GRANULARITY(p)        (GRANULARITY * \
 157                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
 158 #endif
 159
 160 #define SCALE(v1,v1_max,v2_max) \
 161         (v1) * (v2_max) / (v1_max)
 162
 163 #define DELTA(p) \
 164         (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
 165                 INTERACTIVE_DELTA)
 166
 167 #define TASK_INTERACTIVE(p) \
 168         ((p)->prio <= (p)->static_prio - DELTA(p))
 169
 170 #define INTERACTIVE_SLEEP(p) \
 171         (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
 172                 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
 173
 174 #define TASK_PREEMPTS_CURR(p, rq) \
 175         ((p)->prio < (rq)->curr->prio)
 176
 177 #define SCALE_PRIO(x, prio) \
 178         max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
 179
 180 static unsigned int static_prio_timeslice(int static_prio)
 181 {
 182         if (static_prio < NICE_TO_PRIO(0))
 183                 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
 184         else
 185                 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 186 }
 187
 188 #ifdef CONFIG_SMP
 189 /*
 190  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 191  * Since cpu_power is a 'constant', we can use a reciprocal divide.
 192  */
 193 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
 194 {
 195         return reciprocal_divide(load, sg->reciprocal_cpu_power);
 196 }
 197
 198 /*
 199  * Each time a sched group cpu_power is changed,
 200  * we must compute its reciprocal value
 201  */
 202 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
 203 {
 204         sg->__cpu_power += val;
 205         sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
 206 }
 207 #endif
 208
 209 /*
 210  * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 211  * to time slice values: [800ms ... 100ms ... 5ms]
 212  *
 213  * The higher a thread's priority, the bigger timeslices
 214  * it gets during one round of execution. But even the lowest
 215  * priority thread gets MIN_TIMESLICE worth of execution time.
 216  */
 217
 218 static inline unsigned int task_timeslice(struct task_struct *p)
 219 {
 220         return static_prio_timeslice(p->static_prio);
 221 }
 222
 223 static inline int rt_policy(int policy)
 224 {
 225         if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
 226                 return 1;
 227         return 0;
 228 }
 229
 230 static inline int task_has_rt_policy(struct task_struct *p)
 231 {
 232         return rt_policy(p->policy);
 233 }
 234
 235 /*
 236  * This is the priority-queue data structure of the RT scheduling class:
 237  */
 238 struct rt_prio_array {
 239         DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
 240         struct list_head queue[MAX_RT_PRIO];
 241 };
 242
 243 struct load_stat {
 244         struct load_weight load;
 245         u64 load_update_start, load_update_last;
 246         unsigned long delta_fair, delta_exec, delta_stat;
 247 };
 248
 249 /* CFS-related fields in a runqueue */
 250 struct cfs_rq {
 251         struct load_weight load;
 252         unsigned long nr_running;
 253
 254         s64 fair_clock;
 255         u64 exec_clock;
 256         s64 wait_runtime;
 257         u64 sleeper_bonus;
 258         unsigned long wait_runtime_overruns, wait_runtime_underruns;
 259
 260         struct rb_root tasks_timeline;
 261         struct rb_node *rb_leftmost;
 262         struct rb_node *rb_load_balance_curr;
 263 #ifdef CONFIG_FAIR_GROUP_SCHED
 264         /* 'curr' points to currently running entity on this cfs_rq.
 265          * It is set to NULL otherwise (i.e when none are currently running).
 266          */
 267         struct sched_entity *curr;
 268         struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
 269
 270         /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 271          * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
 272          * (like users, containers etc.)
 273          *
 274          * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
 275          * list is used during load balance.
 276          */
 277         struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
 278 #endif
 279 };
 280
 281 /* Real-Time classes' related field in a runqueue: */
 282 struct rt_rq {
 283         struct rt_prio_array active;
 284         int rt_load_balance_idx;
 285         struct list_head *rt_load_balance_head, *rt_load_balance_curr;
 286 };
 287
 288 /*
 289  * The prio-array type of the old scheduler:
 290  */
 291 struct prio_array {
 292         unsigned int nr_active;
 293         DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
 294         struct list_head queue[MAX_PRIO];
 295 };
 296
 297 /*
 298  * This is the main, per-CPU runqueue data structure.
 299  *
 300  * Locking rule: those places that want to lock multiple runqueues
 301  * (such as the load balancing or the thread migration code), lock
 302  * acquire operations must be ordered by ascending &runqueue.
 303  */
 304 struct rq {
 305         spinlock_t lock;        /* runqueue lock */
 306
 307         /*
 308          * nr_running and cpu_load should be in the same cacheline because
 309          * remote CPUs use both these fields when doing load calculation.
 310          */
 311         unsigned long nr_running;
 312         unsigned long raw_weighted_load;
 313         #define CPU_LOAD_IDX_MAX 5
 314         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 315         unsigned char idle_at_tick;
 316 #ifdef CONFIG_NO_HZ
 317         unsigned char in_nohz_recently;
 318 #endif
 319         struct load_stat ls;    /* capture load from *all* tasks on this cpu */
 320         unsigned long nr_load_updates;
 321         u64 nr_switches;
 322
 323         struct cfs_rq cfs;
 324 #ifdef CONFIG_FAIR_GROUP_SCHED
 325         struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
 326 #endif
 327         struct rt_rq  rt;
 328
 329         /*
 330          * This is part of a global counter where only the total sum
 331          * over all CPUs matters. A task can increase this counter on
 332          * one CPU and if it got migrated afterwards it may decrease
 333          * it on another CPU. Always updated under the runqueue lock:
 334          */
 335         unsigned long nr_uninterruptible;
 336
 337         unsigned long expired_timestamp;
 338         unsigned long long most_recent_timestamp;
 339
 340         struct task_struct *curr, *idle;
 341         unsigned long next_balance;
 342         struct mm_struct *prev_mm;
 343
 344         struct prio_array *active, *expired, arrays[2];
 345         int best_expired_prio;
 346
 347         u64 clock, prev_clock_raw;
 348         s64 clock_max_delta;
 349
 350         unsigned int clock_warps, clock_overflows;
 351         unsigned int clock_unstable_events;
 352
 353         struct sched_class *load_balance_class;
 354
 355         atomic_t nr_iowait;
 356
 357 #ifdef CONFIG_SMP
 358         struct sched_domain *sd;
 359
 360         /* For active balancing */
 361         int active_balance;
 362         int push_cpu;
 363         int cpu;                /* cpu of this runqueue */
 364
 365         struct task_struct *migration_thread;
 366         struct list_head migration_queue;
 367 #endif
 368
 369 #ifdef CONFIG_SCHEDSTATS
 370         /* latency stats */
 371         struct sched_info rq_sched_info;
 372
 373         /* sys_sched_yield() stats */
 374         unsigned long yld_exp_empty;
 375         unsigned long yld_act_empty;
 376         unsigned long yld_both_empty;
 377         unsigned long yld_cnt;
 378
 379         /* schedule() stats */
 380         unsigned long sched_switch;
 381         unsigned long sched_cnt;
 382         unsigned long sched_goidle;
 383
 384         /* try_to_wake_up() stats */
 385         unsigned long ttwu_cnt;
 386         unsigned long ttwu_local;
 387 #endif
 388         struct lock_class_key rq_lock_key;
 389 };
 390
 391 static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
 392 static DEFINE_MUTEX(sched_hotcpu_mutex);
 393
 394 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
 395 {
 396         rq->curr->sched_class->check_preempt_curr(rq, p);
 397 }
 398
 399 static inline int cpu_of(struct rq *rq)
 400 {
 401 #ifdef CONFIG_SMP
 402         return rq->cpu;
 403 #else
 404         return 0;
 405 #endif
 406 }
 407
 408 /*
 409  * Per-runqueue clock, as finegrained as the platform can give us:
 410  */
 411 static unsigned long long __rq_clock(struct rq *rq)
 412 {
 413         u64 prev_raw = rq->prev_clock_raw;
 414         u64 now = sched_clock();
 415         s64 delta = now - prev_raw;
 416         u64 clock = rq->clock;
 417
 418         /*
 419          * Protect against sched_clock() occasionally going backwards:
 420          */
 421         if (unlikely(delta < 0)) {
 422                 clock++;
 423                 rq->clock_warps++;
 424         } else {
 425                 /*
 426                  * Catch too large forward jumps too:
 427                  */
 428                 if (unlikely(delta > 2*TICK_NSEC)) {
 429                         clock++;
 430                         rq->clock_overflows++;
 431                 } else {
 432                         if (unlikely(delta > rq->clock_max_delta))
 433                                 rq->clock_max_delta = delta;
 434                         clock += delta;
 435                 }
 436         }
 437
 438         rq->prev_clock_raw = now;
 439         rq->clock = clock;
 440
 441         return clock;
 442 }
 443
 444 static inline unsigned long long rq_clock(struct rq *rq)
 445 {
 446         int this_cpu = smp_processor_id();
 447
 448         if (this_cpu == cpu_of(rq))
 449                 return __rq_clock(rq);
 450
 451         return rq->clock;
 452 }
 453
 454 /*
 455  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 456  * See detach_destroy_domains: synchronize_sched for details.
 457  *
 458  * The domain tree of any CPU may only be accessed from within
 459  * preempt-disabled sections.
 460  */
 461 #define for_each_domain(cpu, __sd) \
 462         for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
 463
 464 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 465 #define this_rq()               (&__get_cpu_var(runqueues))
 466 #define task_rq(p)              cpu_rq(task_cpu(p))
 467 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 468
 469 #ifdef CONFIG_FAIR_GROUP_SCHED
 470 /* Change a task's ->cfs_rq if it moves across CPUs */
 471 static inline void set_task_cfs_rq(struct task_struct *p)
 472 {
 473         p->se.cfs_rq = &task_rq(p)->cfs;
 474 }
 475 #else
 476 static inline void set_task_cfs_rq(struct task_struct *p)
 477 {
 478 }
 479 #endif
 480
 481 #ifndef prepare_arch_switch
 482 # define prepare_arch_switch(next)      do { } while (0)
 483 #endif
 484 #ifndef finish_arch_switch
 485 # define finish_arch_switch(prev)       do { } while (0)
 486 #endif
 487
 488 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
 489 static inline int task_running(struct rq *rq, struct task_struct *p)
 490 {
 491         return rq->curr == p;
 492 }
 493
 494 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 495 {
 496 }
 497
 498 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 499 {
 500 #ifdef CONFIG_DEBUG_SPINLOCK
 501         /* this is a valid case when another task releases the spinlock */
 502         rq->lock.owner = current;
 503 #endif
 504         /*
 505          * If we are tracking spinlock dependencies then we have to
 506          * fix up the runqueue lock - which gets 'carried over' from
 507          * prev into current:
 508          */
 509         spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
 510
 511         spin_unlock_irq(&rq->lock);
 512 }
 513
 514 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
 515 static inline int task_running(struct rq *rq, struct task_struct *p)
 516 {
 517 #ifdef CONFIG_SMP
 518         return p->oncpu;
 519 #else
 520         return rq->curr == p;
 521 #endif
 522 }
 523
 524 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 525 {
 526 #ifdef CONFIG_SMP
 527         /*
 528          * We can optimise this out completely for !SMP, because the
 529          * SMP rebalancing from interrupt is the only thing that cares
 530          * here.
 531          */
 532         next->oncpu = 1;
 533 #endif
 534 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 535         spin_unlock_irq(&rq->lock);
 536 #else
 537         spin_unlock(&rq->lock);
 538 #endif
 539 }
 540
 541 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 542 {
 543 #ifdef CONFIG_SMP
 544         /*
 545          * After ->oncpu is cleared, the task can be moved to a different CPU.
 546          * We must ensure this doesn't happen until the switch is completely
 547          * finished.
 548          */
 549         smp_wmb();
 550         prev->oncpu = 0;
 551 #endif
 552 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 553         local_irq_enable();
 554 #endif
 555 }
 556 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 557
 558 /*
 559  * __task_rq_lock - lock the runqueue a given task resides on.
 560  * Must be called interrupts disabled.
 561  */
 562 static inline struct rq *__task_rq_lock(struct task_struct *p)
 563         __acquires(rq->lock)
 564 {
 565         struct rq *rq;
 566
 567 repeat_lock_task:
 568         rq = task_rq(p);
 569         spin_lock(&rq->lock);
 570         if (unlikely(rq != task_rq(p))) {
 571                 spin_unlock(&rq->lock);
 572                 goto repeat_lock_task;
 573         }
 574         return rq;
 575 }
 576
 577 /*
 578  * task_rq_lock - lock the runqueue a given task resides on and disable
 579  * interrupts.  Note the ordering: we can safely lookup the task_rq without
 580  * explicitly disabling preemption.
 581  */
 582 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 583         __acquires(rq->lock)
 584 {
 585         struct rq *rq;
 586
 587 repeat_lock_task:
 588         local_irq_save(*flags);
 589         rq = task_rq(p);
 590         spin_lock(&rq->lock);
 591         if (unlikely(rq != task_rq(p))) {
 592                 spin_unlock_irqrestore(&rq->lock, *flags);
 593                 goto repeat_lock_task;
 594         }
 595         return rq;
 596 }
 597
 598 static inline void __task_rq_unlock(struct rq *rq)
 599         __releases(rq->lock)
 600 {
 601         spin_unlock(&rq->lock);
 602 }
 603
 604 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
 605         __releases(rq->lock)
 606 {
 607         spin_unlock_irqrestore(&rq->lock, *flags);
 608 }
 609
 610 /*
 611  * this_rq_lock - lock this runqueue and disable interrupts.
 612  */
 613 static inline struct rq *this_rq_lock(void)
 614         __acquires(rq->lock)
 615 {
 616         struct rq *rq;
 617
 618         local_irq_disable();
 619         rq = this_rq();
 620         spin_lock(&rq->lock);
 621
 622         return rq;
 623 }
 624
 625 /*
 626  * CPU frequency is/was unstable - start new by setting prev_clock_raw:
 627  */
 628 void sched_clock_unstable_event(void)
 629 {
 630         unsigned long flags;
 631         struct rq *rq;
 632
 633         rq = task_rq_lock(current, &flags);
 634         rq->prev_clock_raw = sched_clock();
 635         rq->clock_unstable_events++;
 636         task_rq_unlock(rq, &flags);
 637 }
 638
 639 /*
 640  * resched_task - mark a task 'to be rescheduled now'.
 641  *
 642  * On UP this means the setting of the need_resched flag, on SMP it
 643  * might also involve a cross-CPU call to trigger the scheduler on
 644  * the target CPU.
 645  */
 646 #ifdef CONFIG_SMP
 647
 648 #ifndef tsk_is_polling
 649 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 650 #endif
 651
 652 static void resched_task(struct task_struct *p)
 653 {
 654         int cpu;
 655
 656         assert_spin_locked(&task_rq(p)->lock);
 657
 658         if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 659                 return;
 660
 661         set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 662
 663         cpu = task_cpu(p);
 664         if (cpu == smp_processor_id())
 665                 return;
 666
 667         /* NEED_RESCHED must be visible before we test polling */
 668         smp_mb();
 669         if (!tsk_is_polling(p))
 670                 smp_send_reschedule(cpu);
 671 }
 672
 673 static void resched_cpu(int cpu)
 674 {
 675         struct rq *rq = cpu_rq(cpu);
 676         unsigned long flags;
 677
 678         if (!spin_trylock_irqsave(&rq->lock, flags))
 679                 return;
 680         resched_task(cpu_curr(cpu));
 681         spin_unlock_irqrestore(&rq->lock, flags);
 682 }
 683 #else
 684 static inline void resched_task(struct task_struct *p)
 685 {
 686         assert_spin_locked(&task_rq(p)->lock);
 687         set_tsk_need_resched(p);
 688 }
 689 #endif
 690
 691 static u64 div64_likely32(u64 divident, unsigned long divisor)
 692 {
 693 #if BITS_PER_LONG == 32
 694         if (likely(divident <= 0xffffffffULL))
 695                 return (u32)divident / divisor;
 696         do_div(divident, divisor);
 697
 698         return divident;
 699 #else
 700         return divident / divisor;
 701 #endif
 702 }
 703
 704 #if BITS_PER_LONG == 32
 705 # define WMULT_CONST    (~0UL)
 706 #else
 707 # define WMULT_CONST    (1UL << 32)
 708 #endif
 709
 710 #define WMULT_SHIFT     32
 711
 712 static inline unsigned long
 713 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 714                 struct load_weight *lw)
 715 {
 716         u64 tmp;
 717
 718         if (unlikely(!lw->inv_weight))
 719                 lw->inv_weight = WMULT_CONST / lw->weight;
 720
 721         tmp = (u64)delta_exec * weight;
 722         /*
 723          * Check whether we'd overflow the 64-bit multiplication:
 724          */
 725         if (unlikely(tmp > WMULT_CONST)) {
 726                 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
 727                                 >> (WMULT_SHIFT/2);
 728         } else {
 729                 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
 730         }
 731
 732         return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
 733 }
 734
 735 static inline unsigned long
 736 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
 737 {
 738         return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
 739 }
 740
 741 static void update_load_add(struct load_weight *lw, unsigned long inc)
 742 {
 743         lw->weight += inc;
 744         lw->inv_weight = 0;
 745 }
 746
 747 static void update_load_sub(struct load_weight *lw, unsigned long dec)
 748 {
 749         lw->weight -= dec;
 750         lw->inv_weight = 0;
 751 }
 752
 753 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
 754 {
 755         if (rq->curr != rq->idle && ls->load.weight) {
 756                 ls->delta_exec += ls->delta_stat;
 757                 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
 758                 ls->delta_stat = 0;
 759         }
 760 }
 761
 762 /*
 763  * Update delta_exec, delta_fair fields for rq.
 764  *
 765  * delta_fair clock advances at a rate inversely proportional to
 766  * total load (rq->ls.load.weight) on the runqueue, while
 767  * delta_exec advances at the same rate as wall-clock (provided
 768  * cpu is not idle).
 769  *
 770  * delta_exec / delta_fair is a measure of the (smoothened) load on this
 771  * runqueue over any given interval. This (smoothened) load is used
 772  * during load balance.
 773  *
 774  * This function is called /before/ updating rq->ls.load
 775  * and when switching tasks.
 776  */
 777 static void update_curr_load(struct rq *rq, u64 now)
 778 {
 779         struct load_stat *ls = &rq->ls;
 780         u64 start;
 781
 782         start = ls->load_update_start;
 783         ls->load_update_start = now;
 784         ls->delta_stat += now - start;
 785         /*
 786          * Stagger updates to ls->delta_fair. Very frequent updates
 787          * can be expensive.
 788          */
 789         if (ls->delta_stat >= sysctl_sched_stat_granularity)
 790                 __update_curr_load(rq, ls);
 791 }
 792
 793 /*
 794  * To aid in avoiding the subversion of "niceness" due to uneven distribution
 795  * of tasks with abnormal "nice" values across CPUs the contribution that
 796  * each task makes to its run queue's load is weighted according to its
 797  * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is just a
 798  * scaled version of the new time slice allocation that they receive on time
 799  * slice expiry etc.
 800  */
 801
 802 /*
 803  * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
 804  * If static_prio_timeslice() is ever changed to break this assumption then
 805  * this code will need modification
 806  */
 807 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
 808 #define load_weight(lp) \
 809         (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
 810 #define PRIO_TO_LOAD_WEIGHT(prio) \
 811         load_weight(static_prio_timeslice(prio))
 812 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
 813         (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
 814
 815 #define WEIGHT_IDLEPRIO         2
 816 #define WMULT_IDLEPRIO          (1 << 31)
 817
 818 /*
 819  * Nice levels are multiplicative, with a gentle 10% change for every
 820  * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 821  * nice 1, it will get ~10% less CPU time than another CPU-bound task
 822  * that remained on nice 0.
 823  *
 824  * The "10% effect" is relative and cumulative: from _any_ nice level,
 825  * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 826  * it's +10% CPU usage.
 827  */
 828 static const int prio_to_weight[40] = {
 829 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
 830 /* -10 */  9537,  7629,  6103,  4883,  3906,  3125,  2500,  2000,  1600,  1280,
 831 /*   0 */  NICE_0_LOAD /* 1024 */,
 832 /*   1 */          819,   655,   524,   419,   336,   268,   215,   172,   137,
 833 /*  10 */   110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
 834 };
 835
 836 static const u32 prio_to_wmult[40] = {
 837         48356,   60446,   75558,   94446,  118058,  147573,
 838         184467,  230589,  288233,  360285,  450347,
 839         562979,  703746,  879575, 1099582, 1374389,
 840         717986, 2147483, 2684354, 3355443, 4194304,
 841         244160, 6557201, 8196502, 10250518, 12782640,
 842         16025997, 19976592, 24970740, 31350126, 39045157,
 843         49367440, 61356675, 76695844, 95443717, 119304647,
 844         148102320, 186737708, 238609294, 286331153,
 845 };
 846
 847 static inline void
 848 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
 849 {
 850         update_curr_load(rq, now);
 851         update_load_add(&rq->ls.load, p->se.load.weight);
 852 }
 853
 854 static inline void
 855 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
 856 {
 857         update_curr_load(rq, now);
 858         update_load_sub(&rq->ls.load, p->se.load.weight);
 859 }
 860
 861 static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
 862 {
 863         rq->nr_running++;
 864         inc_load(rq, p, now);
 865 }
 866
 867 static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
 868 {
 869         rq->nr_running--;
 870         dec_load(rq, p, now);
 871 }
 872
 873 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
 874
 875 /*
 876  * runqueue iterator, to support SMP load-balancing between different
 877  * scheduling classes, without having to expose their internal data
 878  * structures to the load-balancing proper:
 879  */
 880 struct rq_iterator {
 881         void *arg;
 882         struct task_struct *(*start)(void *);
 883         struct task_struct *(*next)(void *);
 884 };
 885
 886 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 887                       unsigned long max_nr_move, unsigned long max_load_move,
 888                       struct sched_domain *sd, enum cpu_idle_type idle,
 889                       int *all_pinned, unsigned long *load_moved,
 890                       int this_best_prio, int best_prio, int best_prio_seen,
 891                       struct rq_iterator *iterator);
 892
 893 #include "sched_stats.h"
 894 #include "sched_rt.c"
 895 #include "sched_fair.c"
 896 #include "sched_idletask.c"
 897 #ifdef CONFIG_SCHED_DEBUG
 898 # include "sched_debug.c"
 899 #endif
 900
 901 #define sched_class_highest (&rt_sched_class)
 902
 903 static void set_load_weight(struct task_struct *p)
 904 {
 905         task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
 906         p->se.wait_runtime = 0;
 907
 908         if (task_has_rt_policy(p)) {
 909                 p->se.load.weight = prio_to_weight[0] * 2;
 910                 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
 911                 return;
 912         }
 913
 914         /*
 915          * SCHED_IDLE tasks get minimal weight:
 916          */
 917         if (p->policy == SCHED_IDLE) {
 918                 p->se.load.weight = WEIGHT_IDLEPRIO;
 919                 p->se.load.inv_weight = WMULT_IDLEPRIO;
 920                 return;
 921         }
 922
 923         p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
 924         p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 925 }
 926
 927 static void
 928 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
 929 {
 930         sched_info_queued(p);
 931         p->sched_class->enqueue_task(rq, p, wakeup, now);
 932         p->se.on_rq = 1;
 933 }
 934
 935 static void
 936 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
 937 {
 938         p->sched_class->dequeue_task(rq, p, sleep, now);
 939         p->se.on_rq = 0;
 940 }
 941
 942 /*
 943  * __normal_prio - return the priority that is based on the static prio
 944  */
 945 static inline int __normal_prio(struct task_struct *p)
 946 {
 947         return p->static_prio;
 948 }
 949
 950 /*
 951  * Calculate the expected normal priority: i.e. priority
 952  * without taking RT-inheritance into account. Might be
 953  * boosted by interactivity modifiers. Changes upon fork,
 954  * setprio syscalls, and whenever the interactivity
 955  * estimator recalculates.
 956  */
 957 static inline int normal_prio(struct task_struct *p)
 958 {
 959         int prio;
 960
 961         if (task_has_rt_policy(p))
 962                 prio = MAX_RT_PRIO-1 - p->rt_priority;
 963         else
 964                 prio = __normal_prio(p);
 965         return prio;
 966 }
 967
 968 /*
 969  * Calculate the current priority, i.e. the priority
 970  * taken into account by the scheduler. This value might
 971  * be boosted by RT tasks, or might be boosted by
 972  * interactivity modifiers. Will be RT if the task got
 973  * RT-boosted. If not then it returns p->normal_prio.
 974  */
 975 static int effective_prio(struct task_struct *p)
 976 {
 977         p->normal_prio = normal_prio(p);
 978         /*
 979          * If we are RT tasks or we were boosted to RT priority,
 980          * keep the priority unchanged. Otherwise, update priority
 981          * to the normal priority:
 982          */
 983         if (!rt_prio(p->prio))
 984                 return p->normal_prio;
 985         return p->prio;
 986 }
 987
 988 /*
 989  * activate_task - move a task to the runqueue.
 990  */
 991 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 992 {
 993         u64 now = rq_clock(rq);
 994
 995         if (p->state == TASK_UNINTERRUPTIBLE)
 996                 rq->nr_uninterruptible--;
 997
 998         enqueue_task(rq, p, wakeup, now);
 999         inc_nr_running(p, rq, now);
1000 }
1001
1002 /*
1003  * activate_idle_task - move idle task to the _front_ of runqueue.
1004  */
1005 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
1006 {
1007         u64 now = rq_clock(rq);
1008
1009         if (p->state == TASK_UNINTERRUPTIBLE)
1010                 rq->nr_uninterruptible--;
1011
1012         enqueue_task(rq, p, 0, now);
1013         inc_nr_running(p, rq, now);
1014 }
1015
1016 /*
1017  * deactivate_task - remove a task from the runqueue.
1018  */
1019 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1020 {
1021         u64 now = rq_clock(rq);
1022
1023         if (p->state == TASK_UNINTERRUPTIBLE)
1024                 rq->nr_uninterruptible++;
1025
1026         dequeue_task(rq, p, sleep, now);
1027         dec_nr_running(p, rq, now);
1028 }
1029
1030 /**
1031  * task_curr - is this task currently executing on a CPU?
1032  * @p: the task in question.
1033  */
1034 inline int task_curr(const struct task_struct *p)
1035 {
1036         return cpu_curr(task_cpu(p)) == p;
1037 }
1038
1039 /* Used instead of source_load when we know the type == 0 */
1040 unsigned long weighted_cpuload(const int cpu)
1041 {
1042         return cpu_rq(cpu)->ls.load.weight;
1043 }
1044
1045 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1046 {
1047 #ifdef CONFIG_SMP
1048         task_thread_info(p)->cpu = cpu;
1049         set_task_cfs_rq(p);
1050 #endif
1051 }
1052
1053 #ifdef CONFIG_SMP
1054
1055 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1056 {
1057         int old_cpu = task_cpu(p);
1058         struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1059         u64 clock_offset, fair_clock_offset;
1060
1061         clock_offset = old_rq->clock - new_rq->clock;
1062         fair_clock_offset = old_rq->cfs.fair_clock -
1063                                                  new_rq->cfs.fair_clock;
1064         if (p->se.wait_start)
1065                 p->se.wait_start -= clock_offset;
1066         if (p->se.wait_start_fair)
1067                 p->se.wait_start_fair -= fair_clock_offset;
1068         if (p->se.sleep_start)
1069                 p->se.sleep_start -= clock_offset;
1070         if (p->se.block_start)
1071                 p->se.block_start -= clock_offset;
1072         if (p->se.sleep_start_fair)
1073                 p->se.sleep_start_fair -= fair_clock_offset;
1074
1075         __set_task_cpu(p, new_cpu);
1076 }
1077
1078 struct migration_req {
1079         struct list_head list;
1080
1081         struct task_struct *task;
1082         int dest_cpu;
1083
1084         struct completion done;
1085 };
1086
1087 /*
1088  * The task's runqueue lock must be held.
1089  * Returns true if you have to wait for migration thread.
1090  */
1091 static int
1092 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1093 {
1094         struct rq *rq = task_rq(p);
1095
1096         /*
1097          * If the task is not on a runqueue (and not running), then
1098          * it is sufficient to simply update the task's cpu field.
1099          */
1100         if (!p->se.on_rq && !task_running(rq, p)) {
1101                 set_task_cpu(p, dest_cpu);
1102                 return 0;
1103         }
1104
1105         init_completion(&req->done);
1106         req->task = p;
1107         req->dest_cpu = dest_cpu;
1108         list_add(&req->list, &rq->migration_queue);
1109
1110         return 1;
1111 }
1112
1113 /*
1114  * wait_task_inactive - wait for a thread to unschedule.
1115  *
1116  * The caller must ensure that the task *will* unschedule sometime soon,
1117  * else this function might spin for a *long* time. This function can't
1118  * be called with interrupts off, or it may introduce deadlock with
1119  * smp_call_function() if an IPI is sent by the same process we are
1120  * waiting to become inactive.
1121  */
1122 void wait_task_inactive(struct task_struct *p)
1123 {
1124         unsigned long flags;
1125         int running, on_rq;
1126         struct rq *rq;
1127
1128 repeat:
1129         /*
1130          * We do the initial early heuristics without holding
1131          * any task-queue locks at all. We'll only try to get
1132          * the runqueue lock when things look like they will
1133          * work out!
1134          */
1135         rq = task_rq(p);
1136
1137         /*
1138          * If the task is actively running on another CPU
1139          * still, just relax and busy-wait without holding
1140          * any locks.
1141          *
1142          * NOTE! Since we don't hold any locks, it's not
1143          * even sure that "rq" stays as the right runqueue!
1144          * But we don't care, since "task_running()" will
1145          * return false if the runqueue has changed and p
1146          * is actually now running somewhere else!
1147          */
1148         while (task_running(rq, p))
1149                 cpu_relax();
1150
1151         /*
1152          * Ok, time to look more closely! We need the rq
1153          * lock now, to be *sure*. If we're wrong, we'll
1154          * just go back and repeat.
1155          */
1156         rq = task_rq_lock(p, &flags);
1157         running = task_running(rq, p);
1158         on_rq = p->se.on_rq;
1159         task_rq_unlock(rq, &flags);
1160
1161         /*
1162          * Was it really running after all now that we
1163          * checked with the proper locks actually held?
1164          *
1165          * Oops. Go back and try again..
1166          */
1167         if (unlikely(running)) {
1168                 cpu_relax();
1169                 goto repeat;
1170         }
1171
1172         /*
1173          * It's not enough that it's not actively running,
1174          * it must be off the runqueue _entirely_, and not
1175          * preempted!
1176          *
1177          * So if it wa still runnable (but just not actively
1178          * running right now), it's preempted, and we should
1179          * yield - it could be a while.
1180          */
1181         if (unlikely(on_rq)) {
1182                 yield();
1183                 goto repeat;
1184         }
1185
1186         /*
1187          * Ahh, all good. It wasn't running, and it wasn't
1188          * runnable, which means that it will never become
1189          * running in the future either. We're all done!
1190          */
1191 }
1192
1193 /***
1194  * kick_process - kick a running thread to enter/exit the kernel
1195  * @p: the to-be-kicked thread
1196  *
1197  * Cause a process which is running on another CPU to enter
1198  * kernel-mode, without any delay. (to get signals handled.)
1199  *
1200  * NOTE: this function doesnt have to take the runqueue lock,
1201  * because all it wants to ensure is that the remote task enters
1202  * the kernel. If the IPI races and the task has been migrated
1203  * to another CPU then no harm is done and the purpose has been
1204  * achieved as well.
1205  */
1206 void kick_process(struct task_struct *p)
1207 {
1208         int cpu;
1209
1210         preempt_disable();
1211         cpu = task_cpu(p);
1212         if ((cpu != smp_processor_id()) && task_curr(p))
1213                 smp_send_reschedule(cpu);
1214         preempt_enable();
1215 }
1216
1217 /*
1218  * Return a low guess at the load of a migration-source cpu weighted
1219  * according to the scheduling class and "nice" value.
1220  *
1221  * We want to under-estimate the load of migration sources, to
1222  * balance conservatively.
1223  */
1224 static inline unsigned long source_load(int cpu, int type)
1225 {
1226         struct rq *rq = cpu_rq(cpu);
1227         unsigned long total = weighted_cpuload(cpu);
1228
1229         if (type == 0)
1230                 return total;
1231
1232         return min(rq->cpu_load[type-1], total);
1233 }
1234
1235 /*
1236  * Return a high guess at the load of a migration-target cpu weighted
1237  * according to the scheduling class and "nice" value.
1238  */
1239 static inline unsigned long target_load(int cpu, int type)
1240 {
1241         struct rq *rq = cpu_rq(cpu);
1242         unsigned long total = weighted_cpuload(cpu);
1243
1244         if (type == 0)
1245                 return total;
1246
1247         return max(rq->cpu_load[type-1], total);
1248 }
1249
1250 /*
1251  * Return the average load per task on the cpu's run queue
1252  */
1253 static inline unsigned long cpu_avg_load_per_task(int cpu)
1254 {
1255         struct rq *rq = cpu_rq(cpu);
1256         unsigned long total = weighted_cpuload(cpu);
1257         unsigned long n = rq->nr_running;
1258
1259         return n ? total / n : SCHED_LOAD_SCALE;
1260 }
1261
1262 /*
1263  * find_idlest_group finds and returns the least busy CPU group within the
1264  * domain.
1265  */
1266 static struct sched_group *
1267 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1268 {
1269         struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1270         unsigned long min_load = ULONG_MAX, this_load = 0;
1271         int load_idx = sd->forkexec_idx;
1272         int imbalance = 100 + (sd->imbalance_pct-100)/2;
1273
1274         do {
1275                 unsigned long load, avg_load;
1276                 int local_group;
1277                 int i;
1278
1279                 /* Skip over this group if it has no CPUs allowed */
1280                 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1281                         goto nextgroup;
1282
1283                 local_group = cpu_isset(this_cpu, group->cpumask);
1284
1285                 /* Tally up the load of all CPUs in the group */
1286                 avg_load = 0;
1287
1288                 for_each_cpu_mask(i, group->cpumask) {
1289                         /* Bias balancing toward cpus of our domain */
1290                         if (local_group)
1291                                 load = source_load(i, load_idx);
1292                         else
1293                                 load = target_load(i, load_idx);
1294
1295                         avg_load += load;
1296                 }
1297
1298                 /* Adjust by relative CPU power of the group */
1299                 avg_load = sg_div_cpu_power(group,
1300                                 avg_load * SCHED_LOAD_SCALE);
1301
1302                 if (local_group) {
1303                         this_load = avg_load;
1304                         this = group;
1305                 } else if (avg_load < min_load) {
1306                         min_load = avg_load;
1307                         idlest = group;
1308                 }
1309 nextgroup:
1310                 group = group->next;
1311         } while (group != sd->groups);
1312
1313         if (!idlest || 100*this_load < imbalance*min_load)
1314                 return NULL;
1315         return idlest;
1316 }
1317
1318 /*
1319  * find_idlest_cpu - find the idlest cpu among the cpus in group.
1320  */
1321 static int
1322 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1323 {
1324         cpumask_t tmp;
1325         unsigned long load, min_load = ULONG_MAX;
1326         int idlest = -1;
1327         int i;
1328
1329         /* Traverse only the allowed CPUs */
1330         cpus_and(tmp, group->cpumask, p->cpus_allowed);
1331
1332         for_each_cpu_mask(i, tmp) {
1333                 load = weighted_cpuload(i);
1334
1335                 if (load < min_load || (load == min_load && i == this_cpu)) {
1336                         min_load = load;
1337                         idlest = i;
1338                 }
1339         }
1340
1341         return idlest;
1342 }
1343
1344 /*
1345  * sched_balance_self: balance the current task (running on cpu) in domains
1346  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1347  * SD_BALANCE_EXEC.
1348  *
1349  * Balance, ie. select the least loaded group.
1350  *
1351  * Returns the target CPU number, or the same CPU if no balancing is needed.
1352  *
1353  * preempt must be disabled.
1354  */
1355 static int sched_balance_self(int cpu, int flag)
1356 {
1357         struct task_struct *t = current;
1358         struct sched_domain *tmp, *sd = NULL;
1359
1360         for_each_domain(cpu, tmp) {
1361                 /*
1362                  * If power savings logic is enabled for a domain, stop there.
1363                  */
1364                 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1365                         break;
1366                 if (tmp->flags & flag)
1367                         sd = tmp;
1368         }
1369
1370         while (sd) {
1371                 cpumask_t span;
1372                 struct sched_group *group;
1373                 int new_cpu, weight;
1374
1375                 if (!(sd->flags & flag)) {
1376                         sd = sd->child;
1377                         continue;
1378                 }
1379
1380                 span = sd->span;
1381                 group = find_idlest_group(sd, t, cpu);
1382                 if (!group) {
1383                         sd = sd->child;
1384                         continue;
1385                 }
1386
1387                 new_cpu = find_idlest_cpu(group, t, cpu);
1388                 if (new_cpu == -1 || new_cpu == cpu) {
1389                         /* Now try balancing at a lower domain level of cpu */
1390                         sd = sd->child;
1391                         continue;
1392                 }
1393
1394                 /* Now try balancing at a lower domain level of new_cpu */
1395                 cpu = new_cpu;
1396                 sd = NULL;
1397                 weight = cpus_weight(span);
1398                 for_each_domain(cpu, tmp) {
1399                         if (weight <= cpus_weight(tmp->span))
1400                                 break;
1401                         if (tmp->flags & flag)
1402                                 sd = tmp;
1403                 }
1404                 /* while loop will break here if sd == NULL */
1405         }
1406
1407         return cpu;
1408 }
1409
1410 #endif /* CONFIG_SMP */
1411
1412 /*
1413  * wake_idle() will wake a task on an idle cpu if task->cpu is
1414  * not idle and an idle cpu is available.  The span of cpus to
1415  * search starts with cpus closest then further out as needed,
1416  * so we always favor a closer, idle cpu.
1417  *
1418  * Returns the CPU we should wake onto.
1419  */
1420 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1421 static int wake_idle(int cpu, struct task_struct *p)
1422 {
1423         cpumask_t tmp;
1424         struct sched_domain *sd;
1425         int i;
1426
1427         /*
1428          * If it is idle, then it is the best cpu to run this task.
1429          *
1430          * This cpu is also the best, if it has more than one task already.
1431          * Siblings must be also busy(in most cases) as they didn't already
1432          * pickup the extra load from this cpu and hence we need not check
1433          * sibling runqueue info. This will avoid the checks and cache miss
1434          * penalities associated with that.
1435          */
1436         if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1437                 return cpu;
1438
1439         for_each_domain(cpu, sd) {
1440                 if (sd->flags & SD_WAKE_IDLE) {
1441                         cpus_and(tmp, sd->span, p->cpus_allowed);
1442                         for_each_cpu_mask(i, tmp) {
1443                                 if (idle_cpu(i))
1444                                         return i;
1445                         }
1446                 }
1447                 else
1448                         break;
1449         }
1450         return cpu;
1451 }
1452 #else
1453 static inline int wake_idle(int cpu, struct task_struct *p)
1454 {
1455         return cpu;
1456 }
1457 #endif
1458
1459 /***
1460  * try_to_wake_up - wake up a thread
1461  * @p: the to-be-woken-up thread
1462  * @state: the mask of task states that can be woken
1463  * @sync: do a synchronous wakeup?
1464  *
1465  * Put it on the run-queue if it's not already there. The "current"
1466  * thread is always on the run-queue (except when the actual
1467  * re-schedule is in progress), and as such you're allowed to do
1468  * the simpler "current->state = TASK_RUNNING" to mark yourself
1469  * runnable without the overhead of this.
1470  *
1471  * returns failure only if the task is already active.
1472  */
1473 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1474 {
1475         int cpu, this_cpu, success = 0;
1476         unsigned long flags;
1477         long old_state;
1478         struct rq *rq;
1479 #ifdef CONFIG_SMP
1480         struct sched_domain *sd, *this_sd = NULL;
1481         unsigned long load, this_load;
1482         int new_cpu;
1483 #endif
1484
1485         rq = task_rq_lock(p, &flags);
1486         old_state = p->state;
1487         if (!(old_state & state))
1488                 goto out;
1489
1490         if (p->se.on_rq)
1491                 goto out_running;
1492
1493         cpu = task_cpu(p);
1494         this_cpu = smp_processor_id();
1495
1496 #ifdef CONFIG_SMP
1497         if (unlikely(task_running(rq, p)))
1498                 goto out_activate;
1499
1500         new_cpu = cpu;
1501
1502         schedstat_inc(rq, ttwu_cnt);
1503         if (cpu == this_cpu) {
1504                 schedstat_inc(rq, ttwu_local);
1505                 goto out_set_cpu;
1506         }
1507
1508         for_each_domain(this_cpu, sd) {
1509                 if (cpu_isset(cpu, sd->span)) {
1510                         schedstat_inc(sd, ttwu_wake_remote);
1511                         this_sd = sd;
1512                         break;
1513                 }
1514         }
1515
1516         if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1517                 goto out_set_cpu;
1518
1519         /*
1520          * Check for affine wakeup and passive balancing possibilities.
1521          */
1522         if (this_sd) {
1523                 int idx = this_sd->wake_idx;
1524                 unsigned int imbalance;
1525
1526                 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1527
1528                 load = source_load(cpu, idx);
1529                 this_load = target_load(this_cpu, idx);
1530
1531                 new_cpu = this_cpu; /* Wake to this CPU if we can */
1532
1533                 if (this_sd->flags & SD_WAKE_AFFINE) {
1534                         unsigned long tl = this_load;
1535                         unsigned long tl_per_task;
1536
1537                         tl_per_task = cpu_avg_load_per_task(this_cpu);
1538
1539                         /*
1540                          * If sync wakeup then subtract the (maximum possible)
1541                          * effect of the currently running task from the load
1542                          * of the current CPU:
1543                          */
1544                         if (sync)
1545                                 tl -= current->se.load.weight;
1546
1547                         if ((tl <= load &&
1548                                 tl + target_load(cpu, idx) <= tl_per_task) ||
1549                                100*(tl + p->se.load.weight) <= imbalance*load) {
1550                                 /*
1551                                  * This domain has SD_WAKE_AFFINE and
1552                                  * p is cache cold in this domain, and
1553                                  * there is no bad imbalance.
1554                                  */
1555                                 schedstat_inc(this_sd, ttwu_move_affine);
1556                                 goto out_set_cpu;
1557                         }
1558                 }
1559
1560                 /*
1561                  * Start passive balancing when half the imbalance_pct
1562                  * limit is reached.
1563                  */
1564                 if (this_sd->flags & SD_WAKE_BALANCE) {
1565                         if (imbalance*this_load <= 100*load) {
1566                                 schedstat_inc(this_sd, ttwu_move_balance);
1567                                 goto out_set_cpu;
1568                         }
1569                 }
1570         }
1571
1572         new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1573 out_set_cpu:
1574         new_cpu = wake_idle(new_cpu, p);
1575         if (new_cpu != cpu) {
1576                 set_task_cpu(p, new_cpu);
1577                 task_rq_unlock(rq, &flags);
1578                 /* might preempt at this point */
1579                 rq = task_rq_lock(p, &flags);
1580                 old_state = p->state;
1581                 if (!(old_state & state))
1582                         goto out;
1583                 if (p->se.on_rq)
1584                         goto out_running;
1585
1586                 this_cpu = smp_processor_id();
1587                 cpu = task_cpu(p);
1588         }
1589
1590 out_activate:
1591 #endif /* CONFIG_SMP */
1592         activate_task(rq, p, 1);
1593         /*
1594          * Sync wakeups (i.e. those types of wakeups where the waker
1595          * has indicated that it will leave the CPU in short order)
1596          * don't trigger a preemption, if the woken up task will run on
1597          * this cpu. (in this case the 'I will reschedule' promise of
1598          * the waker guarantees that the freshly woken up task is going
1599          * to be considered on this CPU.)
1600          */
1601         if (!sync || cpu != this_cpu)
1602                 check_preempt_curr(rq, p);
1603         success = 1;
1604
1605 out_running:
1606         p->state = TASK_RUNNING;
1607 out:
1608         task_rq_unlock(rq, &flags);
1609
1610         return success;
1611 }
1612
1613 int fastcall wake_up_process(struct task_struct *p)
1614 {
1615         return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1616                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1617 }
1618 EXPORT_SYMBOL(wake_up_process);
1619
1620 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1621 {
1622         return try_to_wake_up(p, state, 0);
1623 }
1624
1625 /*
1626  * Perform scheduler related setup for a newly forked process p.
1627  * p is forked by current.
1628  *
1629  * __sched_fork() is basic setup used by init_idle() too:
1630  */
1631 static void __sched_fork(struct task_struct *p)
1632 {
1633         p->se.wait_start_fair           = 0;
1634         p->se.wait_start                = 0;
1635         p->se.exec_start                = 0;
1636         p->se.sum_exec_runtime          = 0;
1637         p->se.delta_exec                = 0;
1638         p->se.delta_fair_run            = 0;
1639         p->se.delta_fair_sleep          = 0;
1640         p->se.wait_runtime              = 0;
1641         p->se.sum_wait_runtime          = 0;
1642         p->se.sum_sleep_runtime         = 0;
1643         p->se.sleep_start               = 0;
1644         p->se.sleep_start_fair          = 0;
1645         p->se.block_start               = 0;
1646         p->se.sleep_max                 = 0;
1647         p->se.block_max                 = 0;
1648         p->se.exec_max                  = 0;
1649         p->se.wait_max                  = 0;
1650         p->se.wait_runtime_overruns     = 0;
1651         p->se.wait_runtime_underruns    = 0;
1652
1653         INIT_LIST_HEAD(&p->run_list);
1654         p->se.on_rq = 0;
1655
1656         /*
1657          * We mark the process as running here, but have not actually
1658          * inserted it onto the runqueue yet. This guarantees that
1659          * nobody will actually run it, and a signal or other external
1660          * event cannot wake it up and insert it on the runqueue either.
1661          */
1662         p->state = TASK_RUNNING;
1663 }
1664
1665 /*
1666  * fork()/clone()-time setup:
1667  */
1668 void sched_fork(struct task_struct *p, int clone_flags)
1669 {
1670         int cpu = get_cpu();
1671
1672         __sched_fork(p);
1673
1674 #ifdef CONFIG_SMP
1675         cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1676 #endif
1677         __set_task_cpu(p, cpu);
1678
1679         /*
1680          * Make sure we do not leak PI boosting priority to the child:
1681          */
1682         p->prio = current->normal_prio;
1683
1684 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1685         if (likely(sched_info_on()))
1686                 memset(&p->sched_info, 0, sizeof(p->sched_info));
1687 #endif
1688 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1689         p->oncpu = 0;
1690 #endif
1691 #ifdef CONFIG_PREEMPT
1692         /* Want to start with kernel preemption disabled. */
1693         task_thread_info(p)->preempt_count = 1;
1694 #endif
1695         put_cpu();
1696 }
1697
1698 /*
1699  * After fork, child runs first. (default) If set to 0 then
1700  * parent will (try to) run first.
1701  */
1702 unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1703
1704 /*
1705  * wake_up_new_task - wake up a newly created task for the first time.
1706  *
1707  * This function will do some initial scheduler statistics housekeeping
1708  * that must be done for every newly created context, then puts the task
1709  * on the runqueue and wakes it.
1710  */
1711 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1712 {
1713         unsigned long flags;
1714         struct rq *rq;
1715         int this_cpu;
1716
1717         rq = task_rq_lock(p, &flags);
1718         BUG_ON(p->state != TASK_RUNNING);
1719         this_cpu = smp_processor_id(); /* parent's CPU */
1720
1721         p->prio = effective_prio(p);
1722
1723         if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1724                         task_cpu(p) != this_cpu || !current->se.on_rq) {
1725                 activate_task(rq, p, 0);
1726         } else {
1727                 /*
1728                  * Let the scheduling class do new task startup
1729                  * management (if any):
1730                  */
1731                 p->sched_class->task_new(rq, p);
1732         }
1733         check_preempt_curr(rq, p);
1734         task_rq_unlock(rq, &flags);
1735 }
1736
1737 /**
1738  * prepare_task_switch - prepare to switch tasks
1739  * @rq: the runqueue preparing to switch
1740  * @next: the task we are going to switch to.
1741  *
1742  * This is called with the rq lock held and interrupts off. It must
1743  * be paired with a subsequent finish_task_switch after the context
1744  * switch.
1745  *
1746  * prepare_task_switch sets up locking and calls architecture specific
1747  * hooks.
1748  */
1749 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1750 {
1751         prepare_lock_switch(rq, next);
1752         prepare_arch_switch(next);
1753 }
1754
1755 /**
1756  * finish_task_switch - clean up after a task-switch
1757  * @rq: runqueue associated with task-switch
1758  * @prev: the thread we just switched away from.
1759  *
1760  * finish_task_switch must be called after the context switch, paired
1761  * with a prepare_task_switch call before the context switch.
1762  * finish_task_switch will reconcile locking set up by prepare_task_switch,
1763  * and do any other architecture-specific cleanup actions.
1764  *
1765  * Note that we may have delayed dropping an mm in context_switch(). If
1766  * so, we finish that here outside of the runqueue lock.  (Doing it
1767  * with the lock held can cause deadlocks; see schedule() for
1768  * details.)
1769  */
1770 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1771         __releases(rq->lock)
1772 {
1773         struct mm_struct *mm = rq->prev_mm;
1774         long prev_state;
1775
1776         rq->prev_mm = NULL;
1777
1778         /*
1779          * A task struct has one reference for the use as "current".
1780          * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1781          * schedule one last time. The schedule call will never return, and
1782          * the scheduled task must drop that reference.
1783          * The test for TASK_DEAD must occur while the runqueue locks are
1784          * still held, otherwise prev could be scheduled on another cpu, die
1785          * there before we look at prev->state, and then the reference would
1786          * be dropped twice.
1787          *              Manfred Spraul <manfred@colorfullife.com>
1788          */
1789         prev_state = prev->state;
1790         finish_arch_switch(prev);
1791         finish_lock_switch(rq, prev);
1792         if (mm)
1793                 mmdrop(mm);
1794         if (unlikely(prev_state == TASK_DEAD)) {
1795                 /*
1796                  * Remove function-return probe instances associated with this
1797                  * task and put them back on the free list.
1798                  */
1799                 kprobe_flush_task(prev);
1800                 put_task_struct(prev);
1801         }
1802 }
1803
1804 /**
1805  * schedule_tail - first thing a freshly forked thread must call.
1806  * @prev: the thread we just switched away from.
1807  */
1808 asmlinkage void schedule_tail(struct task_struct *prev)
1809         __releases(rq->lock)
1810 {
1811         struct rq *rq = this_rq();
1812
1813         finish_task_switch(rq, prev);
1814 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1815         /* In this case, finish_task_switch does not reenable preemption */
1816         preempt_enable();
1817 #endif
1818         if (current->set_child_tid)
1819                 put_user(current->pid, current->set_child_tid);
1820 }
1821
1822 /*
1823  * context_switch - switch to the new MM and the new
1824  * thread's register state.
1825  */
1826 static inline void
1827 context_switch(struct rq *rq, struct task_struct *prev,
1828                struct task_struct *next)
1829 {
1830         struct mm_struct *mm, *oldmm;
1831
1832         prepare_task_switch(rq, next);
1833         mm = next->mm;
1834         oldmm = prev->active_mm;
1835         /*
1836          * For paravirt, this is coupled with an exit in switch_to to
1837          * combine the page table reload and the switch backend into
1838          * one hypercall.
1839          */
1840         arch_enter_lazy_cpu_mode();
1841
1842         if (unlikely(!mm)) {
1843                 next->active_mm = oldmm;
1844                 atomic_inc(&oldmm->mm_count);
1845                 enter_lazy_tlb(oldmm, next);
1846         } else
1847                 switch_mm(oldmm, mm, next);
1848
1849         if (unlikely(!prev->mm)) {
1850                 prev->active_mm = NULL;
1851                 rq->prev_mm = oldmm;
1852         }
1853         /*
1854          * Since the runqueue lock will be released by the next
1855          * task (which is an invalid locking op but in the case
1856          * of the scheduler it's an obvious special-case), so we
1857          * do an early lockdep release here:
1858          */
1859 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1860         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1861 #endif
1862
1863         /* Here we just switch the register state and the stack. */
1864         switch_to(prev, next, prev);
1865
1866         barrier();
1867         /*
1868          * this_rq must be evaluated again because prev may have moved
1869          * CPUs since it called schedule(), thus the 'rq' on its stack
1870          * frame will be invalid.
1871          */
1872         finish_task_switch(this_rq(), prev);
1873 }
1874
1875 /*
1876  * nr_running, nr_uninterruptible and nr_context_switches:
1877  *
1878  * externally visible scheduler statistics: current number of runnable
1879  * threads, current number of uninterruptible-sleeping threads, total
1880  * number of context switches performed since bootup.
1881  */
1882 unsigned long nr_running(void)
1883 {
1884         unsigned long i, sum = 0;
1885
1886         for_each_online_cpu(i)
1887                 sum += cpu_rq(i)->nr_running;
1888
1889         return sum;
1890 }
1891
1892 unsigned long nr_uninterruptible(void)
1893 {
1894         unsigned long i, sum = 0;
1895
1896         for_each_possible_cpu(i)
1897                 sum += cpu_rq(i)->nr_uninterruptible;
1898
1899         /*
1900          * Since we read the counters lockless, it might be slightly
1901          * inaccurate. Do not allow it to go below zero though:
1902          */
1903         if (unlikely((long)sum < 0))
1904                 sum = 0;
1905
1906         return sum;
1907 }
1908
1909 unsigned long long nr_context_switches(void)
1910 {
1911         int i;
1912         unsigned long long sum = 0;
1913
1914         for_each_possible_cpu(i)
1915                 sum += cpu_rq(i)->nr_switches;
1916
1917         return sum;
1918 }
1919
1920 unsigned long nr_iowait(void)
1921 {
1922         unsigned long i, sum = 0;
1923
1924         for_each_possible_cpu(i)
1925                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1926
1927         return sum;
1928 }
1929
1930 unsigned long nr_active(void)
1931 {
1932         unsigned long i, running = 0, uninterruptible = 0;
1933
1934         for_each_online_cpu(i) {
1935                 running += cpu_rq(i)->nr_running;
1936                 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1937         }
1938
1939         if (unlikely((long)uninterruptible < 0))
1940                 uninterruptible = 0;
1941
1942         return running + uninterruptible;
1943 }
1944
1945 /*
1946  * Update rq->cpu_load[] statistics. This function is usually called every
1947  * scheduler tick (TICK_NSEC).
1948  */
1949 static void update_cpu_load(struct rq *this_rq)
1950 {
1951         u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1952         unsigned long total_load = this_rq->ls.load.weight;
1953         unsigned long this_load =  total_load;
1954         struct load_stat *ls = &this_rq->ls;
1955         u64 now = __rq_clock(this_rq);
1956         int i, scale;
1957
1958         this_rq->nr_load_updates++;
1959         if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1960                 goto do_avg;
1961
1962         /* Update delta_fair/delta_exec fields first */
1963         update_curr_load(this_rq, now);
1964
1965         fair_delta64 = ls->delta_fair + 1;
1966         ls->delta_fair = 0;
1967
1968         exec_delta64 = ls->delta_exec + 1;
1969         ls->delta_exec = 0;
1970
1971         sample_interval64 = now - ls->load_update_last;
1972         ls->load_update_last = now;
1973
1974         if ((s64)sample_interval64 < (s64)TICK_NSEC)
1975                 sample_interval64 = TICK_NSEC;
1976
1977         if (exec_delta64 > sample_interval64)
1978                 exec_delta64 = sample_interval64;
1979
1980         idle_delta64 = sample_interval64 - exec_delta64;
1981
1982         tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1983         tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1984
1985         this_load = (unsigned long)tmp64;
1986
1987 do_avg:
1988
1989         /* Update our load: */
1990         for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1991                 unsigned long old_load, new_load;
1992
1993                 /* scale is effectively 1 << i now, and >> i divides by scale */
1994
1995                 old_load = this_rq->cpu_load[i];
1996                 new_load = this_load;
1997
1998                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1999         }
2000 }
2001
2002 #ifdef CONFIG_SMP
2003
2004 /*
2005  * double_rq_lock - safely lock two runqueues
2006  *
2007  * Note this does not disable interrupts like task_rq_lock,
2008  * you need to do so manually before calling.
2009  */
2010 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2011         __acquires(rq1->lock)
2012         __acquires(rq2->lock)
2013 {
2014         BUG_ON(!irqs_disabled());
2015         if (rq1 == rq2) {
2016                 spin_lock(&rq1->lock);
2017                 __acquire(rq2->lock);   /* Fake it out ;) */
2018         } else {
2019                 if (rq1 < rq2) {
2020                         spin_lock(&rq1->lock);
2021                         spin_lock(&rq2->lock);
2022                 } else {
2023                         spin_lock(&rq2->lock);
2024                         spin_lock(&rq1->lock);
2025                 }
2026         }
2027 }
2028
2029 /*
2030  * double_rq_unlock - safely unlock two runqueues
2031  *
2032  * Note this does not restore interrupts like task_rq_unlock,
2033  * you need to do so manually after calling.
2034  */
2035 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2036         __releases(rq1->lock)
2037         __releases(rq2->lock)
2038 {
2039         spin_unlock(&rq1->lock);
2040         if (rq1 != rq2)
2041                 spin_unlock(&rq2->lock);
2042         else
2043                 __release(rq2->lock);
2044 }
2045
2046 /*
2047  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2048  */
2049 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2050         __releases(this_rq->lock)
2051         __acquires(busiest->lock)
2052         __acquires(this_rq->lock)
2053 {
2054         if (unlikely(!irqs_disabled())) {
2055                 /* printk() doesn't work good under rq->lock */
2056                 spin_unlock(&this_rq->lock);
2057                 BUG_ON(1);
2058         }
2059         if (unlikely(!spin_trylock(&busiest->lock))) {
2060                 if (busiest < this_rq) {
2061                         spin_unlock(&this_rq->lock);
2062                         spin_lock(&busiest->lock);
2063                         spin_lock(&this_rq->lock);
2064                 } else
2065                         spin_lock(&busiest->lock);
2066         }
2067 }
2068
2069 /*
2070  * If dest_cpu is allowed for this process, migrate the task to it.
2071  * This is accomplished by forcing the cpu_allowed mask to only
2072  * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
2073  * the cpu_allowed mask is restored.
2074  */
2075 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2076 {
2077         struct migration_req req;
2078         unsigned long flags;
2079         struct rq *rq;
2080
2081         rq = task_rq_lock(p, &flags);
2082         if (!cpu_isset(dest_cpu, p->cpus_allowed)
2083             || unlikely(cpu_is_offline(dest_cpu)))
2084                 goto out;
2085
2086         /* force the process onto the specified CPU */
2087         if (migrate_task(p, dest_cpu, &req)) {
2088                 /* Need to wait for migration thread (might exit: take ref). */
2089                 struct task_struct *mt = rq->migration_thread;
2090
2091                 get_task_struct(mt);
2092                 task_rq_unlock(rq, &flags);
2093                 wake_up_process(mt);
2094                 put_task_struct(mt);
2095                 wait_for_completion(&req.done);
2096
2097                 return;
2098         }
2099 out:
2100         task_rq_unlock(rq, &flags);
2101 }
2102
2103 /*
2104  * sched_exec - execve() is a valuable balancing opportunity, because at
2105  * this point the task has the smallest effective memory and cache footprint.
2106  */
2107 void sched_exec(void)
2108 {
2109         int new_cpu, this_cpu = get_cpu();
2110         new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2111         put_cpu();
2112         if (new_cpu != this_cpu)
2113                 sched_migrate_task(current, new_cpu);
2114 }
2115
2116 /*
2117  * pull_task - move a task from a remote runqueue to the local runqueue.
2118  * Both runqueues must be locked.
2119  */
2120 static void pull_task(struct rq *src_rq, struct task_struct *p,
2121                       struct rq *this_rq, int this_cpu)
2122 {
2123         deactivate_task(src_rq, p, 0);
2124         set_task_cpu(p, this_cpu);
2125         activate_task(this_rq, p, 0);
2126         /*
2127          * Note that idle threads have a prio of MAX_PRIO, for this test
2128          * to be always true for them.
2129          */
2130         check_preempt_curr(this_rq, p);
2131 }
2132
2133 /*
2134  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2135  */
2136 static
2137 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2138                      struct sched_domain *sd, enum cpu_idle_type idle,
2139                      int *all_pinned)
2140 {
2141         /*
2142          * We do not migrate tasks that are:
2143          * 1) running (obviously), or
2144          * 2) cannot be migrated to this CPU due to cpus_allowed, or
2145          * 3) are cache-hot on their current CPU.
2146          */
2147         if (!cpu_isset(this_cpu, p->cpus_allowed))
2148                 return 0;
2149         *all_pinned = 0;
2150
2151         if (task_running(rq, p))
2152                 return 0;
2153
2154         /*
2155          * Aggressive migration if too many balance attempts have failed:
2156          */
2157         if (sd->nr_balance_failed > sd->cache_nice_tries)
2158                 return 1;
2159
2160         return 1;
2161 }
2162
2163 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2164                       unsigned long max_nr_move, unsigned long max_load_move,
2165                       struct sched_domain *sd, enum cpu_idle_type idle,
2166                       int *all_pinned, unsigned long *load_moved,
2167                       int this_best_prio, int best_prio, int best_prio_seen,
2168                       struct rq_iterator *iterator)
2169 {
2170         int pulled = 0, pinned = 0, skip_for_load;
2171         struct task_struct *p;
2172         long rem_load_move = max_load_move;
2173
2174         if (max_nr_move == 0 || max_load_move == 0)
2175                 goto out;
2176
2177         pinned = 1;
2178
2179         /*
2180          * Start the load-balancing iterator:
2181          */
2182         p = iterator->start(iterator->arg);
2183 next:
2184         if (!p)
2185                 goto out;
2186         /*
2187          * To help distribute high priority tasks accross CPUs we don't
2188          * skip a task if it will be the highest priority task (i.e. smallest
2189          * prio value) on its new queue regardless of its load weight
2190          */
2191         skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2192                                                          SCHED_LOAD_SCALE_FUZZ;
2193         if (skip_for_load && p->prio < this_best_prio)
2194                 skip_for_load = !best_prio_seen && p->prio == best_prio;
2195         if (skip_for_load ||
2196             !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2197
2198                 best_prio_seen |= p->prio == best_prio;
2199                 p = iterator->next(iterator->arg);
2200                 goto next;
2201         }
2202
2203         pull_task(busiest, p, this_rq, this_cpu);
2204         pulled++;
2205         rem_load_move -= p->se.load.weight;
2206
2207         /*
2208          * We only want to steal up to the prescribed number of tasks
2209          * and the prescribed amount of weighted load.
2210          */
2211         if (pulled < max_nr_move && rem_load_move > 0) {
2212                 if (p->prio < this_best_prio)
2213                         this_best_prio = p->prio;
2214                 p = iterator->next(iterator->arg);
2215                 goto next;
2216         }
2217 out:
2218         /*
2219          * Right now, this is the only place pull_task() is called,
2220          * so we can safely collect pull_task() stats here rather than
2221          * inside pull_task().
2222          */
2223         schedstat_add(sd, lb_gained[idle], pulled);
2224
2225         if (all_pinned)
2226                 *all_pinned = pinned;
2227         *load_moved = max_load_move - rem_load_move;
2228         return pulled;
2229 }
2230
2231 /*
2232  * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2233  * load from busiest to this_rq, as part of a balancing operation within
2234  * "domain". Returns the number of tasks moved.
2235  *
2236  * Called with both runqueues locked.
2237  */
2238 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2239                       unsigned long max_nr_move, unsigned long max_load_move,
2240                       struct sched_domain *sd, enum cpu_idle_type idle,
2241                       int *all_pinned)
2242 {
2243         struct sched_class *class = sched_class_highest;
2244         unsigned long load_moved, total_nr_moved = 0, nr_moved;
2245         long rem_load_move = max_load_move;
2246
2247         do {
2248                 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2249                                 max_nr_move, (unsigned long)rem_load_move,
2250                                 sd, idle, all_pinned, &load_moved);
2251                 total_nr_moved += nr_moved;
2252                 max_nr_move -= nr_moved;
2253                 rem_load_move -= load_moved;
2254                 class = class->next;
2255         } while (class && max_nr_move && rem_load_move > 0);
2256
2257         return total_nr_moved;
2258 }
2259
2260 /*
2261  * find_busiest_group finds and returns the busiest CPU group within the
2262  * domain. It calculates and returns the amount of weighted load which
2263  * should be moved to restore balance via the imbalance parameter.
2264  */
2265 static struct sched_group *
2266 find_busiest_group(struct sched_domain *sd, int this_cpu,
2267                    unsigned long *imbalance, enum cpu_idle_type idle,
2268                    int *sd_idle, cpumask_t *cpus, int *balance)
2269 {
2270         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2271         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2272         unsigned long max_pull;
2273         unsigned long busiest_load_per_task, busiest_nr_running;
2274         unsigned long this_load_per_task, this_nr_running;
2275         int load_idx;
2276 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2277         int power_savings_balance = 1;
2278         unsigned long leader_nr_running = 0, min_load_per_task = 0;
2279         unsigned long min_nr_running = ULONG_MAX;
2280         struct sched_group *group_min = NULL, *group_leader = NULL;
2281 #endif
2282
2283         max_load = this_load = total_load = total_pwr = 0;
2284         busiest_load_per_task = busiest_nr_running = 0;
2285         this_load_per_task = this_nr_running = 0;
2286         if (idle == CPU_NOT_IDLE)
2287                 load_idx = sd->busy_idx;
2288         else if (idle == CPU_NEWLY_IDLE)
2289                 load_idx = sd->newidle_idx;
2290         else
2291                 load_idx = sd->idle_idx;
2292
2293         do {
2294                 unsigned long load, group_capacity;
2295                 int local_group;
2296                 int i;
2297                 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2298                 unsigned long sum_nr_running, sum_weighted_load;
2299
2300                 local_group = cpu_isset(this_cpu, group->cpumask);
2301
2302                 if (local_group)
2303                         balance_cpu = first_cpu(group->cpumask);
2304
2305                 /* Tally up the load of all CPUs in the group */
2306                 sum_weighted_load = sum_nr_running = avg_load = 0;
2307
2308                 for_each_cpu_mask(i, group->cpumask) {
2309                         struct rq *rq;
2310
2311                         if (!cpu_isset(i, *cpus))
2312                                 continue;
2313
2314                         rq = cpu_rq(i);
2315
2316                         if (*sd_idle && !idle_cpu(i))
2317                                 *sd_idle = 0;
2318
2319                         /* Bias balancing toward cpus of our domain */
2320                         if (local_group) {
2321                                 if (idle_cpu(i) && !first_idle_cpu) {
2322                                         first_idle_cpu = 1;
2323                                         balance_cpu = i;
2324                                 }
2325
2326                                 load = target_load(i, load_idx);
2327                         } else
2328                                 load = source_load(i, load_idx);
2329
2330                         avg_load += load;
2331                         sum_nr_running += rq->nr_running;
2332                         sum_weighted_load += weighted_cpuload(i);
2333                 }
2334
2335                 /*
2336                  * First idle cpu or the first cpu(busiest) in this sched group
2337                  * is eligible for doing load balancing at this and above
2338                  * domains.
2339                  */
2340                 if (local_group && balance_cpu != this_cpu && balance) {
2341                         *balance = 0;
2342                         goto ret;
2343                 }
2344
2345                 total_load += avg_load;
2346                 total_pwr += group->__cpu_power;
2347
2348                 /* Adjust by relative CPU power of the group */
2349                 avg_load = sg_div_cpu_power(group,
2350                                 avg_load * SCHED_LOAD_SCALE);
2351
2352                 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2353
2354                 if (local_group) {
2355                         this_load = avg_load;
2356                         this = group;
2357                         this_nr_running = sum_nr_running;
2358                         this_load_per_task = sum_weighted_load;
2359                 } else if (avg_load > max_load &&
2360                            sum_nr_running > group_capacity) {
2361                         max_load = avg_load;
2362                         busiest = group;
2363                         busiest_nr_running = sum_nr_running;
2364                         busiest_load_per_task = sum_weighted_load;
2365                 }
2366
2367 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2368                 /*
2369                  * Busy processors will not participate in power savings
2370                  * balance.
2371                  */
2372                 if (idle == CPU_NOT_IDLE ||
2373                                 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2374                         goto group_next;
2375
2376                 /*
2377                  * If the local group is idle or completely loaded
2378                  * no need to do power savings balance at this domain
2379                  */
2380                 if (local_group && (this_nr_running >= group_capacity ||
2381                                     !this_nr_running))
2382                         power_savings_balance = 0;
2383
2384                 /*
2385                  * If a group is already running at full capacity or idle,
2386                  * don't include that group in power savings calculations
2387                  */
2388                 if (!power_savings_balance || sum_nr_running >= group_capacity
2389                     || !sum_nr_running)
2390                         goto group_next;
2391
2392                 /*
2393                  * Calculate the group which has the least non-idle load.
2394                  * This is the group from where we need to pick up the load
2395                  * for saving power
2396                  */
2397                 if ((sum_nr_running < min_nr_running) ||
2398                     (sum_nr_running == min_nr_running &&
2399                      first_cpu(group->cpumask) <
2400                      first_cpu(group_min->cpumask))) {
2401                         group_min = group;
2402                         min_nr_running = sum_nr_running;
2403                         min_load_per_task = sum_weighted_load /
2404                                                 sum_nr_running;
2405                 }
2406
2407                 /*
2408                  * Calculate the group which is almost near its
2409                  * capacity but still has some space to pick up some load
2410                  * from other group and save more power
2411                  */
2412                 if (sum_nr_running <= group_capacity - 1) {
2413                         if (sum_nr_running > leader_nr_running ||
2414                             (sum_nr_running == leader_nr_running &&
2415                              first_cpu(group->cpumask) >
2416                               first_cpu(group_leader->cpumask))) {
2417                                 group_leader = group;
2418                                 leader_nr_running = sum_nr_running;
2419                         }
2420                 }
2421 group_next:
2422 #endif
2423                 group = group->next;
2424         } while (group != sd->groups);
2425
2426         if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2427                 goto out_balanced;
2428
2429         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2430
2431         if (this_load >= avg_load ||
2432                         100*max_load <= sd->imbalance_pct*this_load)
2433                 goto out_balanced;
2434
2435         busiest_load_per_task /= busiest_nr_running;
2436         /*
2437          * We're trying to get all the cpus to the average_load, so we don't
2438          * want to push ourselves above the average load, nor do we wish to
2439          * reduce the max loaded cpu below the average load, as either of these
2440          * actions would just result in more rebalancing later, and ping-pong
2441          * tasks around. Thus we look for the minimum possible imbalance.
2442          * Negative imbalances (*we* are more loaded than anyone else) will
2443          * be counted as no imbalance for these purposes -- we can't fix that
2444          * by pulling tasks to us.  Be careful of negative numbers as they'll
2445          * appear as very large values with unsigned longs.
2446          */
2447         if (max_load <= busiest_load_per_task)
2448                 goto out_balanced;
2449
2450         /*
2451          * In the presence of smp nice balancing, certain scenarios can have
2452          * max load less than avg load(as we skip the groups at or below
2453          * its cpu_power, while calculating max_load..)
2454          */
2455         if (max_load < avg_load) {
2456                 *imbalance = 0;
2457                 goto small_imbalance;
2458         }
2459
2460         /* Don't want to pull so many tasks that a group would go idle */
2461         max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2462
2463         /* How much load to actually move to equalise the imbalance */
2464         *imbalance = min(max_pull * busiest->__cpu_power,
2465                                 (avg_load - this_load) * this->__cpu_power)
2466                         / SCHED_LOAD_SCALE;
2467
2468         /*
2469          * if *imbalance is less than the average load per runnable task
2470          * there is no gaurantee that any tasks will be moved so we'll have
2471          * a think about bumping its value to force at least one task to be
2472          * moved
2473          */
2474         if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2475                 unsigned long tmp, pwr_now, pwr_move;
2476                 unsigned int imbn;
2477
2478 small_imbalance:
2479                 pwr_move = pwr_now = 0;
2480                 imbn = 2;
2481                 if (this_nr_running) {
2482                         this_load_per_task /= this_nr_running;
2483                         if (busiest_load_per_task > this_load_per_task)
2484                                 imbn = 1;
2485                 } else
2486                         this_load_per_task = SCHED_LOAD_SCALE;
2487
2488                 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2489                                         busiest_load_per_task * imbn) {
2490                         *imbalance = busiest_load_per_task;
2491                         return busiest;
2492                 }
2493
2494                 /*
2495                  * OK, we don't have enough imbalance to justify moving tasks,
2496                  * however we may be able to increase total CPU power used by
2497                  * moving them.
2498                  */
2499
2500                 pwr_now += busiest->__cpu_power *
2501                                 min(busiest_load_per_task, max_load);
2502                 pwr_now += this->__cpu_power *
2503                                 min(this_load_per_task, this_load);
2504                 pwr_now /= SCHED_LOAD_SCALE;
2505
2506                 /* Amount of load we'd subtract */
2507                 tmp = sg_div_cpu_power(busiest,
2508                                 busiest_load_per_task * SCHED_LOAD_SCALE);
2509                 if (max_load > tmp)
2510                         pwr_move += busiest->__cpu_power *
2511                                 min(busiest_load_per_task, max_load - tmp);
2512
2513                 /* Amount of load we'd add */
2514                 if (max_load * busiest->__cpu_power <
2515                                 busiest_load_per_task * SCHED_LOAD_SCALE)
2516                         tmp = sg_div_cpu_power(this,
2517                                         max_load * busiest->__cpu_power);
2518                 else
2519                         tmp = sg_div_cpu_power(this,
2520                                 busiest_load_per_task * SCHED_LOAD_SCALE);
2521                 pwr_move += this->__cpu_power *
2522                                 min(this_load_per_task, this_load + tmp);
2523                 pwr_move /= SCHED_LOAD_SCALE;
2524
2525                 /* Move if we gain throughput */
2526                 if (pwr_move <= pwr_now)
2527                         goto out_balanced;
2528
2529                 *imbalance = busiest_load_per_task;
2530         }
2531
2532         return busiest;
2533
2534 out_balanced:
2535 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2536         if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2537                 goto ret;
2538
2539         if (this == group_leader && group_leader != group_min) {
2540                 *imbalance = min_load_per_task;
2541                 return group_min;
2542         }
2543 #endif
2544 ret:
2545         *imbalance = 0;
2546         return NULL;
2547 }
2548
2549 /*
2550  * find_busiest_queue - find the busiest runqueue among the cpus in group.
2551  */
2552 static struct rq *
2553 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2554                    unsigned long imbalance, cpumask_t *cpus)
2555 {
2556         struct rq *busiest = NULL, *rq;
2557         unsigned long max_load = 0;
2558         int i;
2559
2560         for_each_cpu_mask(i, group->cpumask) {
2561                 unsigned long wl;
2562
2563                 if (!cpu_isset(i, *cpus))
2564                         continue;
2565
2566                 rq = cpu_rq(i);
2567                 wl = weighted_cpuload(i);
2568
2569                 if (rq->nr_running == 1 && wl > imbalance)
2570                         continue;
2571
2572                 if (wl > max_load) {
2573                         max_load = wl;
2574                         busiest = rq;
2575                 }
2576         }
2577
2578         return busiest;
2579 }
2580
2581 /*
2582  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2583  * so long as it is large enough.
2584  */
2585 #define MAX_PINNED_INTERVAL     512
2586
2587 static inline unsigned long minus_1_or_zero(unsigned long n)
2588 {
2589         return n > 0 ? n - 1 : 0;
2590 }
2591
2592 /*
2593  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2594  * tasks if there is an imbalance.
2595  */
2596 static int load_balance(int this_cpu, struct rq *this_rq,
2597                         struct sched_domain *sd, enum cpu_idle_type idle,
2598                         int *balance)
2599 {
2600         int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2601         struct sched_group *group;
2602         unsigned long imbalance;
2603         struct rq *busiest;
2604         cpumask_t cpus = CPU_MASK_ALL;
2605         unsigned long flags;
2606
2607         /*
2608          * When power savings policy is enabled for the parent domain, idle
2609          * sibling can pick up load irrespective of busy siblings. In this case,
2610          * let the state of idle sibling percolate up as CPU_IDLE, instead of
2611          * portraying it as CPU_NOT_IDLE.
2612          */
2613         if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2614             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2615                 sd_idle = 1;
2616
2617         schedstat_inc(sd, lb_cnt[idle]);
2618
2619 redo:
2620         group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2621                                    &cpus, balance);
2622
2623         if (*balance == 0)
2624                 goto out_balanced;
2625
2626         if (!group) {
2627                 schedstat_inc(sd, lb_nobusyg[idle]);
2628                 goto out_balanced;
2629         }
2630
2631         busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2632         if (!busiest) {
2633                 schedstat_inc(sd, lb_nobusyq[idle]);
2634                 goto out_balanced;
2635         }
2636
2637         BUG_ON(busiest == this_rq);
2638
2639         schedstat_add(sd, lb_imbalance[idle], imbalance);
2640
2641         nr_moved = 0;
2642         if (busiest->nr_running > 1) {
2643                 /*
2644                  * Attempt to move tasks. If find_busiest_group has found
2645                  * an imbalance but busiest->nr_running <= 1, the group is
2646                  * still unbalanced. nr_moved simply stays zero, so it is
2647                  * correctly treated as an imbalance.
2648                  */
2649                 local_irq_save(flags);
2650                 double_rq_lock(this_rq, busiest);
2651                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2652                                       minus_1_or_zero(busiest->nr_running),
2653                                       imbalance, sd, idle, &all_pinned);
2654                 double_rq_unlock(this_rq, busiest);
2655                 local_irq_restore(flags);
2656
2657                 /*
2658                  * some other cpu did the load balance for us.
2659                  */
2660                 if (nr_moved && this_cpu != smp_processor_id())
2661                         resched_cpu(this_cpu);
2662
2663                 /* All tasks on this runqueue were pinned by CPU affinity */
2664                 if (unlikely(all_pinned)) {
2665                         cpu_clear(cpu_of(busiest), cpus);
2666                         if (!cpus_empty(cpus))
2667                                 goto redo;
2668                         goto out_balanced;
2669                 }
2670         }
2671
2672         if (!nr_moved) {
2673                 schedstat_inc(sd, lb_failed[idle]);
2674                 sd->nr_balance_failed++;
2675
2676                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2677
2678                         spin_lock_irqsave(&busiest->lock, flags);
2679
2680                         /* don't kick the migration_thread, if the curr
2681                          * task on busiest cpu can't be moved to this_cpu
2682                          */
2683                         if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2684                                 spin_unlock_irqrestore(&busiest->lock, flags);
2685                                 all_pinned = 1;
2686                                 goto out_one_pinned;
2687                         }
2688
2689                         if (!busiest->active_balance) {
2690                                 busiest->active_balance = 1;
2691                                 busiest->push_cpu = this_cpu;
2692                                 active_balance = 1;
2693                         }
2694                         spin_unlock_irqrestore(&busiest->lock, flags);
2695                         if (active_balance)
2696                                 wake_up_process(busiest->migration_thread);
2697
2698                         /*
2699                          * We've kicked active balancing, reset the failure
2700                          * counter.
2701                          */
2702                         sd->nr_balance_failed = sd->cache_nice_tries+1;
2703                 }
2704         } else
2705                 sd->nr_balance_failed = 0;
2706
2707         if (likely(!active_balance)) {
2708                 /* We were unbalanced, so reset the balancing interval */
2709                 sd->balance_interval = sd->min_interval;
2710         } else {
2711                 /*
2712                  * If we've begun active balancing, start to back off. This
2713                  * case may not be covered by the all_pinned logic if there
2714                  * is only 1 task on the busy runqueue (because we don't call
2715                  * move_tasks).
2716                  */
2717                 if (sd->balance_interval < sd->max_interval)
2718                         sd->balance_interval *= 2;
2719         }
2720
2721         if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2722             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2723                 return -1;
2724         return nr_moved;
2725
2726 out_balanced:
2727         schedstat_inc(sd, lb_balanced[idle]);
2728
2729         sd->nr_balance_failed = 0;
2730
2731 out_one_pinned:
2732         /* tune up the balancing interval */
2733         if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2734                         (sd->balance_interval < sd->max_interval))
2735                 sd->balance_interval *= 2;
2736
2737         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2738             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2739                 return -1;
2740         return 0;
2741 }
2742
2743 /*
2744  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2745  * tasks if there is an imbalance.
2746  *
2747  * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2748  * this_rq is locked.
2749  */
2750 static int
2751 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2752 {
2753         struct sched_group *group;
2754         struct rq *busiest = NULL;
2755         unsigned long imbalance;
2756         int nr_moved = 0;
2757         int sd_idle = 0;
2758         cpumask_t cpus = CPU_MASK_ALL;
2759
2760         /*
2761          * When power savings policy is enabled for the parent domain, idle
2762          * sibling can pick up load irrespective of busy siblings. In this case,
2763          * let the state of idle sibling percolate up as IDLE, instead of
2764          * portraying it as CPU_NOT_IDLE.
2765          */
2766         if (sd->flags & SD_SHARE_CPUPOWER &&
2767             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2768                 sd_idle = 1;
2769
2770         schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2771 redo:
2772         group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2773                                    &sd_idle, &cpus, NULL);
2774         if (!group) {
2775                 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2776                 goto out_balanced;
2777         }
2778
2779         busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2780                                 &cpus);
2781         if (!busiest) {
2782                 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2783                 goto out_balanced;
2784         }
2785
2786         BUG_ON(busiest == this_rq);
2787
2788         schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2789
2790         nr_moved = 0;
2791         if (busiest->nr_running > 1) {
2792                 /* Attempt to move tasks */
2793                 double_lock_balance(this_rq, busiest);
2794                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2795                                         minus_1_or_zero(busiest->nr_running),
2796                                         imbalance, sd, CPU_NEWLY_IDLE, NULL);
2797                 spin_unlock(&busiest->lock);
2798
2799                 if (!nr_moved) {
2800                         cpu_clear(cpu_of(busiest), cpus);
2801                         if (!cpus_empty(cpus))
2802                                 goto redo;
2803                 }
2804         }
2805
2806         if (!nr_moved) {
2807                 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2808                 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2809                     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2810                         return -1;
2811         } else
2812                 sd->nr_balance_failed = 0;
2813
2814         return nr_moved;
2815
2816 out_balanced:
2817         schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2818         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2819             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2820                 return -1;
2821         sd->nr_balance_failed = 0;
2822
2823         return 0;
2824 }
2825
2826 /*
2827  * idle_balance is called by schedule() if this_cpu is about to become
2828  * idle. Attempts to pull tasks from other CPUs.
2829  */
2830 static void idle_balance(int this_cpu, struct rq *this_rq)
2831 {
2832         struct sched_domain *sd;
2833         int pulled_task = -1;
2834         unsigned long next_balance = jiffies + HZ;
2835
2836         for_each_domain(this_cpu, sd) {
2837                 unsigned long interval;
2838
2839                 if (!(sd->flags & SD_LOAD_BALANCE))
2840                         continue;
2841
2842                 if (sd->flags & SD_BALANCE_NEWIDLE)
2843                         /* If we've pulled tasks over stop searching: */
2844                         pulled_task = load_balance_newidle(this_cpu,
2845                                                                 this_rq, sd);
2846
2847                 interval = msecs_to_jiffies(sd->balance_interval);
2848                 if (time_after(next_balance, sd->last_balance + interval))
2849                         next_balance = sd->last_balance + interval;
2850                 if (pulled_task)
2851                         break;
2852         }
2853         if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2854                 /*
2855                  * We are going idle. next_balance may be set based on
2856                  * a busy processor. So reset next_balance.
2857                  */
2858                 this_rq->next_balance = next_balance;
2859         }
2860 }
2861
2862 /*
2863  * active_load_balance is run by migration threads. It pushes running tasks
2864  * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2865  * running on each physical CPU where possible, and avoids physical /
2866  * logical imbalances.
2867  *
2868  * Called with busiest_rq locked.
2869  */
2870 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2871 {
2872         int target_cpu = busiest_rq->push_cpu;
2873         struct sched_domain *sd;
2874         struct rq *target_rq;
2875
2876         /* Is there any task to move? */
2877         if (busiest_rq->nr_running <= 1)
2878                 return;
2879
2880         target_rq = cpu_rq(target_cpu);
2881
2882         /*
2883          * This condition is "impossible", if it occurs
2884          * we need to fix it.  Originally reported by
2885          * Bjorn Helgaas on a 128-cpu setup.
2886          */
2887         BUG_ON(busiest_rq == target_rq);
2888
2889         /* move a task from busiest_rq to target_rq */
2890         double_lock_balance(busiest_rq, target_rq);
2891
2892         /* Search for an sd spanning us and the target CPU. */
2893         for_each_domain(target_cpu, sd) {
2894                 if ((sd->flags & SD_LOAD_BALANCE) &&
2895                     cpu_isset(busiest_cpu, sd->span))
2896                                 break;
2897         }
2898
2899         if (likely(sd)) {
2900                 schedstat_inc(sd, alb_cnt);
2901
2902                 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2903                                RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2904                                NULL))
2905                         schedstat_inc(sd, alb_pushed);
2906                 else
2907                         schedstat_inc(sd, alb_failed);
2908         }
2909         spin_unlock(&target_rq->lock);
2910 }
2911
2912 #ifdef CONFIG_NO_HZ
2913 static struct {
2914         atomic_t load_balancer;
2915         cpumask_t  cpu_mask;
2916 } nohz ____cacheline_aligned = {
2917         .load_balancer = ATOMIC_INIT(-1),
2918         .cpu_mask = CPU_MASK_NONE,
2919 };
2920
2921 /*
2922  * This routine will try to nominate the ilb (idle load balancing)
2923  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2924  * load balancing on behalf of all those cpus. If all the cpus in the system
2925  * go into this tickless mode, then there will be no ilb owner (as there is
2926  * no need for one) and all the cpus will sleep till the next wakeup event
2927  * arrives...
2928  *
2929  * For the ilb owner, tick is not stopped. And this tick will be used
2930  * for idle load balancing. ilb owner will still be part of
2931  * nohz.cpu_mask..
2932  *
2933  * While stopping the tick, this cpu will become the ilb owner if there
2934  * is no other owner. And will be the owner till that cpu becomes busy
2935  * or if all cpus in the system stop their ticks at which point
2936  * there is no need for ilb owner.
2937  *
2938  * When the ilb owner becomes busy, it nominates another owner, during the
2939  * next busy scheduler_tick()
2940  */
2941 int select_nohz_load_balancer(int stop_tick)
2942 {
2943         int cpu = smp_processor_id();
2944
2945         if (stop_tick) {
2946                 cpu_set(cpu, nohz.cpu_mask);
2947                 cpu_rq(cpu)->in_nohz_recently = 1;
2948
2949                 /*
2950                  * If we are going offline and still the leader, give up!
2951                  */
2952                 if (cpu_is_offline(cpu) &&
2953                     atomic_read(&nohz.load_balancer) == cpu) {
2954                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2955                                 BUG();
2956                         return 0;
2957                 }
2958
2959                 /* time for ilb owner also to sleep */
2960                 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2961                         if (atomic_read(&nohz.load_balancer) == cpu)
2962                                 atomic_set(&nohz.load_balancer, -1);
2963                         return 0;
2964                 }
2965
2966                 if (atomic_read(&nohz.load_balancer) == -1) {
2967                         /* make me the ilb owner */
2968                         if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2969                                 return 1;
2970                 } else if (atomic_read(&nohz.load_balancer) == cpu)
2971                         return 1;
2972         } else {
2973                 if (!cpu_isset(cpu, nohz.cpu_mask))
2974                         return 0;
2975
2976                 cpu_clear(cpu, nohz.cpu_mask);
2977
2978                 if (atomic_read(&nohz.load_balancer) == cpu)
2979                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2980                                 BUG();
2981         }
2982         return 0;
2983 }
2984 #endif
2985
2986 static DEFINE_SPINLOCK(balancing);
2987
2988 /*
2989  * It checks each scheduling domain to see if it is due to be balanced,
2990  * and initiates a balancing operation if so.
2991  *
2992  * Balancing parameters are set up in arch_init_sched_domains.
2993  */
2994 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2995 {
2996         int balance = 1;
2997         struct rq *rq = cpu_rq(cpu);
2998         unsigned long interval;
2999         struct sched_domain *sd;
3000         /* Earliest time when we have to do rebalance again */
3001         unsigned long next_balance = jiffies + 60*HZ;
3002
3003         for_each_domain(cpu, sd) {
3004                 if (!(sd->flags & SD_LOAD_BALANCE))
3005                         continue;
3006
3007                 interval = sd->balance_interval;
3008                 if (idle != CPU_IDLE)
3009                         interval *= sd->busy_factor;
3010
3011                 /* scale ms to jiffies */
3012                 interval = msecs_to_jiffies(interval);
3013                 if (unlikely(!interval))
3014                         interval = 1;
3015                 if (interval > HZ*NR_CPUS/10)
3016                         interval = HZ*NR_CPUS/10;
3017
3018
3019                 if (sd->flags & SD_SERIALIZE) {
3020                         if (!spin_trylock(&balancing))
3021                                 goto out;
3022                 }
3023
3024                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3025                         if (load_balance(cpu, rq, sd, idle, &balance)) {
3026                                 /*
3027                                  * We've pulled tasks over so either we're no
3028                                  * longer idle, or one of our SMT siblings is
3029                                  * not idle.
3030                                  */
3031                                 idle = CPU_NOT_IDLE;
3032                         }
3033                         sd->last_balance = jiffies;
3034                 }
3035                 if (sd->flags & SD_SERIALIZE)
3036                         spin_unlock(&balancing);
3037 out:
3038                 if (time_after(next_balance, sd->last_balance + interval))
3039                         next_balance = sd->last_balance + interval;
3040
3041                 /*
3042                  * Stop the load balance at this level. There is another
3043                  * CPU in our sched group which is doing load balancing more
3044                  * actively.
3045                  */
3046                 if (!balance)
3047                         break;
3048         }
3049         rq->next_balance = next_balance;
3050 }
3051
3052 /*
3053  * run_rebalance_domains is triggered when needed from the scheduler tick.
3054  * In CONFIG_NO_HZ case, the idle load balance owner will do the
3055  * rebalancing for all the cpus for whom scheduler ticks are stopped.
3056  */
3057 static void run_rebalance_domains(struct softirq_action *h)
3058 {
3059         int this_cpu = smp_processor_id();
3060         struct rq *this_rq = cpu_rq(this_cpu);
3061         enum cpu_idle_type idle = this_rq->idle_at_tick ?
3062                                                 CPU_IDLE : CPU_NOT_IDLE;
3063
3064         rebalance_domains(this_cpu, idle);
3065
3066 #ifdef CONFIG_NO_HZ
3067         /*
3068          * If this cpu is the owner for idle load balancing, then do the
3069          * balancing on behalf of the other idle cpus whose ticks are
3070          * stopped.
3071          */
3072         if (this_rq->idle_at_tick &&
3073             atomic_read(&nohz.load_balancer) == this_cpu) {
3074                 cpumask_t cpus = nohz.cpu_mask;
3075                 struct rq *rq;
3076                 int balance_cpu;
3077
3078                 cpu_clear(this_cpu, cpus);
3079                 for_each_cpu_mask(balance_cpu, cpus) {
3080                         /*
3081                          * If this cpu gets work to do, stop the load balancing
3082                          * work being done for other cpus. Next load
3083                          * balancing owner will pick it up.
3084                          */
3085                         if (need_resched())
3086                                 break;
3087
3088                         rebalance_domains(balance_cpu, SCHED_IDLE);
3089
3090                         rq = cpu_rq(balance_cpu);
3091                         if (time_after(this_rq->next_balance, rq->next_balance))
3092                                 this_rq->next_balance = rq->next_balance;
3093                 }
3094         }
3095 #endif
3096 }
3097
3098 /*
3099  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3100  *
3101  * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3102  * idle load balancing owner or decide to stop the periodic load balancing,
3103  * if the whole system is idle.
3104  */
3105 static inline void trigger_load_balance(struct rq *rq, int cpu)
3106 {
3107 #ifdef CONFIG_NO_HZ
3108         /*
3109          * If we were in the nohz mode recently and busy at the current
3110          * scheduler tick, then check if we need to nominate new idle
3111          * load balancer.
3112          */
3113         if (rq->in_nohz_recently && !rq->idle_at_tick) {
3114                 rq->in_nohz_recently = 0;
3115
3116                 if (atomic_read(&nohz.load_balancer) == cpu) {
3117                         cpu_clear(cpu, nohz.cpu_mask);
3118                         atomic_set(&nohz.load_balancer, -1);
3119                 }
3120
3121                 if (atomic_read(&nohz.load_balancer) == -1) {
3122                         /*
3123                          * simple selection for now: Nominate the
3124                          * first cpu in the nohz list to be the next
3125                          * ilb owner.
3126                          *
3127                          * TBD: Traverse the sched domains and nominate
3128                          * the nearest cpu in the nohz.cpu_mask.
3129                          */
3130                         int ilb = first_cpu(nohz.cpu_mask);
3131
3132                         if (ilb != NR_CPUS)
3133                                 resched_cpu(ilb);
3134                 }
3135         }
3136
3137         /*
3138          * If this cpu is idle and doing idle load balancing for all the
3139          * cpus with ticks stopped, is it time for that to stop?
3140          */
3141         if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3142             cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3143                 resched_cpu(cpu);
3144                 return;
3145         }
3146
3147         /*
3148          * If this cpu is idle and the idle load balancing is done by
3149          * someone else, then no need raise the SCHED_SOFTIRQ
3150          */
3151         if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3152             cpu_isset(cpu, nohz.cpu_mask))
3153                 return;
3154 #endif
3155         if (time_after_eq(jiffies, rq->next_balance))
3156                 raise_softirq(SCHED_SOFTIRQ);
3157 }
3158
3159 #else   /* CONFIG_SMP */
3160
3161 /*
3162  * on UP we do not need to balance between CPUs:
3163  */
3164 static inline void idle_balance(int cpu, struct rq *rq)
3165 {
3166 }
3167
3168 /* Avoid "used but not defined" warning on UP */
3169 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3170                       unsigned long max_nr_move, unsigned long max_load_move,
3171                       struct sched_domain *sd, enum cpu_idle_type idle,
3172                       int *all_pinned, unsigned long *load_moved,
3173                       int this_best_prio, int best_prio, int best_prio_seen,
3174                       struct rq_iterator *iterator)
3175 {
3176         *load_moved = 0;
3177
3178         return 0;
3179 }
3180
3181 #endif
3182
3183 DEFINE_PER_CPU(struct kernel_stat, kstat);
3184
3185 EXPORT_PER_CPU_SYMBOL(kstat);
3186
3187 /*
3188  * Return p->sum_exec_runtime plus any more ns on the sched_clock
3189  * that have not yet been banked in case the task is currently running.
3190  */
3191 unsigned long long task_sched_runtime(struct task_struct *p)
3192 {
3193         unsigned long flags;
3194         u64 ns, delta_exec;
3195         struct rq *rq;
3196
3197         rq = task_rq_lock(p, &flags);
3198         ns = p->se.sum_exec_runtime;
3199         if (rq->curr == p) {
3200                 delta_exec = rq_clock(rq) - p->se.exec_start;
3201                 if ((s64)delta_exec > 0)
3202                         ns += delta_exec;
3203         }
3204         task_rq_unlock(rq, &flags);
3205
3206         return ns;
3207 }
3208
3209 /*
3210  * Account user cpu time to a process.
3211  * @p: the process that the cpu time gets accounted to
3212  * @hardirq_offset: the offset to subtract from hardirq_count()
3213  * @cputime: the cpu time spent in user space since the last update
3214  */
3215 void account_user_time(struct task_struct *p, cputime_t cputime)
3216 {
3217         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3218         cputime64_t tmp;
3219
3220         p->utime = cputime_add(p->utime, cputime);
3221
3222         /* Add user time to cpustat. */
3223         tmp = cputime_to_cputime64(cputime);
3224         if (TASK_NICE(p) > 0)
3225                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3226         else
3227                 cpustat->user = cputime64_add(cpustat->user, tmp);
3228 }
3229
3230 /*
3231  * Account system cpu time to a process.
3232  * @p: the process that the cpu time gets accounted to
3233  * @hardirq_offset: the offset to subtract from hardirq_count()
3234  * @cputime: the cpu time spent in kernel space since the last update
3235  */
3236 void account_system_time(struct task_struct *p, int hardirq_offset,
3237                          cputime_t cputime)
3238 {
3239         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3240         struct rq *rq = this_rq();
3241         cputime64_t tmp;
3242
3243         p->stime = cputime_add(p->stime, cputime);
3244
3245         /* Add system time to cpustat. */
3246         tmp = cputime_to_cputime64(cputime);
3247         if (hardirq_count() - hardirq_offset)
3248                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3249         else if (softirq_count())
3250                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3251         else if (p != rq->idle)
3252                 cpustat->system = cputime64_add(cpustat->system, tmp);
3253         else if (atomic_read(&rq->nr_iowait) > 0)
3254                 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3255         else
3256                 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3257         /* Account for system time used */
3258         acct_update_integrals(p);
3259 }
3260
3261 /*
3262  * Account for involuntary wait time.
3263  * @p: the process from which the cpu time has been stolen
3264  * @steal: the cpu time spent in involuntary wait
3265  */
3266 void account_steal_time(struct task_struct *p, cputime_t steal)
3267 {
3268         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3269         cputime64_t tmp = cputime_to_cputime64(steal);
3270         struct rq *rq = this_rq();
3271
3272         if (p == rq->idle) {
3273                 p->stime = cputime_add(p->stime, steal);
3274                 if (atomic_read(&rq->nr_iowait) > 0)
3275                         cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3276                 else
3277                         cpustat->idle = cputime64_add(cpustat->idle, tmp);
3278         } else
3279                 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3280 }
3281
3282 /*
3283  * This function gets called by the timer code, with HZ frequency.
3284  * We call it with interrupts disabled.
3285  *
3286  * It also gets called by the fork code, when changing the parent's
3287  * timeslices.
3288  */
3289 void scheduler_tick(void)
3290 {
3291         int cpu = smp_processor_id();
3292         struct rq *rq = cpu_rq(cpu);
3293         struct task_struct *curr = rq->curr;
3294
3295         spin_lock(&rq->lock);
3296         if (curr != rq->idle) /* FIXME: needed? */
3297                 curr->sched_class->task_tick(rq, curr);
3298         update_cpu_load(rq);
3299         spin_unlock(&rq->lock);
3300
3301 #ifdef CONFIG_SMP
3302         rq->idle_at_tick = idle_cpu(cpu);
3303         trigger_load_balance(rq, cpu);
3304 #endif
3305 }
3306
3307 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3308
3309 void fastcall add_preempt_count(int val)
3310 {
3311         /*
3312          * Underflow?
3313          */
3314         if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3315                 return;
3316         preempt_count() += val;
3317         /*
3318          * Spinlock count overflowing soon?
3319          */
3320         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3321                                 PREEMPT_MASK - 10);
3322 }
3323 EXPORT_SYMBOL(add_preempt_count);
3324
3325 void fastcall sub_preempt_count(int val)
3326 {
3327         /*
3328          * Underflow?
3329          */
3330         if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3331                 return;
3332         /*
3333          * Is the spinlock portion underflowing?
3334          */
3335         if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3336                         !(preempt_count() & PREEMPT_MASK)))
3337                 return;
3338
3339         preempt_count() -= val;
3340 }
3341 EXPORT_SYMBOL(sub_preempt_count);
3342
3343 #endif
3344
3345 /*
3346  * Print scheduling while atomic bug:
3347  */
3348 static noinline void __schedule_bug(struct task_struct *prev)
3349 {
3350         printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3351                 prev->comm, preempt_count(), prev->pid);
3352         debug_show_held_locks(prev);
3353         if (irqs_disabled())
3354                 print_irqtrace_events(prev);
3355         dump_stack();
3356 }
3357
3358 /*
3359  * Various schedule()-time debugging checks and statistics:
3360  */
3361 static inline void schedule_debug(struct task_struct *prev)
3362 {
3363         /*
3364          * Test if we are atomic.  Since do_exit() needs to call into
3365          * schedule() atomically, we ignore that path for now.
3366          * Otherwise, whine if we are scheduling when we should not be.
3367          */
3368         if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3369                 __schedule_bug(prev);
3370
3371         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3372
3373         schedstat_inc(this_rq(), sched_cnt);
3374 }
3375
3376 /*
3377  * Pick up the highest-prio task:
3378  */
3379 static inline struct task_struct *
3380 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3381 {
3382         struct sched_class *class;
3383         struct task_struct *p;
3384
3385         /*
3386          * Optimization: we know that if all tasks are in
3387          * the fair class we can call that function directly:
3388          */
3389         if (likely(rq->nr_running == rq->cfs.nr_running)) {
3390                 p = fair_sched_class.pick_next_task(rq, now);
3391                 if (likely(p))
3392                         return p;
3393         }
3394
3395         class = sched_class_highest;
3396         for ( ; ; ) {
3397                 p = class->pick_next_task(rq, now);
3398                 if (p)
3399                         return p;
3400                 /*
3401                  * Will never be NULL as the idle class always
3402                  * returns a non-NULL p:
3403                  */
3404                 class = class->next;
3405         }
3406 }
3407
3408 /*
3409  * schedule() is the main scheduler function.
3410  */
3411 asmlinkage void __sched schedule(void)
3412 {
3413         struct task_struct *prev, *next;
3414         long *switch_count;
3415         struct rq *rq;
3416         u64 now;
3417         int cpu;
3418
3419 need_resched:
3420         preempt_disable();
3421         cpu = smp_processor_id();
3422         rq = cpu_rq(cpu);
3423         rcu_qsctr_inc(cpu);
3424         prev = rq->curr;
3425         switch_count = &prev->nivcsw;
3426
3427         release_kernel_lock(prev);
3428 need_resched_nonpreemptible:
3429
3430         schedule_debug(prev);
3431
3432         spin_lock_irq(&rq->lock);
3433         clear_tsk_need_resched(prev);
3434
3435         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3436                 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3437                                 unlikely(signal_pending(prev)))) {
3438                         prev->state = TASK_RUNNING;
3439                 } else {
3440                         deactivate_task(rq, prev, 1);
3441                 }
3442                 switch_count = &prev->nvcsw;
3443         }
3444
3445         if (unlikely(!rq->nr_running))
3446                 idle_balance(cpu, rq);
3447
3448         now = __rq_clock(rq);
3449         prev->sched_class->put_prev_task(rq, prev, now);
3450         next = pick_next_task(rq, prev, now);
3451
3452         sched_info_switch(prev, next);
3453
3454         if (likely(prev != next)) {
3455                 rq->nr_switches++;
3456                 rq->curr = next;
3457                 ++*switch_count;
3458
3459                 context_switch(rq, prev, next); /* unlocks the rq */
3460         } else
3461                 spin_unlock_irq(&rq->lock);
3462
3463         if (unlikely(reacquire_kernel_lock(current) < 0)) {
3464                 cpu = smp_processor_id();
3465                 rq = cpu_rq(cpu);
3466                 goto need_resched_nonpreemptible;
3467         }
3468         preempt_enable_no_resched();
3469         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3470                 goto need_resched;
3471 }
3472 EXPORT_SYMBOL(schedule);
3473
3474 #ifdef CONFIG_PREEMPT
3475 /*
3476  * this is the entry point to schedule() from in-kernel preemption
3477  * off of preempt_enable.  Kernel preemptions off return from interrupt
3478  * occur there and call schedule directly.
3479  */
3480 asmlinkage void __sched preempt_schedule(void)
3481 {
3482         struct thread_info *ti = current_thread_info();
3483 #ifdef CONFIG_PREEMPT_BKL
3484         struct task_struct *task = current;
3485         int saved_lock_depth;
3486 #endif
3487         /*
3488          * If there is a non-zero preempt_count or interrupts are disabled,
3489          * we do not want to preempt the current task.  Just return..
3490          */
3491         if (likely(ti->preempt_count || irqs_disabled()))
3492                 return;
3493
3494 need_resched:
3495         add_preempt_count(PREEMPT_ACTIVE);
3496         /*
3497          * We keep the big kernel semaphore locked, but we
3498          * clear ->lock_depth so that schedule() doesnt
3499          * auto-release the semaphore:
3500          */
3501 #ifdef CONFIG_PREEMPT_BKL
3502         saved_lock_depth = task->lock_depth;
3503         task->lock_depth = -1;
3504 #endif
3505         schedule();
3506 #ifdef CONFIG_PREEMPT_BKL
3507         task->lock_depth = saved_lock_depth;
3508 #endif
3509         sub_preempt_count(PREEMPT_ACTIVE);
3510
3511         /* we could miss a preemption opportunity between schedule and now */
3512         barrier();
3513         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3514                 goto need_resched;
3515 }
3516 EXPORT_SYMBOL(preempt_schedule);
3517
3518 /*
3519  * this is the entry point to schedule() from kernel preemption
3520  * off of irq context.
3521  * Note, that this is called and return with irqs disabled. This will
3522  * protect us against recursive calling from irq.
3523  */
3524 asmlinkage void __sched preempt_schedule_irq(void)
3525 {
3526         struct thread_info *ti = current_thread_info();
3527 #ifdef CONFIG_PREEMPT_BKL
3528         struct task_struct *task = current;
3529         int saved_lock_depth;
3530 #endif
3531         /* Catch callers which need to be fixed */
3532         BUG_ON(ti->preempt_count || !irqs_disabled());
3533
3534 need_resched:
3535         add_preempt_count(PREEMPT_ACTIVE);
3536         /*
3537          * We keep the big kernel semaphore locked, but we
3538          * clear ->lock_depth so that schedule() doesnt
3539          * auto-release the semaphore:
3540          */
3541 #ifdef CONFIG_PREEMPT_BKL
3542         saved_lock_depth = task->lock_depth;
3543         task->lock_depth = -1;
3544 #endif
3545         local_irq_enable();
3546         schedule();
3547         local_irq_disable();
3548 #ifdef CONFIG_PREEMPT_BKL
3549         task->lock_depth = saved_lock_depth;
3550 #endif
3551         sub_preempt_count(PREEMPT_ACTIVE);
3552
3553         /* we could miss a preemption opportunity between schedule and now */
3554         barrier();
3555         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3556                 goto need_resched;
3557 }
3558
3559 #endif /* CONFIG_PREEMPT */
3560
3561 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3562                           void *key)
3563 {
3564         return try_to_wake_up(curr->private, mode, sync);
3565 }
3566 EXPORT_SYMBOL(default_wake_function);
3567
3568 /*
3569  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
3570  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
3571  * number) then we wake all the non-exclusive tasks and one exclusive task.
3572  *
3573  * There are circumstances in which we can try to wake a task which has already
3574  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
3575  * zero in this (rare) case, and we handle it by continuing to scan the queue.
3576  */
3577 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3578                              int nr_exclusive, int sync, void *key)
3579 {
3580         struct list_head *tmp, *next;
3581
3582         list_for_each_safe(tmp, next, &q->task_list) {
3583                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3584                 unsigned flags = curr->flags;
3585
3586                 if (curr->func(curr, mode, sync, key) &&
3587                                 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3588                         break;
3589         }
3590 }
3591
3592 /**
3593  * __wake_up - wake up threads blocked on a waitqueue.
3594  * @q: the waitqueue
3595  * @mode: which threads
3596  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3597  * @key: is directly passed to the wakeup function
3598  */
3599 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3600                         int nr_exclusive, void *key)
3601 {
3602         unsigned long flags;
3603
3604         spin_lock_irqsave(&q->lock, flags);
3605         __wake_up_common(q, mode, nr_exclusive, 0, key);
3606         spin_unlock_irqrestore(&q->lock, flags);
3607 }
3608 EXPORT_SYMBOL(__wake_up);
3609
3610 /*
3611  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3612  */
3613 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3614 {
3615         __wake_up_common(q, mode, 1, 0, NULL);
3616 }
3617
3618 /**
3619  * __wake_up_sync - wake up threads blocked on a waitqueue.
3620  * @q: the waitqueue
3621  * @mode: which threads
3622  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3623  *
3624  * The sync wakeup differs that the waker knows that it will schedule
3625  * away soon, so while the target thread will be woken up, it will not
3626  * be migrated to another CPU - ie. the two threads are 'synchronized'
3627  * with each other. This can prevent needless bouncing between CPUs.
3628  *
3629  * On UP it can prevent extra preemption.
3630  */
3631 void fastcall
3632 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3633 {
3634         unsigned long flags;
3635         int sync = 1;
3636
3637         if (unlikely(!q))
3638                 return;
3639
3640         if (unlikely(!nr_exclusive))
3641                 sync = 0;
3642
3643         spin_lock_irqsave(&q->lock, flags);
3644         __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3645         spin_unlock_irqrestore(&q->lock, flags);
3646 }
3647 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
3648
3649 void fastcall complete(struct completion *x)
3650 {
3651         unsigned long flags;
3652
3653         spin_lock_irqsave(&x->wait.lock, flags);
3654         x->done++;
3655         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3656                          1, 0, NULL);
3657         spin_unlock_irqrestore(&x->wait.lock, flags);
3658 }
3659 EXPORT_SYMBOL(complete);
3660
3661 void fastcall complete_all(struct completion *x)
3662 {
3663         unsigned long flags;
3664
3665         spin_lock_irqsave(&x->wait.lock, flags);
3666         x->done += UINT_MAX/2;
3667         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3668                          0, 0, NULL);
3669         spin_unlock_irqrestore(&x->wait.lock, flags);
3670 }
3671 EXPORT_SYMBOL(complete_all);
3672
3673 void fastcall __sched wait_for_completion(struct completion *x)
3674 {
3675         might_sleep();
3676
3677         spin_lock_irq(&x->wait.lock);
3678         if (!x->done) {
3679                 DECLARE_WAITQUEUE(wait, current);
3680
3681                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3682                 __add_wait_queue_tail(&x->wait, &wait);
3683                 do {
3684                         __set_current_state(TASK_UNINTERRUPTIBLE);
3685                         spin_unlock_irq(&x->wait.lock);
3686                         schedule();
3687                         spin_lock_irq(&x->wait.lock);
3688                 } while (!x->done);
3689                 __remove_wait_queue(&x->wait, &wait);
3690         }
3691         x->done--;
3692         spin_unlock_irq(&x->wait.lock);
3693 }
3694 EXPORT_SYMBOL(wait_for_completion);
3695
3696 unsigned long fastcall __sched
3697 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3698 {
3699         might_sleep();
3700
3701         spin_lock_irq(&x->wait.lock);
3702         if (!x->done) {
3703                 DECLARE_WAITQUEUE(wait, current);
3704
3705                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3706                 __add_wait_queue_tail(&x->wait, &wait);
3707                 do {
3708                         __set_current_state(TASK_UNINTERRUPTIBLE);
3709                         spin_unlock_irq(&x->wait.lock);
3710                         timeout = schedule_timeout(timeout);
3711                         spin_lock_irq(&x->wait.lock);
3712                         if (!timeout) {
3713                                 __remove_wait_queue(&x->wait, &wait);
3714                                 goto out;
3715                         }
3716                 } while (!x->done);
3717                 __remove_wait_queue(&x->wait, &wait);
3718         }
3719         x->done--;
3720 out:
3721         spin_unlock_irq(&x->wait.lock);
3722         return timeout;
3723 }
3724 EXPORT_SYMBOL(wait_for_completion_timeout);
3725
3726 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3727 {
3728         int ret = 0;
3729
3730         might_sleep();
3731
3732         spin_lock_irq(&x->wait.lock);
3733         if (!x->done) {
3734                 DECLARE_WAITQUEUE(wait, current);
3735
3736                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3737                 __add_wait_queue_tail(&x->wait, &wait);
3738                 do {
3739                         if (signal_pending(current)) {
3740                                 ret = -ERESTARTSYS;
3741                                 __remove_wait_queue(&x->wait, &wait);
3742                                 goto out;
3743                         }
3744                         __set_current_state(TASK_INTERRUPTIBLE);
3745                         spin_unlock_irq(&x->wait.lock);
3746                         schedule();
3747                         spin_lock_irq(&x->wait.lock);
3748                 } while (!x->done);
3749                 __remove_wait_queue(&x->wait, &wait);
3750         }
3751         x->done--;
3752 out:
3753         spin_unlock_irq(&x->wait.lock);
3754
3755         return ret;
3756 }
3757 EXPORT_SYMBOL(wait_for_completion_interruptible);
3758
3759 unsigned long fastcall __sched
3760 wait_for_completion_interruptible_timeout(struct completion *x,
3761                                           unsigned long timeout)
3762 {
3763         might_sleep();
3764
3765         spin_lock_irq(&x->wait.lock);
3766         if (!x->done) {
3767                 DECLARE_WAITQUEUE(wait, current);
3768
3769                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3770                 __add_wait_queue_tail(&x->wait, &wait);
3771                 do {
3772                         if (signal_pending(current)) {
3773                                 timeout = -ERESTARTSYS;
3774                                 __remove_wait_queue(&x->wait, &wait);
3775                                 goto out;
3776                         }
3777                         __set_current_state(TASK_INTERRUPTIBLE);
3778                         spin_unlock_irq(&x->wait.lock);
3779                         timeout = schedule_timeout(timeout);
3780                         spin_lock_irq(&x->wait.lock);
3781                         if (!timeout) {
3782                                 __remove_wait_queue(&x->wait, &wait);
3783                                 goto out;
3784                         }
3785                 } while (!x->done);
3786                 __remove_wait_queue(&x->wait, &wait);
3787         }
3788         x->done--;
3789 out:
3790         spin_unlock_irq(&x->wait.lock);
3791         return timeout;
3792 }
3793 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3794
3795
3796 #define SLEEP_ON_VAR                                    \
3797         unsigned long flags;                            \
3798         wait_queue_t wait;                              \
3799         init_waitqueue_entry(&wait, current);
3800
3801 #define SLEEP_ON_HEAD                                   \
3802         spin_lock_irqsave(&q->lock,flags);              \
3803         __add_wait_queue(q, &wait);                     \
3804         spin_unlock(&q->lock);
3805
3806 #define SLEEP_ON_TAIL                                   \
3807         spin_lock_irq(&q->lock);                        \
3808         __remove_wait_queue(q, &wait);                  \
3809         spin_unlock_irqrestore(&q->lock, flags);
3810
3811 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3812 {
3813         SLEEP_ON_VAR
3814
3815         current->state = TASK_INTERRUPTIBLE;
3816
3817         SLEEP_ON_HEAD
3818         schedule();
3819         SLEEP_ON_TAIL
3820 }
3821 EXPORT_SYMBOL(interruptible_sleep_on);
3822
3823 long fastcall __sched
3824 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3825 {
3826         SLEEP_ON_VAR
3827
3828         current->state = TASK_INTERRUPTIBLE;
3829
3830         SLEEP_ON_HEAD
3831         timeout = schedule_timeout(timeout);
3832         SLEEP_ON_TAIL
3833
3834         return timeout;
3835 }
3836 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3837
3838 void fastcall __sched sleep_on(wait_queue_head_t *q)
3839 {
3840         SLEEP_ON_VAR
3841
3842         current->state = TASK_UNINTERRUPTIBLE;
3843
3844         SLEEP_ON_HEAD
3845         schedule();
3846         SLEEP_ON_TAIL
3847 }
3848 EXPORT_SYMBOL(sleep_on);
3849
3850 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3851 {
3852         SLEEP_ON_VAR
3853
3854         current->state = TASK_UNINTERRUPTIBLE;
3855
3856         SLEEP_ON_HEAD
3857         timeout = schedule_timeout(timeout);
3858         SLEEP_ON_TAIL
3859
3860         return timeout;
3861 }
3862
3863 EXPORT_SYMBOL(sleep_on_timeout);
3864
3865 #ifdef CONFIG_RT_MUTEXES
3866
3867 /*
3868  * rt_mutex_setprio - set the current priority of a task
3869  * @p: task
3870  * @prio: prio value (kernel-internal form)
3871  *
3872  * This function changes the 'effective' priority of a task. It does
3873  * not touch ->normal_prio like __setscheduler().
3874  *
3875  * Used by the rt_mutex code to implement priority inheritance logic.
3876  */
3877 void rt_mutex_setprio(struct task_struct *p, int prio)
3878 {
3879         unsigned long flags;
3880         int oldprio, on_rq;
3881         struct rq *rq;
3882         u64 now;
3883
3884         BUG_ON(prio < 0 || prio > MAX_PRIO);
3885
3886         rq = task_rq_lock(p, &flags);
3887         now = rq_clock(rq);
3888
3889         oldprio = p->prio;
3890         on_rq = p->se.on_rq;
3891         if (on_rq)
3892                 dequeue_task(rq, p, 0, now);
3893
3894         if (rt_prio(prio))
3895                 p->sched_class = &rt_sched_class;
3896         else
3897                 p->sched_class = &fair_sched_class;
3898
3899         p->prio = prio;
3900
3901         if (on_rq) {
3902                 enqueue_task(rq, p, 0, now);
3903                 /*
3904                  * Reschedule if we are currently running on this runqueue and
3905                  * our priority decreased, or if we are not currently running on
3906                  * this runqueue and our priority is higher than the current's
3907                  */
3908                 if (task_running(rq, p)) {
3909                         if (p->prio > oldprio)
3910                                 resched_task(rq->curr);
3911                 } else {
3912                         check_preempt_curr(rq, p);
3913                 }
3914         }
3915         task_rq_unlock(rq, &flags);
3916 }
3917
3918 #endif
3919
3920 void set_user_nice(struct task_struct *p, long nice)
3921 {
3922         int old_prio, delta, on_rq;
3923         unsigned long flags;
3924         struct rq *rq;
3925         u64 now;
3926
3927         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3928                 return;
3929         /*
3930          * We have to be careful, if called from sys_setpriority(),
3931          * the task might be in the middle of scheduling on another CPU.
3932          */
3933         rq = task_rq_lock(p, &flags);
3934         now = rq_clock(rq);
3935         /*
3936          * The RT priorities are set via sched_setscheduler(), but we still
3937          * allow the 'normal' nice value to be set - but as expected
3938          * it wont have any effect on scheduling until the task is
3939          * SCHED_FIFO/SCHED_RR:
3940          */
3941         if (task_has_rt_policy(p)) {
3942                 p->static_prio = NICE_TO_PRIO(nice);
3943                 goto out_unlock;
3944         }
3945         on_rq = p->se.on_rq;
3946         if (on_rq) {
3947                 dequeue_task(rq, p, 0, now);
3948                 dec_load(rq, p, now);
3949         }
3950
3951         p->static_prio = NICE_TO_PRIO(nice);
3952         set_load_weight(p);
3953         old_prio = p->prio;
3954         p->prio = effective_prio(p);
3955         delta = p->prio - old_prio;
3956
3957         if (on_rq) {
3958                 enqueue_task(rq, p, 0, now);
3959                 inc_load(rq, p, now);
3960                 /*
3961                  * If the task increased its priority or is running and
3962                  * lowered its priority, then reschedule its CPU:
3963                  */
3964                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3965                         resched_task(rq->curr);
3966         }
3967 out_unlock:
3968         task_rq_unlock(rq, &flags);
3969 }
3970 EXPORT_SYMBOL(set_user_nice);
3971
3972 /*
3973  * can_nice - check if a task can reduce its nice value
3974  * @p: task
3975  * @nice: nice value
3976  */
3977 int can_nice(const struct task_struct *p, const int nice)
3978 {
3979         /* convert nice value [19,-20] to rlimit style value [1,40] */
3980         int nice_rlim = 20 - nice;
3981
3982         return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3983                 capable(CAP_SYS_NICE));
3984 }
3985
3986 #ifdef __ARCH_WANT_SYS_NICE
3987
3988 /*
3989  * sys_nice - change the priority of the current process.
3990  * @increment: priority increment
3991  *
3992  * sys_setpriority is a more generic, but much slower function that
3993  * does similar things.
3994  */
3995 asmlinkage long sys_nice(int increment)
3996 {
3997         long nice, retval;
3998
3999         /*
4000          * Setpriority might change our priority at the same moment.
4001          * We don't have to worry. Conceptually one call occurs first
4002          * and we have a single winner.
4003          */
4004         if (increment < -40)
4005                 increment = -40;
4006         if (increment > 40)
4007                 increment = 40;
4008
4009         nice = PRIO_TO_NICE(current->static_prio) + increment;
4010         if (nice < -20)
4011                 nice = -20;
4012         if (nice > 19)
4013                 nice = 19;
4014
4015         if (increment < 0 && !can_nice(current, nice))
4016                 return -EPERM;
4017
4018         retval = security_task_setnice(current, nice);
4019         if (retval)
4020                 return retval;
4021
4022         set_user_nice(current, nice);
4023         return 0;
4024 }
4025
4026 #endif
4027
4028 /**
4029  * task_prio - return the priority value of a given task.
4030  * @p: the task in question.
4031  *
4032  * This is the priority value as seen by users in /proc.
4033  * RT tasks are offset by -200. Normal tasks are centered
4034  * around 0, value goes from -16 to +15.
4035  */
4036 int task_prio(const struct task_struct *p)
4037 {
4038         return p->prio - MAX_RT_PRIO;
4039 }
4040
4041 /**
4042  * task_nice - return the nice value of a given task.
4043  * @p: the task in question.
4044  */
4045 int task_nice(const struct task_struct *p)
4046 {
4047         return TASK_NICE(p);
4048 }
4049 EXPORT_SYMBOL_GPL(task_nice);
4050
4051 /**
4052  * idle_cpu - is a given cpu idle currently?
4053  * @cpu: the processor in question.
4054  */
4055 int idle_cpu(int cpu)
4056 {
4057         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4058 }
4059
4060 /**
4061  * idle_task - return the idle task for a given cpu.
4062  * @cpu: the processor in question.
4063  */
4064 struct task_struct *idle_task(int cpu)
4065 {
4066         return cpu_rq(cpu)->idle;
4067 }
4068
4069 /**
4070  * find_process_by_pid - find a process with a matching PID value.
4071  * @pid: the pid in question.
4072  */
4073 static inline struct task_struct *find_process_by_pid(pid_t pid)
4074 {
4075         return pid ? find_task_by_pid(pid) : current;
4076 }
4077
4078 /* Actually do priority change: must hold rq lock. */
4079 static void
4080 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4081 {
4082         BUG_ON(p->se.on_rq);
4083
4084         p->policy = policy;
4085         switch (p->policy) {
4086         case SCHED_NORMAL:
4087         case SCHED_BATCH:
4088         case SCHED_IDLE:
4089                 p->sched_class = &fair_sched_class;
4090                 break;
4091         case SCHED_FIFO:
4092         case SCHED_RR:
4093                 p->sched_class = &rt_sched_class;
4094                 break;
4095         }
4096
4097         p->rt_priority = prio;
4098         p->normal_prio = normal_prio(p);
4099         /* we are holding p->pi_lock already */
4100         p->prio = rt_mutex_getprio(p);
4101         set_load_weight(p);
4102 }
4103
4104 /**
4105  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4106  * @p: the task in question.
4107  * @policy: new policy.
4108  * @param: structure containing the new RT priority.
4109  *
4110  * NOTE that the task may be already dead.
4111  */
4112 int sched_setscheduler(struct task_struct *p, int policy,
4113                        struct sched_param *param)
4114 {
4115         int retval, oldprio, oldpolicy = -1, on_rq;
4116         unsigned long flags;
4117         struct rq *rq;
4118
4119         /* may grab non-irq protected spin_locks */
4120         BUG_ON(in_interrupt());
4121 recheck:
4122         /* double check policy once rq lock held */
4123         if (policy < 0)
4124                 policy = oldpolicy = p->policy;
4125         else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4126                         policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4127                         policy != SCHED_IDLE)
4128                 return -EINVAL;
4129         /*
4130          * Valid priorities for SCHED_FIFO and SCHED_RR are
4131          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4132          * SCHED_BATCH and SCHED_IDLE is 0.
4133          */
4134         if (param->sched_priority < 0 ||
4135             (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4136             (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4137                 return -EINVAL;
4138         if (rt_policy(policy) != (param->sched_priority != 0))
4139                 return -EINVAL;
4140
4141         /*
4142          * Allow unprivileged RT tasks to decrease priority:
4143          */
4144         if (!capable(CAP_SYS_NICE)) {
4145                 if (rt_policy(policy)) {
4146                         unsigned long rlim_rtprio;
4147
4148                         if (!lock_task_sighand(p, &flags))
4149                                 return -ESRCH;
4150                         rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4151                         unlock_task_sighand(p, &flags);
4152
4153                         /* can't set/change the rt policy */
4154                         if (policy != p->policy && !rlim_rtprio)
4155                                 return -EPERM;
4156
4157                         /* can't increase priority */
4158                         if (param->sched_priority > p->rt_priority &&
4159                             param->sched_priority > rlim_rtprio)
4160                                 return -EPERM;
4161                 }
4162                 /*
4163                  * Like positive nice levels, dont allow tasks to
4164                  * move out of SCHED_IDLE either:
4165                  */
4166                 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4167                         return -EPERM;
4168
4169                 /* can't change other user's priorities */
4170                 if ((current->euid != p->euid) &&
4171                     (current->euid != p->uid))
4172                         return -EPERM;
4173         }
4174
4175         retval = security_task_setscheduler(p, policy, param);
4176         if (retval)
4177                 return retval;
4178         /*
4179          * make sure no PI-waiters arrive (or leave) while we are
4180          * changing the priority of the task:
4181          */
4182         spin_lock_irqsave(&p->pi_lock, flags);
4183         /*
4184          * To be able to change p->policy safely, the apropriate
4185          * runqueue lock must be held.
4186          */
4187         rq = __task_rq_lock(p);
4188         /* recheck policy now with rq lock held */
4189         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4190                 policy = oldpolicy = -1;
4191                 __task_rq_unlock(rq);
4192                 spin_unlock_irqrestore(&p->pi_lock, flags);
4193                 goto recheck;
4194         }
4195         on_rq = p->se.on_rq;
4196         if (on_rq)
4197                 deactivate_task(rq, p, 0);
4198         oldprio = p->prio;
4199         __setscheduler(rq, p, policy, param->sched_priority);
4200         if (on_rq) {
4201                 activate_task(rq, p, 0);
4202                 /*
4203                  * Reschedule if we are currently running on this runqueue and
4204                  * our priority decreased, or if we are not currently running on
4205                  * this runqueue and our priority is higher than the current's
4206                  */
4207                 if (task_running(rq, p)) {
4208                         if (p->prio > oldprio)
4209                                 resched_task(rq->curr);
4210                 } else {
4211                         check_preempt_curr(rq, p);
4212                 }
4213         }
4214         __task_rq_unlock(rq);
4215         spin_unlock_irqrestore(&p->pi_lock, flags);
4216
4217         rt_mutex_adjust_pi(p);
4218
4219         return 0;
4220 }
4221 EXPORT_SYMBOL_GPL(sched_setscheduler);
4222
4223 static int
4224 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4225 {
4226         struct sched_param lparam;
4227         struct task_struct *p;
4228         int retval;
4229
4230         if (!param || pid < 0)
4231                 return -EINVAL;
4232         if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4233                 return -EFAULT;
4234
4235         rcu_read_lock();
4236         retval = -ESRCH;
4237         p = find_process_by_pid(pid);
4238         if (p != NULL)
4239                 retval = sched_setscheduler(p, policy, &lparam);
4240         rcu_read_unlock();
4241
4242         return retval;
4243 }
4244
4245 /**
4246  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4247  * @pid: the pid in question.
4248  * @policy: new policy.
4249  * @param: structure containing the new RT priority.
4250  */
4251 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4252                                        struct sched_param __user *param)
4253 {
4254         /* negative values for policy are not valid */
4255         if (policy < 0)
4256                 return -EINVAL;
4257
4258         return do_sched_setscheduler(pid, policy, param);
4259 }
4260
4261 /**
4262  * sys_sched_setparam - set/change the RT priority of a thread
4263  * @pid: the pid in question.
4264  * @param: structure containing the new RT priority.
4265  */
4266 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4267 {
4268         return do_sched_setscheduler(pid, -1, param);
4269 }
4270
4271 /**
4272  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4273  * @pid: the pid in question.
4274  */
4275 asmlinkage long sys_sched_getscheduler(pid_t pid)
4276 {
4277         struct task_struct *p;
4278         int retval = -EINVAL;
4279
4280         if (pid < 0)
4281                 goto out_nounlock;
4282
4283         retval = -ESRCH;
4284         read_lock(&tasklist_lock);
4285         p = find_process_by_pid(pid);
4286         if (p) {
4287                 retval = security_task_getscheduler(p);
4288                 if (!retval)
4289                         retval = p->policy;
4290         }
4291         read_unlock(&tasklist_lock);
4292
4293 out_nounlock:
4294         return retval;
4295 }
4296
4297 /**
4298  * sys_sched_getscheduler - get the RT priority of a thread
4299  * @pid: the pid in question.
4300  * @param: structure containing the RT priority.
4301  */
4302 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4303 {
4304         struct sched_param lp;
4305         struct task_struct *p;
4306         int retval = -EINVAL;
4307
4308         if (!param || pid < 0)
4309                 goto out_nounlock;
4310
4311         read_lock(&tasklist_lock);
4312         p = find_process_by_pid(pid);
4313         retval = -ESRCH;
4314         if (!p)
4315                 goto out_unlock;
4316
4317         retval = security_task_getscheduler(p);
4318         if (retval)
4319                 goto out_unlock;
4320
4321         lp.sched_priority = p->rt_priority;
4322         read_unlock(&tasklist_lock);
4323
4324         /*
4325          * This one might sleep, we cannot do it with a spinlock held ...
4326          */
4327         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4328
4329 out_nounlock:
4330         return retval;
4331
4332 out_unlock:
4333         read_unlock(&tasklist_lock);
4334         return retval;
4335 }
4336
4337 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4338 {
4339         cpumask_t cpus_allowed;
4340         struct task_struct *p;
4341         int retval;
4342
4343         mutex_lock(&sched_hotcpu_mutex);
4344         read_lock(&tasklist_lock);
4345
4346         p = find_process_by_pid(pid);
4347         if (!p) {
4348                 read_unlock(&tasklist_lock);
4349                 mutex_unlock(&sched_hotcpu_mutex);
4350                 return -ESRCH;
4351         }
4352
4353         /*
4354          * It is not safe to call set_cpus_allowed with the
4355          * tasklist_lock held.  We will bump the task_struct's
4356          * usage count and then drop tasklist_lock.
4357          */
4358         get_task_struct(p);
4359         read_unlock(&tasklist_lock);
4360
4361         retval = -EPERM;
4362         if ((current->euid != p->euid) && (current->euid != p->uid) &&
4363                         !capable(CAP_SYS_NICE))
4364                 goto out_unlock;
4365
4366         retval = security_task_setscheduler(p, 0, NULL);
4367         if (retval)
4368                 goto out_unlock;
4369
4370         cpus_allowed = cpuset_cpus_allowed(p);
4371         cpus_and(new_mask, new_mask, cpus_allowed);
4372         retval = set_cpus_allowed(p, new_mask);
4373
4374 out_unlock:
4375         put_task_struct(p);
4376         mutex_unlock(&sched_hotcpu_mutex);
4377         return retval;
4378 }
4379
4380 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4381                              cpumask_t *new_mask)
4382 {
4383         if (len < sizeof(cpumask_t)) {
4384                 memset(new_mask, 0, sizeof(cpumask_t));
4385         } else if (len > sizeof(cpumask_t)) {
4386                 len = sizeof(cpumask_t);
4387         }
4388         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4389 }
4390
4391 /**
4392  * sys_sched_setaffinity - set the cpu affinity of a process
4393  * @pid: pid of the process
4394  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4395  * @user_mask_ptr: user-space pointer to the new cpu mask
4396  */
4397 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4398                                       unsigned long __user *user_mask_ptr)
4399 {
4400         cpumask_t new_mask;
4401         int retval;
4402
4403         retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4404         if (retval)
4405                 return retval;
4406
4407         return sched_setaffinity(pid, new_mask);
4408 }
4409
4410 /*
4411  * Represents all cpu's present in the system
4412  * In systems capable of hotplug, this map could dynamically grow
4413  * as new cpu's are detected in the system via any platform specific
4414  * method, such as ACPI for e.g.
4415  */
4416
4417 cpumask_t cpu_present_map __read_mostly;
4418 EXPORT_SYMBOL(cpu_present_map);
4419
4420 #ifndef CONFIG_SMP
4421 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4422 EXPORT_SYMBOL(cpu_online_map);
4423
4424 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4425 EXPORT_SYMBOL(cpu_possible_map);
4426 #endif
4427
4428 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4429 {
4430         struct task_struct *p;
4431         int retval;
4432
4433         mutex_lock(&sched_hotcpu_mutex);
4434         read_lock(&tasklist_lock);
4435
4436         retval = -ESRCH;
4437         p = find_process_by_pid(pid);
4438         if (!p)
4439                 goto out_unlock;
4440
4441         retval = security_task_getscheduler(p);
4442         if (retval)
4443                 goto out_unlock;
4444
4445         cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4446
4447 out_unlock:
4448         read_unlock(&tasklist_lock);
4449         mutex_unlock(&sched_hotcpu_mutex);
4450         if (retval)
4451                 return retval;
4452
4453         return 0;
4454 }
4455
4456 /**
4457  * sys_sched_getaffinity - get the cpu affinity of a process
4458  * @pid: pid of the process
4459  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4460  * @user_mask_ptr: user-space pointer to hold the current cpu mask
4461  */
4462 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4463                                       unsigned long __user *user_mask_ptr)
4464 {
4465         int ret;
4466         cpumask_t mask;
4467
4468         if (len < sizeof(cpumask_t))
4469                 return -EINVAL;
4470
4471         ret = sched_getaffinity(pid, &mask);
4472         if (ret < 0)
4473                 return ret;
4474
4475         if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4476                 return -EFAULT;
4477
4478         return sizeof(cpumask_t);
4479 }
4480
4481 /**
4482  * sys_sched_yield - yield the current processor to other threads.
4483  *
4484  * This function yields the current CPU to other tasks. If there are no
4485  * other threads running on this CPU then this function will return.
4486  */
4487 asmlinkage long sys_sched_yield(void)
4488 {
4489         struct rq *rq = this_rq_lock();
4490
4491         schedstat_inc(rq, yld_cnt);
4492         if (unlikely(rq->nr_running == 1))
4493                 schedstat_inc(rq, yld_act_empty);
4494         else
4495                 current->sched_class->yield_task(rq, current);
4496
4497         /*
4498          * Since we are going to call schedule() anyway, there's
4499          * no need to preempt or enable interrupts:
4500          */
4501         __release(rq->lock);
4502         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4503         _raw_spin_unlock(&rq->lock);
4504         preempt_enable_no_resched();
4505
4506         schedule();
4507
4508         return 0;
4509 }
4510
4511 static void __cond_resched(void)
4512 {
4513 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4514         __might_sleep(__FILE__, __LINE__);
4515 #endif
4516         /*
4517          * The BKS might be reacquired before we have dropped
4518          * PREEMPT_ACTIVE, which could trigger a second
4519          * cond_resched() call.
4520          */
4521         do {
4522                 add_preempt_count(PREEMPT_ACTIVE);
4523                 schedule();
4524                 sub_preempt_count(PREEMPT_ACTIVE);
4525         } while (need_resched());
4526 }
4527
4528 int __sched cond_resched(void)
4529 {
4530         if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4531                                         system_state == SYSTEM_RUNNING) {
4532                 __cond_resched();
4533                 return 1;
4534         }
4535         return 0;
4536 }
4537 EXPORT_SYMBOL(cond_resched);
4538
4539 /*
4540  * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4541  * call schedule, and on return reacquire the lock.
4542  *
4543  * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
4544  * operations here to prevent schedule() from being called twice (once via
4545  * spin_unlock(), once by hand).
4546  */
4547 int cond_resched_lock(spinlock_t *lock)
4548 {
4549         int ret = 0;
4550
4551         if (need_lockbreak(lock)) {
4552                 spin_unlock(lock);
4553                 cpu_relax();
4554                 ret = 1;
4555                 spin_lock(lock);
4556         }
4557         if (need_resched() && system_state == SYSTEM_RUNNING) {
4558                 spin_release(&lock->dep_map, 1, _THIS_IP_);
4559                 _raw_spin_unlock(lock);
4560                 preempt_enable_no_resched();
4561                 __cond_resched();
4562                 ret = 1;
4563                 spin_lock(lock);
4564         }
4565         return ret;
4566 }
4567 EXPORT_SYMBOL(cond_resched_lock);
4568
4569 int __sched cond_resched_softirq(void)
4570 {
4571         BUG_ON(!in_softirq());
4572
4573         if (need_resched() && system_state == SYSTEM_RUNNING) {
4574                 local_bh_enable();
4575                 __cond_resched();
4576                 local_bh_disable();
4577                 return 1;
4578         }
4579         return 0;
4580 }
4581 EXPORT_SYMBOL(cond_resched_softirq);
4582
4583 /**
4584  * yield - yield the current processor to other threads.
4585  *
4586  * This is a shortcut for kernel-space yielding - it marks the
4587  * thread runnable and calls sys_sched_yield().
4588  */
4589 void __sched yield(void)
4590 {
4591         set_current_state(TASK_RUNNING);
4592         sys_sched_yield();
4593 }
4594 EXPORT_SYMBOL(yield);
4595
4596 /*
4597  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
4598  * that process accounting knows that this is a task in IO wait state.
4599  *
4600  * But don't do that if it is a deliberate, throttling IO wait (this task
4601  * has set its backing_dev_info: the queue against which it should throttle)
4602  */
4603 void __sched io_schedule(void)
4604 {
4605         struct rq *rq = &__raw_get_cpu_var(runqueues);
4606
4607         delayacct_blkio_start();
4608         atomic_inc(&rq->nr_iowait);
4609         schedule();
4610         atomic_dec(&rq->nr_iowait);
4611         delayacct_blkio_end();
4612 }
4613 EXPORT_SYMBOL(io_schedule);
4614
4615 long __sched io_schedule_timeout(long timeout)
4616 {
4617         struct rq *rq = &__raw_get_cpu_var(runqueues);
4618         long ret;
4619
4620         delayacct_blkio_start();
4621         atomic_inc(&rq->nr_iowait);
4622         ret = schedule_timeout(timeout);
4623         atomic_dec(&rq->nr_iowait);
4624         delayacct_blkio_end();
4625         return ret;
4626 }
4627
4628 /**
4629  * sys_sched_get_priority_max - return maximum RT priority.
4630  * @policy: scheduling class.
4631  *
4632  * this syscall returns the maximum rt_priority that can be used
4633  * by a given scheduling class.
4634  */
4635 asmlinkage long sys_sched_get_priority_max(int policy)
4636 {
4637         int ret = -EINVAL;
4638
4639         switch (policy) {
4640         case SCHED_FIFO:
4641         case SCHED_RR:
4642                 ret = MAX_USER_RT_PRIO-1;
4643                 break;
4644         case SCHED_NORMAL:
4645         case SCHED_BATCH:
4646         case SCHED_IDLE:
4647                 ret = 0;
4648                 break;
4649         }
4650         return ret;
4651 }
4652
4653 /**
4654  * sys_sched_get_priority_min - return minimum RT priority.
4655  * @policy: scheduling class.
4656  *
4657  * this syscall returns the minimum rt_priority that can be used
4658  * by a given scheduling class.
4659  */
4660 asmlinkage long sys_sched_get_priority_min(int policy)
4661 {
4662         int ret = -EINVAL;
4663
4664         switch (policy) {
4665         case SCHED_FIFO:
4666         case SCHED_RR:
4667                 ret = 1;
4668                 break;
4669         case SCHED_NORMAL:
4670         case SCHED_BATCH:
4671         case SCHED_IDLE:
4672                 ret = 0;
4673         }
4674         return ret;
4675 }
4676
4677 /**
4678  * sys_sched_rr_get_interval - return the default timeslice of a process.
4679  * @pid: pid of the process.
4680  * @interval: userspace pointer to the timeslice value.
4681  *
4682  * this syscall writes the default timeslice value of a given process
4683  * into the user-space timespec buffer. A value of '0' means infinity.
4684  */
4685 asmlinkage
4686 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4687 {
4688         struct task_struct *p;
4689         int retval = -EINVAL;
4690         struct timespec t;
4691
4692         if (pid < 0)
4693                 goto out_nounlock;
4694
4695         retval = -ESRCH;
4696         read_lock(&tasklist_lock);
4697         p = find_process_by_pid(pid);
4698         if (!p)
4699                 goto out_unlock;
4700
4701         retval = security_task_getscheduler(p);
4702         if (retval)
4703                 goto out_unlock;
4704
4705         jiffies_to_timespec(p->policy == SCHED_FIFO ?
4706                                 0 : static_prio_timeslice(p->static_prio), &t);
4707         read_unlock(&tasklist_lock);
4708         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4709 out_nounlock:
4710         return retval;
4711 out_unlock:
4712         read_unlock(&tasklist_lock);
4713         return retval;
4714 }
4715
4716 static const char stat_nam[] = "RSDTtZX";
4717
4718 static void show_task(struct task_struct *p)
4719 {
4720         unsigned long free = 0;
4721         unsigned state;
4722
4723         state = p->state ? __ffs(p->state) + 1 : 0;
4724         printk("%-13.13s %c", p->comm,
4725                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4726 #if (BITS_PER_LONG == 32)
4727         if (state == TASK_RUNNING)
4728                 printk(" running ");
4729         else
4730                 printk(" %08lX ", thread_saved_pc(p));
4731 #else
4732         if (state == TASK_RUNNING)
4733                 printk("  running task   ");
4734         else
4735                 printk(" %016lx ", thread_saved_pc(p));
4736 #endif
4737 #ifdef CONFIG_DEBUG_STACK_USAGE
4738         {
4739                 unsigned long *n = end_of_stack(p);
4740                 while (!*n)
4741                         n++;
4742                 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4743         }
4744 #endif
4745         printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4746         if (!p->mm)
4747                 printk(" (L-TLB)\n");
4748         else
4749                 printk(" (NOTLB)\n");
4750
4751         if (state != TASK_RUNNING)
4752                 show_stack(p, NULL);
4753 }
4754
4755 void show_state_filter(unsigned long state_filter)
4756 {
4757         struct task_struct *g, *p;
4758
4759 #if (BITS_PER_LONG == 32)
4760         printk("\n"
4761                "                         free                        sibling\n");
4762         printk("  task             PC    stack   pid father child younger older\n");
4763 #else
4764         printk("\n"
4765                "                                 free                        sibling\n");
4766         printk("  task                 PC        stack   pid father child younger older\n");
4767 #endif
4768         read_lock(&tasklist_lock);
4769         do_each_thread(g, p) {
4770                 /*
4771                  * reset the NMI-timeout, listing all files on a slow
4772                  * console might take alot of time:
4773                  */
4774                 touch_nmi_watchdog();
4775                 if (!state_filter || (p->state & state_filter))
4776                         show_task(p);
4777         } while_each_thread(g, p);
4778
4779         touch_all_softlockup_watchdogs();
4780
4781 #ifdef CONFIG_SCHED_DEBUG
4782         sysrq_sched_debug_show();
4783 #endif
4784         read_unlock(&tasklist_lock);
4785         /*
4786          * Only show locks if all tasks are dumped:
4787          */
4788         if (state_filter == -1)
4789                 debug_show_all_locks();
4790 }
4791
4792 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4793 {
4794         idle->sched_class = &idle_sched_class;
4795 }
4796
4797 /**
4798  * init_idle - set up an idle thread for a given CPU
4799  * @idle: task in question
4800  * @cpu: cpu the idle task belongs to
4801  *
4802  * NOTE: this function does not set the idle thread's NEED_RESCHED
4803  * flag, to make booting more robust.
4804  */
4805 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4806 {
4807         struct rq *rq = cpu_rq(cpu);
4808         unsigned long flags;
4809
4810         __sched_fork(idle);
4811         idle->se.exec_start = sched_clock();
4812
4813         idle->prio = idle->normal_prio = MAX_PRIO;
4814         idle->cpus_allowed = cpumask_of_cpu(cpu);
4815         __set_task_cpu(idle, cpu);
4816
4817         spin_lock_irqsave(&rq->lock, flags);
4818         rq->curr = rq->idle = idle;
4819 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4820         idle->oncpu = 1;
4821 #endif
4822         spin_unlock_irqrestore(&rq->lock, flags);
4823
4824         /* Set the preempt count _outside_ the spinlocks! */
4825 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4826         task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4827 #else
4828         task_thread_info(idle)->preempt_count = 0;
4829 #endif
4830         /*
4831          * The idle tasks have their own, simple scheduling class:
4832          */
4833         idle->sched_class = &idle_sched_class;
4834 }
4835
4836 /*
4837  * In a system that switches off the HZ timer nohz_cpu_mask
4838  * indicates which cpus entered this state. This is used
4839  * in the rcu update to wait only for active cpus. For system
4840  * which do not switch off the HZ timer nohz_cpu_mask should
4841  * always be CPU_MASK_NONE.
4842  */
4843 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4844
4845 /*
4846  * Increase the granularity value when there are more CPUs,
4847  * because with more CPUs the 'effective latency' as visible
4848  * to users decreases. But the relationship is not linear,
4849  * so pick a second-best guess by going with the log2 of the
4850  * number of CPUs.
4851  *
4852  * This idea comes from the SD scheduler of Con Kolivas:
4853  */
4854 static inline void sched_init_granularity(void)
4855 {
4856         unsigned int factor = 1 + ilog2(num_online_cpus());
4857         const unsigned long gran_limit = 10000000;
4858
4859         sysctl_sched_granularity *= factor;
4860         if (sysctl_sched_granularity > gran_limit)
4861                 sysctl_sched_granularity = gran_limit;
4862
4863         sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4864         sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4865 }
4866
4867 #ifdef CONFIG_SMP
4868 /*
4869  * This is how migration works:
4870  *
4871  * 1) we queue a struct migration_req structure in the source CPU's
4872  *    runqueue and wake up that CPU's migration thread.
4873  * 2) we down() the locked semaphore => thread blocks.
4874  * 3) migration thread wakes up (implicitly it forces the migrated
4875  *    thread off the CPU)
4876  * 4) it gets the migration request and checks whether the migrated
4877  *    task is still in the wrong runqueue.
4878  * 5) if it's in the wrong runqueue then the migration thread removes
4879  *    it and puts it into the right queue.
4880  * 6) migration thread up()s the semaphore.
4881  * 7) we wake up and the migration is done.
4882  */
4883
4884 /*
4885  * Change a given task's CPU affinity. Migrate the thread to a
4886  * proper CPU and schedule it away if the CPU it's executing on
4887  * is removed from the allowed bitmask.
4888  *
4889  * NOTE: the caller must have a valid reference to the task, the
4890  * task must not exit() & deallocate itself prematurely.  The
4891  * call is not atomic; no spinlocks may be held.
4892  */
4893 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4894 {
4895         struct migration_req req;
4896         unsigned long flags;
4897         struct rq *rq;
4898         int ret = 0;
4899
4900         rq = task_rq_lock(p, &flags);
4901         if (!cpus_intersects(new_mask, cpu_online_map)) {
4902                 ret = -EINVAL;
4903                 goto out;
4904         }
4905
4906         p->cpus_allowed = new_mask;
4907         /* Can the task run on the task's current CPU? If so, we're done */
4908         if (cpu_isset(task_cpu(p), new_mask))
4909                 goto out;
4910
4911         if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4912                 /* Need help from migration thread: drop lock and wait. */
4913                 task_rq_unlock(rq, &flags);
4914                 wake_up_process(rq->migration_thread);
4915                 wait_for_completion(&req.done);
4916                 tlb_migrate_finish(p->mm);
4917                 return 0;
4918         }
4919 out:
4920         task_rq_unlock(rq, &flags);
4921
4922         return ret;
4923 }
4924 EXPORT_SYMBOL_GPL(set_cpus_allowed);
4925
4926 /*
4927  * Move (not current) task off this cpu, onto dest cpu.  We're doing
4928  * this because either it can't run here any more (set_cpus_allowed()
4929  * away from this CPU, or CPU going down), or because we're
4930  * attempting to rebalance this task on exec (sched_exec).
4931  *
4932  * So we race with normal scheduler movements, but that's OK, as long
4933  * as the task is no longer on this CPU.
4934  *
4935  * Returns non-zero if task was successfully migrated.
4936  */
4937 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4938 {
4939         struct rq *rq_dest, *rq_src;
4940         int ret = 0, on_rq;
4941
4942         if (unlikely(cpu_is_offline(dest_cpu)))
4943                 return ret;
4944
4945         rq_src = cpu_rq(src_cpu);
4946         rq_dest = cpu_rq(dest_cpu);
4947
4948         double_rq_lock(rq_src, rq_dest);
4949         /* Already moved. */
4950         if (task_cpu(p) != src_cpu)
4951                 goto out;
4952         /* Affinity changed (again). */
4953         if (!cpu_isset(dest_cpu, p->cpus_allowed))
4954                 goto out;
4955
4956         on_rq = p->se.on_rq;
4957         if (on_rq)
4958                 deactivate_task(rq_src, p, 0);
4959         set_task_cpu(p, dest_cpu);
4960         if (on_rq) {
4961                 activate_task(rq_dest, p, 0);
4962                 check_preempt_curr(rq_dest, p);
4963         }
4964         ret = 1;
4965 out:
4966         double_rq_unlock(rq_src, rq_dest);
4967         return ret;
4968 }
4969
4970 /*
4971  * migration_thread - this is a highprio system thread that performs
4972  * thread migration by bumping thread off CPU then 'pushing' onto
4973  * another runqueue.
4974  */
4975 static int migration_thread(void *data)
4976 {
4977         int cpu = (long)data;
4978         struct rq *rq;
4979
4980         rq = cpu_rq(cpu);
4981         BUG_ON(rq->migration_thread != current);
4982
4983         set_current_state(TASK_INTERRUPTIBLE);
4984         while (!kthread_should_stop()) {
4985                 struct migration_req *req;
4986                 struct list_head *head;
4987
4988                 try_to_freeze();
4989
4990                 spin_lock_irq(&rq->lock);
4991
4992                 if (cpu_is_offline(cpu)) {
4993                         spin_unlock_irq(&rq->lock);
4994                         goto wait_to_die;
4995                 }
4996
4997                 if (rq->active_balance) {
4998                         active_load_balance(rq, cpu);
4999                         rq->active_balance = 0;
5000                 }
5001
5002                 head = &rq->migration_queue;
5003
5004                 if (list_empty(head)) {
5005                         spin_unlock_irq(&rq->lock);
5006                         schedule();
5007                         set_current_state(TASK_INTERRUPTIBLE);
5008                         continue;
5009                 }
5010                 req = list_entry(head->next, struct migration_req, list);
5011                 list_del_init(head->next);
5012
5013                 spin_unlock(&rq->lock);
5014                 __migrate_task(req->task, cpu, req->dest_cpu);
5015                 local_irq_enable();
5016
5017                 complete(&req->done);
5018         }
5019         __set_current_state(TASK_RUNNING);
5020         return 0;
5021
5022 wait_to_die:
5023         /* Wait for kthread_stop */
5024         set_current_state(TASK_INTERRUPTIBLE);
5025         while (!kthread_should_stop()) {
5026                 schedule();
5027                 set_current_state(TASK_INTERRUPTIBLE);
5028         }
5029         __set_current_state(TASK_RUNNING);
5030         return 0;
5031 }
5032
5033 #ifdef CONFIG_HOTPLUG_CPU
5034 /*
5035  * Figure out where task on dead CPU should go, use force if neccessary.
5036  * NOTE: interrupts should be disabled by the caller
5037  */
5038 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5039 {
5040         unsigned long flags;
5041         cpumask_t mask;
5042         struct rq *rq;
5043         int dest_cpu;
5044
5045 restart:
5046         /* On same node? */
5047         mask = node_to_cpumask(cpu_to_node(dead_cpu));
5048         cpus_and(mask, mask, p->cpus_allowed);
5049         dest_cpu = any_online_cpu(mask);
5050
5051         /* On any allowed CPU? */
5052         if (dest_cpu == NR_CPUS)
5053                 dest_cpu = any_online_cpu(p->cpus_allowed);
5054
5055         /* No more Mr. Nice Guy. */
5056         if (dest_cpu == NR_CPUS) {
5057                 rq = task_rq_lock(p, &flags);
5058                 cpus_setall(p->cpus_allowed);
5059                 dest_cpu = any_online_cpu(p->cpus_allowed);
5060                 task_rq_unlock(rq, &flags);
5061
5062                 /*
5063                  * Don't tell them about moving exiting tasks or
5064                  * kernel threads (both mm NULL), since they never
5065                  * leave kernel.
5066                  */
5067                 if (p->mm && printk_ratelimit())
5068                         printk(KERN_INFO "process %d (%s) no "
5069                                "longer affine to cpu%d\n",
5070                                p->pid, p->comm, dead_cpu);
5071         }
5072         if (!__migrate_task(p, dead_cpu, dest_cpu))
5073                 goto restart;
5074 }
5075
5076 /*
5077  * While a dead CPU has no uninterruptible tasks queued at this point,
5078  * it might still have a nonzero ->nr_uninterruptible counter, because
5079  * for performance reasons the counter is not stricly tracking tasks to
5080  * their home CPUs. So we just add the counter to another CPU's counter,
5081  * to keep the global sum constant after CPU-down:
5082  */
5083 static void migrate_nr_uninterruptible(struct rq *rq_src)
5084 {
5085         struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5086         unsigned long flags;
5087
5088         local_irq_save(flags);
5089         double_rq_lock(rq_src, rq_dest);
5090         rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5091         rq_src->nr_uninterruptible = 0;
5092         double_rq_unlock(rq_src, rq_dest);
5093         local_irq_restore(flags);
5094 }
5095
5096 /* Run through task list and migrate tasks from the dead cpu. */
5097 static void migrate_live_tasks(int src_cpu)
5098 {
5099         struct task_struct *p, *t;
5100
5101         write_lock_irq(&tasklist_lock);
5102
5103         do_each_thread(t, p) {
5104                 if (p == current)
5105                         continue;
5106
5107                 if (task_cpu(p) == src_cpu)
5108                         move_task_off_dead_cpu(src_cpu, p);
5109         } while_each_thread(t, p);
5110
5111         write_unlock_irq(&tasklist_lock);
5112 }
5113
5114 /*
5115  * Schedules idle task to be the next runnable task on current CPU.
5116  * It does so by boosting its priority to highest possible and adding it to
5117  * the _front_ of the runqueue. Used by CPU offline code.
5118  */
5119 void sched_idle_next(void)
5120 {
5121         int this_cpu = smp_processor_id();
5122         struct rq *rq = cpu_rq(this_cpu);
5123         struct task_struct *p = rq->idle;
5124         unsigned long flags;
5125
5126         /* cpu has to be offline */
5127         BUG_ON(cpu_online(this_cpu));
5128
5129         /*
5130          * Strictly not necessary since rest of the CPUs are stopped by now
5131          * and interrupts disabled on the current cpu.
5132          */
5133         spin_lock_irqsave(&rq->lock, flags);
5134
5135         __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5136
5137         /* Add idle task to the _front_ of its priority queue: */
5138         activate_idle_task(p, rq);
5139
5140         spin_unlock_irqrestore(&rq->lock, flags);
5141 }
5142
5143 /*
5144  * Ensures that the idle task is using init_mm right before its cpu goes
5145  * offline.
5146  */
5147 void idle_task_exit(void)
5148 {
5149         struct mm_struct *mm = current->active_mm;
5150
5151         BUG_ON(cpu_online(smp_processor_id()));
5152
5153         if (mm != &init_mm)
5154                 switch_mm(mm, &init_mm, current);
5155         mmdrop(mm);
5156 }
5157
5158 /* called under rq->lock with disabled interrupts */
5159 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5160 {
5161         struct rq *rq = cpu_rq(dead_cpu);
5162
5163         /* Must be exiting, otherwise would be on tasklist. */
5164         BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5165
5166         /* Cannot have done final schedule yet: would have vanished. */
5167         BUG_ON(p->state == TASK_DEAD);
5168
5169         get_task_struct(p);
5170
5171         /*
5172          * Drop lock around migration; if someone else moves it,
5173          * that's OK.  No task can be added to this CPU, so iteration is
5174          * fine.
5175          * NOTE: interrupts should be left disabled  --dev@
5176          */
5177         spin_unlock(&rq->lock);
5178         move_task_off_dead_cpu(dead_cpu, p);
5179         spin_lock(&rq->lock);
5180
5181         put_task_struct(p);
5182 }
5183
5184 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5185 static void migrate_dead_tasks(unsigned int dead_cpu)
5186 {
5187         struct rq *rq = cpu_rq(dead_cpu);
5188         struct task_struct *next;
5189
5190         for ( ; ; ) {
5191                 if (!rq->nr_running)
5192                         break;
5193                 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5194                 if (!next)
5195                         break;
5196                 migrate_dead(dead_cpu, next);
5197         }
5198 }
5199 #endif /* CONFIG_HOTPLUG_CPU */
5200
5201 /*
5202  * migration_call - callback that gets triggered when a CPU is added.
5203  * Here we can start up the necessary migration thread for the new CPU.
5204  */
5205 static int __cpuinit
5206 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5207 {
5208         struct task_struct *p;
5209         int cpu = (long)hcpu;
5210         unsigned long flags;
5211         struct rq *rq;
5212
5213         switch (action) {
5214         case CPU_LOCK_ACQUIRE:
5215                 mutex_lock(&sched_hotcpu_mutex);
5216                 break;
5217
5218         case CPU_UP_PREPARE:
5219         case CPU_UP_PREPARE_FROZEN:
5220                 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5221                 if (IS_ERR(p))
5222                         return NOTIFY_BAD;
5223                 p->flags |= PF_NOFREEZE;
5224                 kthread_bind(p, cpu);
5225                 /* Must be high prio: stop_machine expects to yield to it. */
5226                 rq = task_rq_lock(p, &flags);
5227                 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5228                 task_rq_unlock(rq, &flags);
5229                 cpu_rq(cpu)->migration_thread = p;
5230                 break;
5231
5232         case CPU_ONLINE:
5233         case CPU_ONLINE_FROZEN:
5234                 /* Strictly unneccessary, as first user will wake it. */
5235                 wake_up_process(cpu_rq(cpu)->migration_thread);
5236                 break;
5237
5238 #ifdef CONFIG_HOTPLUG_CPU
5239         case CPU_UP_CANCELED:
5240         case CPU_UP_CANCELED_FROZEN:
5241                 if (!cpu_rq(cpu)->migration_thread)
5242                         break;
5243                 /* Unbind it from offline cpu so it can run.  Fall thru. */
5244                 kthread_bind(cpu_rq(cpu)->migration_thread,
5245                              any_online_cpu(cpu_online_map));
5246                 kthread_stop(cpu_rq(cpu)->migration_thread);
5247                 cpu_rq(cpu)->migration_thread = NULL;
5248                 break;
5249
5250         case CPU_DEAD:
5251         case CPU_DEAD_FROZEN:
5252                 migrate_live_tasks(cpu);
5253                 rq = cpu_rq(cpu);
5254                 kthread_stop(rq->migration_thread);
5255                 rq->migration_thread = NULL;
5256                 /* Idle task back to normal (off runqueue, low prio) */
5257                 rq = task_rq_lock(rq->idle, &flags);
5258                 deactivate_task(rq, rq->idle, 0);
5259                 rq->idle->static_prio = MAX_PRIO;
5260                 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5261                 rq->idle->sched_class = &idle_sched_class;
5262                 migrate_dead_tasks(cpu);
5263                 task_rq_unlock(rq, &flags);
5264                 migrate_nr_uninterruptible(rq);
5265                 BUG_ON(rq->nr_running != 0);
5266
5267                 /* No need to migrate the tasks: it was best-effort if
5268                  * they didn't take sched_hotcpu_mutex.  Just wake up
5269                  * the requestors. */
5270                 spin_lock_irq(&rq->lock);
5271                 while (!list_empty(&rq->migration_queue)) {
5272                         struct migration_req *req;
5273
5274                         req = list_entry(rq->migration_queue.next,
5275                                          struct migration_req, list);
5276                         list_del_init(&req->list);
5277                         complete(&req->done);
5278                 }
5279                 spin_unlock_irq(&rq->lock);
5280                 break;
5281 #endif
5282         case CPU_LOCK_RELEASE:
5283                 mutex_unlock(&sched_hotcpu_mutex);
5284                 break;
5285         }
5286         return NOTIFY_OK;
5287 }
5288
5289 /* Register at highest priority so that task migration (migrate_all_tasks)
5290  * happens before everything else.
5291  */
5292 static struct notifier_block __cpuinitdata migration_notifier = {
5293         .notifier_call = migration_call,
5294         .priority = 10
5295 };
5296
5297 int __init migration_init(void)
5298 {
5299         void *cpu = (void *)(long)smp_processor_id();
5300         int err;
5301
5302         /* Start one for the boot CPU: */
5303         err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5304         BUG_ON(err == NOTIFY_BAD);
5305         migration_call(&migration_notifier, CPU_ONLINE, cpu);
5306         register_cpu_notifier(&migration_notifier);
5307
5308         return 0;
5309 }
5310 #endif
5311
5312 #ifdef CONFIG_SMP
5313
5314 /* Number of possible processor ids */
5315 int nr_cpu_ids __read_mostly = NR_CPUS;
5316 EXPORT_SYMBOL(nr_cpu_ids);
5317
5318 #undef SCHED_DOMAIN_DEBUG
5319 #ifdef SCHED_DOMAIN_DEBUG
5320 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5321 {
5322         int level = 0;
5323
5324         if (!sd) {
5325                 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5326                 return;
5327         }
5328
5329         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5330
5331         do {
5332                 int i;
5333                 char str[NR_CPUS];
5334                 struct sched_group *group = sd->groups;
5335                 cpumask_t groupmask;
5336
5337                 cpumask_scnprintf(str, NR_CPUS, sd->span);
5338                 cpus_clear(groupmask);
5339
5340                 printk(KERN_DEBUG);
5341                 for (i = 0; i < level + 1; i++)
5342                         printk(" ");
5343                 printk("domain %d: ", level);
5344
5345                 if (!(sd->flags & SD_LOAD_BALANCE)) {
5346                         printk("does not load-balance\n");
5347                         if (sd->parent)
5348                                 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5349                                                 " has parent");
5350                         break;
5351                 }
5352
5353                 printk("span %s\n", str);
5354
5355                 if (!cpu_isset(cpu, sd->span))
5356                         printk(KERN_ERR "ERROR: domain->span does not contain "
5357                                         "CPU%d\n", cpu);
5358                 if (!cpu_isset(cpu, group->cpumask))
5359                         printk(KERN_ERR "ERROR: domain->groups does not contain"
5360                                         " CPU%d\n", cpu);
5361
5362                 printk(KERN_DEBUG);
5363                 for (i = 0; i < level + 2; i++)
5364                         printk(" ");
5365                 printk("groups:");
5366                 do {
5367                         if (!group) {
5368                                 printk("\n");
5369                                 printk(KERN_ERR "ERROR: group is NULL\n");
5370                                 break;
5371                         }
5372
5373                         if (!group->__cpu_power) {
5374                                 printk("\n");
5375                                 printk(KERN_ERR "ERROR: domain->cpu_power not "
5376                                                 "set\n");
5377                         }
5378
5379                         if (!cpus_weight(group->cpumask)) {
5380                                 printk("\n");
5381                                 printk(KERN_ERR "ERROR: empty group\n");
5382                         }
5383
5384                         if (cpus_intersects(groupmask, group->cpumask)) {
5385                                 printk("\n");
5386                                 printk(KERN_ERR "ERROR: repeated CPUs\n");
5387                         }
5388
5389                         cpus_or(groupmask, groupmask, group->cpumask);
5390
5391                         cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5392                         printk(" %s", str);
5393
5394                         group = group->next;
5395                 } while (group != sd->groups);
5396                 printk("\n");
5397
5398                 if (!cpus_equal(sd->span, groupmask))
5399                         printk(KERN_ERR "ERROR: groups don't span "
5400                                         "domain->span\n");
5401
5402                 level++;
5403                 sd = sd->parent;
5404                 if (!sd)
5405                         continue;
5406
5407                 if (!cpus_subset(groupmask, sd->span))
5408                         printk(KERN_ERR "ERROR: parent span is not a superset "
5409                                 "of domain->span\n");
5410
5411         } while (sd);
5412 }
5413 #else
5414 # define sched_domain_debug(sd, cpu) do { } while (0)
5415 #endif
5416
5417 static int sd_degenerate(struct sched_domain *sd)
5418 {
5419         if (cpus_weight(sd->span) == 1)
5420                 return 1;
5421
5422         /* Following flags need at least 2 groups */
5423         if (sd->flags & (SD_LOAD_BALANCE |
5424                          SD_BALANCE_NEWIDLE |
5425                          SD_BALANCE_FORK |
5426                          SD_BALANCE_EXEC |
5427                          SD_SHARE_CPUPOWER |
5428                          SD_SHARE_PKG_RESOURCES)) {
5429                 if (sd->groups != sd->groups->next)
5430                         return 0;
5431         }
5432
5433         /* Following flags don't use groups */
5434         if (sd->flags & (SD_WAKE_IDLE |
5435                          SD_WAKE_AFFINE |
5436                          SD_WAKE_BALANCE))
5437                 return 0;
5438
5439         return 1;
5440 }
5441
5442 static int
5443 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5444 {
5445         unsigned long cflags = sd->flags, pflags = parent->flags;
5446
5447         if (sd_degenerate(parent))
5448                 return 1;
5449
5450         if (!cpus_equal(sd->span, parent->span))
5451                 return 0;
5452
5453         /* Does parent contain flags not in child? */
5454         /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5455         if (cflags & SD_WAKE_AFFINE)
5456                 pflags &= ~SD_WAKE_BALANCE;
5457         /* Flags needing groups don't count if only 1 group in parent */
5458         if (parent->groups == parent->groups->next) {
5459                 pflags &= ~(SD_LOAD_BALANCE |
5460                                 SD_BALANCE_NEWIDLE |
5461                                 SD_BALANCE_FORK |
5462                                 SD_BALANCE_EXEC |
5463                                 SD_SHARE_CPUPOWER |
5464                                 SD_SHARE_PKG_RESOURCES);
5465         }
5466         if (~cflags & pflags)
5467                 return 0;
5468
5469         return 1;
5470 }
5471
5472 /*
5473  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
5474  * hold the hotplug lock.
5475  */
5476 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5477 {
5478         struct rq *rq = cpu_rq(cpu);
5479         struct sched_domain *tmp;
5480
5481         /* Remove the sched domains which do not contribute to scheduling. */
5482         for (tmp = sd; tmp; tmp = tmp->parent) {
5483                 struct sched_domain *parent = tmp->parent;
5484                 if (!parent)
5485                         break;
5486                 if (sd_parent_degenerate(tmp, parent)) {
5487                         tmp->parent = parent->parent;
5488                         if (parent->parent)
5489                                 parent->parent->child = tmp;
5490                 }
5491         }
5492
5493         if (sd && sd_degenerate(sd)) {
5494                 sd = sd->parent;
5495                 if (sd)
5496                         sd->child = NULL;
5497         }
5498
5499         sched_domain_debug(sd, cpu);
5500
5501         rcu_assign_pointer(rq->sd, sd);
5502 }
5503
5504 /* cpus with isolated domains */
5505 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5506
5507 /* Setup the mask of cpus configured for isolated domains */
5508 static int __init isolated_cpu_setup(char *str)
5509 {
5510         int ints[NR_CPUS], i;
5511
5512         str = get_options(str, ARRAY_SIZE(ints), ints);
5513         cpus_clear(cpu_isolated_map);
5514         for (i = 1; i <= ints[0]; i++)
5515                 if (ints[i] < NR_CPUS)
5516                         cpu_set(ints[i], cpu_isolated_map);
5517         return 1;
5518 }
5519
5520 __setup ("isolcpus=", isolated_cpu_setup);
5521
5522 /*
5523  * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5524  * to a function which identifies what group(along with sched group) a CPU
5525  * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
5526  * (due to the fact that we keep track of groups covered with a cpumask_t).
5527  *
5528  * init_sched_build_groups will build a circular linked list of the groups
5529  * covered by the given span, and will set each group's ->cpumask correctly,
5530  * and ->cpu_power to 0.
5531  */
5532 static void
5533 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5534                         int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5535                                         struct sched_group **sg))
5536 {
5537         struct sched_group *first = NULL, *last = NULL;
5538         cpumask_t covered = CPU_MASK_NONE;
5539         int i;
5540
5541         for_each_cpu_mask(i, span) {
5542                 struct sched_group *sg;
5543                 int group = group_fn(i, cpu_map, &sg);
5544                 int j;
5545
5546                 if (cpu_isset(i, covered))
5547                         continue;
5548
5549                 sg->cpumask = CPU_MASK_NONE;
5550                 sg->__cpu_power = 0;
5551
5552                 for_each_cpu_mask(j, span) {
5553                         if (group_fn(j, cpu_map, NULL) != group)
5554                                 continue;
5555
5556                         cpu_set(j, covered);
5557                         cpu_set(j, sg->cpumask);
5558                 }
5559                 if (!first)
5560                         first = sg;
5561                 if (last)
5562                         last->next = sg;
5563                 last = sg;
5564         }
5565         last->next = first;
5566 }
5567
5568 #define SD_NODES_PER_DOMAIN 16
5569
5570 #ifdef CONFIG_NUMA
5571
5572 /**
5573  * find_next_best_node - find the next node to include in a sched_domain
5574  * @node: node whose sched_domain we're building
5575  * @used_nodes: nodes already in the sched_domain
5576  *
5577  * Find the next node to include in a given scheduling domain.  Simply
5578  * finds the closest node not already in the @used_nodes map.
5579  *
5580  * Should use nodemask_t.
5581  */
5582 static int find_next_best_node(int node, unsigned long *used_nodes)
5583 {
5584         int i, n, val, min_val, best_node = 0;
5585
5586         min_val = INT_MAX;
5587
5588         for (i = 0; i < MAX_NUMNODES; i++) {
5589                 /* Start at @node */
5590                 n = (node + i) % MAX_NUMNODES;
5591
5592                 if (!nr_cpus_node(n))
5593                         continue;
5594
5595                 /* Skip already used nodes */
5596                 if (test_bit(n, used_nodes))
5597                         continue;
5598
5599                 /* Simple min distance search */
5600                 val = node_distance(node, n);
5601
5602                 if (val < min_val) {
5603                         min_val = val;
5604                         best_node = n;
5605                 }
5606         }
5607
5608         set_bit(best_node, used_nodes);
5609         return best_node;
5610 }
5611
5612 /**
5613  * sched_domain_node_span - get a cpumask for a node's sched_domain
5614  * @node: node whose cpumask we're constructing
5615  * @size: number of nodes to include in this span
5616  *
5617  * Given a node, construct a good cpumask for its sched_domain to span.  It
5618  * should be one that prevents unnecessary balancing, but also spreads tasks
5619  * out optimally.
5620  */
5621 static cpumask_t sched_domain_node_span(int node)
5622 {
5623         DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5624         cpumask_t span, nodemask;
5625         int i;
5626
5627         cpus_clear(span);
5628         bitmap_zero(used_nodes, MAX_NUMNODES);
5629
5630         nodemask = node_to_cpumask(node);
5631         cpus_or(span, span, nodemask);
5632         set_bit(node, used_nodes);
5633
5634         for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5635                 int next_node = find_next_best_node(node, used_nodes);
5636
5637                 nodemask = node_to_cpumask(next_node);
5638                 cpus_or(span, span, nodemask);
5639         }
5640
5641         return span;
5642 }
5643 #endif
5644
5645 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5646
5647 /*
5648  * SMT sched-domains:
5649  */
5650 #ifdef CONFIG_SCHED_SMT
5651 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5652 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5653
5654 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5655                             struct sched_group **sg)
5656 {
5657         if (sg)
5658                 *sg = &per_cpu(sched_group_cpus, cpu);
5659         return cpu;
5660 }
5661 #endif
5662
5663 /*
5664  * multi-core sched-domains:
5665  */
5666 #ifdef CONFIG_SCHED_MC
5667 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5668 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5669 #endif
5670
5671 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5672 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5673                              struct sched_group **sg)
5674 {
5675         int group;
5676         cpumask_t mask = cpu_sibling_map[cpu];
5677         cpus_and(mask, mask, *cpu_map);
5678         group = first_cpu(mask);
5679         if (sg)
5680                 *sg = &per_cpu(sched_group_core, group);
5681         return group;
5682 }
5683 #elif defined(CONFIG_SCHED_MC)
5684 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5685                              struct sched_group **sg)
5686 {
5687         if (sg)
5688                 *sg = &per_cpu(sched_group_core, cpu);
5689         return cpu;
5690 }
5691 #endif
5692
5693 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5694 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5695
5696 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5697                              struct sched_group **sg)
5698 {
5699         int group;
5700 #ifdef CONFIG_SCHED_MC
5701         cpumask_t mask = cpu_coregroup_map(cpu);
5702         cpus_and(mask, mask, *cpu_map);
5703         group = first_cpu(mask);
5704 #elif defined(CONFIG_SCHED_SMT)
5705         cpumask_t mask = cpu_sibling_map[cpu];
5706         cpus_and(mask, mask, *cpu_map);
5707         group = first_cpu(mask);
5708 #else
5709         group = cpu;
5710 #endif
5711         if (sg)
5712                 *sg = &per_cpu(sched_group_phys, group);
5713         return group;
5714 }
5715
5716 #ifdef CONFIG_NUMA
5717 /*
5718  * The init_sched_build_groups can't handle what we want to do with node
5719  * groups, so roll our own. Now each node has its own list of groups which
5720  * gets dynamically allocated.
5721  */
5722 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5723 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5724
5725 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5726 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5727
5728 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5729                                  struct sched_group **sg)
5730 {
5731         cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5732         int group;
5733
5734         cpus_and(nodemask, nodemask, *cpu_map);
5735         group = first_cpu(nodemask);
5736
5737         if (sg)
5738                 *sg = &per_cpu(sched_group_allnodes, group);
5739         return group;
5740 }
5741
5742 static void init_numa_sched_groups_power(struct sched_group *group_head)
5743 {
5744         struct sched_group *sg = group_head;
5745         int j;
5746
5747         if (!sg)
5748                 return;
5749 next_sg:
5750         for_each_cpu_mask(j, sg->cpumask) {
5751                 struct sched_domain *sd;
5752
5753                 sd = &per_cpu(phys_domains, j);
5754                 if (j != first_cpu(sd->groups->cpumask)) {
5755                         /*
5756                          * Only add "power" once for each
5757                          * physical package.
5758                          */
5759                         continue;
5760                 }
5761
5762                 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5763         }
5764         sg = sg->next;
5765         if (sg != group_head)
5766                 goto next_sg;
5767 }
5768 #endif
5769
5770 #ifdef CONFIG_NUMA
5771 /* Free memory allocated for various sched_group structures */
5772 static void free_sched_groups(const cpumask_t *cpu_map)
5773 {
5774         int cpu, i;
5775
5776         for_each_cpu_mask(cpu, *cpu_map) {
5777                 struct sched_group **sched_group_nodes
5778                         = sched_group_nodes_bycpu[cpu];
5779
5780                 if (!sched_group_nodes)
5781                         continue;
5782
5783                 for (i = 0; i < MAX_NUMNODES; i++) {
5784                         cpumask_t nodemask = node_to_cpumask(i);
5785                         struct sched_group *oldsg, *sg = sched_group_nodes[i];
5786
5787                         cpus_and(nodemask, nodemask, *cpu_map);
5788                         if (cpus_empty(nodemask))
5789                                 continue;
5790
5791                         if (sg == NULL)
5792                                 continue;
5793                         sg = sg->next;
5794 next_sg:
5795                         oldsg = sg;
5796                         sg = sg->next;
5797                         kfree(oldsg);
5798                         if (oldsg != sched_group_nodes[i])
5799                                 goto next_sg;
5800                 }
5801                 kfree(sched_group_nodes);
5802                 sched_group_nodes_bycpu[cpu] = NULL;
5803         }
5804 }
5805 #else
5806 static void free_sched_groups(const cpumask_t *cpu_map)
5807 {
5808 }
5809 #endif
5810
5811 /*
5812  * Initialize sched groups cpu_power.
5813  *
5814  * cpu_power indicates the capacity of sched group, which is used while
5815  * distributing the load between different sched groups in a sched domain.
5816  * Typically cpu_power for all the groups in a sched domain will be same unless
5817  * there are asymmetries in the topology. If there are asymmetries, group
5818  * having more cpu_power will pickup more load compared to the group having
5819  * less cpu_power.
5820  *
5821  * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5822  * the maximum number of tasks a group can handle in the presence of other idle
5823  * or lightly loaded groups in the same sched domain.
5824  */
5825 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5826 {
5827         struct sched_domain *child;
5828         struct sched_group *group;
5829
5830         WARN_ON(!sd || !sd->groups);
5831
5832         if (cpu != first_cpu(sd->groups->cpumask))
5833                 return;
5834
5835         child = sd->child;
5836
5837         sd->groups->__cpu_power = 0;
5838
5839         /*
5840          * For perf policy, if the groups in child domain share resources
5841          * (for example cores sharing some portions of the cache hierarchy
5842          * or SMT), then set this domain groups cpu_power such that each group
5843          * can handle only one task, when there are other idle groups in the
5844          * same sched domain.
5845          */
5846         if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5847                        (child->flags &
5848                         (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5849                 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5850                 return;
5851         }
5852
5853         /*
5854          * add cpu_power of each child group to this groups cpu_power
5855          */
5856         group = child->groups;
5857         do {
5858                 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5859                 group = group->next;
5860         } while (group != child->groups);
5861 }
5862
5863 /*
5864  * Build sched domains for a given set of cpus and attach the sched domains
5865  * to the individual cpus
5866  */
5867 static int build_sched_domains(const cpumask_t *cpu_map)
5868 {
5869         int i;
5870 #ifdef CONFIG_NUMA
5871         struct sched_group **sched_group_nodes = NULL;
5872         int sd_allnodes = 0;
5873
5874         /*
5875          * Allocate the per-node list of sched groups
5876          */
5877         sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5878                                            GFP_KERNEL);
5879         if (!sched_group_nodes) {
5880                 printk(KERN_WARNING "Can not alloc sched group node list\n");
5881                 return -ENOMEM;
5882         }
5883         sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5884 #endif
5885
5886         /*
5887          * Set up domains for cpus specified by the cpu_map.
5888          */
5889         for_each_cpu_mask(i, *cpu_map) {
5890                 struct sched_domain *sd = NULL, *p;
5891                 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5892
5893                 cpus_and(nodemask, nodemask, *cpu_map);
5894
5895 #ifdef CONFIG_NUMA
5896                 if (cpus_weight(*cpu_map) >
5897                                 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5898                         sd = &per_cpu(allnodes_domains, i);
5899                         *sd = SD_ALLNODES_INIT;
5900                         sd->span = *cpu_map;
5901                         cpu_to_allnodes_group(i, cpu_map, &sd->groups);
5902                         p = sd;
5903                         sd_allnodes = 1;
5904                 } else
5905                         p = NULL;
5906
5907                 sd = &per_cpu(node_domains, i);
5908                 *sd = SD_NODE_INIT;
5909                 sd->span = sched_domain_node_span(cpu_to_node(i));
5910                 sd->parent = p;
5911                 if (p)
5912                         p->child = sd;
5913                 cpus_and(sd->span, sd->span, *cpu_map);
5914 #endif
5915
5916                 p = sd;
5917                 sd = &per_cpu(phys_domains, i);
5918                 *sd = SD_CPU_INIT;
5919                 sd->span = nodemask;
5920                 sd->parent = p;
5921                 if (p)
5922                         p->child = sd;
5923                 cpu_to_phys_group(i, cpu_map, &sd->groups);
5924
5925 #ifdef CONFIG_SCHED_MC
5926                 p = sd;
5927                 sd = &per_cpu(core_domains, i);
5928                 *sd = SD_MC_INIT;
5929                 sd->span = cpu_coregroup_map(i);
5930                 cpus_and(sd->span, sd->span, *cpu_map);
5931                 sd->parent = p;
5932                 p->child = sd;
5933                 cpu_to_core_group(i, cpu_map, &sd->groups);
5934 #endif
5935
5936 #ifdef CONFIG_SCHED_SMT
5937                 p = sd;
5938                 sd = &per_cpu(cpu_domains, i);
5939                 *sd = SD_SIBLING_INIT;
5940                 sd->span = cpu_sibling_map[i];
5941                 cpus_and(sd->span, sd->span, *cpu_map);
5942                 sd->parent = p;
5943                 p->child = sd;
5944                 cpu_to_cpu_group(i, cpu_map, &sd->groups);
5945 #endif
5946         }
5947
5948 #ifdef CONFIG_SCHED_SMT
5949         /* Set up CPU (sibling) groups */
5950         for_each_cpu_mask(i, *cpu_map) {
5951                 cpumask_t this_sibling_map = cpu_sibling_map[i];
5952                 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5953                 if (i != first_cpu(this_sibling_map))
5954                         continue;
5955
5956                 init_sched_build_groups(this_sibling_map, cpu_map,
5957                                         &cpu_to_cpu_group);
5958         }
5959 #endif
5960
5961 #ifdef CONFIG_SCHED_MC
5962         /* Set up multi-core groups */
5963         for_each_cpu_mask(i, *cpu_map) {
5964                 cpumask_t this_core_map = cpu_coregroup_map(i);
5965                 cpus_and(this_core_map, this_core_map, *cpu_map);
5966                 if (i != first_cpu(this_core_map))
5967                         continue;
5968                 init_sched_build_groups(this_core_map, cpu_map,
5969                                         &cpu_to_core_group);
5970         }
5971 #endif
5972
5973         /* Set up physical groups */
5974         for (i = 0; i < MAX_NUMNODES; i++) {
5975                 cpumask_t nodemask = node_to_cpumask(i);
5976
5977                 cpus_and(nodemask, nodemask, *cpu_map);
5978                 if (cpus_empty(nodemask))
5979                         continue;
5980
5981                 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
5982         }
5983
5984 #ifdef CONFIG_NUMA
5985         /* Set up node groups */
5986         if (sd_allnodes)
5987                 init_sched_build_groups(*cpu_map, cpu_map,
5988                                         &cpu_to_allnodes_group);
5989
5990         for (i = 0; i < MAX_NUMNODES; i++) {
5991                 /* Set up node groups */
5992                 struct sched_group *sg, *prev;
5993                 cpumask_t nodemask = node_to_cpumask(i);
5994                 cpumask_t domainspan;
5995                 cpumask_t covered = CPU_MASK_NONE;
5996                 int j;
5997
5998                 cpus_and(nodemask, nodemask, *cpu_map);
5999                 if (cpus_empty(nodemask)) {
6000                         sched_group_nodes[i] = NULL;
6001                         continue;
6002                 }
6003
6004                 domainspan = sched_domain_node_span(i);
6005                 cpus_and(domainspan, domainspan, *cpu_map);
6006
6007                 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6008                 if (!sg) {
6009                         printk(KERN_WARNING "Can not alloc domain group for "
6010                                 "node %d\n", i);
6011                         goto error;
6012                 }
6013                 sched_group_nodes[i] = sg;
6014                 for_each_cpu_mask(j, nodemask) {
6015                         struct sched_domain *sd;
6016                         sd = &per_cpu(node_domains, j);
6017                         sd->groups = sg;
6018                 }
6019                 sg->__cpu_power = 0;
6020                 sg->cpumask = nodemask;
6021                 sg->next = sg;
6022                 cpus_or(covered, covered, nodemask);
6023                 prev = sg;
6024
6025                 for (j = 0; j < MAX_NUMNODES; j++) {
6026                         cpumask_t tmp, notcovered;
6027                         int n = (i + j) % MAX_NUMNODES;
6028
6029                         cpus_complement(notcovered, covered);
6030                         cpus_and(tmp, notcovered, *cpu_map);
6031                         cpus_and(tmp, tmp, domainspan);
6032                         if (cpus_empty(tmp))
6033                                 break;
6034
6035                         nodemask = node_to_cpumask(n);
6036                         cpus_and(tmp, tmp, nodemask);
6037                         if (cpus_empty(tmp))
6038                                 continue;
6039
6040                         sg = kmalloc_node(sizeof(struct sched_group),
6041                                           GFP_KERNEL, i);
6042                         if (!sg) {
6043                                 printk(KERN_WARNING
6044                                 "Can not alloc domain group for node %d\n", j);
6045                                 goto error;
6046                         }
6047                         sg->__cpu_power = 0;
6048                         sg->cpumask = tmp;
6049                         sg->next = prev->next;
6050                         cpus_or(covered, covered, tmp);
6051                         prev->next = sg;
6052                         prev = sg;
6053                 }
6054         }
6055 #endif
6056
6057         /* Calculate CPU power for physical packages and nodes */
6058 #ifdef CONFIG_SCHED_SMT
6059         for_each_cpu_mask(i, *cpu_map) {
6060                 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6061
6062                 init_sched_groups_power(i, sd);
6063         }
6064 #endif
6065 #ifdef CONFIG_SCHED_MC
6066         for_each_cpu_mask(i, *cpu_map) {
6067                 struct sched_domain *sd = &per_cpu(core_domains, i);
6068
6069                 init_sched_groups_power(i, sd);
6070         }
6071 #endif
6072
6073         for_each_cpu_mask(i, *cpu_map) {
6074                 struct sched_domain *sd = &per_cpu(phys_domains, i);
6075
6076                 init_sched_groups_power(i, sd);
6077         }
6078
6079 #ifdef CONFIG_NUMA
6080         for (i = 0; i < MAX_NUMNODES; i++)
6081                 init_numa_sched_groups_power(sched_group_nodes[i]);
6082
6083         if (sd_allnodes) {
6084                 struct sched_group *sg;
6085
6086                 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6087                 init_numa_sched_groups_power(sg);
6088         }
6089 #endif
6090
6091         /* Attach the domains */
6092         for_each_cpu_mask(i, *cpu_map) {
6093                 struct sched_domain *sd;
6094 #ifdef CONFIG_SCHED_SMT
6095                 sd = &per_cpu(cpu_domains, i);
6096 #elif defined(CONFIG_SCHED_MC)
6097                 sd = &per_cpu(core_domains, i);
6098 #else
6099                 sd = &per_cpu(phys_domains, i);
6100 #endif
6101                 cpu_attach_domain(sd, i);
6102         }
6103
6104         return 0;
6105
6106 #ifdef CONFIG_NUMA
6107 error:
6108         free_sched_groups(cpu_map);
6109         return -ENOMEM;
6110 #endif
6111 }
6112 /*
6113  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
6114  */
6115 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6116 {
6117         cpumask_t cpu_default_map;
6118         int err;
6119
6120         /*
6121          * Setup mask for cpus without special case scheduling requirements.
6122          * For now this just excludes isolated cpus, but could be used to
6123          * exclude other special cases in the future.
6124          */
6125         cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6126
6127         err = build_sched_domains(&cpu_default_map);
6128
6129         return err;
6130 }
6131
6132 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6133 {
6134         free_sched_groups(cpu_map);
6135 }
6136
6137 /*
6138  * Detach sched domains from a group of cpus specified in cpu_map
6139  * These cpus will now be attached to the NULL domain
6140  */
6141 static void detach_destroy_domains(const cpumask_t *cpu_map)
6142 {
6143         int i;
6144
6145         for_each_cpu_mask(i, *cpu_map)
6146                 cpu_attach_domain(NULL, i);
6147         synchronize_sched();
6148         arch_destroy_sched_domains(cpu_map);
6149 }
6150
6151 /*
6152  * Partition sched domains as specified by the cpumasks below.
6153  * This attaches all cpus from the cpumasks to the NULL domain,
6154  * waits for a RCU quiescent period, recalculates sched
6155  * domain information and then attaches them back to the
6156  * correct sched domains
6157  * Call with hotplug lock held
6158  */
6159 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6160 {
6161         cpumask_t change_map;
6162         int err = 0;
6163
6164         cpus_and(*partition1, *partition1, cpu_online_map);
6165         cpus_and(*partition2, *partition2, cpu_online_map);
6166         cpus_or(change_map, *partition1, *partition2);
6167
6168         /* Detach sched domains from all of the affected cpus */
6169         detach_destroy_domains(&change_map);
6170         if (!cpus_empty(*partition1))
6171                 err = build_sched_domains(partition1);
6172         if (!err && !cpus_empty(*partition2))
6173                 err = build_sched_domains(partition2);
6174
6175         return err;
6176 }
6177
6178 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6179 int arch_reinit_sched_domains(void)
6180 {
6181         int err;
6182
6183         mutex_lock(&sched_hotcpu_mutex);
6184         detach_destroy_domains(&cpu_online_map);
6185         err = arch_init_sched_domains(&cpu_online_map);
6186         mutex_unlock(&sched_hotcpu_mutex);
6187
6188         return err;
6189 }
6190
6191 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6192 {
6193         int ret;
6194
6195         if (buf[0] != '0' && buf[0] != '1')
6196                 return -EINVAL;
6197
6198         if (smt)
6199                 sched_smt_power_savings = (buf[0] == '1');
6200         else
6201                 sched_mc_power_savings = (buf[0] == '1');
6202
6203         ret = arch_reinit_sched_domains();
6204
6205         return ret ? ret : count;
6206 }
6207
6208 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6209 {
6210         int err = 0;
6211
6212 #ifdef CONFIG_SCHED_SMT
6213         if (smt_capable())
6214                 err = sysfs_create_file(&cls->kset.kobj,
6215                                         &attr_sched_smt_power_savings.attr);
6216 #endif
6217 #ifdef CONFIG_SCHED_MC
6218         if (!err && mc_capable())
6219                 err = sysfs_create_file(&cls->kset.kobj,
6220                                         &attr_sched_mc_power_savings.attr);
6221 #endif
6222         return err;
6223 }
6224 #endif
6225
6226 #ifdef CONFIG_SCHED_MC
6227 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6228 {
6229         return sprintf(page, "%u\n", sched_mc_power_savings);
6230 }
6231 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6232                                             const char *buf, size_t count)
6233 {
6234         return sched_power_savings_store(buf, count, 0);
6235 }
6236 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6237             sched_mc_power_savings_store);
6238 #endif
6239
6240 #ifdef CONFIG_SCHED_SMT
6241 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6242 {
6243         return sprintf(page, "%u\n", sched_smt_power_savings);
6244 }
6245 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6246                                              const char *buf, size_t count)
6247 {
6248         return sched_power_savings_store(buf, count, 1);
6249 }
6250 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6251             sched_smt_power_savings_store);
6252 #endif
6253
6254 /*
6255  * Force a reinitialization of the sched domains hierarchy.  The domains
6256  * and groups cannot be updated in place without racing with the balancing
6257  * code, so we temporarily attach all running cpus to the NULL domain
6258  * which will prevent rebalancing while the sched domains are recalculated.
6259  */
6260 static int update_sched_domains(struct notifier_block *nfb,
6261                                 unsigned long action, void *hcpu)
6262 {
6263         switch (action) {
6264         case CPU_UP_PREPARE:
6265         case CPU_UP_PREPARE_FROZEN:
6266         case CPU_DOWN_PREPARE:
6267         case CPU_DOWN_PREPARE_FROZEN:
6268                 detach_destroy_domains(&cpu_online_map);
6269                 return NOTIFY_OK;
6270
6271         case CPU_UP_CANCELED:
6272         case CPU_UP_CANCELED_FROZEN:
6273         case CPU_DOWN_FAILED:
6274         case CPU_DOWN_FAILED_FROZEN:
6275         case CPU_ONLINE:
6276         case CPU_ONLINE_FROZEN:
6277         case CPU_DEAD:
6278         case CPU_DEAD_FROZEN:
6279                 /*
6280                  * Fall through and re-initialise the domains.
6281                  */
6282                 break;
6283         default:
6284                 return NOTIFY_DONE;
6285         }
6286
6287         /* The hotplug lock is already held by cpu_up/cpu_down */
6288         arch_init_sched_domains(&cpu_online_map);
6289
6290         return NOTIFY_OK;
6291 }
6292
6293 void __init sched_init_smp(void)
6294 {
6295         cpumask_t non_isolated_cpus;
6296
6297         mutex_lock(&sched_hotcpu_mutex);
6298         arch_init_sched_domains(&cpu_online_map);
6299         cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6300         if (cpus_empty(non_isolated_cpus))
6301                 cpu_set(smp_processor_id(), non_isolated_cpus);
6302         mutex_unlock(&sched_hotcpu_mutex);
6303         /* XXX: Theoretical race here - CPU may be hotplugged now */
6304         hotcpu_notifier(update_sched_domains, 0);
6305
6306         /* Move init over to a non-isolated CPU */
6307         if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6308                 BUG();
6309         sched_init_granularity();
6310 }
6311 #else
6312 void __init sched_init_smp(void)
6313 {
6314         sched_init_granularity();
6315 }
6316 #endif /* CONFIG_SMP */
6317
6318 int in_sched_functions(unsigned long addr)
6319 {
6320         /* Linker adds these: start and end of __sched functions */
6321         extern char __sched_text_start[], __sched_text_end[];
6322
6323         return in_lock_functions(addr) ||
6324                 (addr >= (unsigned long)__sched_text_start
6325                 && addr < (unsigned long)__sched_text_end);
6326 }
6327
6328 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6329 {
6330         cfs_rq->tasks_timeline = RB_ROOT;
6331         cfs_rq->fair_clock = 1;
6332 #ifdef CONFIG_FAIR_GROUP_SCHED
6333         cfs_rq->rq = rq;
6334 #endif
6335 }
6336
6337 void __init sched_init(void)
6338 {
6339         u64 now = sched_clock();
6340         int highest_cpu = 0;
6341         int i, j;
6342
6343         /*
6344          * Link up the scheduling class hierarchy:
6345          */
6346         rt_sched_class.next = &fair_sched_class;
6347         fair_sched_class.next = &idle_sched_class;
6348         idle_sched_class.next = NULL;
6349
6350         for_each_possible_cpu(i) {
6351                 struct rt_prio_array *array;
6352                 struct rq *rq;
6353
6354                 rq = cpu_rq(i);
6355                 spin_lock_init(&rq->lock);
6356                 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6357                 rq->nr_running = 0;
6358                 rq->clock = 1;
6359                 init_cfs_rq(&rq->cfs, rq);
6360 #ifdef CONFIG_FAIR_GROUP_SCHED
6361                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6362                 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6363 #endif
6364                 rq->ls.load_update_last = now;
6365                 rq->ls.load_update_start = now;
6366
6367                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6368                         rq->cpu_load[j] = 0;
6369 #ifdef CONFIG_SMP
6370                 rq->sd = NULL;
6371                 rq->active_balance = 0;
6372                 rq->next_balance = jiffies;
6373                 rq->push_cpu = 0;
6374                 rq->cpu = i;
6375                 rq->migration_thread = NULL;
6376                 INIT_LIST_HEAD(&rq->migration_queue);
6377 #endif
6378                 atomic_set(&rq->nr_iowait, 0);
6379
6380                 array = &rq->rt.active;
6381                 for (j = 0; j < MAX_RT_PRIO; j++) {
6382                         INIT_LIST_HEAD(array->queue + j);
6383                         __clear_bit(j, array->bitmap);
6384                 }
6385                 highest_cpu = i;
6386                 /* delimiter for bitsearch: */
6387                 __set_bit(MAX_RT_PRIO, array->bitmap);
6388         }
6389
6390         set_load_weight(&init_task);
6391
6392 #ifdef CONFIG_SMP
6393         nr_cpu_ids = highest_cpu + 1;
6394         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6395 #endif
6396
6397 #ifdef CONFIG_RT_MUTEXES
6398         plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6399 #endif
6400
6401         /*
6402          * The boot idle thread does lazy MMU switching as well:
6403          */
6404         atomic_inc(&init_mm.mm_count);
6405         enter_lazy_tlb(&init_mm, current);
6406
6407         /*
6408          * Make us the idle thread. Technically, schedule() should not be
6409          * called from this thread, however somewhere below it might be,
6410          * but because we are the idle thread, we just pick up running again
6411          * when this runqueue becomes "idle".
6412          */
6413         init_idle(current, smp_processor_id());
6414         /*
6415          * During early bootup we pretend to be a normal task:
6416          */
6417         current->sched_class = &fair_sched_class;
6418 }
6419
6420 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6421 void __might_sleep(char *file, int line)
6422 {
6423 #ifdef in_atomic
6424         static unsigned long prev_jiffy;        /* ratelimiting */
6425
6426         if ((in_atomic() || irqs_disabled()) &&
6427             system_state == SYSTEM_RUNNING && !oops_in_progress) {
6428                 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6429                         return;
6430                 prev_jiffy = jiffies;
6431                 printk(KERN_ERR "BUG: sleeping function called from invalid"
6432                                 " context at %s:%d\n", file, line);
6433                 printk("in_atomic():%d, irqs_disabled():%d\n",
6434                         in_atomic(), irqs_disabled());
6435                 debug_show_held_locks(current);
6436                 if (irqs_disabled())
6437                         print_irqtrace_events(current);
6438                 dump_stack();
6439         }
6440 #endif
6441 }
6442 EXPORT_SYMBOL(__might_sleep);
6443 #endif
6444
6445 #ifdef CONFIG_MAGIC_SYSRQ
6446 void normalize_rt_tasks(void)
6447 {
6448         struct task_struct *g, *p;
6449         unsigned long flags;
6450         struct rq *rq;
6451         int on_rq;
6452
6453         read_lock_irq(&tasklist_lock);
6454         do_each_thread(g, p) {
6455                 p->se.fair_key                  = 0;
6456                 p->se.wait_runtime              = 0;
6457                 p->se.wait_start_fair           = 0;
6458                 p->se.wait_start                = 0;
6459                 p->se.exec_start                = 0;
6460                 p->se.sleep_start               = 0;
6461                 p->se.sleep_start_fair          = 0;
6462                 p->se.block_start               = 0;
6463                 task_rq(p)->cfs.fair_clock      = 0;
6464                 task_rq(p)->clock               = 0;
6465
6466                 if (!rt_task(p)) {
6467                         /*
6468                          * Renice negative nice level userspace
6469                          * tasks back to 0:
6470                          */
6471                         if (TASK_NICE(p) < 0 && p->mm)
6472                                 set_user_nice(p, 0);
6473                         continue;
6474                 }
6475
6476                 spin_lock_irqsave(&p->pi_lock, flags);
6477                 rq = __task_rq_lock(p);
6478 #ifdef CONFIG_SMP
6479                 /*
6480                  * Do not touch the migration thread:
6481                  */
6482                 if (p == rq->migration_thread)
6483                         goto out_unlock;
6484 #endif
6485
6486                 on_rq = p->se.on_rq;
6487                 if (on_rq)
6488                         deactivate_task(task_rq(p), p, 0);
6489                 __setscheduler(rq, p, SCHED_NORMAL, 0);
6490                 if (on_rq) {
6491                         activate_task(task_rq(p), p, 0);
6492                         resched_task(rq->curr);
6493                 }
6494 #ifdef CONFIG_SMP
6495  out_unlock:
6496 #endif
6497                 __task_rq_unlock(rq);
6498                 spin_unlock_irqrestore(&p->pi_lock, flags);
6499         } while_each_thread(g, p);
6500
6501         read_unlock_irq(&tasklist_lock);
6502 }
6503
6504 #endif /* CONFIG_MAGIC_SYSRQ */
6505
6506 #ifdef CONFIG_IA64
6507 /*
6508  * These functions are only useful for the IA64 MCA handling.
6509  *
6510  * They can only be called when the whole system has been
6511  * stopped - every CPU needs to be quiescent, and no scheduling
6512  * activity can take place. Using them for anything else would
6513  * be a serious bug, and as a result, they aren't even visible
6514  * under any other configuration.
6515  */
6516
6517 /**
6518  * curr_task - return the current task for a given cpu.
6519  * @cpu: the processor in question.
6520  *
6521  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6522  */
6523 struct task_struct *curr_task(int cpu)
6524 {
6525         return cpu_curr(cpu);
6526 }
6527
6528 /**
6529  * set_curr_task - set the current task for a given cpu.
6530  * @cpu: the processor in question.
6531  * @p: the task pointer to set.
6532  *
6533  * Description: This function must only be used when non-maskable interrupts
6534  * are serviced on a separate stack.  It allows the architecture to switch the
6535  * notion of the current task on a cpu in a non-blocking manner.  This function
6536  * must be called with all CPU's synchronized, and interrupts disabled, the
6537  * and caller must save the original value of the current task (see
6538  * curr_task() above) and restore that value before reenabling interrupts and
6539  * re-starting the system.
6540  *
6541  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6542  */
6543 void set_curr_task(int cpu, struct task_struct *p)
6544 {
6545         cpu_curr(cpu) = p;
6546 }
6547
6548 #endif