Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

[linux-2.6-block.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index d59307ecd67d2b197a8ca181bc816b2eca23e767..ee271bb661cc923dfa67ae5d5c45a18c71df7cb1 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1393,6 +1393,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
         int last_cpupid, this_cpupid;
  
         this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+       last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+
+       /*
+        * Allow first faults or private faults to migrate immediately early in
+        * the lifetime of a task. The magic number 4 is based on waiting for
+        * two full passes of the "multi-stage node selection" test that is
+        * executed below.
+        */
+       if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+           (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
+               return true;
  
         /*
          * Multi-stage node selection is used in conjunction with a periodic
@@ -1411,7 +1422,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
          * This quadric squishes small probabilities, making it less likely we
          * act on an unlikely task<->page relation.
          */
-       last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
         if (!cpupid_pid_unset(last_cpupid) &&
                                 cpupid_to_nid(last_cpupid) != dst_nid)
                 return false;
@@ -3993,7 +4003,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          * put back on, and if we advance min_vruntime, we'll be placed back
          * further than we started -- ie. we'll be penalized.
          */
-       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
                 update_min_vruntime(cfs_rq);
  }
  
@@ -4468,9 +4478,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
  
         /*
          * Add to the _head_ of the list, so that an already-started
-        * distribute_cfs_runtime will not see us
+        * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+        * not running add to the tail so that later runqueues don't get starved.
          */
-       list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       if (cfs_b->distribute_running)
+               list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       else
+               list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
  
         /*
          * If we're the first throttled task, make sure the bandwidth
@@ -4614,14 +4628,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
          * in us over-using our runtime if it is all used during this loop, but
          * only by limited amounts in that extreme case.
          */
-       while (throttled && cfs_b->runtime > 0) {
+       while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
                 runtime = cfs_b->runtime;
+               cfs_b->distribute_running = 1;
                 raw_spin_unlock(&cfs_b->lock);
                 /* we can't nest cfs_b->lock while distributing bandwidth */
                 runtime = distribute_cfs_runtime(cfs_b, runtime,
                                                  runtime_expires);
                 raw_spin_lock(&cfs_b->lock);
  
+               cfs_b->distribute_running = 0;
                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
  
                 cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4732,6 +4748,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
  
         /* confirm we're still not at a refresh boundary */
         raw_spin_lock(&cfs_b->lock);
+       if (cfs_b->distribute_running) {
+               raw_spin_unlock(&cfs_b->lock);
+               return;
+       }
+
         if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
                 raw_spin_unlock(&cfs_b->lock);
                 return;
@@ -4741,6 +4762,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
                 runtime = cfs_b->runtime;
  
         expires = cfs_b->runtime_expires;
+       if (runtime)
+               cfs_b->distribute_running = 1;
+
         raw_spin_unlock(&cfs_b->lock);
  
         if (!runtime)
@@ -4751,6 +4775,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
         raw_spin_lock(&cfs_b->lock);
         if (expires == cfs_b->runtime_expires)
                 cfs_b->runtime -= min(runtime, cfs_b->runtime);
+       cfs_b->distribute_running = 0;
         raw_spin_unlock(&cfs_b->lock);
  }
  
@@ -4859,6 +4884,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
         cfs_b->period_timer.function = sched_cfs_period_timer;
         hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         cfs_b->slack_timer.function = sched_cfs_slack_timer;
+       cfs_b->distribute_running = 0;
  }
  
  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)