Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-2.6-block.git] / kernel / sched / fair.c
index d59307ecd67d2b197a8ca181bc816b2eca23e767..ee271bb661cc923dfa67ae5d5c45a18c71df7cb1 100644 (file)
@@ -1393,6 +1393,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
        int last_cpupid, this_cpupid;
 
        this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+       last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+
+       /*
+        * Allow first faults or private faults to migrate immediately early in
+        * the lifetime of a task. The magic number 4 is based on waiting for
+        * two full passes of the "multi-stage node selection" test that is
+        * executed below.
+        */
+       if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+           (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
+               return true;
 
        /*
         * Multi-stage node selection is used in conjunction with a periodic
@@ -1411,7 +1422,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
         * This quadric squishes small probabilities, making it less likely we
         * act on an unlikely task<->page relation.
         */
-       last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
        if (!cpupid_pid_unset(last_cpupid) &&
                                cpupid_to_nid(last_cpupid) != dst_nid)
                return false;
@@ -3993,7 +4003,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         * put back on, and if we advance min_vruntime, we'll be placed back
         * further than we started -- ie. we'll be penalized.
         */
-       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
                update_min_vruntime(cfs_rq);
 }
 
@@ -4468,9 +4478,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
        /*
         * Add to the _head_ of the list, so that an already-started
-        * distribute_cfs_runtime will not see us
+        * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+        * not running add to the tail so that later runqueues don't get starved.
         */
-       list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       if (cfs_b->distribute_running)
+               list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       else
+               list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 
        /*
         * If we're the first throttled task, make sure the bandwidth
@@ -4614,14 +4628,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
         * in us over-using our runtime if it is all used during this loop, but
         * only by limited amounts in that extreme case.
         */
-       while (throttled && cfs_b->runtime > 0) {
+       while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
                runtime = cfs_b->runtime;
+               cfs_b->distribute_running = 1;
                raw_spin_unlock(&cfs_b->lock);
                /* we can't nest cfs_b->lock while distributing bandwidth */
                runtime = distribute_cfs_runtime(cfs_b, runtime,
                                                 runtime_expires);
                raw_spin_lock(&cfs_b->lock);
 
+               cfs_b->distribute_running = 0;
                throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
                cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4732,6 +4748,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
        /* confirm we're still not at a refresh boundary */
        raw_spin_lock(&cfs_b->lock);
+       if (cfs_b->distribute_running) {
+               raw_spin_unlock(&cfs_b->lock);
+               return;
+       }
+
        if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
                raw_spin_unlock(&cfs_b->lock);
                return;
@@ -4741,6 +4762,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
                runtime = cfs_b->runtime;
 
        expires = cfs_b->runtime_expires;
+       if (runtime)
+               cfs_b->distribute_running = 1;
+
        raw_spin_unlock(&cfs_b->lock);
 
        if (!runtime)
@@ -4751,6 +4775,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        raw_spin_lock(&cfs_b->lock);
        if (expires == cfs_b->runtime_expires)
                cfs_b->runtime -= min(runtime, cfs_b->runtime);
+       cfs_b->distribute_running = 0;
        raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -4859,6 +4884,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
        cfs_b->period_timer.function = sched_cfs_period_timer;
        hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        cfs_b->slack_timer.function = sched_cfs_slack_timer;
+       cfs_b->distribute_running = 0;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)