sched: pull RT tasks from overloaded runqueues
[linux-2.6-block.git] / kernel / sched_rt.c
/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

#ifdef CONFIG_SMP
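/*
 * RT overload tracking: a runqueue is flagged as overloaded once it
 * holds more than one runnable RT task (see inc_rt_tasks() below).
 * rt_overload_mask records which CPUs are overloaded and rto_count
 * caches how many, so rt_overloaded() stays a cheap atomic read.
 */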
static cpumask_t rt_overload_mask;
static atomic_t rto_count;

static inline int rt_overloaded(void)
{
	return atomic_read(&rto_count);
}

static inline cpumask_t *rt_overload(void)
{
	return &rt_overload_mask;
}

static inline void rt_set_overload(struct rq *rq)
{
	cpu_set(rq->cpu, rt_overload_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 */
	wmb();
	atomic_inc(&rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	/* the order here really doesn't matter */
	atomic_dec(&rto_count);
	cpu_clear(rq->cpu, rt_overload_mask);
}
#endif /* CONFIG_SMP */

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	u64 delta_exec;

	if (!task_has_rt_policy(curr))
		return;

	delta_exec = rq->clock - curr->se.exec_start;
	if (unlikely((s64)delta_exec < 0))
		delta_exec = 0;

	schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));

	curr->se.sum_exec_runtime += delta_exec;
	curr->se.exec_start = rq->clock;
	cpuacct_charge(curr, delta_exec);
}

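/*
 * inc_rt_tasks()/dec_rt_tasks() keep the per-runqueue RT bookkeeping
 * in sync: rt_nr_running, the cached highest RT priority and, on SMP,
 * the overload state used by the push/pull logic below.
 */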
static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
{
	WARN_ON(!rt_task(p));
	rq->rt.rt_nr_running++;
#ifdef CONFIG_SMP
	if (p->prio < rq->rt.highest_prio)
		rq->rt.highest_prio = p->prio;
	if (rq->rt.rt_nr_running > 1)
		rt_set_overload(rq);
#endif /* CONFIG_SMP */
}

static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
{
	WARN_ON(!rt_task(p));
	WARN_ON(!rq->rt.rt_nr_running);
	rq->rt.rt_nr_running--;
#ifdef CONFIG_SMP
	if (rq->rt.rt_nr_running) {
		struct rt_prio_array *array;

		WARN_ON(p->prio < rq->rt.highest_prio);
		if (p->prio == rq->rt.highest_prio) {
			/* recalculate */
			array = &rq->rt.active;
			rq->rt.highest_prio =
				sched_find_first_bit(array->bitmap);
		} /* otherwise leave rq->rt.highest_prio alone */
	} else
		rq->rt.highest_prio = MAX_RT_PRIO;
	if (rq->rt.rt_nr_running < 2)
		rt_clear_overload(rq);
#endif /* CONFIG_SMP */
}

static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
{
	struct rt_prio_array *array = &rq->rt.active;

	list_add_tail(&p->run_list, array->queue + p->prio);
	__set_bit(p->prio, array->bitmap);
	inc_cpu_load(rq, p->se.load.weight);

	inc_rt_tasks(p, rq);
}

/*
 * Adding/removing a task to/from a priority array:
 */
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
{
	struct rt_prio_array *array = &rq->rt.active;

	update_curr_rt(rq);

	list_del(&p->run_list);
	if (list_empty(array->queue + p->prio))
		__clear_bit(p->prio, array->bitmap);
	dec_cpu_load(rq, p->se.load.weight);

	dec_rt_tasks(p, rq);
}

/*
 * Put task to the end of the run list without the overhead of dequeue
 * followed by enqueue.
 */
static void requeue_task_rt(struct rq *rq, struct task_struct *p)
{
	struct rt_prio_array *array = &rq->rt.active;

	list_move_tail(&p->run_list, array->queue + p->prio);
}

static void
yield_task_rt(struct rq *rq)
{
	requeue_task_rt(rq, rq->curr);
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
{
	if (p->prio < rq->curr->prio)
		resched_task(rq->curr);
}

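/*
 * Pick the highest-priority queued RT task, or NULL if the RT
 * priority array is empty.
 */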
static struct task_struct *pick_next_task_rt(struct rq *rq)
{
	struct rt_prio_array *array = &rq->rt.active;
	struct task_struct *next;
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	if (idx >= MAX_RT_PRIO)
		return NULL;

	queue = array->queue + idx;
	next = list_entry(queue->next, struct task_struct, run_list);

	next->se.exec_start = rq->clock;

	return next;
}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
	update_curr_rt(rq);
	p->se.exec_start = 0;
}

#ifdef CONFIG_SMP
/* Only try algorithms three times */
#define RT_MAX_TRIES 3

static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);

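/*
 * A task is a candidate for migration if it is not currently running
 * and either cpu is -1 (any CPU will do) or cpu is in the task's
 * affinity mask.
 */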
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
	if (!task_running(rq, p) &&
	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)))
		return 1;
	return 0;
}

/* Return the second highest RT task, NULL otherwise */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
						      int cpu)
{
	struct rt_prio_array *array = &rq->rt.active;
	struct task_struct *next;
	struct list_head *queue;
	int idx;

	assert_spin_locked(&rq->lock);

	if (likely(rq->rt.rt_nr_running < 2))
		return NULL;

	idx = sched_find_first_bit(array->bitmap);
	if (unlikely(idx >= MAX_RT_PRIO)) {
		WARN_ON(1); /* rt_nr_running is bad */
		return NULL;
	}

	queue = array->queue + idx;
	BUG_ON(list_empty(queue));

	next = list_entry(queue->next, struct task_struct, run_list);
	if (unlikely(pick_rt_task(rq, next, cpu)))
		goto out;

	if (queue->next->next != queue) {
		/* same prio task */
		next = list_entry(queue->next->next, struct task_struct, run_list);
		if (pick_rt_task(rq, next, cpu))
			goto out;
	}

 retry:
	/* slower, but more flexible */
	idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
	if (unlikely(idx >= MAX_RT_PRIO))
		return NULL;

	queue = array->queue + idx;
	BUG_ON(list_empty(queue));

	list_for_each_entry(next, queue, run_list) {
		if (pick_rt_task(rq, next, cpu))
			goto out;
	}

	goto retry;

 out:
	return next;
}

static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);

/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task,
				      struct rq *this_rq)
{
	struct rq *lowest_rq = NULL;
	int cpu;
	int tries;
	cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);

	cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		/*
		 * Scan each rq for the lowest prio.
		 */
		for_each_cpu_mask(cpu, *cpu_mask) {
			struct rq *rq = &per_cpu(runqueues, cpu);

			if (cpu == this_rq->cpu)
				continue;

			/* We look for lowest RT prio or non-rt CPU */
			if (rq->rt.highest_prio >= MAX_RT_PRIO) {
				lowest_rq = rq;
				break;
			}

			/* no locking for now */
			if (rq->rt.highest_prio > task->prio &&
			    (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
				lowest_rq = rq;
			}
		}

		if (!lowest_rq)
			break;

		/* if the prio of this runqueue changed, try again */
		if (double_lock_balance(this_rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In the meantime,
			 * the task could have migrated already or had its
			 * affinity changed. Also make sure that it wasn't
			 * scheduled on its rq.
			 */
			if (unlikely(task_rq(task) != this_rq ||
				     !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
				     task_running(this_rq, task) ||
				     !task->se.on_rq)) {
				spin_unlock(&lowest_rq->lock);
				lowest_rq = NULL;
				break;
			}
		}

		/* If this rq is still suitable use it. */
		if (lowest_rq->rt.highest_prio > task->prio)
			break;

		/* try again */
		spin_unlock(&lowest_rq->lock);
		lowest_rq = NULL;
	}

	return lowest_rq;
}

/*
 * If the current CPU has more than one RT task, see if the non-running
 * task can migrate over to a CPU that is running a task of lesser
 * priority.
 */
static int push_rt_task(struct rq *this_rq)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;
	int ret = 0;
	int paranoid = RT_MAX_TRIES;

	assert_spin_locked(&this_rq->lock);

	next_task = pick_next_highest_task_rt(this_rq, -1);
	if (!next_task)
		return 0;

 retry:
	if (unlikely(next_task == this_rq->curr)) {
		WARN_ON(1);
		return 0;
	}

	/*
	 * It's possible that next_task slipped in at a higher priority
	 * than current. If that's the case just reschedule current.
	 */
	if (unlikely(next_task->prio < this_rq->curr->prio)) {
		resched_task(this_rq->curr);
		return 0;
	}

	/* We might release this_rq lock */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, this_rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq releases this_rq->lock,
		 * so it is possible that next_task has changed.
		 * If it has, then try again.
		 */
		task = pick_next_highest_task_rt(this_rq, -1);
		if (unlikely(task != next_task) && task && paranoid--) {
			put_task_struct(next_task);
			next_task = task;
			goto retry;
		}
		goto out;
	}

	assert_spin_locked(&lowest_rq->lock);

	deactivate_task(this_rq, next_task, 0);
	set_task_cpu(next_task, lowest_rq->cpu);
	activate_task(lowest_rq, next_task, 0);

	resched_task(lowest_rq->curr);

	spin_unlock(&lowest_rq->lock);

	ret = 1;
out:
	put_task_struct(next_task);

	return ret;
}

/*
 * TODO: Currently we just use the second highest prio task on
 *	 the queue, and stop when it can't migrate (or there are
 *	 no more RT tasks). There may be a case where a lower
 *	 priority RT task has a different affinity than the
 *	 higher RT task. In this case the lower RT task could
 *	 possibly be able to migrate whereas the higher priority
 *	 RT task could not. We currently ignore this issue.
 *	 Enhancements are welcome!
 */
static void push_rt_tasks(struct rq *rq)
{
	/* push_rt_task will return true if it moved an RT task */
	while (push_rt_task(rq))
		;
}

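/*
 * Try to pull RT tasks of higher priority than our next task from
 * runqueues marked in the RT overload mask. Returns nonzero if the
 * caller should re-pick its next task, either because a task was
 * pulled or because this_rq's lock was dropped and its next RT task
 * changed underneath us.
 */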
static int pull_rt_task(struct rq *this_rq)
{
	struct task_struct *next;
	struct task_struct *p;
	struct rq *src_rq;
	cpumask_t *rto_cpumask;
	int this_cpu = this_rq->cpu;
	int cpu;
	int ret = 0;

	assert_spin_locked(&this_rq->lock);

	/*
	 * If cpusets are used, and we have overlapping
	 * run queue cpusets, then this algorithm may not catch all.
	 * This is just the price you pay on trying to keep
	 * dirtying caches down on large SMP machines.
	 */
	if (likely(!rt_overloaded()))
		return 0;

	next = pick_next_task_rt(this_rq);

	rto_cpumask = rt_overload();

	for_each_cpu_mask(cpu, *rto_cpumask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);
		if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
			/*
			 * It is possible that overlapping cpusets
			 * will miss clearing a non-overloaded runqueue.
			 * Clear it now.
			 */
			if (double_lock_balance(this_rq, src_rq)) {
				/* unlocked our runqueue lock */
				struct task_struct *old_next = next;
				next = pick_next_task_rt(this_rq);
				if (next != old_next)
					ret = 1;
			}
			if (likely(src_rq->rt.rt_nr_running <= 1))
				/*
				 * Small chance that this_rq->curr changed
				 * but it's really harmless here.
				 */
				rt_clear_overload(this_rq);
			else
				/*
				 * Heh, the src_rq is now overloaded; since
				 * we already have the src_rq lock, go straight
				 * to pulling tasks from it.
				 */
				goto try_pulling;
			spin_unlock(&src_rq->lock);
			continue;
		}

		/*
		 * We can potentially drop this_rq's lock in
		 * double_lock_balance, and another CPU could
		 * steal our next task - hence we must cause
		 * the caller to recalculate the next task
		 * in that case:
		 */
		if (double_lock_balance(this_rq, src_rq)) {
			struct task_struct *old_next = next;
			next = pick_next_task_rt(this_rq);
			if (next != old_next)
				ret = 1;
		}

		/*
		 * Are there still pullable RT tasks?
		 */
		if (src_rq->rt.rt_nr_running <= 1) {
			spin_unlock(&src_rq->lock);
			continue;
		}

 try_pulling:
		p = pick_next_highest_task_rt(src_rq, this_cpu);

		/*
		 * Do we have an RT task that preempts
		 * the to-be-scheduled task?
		 */
		if (p && (!next || (p->prio < next->prio))) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!p->se.on_rq);

			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its cpu.
			 * This is just that p is waking up and hasn't
			 * had a chance to schedule. We only pull
			 * p if it is lower in priority than the
			 * current task on the run queue or
			 * this_rq's next task is lower in prio than
			 * the current task on that rq.
			 */
			if (p->prio < src_rq->curr->prio ||
			    (next && next->prio < src_rq->curr->prio))
				goto bail;

			ret = 1;

			deactivate_task(src_rq, p, 0);
			set_task_cpu(p, this_cpu);
			activate_task(this_rq, p, 0);
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task
			 * in another runqueue. (low likelihood
			 * but possible)
			 */

			/*
			 * Update next so that we won't pick a task
			 * on another cpu with a priority lower (or equal)
			 * than the one we just picked.
			 */
			next = p;

		}
 bail:
		spin_unlock(&src_rq->lock);
	}

	return ret;
}

static void schedule_balance_rt(struct rq *rq,
				struct task_struct *prev)
{
	/* Try to pull RT tasks here if we lower this rq's prio */
	if (unlikely(rt_task(prev)) &&
	    rq->rt.highest_prio > prev->prio)
		pull_rt_task(rq);
}

static void schedule_tail_balance_rt(struct rq *rq)
{
	/*
	 * If we have more than one rt_task queued, then
	 * see if we can push the other rt_tasks off to other CPUs.
	 * Note we may release the rq lock, and since
	 * the lock was owned by prev, we need to release it
	 * first via finish_lock_switch and then reacquire it here.
	 */
	if (unlikely(rq->rt.rt_nr_running > 1)) {
		spin_lock_irq(&rq->lock);
		push_rt_tasks(rq);
		spin_unlock_irq(&rq->lock);
	}
}

/*
 * Load-balancing iterator. Note: while the runqueue stays locked
 * during the whole iteration, the current task might be
 * dequeued so the iterator has to be dequeue-safe. Here we
 * achieve that by always pre-iterating before returning
 * the current task:
 */
static struct task_struct *load_balance_start_rt(void *arg)
{
	struct rq *rq = arg;
	struct rt_prio_array *array = &rq->rt.active;
	struct list_head *head, *curr;
	struct task_struct *p;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	if (idx >= MAX_RT_PRIO)
		return NULL;

	head = array->queue + idx;
	curr = head->prev;

	p = list_entry(curr, struct task_struct, run_list);

	curr = curr->prev;

	rq->rt.rt_load_balance_idx = idx;
	rq->rt.rt_load_balance_head = head;
	rq->rt.rt_load_balance_curr = curr;

	return p;
}

static struct task_struct *load_balance_next_rt(void *arg)
{
	struct rq *rq = arg;
	struct rt_prio_array *array = &rq->rt.active;
	struct list_head *head, *curr;
	struct task_struct *p;
	int idx;

	idx = rq->rt.rt_load_balance_idx;
	head = rq->rt.rt_load_balance_head;
	curr = rq->rt.rt_load_balance_curr;

	/*
	 * If we arrived back to the head again then
	 * iterate to the next queue (if any):
	 */
	if (unlikely(head == curr)) {
		int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);

		if (next_idx >= MAX_RT_PRIO)
			return NULL;

		idx = next_idx;
		head = array->queue + idx;
		curr = head->prev;

		rq->rt.rt_load_balance_idx = idx;
		rq->rt.rt_load_balance_head = head;
	}

	p = list_entry(curr, struct task_struct, run_list);

	curr = curr->prev;

	rq->rt.rt_load_balance_curr = curr;

	return p;
}

static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
		unsigned long max_load_move,
		struct sched_domain *sd, enum cpu_idle_type idle,
		int *all_pinned, int *this_best_prio)
{
	struct rq_iterator rt_rq_iterator;

	rt_rq_iterator.start = load_balance_start_rt;
	rt_rq_iterator.next = load_balance_next_rt;
	/*
	 * Pass the 'busiest' rq argument into the
	 * load_balance_[start|next]_rt iterators.
	 */
	rt_rq_iterator.arg = busiest;

	return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
			     idle, all_pinned, this_best_prio, &rt_rq_iterator);
}

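/* Same iterators as load_balance_rt(), but move at most one task. */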
static int
move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
		 struct sched_domain *sd, enum cpu_idle_type idle)
{
	struct rq_iterator rt_rq_iterator;

	rt_rq_iterator.start = load_balance_start_rt;
	rt_rq_iterator.next = load_balance_next_rt;
	rt_rq_iterator.arg = busiest;

	return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
				  &rt_rq_iterator);
}
#else /* CONFIG_SMP */
# define schedule_tail_balance_rt(rq)	do { } while (0)
# define schedule_balance_rt(rq, prev)	do { } while (0)
#endif /* CONFIG_SMP */

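/*
 * Scheduler-tick handling for RT tasks: update runtime statistics and
 * handle SCHED_RR timeslice expiry.
 */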
static void task_tick_rt(struct rq *rq, struct task_struct *p)
{
	update_curr_rt(rq);

	/*
	 * RR tasks need a special form of timeslice management.
	 * FIFO tasks have no timeslices.
	 */
	if (p->policy != SCHED_RR)
		return;

	if (--p->time_slice)
		return;

	p->time_slice = DEF_TIMESLICE;

	/*
	 * Requeue to the end of queue if we are not the only element
	 * on the queue:
	 */
	if (p->run_list.prev != p->run_list.next) {
		requeue_task_rt(rq, p);
		set_tsk_need_resched(p);
	}
}

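/* Restart the runtime-accounting window for the current task. */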
static void set_curr_task_rt(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	p->se.exec_start = rq->clock;
}

const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,

#ifdef CONFIG_SMP
	.load_balance		= load_balance_rt,
	.move_one_task		= move_one_task_rt,
#endif

	.set_curr_task		= set_curr_task_rt,
	.task_tick		= task_tick_rt,
};