[linux-2.6-block.git] / kernel / sched / idle.c

/*
 * Generic entry point for the idle threads
 */
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
#include <linux/cpuhotplug.h>
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
#include <linux/suspend.h>
#include <linux/livepatch.h>

#include <asm/tlb.h>

#include <trace/events/power.h>

#include "sched.h"

/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];

/**
 * sched_idle_set_state - Record idle state for the current CPU.
 * @idle_state: State to record.
 *
 * Thin wrapper: hands @idle_state to idle_set_state() together with the
 * runqueue obtained from this_rq().
 */
void sched_idle_set_state(struct cpuidle_state *idle_state)
{
	idle_set_state(this_rq(), idle_state);
}

/*
 * Count of outstanding "force polling" requests. While it is non-zero,
 * do_idle() takes the cpu_idle_poll() path instead of cpuidle_idle_call().
 */
static int __read_mostly cpu_idle_force_poll;

/**
 * cpu_idle_poll_ctrl - Take or drop a request to force polling idle.
 * @enable: true adds one request, false removes one.
 *
 * Requests nest: every call with @enable == true is expected to be matched
 * by one with @enable == false. Dropping more requests than were taken
 * drives the counter negative and triggers a one-time warning.
 */
void cpu_idle_poll_ctrl(bool enable)
{
	if (!enable) {
		cpu_idle_force_poll--;
		WARN_ON_ONCE(cpu_idle_force_poll < 0);
		return;
	}

	cpu_idle_force_poll++;
}

#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
/*
 * "nohlt" boot option: force the idle loop into polling mode by setting
 * cpu_idle_force_poll to 1. The option's argument is ignored.
 * Always returns 1.
 */
static int __init cpu_idle_poll_setup(char *__unused)
{
	cpu_idle_force_poll = 1;
	return 1;
}
__setup("nohlt", cpu_idle_poll_setup);

/*
 * "hlt" boot option: the counterpart of "nohlt"; resets cpu_idle_force_poll
 * to 0 so the idle loop is not forced to poll. The option's argument is
 * ignored. Always returns 1.
 */
static int __init cpu_idle_nopoll_setup(char *__unused)
{
	cpu_idle_force_poll = 0;
	return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
#endif

/*
 * Polling idle: instead of entering an idle state, spin on cpu_relax()
 * with local interrupts enabled.
 *
 * The spin is bracketed by rcu_idle_enter()/rcu_idle_exit(), by the cpu_idle
 * trace events (state 0 on entry, PWR_EVENT_EXIT on exit) and by
 * stop/start_critical_timings().
 *
 * Always returns 1.
 */
static noinline int __cpuidle cpu_idle_poll(void)
{
	rcu_idle_enter();
	trace_cpu_idle_rcuidle(0, smp_processor_id());
	local_irq_enable();
	stop_critical_timings();
	/*
	 * Keep spinning until a reschedule is requested, or until neither
	 * forced polling nor an expired tick broadcast asks us to poll.
	 */
	while (!tif_need_resched() &&
		(cpu_idle_force_poll || tick_check_broadcast_expired()))
		cpu_relax();
	start_critical_timings();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	rcu_idle_exit();
	return 1;
}

/*
 * Weak implementations for optional arch specific functions.
 *
 * The defaults below do nothing. They mark the points at which the generic
 * idle code in this file calls out to the architecture.
 */
void __weak arch_cpu_idle_prepare(void) { }	/* once, from cpu_startup_entry() */
void __weak arch_cpu_idle_enter(void) { }	/* in do_idle(), before the idle routine */
void __weak arch_cpu_idle_exit(void) { }	/* in do_idle(), after the idle routine */
void __weak arch_cpu_idle_dead(void) { }	/* in do_idle(), when the CPU is offline */
/*
 * Weak default for arch_cpu_idle(), reached through default_idle_call().
 *
 * It sets cpu_idle_force_poll so that later passes of do_idle() take the
 * cpu_idle_poll() path, then re-enables local interrupts before returning.
 */
void __weak arch_cpu_idle(void)
{
	cpu_idle_force_poll = 1;
	local_irq_enable();
}

/**
 * default_idle_call - Default CPU idle routine.
 *
 * To use when the cpuidle framework cannot be used.
 *
 * Clears the polling state and re-checks for a pending reschedule. If one is
 * pending, only local interrupts are re-enabled. Otherwise arch_cpu_idle()
 * is invoked with critical-section timing stopped around it.
 */
void __cpuidle default_idle_call(void)
{
	/* Work showed up while we were getting here: do not idle at all. */
	if (current_clr_polling_and_test()) {
		local_irq_enable();
		return;
	}

	stop_critical_timings();
	arch_cpu_idle();
	start_critical_timings();
}

/*
 * Enter idle state @next_state of @drv on @dev unless a reschedule is
 * already pending.
 *
 * Returns the result of cpuidle_enter(), or -EBUSY when the idle state was
 * not entered because the idle task has to be scheduled out.
 */
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		      int next_state)
{
	/*
	 * Common case: nothing is waiting to run, so enter the idle state
	 * previously chosen by the caller. cpuidle_enter() will block until
	 * an interrupt occurs and takes care of re-enabling the local
	 * interrupts.
	 */
	if (!current_clr_polling_and_test())
		return cpuidle_enter(drv, dev, next_state);

	/*
	 * The idle task must be scheduled out, so going idle is pointless:
	 * record a zero residency, re-enable interrupts and bail out.
	 */
	dev->last_residency = 0;
	local_irq_enable();
	return -EBUSY;
}

/**
 * cpuidle_idle_call - the main idle function
 *
 * Performs one pass of (non-polling) idle. In order, it:
 *  - returns straight away if a reschedule is already pending;
 *  - falls back to default_idle_call() when cpuidle is not available;
 *  - goes for the deepest idle state when suspending to idle or when the
 *    device has use_deepest_state set;
 *  - otherwise lets the cpuidle governor select a state and reflect on the
 *    outcome.
 *
 * NOTE: no locks or semaphores should be used here
 *
 * On archs that support TIF_POLLING_NRFLAG, is called with polling
 * set, and it returns with polling set.  If it ever stops polling, it
 * must clear the polling bit.
 */
static void cpuidle_idle_call(void)
{
	struct cpuidle_device *dev = cpuidle_get_device();
	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
	int next_state, entered_state;

	/*
	 * Check if the idle task must be rescheduled. If it is the
	 * case, exit the function after re-enabling the local irq.
	 */
	if (need_resched()) {
		local_irq_enable();
		return;
	}

	/*
	 * Tell the RCU framework we are entering an idle section,
	 * so no more rcu read side critical sections and one more
	 * step to the grace period
	 */
	rcu_idle_enter();

	/* No usable cpuidle driver/device: use the default idle routine. */
	if (cpuidle_not_available(drv, dev)) {
		default_idle_call();
		goto exit_idle;
	}

	/*
	 * Suspend-to-idle ("s2idle") is a system state in which all user space
	 * has been frozen, all I/O devices have been suspended and the only
	 * activity happens here and in interrupts (if any).  In that case bypass
	 * the cpuidle governor and go straight for the deepest idle state
	 * available.  Possibly also suspend the local tick and the entire
	 * timekeeping to prevent timer interrupts from kicking us out of idle
	 * until a proper wakeup interrupt happens.
	 */

	if (idle_should_enter_s2idle() || dev->use_deepest_state) {
		if (idle_should_enter_s2idle()) {
			entered_state = cpuidle_enter_s2idle(drv, dev);
			/*
			 * A positive result means the s2idle entry took care
			 * of this pass: re-enable interrupts and finish.
			 * Otherwise fall through to the regular deepest-state
			 * entry below.
			 */
			if (entered_state > 0) {
				local_irq_enable();
				goto exit_idle;
			}
		}

		/*
		 * The governor was not consulted on this path, so the result
		 * of call_cpuidle() is not fed back via cpuidle_reflect().
		 */
		next_state = cpuidle_find_deepest_state(drv, dev);
		call_cpuidle(drv, dev, next_state);
	} else {
		/*
		 * Ask the cpuidle framework to choose a convenient idle state.
		 */
		next_state = cpuidle_select(drv, dev);
		entered_state = call_cpuidle(drv, dev, next_state);
		/*
		 * Give the governor an opportunity to reflect on the outcome
		 */
		cpuidle_reflect(dev, entered_state);
	}

exit_idle:
	/* Restore the polling bit, as promised in the function header. */
	__current_set_polling();

	/*
	 * It is up to the idle functions to reenable local interrupts
	 */
	if (WARN_ON_ONCE(irqs_disabled()))
		local_irq_enable();

	rcu_idle_exit();
}

/*
 * Generic idle loop implementation
 *
 * Idles (by polling or through cpuidle_idle_call()) until a reschedule is
 * requested, then leaves the idle state and calls schedule_idle(). Each
 * call handles exactly one such idle period; callers invoke it in a loop.
 *
 * Called with polling cleared.
 */
static void do_idle(void)
{
	int cpu = smp_processor_id();
	/*
	 * If the arch has a polling bit, we maintain an invariant:
	 *
	 * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
	 * rq->idle). This means that, if rq->idle has the polling bit set,
	 * then setting need_resched is guaranteed to cause the CPU to
	 * reschedule.
	 */

	__current_set_polling();
	tick_nohz_idle_enter();

	while (!need_resched()) {
		check_pgt_cache();
		/*
		 * NOTE(review): read barrier ahead of the offline check;
		 * presumably orders it against earlier reads. Confirm the
		 * pairing before touching this.
		 */
		rmb();

		/* An offline CPU reports itself dead and parks in the arch hook. */
		if (cpu_is_offline(cpu)) {
			cpuhp_report_idle_dead();
			arch_cpu_idle_dead();
		}

		/*
		 * Both idle routines below are entered with local interrupts
		 * disabled and re-enable them themselves.
		 */
		local_irq_disable();
		arch_cpu_idle_enter();

		/*
		 * In poll mode we reenable interrupts and spin. Also if we
		 * detected in the wakeup from idle path that the tick
		 * broadcast device expired for us, we don't want to go deep
		 * idle as we know that the IPI is going to arrive right away.
		 */
		if (cpu_idle_force_poll || tick_check_broadcast_expired())
			cpu_idle_poll();
		else
			cpuidle_idle_call();
		arch_cpu_idle_exit();
	}

	/*
	 * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
	 * be set, propagate it into PREEMPT_NEED_RESCHED.
	 *
	 * This is required because for polling idle loops we will not have had
	 * an IPI to fold the state for us.
	 */
	preempt_set_need_resched();
	tick_nohz_idle_exit();
	__current_clr_polling();

	/*
	 * We promise to call sched_ttwu_pending() and reschedule if
	 * need_resched() is set while polling is set. That means that clearing
	 * polling needs to be visible before doing these things.
	 */
	smp_mb__after_atomic();

	sched_ttwu_pending();
	schedule_idle();

	/* Back from the scheduler: apply a pending live patch state change. */
	if (unlikely(klp_patch_pending(current)))
		klp_update_patch_state(current);
}

/*
 * Report whether @pc lies inside the __cpuidle text section, i.e. in the
 * half-open range [__cpuidle_text_start, __cpuidle_text_end).
 */
bool cpu_in_idle(unsigned long pc)
{
	unsigned long text_start = (unsigned long)__cpuidle_text_start;
	unsigned long text_end = (unsigned long)__cpuidle_text_end;

	return text_start <= pc && pc < text_end;
}

/*
 * State for one play_idle() call: the timer that bounds the injected idle
 * period and the flag its callback raises on expiry. The callback recovers
 * this structure from the embedded timer with container_of().
 */
struct idle_timer {
	struct hrtimer timer;	/* armed by play_idle() for the requested duration */
	int done;		/* set to 1 by idle_inject_timer_fn() */
};

/*
 * Expiry callback for the play_idle() timer.
 *
 * Flags the enclosing idle_timer as done and requests a reschedule of the
 * current task so that the idle loop in do_idle() is left. The timer is
 * one-shot: it is never re-armed from here.
 */
static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
{
	struct idle_timer *inject;

	inject = container_of(timer, struct idle_timer, timer);
	WRITE_ONCE(inject->done, 1);

	set_tsk_need_resched(current);

	return HRTIMER_NORESTART;
}

/**
 * play_idle - Inject forced idle time on the current CPU.
 * @duration_ms: Length of the idle period in milliseconds; must be non-zero.
 *
 * Runs the idle loop from the calling task until a CPU-pinned hrtimer armed
 * for @duration_ms expires. For the duration of the call preemption is
 * disabled, the task carries PF_IDLE and cpuidle is told to use the deepest
 * idle state.
 *
 * The caller is expected to be a SCHED_FIFO kernel thread that is bound to a
 * single CPU (PF_KTHREAD and PF_NO_SETAFFINITY set, nr_cpus_allowed == 1).
 * Violations are not fatal; each only triggers a one-time warning.
 */
void play_idle(unsigned long duration_ms)
{
	struct idle_timer it;

	/*
	 * Only FIFO tasks can disable the tick since they don't need the forced
	 * preemption.
	 */
	WARN_ON_ONCE(current->policy != SCHED_FIFO);
	WARN_ON_ONCE(current->nr_cpus_allowed != 1);
	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
	WARN_ON_ONCE(!duration_ms);

	rcu_sleep_check();
	preempt_disable();
	current->flags |= PF_IDLE;
	cpuidle_use_deepest_state(true);

	it.done = 0;
	hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	it.timer.function = idle_inject_timer_fn;
	hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);

	/* Each do_idle() call covers one idle period; repeat until expiry. */
	while (!READ_ONCE(it.done))
		do_idle();

	/*
	 * The timer is pinned to this CPU and its callback has already set
	 * it.done, so it is no longer active. Release the debug-objects
	 * tracking that hrtimer_init_on_stack() set up before the stack frame
	 * goes away.
	 */
	destroy_hrtimer_on_stack(&it.timer);

	cpuidle_use_deepest_state(false);
	current->flags &= ~PF_IDLE;

	preempt_fold_need_resched();
	preempt_enable();
}
EXPORT_SYMBOL_GPL(play_idle);

/*
 * Entry point of the per-CPU idle thread.
 *
 * Runs the arch idle preparation hook, passes @state to cpuhp_online_idle()
 * and then executes the idle loop forever. This function never returns.
 */
void cpu_startup_entry(enum cpuhp_state state)
{
	/*
	 * The canary setup below is still x86-only: arm and sh have never
	 * invoked the canary init for the non boot cpus, and it was too late
	 * in the cycle to make this generic. This #ifdef is meant to go away.
	 */
#ifdef CONFIG_X86
	/*
	 * A non-boot CPU arrives here without a stack canary. The boot CPU
	 * already has one, but setting it again does no harm. Doing it here
	 * is safe because we never return from this function, so the stale
	 * canaries already on the stack can never be checked.
	 */
	boot_init_stack_canary();
#endif
	arch_cpu_idle_prepare();
	cpuhp_online_idle(state);

	for (;;)
		do_idle();
}
Commit	Line	Data
cf37b6b4 NP	1	/*
	2	* Generic entry point for the idle threads
	3	*/
	4	#include <linux/sched.h>
4c822698	5	#include <linux/sched/idle.h>
cf37b6b4 NP	6	#include <linux/cpu.h>
cf37b6b4 NP	7	#include <linux/cpuidle.h>
8df3e07e	8	#include <linux/cpuhotplug.h>
cf37b6b4 NP	9	#include <linux/tick.h>
	10	#include <linux/mm.h>
	11	#include <linux/stackprotector.h>
38106313	12	#include <linux/suspend.h>
d83a7cb3	13	#include <linux/livepatch.h>
cf37b6b4 NP	14
	15	#include <asm/tlb.h>
	16
	17	#include <trace/events/power.h>
	18
e3baac47 PZ	19	#include "sched.h"
e3baac47 PZ	20
6727ad9e CM	21	/* Linker adds these: start and end of __cpuidle functions */
	22	extern char __cpuidle_text_start[], __cpuidle_text_end[];
	23
faad3849 RW	24	/**
	25	* sched_idle_set_state - Record idle state for the current CPU.
	26	* @idle_state: State to record.
	27	*/
	28	void sched_idle_set_state(struct cpuidle_state *idle_state)
	29	{
	30	idle_set_state(this_rq(), idle_state);
	31	}
	32
cf37b6b4 NP	33	static int __read_mostly cpu_idle_force_poll;
	34
	35	void cpu_idle_poll_ctrl(bool enable)
	36	{
	37	if (enable) {
	38	cpu_idle_force_poll++;
	39	} else {
	40	cpu_idle_force_poll--;
	41	WARN_ON_ONCE(cpu_idle_force_poll < 0);
	42	}
	43	}
	44
	45	#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
	46	static int __init cpu_idle_poll_setup(char *__unused)
	47	{
	48	cpu_idle_force_poll = 1;
	49	return 1;
	50	}
	51	__setup("nohlt", cpu_idle_poll_setup);
	52
	53	static int __init cpu_idle_nopoll_setup(char *__unused)
	54	{
	55	cpu_idle_force_poll = 0;
	56	return 1;
	57	}
	58	__setup("hlt", cpu_idle_nopoll_setup);
	59	#endif
	60
6727ad9e	61	static noinline int __cpuidle cpu_idle_poll(void)
cf37b6b4 NP	62	{
	63	rcu_idle_enter();
	64	trace_cpu_idle_rcuidle(0, smp_processor_id());
	65	local_irq_enable();
9babcd79	66	stop_critical_timings();
ff6f2d29 PM	67	while (!tif_need_resched() &&
ff6f2d29 PM	68	(cpu_idle_force_poll \|\| tick_check_broadcast_expired()))
cf37b6b4	69	cpu_relax();
9babcd79	70	start_critical_timings();
cf37b6b4 NP	71	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	72	rcu_idle_exit();
	73	return 1;
	74	}
	75
	76	/* Weak implementations for optional arch specific functions */
	77	void __weak arch_cpu_idle_prepare(void) { }
	78	void __weak arch_cpu_idle_enter(void) { }
	79	void __weak arch_cpu_idle_exit(void) { }
	80	void __weak arch_cpu_idle_dead(void) { }
	81	void __weak arch_cpu_idle(void)
	82	{
	83	cpu_idle_force_poll = 1;
	84	local_irq_enable();
	85	}
	86
827a5aef RW	87	/**
	88	* default_idle_call - Default CPU idle routine.
	89	*
	90	* To use when the cpuidle framework cannot be used.
	91	*/
6727ad9e	92	void __cpuidle default_idle_call(void)
82f66327	93	{
63caae84	94	if (current_clr_polling_and_test()) {
82f66327	95	local_irq_enable();
63caae84 LS	96	} else {
63caae84 LS	97	stop_critical_timings();
82f66327	98	arch_cpu_idle();
63caae84 LS	99	start_critical_timings();
63caae84 LS	100	}
82f66327 RW	101	}
82f66327 RW	102
bcf6ad8a RW	103	static int call_cpuidle(struct cpuidle_driver drv, struct cpuidle_device dev,
	104	int next_state)
	105	{
bcf6ad8a RW	106	/*
	107	* The idle task must be scheduled, it is pointless to go to idle, just
	108	* update no idle residency and return.
	109	*/
	110	if (current_clr_polling_and_test()) {
	111	dev->last_residency = 0;
	112	local_irq_enable();
	113	return -EBUSY;
	114	}
	115
bcf6ad8a RW	116	/*
	117	* Enter the idle state previously returned by the governor decision.
	118	* This function will block until an interrupt occurs and will take
	119	* care of re-enabling the local interrupts
	120	*/
827a5aef	121	return cpuidle_enter(drv, dev, next_state);
bcf6ad8a RW	122	}
bcf6ad8a RW	123
30cdd69e DL	124	/**
	125	* cpuidle_idle_call - the main idle function
	126	*
	127	* NOTE: no locks or semaphores should be used here
82c65d60 AL	128	*
	129	* On archs that support TIF_POLLING_NRFLAG, is called with polling
	130	* set, and it returns with polling set. If it ever stops polling, it
	131	* must clear the polling bit.
30cdd69e	132	*/
08c373e5	133	static void cpuidle_idle_call(void)
30cdd69e	134	{
9bd616e3	135	struct cpuidle_device *dev = cpuidle_get_device();
30cdd69e	136	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
37352273	137	int next_state, entered_state;
30cdd69e	138
a1d028bd DL	139	/*
a1d028bd DL	140	* Check if the idle task must be rescheduled. If it is the
c444117f	141	* case, exit the function after re-enabling the local irq.
a1d028bd	142	*/
c444117f	143	if (need_resched()) {
8ca3c642	144	local_irq_enable();
08c373e5	145	return;
8ca3c642 DL	146	}
8ca3c642 DL	147
a1d028bd DL	148	/*
	149	* Tell the RCU framework we are entering an idle section,
	150	* so no more rcu read side critical sections and one more
	151	* step to the grace period
	152	*/
c8cc7d4d DL	153	rcu_idle_enter();
c8cc7d4d DL	154
82f66327 RW	155	if (cpuidle_not_available(drv, dev)) {
	156	default_idle_call();
	157	goto exit_idle;
	158	}
ef2b22ac	159
38106313	160	/*
f02f4f9d	161	* Suspend-to-idle ("s2idle") is a system state in which all user space
38106313 RW	162	* has been frozen, all I/O devices have been suspended and the only
	163	* activity happens here and in iterrupts (if any). In that case bypass
	164	* the cpuidle governor and go stratight for the deepest idle state
	165	* available. Possibly also suspend the local tick and the entire
	166	* timekeeping to prevent timer interrupts from kicking us out of idle
	167	* until a proper wakeup interrupt happens.
	168	*/
bb8313b6	169
f02f4f9d RW	170	if (idle_should_enter_s2idle() \|\| dev->use_deepest_state) {
f02f4f9d RW	171	if (idle_should_enter_s2idle()) {
28ba086e	172	entered_state = cpuidle_enter_s2idle(drv, dev);
bb8313b6 JP	173	if (entered_state > 0) {
	174	local_irq_enable();
	175	goto exit_idle;
	176	}
ef2b22ac RW	177	}
ef2b22ac RW	178
ef2b22ac	179	next_state = cpuidle_find_deepest_state(drv, dev);
bcf6ad8a	180	call_cpuidle(drv, dev, next_state);
ef2b22ac	181	} else {
ef2b22ac RW	182	/*
	183	* Ask the cpuidle framework to choose a convenient idle state.
	184	*/
	185	next_state = cpuidle_select(drv, dev);
bcf6ad8a RW	186	entered_state = call_cpuidle(drv, dev, next_state);
	187	/*
	188	* Give the governor an opportunity to reflect on the outcome
	189	*/
ef2b22ac	190	cpuidle_reflect(dev, entered_state);
bcf6ad8a	191	}
37352273 PZ	192
37352273 PZ	193	exit_idle:
8ca3c642	194	__current_set_polling();
30cdd69e	195
a1d028bd	196	/*
37352273	197	* It is up to the idle functions to reenable local interrupts
a1d028bd	198	*/
c8cc7d4d DL	199	if (WARN_ON_ONCE(irqs_disabled()))
	200	local_irq_enable();
	201
	202	rcu_idle_exit();
30cdd69e	203	}
30cdd69e	204
cf37b6b4 NP	205	/*
cf37b6b4 NP	206	* Generic idle loop implementation
82c65d60 AL	207	*
82c65d60 AL	208	* Called with polling cleared.
cf37b6b4	209	*/
c1de45ca	210	static void do_idle(void)
cf37b6b4	211	{
54b933c6	212	int cpu = smp_processor_id();
c1de45ca PZ	213	/*
	214	* If the arch has a polling bit, we maintain an invariant:
	215	*
	216	* Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
	217	* rq->idle). This means that, if rq->idle has the polling bit set,
	218	* then setting need_resched is guaranteed to cause the CPU to
	219	* reschedule.
	220	*/
cf37b6b4	221
c1de45ca PZ	222	__current_set_polling();
c1de45ca PZ	223	tick_nohz_idle_enter();
cf37b6b4	224
c1de45ca PZ	225	while (!need_resched()) {
	226	check_pgt_cache();
	227	rmb();
cf37b6b4	228
54b933c6	229	if (cpu_is_offline(cpu)) {
c1de45ca PZ	230	cpuhp_report_idle_dead();
c1de45ca PZ	231	arch_cpu_idle_dead();
cf37b6b4	232	}
06d50c65	233
c1de45ca PZ	234	local_irq_disable();
c1de45ca PZ	235	arch_cpu_idle_enter();
82c65d60 AL	236
82c65d60 AL	237	/*
c1de45ca PZ	238	* In poll mode we reenable interrupts and spin. Also if we
	239	* detected in the wakeup from idle path that the tick
	240	* broadcast device expired for us, we don't want to go deep
	241	* idle as we know that the IPI is going to arrive right away.
82c65d60	242	*/
c1de45ca PZ	243	if (cpu_idle_force_poll \|\| tick_check_broadcast_expired())
	244	cpu_idle_poll();
	245	else
	246	cpuidle_idle_call();
	247	arch_cpu_idle_exit();
cf37b6b4	248	}
c1de45ca PZ	249
	250	/*
	251	* Since we fell out of the loop above, we know TIF_NEED_RESCHED must
	252	* be set, propagate it into PREEMPT_NEED_RESCHED.
	253	*
	254	* This is required because for polling idle loops we will not have had
	255	* an IPI to fold the state for us.
	256	*/
	257	preempt_set_need_resched();
	258	tick_nohz_idle_exit();
	259	__current_clr_polling();
	260
	261	/*
	262	* We promise to call sched_ttwu_pending() and reschedule if
	263	* need_resched() is set while polling is set. That means that clearing
	264	* polling needs to be visible before doing these things.
	265	*/
	266	smp_mb__after_atomic();
	267
	268	sched_ttwu_pending();
8663effb	269	schedule_idle();
d83a7cb3 JP	270
	271	if (unlikely(klp_patch_pending(current)))
	272	klp_update_patch_state(current);
cf37b6b4 NP	273	}
cf37b6b4 NP	274
6727ad9e CM	275	bool cpu_in_idle(unsigned long pc)
	276	{
	277	return pc >= (unsigned long)__cpuidle_text_start &&
	278	pc < (unsigned long)__cpuidle_text_end;
	279	}
	280
c1de45ca PZ	281	struct idle_timer {
	282	struct hrtimer timer;
	283	int done;
	284	};
	285
	286	static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
	287	{
	288	struct idle_timer *it = container_of(timer, struct idle_timer, timer);
	289
	290	WRITE_ONCE(it->done, 1);
	291	set_tsk_need_resched(current);
	292
	293	return HRTIMER_NORESTART;
	294	}
	295
	296	void play_idle(unsigned long duration_ms)
	297	{
	298	struct idle_timer it;
	299
	300	/*
	301	* Only FIFO tasks can disable the tick since they don't need the forced
	302	* preemption.
	303	*/
	304	WARN_ON_ONCE(current->policy != SCHED_FIFO);
	305	WARN_ON_ONCE(current->nr_cpus_allowed != 1);
	306	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
	307	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
	308	WARN_ON_ONCE(!duration_ms);
	309
	310	rcu_sleep_check();
	311	preempt_disable();
	312	current->flags \|= PF_IDLE;
	313	cpuidle_use_deepest_state(true);
	314
	315	it.done = 0;
	316	hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	317	it.timer.function = idle_inject_timer_fn;
	318	hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
	319
	320	while (!READ_ONCE(it.done))
	321	do_idle();
	322
	323	cpuidle_use_deepest_state(false);
	324	current->flags &= ~PF_IDLE;
	325
	326	preempt_fold_need_resched();
	327	preempt_enable();
	328	}
	329	EXPORT_SYMBOL_GPL(play_idle);
	330
cf37b6b4 NP	331	void cpu_startup_entry(enum cpuhp_state state)
	332	{
	333	/*
	334	* This #ifdef needs to die, but it's too late in the cycle to
	335	* make this generic (arm and sh have never invoked the canary
	336	* init for the non boot cpus!). Will be fixed in 3.11
	337	*/
	338	#ifdef CONFIG_X86
	339	/*
	340	* If we're the non-boot CPU, nothing set the stack canary up
	341	* for us. The boot CPU already has it initialized but no harm
	342	* in doing it again. This is a good place for updating it, as
	343	* we wont ever return from this function (so the invalid
	344	* canaries already on the stack wont ever trigger).
	345	*/
	346	boot_init_stack_canary();
	347	#endif
cf37b6b4	348	arch_cpu_idle_prepare();
8df3e07e	349	cpuhp_online_idle(state);
c1de45ca PZ	350	while (1)
c1de45ca PZ	351	do_idle();
cf37b6b4	352	}