/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Task-based RCU implementations.
 *
 * Copyright (C) 2020 Paul E. McKenney
 */


////////////////////////////////////////////////////////////////////////
//
// Generic data structures.

struct rcu_tasks;
typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp);

/**
 * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism.
 * @cbs_head: Head of callback list.
 * @cbs_tail: Tail pointer for callback list.
 * @cbs_wq: Wait queue allowing new callback to get kthread's attention.
 * @cbs_lock: Lock protecting callback list.
 * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
 * @gp_func: This flavor's grace-period-wait function.
 * @call_func: This flavor's call_rcu()-equivalent function.
 * @name: This flavor's textual name.
 * @kname: This flavor's kthread name.
 */
struct rcu_tasks {
	struct rcu_head *cbs_head;
	struct rcu_head **cbs_tail;
	struct wait_queue_head cbs_wq;
	raw_spinlock_t cbs_lock;
	struct task_struct *kthread_ptr;
	rcu_tasks_gp_func_t gp_func;
	call_rcu_func_t call_func;
	char *name;
	char *kname;
};

#define DEFINE_RCU_TASKS(rt_name, gp, call, n)				\
static struct rcu_tasks rt_name =					\
{									\
	.cbs_tail = &rt_name.cbs_head,					\
	.cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq),	\
	.cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock),	\
	.gp_func = gp,							\
	.call_func = call,						\
	.name = n,							\
	.kname = #rt_name,						\
}

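/*
 * Illustrative sketch (not part of the original file, hence #if 0): how a
 * Tasks-RCU flavor is assembled from the pieces above.  This mirrors the
 * real DEFINE_RCU_TASKS() uses later in this file; the "example" names
 * are hypothetical.
 */
#if 0
/* The flavor's grace-period-wait function, supplied by the flavor. */
static void rcu_tasks_example_wait_gp(struct rcu_tasks *rtp);

/* The flavor's call_rcu()-like function, defined below the macro use. */
void call_rcu_tasks_example(struct rcu_head *rhp, rcu_callback_t func);

/* Static instance: callback list, wait queue, lock, and kthread name. */
DEFINE_RCU_TASKS(rcu_tasks_example, rcu_tasks_example_wait_gp,
		 call_rcu_tasks_example, "RCU Tasks Example");

/* The call_rcu()-like function simply enqueues onto this instance. */
void call_rcu_tasks_example(struct rcu_head *rhp, rcu_callback_t func)
{
	call_rcu_tasks_generic(rhp, func, &rcu_tasks_example);
}
#endif
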
/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);

/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);

////////////////////////////////////////////////////////////////////////
//
// Generic code.

// Enqueue a callback for the specified flavor of Tasks RCU.
static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
				   struct rcu_tasks *rtp)
{
	unsigned long flags;
	bool needwake;

	rhp->next = NULL;
	rhp->func = func;
	raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
	needwake = !rtp->cbs_head;
	WRITE_ONCE(*rtp->cbs_tail, rhp);
	rtp->cbs_tail = &rhp->next;
	raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags);
	/* We can't create the thread unless interrupts are enabled. */
	if (needwake && READ_ONCE(rtp->kthread_ptr))
		wake_up(&rtp->cbs_wq);
}

// Wait for a grace period for the specified flavor of Tasks RCU.
static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
{
	/* Complain if the scheduler has not started. */
	RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
			 "synchronize_rcu_tasks called too soon");

	/* Wait for the grace period. */
	wait_rcu_gp(rtp->call_func);
}

/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
static int __noreturn rcu_tasks_kthread(void *arg)
{
	unsigned long flags;
	struct rcu_head *list;
	struct rcu_head *next;
	struct rcu_tasks *rtp = arg;

	/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
	housekeeping_affine(current, HK_FLAG_RCU);
	WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!

	/*
	 * Each pass through the following loop makes one check for
	 * newly arrived callbacks, and, if there are some, waits for
	 * one RCU-tasks grace period and then invokes the callbacks.
	 * This loop is terminated by the system going down. ;-)
	 */
	for (;;) {

		/* Pick up any new callbacks. */
		raw_spin_lock_irqsave(&rtp->cbs_lock, flags);
		list = rtp->cbs_head;
		rtp->cbs_head = NULL;
		rtp->cbs_tail = &rtp->cbs_head;
		raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags);

		/* If there were none, wait a bit and start over. */
		if (!list) {
			wait_event_interruptible(rtp->cbs_wq,
						 READ_ONCE(rtp->cbs_head));
			if (!rtp->cbs_head) {
				WARN_ON(signal_pending(current));
				schedule_timeout_interruptible(HZ/10);
			}
			continue;
		}

		// Wait for one grace period.
		rtp->gp_func(rtp);

		/* Invoke the callbacks. */
		while (list) {
			next = list->next;
			local_bh_disable();
			list->func(list);
			local_bh_enable();
			list = next;
			cond_resched();
		}
		/* Paranoid sleep to keep this from entering a tight loop */
		schedule_timeout_uninterruptible(HZ/10);
	}
}

/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */
static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
{
	struct task_struct *t;

	t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname);
	if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name))
		return;
	smp_mb(); /* Ensure others see full kthread. */
}

/* Do the srcu_read_lock() for the synchronize_srcu() in rcu_tasks_wait_gp(). */
void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
{
	preempt_disable();
	current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
	preempt_enable();
}

/* Do the srcu_read_unlock() for the synchronize_srcu() in rcu_tasks_wait_gp(). */
void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu)
{
	preempt_disable();
	__srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
	preempt_enable();
}

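/*
 * Illustrative sketch (not part of the original file, hence #if 0): the
 * intended pairing of the two functions above in a task-exit path.  In
 * the real kernel these calls live in do_exit(); the surrounding
 * function here is hypothetical and heavily simplified.
 */
#if 0
static void example_exit_path(void)
{
	/* Enter the SRCU read-side section that rcu_tasks_wait_gp() */
	/* waits on via synchronize_srcu(&tasks_rcu_exit_srcu). */
	exit_tasks_rcu_start();

	/* ... late exit-path work that Tasks RCU must not lose track of ... */

	/* Leave the SRCU read-side section; the exiting task no longer */
	/* holds up the tasks_rcu_exit_srcu grace period. */
	exit_tasks_rcu_finish();
}
#endif
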
#ifndef CONFIG_TINY_RCU

/*
 * Print any non-default Tasks RCU settings.
 */
static void __init rcu_tasks_bootup_oddness(void)
{
#ifdef CONFIG_TASKS_RCU
	if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
		pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
	else
		pr_info("\tTasks RCU enabled.\n");
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_RUDE_RCU
	pr_info("\tRude variant of Tasks RCU enabled.\n");
#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
}

#endif /* #ifndef CONFIG_TINY_RCU */

#ifdef CONFIG_TASKS_RCU

////////////////////////////////////////////////////////////////////////
//
// Simple variant of RCU whose quiescent states are voluntary context
// switch, cond_resched_rcu_qs(), user-space execution, and idle.
// As such, grace periods can take one good long time. There are no
// read-side primitives similar to rcu_read_lock() and rcu_read_unlock()
// because this implementation is intended to get the system into a safe
// state for some of the manipulations involved in tracing and the like.
// Finally, this implementation does not support high call_rcu_tasks()
// rates from multiple CPUs. If this is required, per-CPU callback lists
// will be needed.

/* See if tasks are still holding out, complain if so. */
static void check_holdout_task(struct task_struct *t,
			       bool needreport, bool *firstreport)
{
	int cpu;

	if (!READ_ONCE(t->rcu_tasks_holdout) ||
	    t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
	    !READ_ONCE(t->on_rq) ||
	    (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
	     !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
		WRITE_ONCE(t->rcu_tasks_holdout, false);
		list_del_init(&t->rcu_tasks_holdout_list);
		put_task_struct(t);
		return;
	}
	rcu_request_urgent_qs_task(t);
	if (!needreport)
		return;
	if (*firstreport) {
		pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
		*firstreport = false;
	}
	cpu = task_cpu(t);
	pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
		 t, ".I"[is_idle_task(t)],
		 "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
		 t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
		 t->rcu_tasks_idle_cpu, cpu);
	sched_show_task(t);
}

/* Wait for one RCU-tasks grace period. */
static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
{
	struct task_struct *g, *t;
	unsigned long lastreport;
	LIST_HEAD(rcu_tasks_holdouts);
	int fract;

	/*
	 * Wait for all pre-existing t->on_rq and t->nvcsw transitions
	 * to complete. Invoking synchronize_rcu() suffices because all
	 * these transitions occur with interrupts disabled. Without this
	 * synchronize_rcu(), a read-side critical section that started
	 * before the grace period might be incorrectly seen as having
	 * started after the grace period.
	 *
	 * This synchronize_rcu() also dispenses with the need for a
	 * memory barrier on the first store to t->rcu_tasks_holdout,
	 * as it forces the store to happen after the beginning of the
	 * grace period.
	 */
	synchronize_rcu();

	/*
	 * There were callbacks, so we need to wait for an RCU-tasks
	 * grace period. Start off by scanning the task list for tasks
	 * that are not already voluntarily blocked. Mark these tasks
	 * and make a list of them in rcu_tasks_holdouts.
	 */
	rcu_read_lock();
	for_each_process_thread(g, t) {
		if (t != current && READ_ONCE(t->on_rq) && !is_idle_task(t)) {
			get_task_struct(t);
			t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
			WRITE_ONCE(t->rcu_tasks_holdout, true);
			list_add(&t->rcu_tasks_holdout_list,
				 &rcu_tasks_holdouts);
		}
	}
	rcu_read_unlock();

	/*
	 * Wait for tasks that are in the process of exiting. This
	 * does only part of the job, ensuring that all tasks that were
	 * previously exiting reach the point where they have disabled
	 * preemption, allowing the later synchronize_rcu() to finish
	 * the job.
	 */
	synchronize_srcu(&tasks_rcu_exit_srcu);

	/*
	 * Each pass through the following loop scans the list of holdout
	 * tasks, removing any that are no longer holdouts. When the list
	 * is empty, we are done.
	 */
	lastreport = jiffies;

	/* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */
	fract = 10;

	for (;;) {
		bool firstreport;
		bool needreport;
		int rtst;
		struct task_struct *t1;

		if (list_empty(&rcu_tasks_holdouts))
			break;

		/* Slowly back off waiting for holdouts */
		schedule_timeout_interruptible(HZ/fract);

		if (fract > 1)
			fract--;

		rtst = READ_ONCE(rcu_task_stall_timeout);
		needreport = rtst > 0 && time_after(jiffies, lastreport + rtst);
		if (needreport)
			lastreport = jiffies;
		firstreport = true;
		WARN_ON(signal_pending(current));
		list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
					 rcu_tasks_holdout_list) {
			check_holdout_task(t, needreport, &firstreport);
			cond_resched();
		}
	}

	/*
	 * Because ->on_rq and ->nvcsw are not guaranteed to have full
	 * memory barriers prior to them in the schedule() path, memory
	 * reordering on other CPUs could cause their RCU-tasks read-side
	 * critical sections to extend past the end of the grace period.
	 * However, because these ->nvcsw updates are carried out with
	 * interrupts disabled, we can use synchronize_rcu() to force the
	 * needed ordering on all such CPUs.
	 *
	 * This synchronize_rcu() also confines all ->rcu_tasks_holdout
	 * accesses to be within the grace period, avoiding the need for
	 * memory barriers for ->rcu_tasks_holdout accesses.
	 *
	 * In addition, this synchronize_rcu() waits for exiting tasks
	 * to complete their final preempt_disable() region of execution,
	 * cleaning up after the synchronize_srcu() above.
	 */
	synchronize_rcu();
}

void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");

/**
 * call_rcu_tasks() - Queue an RCU callback for invocation after a task-based grace period
 * @rhp: structure to be used for queueing the RCU updates.
 * @func: actual callback function to be invoked after the grace period
 *
 * The callback function will be invoked some time after a full grace
 * period elapses, in other words after all currently executing RCU
 * read-side critical sections have completed. call_rcu_tasks() assumes
 * that the read-side critical sections end at a voluntary context
 * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle,
 * or transition to usermode execution. As such, there are no read-side
 * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
 * this primitive is intended to determine that all tasks have passed
 * through a safe state, not so much for data-structure synchronization.
 *
 * See the description of call_rcu() for more detailed information on
 * memory ordering guarantees.
 */
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
	call_rcu_tasks_generic(rhp, func, &rcu_tasks);
}
EXPORT_SYMBOL_GPL(call_rcu_tasks);
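
/*
 * Illustrative sketch (not part of the original file, hence #if 0): one
 * way a caller might use call_rcu_tasks() to defer freeing a
 * trampoline-like object until every task has passed through a
 * quiescent state.  The structure and function names are hypothetical.
 */
#if 0
struct example_tramp {
	struct rcu_head rh;
	void *insns;
};

/* Invoked after an RCU-tasks grace period; now safe to free the object. */
static void example_tramp_free_cb(struct rcu_head *rhp)
{
	struct example_tramp *tp = container_of(rhp, struct example_tramp, rh);

	kfree(tp->insns);
	kfree(tp);
}

/* Unpublish the trampoline first, then let Tasks RCU decide when to free. */
static void example_tramp_retire(struct example_tramp *tp)
{
	call_rcu_tasks(&tp->rh, example_tramp_free_cb);
}
#endif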

/**
 * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
 *
 * Control will return to the caller some time after a full rcu-tasks
 * grace period has elapsed, in other words after all currently
 * executing rcu-tasks read-side critical sections have completed. These
 * read-side critical sections are delimited by calls to schedule(),
 * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls
 * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
 *
 * This is a very specialized primitive, intended only for a few uses in
 * tracing and other situations requiring manipulation of function
 * preambles and profiling hooks. The synchronize_rcu_tasks() function
 * is not (yet) intended for heavy use from multiple CPUs.
 *
 * See the description of synchronize_rcu() for more detailed information
 * on memory ordering guarantees.
 */
void synchronize_rcu_tasks(void)
{
	synchronize_rcu_tasks_generic(&rcu_tasks);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
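
/*
 * Illustrative sketch (not part of the original file, hence #if 0): the
 * synchronous counterpart of the example above.  A tracer-like user
 * unpublishes a code region, waits for an RCU-tasks grace period, and
 * only then reclaims it.  All names here are hypothetical.
 */
#if 0
static void *example_active_tramp;

static void example_tramp_replace(void *new_tramp)
{
	void *old_tramp = example_active_tramp;

	/* Stop publishing the old trampoline to new callers. */
	WRITE_ONCE(example_active_tramp, new_tramp);

	/* Wait until no task can still be executing within old_tramp. */
	synchronize_rcu_tasks();

	kfree(old_tramp);
}
#endif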

/**
 * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
 *
 * Although the current implementation is guaranteed to wait, it is not
 * obligated to, for example, if there are no pending callbacks.
 */
void rcu_barrier_tasks(void)
{
	/* There is only one callback queue, so this is easy. ;-) */
	synchronize_rcu_tasks();
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
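
/*
 * Illustrative sketch (not part of the original file, hence #if 0): a
 * module-unload style pattern.  Before the callback functions passed to
 * call_rcu_tasks() can disappear (for example, at module exit), the
 * caller must wait for all previously queued callbacks to be invoked.
 * The function name is hypothetical.
 */
#if 0
static void example_module_teardown(void)
{
	/* The caller must have stopped queueing new callbacks by now. */

	/* Wait for all in-flight call_rcu_tasks() callbacks to finish. */
	rcu_barrier_tasks();

	/* Now the callback code and the objects it touches may go away. */
}
#endif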

static int __init rcu_spawn_tasks_kthread(void)
{
	rcu_spawn_tasks_kthread_generic(&rcu_tasks);
	return 0;
}
core_initcall(rcu_spawn_tasks_kthread);

#endif /* #ifdef CONFIG_TASKS_RCU */

#ifdef CONFIG_TASKS_RUDE_RCU

////////////////////////////////////////////////////////////////////////
//
// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of
// passing an empty function to schedule_on_each_cpu(). This approach
// provides an asynchronous call_rcu_tasks_rude() API and batching
// of concurrent calls to the synchronous synchronize_rcu_tasks_rude() API.
// This sends IPIs far and wide and induces otherwise unnecessary context
// switches on all online CPUs, whether idle or not.

// Empty function to allow workqueues to force a context switch.
static void rcu_tasks_be_rude(struct work_struct *work)
{
}

// Wait for one rude RCU-tasks grace period.
static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)
{
	schedule_on_each_cpu(rcu_tasks_be_rude);
}

void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
		 "RCU Tasks Rude");

/**
 * call_rcu_tasks_rude() - Queue an RCU callback for invocation after a rude task-based grace period
 * @rhp: structure to be used for queueing the RCU updates.
 * @func: actual callback function to be invoked after the grace period
 *
 * The callback function will be invoked some time after a full grace
 * period elapses, in other words after all currently executing RCU
 * read-side critical sections have completed. call_rcu_tasks_rude()
 * assumes that the read-side critical sections end at context switch,
 * cond_resched_rcu_qs(), or transition to usermode execution. As such,
 * there are no read-side primitives analogous to rcu_read_lock() and
 * rcu_read_unlock() because this primitive is intended to determine
 * that all tasks have passed through a safe state, not so much for
 * data-structure synchronization.
 *
 * See the description of call_rcu() for more detailed information on
 * memory ordering guarantees.
 */
void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func)
{
	call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude);
}
EXPORT_SYMBOL_GPL(call_rcu_tasks_rude);

/**
 * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period
 *
 * Control will return to the caller some time after a rude rcu-tasks
 * grace period has elapsed, in other words after all currently
 * executing rcu-tasks read-side critical sections have completed. These
 * read-side critical sections are delimited by calls to schedule(),
 * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory,
 * anyway) cond_resched().
 *
 * This is a very specialized primitive, intended only for a few uses in
 * tracing and other situations requiring manipulation of function preambles
 * and profiling hooks. The synchronize_rcu_tasks_rude() function is not
 * (yet) intended for heavy use from multiple CPUs.
 *
 * See the description of synchronize_rcu() for more detailed information
 * on memory ordering guarantees.
 */
void synchronize_rcu_tasks_rude(void)
{
	synchronize_rcu_tasks_generic(&rcu_tasks_rude);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude);
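
/*
 * Illustrative sketch (not part of the original file, hence #if 0): the
 * same retire-then-free pattern as the synchronize_rcu_tasks() example
 * earlier, but for code that may run with preemption disabled, so that
 * only an actual context switch (which the rude variant forces on every
 * online CPU) ends a read-side section.  All names are hypothetical.
 */
#if 0
static void *example_rude_region;

static void example_rude_region_retire(void)
{
	void *old_region = example_rude_region;

	/* Unpublish the region so no new execution can enter it. */
	WRITE_ONCE(example_rude_region, NULL);

	/* Force a context switch on every online CPU, then return. */
	synchronize_rcu_tasks_rude();

	kfree(old_region);
}
#endif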

/**
 * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks.
 *
 * Although the current implementation is guaranteed to wait, it is not
 * obligated to, for example, if there are no pending callbacks.
 */
void rcu_barrier_tasks_rude(void)
{
	/* There is only one callback queue, so this is easy. ;-) */
	synchronize_rcu_tasks_rude();
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude);

static int __init rcu_spawn_tasks_rude_kthread(void)
{
	rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
	return 0;
}
core_initcall(rcu_spawn_tasks_rude_kthread);

#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */