// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an "or" of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

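/*
 * IPI handler: re-read the mm's membarrier state into this CPU's runqueue,
 * provided the CPU is still running a thread which uses that mm.
 */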
static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

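/*
 * Reset membarrier state at exec time: registrations do not carry over
 * across exec(2), so clear the new mm's state and this runqueue's cached
 * copy of it.
 */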
void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

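/*
 * MEMBARRIER_CMD_GLOBAL_EXPEDITED: IPI an smp_mb() to every other online
 * CPU whose runqueue has MEMBARRIER_STATE_GLOBAL_EXPEDITED set and which is
 * currently running a user-space task.
 */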
static int membarrier_global_expedited(void)
{
	int cpu;
	bool fallback = false;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	/*
	 * Expedited membarrier commands guarantee that they won't
	 * block, hence the GFP_NOWAIT allocation flag and fallback
	 * implementation.
	 */
	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
		/* Fallback for OOM. */
		fallback = true;
	}

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		if (!fallback)
			__cpumask_set_cpu(cpu, tmpmask);
		else
			smp_call_function_single(cpu, ipi_mb, NULL, 1);
	}
	rcu_read_unlock();
	if (!fallback) {
		preempt_disable();
		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
		preempt_enable();
		free_cpumask_var(tmpmask);
	}
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

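/*
 * MEMBARRIER_CMD_PRIVATE_EXPEDITED{,_SYNC_CORE}: IPI an smp_mb() to every
 * other online CPU currently running a thread which shares the caller's mm.
 * Requires prior registration, otherwise returns -EPERM.
 */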
static int membarrier_private_expedited(int flags)
{
	int cpu;
	bool fallback = false;
	cpumask_var_t tmpmask;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&current->mm->membarrier_state) &
		    MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
	} else {
		if (!(atomic_read(&current->mm->membarrier_state) &
		    MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	/*
	 * Expedited membarrier commands guarantee that they won't
	 * block, hence the GFP_NOWAIT allocation flag and fallback
	 * implementation.
	 */
	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
		/* Fallback for OOM. */
		fallback = true;
	}

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p && p->mm == current->mm) {
			if (!fallback)
				__cpumask_set_cpu(cpu, tmpmask);
			else
				smp_call_function_single(cpu, ipi_mb, NULL, 1);
		}
	}
	rcu_read_unlock();
	if (!fallback) {
		preempt_disable();
		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
		preempt_enable();
		free_cpumask_var(tmpmask);
	}
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

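/*
 * Propagate @mm's membarrier state to the runqueues of all CPUs currently
 * running a task which uses @mm, so that subsequent scheduler activity
 * observes the updated registration state.
 */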
static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm match @mm, ensure that all
	 * @mm's membarrier state set bits are also set in the runqueue's
	 * membarrier state. This ensures that a runqueue scheduling
	 * between threads which are users of @mm has its membarrier state
	 * updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

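/*
 * MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: mark the caller's mm as a target
 * of global expedited commands, sync that state to the runqueues, then
 * advertise the registration as ready.
 */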
static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

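/*
 * MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED{,_SYNC_CORE}: record in the
 * caller's mm that private expedited commands may be used, sync that state
 * to the runqueues, then set the corresponding *_READY bit.
 */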
static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:   Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0. For future extensions.
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
	if (unlikely(flags))
		return -EINVAL;
	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	default:
		return -EINVAL;
	}
}
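
/*
 * Illustrative user-space sketch (not part of this file): a minimal example
 * of how a process would typically drive the private expedited commands
 * implemented above, assuming <linux/membarrier.h> and a kernel exposing
 * this system call. Error handling is omitted.
 *
 *	#include <linux/membarrier.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static int membarrier(int cmd, int flags)
 *	{
 *		return syscall(__NR_membarrier, cmd, flags);
 *	}
 *
 *	// Once per process: register intent to use private expedited commands.
 *	membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0);
 *
 *	// On the slow side of an asymmetric fence: order memory accesses
 *	// against every thread of this process currently running on a CPU.
 *	membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
 */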