[linux-block.git] / kernel / sched / membarrier.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from the bitwise OR of all commands within
 * enum membarrier_cmd, except MEMBARRIER_CMD_QUERY.
 *
 * MEMBARRIER_CMD_QUERY reports this mask to the caller (see
 * sys_membarrier() in this file).
 */

/*
 * The SYNC_CORE commands only contribute bits to the mask when the
 * architecture selects CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE; otherwise
 * their contribution is 0.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)

/*
 * IPI callback: execute a full memory barrier on the CPU it runs on.
 *
 * @info: unused; the expedited membarrier paths in this file pass NULL.
 */
static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

/*
 * IPI callback: copy an mm's membarrier state into this CPU's runqueue.
 *
 * @info: the struct mm_struct whose membarrier_state must be propagated.
 *
 * Nothing is done when the task currently running on this CPU does not
 * use that mm.
 */
static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *target_mm = info;

	if (current->mm == target_mm) {
		this_cpu_write(runqueues.membarrier_state,
			       atomic_read(&target_mm->membarrier_state));
		/*
		 * Full barrier once MEMBARRIER_STATE_GLOBAL_EXPEDITED has
		 * been published in the current runqueue, so that no memory
		 * access following registration can be reordered before
		 * registration.
		 */
		smp_mb();
	}
}

/*
 * Reset the membarrier state of @mm and of the current CPU's runqueue.
 *
 * @mm: the mm whose membarrier_state is cleared to 0.
 *
 * NOTE(review): judging by the name and the comment below, this is
 * presumably called from the exec path when a new mm is installed; the
 * caller is not visible in this file.
 */
void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

static int membarrier_global_expedited(void)
{
	int cpu;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even through we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

static int membarrier_private_expedited(int flags)
{
	int cpu;
	cpumask_var_t tmpmask;
	struct mm_struct *mm = current->mm;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
	} else {
		if (!(atomic_read(&mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even through we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm match @mm, ensure that all
	 * @mm's membarrier state set bits are also set in in the runqueue's
	 * membarrier state. This ensures that a runqueue scheduling
	 * between threads which are users of @mm has its membarrier state
	 * updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

/*
 * Backend of MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.
 *
 * Sets MEMBARRIER_STATE_GLOBAL_EXPEDITED in the caller's mm, propagates
 * the mm state to the runqueues, and only then sets
 * MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY. Does nothing when the READY
 * bit is already set.
 *
 * Returns 0 on success, or the error reported by
 * sync_runqueues_membarrier_state().
 */
static int membarrier_register_global_expedited(void)
{
	struct mm_struct *mm = current->mm;
	int err;

	/* Already registered: nothing left to do. */
	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;

	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);

	err = sync_runqueues_membarrier_state(mm);
	if (!err)
		atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
			  &mm->membarrier_state);

	return err;
}

/*
 * Backend of MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED and
 * MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE.
 *
 * @flags: 0, or MEMBARRIER_FLAG_SYNC_CORE for the sync-core variant.
 *
 * Sets the registration bit(s) in the caller's mm, propagates the mm
 * state to the runqueues, and only then sets the matching READY bit.
 * Does nothing when that READY bit is already set.
 *
 * Returns 0 on success, -EINVAL when sync-core is requested but
 * CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE is not enabled, or the error
 * reported by sync_runqueues_membarrier_state().
 */
static int membarrier_register_private_expedited(int flags)
{
	struct mm_struct *mm = current->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
	int set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED;
	int err;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;

	atomic_or(set_state, &mm->membarrier_state);

	err = sync_runqueues_membarrier_state(mm);
	if (!err)
		atomic_or(ready_state, &mm->membarrier_state);

	return err;
}

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:   Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0. For future extensions.
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * MEMBARRIER_CMD_QUERY returns the bitmask of supported commands;
 * MEMBARRIER_CMD_GLOBAL is removed from that bitmask when nohz_full is
 * enabled.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
	int supported_cmds;

	/* No flag is defined yet: anything non-zero is rejected. */
	if (unlikely(flags))
		return -EINVAL;

	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
		supported_cmds = MEMBARRIER_CMD_BITMASK;
		if (tick_nohz_full_enabled())
			supported_cmds &= ~MEMBARRIER_CMD_GLOBAL;
		return supported_cmds;

	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;

	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();

	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0);

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);

	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);

	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);

	default:
		return -EINVAL;
	}
}
Commit	Line	Data
c942fddf	1	// SPDX-License-Identifier: GPL-2.0-or-later
22e4ebb9 MD	2	/*
	3	* Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
	4	*
	5	* membarrier system call
22e4ebb9	6	*/
325ea10c	7	#include "sched.h"
22e4ebb9 MD	8
	9	/*
	10	* Bitmask made from a "or" of all commands within enum membarrier_cmd,
	11	* except MEMBARRIER_CMD_QUERY.
	12	*/
70216e18	13	#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
97fb7a0a IM	14	#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
97fb7a0a IM	15	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
70216e18 MD	16	\| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
	17	#else
	18	#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
	19	#endif
	20
97fb7a0a IM	21	#define MEMBARRIER_CMD_BITMASK \
	22	(MEMBARRIER_CMD_GLOBAL \| MEMBARRIER_CMD_GLOBAL_EXPEDITED \
	23	\| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
	24	\| MEMBARRIER_CMD_PRIVATE_EXPEDITED \
	25	\| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
70216e18	26	\| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
22e4ebb9 MD	27
	28	static void ipi_mb(void *info)
	29	{
	30	smp_mb(); /* IPIs should be serializing but paranoid. */
	31	}
	32
227a4aad MD	33	static void ipi_sync_rq_state(void *info)
	34	{
	35	struct mm_struct mm = (struct mm_struct ) info;
	36
	37	if (current->mm != mm)
	38	return;
	39	this_cpu_write(runqueues.membarrier_state,
	40	atomic_read(&mm->membarrier_state));
	41	/*
	42	* Issue a memory barrier after setting
	43	* MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	44	* guarantee that no memory access following registration is reordered
	45	* before registration.
	46	*/
	47	smp_mb();
	48	}
	49
	50	void membarrier_exec_mmap(struct mm_struct *mm)
	51	{
	52	/*
	53	* Issue a memory barrier before clearing membarrier_state to
	54	* guarantee that no memory access prior to exec is reordered after
	55	* clearing this state.
	56	*/
	57	smp_mb();
	58	atomic_set(&mm->membarrier_state, 0);
	59	/*
	60	* Keep the runqueue membarrier_state in sync with this mm
	61	* membarrier_state.
	62	*/
	63	this_cpu_write(runqueues.membarrier_state, 0);
	64	}
	65
c5f58bd5 MD	66	static int membarrier_global_expedited(void)
	67	{
	68	int cpu;
c5f58bd5 MD	69	cpumask_var_t tmpmask;
	70
	71	if (num_online_cpus() == 1)
	72	return 0;
	73
	74	/*
	75	* Matches memory barriers around rq->curr modification in
	76	* scheduler.
	77	*/
	78	smp_mb(); /* system call entry is not a mb. */
	79
c172e0a3 MD	80	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
c172e0a3 MD	81	return -ENOMEM;
c5f58bd5 MD	82
c5f58bd5 MD	83	cpus_read_lock();
227a4aad	84	rcu_read_lock();
c5f58bd5 MD	85	for_each_online_cpu(cpu) {
	86	struct task_struct *p;
	87
	88	/*
	89	* Skipping the current CPU is OK even through we can be
	90	* migrated at any point. The current CPU, at the point
	91	* where we read raw_smp_processor_id(), is ensured to
	92	* be in program order with respect to the caller
	93	* thread. Therefore, we can skip this CPU from the
	94	* iteration.
	95	*/
	96	if (cpu == raw_smp_processor_id())
	97	continue;
97fb7a0a	98
227a4aad MD	99	if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
	100	MEMBARRIER_STATE_GLOBAL_EXPEDITED))
	101	continue;
	102
	103	/*
	104	* Skip the CPU if it runs a kernel thread. The scheduler
	105	* leaves the prior task mm in place as an optimization when
	106	* scheduling a kthread.
	107	*/
154abafc	108	p = rcu_dereference(cpu_rq(cpu)->curr);
227a4aad MD	109	if (p->flags & PF_KTHREAD)
	110	continue;
	111
c172e0a3	112	__cpumask_set_cpu(cpu, tmpmask);
c5f58bd5	113	}
227a4aad	114	rcu_read_unlock();
c172e0a3 MD	115
	116	preempt_disable();
	117	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	118	preempt_enable();
	119
	120	free_cpumask_var(tmpmask);
c5f58bd5 MD	121	cpus_read_unlock();
	122
	123	/*
	124	* Memory barrier on the caller thread _after_ we finished
	125	* waiting for the last IPI. Matches memory barriers around
	126	* rq->curr modification in scheduler.
	127	*/
	128	smp_mb(); /* exit from system call is not a mb */
	129	return 0;
	130	}
	131
70216e18	132	static int membarrier_private_expedited(int flags)
22e4ebb9 MD	133	{
22e4ebb9 MD	134	int cpu;
22e4ebb9	135	cpumask_var_t tmpmask;
c6d68c1c	136	struct mm_struct *mm = current->mm;
22e4ebb9	137
70216e18 MD	138	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
	139	if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
	140	return -EINVAL;
c6d68c1c	141	if (!(atomic_read(&mm->membarrier_state) &
70216e18 MD	142	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
	143	return -EPERM;
	144	} else {
c6d68c1c	145	if (!(atomic_read(&mm->membarrier_state) &
70216e18 MD	146	MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
	147	return -EPERM;
	148	}
a961e409	149
c6d68c1c	150	if (atomic_read(&mm->mm_users) == 1 \|\| num_online_cpus() == 1)
a961e409	151	return 0;
22e4ebb9 MD	152
	153	/*
	154	* Matches memory barriers around rq->curr modification in
	155	* scheduler.
	156	*/
	157	smp_mb(); /* system call entry is not a mb. */
	158
c172e0a3 MD	159	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
c172e0a3 MD	160	return -ENOMEM;
22e4ebb9 MD	161
22e4ebb9 MD	162	cpus_read_lock();
227a4aad	163	rcu_read_lock();
22e4ebb9 MD	164	for_each_online_cpu(cpu) {
	165	struct task_struct *p;
	166
	167	/*
	168	* Skipping the current CPU is OK even through we can be
	169	* migrated at any point. The current CPU, at the point
	170	* where we read raw_smp_processor_id(), is ensured to
	171	* be in program order with respect to the caller
	172	* thread. Therefore, we can skip this CPU from the
	173	* iteration.
	174	*/
	175	if (cpu == raw_smp_processor_id())
	176	continue;
154abafc	177	p = rcu_dereference(cpu_rq(cpu)->curr);
c172e0a3 MD	178	if (p && p->mm == mm)
c172e0a3 MD	179	__cpumask_set_cpu(cpu, tmpmask);
22e4ebb9	180	}
227a4aad	181	rcu_read_unlock();
c172e0a3 MD	182
	183	preempt_disable();
	184	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	185	preempt_enable();
	186
	187	free_cpumask_var(tmpmask);
22e4ebb9 MD	188	cpus_read_unlock();
	189
	190	/*
	191	* Memory barrier on the caller thread _after_ we finished
	192	* waiting for the last IPI. Matches memory barriers around
	193	* rq->curr modification in scheduler.
	194	*/
	195	smp_mb(); /* exit from system call is not a mb */
97fb7a0a	196
a961e409 MD	197	return 0;
	198	}
	199
227a4aad MD	200	static int sync_runqueues_membarrier_state(struct mm_struct *mm)
	201	{
	202	int membarrier_state = atomic_read(&mm->membarrier_state);
	203	cpumask_var_t tmpmask;
	204	int cpu;
	205
	206	if (atomic_read(&mm->mm_users) == 1 \|\| num_online_cpus() == 1) {
	207	this_cpu_write(runqueues.membarrier_state, membarrier_state);
	208
	209	/*
	210	* For single mm user, we can simply issue a memory barrier
	211	* after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
	212	* mm and in the current runqueue to guarantee that no memory
	213	* access following registration is reordered before
	214	* registration.
	215	*/
	216	smp_mb();
	217	return 0;
	218	}
	219
	220	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
	221	return -ENOMEM;
	222
	223	/*
	224	* For mm with multiple users, we need to ensure all future
	225	* scheduler executions will observe @mm's new membarrier
	226	* state.
	227	*/
	228	synchronize_rcu();
	229
	230	/*
	231	* For each cpu runqueue, if the task's mm match @mm, ensure that all
	232	* @mm's membarrier state set bits are also set in in the runqueue's
	233	* membarrier state. This ensures that a runqueue scheduling
	234	* between threads which are users of @mm has its membarrier state
	235	* updated.
	236	*/
	237	cpus_read_lock();
	238	rcu_read_lock();
	239	for_each_online_cpu(cpu) {
	240	struct rq *rq = cpu_rq(cpu);
	241	struct task_struct *p;
	242
c172e0a3	243	p = rcu_dereference(rq->curr);
227a4aad MD	244	if (p && p->mm == mm)
	245	__cpumask_set_cpu(cpu, tmpmask);
	246	}
	247	rcu_read_unlock();
	248
	249	preempt_disable();
	250	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	251	preempt_enable();
	252
	253	free_cpumask_var(tmpmask);
	254	cpus_read_unlock();
	255
	256	return 0;
	257	}
	258
c5f58bd5 MD	259	static int membarrier_register_global_expedited(void)
	260	{
	261	struct task_struct *p = current;
	262	struct mm_struct *mm = p->mm;
227a4aad	263	int ret;
c5f58bd5 MD	264
	265	if (atomic_read(&mm->membarrier_state) &
	266	MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
	267	return 0;
	268	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
227a4aad MD	269	ret = sync_runqueues_membarrier_state(mm);
	270	if (ret)
	271	return ret;
c5f58bd5 MD	272	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
c5f58bd5 MD	273	&mm->membarrier_state);
97fb7a0a	274
c5f58bd5 MD	275	return 0;
	276	}
	277
70216e18	278	static int membarrier_register_private_expedited(int flags)
a961e409 MD	279	{
	280	struct task_struct *p = current;
	281	struct mm_struct *mm = p->mm;
227a4aad MD	282	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	283	set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	284	ret;
70216e18 MD	285
	286	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
	287	if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
	288	return -EINVAL;
227a4aad MD	289	ready_state =
227a4aad MD	290	MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
70216e18	291	}
a961e409 MD	292
	293	/*
	294	* We need to consider threads belonging to different thread
	295	* groups, which use the same mm. (CLONE_VM but not
	296	* CLONE_THREAD).
	297	*/
227a4aad	298	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
c5f58bd5	299	return 0;
70216e18	300	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
227a4aad MD	301	set_state \|= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	302	atomic_or(set_state, &mm->membarrier_state);
	303	ret = sync_runqueues_membarrier_state(mm);
	304	if (ret)
	305	return ret;
	306	atomic_or(ready_state, &mm->membarrier_state);
97fb7a0a	307
c5f58bd5	308	return 0;
22e4ebb9 MD	309	}
	310
	311	/**
	312	* sys_membarrier - issue memory barriers on a set of threads
	313	* @cmd: Takes command values defined in enum membarrier_cmd.
	314	* @flags: Currently needs to be 0. For future extensions.
	315	*
	316	* If this system call is not implemented, -ENOSYS is returned. If the
	317	* command specified does not exist, not available on the running
	318	* kernel, or if the command argument is invalid, this system call
	319	* returns -EINVAL. For a given command, with flags argument set to 0,
227a4aad MD	320	* if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
	321	* always return the same value until reboot. In addition, it can return
	322	* -ENOMEM if there is not enough memory available to perform the system
	323	* call.
22e4ebb9 MD	324	*
	325	* All memory accesses performed in program order from each targeted thread
	326	* is guaranteed to be ordered with respect to sys_membarrier(). If we use
	327	* the semantic "barrier()" to represent a compiler barrier forcing memory
	328	* accesses to be performed in program order across the barrier, and
	329	* smp_mb() to represent explicit memory barriers forcing full memory
	330	* ordering across the barrier, we have the following ordering table for
	331	* each pair of barrier(), sys_membarrier() and smp_mb():
	332	*
	333	* The pair ordering is detailed as (O: ordered, X: not ordered):
	334	*
	335	* barrier() smp_mb() sys_membarrier()
	336	* barrier() X X O
	337	* smp_mb() X O O
	338	* sys_membarrier() O O O
	339	*/
	340	SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
	341	{
	342	if (unlikely(flags))
	343	return -EINVAL;
	344	switch (cmd) {
	345	case MEMBARRIER_CMD_QUERY:
	346	{
	347	int cmd_mask = MEMBARRIER_CMD_BITMASK;
	348
	349	if (tick_nohz_full_enabled())
c5f58bd5	350	cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
22e4ebb9 MD	351	return cmd_mask;
22e4ebb9 MD	352	}
c5f58bd5 MD	353	case MEMBARRIER_CMD_GLOBAL:
c5f58bd5 MD	354	/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
22e4ebb9 MD	355	if (tick_nohz_full_enabled())
	356	return -EINVAL;
	357	if (num_online_cpus() > 1)
78d125d3	358	synchronize_rcu();
22e4ebb9	359	return 0;
c5f58bd5 MD	360	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
	361	return membarrier_global_expedited();
	362	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
	363	return membarrier_register_global_expedited();
22e4ebb9	364	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
70216e18	365	return membarrier_private_expedited(0);
a961e409	366	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
70216e18 MD	367	return membarrier_register_private_expedited(0);
	368	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
	369	return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	370	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
	371	return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
22e4ebb9 MD	372	default:
	373	return -EINVAL;
	374	}
	375	}