kernel/sched/membarrier.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * membarrier system call
 */
#include "sched.h"

/*
 * Bitmask made from an OR of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing but paranoid. */
}

static void ipi_sync_rq_state(void *info)
{
	struct mm_struct *mm = (struct mm_struct *) info;

	if (current->mm != mm)
		return;
	this_cpu_write(runqueues.membarrier_state,
		       atomic_read(&mm->membarrier_state));
	/*
	 * Issue a memory barrier after setting
	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
	 * guarantee that no memory access following registration is reordered
	 * before registration.
	 */
	smp_mb();
}

void membarrier_exec_mmap(struct mm_struct *mm)
{
	/*
	 * Issue a memory barrier before clearing membarrier_state to
	 * guarantee that no memory access prior to exec is reordered after
	 * clearing this state.
	 */
	smp_mb();
	atomic_set(&mm->membarrier_state, 0);
	/*
	 * Keep the runqueue membarrier_state in sync with this mm
	 * membarrier_state.
	 */
	this_cpu_write(runqueues.membarrier_state, 0);
}

static int membarrier_global_expedited(void)
{
	int cpu;
	bool fallback = false;
	cpumask_var_t tmpmask;

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	/*
	 * Expedited membarrier commands guarantee that they won't
	 * block, hence the GFP_NOWAIT allocation flag and fallback
	 * implementation.
	 */
	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
		/* Fallback for OOM. */
		fallback = true;
	}

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;

		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
			continue;

		/*
		 * Skip the CPU if it runs a kernel thread. The scheduler
		 * leaves the prior task mm in place as an optimization when
		 * scheduling a kthread.
		 */
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p->flags & PF_KTHREAD)
			continue;

		if (!fallback)
			__cpumask_set_cpu(cpu, tmpmask);
		else
			smp_call_function_single(cpu, ipi_mb, NULL, 1);
	}
	rcu_read_unlock();
	if (!fallback) {
		preempt_disable();
		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
		preempt_enable();
		free_cpumask_var(tmpmask);
	}
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
	return 0;
}

static int membarrier_private_expedited(int flags)
{
	int cpu;
	bool fallback = false;
	cpumask_var_t tmpmask;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		if (!(atomic_read(&current->mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
			return -EPERM;
	} else {
		if (!(atomic_read(&current->mm->membarrier_state) &
		      MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
			return -EPERM;
	}

	if (num_online_cpus() == 1)
		return 0;

	/*
	 * Matches memory barriers around rq->curr modification in
	 * scheduler.
	 */
	smp_mb();	/* system call entry is not a mb. */

	/*
	 * Expedited membarrier commands guarantee that they won't
	 * block, hence the GFP_NOWAIT allocation flag and fallback
	 * implementation.
	 */
	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
		/* Fallback for OOM. */
		fallback = true;
	}

	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		/*
		 * Skipping the current CPU is OK even though we can be
		 * migrated at any point. The current CPU, at the point
		 * where we read raw_smp_processor_id(), is ensured to
		 * be in program order with respect to the caller
		 * thread. Therefore, we can skip this CPU from the
		 * iteration.
		 */
		if (cpu == raw_smp_processor_id())
			continue;
		p = rcu_dereference(cpu_rq(cpu)->curr);
		if (p && p->mm == current->mm) {
			if (!fallback)
				__cpumask_set_cpu(cpu, tmpmask);
			else
				smp_call_function_single(cpu, ipi_mb, NULL, 1);
		}
	}
	rcu_read_unlock();
	if (!fallback) {
		preempt_disable();
		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
		preempt_enable();
		free_cpumask_var(tmpmask);
	}
	cpus_read_unlock();

	/*
	 * Memory barrier on the caller thread _after_ we finished
	 * waiting for the last IPI. Matches memory barriers around
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */

	return 0;
}

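/*
 * Illustrative userspace sketch (assumption: not part of this file, shown
 * only to clarify the registration requirement enforced at the top of
 * membarrier_private_expedited()). A process must register with
 * MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED before
 * MEMBARRIER_CMD_PRIVATE_EXPEDITED succeeds; otherwise the command fails
 * with EPERM.
 */
#if 0
#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* Hypothetical helper: glibc provides no membarrier() wrapper. */
static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* Register once per process; repeating the registration returns 0. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
		perror("membarrier register");
		return 1;
	}
	/* Expedited barrier across all threads of the calling process. */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
		perror("membarrier");
		return 1;
	}
	return 0;
}
#endif
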
static int sync_runqueues_membarrier_state(struct mm_struct *mm)
{
	int membarrier_state = atomic_read(&mm->membarrier_state);
	cpumask_var_t tmpmask;
	int cpu;

	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
		this_cpu_write(runqueues.membarrier_state, membarrier_state);

		/*
		 * For single mm user, we can simply issue a memory barrier
		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
		 * mm and in the current runqueue to guarantee that no memory
		 * access following registration is reordered before
		 * registration.
		 */
		smp_mb();
		return 0;
	}

	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * For mm with multiple users, we need to ensure all future
	 * scheduler executions will observe @mm's new membarrier
	 * state.
	 */
	synchronize_rcu();

	/*
	 * For each cpu runqueue, if the task's mm matches @mm, ensure that
	 * all of @mm's membarrier state set bits are also set in the
	 * runqueue's membarrier state. This ensures that a runqueue
	 * scheduling between threads which are users of @mm has its
	 * membarrier state updated.
	 */
	cpus_read_lock();
	rcu_read_lock();
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
		struct task_struct *p;

		p = rcu_dereference(rq->curr);
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, tmpmask);
	}
	rcu_read_unlock();

	preempt_disable();
	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
	preempt_enable();

	free_cpumask_var(tmpmask);
	cpus_read_unlock();

	return 0;
}

static int membarrier_register_global_expedited(void)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ret;

	if (atomic_read(&mm->membarrier_state) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
		return 0;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
		  &mm->membarrier_state);

	return 0;
}

static int membarrier_register_private_expedited(int flags)
{
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
	    ret;

	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
			return -EINVAL;
		ready_state =
			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
	}

	/*
	 * We need to consider threads belonging to different thread
	 * groups, which use the same mm. (CLONE_VM but not
	 * CLONE_THREAD).
	 */
	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
		return 0;
	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
	atomic_or(set_state, &mm->membarrier_state);
	ret = sync_runqueues_membarrier_state(mm);
	if (ret)
		return ret;
	atomic_or(ready_state, &mm->membarrier_state);

	return 0;
}

/**
 * sys_membarrier - issue memory barriers on a set of threads
 * @cmd:   Takes command values defined in enum membarrier_cmd.
 * @flags: Currently needs to be 0. For future extensions.
 *
 * If this system call is not implemented, -ENOSYS is returned. If the
 * command specified does not exist, is not available on the running
 * kernel, or if the command argument is invalid, this system call
 * returns -EINVAL. For a given command, with flags argument set to 0,
 * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
 * always return the same value until reboot. In addition, it can return
 * -ENOMEM if there is not enough memory available to perform the system
 * call.
 *
 * All memory accesses performed in program order from each targeted thread
 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
 * the semantic "barrier()" to represent a compiler barrier forcing memory
 * accesses to be performed in program order across the barrier, and
 * smp_mb() to represent explicit memory barriers forcing full memory
 * ordering across the barrier, we have the following ordering table for
 * each pair of barrier(), sys_membarrier() and smp_mb():
 *
 * The pair ordering is detailed as (O: ordered, X: not ordered):
 *
 *                        barrier()   smp_mb() sys_membarrier()
 *        barrier()          X           X            O
 *        smp_mb()           X           O            O
 *        sys_membarrier()   O           O            O
 */
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
	if (unlikely(flags))
		return -EINVAL;
	switch (cmd) {
	case MEMBARRIER_CMD_QUERY:
	{
		int cmd_mask = MEMBARRIER_CMD_BITMASK;

		if (tick_nohz_full_enabled())
			cmd_mask &= ~MEMBARRIER_CMD_GLOBAL;
		return cmd_mask;
	}
	case MEMBARRIER_CMD_GLOBAL:
		/* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */
		if (tick_nohz_full_enabled())
			return -EINVAL;
		if (num_online_cpus() > 1)
			synchronize_rcu();
		return 0;
	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
		return membarrier_global_expedited();
	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
		return membarrier_register_global_expedited();
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
		return membarrier_private_expedited(0);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
		return membarrier_register_private_expedited(0);
	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
		return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
	default:
		return -EINVAL;
	}
}
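
/*
 * Illustrative userspace sketch (assumption: not part of this file, shown
 * only to clarify the command dispatch above). MEMBARRIER_CMD_QUERY returns
 * a bitmask of the commands supported by the running kernel; the call fails
 * with ENOSYS when the system call is not implemented, and
 * MEMBARRIER_CMD_GLOBAL is masked out when nohz_full is enabled.
 */
#if 0
#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	long mask = syscall(__NR_membarrier, MEMBARRIER_CMD_QUERY, 0);

	if (mask < 0) {
		perror("membarrier query");	/* e.g. ENOSYS */
		return 1;
	}
	if (mask & MEMBARRIER_CMD_GLOBAL)
		printf("MEMBARRIER_CMD_GLOBAL supported\n");
	if (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED)
		printf("MEMBARRIER_CMD_PRIVATE_EXPEDITED supported\n");
	return 0;
}
#endif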