// SPDX-License-Identifier: GPL-2.0+
/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright IBM Corporation, 2001
 *
 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
 *          Manfred Spraul <manfred@colorfullife.com>
 *
 * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 * Papers:
 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *              http://lse.sourceforge.net/locking/rcupdate.html
 *
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/export.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <linux/moduleparam.h>
#include <linux/kthread.h>
#include <linux/tick.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/isolation.h>
#include <linux/kprobes.h>

#define CREATE_TRACE_POINTS

#include "rcu.h"

#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcupdate."

#ifndef CONFIG_TINY_RCU
extern int rcu_expedited; /* from sysctl */
module_param(rcu_expedited, int, 0);
extern int rcu_normal; /* from sysctl */
module_param(rcu_normal, int, 0);
static int rcu_normal_after_boot;
module_param(rcu_normal_after_boot, int, 0);
#endif /* #ifndef CONFIG_TINY_RCU */

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
 * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
 *
 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
 * RCU-sched read-side critical section.  In absence of
 * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
 * critical section unless it can prove otherwise.  Note that disabling
 * of preemption (including disabling irqs) counts as an RCU-sched
 * read-side critical section.  This is useful for debug checks in functions
 * that require that they be called within an RCU-sched read-side
 * critical section.
 *
 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
 * and while lockdep is disabled.
 *
 * Note that if the CPU is in the idle loop from an RCU point of
 * view (that is, in the section between rcu_idle_enter() and
 * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
 * did an rcu_read_lock().  The reason for this is that RCU ignores CPUs
 * that are in such a section, considering these as in extended quiescent
 * state, so such a CPU is effectively never in an RCU read-side critical
 * section regardless of what RCU primitives it invokes.  This state of
 * affairs is required --- we need to keep an RCU-free window in idle
 * where the CPU may possibly enter into low power mode.  This way, CPUs
 * that have started a grace period can notice the extended quiescent
 * state.  Otherwise we would delay any grace period for as long as we
 * run in the idle task.
 *
 * Similarly, we avoid claiming an SRCU read lock held if the current
 * CPU is offline.
 */
int rcu_read_lock_sched_held(void)
{
        int lockdep_opinion = 0;

        if (!debug_lockdep_rcu_enabled())
                return 1;
        if (!rcu_is_watching())
                return 0;
        if (!rcu_lockdep_current_cpu_online())
                return 0;
        if (debug_locks)
                lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
        return lockdep_opinion || !preemptible();
}
EXPORT_SYMBOL(rcu_read_lock_sched_held);
#endif
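
/*
 * Illustrative sketch only (not part of this file): one common way for
 * callers to consume rcu_read_lock_sched_held() is in a lockdep-based
 * debug check.  The names example_data, example_gp, and example_peek()
 * below are hypothetical.
 */
#if 0   /* example, not compiled */
struct example_data {
        int val;
};
static struct example_data __rcu *example_gp;

static int example_peek(void)
{
        /* Under CONFIG_PROVE_RCU, complain if not in an RCU-sched reader. */
        RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
                         "example_peek() needs rcu_read_lock_sched() protection");
        return rcu_dereference_sched(example_gp)->val;
}
#endif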

#ifndef CONFIG_TINY_RCU

/*
 * Should expedited grace-period primitives always fall back to their
 * non-expedited counterparts?  Intended for use within RCU.  Note
 * that if the user specifies both rcu_expedited and rcu_normal, then
 * rcu_normal wins.  (Except during the window of boot time from when
 * the first task is spawned until the rcu_set_runtime_mode()
 * core_initcall() is invoked, during which everything is expedited.)
 */
bool rcu_gp_is_normal(void)
{
        return READ_ONCE(rcu_normal) &&
               rcu_scheduler_active != RCU_SCHEDULER_INIT;
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);

static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);

/*
 * Should normal grace-period primitives be expedited?  Intended for
 * use within RCU.  Note that this function takes the rcu_expedited
 * sysfs/boot variable and rcu_scheduler_active into account as well
 * as the rcu_expedite_gp() nesting.  So looping on rcu_unexpedite_gp()
 * until rcu_gp_is_expedited() returns false is a -really- bad idea.
 */
bool rcu_gp_is_expedited(void)
{
        return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
               rcu_scheduler_active == RCU_SCHEDULER_INIT;
}
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);

/**
 * rcu_expedite_gp - Expedite future RCU grace periods
 *
 * After a call to this function, future calls to synchronize_rcu() and
 * friends act as if the corresponding synchronize_rcu_expedited()
 * function had instead been called.
 */
void rcu_expedite_gp(void)
{
        atomic_inc(&rcu_expedited_nesting);
}
EXPORT_SYMBOL_GPL(rcu_expedite_gp);

/**
 * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation
 *
 * Undo a prior call to rcu_expedite_gp().  If all prior calls to
 * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(),
 * and if the rcu_expedited sysfs/boot parameter is not set, then all
 * subsequent calls to synchronize_rcu() and friends will return to
 * their normal non-expedited behavior.
 */
void rcu_unexpedite_gp(void)
{
        atomic_dec(&rcu_expedited_nesting);
}
EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
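
/*
 * Illustrative sketch only: rcu_expedite_gp() and rcu_unexpedite_gp()
 * nest, so a caller brackets a latency-sensitive update sequence with a
 * matched pair.  example_fast_reconfigure() is hypothetical.
 */
#if 0   /* example, not compiled */
static void example_fast_reconfigure(void)
{
        rcu_expedite_gp();      /* Future grace periods are expedited... */
        /* ... carry out the updates ... */
        synchronize_rcu();      /* ...so this behaves like synchronize_rcu_expedited(). */
        rcu_unexpedite_gp();    /* Normal behavior resumes once nesting drops to zero. */
}
#endif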

/*
 * Inform RCU of the end of the in-kernel boot sequence.
 */
void rcu_end_inkernel_boot(void)
{
        rcu_unexpedite_gp();
        if (rcu_normal_after_boot)
                WRITE_ONCE(rcu_normal, 1);
}

#endif /* #ifndef CONFIG_TINY_RCU */

/*
 * Test each non-SRCU synchronous grace-period wait API.  This is
 * useful just after a change in mode for these primitives, and
 * during early boot.
 */
void rcu_test_sync_prims(void)
{
        if (!IS_ENABLED(CONFIG_PROVE_RCU))
                return;
        synchronize_rcu();
        synchronize_rcu_expedited();
}

#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU)

/*
 * Switch to run-time mode once RCU has fully initialized.
 */
static int __init rcu_set_runtime_mode(void)
{
        rcu_test_sync_prims();
        rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
        rcu_test_sync_prims();
        return 0;
}
core_initcall(rcu_set_runtime_mode);

#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */

#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
        STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
EXPORT_SYMBOL_GPL(rcu_lock_map);

static struct lock_class_key rcu_bh_lock_key;
struct lockdep_map rcu_bh_lock_map =
        STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
EXPORT_SYMBOL_GPL(rcu_bh_lock_map);

static struct lock_class_key rcu_sched_lock_key;
struct lockdep_map rcu_sched_lock_map =
        STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
EXPORT_SYMBOL_GPL(rcu_sched_lock_map);

static struct lock_class_key rcu_callback_key;
struct lockdep_map rcu_callback_map =
        STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
EXPORT_SYMBOL_GPL(rcu_callback_map);

int notrace debug_lockdep_rcu_enabled(void)
{
        return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
               current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled);

/**
 * rcu_read_lock_held() - might we be in RCU read-side critical section?
 *
 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
 * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
 * this assumes we are in an RCU read-side critical section unless it can
 * prove otherwise.  This is useful for debug checks in functions that
 * require that they be called within an RCU read-side critical section.
 *
 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
 * and while lockdep is disabled.
 *
 * Note that rcu_read_lock() and the matching rcu_read_unlock() must
 * occur in the same context; for example, it is illegal to invoke
 * rcu_read_unlock() in process context if the matching rcu_read_lock()
 * was invoked from within an irq handler.
 *
 * Note that rcu_read_lock() is disallowed if the CPU is either idle or
 * offline from an RCU perspective, so check for those as well.
 */
int rcu_read_lock_held(void)
{
        if (!debug_lockdep_rcu_enabled())
                return 1;
        if (!rcu_is_watching())
                return 0;
        if (!rcu_lockdep_current_cpu_online())
                return 0;
        return lock_is_held(&rcu_lock_map);
}
EXPORT_SYMBOL_GPL(rcu_read_lock_held);
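
/*
 * Illustrative sketch only: rcu_read_lock_held() is typically consumed
 * through rcu_dereference_check(), which complains via lockdep unless at
 * least one of the listed conditions holds.  struct example_item,
 * example_lock, and example_gp are hypothetical.
 */
#if 0   /* example, not compiled */
struct example_item {
        int val;
};
static DEFINE_SPINLOCK(example_lock);
static struct example_item __rcu *example_gp;

static struct example_item *example_get(void)
{
        /* Legal under rcu_read_lock() or while holding example_lock. */
        return rcu_dereference_check(example_gp,
                                     rcu_read_lock_held() ||
                                     lockdep_is_held(&example_lock));
}
#endif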

/**
 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
 *
 * Check for bottom half being disabled, which covers both the
 * CONFIG_PROVE_RCU and !CONFIG_PROVE_RCU cases.  Note that if someone uses
 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
 * will show the situation.  This is useful for debug checks in functions
 * that require that they be called within an RCU read-side critical
 * section.
 *
 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
 *
 * Note that rcu_read_lock_bh() is disallowed if the CPU is either idle or
 * offline from an RCU perspective, so check for those as well.
 */
int rcu_read_lock_bh_held(void)
{
        if (!debug_lockdep_rcu_enabled())
                return 1;
        if (!rcu_is_watching())
                return 0;
        if (!rcu_lockdep_current_cpu_online())
                return 0;
        return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);

#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/**
 * wakeme_after_rcu() - Callback function to awaken a task after grace period
 * @head: Pointer to rcu_head member within rcu_synchronize structure
 *
 * Awaken the corresponding task now that a grace period has elapsed.
 */
void wakeme_after_rcu(struct rcu_head *head)
{
        struct rcu_synchronize *rcu;

        rcu = container_of(head, struct rcu_synchronize, head);
        complete(&rcu->completion);
}
EXPORT_SYMBOL_GPL(wakeme_after_rcu);

void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
                   struct rcu_synchronize *rs_array)
{
        int i;
        int j;

        /* Initialize and register callbacks for each crcu_array element. */
        for (i = 0; i < n; i++) {
                if (checktiny &&
                    (crcu_array[i] == call_rcu)) {
                        might_sleep();
                        continue;
                }
                init_rcu_head_on_stack(&rs_array[i].head);
                init_completion(&rs_array[i].completion);
                for (j = 0; j < i; j++)
                        if (crcu_array[j] == crcu_array[i])
                                break;
                if (j == i)
                        (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
        }

        /* Wait for all callbacks to be invoked. */
        for (i = 0; i < n; i++) {
                if (checktiny &&
                    (crcu_array[i] == call_rcu))
                        continue;
                for (j = 0; j < i; j++)
                        if (crcu_array[j] == crcu_array[i])
                                break;
                if (j == i)
                        wait_for_completion(&rs_array[i].completion);
                destroy_rcu_head_on_stack(&rs_array[i].head);
        }
}
EXPORT_SYMBOL_GPL(__wait_rcu_gp);
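
/*
 * Illustrative sketch only: the pattern that wakeme_after_rcu() and
 * __wait_rcu_gp() implement, reduced to a single RCU flavor.  Real code
 * would simply call synchronize_rcu() or wait_rcu_gp(call_rcu).
 */
#if 0   /* example, not compiled */
static void example_synchronize_rcu(void)
{
        struct rcu_synchronize rcu;

        init_rcu_head_on_stack(&rcu.head);
        init_completion(&rcu.completion);
        call_rcu(&rcu.head, wakeme_after_rcu);  /* Fires after a grace period. */
        wait_for_completion(&rcu.completion);   /* Block until the callback runs. */
        destroy_rcu_head_on_stack(&rcu.head);
}
#endif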

#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head)
{
        debug_object_init(head, &rcuhead_debug_descr);
}
EXPORT_SYMBOL_GPL(init_rcu_head);

void destroy_rcu_head(struct rcu_head *head)
{
        debug_object_free(head, &rcuhead_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_rcu_head);

static bool rcuhead_is_static_object(void *addr)
{
        return true;
}

/**
 * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
 * @head: pointer to rcu_head structure to be initialized
 *
 * This function informs debugobjects of a new rcu_head structure that
 * has been allocated as an auto variable on the stack.  This function
 * is not required for rcu_head structures that are statically defined or
 * that are dynamically allocated on the heap.  This function has no
 * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
 */
void init_rcu_head_on_stack(struct rcu_head *head)
{
        debug_object_init_on_stack(head, &rcuhead_debug_descr);
}
EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);

/**
 * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
 * @head: pointer to rcu_head structure that was previously initialized
 *
 * This function informs debugobjects that an on-stack rcu_head structure
 * is about to go out of scope.  As with init_rcu_head_on_stack(), this
 * function is not required for rcu_head structures that are statically
 * defined or that are dynamically allocated on the heap.  Also as with
 * init_rcu_head_on_stack(), this function has no effect for
 * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
 */
void destroy_rcu_head_on_stack(struct rcu_head *head)
{
        debug_object_free(head, &rcuhead_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);

struct debug_obj_descr rcuhead_debug_descr = {
        .name = "rcu_head",
        .is_static_object = rcuhead_is_static_object,
};
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
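
/*
 * Illustrative sketch only: rcu_head structures embedded in heap-allocated
 * or statically allocated objects need none of the on-stack helpers above.
 * The usual pattern is a container_of()-based reclaim callback as below
 * (struct example_node and its functions are hypothetical; a real user
 * would also need <linux/slab.h> for kfree()).
 */
#if 0   /* example, not compiled */
struct example_node {
        int key;
        struct rcu_head rh;
};

static void example_free_rcu(struct rcu_head *rhp)
{
        struct example_node *np = container_of(rhp, struct example_node, rh);

        kfree(np);
}

static void example_remove(struct example_node *np)
{
        /* Unlink np from its data structure first, then defer the free. */
        call_rcu(&np->rh, example_free_rcu);
}
#endif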

#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
                               unsigned long secs,
                               unsigned long c_old, unsigned long c)
{
        trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
}
EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#else
#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
        do { } while (0)
#endif

#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
/* Get rcutorture access to sched_setaffinity(). */
long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
        int ret;

        ret = sched_setaffinity(pid, in_mask);
        WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret);
        return ret;
}
EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
#endif

#ifdef CONFIG_RCU_STALL_COMMON
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
module_param(rcu_cpu_stall_suppress, int, 0644);
int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_timeout, int, 0644);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
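
/*
 * Illustrative note (an assumption about usage, not taken from this file):
 * because MODULE_PARAM_PREFIX is "rcupdate." and the stall parameters above
 * are declared with mode 0644, they can be set on the kernel command line,
 * for example "rcupdate.rcu_cpu_stall_suppress=1", or adjusted at run time
 * under /sys/module/rcupdate/parameters/.
 */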

#ifdef CONFIG_TASKS_RCU

/*
 * Simple variant of RCU whose quiescent states are voluntary context
 * switch, cond_resched_rcu_qs(), user-space execution, and idle.
 * As such, grace periods can take one good long time.  There are no
 * read-side primitives similar to rcu_read_lock() and rcu_read_unlock()
 * because this implementation is intended to get the system into a safe
 * state for some of the manipulations involved in tracing and the like.
 * Finally, this implementation does not support high call_rcu_tasks()
 * rates from multiple CPUs.  If this is required, per-CPU callback lists
 * will be needed.
 */

/* Global list of callbacks and associated lock. */
static struct rcu_head *rcu_tasks_cbs_head;
static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);

/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);

/* Control stall timeouts.  Disable with <= 0, otherwise jiffies till stall. */
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);

static struct task_struct *rcu_tasks_kthread_ptr;

/**
 * call_rcu_tasks() - Queue an RCU callback for invocation after a task-based grace period
 * @rhp: structure to be used for queueing the RCU updates.
 * @func: actual callback function to be invoked after the grace period
 *
 * The callback function will be invoked some time after a full grace
 * period elapses, in other words after all currently executing RCU
 * read-side critical sections have completed.  call_rcu_tasks() assumes
 * that the read-side critical sections end at a voluntary context
 * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle,
 * or transition to usermode execution.  As such, there are no read-side
 * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
 * this primitive is intended to determine that all tasks have passed
 * through a safe state, not so much for data-structure synchronization.
 *
 * See the description of call_rcu() for more detailed information on
 * memory ordering guarantees.
 */
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
        unsigned long flags;
        bool needwake;

        rhp->next = NULL;
        rhp->func = func;
        raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
        needwake = !rcu_tasks_cbs_head;
        *rcu_tasks_cbs_tail = rhp;
        rcu_tasks_cbs_tail = &rhp->next;
        raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
        /* We can't create the kthread unless interrupts are enabled. */
        if (needwake && READ_ONCE(rcu_tasks_kthread_ptr))
                wake_up(&rcu_tasks_cbs_wq);
}
EXPORT_SYMBOL_GPL(call_rcu_tasks);
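
/*
 * Illustrative sketch only: call_rcu_tasks() is aimed at freeing objects,
 * such as tracing trampolines, that a task might be executing in without
 * any explicit read-side marker.  struct example_tramp and its helpers
 * are hypothetical, and kfree() would require <linux/slab.h>.
 */
#if 0   /* example, not compiled */
struct example_tramp {
        void *text;             /* executable stub a task may be running in */
        struct rcu_head rh;
};

static void example_tramp_free(struct rcu_head *rhp)
{
        struct example_tramp *tp = container_of(rhp, struct example_tramp, rh);

        kfree(tp);
}

static void example_tramp_retire(struct example_tramp *tp)
{
        /*
         * Once every task has passed through a voluntary context switch,
         * idle, or userspace, no task can still be executing in tp->text.
         */
        call_rcu_tasks(&tp->rh, example_tramp_free);
}
#endif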

/**
 * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
 *
 * Control will return to the caller some time after a full rcu-tasks
 * grace period has elapsed, in other words after all currently
 * executing rcu-tasks read-side critical sections have completed.  These
 * read-side critical sections are delimited by calls to schedule(),
 * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls
 * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
 *
 * This is a very specialized primitive, intended only for a few uses in
 * tracing and other situations requiring manipulation of function
 * preambles and profiling hooks.  The synchronize_rcu_tasks() function
 * is not (yet) intended for heavy use from multiple CPUs.
 *
 * Note that this guarantee implies further memory-ordering guarantees.
 * On systems with more than one CPU, when synchronize_rcu_tasks() returns,
 * each CPU is guaranteed to have executed a full memory barrier since the
 * end of its last RCU-tasks read-side critical section whose beginning
 * preceded the call to synchronize_rcu_tasks().  In addition, each CPU
 * having an RCU-tasks read-side critical section that extends beyond
 * the return from synchronize_rcu_tasks() is guaranteed to have executed
 * a full memory barrier after the beginning of synchronize_rcu_tasks()
 * and before the beginning of that RCU-tasks read-side critical section.
 * Note that these guarantees include CPUs that are offline, idle, or
 * executing in user mode, as well as CPUs that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned
 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
 * to have executed a full memory barrier during the execution of
 * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU
 * (but again only if the system has more than one CPU).
 */
void synchronize_rcu_tasks(void)
{
        /* Complain if the scheduler has not started. */
        RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
                         "synchronize_rcu_tasks called too soon");

        /* Wait for the grace period. */
        wait_rcu_gp(call_rcu_tasks);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);

/**
 * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
 *
 * Although the current implementation is guaranteed to wait, it is not
 * obligated to, for example, if there are no pending callbacks.
 */
void rcu_barrier_tasks(void)
{
        /* There is only one callback queue, so this is easy.  ;-) */
        synchronize_rcu_tasks();
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);

/* See if tasks are still holding out, complain if so. */
static void check_holdout_task(struct task_struct *t,
                               bool needreport, bool *firstreport)
{
        int cpu;

        if (!READ_ONCE(t->rcu_tasks_holdout) ||
            t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
            !READ_ONCE(t->on_rq) ||
            (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
             !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
                WRITE_ONCE(t->rcu_tasks_holdout, false);
                list_del_init(&t->rcu_tasks_holdout_list);
                put_task_struct(t);
                return;
        }
        rcu_request_urgent_qs_task(t);
        if (!needreport)
                return;
        if (*firstreport) {
                pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
                *firstreport = false;
        }
        cpu = task_cpu(t);
        pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
                 t, ".I"[is_idle_task(t)],
                 "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
                 t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
                 t->rcu_tasks_idle_cpu, cpu);
        sched_show_task(t);
}

/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
static int __noreturn rcu_tasks_kthread(void *arg)
{
        unsigned long flags;
        struct task_struct *g, *t;
        unsigned long lastreport;
        struct rcu_head *list;
        struct rcu_head *next;
        LIST_HEAD(rcu_tasks_holdouts);
        int fract;

        /* Run on housekeeping CPUs by default.  Sysadm can move if desired. */
        housekeeping_affine(current, HK_FLAG_RCU);

        /*
         * Each pass through the following loop makes one check for
         * newly arrived callbacks, and, if there are some, waits for
         * one RCU-tasks grace period and then invokes the callbacks.
         * This loop is terminated by the system going down.  ;-)
         */
        for (;;) {

                /* Pick up any new callbacks. */
                raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
                list = rcu_tasks_cbs_head;
                rcu_tasks_cbs_head = NULL;
                rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
                raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);

                /* If there were none, wait a bit and start over. */
                if (!list) {
                        wait_event_interruptible(rcu_tasks_cbs_wq,
                                                 rcu_tasks_cbs_head);
                        if (!rcu_tasks_cbs_head) {
                                WARN_ON(signal_pending(current));
                                schedule_timeout_interruptible(HZ/10);
                        }
                        continue;
                }

                /*
                 * Wait for all pre-existing t->on_rq and t->nvcsw
                 * transitions to complete.  Invoking synchronize_rcu()
                 * suffices because all these transitions occur with
                 * interrupts disabled.  Without this synchronize_rcu(),
                 * a read-side critical section that started before the
                 * grace period might be incorrectly seen as having started
                 * after the grace period.
                 *
                 * This synchronize_rcu() also dispenses with the
                 * need for a memory barrier on the first store to
                 * ->rcu_tasks_holdout, as it forces the store to happen
                 * after the beginning of the grace period.
                 */
                synchronize_rcu();

                /*
                 * There were callbacks, so we need to wait for an
                 * RCU-tasks grace period.  Start off by scanning
                 * the task list for tasks that are not already
                 * voluntarily blocked.  Mark these tasks and make
                 * a list of them in rcu_tasks_holdouts.
                 */
                rcu_read_lock();
                for_each_process_thread(g, t) {
                        if (t != current && READ_ONCE(t->on_rq) &&
                            !is_idle_task(t)) {
                                get_task_struct(t);
                                t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
                                WRITE_ONCE(t->rcu_tasks_holdout, true);
                                list_add(&t->rcu_tasks_holdout_list,
                                         &rcu_tasks_holdouts);
                        }
                }
                rcu_read_unlock();

                /*
                 * Wait for tasks that are in the process of exiting.
                 * This does only part of the job, ensuring that all
                 * tasks that were previously exiting reach the point
                 * where they have disabled preemption, allowing the
                 * later synchronize_rcu() to finish the job.
                 */
                synchronize_srcu(&tasks_rcu_exit_srcu);

                /*
                 * Each pass through the following loop scans the list
                 * of holdout tasks, removing any that are no longer
                 * holdouts.  When the list is empty, we are done.
                 */
                lastreport = jiffies;

                /* Start off with HZ/10 wait and slowly back off to 1 HZ wait. */
                fract = 10;

                for (;;) {
                        bool firstreport;
                        bool needreport;
                        int rtst;
                        struct task_struct *t1;

                        if (list_empty(&rcu_tasks_holdouts))
                                break;

                        /* Slowly back off waiting for holdouts. */
                        schedule_timeout_interruptible(HZ/fract);

                        if (fract > 1)
                                fract--;

                        rtst = READ_ONCE(rcu_task_stall_timeout);
                        needreport = rtst > 0 &&
                                     time_after(jiffies, lastreport + rtst);
                        if (needreport)
                                lastreport = jiffies;
                        firstreport = true;
                        WARN_ON(signal_pending(current));
                        list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
                                                 rcu_tasks_holdout_list) {
                                check_holdout_task(t, needreport, &firstreport);
                                cond_resched();
                        }
                }

                /*
                 * Because ->on_rq and ->nvcsw are not guaranteed
                 * to have full memory barriers prior to them in the
                 * schedule() path, memory reordering on other CPUs could
                 * cause their RCU-tasks read-side critical sections to
                 * extend past the end of the grace period.  However,
                 * because these ->nvcsw updates are carried out with
                 * interrupts disabled, we can use synchronize_rcu()
                 * to force the needed ordering on all such CPUs.
                 *
                 * This synchronize_rcu() also confines all
                 * ->rcu_tasks_holdout accesses to be within the grace
                 * period, avoiding the need for memory barriers for
                 * ->rcu_tasks_holdout accesses.
                 *
                 * In addition, this synchronize_rcu() waits for exiting
                 * tasks to complete their final preempt_disable() region
                 * of execution, cleaning up after the synchronize_srcu()
                 * above.
                 */
                synchronize_rcu();

                /* Invoke the callbacks. */
                while (list) {
                        next = list->next;
                        local_bh_disable();
                        list->func(list);
                        local_bh_enable();
                        list = next;
                        cond_resched();
                }
                /* Paranoid sleep to keep this from entering a tight loop. */
                schedule_timeout_uninterruptible(HZ/10);
        }
}

/* Spawn rcu_tasks_kthread() at core_initcall() time. */
static int __init rcu_spawn_tasks_kthread(void)
{
        struct task_struct *t;

        t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
        if (WARN_ONCE(IS_ERR(t), "%s: Could not start Tasks-RCU grace-period kthread, OOM is now expected behavior\n", __func__))
                return 0;
        smp_mb(); /* Ensure others see full kthread. */
        WRITE_ONCE(rcu_tasks_kthread_ptr, t);
        return 0;
}
core_initcall(rcu_spawn_tasks_kthread);

/* Do the srcu_read_lock() for the above synchronize_srcu(). */
void exit_tasks_rcu_start(void)
{
        preempt_disable();
        current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
        preempt_enable();
}

/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
void exit_tasks_rcu_finish(void)
{
        preempt_disable();
        __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
        preempt_enable();
}

#endif /* #ifdef CONFIG_TASKS_RCU */

#ifndef CONFIG_TINY_RCU

/*
 * Print any non-default Tasks RCU settings.
 */
static void __init rcu_tasks_bootup_oddness(void)
{
#ifdef CONFIG_TASKS_RCU
        if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
                pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
        else
                pr_info("\tTasks RCU enabled.\n");
#endif /* #ifdef CONFIG_TASKS_RCU */
}

#endif /* #ifndef CONFIG_TINY_RCU */

#ifdef CONFIG_PROVE_RCU

/*
 * Early boot self test parameters.
 */
static bool rcu_self_test;
module_param(rcu_self_test, bool, 0444);

static int rcu_self_test_counter;

static void test_callback(struct rcu_head *r)
{
        rcu_self_test_counter++;
        pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
}

DEFINE_STATIC_SRCU(early_srcu);

static void early_boot_test_call_rcu(void)
{
        static struct rcu_head head;
        static struct rcu_head shead;

        call_rcu(&head, test_callback);
        if (IS_ENABLED(CONFIG_SRCU))
                call_srcu(&early_srcu, &shead, test_callback);
}

void rcu_early_boot_tests(void)
{
        pr_info("Running RCU self tests\n");

        if (rcu_self_test)
                early_boot_test_call_rcu();
        rcu_test_sync_prims();
}

static int rcu_verify_early_boot_tests(void)
{
        int ret = 0;
        int early_boot_test_counter = 0;

        if (rcu_self_test) {
                early_boot_test_counter++;
                rcu_barrier();
                if (IS_ENABLED(CONFIG_SRCU)) {
                        early_boot_test_counter++;
                        srcu_barrier(&early_srcu);
                }
        }
        if (rcu_self_test_counter != early_boot_test_counter) {
                WARN_ON(1);
                ret = -1;
        }

        return ret;
}
late_initcall(rcu_verify_early_boot_tests);
#else
void rcu_early_boot_tests(void) {}
#endif /* CONFIG_PROVE_RCU */

#ifndef CONFIG_TINY_RCU

/*
 * Print any significant non-default boot-time settings.
 */
void __init rcupdate_announce_bootup_oddness(void)
{
        if (rcu_normal)
                pr_info("\tNo expedited grace period (rcu_normal).\n");
        else if (rcu_normal_after_boot)
                pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n");
        else if (rcu_expedited)
                pr_info("\tAll grace periods are expedited (rcu_expedited).\n");
        if (rcu_cpu_stall_suppress)
                pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n");
        if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT)
                pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout);
        rcu_tasks_bootup_oddness();
}

#endif /* #ifndef CONFIG_TINY_RCU */