srcu: Make rcutorture writer stalls print SRCU GP state
[linux-2.6-block.git] / kernel / rcu / srcutree.c
1/*
2 * Sleepable Read-Copy Update mechanism for mutual exclusion.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
20 *
21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
23 *
24 * For detailed explanation of Read-Copy Update mechanism see -
25 * Documentation/RCU/ *.txt
26 *
27 */
28
29#include <linux/export.h>
30#include <linux/mutex.h>
31#include <linux/percpu.h>
32#include <linux/preempt.h>
33#include <linux/rcupdate_wait.h>
34#include <linux/sched.h>
35#include <linux/smp.h>
36#include <linux/delay.h>
37#include <linux/srcu.h>
38
39#include "rcu.h"
40
41static void srcu_invoke_callbacks(struct work_struct *work);
42static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
43
44/*
45 * Initialize SRCU combining tree. Note that statically allocated
46 * srcu_struct structures might already have srcu_read_lock() and
47 * srcu_read_unlock() running against them. So if the is_static parameter
48 * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
49 */
50static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static)
 51{
52 int cpu;
53 int i;
54 int level = 0;
55 int levelspread[RCU_NUM_LVLS];
56 struct srcu_data *sdp;
57 struct srcu_node *snp;
58 struct srcu_node *snp_first;
59
60 /* Work out the overall tree geometry. */
61 sp->level[0] = &sp->node[0];
62 for (i = 1; i < rcu_num_lvls; i++)
63 sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1];
64 rcu_init_levelspread(levelspread, num_rcu_lvl);
65
66 /* Each pass through this loop initializes one srcu_node structure. */
67 rcu_for_each_node_breadth_first(sp, snp) {
68 spin_lock_init(&snp->lock);
69 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
70 ARRAY_SIZE(snp->srcu_data_have_cbs));
71 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
 72			snp->srcu_have_cbs[i] = 0;
73 snp->srcu_data_have_cbs[i] = 0;
74 }
75 snp->grplo = -1;
76 snp->grphi = -1;
77 if (snp == &sp->node[0]) {
78 /* Root node, special case. */
79 snp->srcu_parent = NULL;
80 continue;
81 }
82
83 /* Non-root node. */
84 if (snp == sp->level[level + 1])
85 level++;
86 snp->srcu_parent = sp->level[level - 1] +
87 (snp - sp->level[level]) /
88 levelspread[level - 1];
89 }
90
91 /*
92 * Initialize the per-CPU srcu_data array, which feeds into the
93 * leaves of the srcu_node tree.
94 */
95 WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
96 ARRAY_SIZE(sdp->srcu_unlock_count));
97 level = rcu_num_lvls - 1;
98 snp_first = sp->level[level];
99 for_each_possible_cpu(cpu) {
100 sdp = per_cpu_ptr(sp->sda, cpu);
101 spin_lock_init(&sdp->lock);
102 rcu_segcblist_init(&sdp->srcu_cblist);
103 sdp->srcu_cblist_invoking = false;
104 sdp->srcu_gp_seq_needed = sp->srcu_gp_seq;
105 sdp->mynode = &snp_first[cpu / levelspread[level]];
106 for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
107 if (snp->grplo < 0)
108 snp->grplo = cpu;
109 snp->grphi = cpu;
110 }
111 sdp->cpu = cpu;
112 INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks);
113 sdp->sp = sp;
 114		sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
115 if (is_static)
116 continue;
117
118 /* Dynamically allocated, better be no srcu_read_locks()! */
119 for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) {
120 sdp->srcu_lock_count[i] = 0;
121 sdp->srcu_unlock_count[i] = 0;
122 }
123 }
124}
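/*
 * For example, assuming a two-level tree whose leaf-level levelspread
 * is 16, the loop above sets sdp->mynode = &snp_first[cpu / 16], so
 * CPUs 0-15 share leaf 0, CPUs 16-31 share leaf 1, and each leaf's
 * ->grplo/->grphi end up bracketing exactly the CPUs assigned to it.
 */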
125
126/*
127 * Initialize non-compile-time initialized fields, including the
128 * associated srcu_node and srcu_data structures. The is_static
129 * parameter is passed through to init_srcu_struct_nodes(), and
130 * also tells us that ->sda has already been wired up to srcu_data.
131 */
132static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static)
133{
134 mutex_init(&sp->srcu_cb_mutex);
135 mutex_init(&sp->srcu_gp_mutex);
136 sp->srcu_idx = 0;
137 sp->srcu_gp_seq = 0;
138 atomic_set(&sp->srcu_exp_cnt, 0);
139 sp->srcu_barrier_seq = 0;
140 mutex_init(&sp->srcu_barrier_mutex);
141 atomic_set(&sp->srcu_barrier_cpu_cnt, 0);
 142	INIT_DELAYED_WORK(&sp->work, process_srcu);
143 if (!is_static)
144 sp->sda = alloc_percpu(struct srcu_data);
145 init_srcu_struct_nodes(sp, is_static);
146 smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */
147 return sp->sda ? 0 : -ENOMEM;
148}
149
150#ifdef CONFIG_DEBUG_LOCK_ALLOC
151
152int __init_srcu_struct(struct srcu_struct *sp, const char *name,
153 struct lock_class_key *key)
154{
155 /* Don't re-initialize a lock while it is held. */
156 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
157 lockdep_init_map(&sp->dep_map, name, key, 0);
158 spin_lock_init(&sp->gp_lock);
159 return init_srcu_struct_fields(sp, false);
160}
161EXPORT_SYMBOL_GPL(__init_srcu_struct);
162
163#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
164
165/**
166 * init_srcu_struct - initialize a sleep-RCU structure
167 * @sp: structure to initialize.
168 *
169 * Must invoke this on a given srcu_struct before passing that srcu_struct
170 * to any other function. Each srcu_struct represents a separate domain
171 * of SRCU protection.
172 */
173int init_srcu_struct(struct srcu_struct *sp)
174{
175 spin_lock_init(&sp->gp_lock);
176 return init_srcu_struct_fields(sp, false);
177}
178EXPORT_SYMBOL_GPL(init_srcu_struct);
179
180#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
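/*
 * Minimal usage sketch (hypothetical caller, not part of srcutree.c):
 * dynamically initialize an srcu_struct before first use, and clean it
 * up only after all readers and callbacks are done.  Statically
 * allocated domains would instead use DEFINE_SRCU()/DEFINE_STATIC_SRCU().
 */
#if 0	/* illustrative sketch only */
static struct srcu_struct my_srcu;		/* hypothetical domain */

static int my_subsys_init(void)
{
	return init_srcu_struct(&my_srcu);	/* 0 on success, -ENOMEM otherwise */
}

static void my_subsys_exit(void)
{
	cleanup_srcu_struct(&my_srcu);		/* no readers or pending CBs allowed */
}
#endif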
181
182/*
183 * First-use initialization of statically allocated srcu_struct
184 * structure. Wiring up the combining tree is more than can be
185 * done with compile-time initialization, so this check is added
186 * to each update-side SRCU primitive. Use ->gp_lock, which -is-
187 * compile-time initialized, to resolve races involving multiple
188 * CPUs trying to garner first-use privileges.
189 */
190static void check_init_srcu_struct(struct srcu_struct *sp)
191{
192 unsigned long flags;
193
194 WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT);
195 /* The smp_load_acquire() pairs with the smp_store_release(). */
196 if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/
197 return; /* Already initialized. */
198 spin_lock_irqsave(&sp->gp_lock, flags);
199 if (!rcu_seq_state(sp->srcu_gp_seq_needed)) {
200 spin_unlock_irqrestore(&sp->gp_lock, flags);
201 return;
202 }
203 init_srcu_struct_fields(sp, true);
204 spin_unlock_irqrestore(&sp->gp_lock, flags);
205}
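/*
 * The function above is ordinary double-checked initialization: a
 * lockless smp_load_acquire() fast path, then a recheck under ->gp_lock,
 * with init_srcu_struct_fields()'s smp_store_release() publishing the
 * fully initialized state.  A minimal generic sketch of the same idiom,
 * using hypothetical names:
 */
#if 0	/* illustrative sketch only */
struct lazy_thing {
	spinlock_t lock;			/* compile-time initialized */
	int ready;				/* nonzero once init is complete */
};

static void check_init_lazy_thing(struct lazy_thing *t)
{
	unsigned long flags;

	if (smp_load_acquire(&t->ready))	/* pairs with release below */
		return;				/* Already initialized. */
	spin_lock_irqsave(&t->lock, flags);
	if (!t->ready) {
		/* ... one-time initialization goes here ... */
		smp_store_release(&t->ready, 1);
	}
	spin_unlock_irqrestore(&t->lock, flags);
}
#endif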
206
207/*
208 * Returns approximate total of the readers' ->srcu_lock_count[] values
209 * for the rank of per-CPU counters specified by idx.
210 */
211static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx)
212{
213 int cpu;
214 unsigned long sum = 0;
215
216 for_each_possible_cpu(cpu) {
 217		struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
 218
 219		sum += READ_ONCE(cpuc->srcu_lock_count[idx]);
220 }
221 return sum;
222}
223
224/*
225 * Returns approximate total of the readers' ->srcu_unlock_count[] values
226 * for the rank of per-CPU counters specified by idx.
227 */
228static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx)
229{
230 int cpu;
231 unsigned long sum = 0;
232
233 for_each_possible_cpu(cpu) {
 234		struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
 235
 236		sum += READ_ONCE(cpuc->srcu_unlock_count[idx]);
237 }
238 return sum;
239}
240
241/*
242 * Return true if the number of pre-existing readers is determined to
243 * be zero.
244 */
245static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
246{
247 unsigned long unlocks;
248
249 unlocks = srcu_readers_unlock_idx(sp, idx);
250
251 /*
252 * Make sure that a lock is always counted if the corresponding
253 * unlock is counted. Needs to be a smp_mb() as the read side may
254 * contain a read from a variable that is written to before the
255 * synchronize_srcu() in the write side. In this case smp_mb()s
256 * A and B act like the store buffering pattern.
257 *
258 * This smp_mb() also pairs with smp_mb() C to prevent accesses
259 * after the synchronize_srcu() from being executed before the
260 * grace period ends.
261 */
262 smp_mb(); /* A */
263
264 /*
265 * If the locks are the same as the unlocks, then there must have
266 * been no readers on this index at some time in between. This does
267 * not mean that there are no more readers, as one could have read
268 * the current index but not have incremented the lock counter yet.
269 *
270 * Possible bug: There is no guarantee that there haven't been
 271	 * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were
272 * counted, meaning that this could return true even if there are
273 * still active readers. Since there are no memory barriers around
 274	 * srcu_flip(), the CPU is not required to increment ->srcu_idx
275 * before running srcu_readers_unlock_idx(), which means that there
276 * could be an arbitrarily large number of critical sections that
277 * execute after srcu_readers_unlock_idx() but use the old value
 278	 * of ->srcu_idx.
279 */
280 return srcu_readers_lock_idx(sp, idx) == unlocks;
281}
282
283/**
 284/**
 284 * srcu_readers_active - returns true if there are readers, and false
285 * otherwise
286 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
287 *
288 * Note that this is not an atomic primitive, and can therefore suffer
289 * severe errors when invoked on an active srcu_struct. That said, it
290 * can be useful as an error check at cleanup time.
291 */
292static bool srcu_readers_active(struct srcu_struct *sp)
293{
294 int cpu;
295 unsigned long sum = 0;
296
297 for_each_possible_cpu(cpu) {
 298		struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu);
 299
300 sum += READ_ONCE(cpuc->srcu_lock_count[0]);
301 sum += READ_ONCE(cpuc->srcu_lock_count[1]);
302 sum -= READ_ONCE(cpuc->srcu_unlock_count[0]);
303 sum -= READ_ONCE(cpuc->srcu_unlock_count[1]);
304 }
305 return sum;
306}
307
308#define SRCU_INTERVAL 1
309
310/**
311 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
312 * @sp: structure to clean up.
313 *
314 * Must invoke this after you are finished using a given srcu_struct that
315 * was initialized via init_srcu_struct(), else you leak memory.
316 */
317void cleanup_srcu_struct(struct srcu_struct *sp)
318{
319 int cpu;
320
321 WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt));
322 if (WARN_ON(srcu_readers_active(sp)))
323 return; /* Leakage unless caller handles error. */
 324	flush_delayed_work(&sp->work);
325 for_each_possible_cpu(cpu)
326 flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work);
327 if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
328 WARN_ON(srcu_readers_active(sp))) {
329 pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)));
330 return; /* Caller forgot to stop doing call_srcu()? */
331 }
332 free_percpu(sp->sda);
333 sp->sda = NULL;
334}
335EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
336
337/*
338 * Counts the new reader in the appropriate per-CPU element of the
339 * srcu_struct. Must be called from process context.
340 * Returns an index that must be passed to the matching srcu_read_unlock().
341 */
342int __srcu_read_lock(struct srcu_struct *sp)
343{
344 int idx;
345
346 idx = READ_ONCE(sp->srcu_idx) & 0x1;
347 __this_cpu_inc(sp->sda->srcu_lock_count[idx]);
348 smp_mb(); /* B */ /* Avoid leaking the critical section. */
349 return idx;
350}
351EXPORT_SYMBOL_GPL(__srcu_read_lock);
352
353/*
354 * Removes the count for the old reader from the appropriate per-CPU
355 * element of the srcu_struct. Note that this may well be a different
356 * CPU than that which was incremented by the corresponding srcu_read_lock().
357 * Must be called from process context.
358 */
359void __srcu_read_unlock(struct srcu_struct *sp, int idx)
360{
361 smp_mb(); /* C */ /* Avoid leaking the critical section. */
 362	this_cpu_inc(sp->sda->srcu_unlock_count[idx]);
363}
364EXPORT_SYMBOL_GPL(__srcu_read_unlock);
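/*
 * Reader-side usage sketch (hypothetical, not part of srcutree.c): the
 * public srcu_read_lock()/srcu_read_unlock() wrappers call the two
 * functions above, and srcu_dereference() fetches an SRCU-protected
 * pointer inside the resulting critical section.
 */
#if 0	/* illustrative sketch only */
struct my_config { int value; };		/* hypothetical payload */
DEFINE_STATIC_SRCU(cfg_srcu);			/* hypothetical domain */
static struct my_config __rcu *cfg_ptr;		/* hypothetical pointer */

static int read_config_value(void)
{
	struct my_config *cfg;
	int idx, val = -1;

	idx = srcu_read_lock(&cfg_srcu);
	cfg = srcu_dereference(cfg_ptr, &cfg_srcu);
	if (cfg)
		val = cfg->value;		/* readers may even sleep here */
	srcu_read_unlock(&cfg_srcu, idx);
	return val;
}
#endif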
365
366/*
367 * We use an adaptive strategy for synchronize_srcu() and especially for
368 * synchronize_srcu_expedited(). We spin for a fixed time period
369 * (defined below) to allow SRCU readers to exit their read-side critical
370 * sections. If there are still some readers after a few microseconds,
371 * we repeatedly block for 1-millisecond time periods.
372 */
373#define SRCU_RETRY_CHECK_DELAY 5
374
375/*
376 * Start an SRCU grace period.
377 */
378static void srcu_gp_start(struct srcu_struct *sp)
379{
 380	struct srcu_data *sdp = this_cpu_ptr(sp->sda);
381 int state;
382
383 RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock),
384 "Invoked srcu_gp_start() without ->gp_lock!");
385 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
386 rcu_segcblist_advance(&sdp->srcu_cblist,
387 rcu_seq_current(&sp->srcu_gp_seq));
388 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
389 rcu_seq_snap(&sp->srcu_gp_seq));
390 rcu_seq_start(&sp->srcu_gp_seq);
391 state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
392 WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
393}
394
395/*
396 * Track online CPUs to guide callback workqueue placement.
397 */
398DEFINE_PER_CPU(bool, srcu_online);
399
400void srcu_online_cpu(unsigned int cpu)
401{
402 WRITE_ONCE(per_cpu(srcu_online, cpu), true);
403}
404
405void srcu_offline_cpu(unsigned int cpu)
406{
407 WRITE_ONCE(per_cpu(srcu_online, cpu), false);
408}
409
410/*
411 * Place the workqueue handler on the specified CPU if online, otherwise
 412 * just run it wherever. This is useful for placing workqueue handlers
413 * that are to invoke the specified CPU's callbacks.
414 */
415static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
416 struct delayed_work *dwork,
417 unsigned long delay)
418{
419 bool ret;
420
421 preempt_disable();
422 if (READ_ONCE(per_cpu(srcu_online, cpu)))
423 ret = queue_delayed_work_on(cpu, wq, dwork, delay);
424 else
425 ret = queue_delayed_work(wq, dwork, delay);
426 preempt_enable();
427 return ret;
428}
429
430/*
431 * Schedule callback invocation for the specified srcu_data structure,
432 * if possible, on the corresponding CPU.
433 */
434static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
435{
436 srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq,
437 &sdp->work, delay);
438}
439
440/*
441 * Schedule callback invocation for all srcu_data structures associated
442 * with the specified srcu_node structure that have callbacks for the
443 * just-completed grace period, the one corresponding to idx. If possible,
444 * schedule this invocation on the corresponding CPUs.
 445 */
446static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp,
447 unsigned long mask)
448{
449 int cpu;
450
451 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
452 if (!(mask & (1 << (cpu - snp->grplo))))
453 continue;
454 srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu),
455 atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL);
 456	}
457}
458
459/*
460 * Note the end of an SRCU grace period. Initiates callback invocation
461 * and starts a new grace period if needed.
462 *
463 * The ->srcu_cb_mutex acquisition does not protect any data, but
464 * instead prevents more than one grace period from starting while we
465 * are initiating callback invocation. This allows the ->srcu_have_cbs[]
466 * array to have a finite number of elements.
467 */
468static void srcu_gp_end(struct srcu_struct *sp)
469{
470 bool cbs;
471 unsigned long gpseq;
472 int idx;
473 int idxnext;
 474	unsigned long mask;
475 struct srcu_node *snp;
476
477 /* Prevent more than one additional grace period. */
478 mutex_lock(&sp->srcu_cb_mutex);
479
480 /* End the current grace period. */
481 spin_lock_irq(&sp->gp_lock);
482 idx = rcu_seq_state(sp->srcu_gp_seq);
483 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
484 rcu_seq_end(&sp->srcu_gp_seq);
485 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
486 spin_unlock_irq(&sp->gp_lock);
487 mutex_unlock(&sp->srcu_gp_mutex);
488 /* A new grace period can start at this point. But only one. */
489
490 /* Initiate callback invocation as needed. */
491 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
492 idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
493 rcu_for_each_node_breadth_first(sp, snp) {
494 spin_lock_irq(&snp->lock);
495 cbs = false;
496 if (snp >= sp->level[rcu_num_lvls - 1])
497 cbs = snp->srcu_have_cbs[idx] == gpseq;
498 snp->srcu_have_cbs[idx] = gpseq;
499 rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
500 mask = snp->srcu_data_have_cbs[idx];
501 snp->srcu_data_have_cbs[idx] = 0;
502 spin_unlock_irq(&snp->lock);
503 if (cbs) {
504 smp_mb(); /* GP end before CB invocation. */
 505			srcu_schedule_cbs_snp(sp, snp, mask);
506 }
507 }
508
509 /* Callback initiation done, allow grace periods after next. */
510 mutex_unlock(&sp->srcu_cb_mutex);
511
512 /* Start a new grace period if needed. */
513 spin_lock_irq(&sp->gp_lock);
514 gpseq = rcu_seq_current(&sp->srcu_gp_seq);
515 if (!rcu_seq_state(gpseq) &&
516 ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) {
517 srcu_gp_start(sp);
518 spin_unlock_irq(&sp->gp_lock);
519 /* Throttle expedited grace periods: Should be rare! */
520 srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) &&
521 rcu_seq_ctr(gpseq) & 0xf
522 ? 0
523 : SRCU_INTERVAL);
524 } else {
525 spin_unlock_irq(&sp->gp_lock);
526 }
527}
528
529/*
530 * Funnel-locking scheme to scalably mediate many concurrent grace-period
531 * requests. The winner has to do the work of actually starting grace
532 * period s. Losers must either ensure that their desired grace-period
533 * number is recorded on at least their leaf srcu_node structure, or they
534 * must take steps to invoke their own callbacks.
535 */
536static void srcu_funnel_gp_start(struct srcu_struct *sp,
537 struct srcu_data *sdp,
538 unsigned long s)
539{
540 unsigned long flags;
541 int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
542 struct srcu_node *snp = sdp->mynode;
543 unsigned long snp_seq;
544
545 /* Each pass through the loop does one level of the srcu_node tree. */
546 for (; snp != NULL; snp = snp->srcu_parent) {
547 if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
548 return; /* GP already done and CBs recorded. */
549 spin_lock_irqsave(&snp->lock, flags);
550 if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
551 snp_seq = snp->srcu_have_cbs[idx];
552 if (snp == sdp->mynode && snp_seq == s)
553 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
554 spin_unlock_irqrestore(&snp->lock, flags);
555 if (snp == sdp->mynode && snp_seq != s) {
556 smp_mb(); /* CBs after GP! */
557 srcu_schedule_cbs_sdp(sdp, 0);
558 }
559 return;
560 }
561 snp->srcu_have_cbs[idx] = s;
562 if (snp == sdp->mynode)
563 snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
564 spin_unlock_irqrestore(&snp->lock, flags);
565 }
566
567 /* Top of tree, must ensure the grace period will be started. */
568 spin_lock_irqsave(&sp->gp_lock, flags);
569 if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
570 /*
571 * Record need for grace period s. Pair with load
572 * acquire setting up for initialization.
573 */
574 smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/
575 }
576
577 /* If grace period not already done and none in progress, start it. */
578 if (!rcu_seq_done(&sp->srcu_gp_seq, s) &&
579 rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
580 WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
581 srcu_gp_start(sp);
582 queue_delayed_work(system_power_efficient_wq, &sp->work,
583 atomic_read(&sp->srcu_exp_cnt)
584 ? 0
585 : SRCU_INTERVAL);
586 }
587 spin_unlock_irqrestore(&sp->gp_lock, flags);
588}
589
590/*
591 * Wait until all readers counted by array index idx complete, but
592 * loop an additional time if there is an expedited grace period pending.
 593 * The caller must ensure that ->srcu_idx is not changed while checking.
594 */
595static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
596{
597 for (;;) {
598 if (srcu_readers_active_idx_check(sp, idx))
599 return true;
600 if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0)
601 return false;
602 udelay(SRCU_RETRY_CHECK_DELAY);
603 }
604}
605
606/*
607 * Increment the ->srcu_idx counter so that future SRCU readers will
608 * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
609 * us to wait for pre-existing readers in a starvation-free manner.
610 */
611static void srcu_flip(struct srcu_struct *sp)
612{
 613	WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
614
615 /*
616 * Ensure that if the updater misses an __srcu_read_unlock()
617 * increment, that task's next __srcu_read_lock() will see the
618 * above counter update. Note that both this memory barrier
619 * and the one in srcu_readers_active_idx_check() provide the
620 * guarantee for __srcu_read_lock().
621 */
622 smp_mb(); /* D */ /* Pairs with C. */
623}
624
625/*
626 * Enqueue an SRCU callback on the srcu_data structure associated with
627 * the current CPU and the specified srcu_struct structure, initiating
628 * grace-period processing if it is not already running.
629 *
630 * Note that all CPUs must agree that the grace period extended beyond
 631 * all pre-existing SRCU read-side critical sections. On systems with
 632 * more than one CPU, this means that when "func()" is invoked, each CPU
 633 * is guaranteed to have executed a full memory barrier since the end of
 634 * its last corresponding SRCU read-side critical section whose beginning
 635 * preceded the call to call_srcu(). It also means that each CPU executing
 636 * an SRCU read-side critical section that continues beyond the start of
 637 * "func()" must have executed a memory barrier after the call_srcu()
638 * but before the beginning of that SRCU read-side critical section.
639 * Note that these guarantees include CPUs that are offline, idle, or
640 * executing in user mode, as well as CPUs that are executing in the kernel.
641 *
 642 * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
 643 * resulting SRCU callback function "func()", then both CPU A and CPU
 644 * B are guaranteed to execute a full memory barrier during the time
 645 * interval between the call to call_srcu() and the invocation of "func()".
646 * This guarantee applies even if CPU A and CPU B are the same CPU (but
647 * again only if the system has more than one CPU).
648 *
649 * Of course, these guarantees apply only for invocations of call_srcu(),
650 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
651 * srcu_struct structure.
652 */
 653void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
654 rcu_callback_t func)
655{
656 unsigned long flags;
657 bool needgp = false;
658 unsigned long s;
659 struct srcu_data *sdp;
660
661 check_init_srcu_struct(sp);
662 rhp->func = func;
663 local_irq_save(flags);
664 sdp = this_cpu_ptr(sp->sda);
665 spin_lock(&sdp->lock);
666 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
667 rcu_segcblist_advance(&sdp->srcu_cblist,
668 rcu_seq_current(&sp->srcu_gp_seq));
669 s = rcu_seq_snap(&sp->srcu_gp_seq);
670 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
671 if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
672 sdp->srcu_gp_seq_needed = s;
673 needgp = true;
 674	}
675 spin_unlock_irqrestore(&sdp->lock, flags);
676 if (needgp)
677 srcu_funnel_gp_start(sp, sdp, s);
678}
679EXPORT_SYMBOL_GPL(call_srcu);
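/*
 * Caller-side sketch (hypothetical): embed an rcu_head in the protected
 * structure, recover it in the callback with container_of(), and free
 * the structure once a grace period for the domain has elapsed.
 */
#if 0	/* illustrative sketch only */
struct my_node {				/* hypothetical structure */
	int key;
	struct rcu_head rh;
};

static void my_node_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct my_node, rh));
}

static void my_node_defer_free(struct srcu_struct *sp, struct my_node *p)
{
	/* Readers of @sp that might still reference @p finish first. */
	call_srcu(sp, &p->rh, my_node_free_cb);
}
#endif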
680
681/*
682 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
683 */
684static void __synchronize_srcu(struct srcu_struct *sp)
685{
686 struct rcu_synchronize rcu;
687
688 RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
689 lock_is_held(&rcu_bh_lock_map) ||
690 lock_is_held(&rcu_lock_map) ||
691 lock_is_held(&rcu_sched_lock_map),
692 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
693
694 if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
695 return;
696 might_sleep();
 697	check_init_srcu_struct(sp);
 698	init_completion(&rcu.completion);
699 init_rcu_head_on_stack(&rcu.head);
700 call_srcu(sp, &rcu.head, wakeme_after_rcu);
 701	wait_for_completion(&rcu.completion);
 702	destroy_rcu_head_on_stack(&rcu.head);
703}
704
705/**
706 * synchronize_srcu_expedited - Brute-force SRCU grace period
707 * @sp: srcu_struct with which to synchronize.
708 *
709 * Wait for an SRCU grace period to elapse, but be more aggressive about
710 * spinning rather than blocking when waiting.
711 *
712 * Note that synchronize_srcu_expedited() has the same deadlock and
713 * memory-ordering properties as does synchronize_srcu().
714 */
715void synchronize_srcu_expedited(struct srcu_struct *sp)
716{
717 bool do_norm = rcu_gp_is_normal();
718
 719	check_init_srcu_struct(sp);
720 if (!do_norm) {
721 atomic_inc(&sp->srcu_exp_cnt);
722 smp_mb__after_atomic(); /* increment before GP. */
723 }
724 __synchronize_srcu(sp);
725 if (!do_norm) {
726 smp_mb__before_atomic(); /* GP before decrement. */
 727		WARN_ON_ONCE(atomic_dec_return(&sp->srcu_exp_cnt) < 0);
728 }
729}
730EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
731
732/**
733 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
734 * @sp: srcu_struct with which to synchronize.
735 *
 736 * Wait for the counts of both index ranks to drain to zero. To avoid
 737 * possible starvation of synchronize_srcu(), it first waits for the
 738 * count of the index ((->srcu_idx & 1) ^ 1) to drain to zero,
 739 * and then flips ->srcu_idx and waits for the count of the other index.
740 *
741 * Can block; must be called from process context.
742 *
743 * Note that it is illegal to call synchronize_srcu() from the corresponding
744 * SRCU read-side critical section; doing so will result in deadlock.
745 * However, it is perfectly legal to call synchronize_srcu() on one
746 * srcu_struct from some other srcu_struct's read-side critical section,
747 * as long as the resulting graph of srcu_structs is acyclic.
748 *
749 * There are memory-ordering constraints implied by synchronize_srcu().
750 * On systems with more than one CPU, when synchronize_srcu() returns,
751 * each CPU is guaranteed to have executed a full memory barrier since
 752 * the end of its last corresponding SRCU read-side critical section
753 * whose beginning preceded the call to synchronize_srcu(). In addition,
754 * each CPU having an SRCU read-side critical section that extends beyond
755 * the return from synchronize_srcu() is guaranteed to have executed a
756 * full memory barrier after the beginning of synchronize_srcu() and before
757 * the beginning of that SRCU read-side critical section. Note that these
758 * guarantees include CPUs that are offline, idle, or executing in user mode,
759 * as well as CPUs that are executing in the kernel.
760 *
761 * Furthermore, if CPU A invoked synchronize_srcu(), which returned
762 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
763 * to have executed a full memory barrier during the execution of
764 * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
765 * are the same CPU, but again only if the system has more than one CPU.
766 *
767 * Of course, these memory-ordering guarantees apply only when
768 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
769 * passed the same srcu_struct structure.
770 */
771void synchronize_srcu(struct srcu_struct *sp)
772{
773 if (rcu_gp_is_expedited())
774 synchronize_srcu_expedited(sp);
775 else
776 __synchronize_srcu(sp);
777}
778EXPORT_SYMBOL_GPL(synchronize_srcu);
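/*
 * Updater-side sketch (hypothetical): unpublish an SRCU-protected
 * pointer, wait out pre-existing readers of the domain, then free the
 * old version.  This continues the hypothetical cfg_ptr/cfg_srcu reader
 * sketch shown after __srcu_read_unlock() above; cfg_lock is an assumed
 * update-side lock serializing replace_config() callers.
 */
#if 0	/* illustrative sketch only */
static void replace_config(struct my_config *newcfg)
{
	struct my_config *old;

	old = rcu_dereference_protected(cfg_ptr, lockdep_is_held(&cfg_lock));
	rcu_assign_pointer(cfg_ptr, newcfg);
	synchronize_srcu(&cfg_srcu);		/* wait for readers of "old" */
	kfree(old);
}
#endif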
779
780/*
781 * Callback function for srcu_barrier() use.
782 */
783static void srcu_barrier_cb(struct rcu_head *rhp)
784{
785 struct srcu_data *sdp;
786 struct srcu_struct *sp;
787
788 sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
789 sp = sdp->sp;
790 if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
791 complete(&sp->srcu_barrier_completion);
792}
793
794/**
795 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
796 * @sp: srcu_struct on which to wait for in-flight callbacks.
797 */
798void srcu_barrier(struct srcu_struct *sp)
799{
800 int cpu;
801 struct srcu_data *sdp;
802 unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq);
803
804 check_init_srcu_struct(sp);
805 mutex_lock(&sp->srcu_barrier_mutex);
806 if (rcu_seq_done(&sp->srcu_barrier_seq, s)) {
807 smp_mb(); /* Force ordering following return. */
808 mutex_unlock(&sp->srcu_barrier_mutex);
809 return; /* Someone else did our work for us. */
810 }
811 rcu_seq_start(&sp->srcu_barrier_seq);
812 init_completion(&sp->srcu_barrier_completion);
813
814 /* Initial count prevents reaching zero until all CBs are posted. */
815 atomic_set(&sp->srcu_barrier_cpu_cnt, 1);
816
817 /*
818 * Each pass through this loop enqueues a callback, but only
819 * on CPUs already having callbacks enqueued. Note that if
820 * a CPU already has callbacks enqueue, it must have already
821 * registered the need for a future grace period, so all we
822 * need do is enqueue a callback that will use the same
823 * grace period as the last callback already in the queue.
824 */
825 for_each_possible_cpu(cpu) {
826 sdp = per_cpu_ptr(sp->sda, cpu);
827 spin_lock_irq(&sdp->lock);
828 atomic_inc(&sp->srcu_barrier_cpu_cnt);
829 sdp->srcu_barrier_head.func = srcu_barrier_cb;
830 if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
831 &sdp->srcu_barrier_head, 0))
832 atomic_dec(&sp->srcu_barrier_cpu_cnt);
833 spin_unlock_irq(&sdp->lock);
834 }
835
836 /* Remove the initial count, at which point reaching zero can happen. */
837 if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
838 complete(&sp->srcu_barrier_completion);
839 wait_for_completion(&sp->srcu_barrier_completion);
840
841 rcu_seq_end(&sp->srcu_barrier_seq);
842 mutex_unlock(&sp->srcu_barrier_mutex);
843}
844EXPORT_SYMBOL_GPL(srcu_barrier);
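/*
 * Teardown-ordering sketch (hypothetical): callers must stop posting new
 * callbacks, then use srcu_barrier() to wait for the ones already
 * posted, and only then invoke cleanup_srcu_struct().
 */
#if 0	/* illustrative sketch only */
static void my_subsys_shutdown(struct srcu_struct *sp)
{
	/* 1. Prevent further call_srcu() invocations (subsystem-specific). */
	/* 2. Wait for all previously posted callbacks to be invoked. */
	srcu_barrier(sp);
	/* 3. Now the srcu_struct can safely be deconstructed. */
	cleanup_srcu_struct(sp);
}
#endif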
845
846/**
847 * srcu_batches_completed - return batches completed.
848 * @sp: srcu_struct on which to report batch completion.
849 *
850 * Report the number of batches, correlated with, but not necessarily
851 * precisely the same as, the number of grace periods that have elapsed.
852 */
853unsigned long srcu_batches_completed(struct srcu_struct *sp)
854{
 855	return sp->srcu_idx;
856}
857EXPORT_SYMBOL_GPL(srcu_batches_completed);
858
859/*
860 * Core SRCU state machine. Push state bits of ->srcu_gp_seq
861 * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
862 * completed in that state.
 863 */
 864static void srcu_advance_state(struct srcu_struct *sp)
865{
866 int idx;
867
868 mutex_lock(&sp->srcu_gp_mutex);
869
870 /*
871 * Because readers might be delayed for an extended period after
 872	 * fetching ->srcu_idx for their index, at any point in time there
873 * might well be readers using both idx=0 and idx=1. We therefore
874 * need to wait for readers to clear from both index values before
875 * invoking a callback.
876 *
877 * The load-acquire ensures that we see the accesses performed
878 * by the prior grace period.
879 */
880 idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
881 if (idx == SRCU_STATE_IDLE) {
882 spin_lock_irq(&sp->gp_lock);
883 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
884 WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
885 spin_unlock_irq(&sp->gp_lock);
886 mutex_unlock(&sp->srcu_gp_mutex);
887 return;
888 }
889 idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
890 if (idx == SRCU_STATE_IDLE)
891 srcu_gp_start(sp);
892 spin_unlock_irq(&sp->gp_lock);
893 if (idx != SRCU_STATE_IDLE) {
894 mutex_unlock(&sp->srcu_gp_mutex);
 895			return; /* Someone else started the grace period. */
 896		}
897 }
898
899 if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
900 idx = 1 ^ (sp->srcu_idx & 1);
901 if (!try_check_zero(sp, idx, 1)) {
902 mutex_unlock(&sp->srcu_gp_mutex);
 903			return; /* readers present, retry later. */
 904		}
905 srcu_flip(sp);
906 rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2);
907 }
908
909 if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
910
911 /*
912 * SRCU read-side critical sections are normally short,
913 * so check at least twice in quick succession after a flip.
914 */
915 idx = 1 ^ (sp->srcu_idx & 1);
916 if (!try_check_zero(sp, idx, 2)) {
917 mutex_unlock(&sp->srcu_gp_mutex);
918 return; /* readers present, retry later. */
919 }
920 srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */
921 }
922}
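/*
 * In summary, an uncontended grace period walks the state machine above
 * as follows: srcu_gp_start() enters SCAN1, which waits out readers on
 * the rank not currently used by new readers; srcu_flip() then switches
 * new readers onto that rank; SCAN2 waits out readers on the previously
 * active rank; and srcu_gp_end() completes ->srcu_gp_seq and schedules
 * callback invocation.
 */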
923
924/*
925 * Invoke a limited number of SRCU callbacks that have passed through
926 * their grace period. If there are more to do, SRCU will reschedule
927 * the workqueue. Note that needed memory barriers have been executed
928 * in this task's context by srcu_readers_active_idx_check().
929 */
 930static void srcu_invoke_callbacks(struct work_struct *work)
 931{
 932	bool more;
933 struct rcu_cblist ready_cbs;
934 struct rcu_head *rhp;
935 struct srcu_data *sdp;
936 struct srcu_struct *sp;
 937
938 sdp = container_of(work, struct srcu_data, work.work);
939 sp = sdp->sp;
 940	rcu_cblist_init(&ready_cbs);
941 spin_lock_irq(&sdp->lock);
942 smp_mb(); /* Old grace periods before callback invocation! */
943 rcu_segcblist_advance(&sdp->srcu_cblist,
944 rcu_seq_current(&sp->srcu_gp_seq));
945 if (sdp->srcu_cblist_invoking ||
946 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
947 spin_unlock_irq(&sdp->lock);
948 return; /* Someone else on the job or nothing to do. */
949 }
950
951 /* We are on the job! Extract and invoke ready callbacks. */
952 sdp->srcu_cblist_invoking = true;
953 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
954 spin_unlock_irq(&sdp->lock);
955 rhp = rcu_cblist_dequeue(&ready_cbs);
956 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
957 local_bh_disable();
958 rhp->func(rhp);
959 local_bh_enable();
960 }
961
962 /*
963 * Update counts, accelerate new callbacks, and if needed,
964 * schedule another round of callback invocation.
965 */
966 spin_lock_irq(&sdp->lock);
967 rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs);
968 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
969 rcu_seq_snap(&sp->srcu_gp_seq));
970 sdp->srcu_cblist_invoking = false;
971 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
972 spin_unlock_irq(&sdp->lock);
973 if (more)
974 srcu_schedule_cbs_sdp(sdp, 0);
975}
976
977/*
978 * Finished one round of SRCU grace period. Start another if there are
979 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
980 */
981static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
982{
 983	bool pushgp = true;
 984
985 spin_lock_irq(&sp->gp_lock);
986 if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
987 if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) {
988 /* All requests fulfilled, time to go idle. */
989 pushgp = false;
990 }
991 } else if (!rcu_seq_state(sp->srcu_gp_seq)) {
992 /* Outstanding request and no GP. Start one. */
993 srcu_gp_start(sp);
 994	}
 995	spin_unlock_irq(&sp->gp_lock);
 996
 997	if (pushgp)
998 queue_delayed_work(system_power_efficient_wq, &sp->work, delay);
999}
1000
1001/*
1002 * This is the work-queue function that handles SRCU grace periods.
1003 */
1004void process_srcu(struct work_struct *work)
1005{
1006 struct srcu_struct *sp;
1007
1008 sp = container_of(work, struct srcu_struct, work.work);
1009
 1010	srcu_advance_state(sp);
1011 srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL);
1012}
1013EXPORT_SYMBOL_GPL(process_srcu);
1014
1015void srcutorture_get_gp_data(enum rcutorture_type test_type,
1016 struct srcu_struct *sp, int *flags,
1017 unsigned long *gpnum,
1018 unsigned long *completed)
1019{
1020 if (test_type != SRCU_FLAVOR)
1021 return;
1022 *flags = 0;
1023 *completed = rcu_seq_ctr(sp->srcu_gp_seq);
1024 *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed);
1025}
1026EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);