rcu: Improve srcu_readers_active_idx()'s cache locality
kernel/srcu.c

/*
 * Sleepable Read-Copy Update mechanism for mutual exclusion.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2006
 *
 * Author: Paul McKenney <paulmck@us.ibm.com>
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *	Documentation/RCU/ *.txt
 *
 */

#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/srcu.h>

static int init_srcu_struct_fields(struct srcu_struct *sp)
{
	sp->completed = 0;
	mutex_init(&sp->mutex);
	sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
	return sp->per_cpu_ref ? 0 : -ENOMEM;
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC

int __init_srcu_struct(struct srcu_struct *sp, const char *name,
		       struct lock_class_key *key)
{
	/* Don't re-initialize a lock while it is held. */
	debug_check_no_locks_freed((void *)sp, sizeof(*sp));
	lockdep_init_map(&sp->dep_map, name, key, 0);
	return init_srcu_struct_fields(sp);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/**
 * init_srcu_struct - initialize a sleep-RCU structure
 * @sp: structure to initialize.
 *
 * Must invoke this on a given srcu_struct before passing that srcu_struct
 * to any other function.  Each srcu_struct represents a separate domain
 * of SRCU protection.
 */
int init_srcu_struct(struct srcu_struct *sp)
{
	return init_srcu_struct_fields(sp);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/*
 * Returns approximate total of the readers' ->seq[] values for the
 * rank of per-CPU counters specified by idx.
 */
static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
{
	int cpu;
	unsigned long sum = 0;
	unsigned long t;

	for_each_possible_cpu(cpu) {
		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
		sum += t;
	}
	return sum;
}

/*
 * Returns approximate number of readers active on the specified rank
 * of the per-CPU ->c[] counters.
 */
static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
{
	int cpu;
	unsigned long sum = 0;
	unsigned long t;

	for_each_possible_cpu(cpu) {
		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
		sum += t;
	}
	return sum;
}

/*
 * Return true if the number of pre-existing readers is determined to
 * be stably zero.  An example unstable zero can occur if the call
 * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
 * but due to task migration, sees the corresponding __srcu_read_unlock()
 * decrement.  This can happen because srcu_readers_active_idx() takes
 * time to sum the array, and might in fact be interrupted or preempted
 * partway through the summation.
 */
static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
{
	unsigned long seq;

	seq = srcu_readers_seq_idx(sp, idx);

	/*
	 * The following smp_mb() A pairs with the smp_mb() B located in
	 * __srcu_read_lock().  This pairing ensures that if an
	 * __srcu_read_lock() increments its counter after the summation
	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
	 * critical section will see any changes made prior to the start
	 * of the current SRCU grace period.
	 *
	 * Also, if the above call to srcu_readers_seq_idx() saw the
	 * increment of ->seq[], then the call to srcu_readers_active_idx()
	 * must see the increment of ->c[].
	 */
	smp_mb(); /* A */

	/*
	 * Note that srcu_readers_active_idx() can incorrectly return
	 * zero even though there is a pre-existing reader throughout.
	 * To see this, suppose that task A is in a very long SRCU
	 * read-side critical section that started on CPU 0, and that
	 * no other reader exists, so that the sum of the counters
	 * is equal to one.  Then suppose that task B starts executing
	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
	 * task C starts reading on CPU 0, so that its increment is not
	 * summed, but finishes reading on CPU 2, so that its decrement
	 * -is- summed.  Then when task B completes its sum, it will
	 * incorrectly get zero, despite the fact that task A has been
	 * in its SRCU read-side critical section the whole time.
	 *
	 * We therefore do a validation step should srcu_readers_active_idx()
	 * return zero.
	 */
	if (srcu_readers_active_idx(sp, idx) != 0)
		return false;

	/*
	 * The remainder of this function is the validation step.
	 * The following smp_mb() D pairs with the smp_mb() C in
	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen
	 * by srcu_readers_active_idx() above, then any destructive
	 * operation performed after the grace period will happen after
	 * the corresponding SRCU read-side critical section.
	 *
	 * Note that there can be at most NR_CPUS worth of readers using
	 * the old index, which is not enough to overflow even a 32-bit
	 * integer.  (Yes, this does mean that systems having more than
	 * a billion or so CPUs need to be 64-bit systems.)  Therefore,
	 * the sum of the ->seq[] counters cannot possibly overflow.
	 * Therefore, the only way that the return values of the two
	 * calls to srcu_readers_seq_idx() can be equal is if there were
	 * no increments of the corresponding rank of ->seq[] counts
	 * in the interim.  But the missed-increment scenario laid out
	 * above includes an increment of the ->seq[] counter by
	 * the corresponding __srcu_read_lock().  Therefore, if this
	 * scenario occurs, the return values from the two calls to
	 * srcu_readers_seq_idx() will differ, and thus the validation
	 * step below suffices.
	 */
	smp_mb(); /* D */

	return srcu_readers_seq_idx(sp, idx) == seq;
}

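/*
 * Condensed timeline of the missed-increment scenario described in
 * srcu_readers_active_idx_check() above.  This is purely illustrative;
 * the task names and CPU numbers simply follow the prose example.
 *
 *	Task A (reader)		Task B (updater)	Task C (reader)
 *	---------------		----------------	---------------
 *	__srcu_read_lock()
 *	  on CPU 0: c[idx]++
 *				sums CPU 0 (sees +1)
 *				sums CPU 1 (sees 0)
 *							__srcu_read_lock() on
 *							  CPU 0: c[idx]++ and
 *							  seq[idx]++ (CPU 0
 *							  already summed)
 *							migrates to CPU 2
 *							__srcu_read_unlock() on
 *							  CPU 2: c[idx]--
 *				sums CPU 2 (sees -1)
 *				total of ->c[idx] == 0,
 *				yet task A still reads
 *
 * The ->seq[] recheck catches this case: task C's __srcu_read_lock()
 * also incremented ->seq[idx], so the second srcu_readers_seq_idx()
 * sum differs from the first and the check correctly returns false.
 */
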
/**
 * srcu_readers_active - returns approximate number of readers.
 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
 *
 * Note that this is not an atomic primitive, and can therefore suffer
 * severe errors when invoked on an active srcu_struct.  That said, it
 * can be useful as an error check at cleanup time.
 */
static int srcu_readers_active(struct srcu_struct *sp)
{
	int cpu;
	unsigned long sum = 0;

	for_each_possible_cpu(cpu) {
		sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]);
		sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]);
	}
	return sum;
}

/**
 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
 * @sp: structure to clean up.
 *
 * Must invoke this after you are finished using a given srcu_struct that
 * was initialized via init_srcu_struct(), else you leak memory.
 */
void cleanup_srcu_struct(struct srcu_struct *sp)
{
	int sum;

	sum = srcu_readers_active(sp);
	WARN_ON(sum);  /* Leakage unless caller handles error. */
	if (sum != 0)
		return;
	free_percpu(sp->per_cpu_ref);
	sp->per_cpu_ref = NULL;
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);

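/*
 * A minimal usage sketch of the init/cleanup API above.  This is not
 * part of srcu.c itself; my_srcu, my_init() and my_exit() are
 * hypothetical names, shown only to illustrate the expected lifecycle.
 * init_srcu_struct() returns 0 or -ENOMEM, and all readers and updaters
 * must be finished before cleanup_srcu_struct() is called.
 *
 *	static struct srcu_struct my_srcu;
 *
 *	static int __init my_init(void)
 *	{
 *		return init_srcu_struct(&my_srcu);
 *	}
 *
 *	static void __exit my_exit(void)
 *	{
 *		cleanup_srcu_struct(&my_srcu);
 *	}
 */
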
/*
 * Counts the new reader in the appropriate per-CPU element of the
 * srcu_struct.  Must be called from process context.
 * Returns an index that must be passed to the matching srcu_read_unlock().
 */
int __srcu_read_lock(struct srcu_struct *sp)
{
	int idx;

	preempt_disable();
	idx = rcu_dereference_index_check(sp->completed,
					  rcu_read_lock_sched_held()) & 0x1;
	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
	smp_mb(); /* B */  /* Avoid leaking the critical section. */
	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
	preempt_enable();
	return idx;
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);

/*
 * Removes the count for the old reader from the appropriate per-CPU
 * element of the srcu_struct.  Note that this may well be a different
 * CPU than that which was incremented by the corresponding srcu_read_lock().
 * Must be called from process context.
 */
void __srcu_read_unlock(struct srcu_struct *sp, int idx)
{
	preempt_disable();
	smp_mb(); /* C */  /* Avoid leaking the critical section. */
	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
	preempt_enable();
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);

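/*
 * A reader-side usage sketch.  Callers use the srcu_read_lock() and
 * srcu_read_unlock() wrappers declared in <linux/srcu.h>, which end up
 * in the two functions above; gp, my_srcu and do_something_with() are
 * hypothetical names, and srcu_dereference() is the pointer-fetch helper
 * from that same header.
 *
 *	int idx;
 *	struct foo *p;
 *
 *	idx = srcu_read_lock(&my_srcu);
 *	p = srcu_dereference(gp, &my_srcu);
 *	if (p)
 *		do_something_with(p);
 *	srcu_read_unlock(&my_srcu, idx);
 *
 * The index returned by srcu_read_lock() must be passed to the matching
 * srcu_read_unlock().  The critical section may sleep, but must not call
 * synchronize_srcu() on the same srcu_struct.
 */
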
/*
 * We use an adaptive strategy for synchronize_srcu() and especially for
 * synchronize_srcu_expedited().  We spin for a fixed time period
 * (defined below) to allow SRCU readers to exit their read-side critical
 * sections.  If there are still some readers after 10 microseconds,
 * we repeatedly block for 1-millisecond time periods.  This approach
 * has done well in testing, so there is no need for a config parameter.
 */
#define SYNCHRONIZE_SRCU_READER_DELAY 5

/*
 * Wait until all pre-existing readers complete.  Such readers
 * will have used the index specified by "idx".
 */
static void wait_idx(struct srcu_struct *sp, int idx, bool expedited)
{
	int trycount = 0;

	/*
	 * SRCU read-side critical sections are normally short, so wait
	 * a small amount of time before possibly blocking.
	 */
	if (!srcu_readers_active_idx_check(sp, idx)) {
		udelay(SYNCHRONIZE_SRCU_READER_DELAY);
		while (!srcu_readers_active_idx_check(sp, idx)) {
			if (expedited && ++trycount < 10)
				udelay(SYNCHRONIZE_SRCU_READER_DELAY);
			else
				schedule_timeout_interruptible(1);
		}
	}
}

static void srcu_flip(struct srcu_struct *sp)
{
	sp->completed++;
}

/*
 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
 */
static void __synchronize_srcu(struct srcu_struct *sp, bool expedited)
{
	int busy_idx;

	rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
			   !lock_is_held(&rcu_bh_lock_map) &&
			   !lock_is_held(&rcu_lock_map) &&
			   !lock_is_held(&rcu_sched_lock_map),
			   "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");

	mutex_lock(&sp->mutex);
	busy_idx = sp->completed & 0x1UL;

	/*
	 * If we recently flipped the index, there will be some readers
	 * using idx=0 and others using idx=1.  Therefore, two calls to
	 * wait_idx() suffice to ensure that all pre-existing readers
	 * have completed:
	 *
	 * __synchronize_srcu() {
	 *	wait_idx(sp, 0, expedited);
	 *	wait_idx(sp, 1, expedited);
	 * }
	 *
	 * Starvation is prevented by the fact that we flip the index.
	 * While we wait on one index to clear out, almost all new readers
	 * will be using the other index.  The number of new readers using the
	 * index we are waiting on is sharply bounded by roughly the number
	 * of CPUs.
	 *
	 * How can new readers possibly be using the old pre-flip value of
	 * the index?  Consider the following sequence of events:
	 *
	 * Suppose that during the previous grace period, a reader
	 * picked up the old value of the index, but did not increment
	 * its counter until after the previous instance of
	 * __synchronize_srcu() did the counter summation and recheck.
	 * That previous grace period was OK because the reader did
	 * not start until after the grace period started, so the grace
	 * period was not obligated to wait for that reader.
	 *
	 * However, this sequence of events is quite improbable, so
	 * this call to wait_idx(), which waits on the really old readers
	 * described in the comment above, will almost never need to wait.
	 */
	wait_idx(sp, 1 - busy_idx, expedited);

	/* Flip the index to avoid reader-induced starvation. */
	srcu_flip(sp);

	/* Wait for recent pre-existing readers. */
	wait_idx(sp, busy_idx, expedited);

	mutex_unlock(&sp->mutex);
}

/**
 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
 * @sp: srcu_struct with which to synchronize.
 *
 * Flip the completed counter, and wait for the old count to drain to zero.
 * As with classic RCU, the updater must use some separate means of
 * synchronizing concurrent updates.  Can block; must be called from
 * process context.
 *
 * Note that it is illegal to call synchronize_srcu() from the corresponding
 * SRCU read-side critical section; doing so will result in deadlock.
 * However, it is perfectly legal to call synchronize_srcu() on one
 * srcu_struct from some other srcu_struct's read-side critical section.
 */
void synchronize_srcu(struct srcu_struct *sp)
{
	__synchronize_srcu(sp, 0);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);

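/*
 * An updater-side usage sketch.  gp, my_srcu, my_lock, field and
 * new_value are hypothetical names; the pattern mirrors classic RCU
 * updates, with synchronize_srcu() standing in for synchronize_rcu().
 * Updaters must still serialize among themselves (here with my_lock),
 * since SRCU only orders updaters against readers.
 *
 *	struct foo *old_p, *new_p;
 *
 *	old_p = rcu_dereference_protected(gp, lockdep_is_held(&my_lock));
 *	new_p = kmalloc(sizeof(*new_p), GFP_KERNEL);
 *	if (!new_p)
 *		return -ENOMEM;
 *	*new_p = *old_p;
 *	new_p->field = new_value;
 *	rcu_assign_pointer(gp, new_p);
 *	synchronize_srcu(&my_srcu);
 *	kfree(old_p);
 *
 * The kfree() is safe only after synchronize_srcu() returns, because by
 * then all SRCU readers that might have seen old_p have completed.
 */
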
/**
 * synchronize_srcu_expedited - Brute-force SRCU grace period
 * @sp: srcu_struct with which to synchronize.
 *
 * Wait for an SRCU grace period to elapse, but be more aggressive about
 * spinning rather than blocking when waiting.
 *
 * Note that it is illegal to call this function while holding any lock
 * that is acquired by a CPU-hotplug notifier.  It is also illegal to call
 * synchronize_srcu_expedited() from the corresponding SRCU read-side
 * critical section; doing so will result in deadlock.  However, it is
 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
 * from some other srcu_struct's read-side critical section, as long as
 * the resulting graph of srcu_structs is acyclic.
 */
void synchronize_srcu_expedited(struct srcu_struct *sp)
{
	__synchronize_srcu(sp, 1);
}
EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);

/**
 * srcu_batches_completed - return batches completed.
 * @sp: srcu_struct on which to report batch completion.
 *
 * Report the number of batches, correlated with, but not necessarily
 * precisely the same as, the number of grace periods that have elapsed.
 */
long srcu_batches_completed(struct srcu_struct *sp)
{
	return sp->completed;
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);