Commit | Line | Data |
---|---|---|
b2441318 | 1 | // SPDX-License-Identifier: GPL-2.0 |
c4e05116 IM |
2 | /* kernel/rwsem.c: R/W semaphores, public implementation |
3 | * | |
4 | * Written by David Howells (dhowells@redhat.com). | |
5 | * Derived from asm-i386/semaphore.h | |
5dec94d4 WL |
6 | * |
7 | * Writer lock-stealing by Alex Shi <alex.shi@intel.com> | |
8 | * and Michel Lespinasse <walken@google.com> | |
9 | * | |
10 | * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> | |
11 | * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. | |
12 | * | |
4f23dbc1 WL |
13 | * Rwsem count bit fields re-definition and rwsem rearchitecture by |
14 | * Waiman Long <longman@redhat.com> and | |
15 | * Peter Zijlstra <peterz@infradead.org>. | |
c4e05116 IM |
16 | */ |
17 | ||
18 | #include <linux/types.h> | |
19 | #include <linux/kernel.h> | |
c7af77b5 | 20 | #include <linux/sched.h> |
5dec94d4 WL |
21 | #include <linux/sched/rt.h> |
22 | #include <linux/sched/task.h> | |
b17b0153 | 23 | #include <linux/sched/debug.h> |
5dec94d4 WL |
24 | #include <linux/sched/wake_q.h> |
25 | #include <linux/sched/signal.h> | |
7d43f1ce | 26 | #include <linux/sched/clock.h> |
9984de1a | 27 | #include <linux/export.h> |
c4e05116 | 28 | #include <linux/rwsem.h> |
60063497 | 29 | #include <linux/atomic.h> |
c4e05116 | 30 | |
5dec94d4 WL |
31 | #include "lock_events.h" |
32 | ||
33 | /* | |
7d43f1ce | 34 | * The least significant 3 bits of the owner value have the following |
5dec94d4 | 35 | * meanings when set. |
02f1082b | 36 | * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers |
7d43f1ce WL |
37 | * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock. |
38 | * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock. | |
5dec94d4 | 39 | * |
7d43f1ce WL |
40 | * When the rwsem is either owned by an anonymous writer, or it is |
41 | * reader-owned, but a spinning writer has timed out, both nonspinnable | |
42 | * bits will be set to disable optimistic spinning by readers and writers. | |
43 | * In the latter case, the last unlocking reader should then check the |
44 | * writer nonspinnable bit and clear only that bit, so that writers, but |
45 | * not readers, are again given preference to acquire the lock via |
46 | * optimistic spinning. A similar action is also done in the reader slowpath. |
47 | * |
5dec94d4 WL |
48 | * When a writer acquires a rwsem, it puts its task_struct pointer |
49 | * into the owner field. It is cleared after an unlock. | |
50 | * | |
51 | * When a reader acquires a rwsem, it will also put its task_struct |
7d43f1ce WL |
52 | * pointer into the owner field with the RWSEM_READER_OWNED bit set. |
53 | * On unlock, the owner field will largely be left untouched. So | |
54 | * for a free or reader-owned rwsem, the owner value may contain | |
55 | * information about the last reader that acquired the rwsem. |
5dec94d4 WL |
56 | * |
57 | * That information may be helpful in debugging cases where the system | |
58 | * seems to hang on a reader-owned rwsem, especially if only one reader |
59 | * is involved. Ideally we would like to track all the readers that own | |
60 | * a rwsem, but the overhead is simply too big. | |
5cfd92e1 WL |
61 | * |
62 | * Reader optimistic spinning is helpful when the reader critical section | |
63 | * is short and there aren't that many readers around. It makes readers | |
64 | * relatively more preferred than writers. When a writer times out spinning | |
65 | * on a reader-owned lock and sets the nonspinnable bits, there are two main |
66 | * reasons for that. | |
67 | * | |
68 | * 1) The reader critical section is long, perhaps the task sleeps after | |
69 | * acquiring the read lock. |
70 | * 2) There are just too many readers contending for the lock, causing it to |
71 | * take a while to service all of them. |
72 | * | |
73 | * In the former case, a long reader critical section will impede the progress |
74 | * of writers, which is usually more important for system performance. In |
75 | * the latter case, reader optimistic spinning tends to make the reader |
76 | * groups that contain readers that acquire the lock together smaller | |
77 | * leading to more of them. That may hurt performance in some cases. In | |
78 | * other words, the setting of nonspinnable bits indicates that reader | |
79 | * optimistic spinning may not be helpful for those workloads that cause | |
80 | * it. | |
81 | * | |
82 | * Therefore, any writers that have observed the setting of the writer |
83 | * nonspinnable bit for a given rwsem after they fail to acquire the lock | |
84 | * via optimistic spinning will set the reader nonspinnable bit once they | |
85 | * acquire the write lock. Similarly, readers that observe the setting | |
86 | * of the reader nonspinnable bit at slowpath entry will set the reader |
87 | * nonspinnable bits when they acquire the read lock via the wakeup path. | |
88 | * | |
89 | * Once the reader nonspinnable bit is on, it will only be reset when | |
90 | * a writer is able to acquire the rwsem in the fast path or somehow a | |
91 | * reader or writer in the slowpath doesn't observe the nonspinnable bit. |
92 | * | |
93 | * This is to discourage reader optimistic spinning on that particular |
94 | * rwsem and make writers more preferred. This adaptive disabling of reader | |
95 | * optimistic spinning will alleviate the negative side effect of this | |
96 | * feature. | |
5dec94d4 WL |
97 | */ |
98 | #define RWSEM_READER_OWNED (1UL << 0) | |
7d43f1ce WL |
99 | #define RWSEM_RD_NONSPINNABLE (1UL << 1) |
100 | #define RWSEM_WR_NONSPINNABLE (1UL << 2) | |
101 | #define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE) | |
02f1082b | 102 | #define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) |
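/*
 * As a rough illustration (the pointer value here is hypothetical): if a
 * task whose task_struct sits at 0xffff888004a30000 read-locks the rwsem,
 * the owner field holds 0xffff888004a30001, i.e. the task pointer with
 * bit 0 (RWSEM_READER_OWNED) set. If readers are later marked nonspinnable
 * as well, bit 1 is also set, giving 0xffff888004a30003. The task pointer
 * is recovered with (owner & ~RWSEM_OWNER_FLAGS_MASK) and the flags with
 * (owner & RWSEM_OWNER_FLAGS_MASK).
 */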
5dec94d4 WL |
103 | |
104 | #ifdef CONFIG_DEBUG_RWSEMS | |
105 | # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ | |
106 | if (!debug_locks_silent && \ | |
fce45cd4 | 107 | WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ |
5dec94d4 | 108 | #c, atomic_long_read(&(sem)->count), \ |
fce45cd4 | 109 | (unsigned long) sem->magic, \ |
94a9717b | 110 | atomic_long_read(&(sem)->owner), (long)current, \ |
5dec94d4 WL |
111 | list_empty(&(sem)->wait_list) ? "" : "not ")) \ |
112 | debug_locks_off(); \ | |
113 | } while (0) | |
114 | #else | |
115 | # define DEBUG_RWSEMS_WARN_ON(c, sem) | |
116 | #endif | |
117 | ||
118 | /* | |
a15ea1a3 | 119 | * On 64-bit architectures, the bit definitions of the count are: |
5dec94d4 | 120 | * |
a15ea1a3 WL |
121 | * Bit 0 - writer locked bit |
122 | * Bit 1 - waiters present bit | |
123 | * Bit 2 - lock handoff bit | |
124 | * Bits 3-7 - reserved | |
125 | * Bits 8-62 - 55-bit reader count | |
126 | * Bit 63 - read fail bit | |
127 | * | |
128 | * On 32-bit architectures, the bit definitions of the count are: | |
129 | * | |
130 | * Bit 0 - writer locked bit | |
131 | * Bit 1 - waiters present bit | |
132 | * Bit 2 - lock handoff bit | |
133 | * Bits 3-7 - reserved | |
134 | * Bits 8-30 - 23-bit reader count | |
135 | * Bit 31 - read fail bit | |
136 | * | |
137 | * It is not likely that the most significant bit (read fail bit) will ever | |
138 | * be set. This guard bit is still checked anyway in the down_read() fastpath | |
139 | * just in case we need to use up more of the reader bits for other purposes |
140 | * in the future. |
5dec94d4 WL |
141 | * |
142 | * atomic_long_fetch_add() is used to obtain reader lock, whereas | |
143 | * atomic_long_cmpxchg() will be used to obtain writer lock. | |
4f23dbc1 WL |
144 | * |
145 | * There are three places where the lock handoff bit may be set or cleared. | |
146 | * 1) rwsem_mark_wake() for readers. | |
147 | * 2) rwsem_try_write_lock() for writers. | |
148 | * 3) Error path of rwsem_down_write_slowpath(). | |
149 | * | |
150 | * For all the above cases, wait_lock will be held. A writer must also | |
151 | * be the first one in the wait_list to be eligible for setting the handoff | |
152 | * bit. So concurrent setting/clearing of handoff bit is not possible. | |
5dec94d4 WL |
153 | */ |
154 | #define RWSEM_WRITER_LOCKED (1UL << 0) | |
155 | #define RWSEM_FLAG_WAITERS (1UL << 1) | |
4f23dbc1 | 156 | #define RWSEM_FLAG_HANDOFF (1UL << 2) |
a15ea1a3 | 157 | #define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1)) |
4f23dbc1 | 158 | |
5dec94d4 WL |
159 | #define RWSEM_READER_SHIFT 8 |
160 | #define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) | |
161 | #define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) | |
162 | #define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED | |
163 | #define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) | |
4f23dbc1 | 164 | #define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\ |
a15ea1a3 | 165 | RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL) |
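/*
 * A few illustrative count values (assuming a 64-bit build, shown in hex):
 *   0x0   - unlocked (RWSEM_UNLOCKED_VALUE)
 *   0x1   - write-locked, no waiters
 *   0x300 - read-locked by 3 readers (3 * RWSEM_READER_BIAS)
 *   0x302 - read-locked by 3 readers, with waiters queued
 *   0x7   - write-locked, waiters queued, handoff requested
 */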
5dec94d4 WL |
166 | |
167 | /* | |
168 | * All writes to owner are protected by WRITE_ONCE() to make sure that | |
169 | * store tearing can't happen as optimistic spinners may read and use | |
170 | * the owner value concurrently without lock. Read from owner, however, | |
171 | * may not need READ_ONCE() as long as the pointer value is only used | |
172 | * for comparison and isn't being dereferenced. | |
173 | */ | |
174 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | |
175 | { | |
94a9717b | 176 | atomic_long_set(&sem->owner, (long)current); |
5dec94d4 WL |
177 | } |
178 | ||
179 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | |
180 | { | |
94a9717b WL |
181 | atomic_long_set(&sem->owner, 0); |
182 | } | |
183 | ||
184 | /* | |
185 | * Test the flags in the owner field. | |
186 | */ | |
187 | static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) | |
188 | { | |
189 | return atomic_long_read(&sem->owner) & flags; | |
5dec94d4 WL |
190 | } |
191 | ||
192 | /* | |
193 | * The task_struct pointer of the last owning reader will be left in | |
194 | * the owner field. | |
195 | * | |
196 | * Note that the owner value just indicates the task has owned the rwsem | |
197 | * previously; it may not be the real owner or one of the real owners |
198 | * anymore when that field is examined, so take it with a grain of salt. | |
5cfd92e1 WL |
199 | * |
200 | * The reader non-spinnable bit is preserved. | |
5dec94d4 WL |
201 | */ |
202 | static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, | |
203 | struct task_struct *owner) | |
204 | { | |
5cfd92e1 WL |
205 | unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | |
206 | (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE); | |
5dec94d4 | 207 | |
94a9717b | 208 | atomic_long_set(&sem->owner, val); |
5dec94d4 WL |
209 | } |
210 | ||
211 | static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) | |
212 | { | |
213 | __rwsem_set_reader_owned(sem, current); | |
214 | } | |
215 | ||
216 | /* | |
94a9717b | 217 | * Return true if the rwsem is owned by a reader. |
5dec94d4 | 218 | */ |
94a9717b | 219 | static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) |
5dec94d4 | 220 | { |
94a9717b WL |
221 | #ifdef CONFIG_DEBUG_RWSEMS |
222 | /* | |
223 | * Check the count to see if it is write-locked. | |
224 | */ | |
225 | long count = atomic_long_read(&sem->count); | |
226 | ||
227 | if (count & RWSEM_WRITER_MASK) | |
228 | return false; | |
229 | #endif | |
230 | return rwsem_test_oflags(sem, RWSEM_READER_OWNED); | |
5dec94d4 WL |
231 | } |
232 | ||
233 | #ifdef CONFIG_DEBUG_RWSEMS | |
234 | /* | |
235 | * With CONFIG_DEBUG_RWSEMS configured, this function makes sure that if there |
236 | * is a task pointer in the owner field of a reader-owned rwsem, it will be the |
237 | * real owner or one of the real owners. The only exception is when the | |
238 | * unlock is done by up_read_non_owner(). | |
239 | */ | |
240 | static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) | |
241 | { | |
94a9717b WL |
242 | unsigned long val = atomic_long_read(&sem->owner); |
243 | ||
244 | while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) { | |
245 | if (atomic_long_try_cmpxchg(&sem->owner, &val, | |
246 | val & RWSEM_OWNER_FLAGS_MASK)) | |
247 | return; | |
248 | } | |
5dec94d4 WL |
249 | } |
250 | #else | |
251 | static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) | |
252 | { | |
253 | } | |
254 | #endif | |
255 | ||
7d43f1ce WL |
256 | /* |
257 | * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag | |
258 | * remains set. Otherwise, the operation will be aborted. | |
259 | */ | |
260 | static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) | |
261 | { | |
262 | unsigned long owner = atomic_long_read(&sem->owner); | |
263 | ||
264 | do { | |
265 | if (!(owner & RWSEM_READER_OWNED)) | |
266 | break; | |
267 | if (owner & RWSEM_NONSPINNABLE) | |
268 | break; | |
269 | } while (!atomic_long_try_cmpxchg(&sem->owner, &owner, | |
270 | owner | RWSEM_NONSPINNABLE)); | |
271 | } | |
272 | ||
a15ea1a3 WL |
273 | static inline bool rwsem_read_trylock(struct rw_semaphore *sem) |
274 | { | |
275 | long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); | |
276 | if (WARN_ON_ONCE(cnt < 0)) | |
277 | rwsem_set_nonspinnable(sem); | |
278 | return !(cnt & RWSEM_READ_FAILED_MASK); | |
279 | } | |
280 | ||
94a9717b WL |
281 | /* |
282 | * Return just the real task structure pointer of the owner | |
283 | */ | |
284 | static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) | |
285 | { | |
286 | return (struct task_struct *) | |
287 | (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); | |
288 | } | |
289 | ||
290 | /* | |
291 | * Return the real task structure pointer of the owner and the embedded | |
292 | * flags in the owner. pflags must be non-NULL. | |
293 | */ | |
294 | static inline struct task_struct * | |
295 | rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags) | |
296 | { | |
297 | unsigned long owner = atomic_long_read(&sem->owner); | |
298 | ||
299 | *pflags = owner & RWSEM_OWNER_FLAGS_MASK; | |
300 | return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK); | |
301 | } | |
302 | ||
5dec94d4 WL |
303 | /* |
304 | * Guide to the rw_semaphore's count field. | |
305 | * | |
306 | * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned | |
307 | * by a writer. | |
308 | * | |
309 | * The lock is owned by readers when | |
310 | * (1) the RWSEM_WRITER_LOCKED isn't set in count, | |
311 | * (2) some of the reader bits are set in count, and | |
312 | * (3) the owner field has the RWSEM_READER_OWNED bit set. |
313 | * | |
314 | * Having some reader bits set is not enough to guarantee a reader-owned |
315 | * lock as the readers may be in the process of backing out from the count | |
316 | * and a writer has just released the lock. So another writer may steal | |
317 | * the lock immediately after that. | |
318 | */ | |
319 | ||
320 | /* | |
321 | * Initialize an rwsem: | |
322 | */ | |
323 | void __init_rwsem(struct rw_semaphore *sem, const char *name, | |
324 | struct lock_class_key *key) | |
325 | { | |
326 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | |
327 | /* | |
328 | * Make sure we are not reinitializing a held semaphore: | |
329 | */ | |
330 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); | |
de8f5e4f | 331 | lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); |
fce45cd4 DB |
332 | #endif |
333 | #ifdef CONFIG_DEBUG_RWSEMS | |
334 | sem->magic = sem; | |
5dec94d4 WL |
335 | #endif |
336 | atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); | |
337 | raw_spin_lock_init(&sem->wait_lock); | |
338 | INIT_LIST_HEAD(&sem->wait_list); | |
94a9717b | 339 | atomic_long_set(&sem->owner, 0L); |
5dec94d4 WL |
340 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
341 | osq_lock_init(&sem->osq); | |
342 | #endif | |
343 | } | |
5dec94d4 WL |
344 | EXPORT_SYMBOL(__init_rwsem); |
345 | ||
346 | enum rwsem_waiter_type { | |
347 | RWSEM_WAITING_FOR_WRITE, | |
348 | RWSEM_WAITING_FOR_READ | |
349 | }; | |
350 | ||
351 | struct rwsem_waiter { | |
352 | struct list_head list; | |
353 | struct task_struct *task; | |
354 | enum rwsem_waiter_type type; | |
4f23dbc1 | 355 | unsigned long timeout; |
5cfd92e1 | 356 | unsigned long last_rowner; |
5dec94d4 | 357 | }; |
4f23dbc1 WL |
358 | #define rwsem_first_waiter(sem) \ |
359 | list_first_entry(&sem->wait_list, struct rwsem_waiter, list) | |
5dec94d4 WL |
360 | |
361 | enum rwsem_wake_type { | |
362 | RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ | |
363 | RWSEM_WAKE_READERS, /* Wake readers only */ | |
364 | RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ | |
365 | }; | |
366 | ||
4f23dbc1 WL |
367 | enum writer_wait_state { |
368 | WRITER_NOT_FIRST, /* Writer is not first in wait list */ | |
369 | WRITER_FIRST, /* Writer is first in wait list */ | |
370 | WRITER_HANDOFF /* Writer is first & handoff needed */ | |
371 | }; | |
372 | ||
373 | /* | |
374 | * The typical HZ value is either 250 or 1000. So set the minimum waiting | |
375 | * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait | |
376 | * queue before initiating the handoff protocol. | |
377 | */ | |
378 | #define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250) | |
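/*
 * For instance: with HZ=1000 this evaluates to DIV_ROUND_UP(1000, 250) = 4
 * jiffies = 4ms; with HZ=250 it is 1 jiffy = 4ms; with HZ=100 it rounds up
 * to 1 jiffy = 10ms, so the timeout is never shorter than one jiffy.
 */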
379 | ||
d3681e26 WL |
380 | /* |
381 | * Magic number to batch-wakeup waiting readers, even when writers are | |
382 | * also present in the queue. This both limits the amount of work the | |
383 | * waking thread must do and also prevents any potential counter overflow, | |
384 | * however unlikely. | |
385 | */ | |
386 | #define MAX_READERS_WAKEUP 0x100 | |
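/*
 * For scale: 0x100 = 256 readers per wakeup call, so one call adds at most
 * 256 * RWSEM_READER_BIAS = 0x10000 to the count, far below the point at
 * which even the 23-bit reader count of a 32-bit build could overflow.
 */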
387 | ||
5dec94d4 WL |
388 | /* |
389 | * handle the lock release when processes blocked on it can now run |
390 | * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must | |
391 | * have been set. | |
392 | * - there must be someone on the queue | |
393 | * - the wait_lock must be held by the caller | |
394 | * - tasks are marked for wakeup, the caller must later invoke wake_up_q() | |
395 | * to actually wakeup the blocked task(s) and drop the reference count, | |
396 | * preferably when the wait_lock is released | |
397 | * - woken process blocks are discarded from the list after having task zeroed | |
398 | * - writers are only marked woken if downgrading is false | |
399 | */ | |
6cef7ff6 WL |
400 | static void rwsem_mark_wake(struct rw_semaphore *sem, |
401 | enum rwsem_wake_type wake_type, | |
402 | struct wake_q_head *wake_q) | |
5dec94d4 WL |
403 | { |
404 | struct rwsem_waiter *waiter, *tmp; | |
405 | long oldcount, woken = 0, adjustment = 0; | |
406 | struct list_head wlist; | |
407 | ||
4f23dbc1 WL |
408 | lockdep_assert_held(&sem->wait_lock); |
409 | ||
5dec94d4 WL |
410 | /* |
411 | * Take a peek at the queue head waiter such that we can determine | |
412 | * the wakeup(s) to perform. | |
413 | */ | |
4f23dbc1 | 414 | waiter = rwsem_first_waiter(sem); |
5dec94d4 WL |
415 | |
416 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { | |
417 | if (wake_type == RWSEM_WAKE_ANY) { | |
418 | /* | |
419 | * Mark writer at the front of the queue for wakeup. | |
420 | * Until the task is actually awoken later by |
421 | * the caller, other writers are able to steal it. | |
422 | * Readers, on the other hand, will block as they | |
423 | * will notice the queued writer. | |
424 | */ | |
425 | wake_q_add(wake_q, waiter->task); | |
426 | lockevent_inc(rwsem_wake_writer); | |
427 | } | |
428 | ||
429 | return; | |
430 | } | |
431 | ||
a15ea1a3 WL |
432 | /* |
433 | * No reader wakeup if there are too many of them already. | |
434 | */ | |
435 | if (unlikely(atomic_long_read(&sem->count) < 0)) | |
436 | return; | |
437 | ||
5dec94d4 WL |
438 | /* |
439 | * Writers might steal the lock before we grant it to the next reader. | |
440 | * We prefer to do the first reader grant before counting readers | |
441 | * so we can bail out early if a writer stole the lock. | |
442 | */ | |
443 | if (wake_type != RWSEM_WAKE_READ_OWNED) { | |
5cfd92e1 WL |
444 | struct task_struct *owner; |
445 | ||
5dec94d4 WL |
446 | adjustment = RWSEM_READER_BIAS; |
447 | oldcount = atomic_long_fetch_add(adjustment, &sem->count); | |
448 | if (unlikely(oldcount & RWSEM_WRITER_MASK)) { | |
4f23dbc1 WL |
449 | /* |
450 | * When we've been waiting "too" long (for writers | |
451 | * to give up the lock), request a HANDOFF to | |
452 | * force the issue. | |
453 | */ | |
454 | if (!(oldcount & RWSEM_FLAG_HANDOFF) && | |
455 | time_after(jiffies, waiter->timeout)) { | |
456 | adjustment -= RWSEM_FLAG_HANDOFF; | |
457 | lockevent_inc(rwsem_rlock_handoff); | |
458 | } | |
459 | ||
460 | atomic_long_add(-adjustment, &sem->count); | |
5dec94d4 WL |
461 | return; |
462 | } | |
463 | /* | |
464 | * Set it to reader-owned to give spinners an early | |
465 | * indication that readers now have the lock. | |
5cfd92e1 WL |
466 | * The reader nonspinnable bit seen at slowpath entry of |
467 | * the reader is copied over. | |
5dec94d4 | 468 | */ |
5cfd92e1 WL |
469 | owner = waiter->task; |
470 | if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) { | |
471 | owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE); | |
472 | lockevent_inc(rwsem_opt_norspin); | |
473 | } | |
474 | __rwsem_set_reader_owned(sem, owner); | |
5dec94d4 WL |
475 | } |
476 | ||
477 | /* | |
d3681e26 WL |
478 | * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the |
479 | * queue. We know that the number woken will be at least 1, as we accounted |
5dec94d4 WL |
480 | * for above. Note we increment the 'active part' of the count by the |
481 | * number of readers before waking any processes up. | |
482 | * | |
d3681e26 WL |
483 | * This is an adaptation of the phase-fair R/W locks where at the |
484 | * reader phase (first waiter is a reader), all readers are eligible | |
485 | * to acquire the lock at the same time irrespective of their order | |
486 | * in the queue. The writers acquire the lock according to their | |
487 | * order in the queue. | |
488 | * | |
5dec94d4 WL |
489 | * We have to do wakeup in 2 passes to prevent the possibility that |
490 | * the reader count may be decremented before it is incremented. It | |
491 | * is because the to-be-woken waiter may not have slept yet. So it | |
492 | * may see waiter->task got cleared, finish its critical section and | |
493 | * do an unlock before the reader count increment. | |
494 | * | |
495 | * 1) Collect the read-waiters in a separate list, count them and | |
496 | * fully increment the reader count in rwsem. | |
497 | * 2) For each waiter in the new list, clear waiter->task and |
498 | * put them into wake_q to be woken up later. | |
499 | */ | |
d3681e26 WL |
500 | INIT_LIST_HEAD(&wlist); |
501 | list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { | |
5dec94d4 | 502 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) |
d3681e26 | 503 | continue; |
5dec94d4 WL |
504 | |
505 | woken++; | |
d3681e26 WL |
506 | list_move_tail(&waiter->list, &wlist); |
507 | ||
508 | /* | |
509 | * Limit # of readers that can be woken up per wakeup call. | |
510 | */ | |
511 | if (woken >= MAX_READERS_WAKEUP) | |
512 | break; | |
5dec94d4 | 513 | } |
5dec94d4 WL |
514 | |
515 | adjustment = woken * RWSEM_READER_BIAS - adjustment; | |
516 | lockevent_cond_inc(rwsem_wake_reader, woken); | |
517 | if (list_empty(&sem->wait_list)) { | |
518 | /* hit end of list above */ | |
519 | adjustment -= RWSEM_FLAG_WAITERS; | |
520 | } | |
521 | ||
4f23dbc1 WL |
522 | /* |
523 | * When we've woken a reader, we no longer need to force writers | |
524 | * to give up the lock and we can clear HANDOFF. | |
525 | */ | |
526 | if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) | |
527 | adjustment -= RWSEM_FLAG_HANDOFF; | |
528 | ||
5dec94d4 WL |
529 | if (adjustment) |
530 | atomic_long_add(adjustment, &sem->count); | |
531 | ||
532 | /* 2nd pass */ | |
533 | list_for_each_entry_safe(waiter, tmp, &wlist, list) { | |
534 | struct task_struct *tsk; | |
535 | ||
536 | tsk = waiter->task; | |
537 | get_task_struct(tsk); | |
538 | ||
539 | /* | |
540 | * Ensure calling get_task_struct() before setting the reader | |
6cef7ff6 | 541 | * waiter to nil such that rwsem_down_read_slowpath() cannot |
5dec94d4 WL |
542 | * race with do_exit() by always holding a reference count |
543 | * to the task to wakeup. | |
544 | */ | |
545 | smp_store_release(&waiter->task, NULL); | |
546 | /* | |
547 | * Ensure issuing the wakeup (either by us or someone else) | |
548 | * after setting the reader waiter to nil. | |
549 | */ | |
550 | wake_q_add_safe(wake_q, tsk); | |
551 | } | |
552 | } | |
553 | ||
554 | /* | |
555 | * This function must be called with the sem->wait_lock held to prevent | |
556 | * race conditions between checking the rwsem wait list and setting the | |
557 | * sem->count accordingly. | |
4f23dbc1 WL |
558 | * |
559 | * If wstate is WRITER_HANDOFF, it will make sure that either the handoff | |
560 | * bit is set or the lock is acquired with handoff bit cleared. | |
5dec94d4 | 561 | */ |
00f3c5a3 | 562 | static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, |
4f23dbc1 | 563 | enum writer_wait_state wstate) |
5dec94d4 | 564 | { |
00f3c5a3 | 565 | long count, new; |
5dec94d4 | 566 | |
4f23dbc1 | 567 | lockdep_assert_held(&sem->wait_lock); |
5dec94d4 | 568 | |
00f3c5a3 | 569 | count = atomic_long_read(&sem->count); |
4f23dbc1 WL |
570 | do { |
571 | bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); | |
5dec94d4 | 572 | |
4f23dbc1 WL |
573 | if (has_handoff && wstate == WRITER_NOT_FIRST) |
574 | return false; | |
5dec94d4 | 575 | |
4f23dbc1 WL |
576 | new = count; |
577 | ||
578 | if (count & RWSEM_LOCK_MASK) { | |
579 | if (has_handoff || (wstate != WRITER_HANDOFF)) | |
580 | return false; | |
581 | ||
582 | new |= RWSEM_FLAG_HANDOFF; | |
583 | } else { | |
584 | new |= RWSEM_WRITER_LOCKED; | |
585 | new &= ~RWSEM_FLAG_HANDOFF; | |
586 | ||
587 | if (list_is_singular(&sem->wait_list)) | |
588 | new &= ~RWSEM_FLAG_WAITERS; | |
589 | } | |
590 | } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); | |
591 | ||
592 | /* | |
593 | * We have either acquired the lock with handoff bit cleared or | |
594 | * set the handoff bit. | |
595 | */ | |
596 | if (new & RWSEM_FLAG_HANDOFF) | |
597 | return false; | |
598 | ||
599 | rwsem_set_owner(sem); | |
600 | return true; | |
5dec94d4 WL |
601 | } |
602 | ||
603 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | |
cf69482d WL |
604 | /* |
605 | * Try to acquire read lock before the reader is put on wait queue. | |
606 | * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff | |
607 | * is ongoing. | |
608 | */ | |
609 | static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem) | |
610 | { | |
611 | long count = atomic_long_read(&sem->count); | |
612 | ||
613 | if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF)) | |
614 | return false; | |
615 | ||
616 | count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count); | |
617 | if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { | |
618 | rwsem_set_reader_owned(sem); | |
619 | lockevent_inc(rwsem_opt_rlock); | |
620 | return true; | |
621 | } | |
622 | ||
623 | /* Back out the change */ | |
624 | atomic_long_add(-RWSEM_READER_BIAS, &sem->count); | |
625 | return false; | |
626 | } | |
627 | ||
5dec94d4 WL |
628 | /* |
629 | * Try to acquire write lock before the writer has been put on wait queue. | |
630 | */ | |
631 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | |
632 | { | |
633 | long count = atomic_long_read(&sem->count); | |
634 | ||
4f23dbc1 | 635 | while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) { |
5dec94d4 | 636 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, |
4f23dbc1 | 637 | count | RWSEM_WRITER_LOCKED)) { |
5dec94d4 WL |
638 | rwsem_set_owner(sem); |
639 | lockevent_inc(rwsem_opt_wlock); | |
640 | return true; | |
641 | } | |
642 | } | |
643 | return false; | |
644 | } | |
645 | ||
646 | static inline bool owner_on_cpu(struct task_struct *owner) | |
647 | { | |
648 | /* | |
649 | * Due to the lock holder preemption issue, we skip spinning if the |
650 | * task is not on a CPU or its CPU is preempted. |
651 | */ | |
652 | return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); | |
653 | } | |
654 | ||
7d43f1ce WL |
655 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, |
656 | unsigned long nonspinnable) | |
5dec94d4 WL |
657 | { |
658 | struct task_struct *owner; | |
94a9717b | 659 | unsigned long flags; |
5dec94d4 WL |
660 | bool ret = true; |
661 | ||
cf69482d WL |
662 | if (need_resched()) { |
663 | lockevent_inc(rwsem_opt_fail); | |
5dec94d4 | 664 | return false; |
cf69482d | 665 | } |
5dec94d4 | 666 | |
cf69482d | 667 | preempt_disable(); |
5dec94d4 | 668 | rcu_read_lock(); |
94a9717b | 669 | owner = rwsem_owner_flags(sem, &flags); |
78134300 WL |
670 | /* |
671 | * Don't check the read-owner as the entry may be stale. | |
672 | */ | |
673 | if ((flags & nonspinnable) || | |
674 | (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) | |
94a9717b | 675 | ret = false; |
5dec94d4 | 676 | rcu_read_unlock(); |
cf69482d WL |
677 | preempt_enable(); |
678 | ||
679 | lockevent_cond_inc(rwsem_opt_fail, !ret); | |
5dec94d4 WL |
680 | return ret; |
681 | } | |
682 | ||
683 | /* | |
3f6d517a WL |
684 | * The rwsem_spin_on_owner() function returns the following 4 values |
685 | * depending on the lock owner state. | |
686 | * OWNER_NULL : owner is currently NULL | |
687 | * OWNER_WRITER: when owner changes and is a writer | |
688 | * OWNER_READER: when owner changes and the new owner may be a reader. | |
689 | * OWNER_NONSPINNABLE: | |
690 | * when optimistic spinning has to stop because either the | |
691 | * owner stops running, is unknown, or its timeslice has | |
692 | * been used up. | |
5dec94d4 | 693 | */ |
3f6d517a WL |
694 | enum owner_state { |
695 | OWNER_NULL = 1 << 0, | |
696 | OWNER_WRITER = 1 << 1, | |
697 | OWNER_READER = 1 << 2, | |
698 | OWNER_NONSPINNABLE = 1 << 3, | |
699 | }; | |
7d43f1ce | 700 | #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) |
3f6d517a | 701 | |
94a9717b | 702 | static inline enum owner_state |
7d43f1ce | 703 | rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable) |
5dec94d4 | 704 | { |
7d43f1ce | 705 | if (flags & nonspinnable) |
3f6d517a WL |
706 | return OWNER_NONSPINNABLE; |
707 | ||
94a9717b | 708 | if (flags & RWSEM_READER_OWNED) |
3f6d517a WL |
709 | return OWNER_READER; |
710 | ||
94a9717b | 711 | return owner ? OWNER_WRITER : OWNER_NULL; |
3f6d517a WL |
712 | } |
713 | ||
7d43f1ce WL |
714 | static noinline enum owner_state |
715 | rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) | |
3f6d517a | 716 | { |
94a9717b WL |
717 | struct task_struct *new, *owner; |
718 | unsigned long flags, new_flags; | |
719 | enum owner_state state; | |
3f6d517a | 720 | |
94a9717b | 721 | owner = rwsem_owner_flags(sem, &flags); |
7d43f1ce | 722 | state = rwsem_owner_state(owner, flags, nonspinnable); |
3f6d517a WL |
723 | if (state != OWNER_WRITER) |
724 | return state; | |
5dec94d4 WL |
725 | |
726 | rcu_read_lock(); | |
3f6d517a | 727 | for (;;) { |
91d2a812 WL |
728 | /* |
729 | * When a waiting writer sets the handoff flag, it may spin |
730 | * on the owner as well. Once that writer acquires the lock, | |
731 | * we can spin on it. So we don't need to quit even when the | |
732 | * handoff bit is set. | |
733 | */ | |
94a9717b WL |
734 | new = rwsem_owner_flags(sem, &new_flags); |
735 | if ((new != owner) || (new_flags != flags)) { | |
7d43f1ce | 736 | state = rwsem_owner_state(new, new_flags, nonspinnable); |
3f6d517a WL |
737 | break; |
738 | } | |
739 | ||
5dec94d4 WL |
740 | /* |
741 | * Ensure we emit the owner->on_cpu dereference _after_ |
742 | * checking sem->owner still matches owner, if that fails, | |
743 | * owner might point to free()d memory, if it still matches, | |
744 | * the rcu_read_lock() ensures the memory stays valid. | |
745 | */ | |
746 | barrier(); | |
747 | ||
5dec94d4 | 748 | if (need_resched() || !owner_on_cpu(owner)) { |
3f6d517a WL |
749 | state = OWNER_NONSPINNABLE; |
750 | break; | |
5dec94d4 WL |
751 | } |
752 | ||
753 | cpu_relax(); | |
754 | } | |
755 | rcu_read_unlock(); | |
756 | ||
3f6d517a | 757 | return state; |
5dec94d4 WL |
758 | } |
759 | ||
7d43f1ce WL |
760 | /* |
761 | * Calculate reader-owned rwsem spinning threshold for writer | |
762 | * | |
763 | * The more readers own the rwsem, the longer it will take for them to | |
764 | * wind down and free the rwsem. So the empirical formula used to | |
765 | * determine the actual spinning time limit here is: | |
766 | * | |
767 | * Spinning threshold = (10 + nr_readers/2)us | |
768 | * | |
769 | * The limit is capped to a maximum of 25us (30 readers). This is just | |
770 | * a heuristic and is subject to change in the future. |
771 | */ | |
772 | static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) | |
773 | { | |
774 | long count = atomic_long_read(&sem->count); | |
775 | int readers = count >> RWSEM_READER_SHIFT; | |
776 | u64 delta; | |
777 | ||
778 | if (readers > 30) | |
779 | readers = 30; | |
780 | delta = (20 + readers) * NSEC_PER_USEC / 2; | |
781 | ||
782 | return sched_clock() + delta; | |
783 | } | |
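/*
 * Worked example of the formula above: with 8 readers holding the lock,
 * delta = (20 + 8) * NSEC_PER_USEC / 2 = 14000ns, matching the
 * (10 + 8/2)us = 14us threshold; with 30 or more readers the cap gives
 * (20 + 30) * 1000 / 2 = 25000ns = 25us.
 */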
784 | ||
cf69482d | 785 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) |
5dec94d4 WL |
786 | { |
787 | bool taken = false; | |
990fa738 | 788 | int prev_owner_state = OWNER_NULL; |
7d43f1ce WL |
789 | int loop = 0; |
790 | u64 rspin_threshold = 0; | |
791 | unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE | |
792 | : RWSEM_RD_NONSPINNABLE; | |
5dec94d4 WL |
793 | |
794 | preempt_disable(); | |
795 | ||
796 | /* sem->wait_lock should not be held when doing optimistic spinning */ | |
5dec94d4 WL |
797 | if (!osq_lock(&sem->osq)) |
798 | goto done; | |
799 | ||
800 | /* | |
801 | * Optimistically spin on the owner field and attempt to acquire the | |
802 | * lock whenever the owner changes. Spinning will be stopped when: | |
803 | * 1) the owning writer isn't running; or | |
7d43f1ce | 804 | * 2) readers own the lock and spinning time has exceeded limit. |
5dec94d4 | 805 | */ |
990fa738 | 806 | for (;;) { |
7d43f1ce | 807 | enum owner_state owner_state; |
990fa738 | 808 | |
7d43f1ce | 809 | owner_state = rwsem_spin_on_owner(sem, nonspinnable); |
990fa738 WL |
810 | if (!(owner_state & OWNER_SPINNABLE)) |
811 | break; | |
812 | ||
5dec94d4 WL |
813 | /* |
814 | * Try to acquire the lock | |
815 | */ | |
cf69482d WL |
816 | taken = wlock ? rwsem_try_write_lock_unqueued(sem) |
817 | : rwsem_try_read_lock_unqueued(sem); | |
818 | ||
819 | if (taken) | |
5dec94d4 | 820 | break; |
5dec94d4 | 821 | |
7d43f1ce WL |
822 | /* |
823 | * Time-based reader-owned rwsem optimistic spinning | |
824 | */ | |
825 | if (wlock && (owner_state == OWNER_READER)) { | |
826 | /* | |
827 | * Re-initialize rspin_threshold whenever |
828 | * the owner state changes from non-reader to reader. | |
829 | * This allows a writer to steal the lock in between | |
830 | * 2 reader phases and have the threshold reset at | |
831 | * the beginning of the 2nd reader phase. | |
832 | */ | |
833 | if (prev_owner_state != OWNER_READER) { | |
834 | if (rwsem_test_oflags(sem, nonspinnable)) | |
835 | break; | |
836 | rspin_threshold = rwsem_rspin_threshold(sem); | |
837 | loop = 0; | |
838 | } | |
839 | ||
840 | /* | |
841 | * Check time threshold once every 16 iterations to | |
842 | * avoid calling sched_clock() too frequently so | |
843 | * as to reduce the average latency between the times | |
844 | * when the lock becomes free and when the spinner | |
845 | * is ready to do a trylock. | |
846 | */ | |
847 | else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) { | |
848 | rwsem_set_nonspinnable(sem); | |
849 | lockevent_inc(rwsem_opt_nospin); | |
850 | break; | |
851 | } | |
852 | } | |
853 | ||
5dec94d4 | 854 | /* |
990fa738 WL |
855 | * An RT task cannot do optimistic spinning if it cannot |
856 | * be sure the lock holder is running or live-lock may | |
857 | * happen if the current task and the lock holder happen | |
858 | * to run in the same CPU. However, aborting optimistic | |
859 | * spinning while a NULL owner is detected may miss some | |
860 | * opportunity where spinning can continue without causing | |
861 | * problem. | |
862 | * | |
863 | * There are 2 possible cases where an RT task may be able | |
864 | * to continue spinning. | |
865 | * | |
866 | * 1) The lock owner is in the process of releasing the | |
867 | * lock, sem->owner is cleared but the lock has not | |
868 | * been released yet. | |
869 | * 2) The lock was free and owner cleared, but another | |
870 | * task just comes in and acquires the lock before |
871 | * we try to get it. The new owner may be a spinnable | |
872 | * writer. | |
873 | * | |
874 | * To take advantage of the two scenarios listed above, the RT |
875 | * task is made to retry one more time to see if it can | |
876 | * acquire the lock or continue spinning on the new owning | |
877 | * writer. Of course, if the time lag is long enough or the | |
878 | * new owner is not a writer or spinnable, the RT task will | |
879 | * quit spinning. | |
880 | * | |
881 | * If the owner is a writer, the need_resched() check is | |
882 | * done inside rwsem_spin_on_owner(). If the owner is not | |
883 | * a writer, need_resched() check needs to be done here. | |
5dec94d4 | 884 | */ |
990fa738 WL |
885 | if (owner_state != OWNER_WRITER) { |
886 | if (need_resched()) | |
887 | break; | |
888 | if (rt_task(current) && | |
889 | (prev_owner_state != OWNER_WRITER)) | |
890 | break; | |
891 | } | |
892 | prev_owner_state = owner_state; | |
5dec94d4 WL |
893 | |
894 | /* | |
895 | * The cpu_relax() call is a compiler barrier which forces | |
896 | * everything in this loop to be re-loaded. We don't need | |
897 | * memory barriers as we'll eventually observe the right | |
898 | * values at the cost of a few extra spins. | |
899 | */ | |
900 | cpu_relax(); | |
901 | } | |
902 | osq_unlock(&sem->osq); | |
903 | done: | |
904 | preempt_enable(); | |
905 | lockevent_cond_inc(rwsem_opt_fail, !taken); | |
906 | return taken; | |
907 | } | |
7d43f1ce WL |
908 | |
909 | /* | |
910 | * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should | |
911 | * only be called when the reader count reaches 0. | |
912 | * | |
913 | * This gives writers a better chance to acquire the rwsem first before |
914 | * readers when the rwsem was being held by readers for a relatively long | |
915 | * period of time. A race can happen where an optimistic spinner may have |
916 | * just stolen the rwsem and set the owner, but just clearing the | |
917 | * RWSEM_WR_NONSPINNABLE bit will do no harm anyway. | |
918 | */ | |
919 | static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) | |
920 | { | |
921 | if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE)) | |
922 | atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner); | |
923 | } | |
5cfd92e1 WL |
924 | |
925 | /* | |
926 | * This function is called when the reader fails to acquire the lock via | |
927 | * optimistic spinning. In this case we will still attempt to do a trylock | |
928 | * if comparing the rwsem state right now with the state when entering |
929 | * the slowpath indicates that the reader is still in a valid reader phase. | |
930 | * This happens when the following conditions are true: | |
931 | * | |
932 | * 1) The lock is currently reader owned, and | |
933 | * 2) The lock was previously not reader-owned or the last read owner has changed. |
934 | * | |
935 | * In the former case, we have transitioned from a writer phase to a | |
936 | * reader-phase while spinning. In the latter case, it means the reader | |
937 | * phase hasn't ended when we entered the optimistic spinning loop. In | |
938 | * both cases, the reader is eligible to acquire the lock. This is the | |
939 | * secondary path where a read lock is acquired optimistically. | |
940 | * | |
941 | * The reader non-spinnable bit wasn't set at the time of entry, or we |
942 | * would not be here at all. |
943 | */ | |
944 | static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, | |
945 | unsigned long last_rowner) | |
946 | { | |
947 | unsigned long owner = atomic_long_read(&sem->owner); | |
948 | ||
949 | if (!(owner & RWSEM_READER_OWNED)) | |
950 | return false; | |
951 | ||
952 | if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) && | |
953 | rwsem_try_read_lock_unqueued(sem)) { | |
954 | lockevent_inc(rwsem_opt_rlock2); | |
955 | lockevent_add(rwsem_opt_fail, -1); | |
956 | return true; | |
957 | } | |
958 | return false; | |
959 | } | |
5dec94d4 | 960 | #else |
7d43f1ce WL |
961 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, |
962 | unsigned long nonspinnable) | |
cf69482d WL |
963 | { |
964 | return false; | |
965 | } | |
966 | ||
967 | static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) | |
5dec94d4 WL |
968 | { |
969 | return false; | |
970 | } | |
7d43f1ce WL |
971 | |
972 | static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { } | |
5cfd92e1 WL |
973 | |
974 | static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, | |
975 | unsigned long last_rowner) | |
976 | { | |
977 | return false; | |
978 | } | |
91d2a812 WL |
979 | |
980 | static inline int | |
981 | rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) | |
982 | { | |
983 | return 0; | |
984 | } | |
985 | #define OWNER_NULL 1 | |
5dec94d4 WL |
986 | #endif |
987 | ||
988 | /* | |
989 | * Wait for the read lock to be granted | |
990 | */ | |
6cef7ff6 WL |
991 | static struct rw_semaphore __sched * |
992 | rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) | |
5dec94d4 WL |
993 | { |
994 | long count, adjustment = -RWSEM_READER_BIAS; | |
995 | struct rwsem_waiter waiter; | |
996 | DEFINE_WAKE_Q(wake_q); | |
a15ea1a3 | 997 | bool wake = false; |
5dec94d4 | 998 | |
5cfd92e1 WL |
999 | /* |
1000 | * Save the current read-owner of rwsem, if available, and the | |
1001 | * reader nonspinnable bit. | |
1002 | */ | |
1003 | waiter.last_rowner = atomic_long_read(&sem->owner); | |
1004 | if (!(waiter.last_rowner & RWSEM_READER_OWNED)) | |
1005 | waiter.last_rowner &= RWSEM_RD_NONSPINNABLE; | |
1006 | ||
7d43f1ce | 1007 | if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE)) |
cf69482d WL |
1008 | goto queue; |
1009 | ||
1010 | /* | |
1011 | * Undo read bias from down_read() and do optimistic spinning. | |
1012 | */ | |
1013 | atomic_long_add(-RWSEM_READER_BIAS, &sem->count); | |
1014 | adjustment = 0; | |
1015 | if (rwsem_optimistic_spin(sem, false)) { | |
6ffddfb9 | 1016 | /* rwsem_optimistic_spin() implies ACQUIRE on success */ |
cf69482d WL |
1017 | /* |
1018 | * Wake up other readers in the wait list if the front | |
1019 | * waiter is a reader. | |
1020 | */ | |
1021 | if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) { | |
1022 | raw_spin_lock_irq(&sem->wait_lock); | |
1023 | if (!list_empty(&sem->wait_list)) | |
1024 | rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, | |
1025 | &wake_q); | |
1026 | raw_spin_unlock_irq(&sem->wait_lock); | |
1027 | wake_up_q(&wake_q); | |
1028 | } | |
1029 | return sem; | |
5cfd92e1 | 1030 | } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { |
6ffddfb9 | 1031 | /* rwsem_reader_phase_trylock() implies ACQUIRE on success */ |
5cfd92e1 | 1032 | return sem; |
cf69482d WL |
1033 | } |
1034 | ||
1035 | queue: | |
5dec94d4 WL |
1036 | waiter.task = current; |
1037 | waiter.type = RWSEM_WAITING_FOR_READ; | |
4f23dbc1 | 1038 | waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; |
5dec94d4 WL |
1039 | |
1040 | raw_spin_lock_irq(&sem->wait_lock); | |
1041 | if (list_empty(&sem->wait_list)) { | |
1042 | /* | |
1043 | * In case the wait queue is empty and the lock isn't owned | |
4f23dbc1 WL |
1044 | * by a writer and doesn't have the handoff bit set, this reader can |
1045 | * exit the slowpath and return immediately as its | |
1046 | * RWSEM_READER_BIAS has already been set in the count. | |
5dec94d4 | 1047 | */ |
cf69482d | 1048 | if (adjustment && !(atomic_long_read(&sem->count) & |
4f23dbc1 | 1049 | (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { |
e1b98fa3 JS |
1050 | /* Provide lock ACQUIRE */ |
1051 | smp_acquire__after_ctrl_dep(); | |
5dec94d4 WL |
1052 | raw_spin_unlock_irq(&sem->wait_lock); |
1053 | rwsem_set_reader_owned(sem); | |
1054 | lockevent_inc(rwsem_rlock_fast); | |
1055 | return sem; | |
1056 | } | |
1057 | adjustment += RWSEM_FLAG_WAITERS; | |
1058 | } | |
1059 | list_add_tail(&waiter.list, &sem->wait_list); | |
1060 | ||
1061 | /* we're now waiting on the lock, but no longer actively locking */ | |
cf69482d WL |
1062 | if (adjustment) |
1063 | count = atomic_long_add_return(adjustment, &sem->count); | |
1064 | else | |
1065 | count = atomic_long_read(&sem->count); | |
5dec94d4 WL |
1066 | |
1067 | /* | |
1068 | * If there are no active locks, wake the front queued process(es). | |
1069 | * | |
1070 | * If there are no writers and we are first in the queue, | |
1071 | * wake our own waiter to join the existing active readers! |
1072 | */ | |
7d43f1ce WL |
1073 | if (!(count & RWSEM_LOCK_MASK)) { |
1074 | clear_wr_nonspinnable(sem); | |
1075 | wake = true; | |
1076 | } | |
1077 | if (wake || (!(count & RWSEM_WRITER_MASK) && | |
1078 | (adjustment & RWSEM_FLAG_WAITERS))) | |
6cef7ff6 | 1079 | rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
5dec94d4 WL |
1080 | |
1081 | raw_spin_unlock_irq(&sem->wait_lock); | |
1082 | wake_up_q(&wake_q); | |
1083 | ||
1084 | /* wait to be given the lock */ | |
6ffddfb9 | 1085 | for (;;) { |
5dec94d4 | 1086 | set_current_state(state); |
99143f82 | 1087 | if (!smp_load_acquire(&waiter.task)) { |
6ffddfb9 | 1088 | /* Matches rwsem_mark_wake()'s smp_store_release(). */ |
5dec94d4 | 1089 | break; |
99143f82 | 1090 | } |
5dec94d4 WL |
1091 | if (signal_pending_state(state, current)) { |
1092 | raw_spin_lock_irq(&sem->wait_lock); | |
1093 | if (waiter.task) | |
1094 | goto out_nolock; | |
1095 | raw_spin_unlock_irq(&sem->wait_lock); | |
6ffddfb9 | 1096 | /* Ordered by sem->wait_lock against rwsem_mark_wake(). */ |
5dec94d4 WL |
1097 | break; |
1098 | } | |
1099 | schedule(); | |
1100 | lockevent_inc(rwsem_sleep_reader); | |
1101 | } | |
1102 | ||
1103 | __set_current_state(TASK_RUNNING); | |
1104 | lockevent_inc(rwsem_rlock); | |
1105 | return sem; | |
6ffddfb9 | 1106 | |
5dec94d4 WL |
1107 | out_nolock: |
1108 | list_del(&waiter.list); | |
4f23dbc1 WL |
1109 | if (list_empty(&sem->wait_list)) { |
1110 | atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, | |
1111 | &sem->count); | |
1112 | } | |
5dec94d4 WL |
1113 | raw_spin_unlock_irq(&sem->wait_lock); |
1114 | __set_current_state(TASK_RUNNING); | |
1115 | lockevent_inc(rwsem_rlock_fail); | |
1116 | return ERR_PTR(-EINTR); | |
1117 | } | |
1118 | ||
5cfd92e1 WL |
1119 | /* |
1120 | * This function is called by a write lock owner. So the owner value |
1121 | * won't get changed by others. | |
1122 | */ | |
1123 | static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem, | |
1124 | bool disable) | |
1125 | { | |
1126 | if (unlikely(disable)) { | |
1127 | atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner); | |
1128 | lockevent_inc(rwsem_opt_norspin); | |
1129 | } | |
1130 | } | |
1131 | ||
5dec94d4 WL |
1132 | /* |
1133 | * Wait until we successfully acquire the write lock | |
1134 | */ | |
6cef7ff6 WL |
1135 | static struct rw_semaphore * |
1136 | rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) | |
5dec94d4 WL |
1137 | { |
1138 | long count; | |
5cfd92e1 | 1139 | bool disable_rspin; |
4f23dbc1 | 1140 | enum writer_wait_state wstate; |
5dec94d4 WL |
1141 | struct rwsem_waiter waiter; |
1142 | struct rw_semaphore *ret = sem; | |
1143 | DEFINE_WAKE_Q(wake_q); | |
1144 | ||
1145 | /* do optimistic spinning and steal lock if possible */ | |
7d43f1ce | 1146 | if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && |
6ffddfb9 PZ |
1147 | rwsem_optimistic_spin(sem, true)) { |
1148 | /* rwsem_optimistic_spin() implies ACQUIRE on success */ | |
5dec94d4 | 1149 | return sem; |
6ffddfb9 | 1150 | } |
5dec94d4 | 1151 | |
5cfd92e1 WL |
1152 | /* |
1153 | * Disable reader optimistic spinning for this rwsem after | |
1154 | * acquiring the write lock when the nonspinnable bits are observed |
1155 | * to be set. |
1156 | */ | |
1157 | disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE; | |
1158 | ||
5dec94d4 WL |
1159 | /* |
1160 | * Optimistic spinning failed, proceed to the slowpath | |
1161 | * and block until we can acquire the sem. | |
1162 | */ | |
1163 | waiter.task = current; | |
1164 | waiter.type = RWSEM_WAITING_FOR_WRITE; | |
4f23dbc1 | 1165 | waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; |
5dec94d4 WL |
1166 | |
1167 | raw_spin_lock_irq(&sem->wait_lock); | |
1168 | ||
1169 | /* account for this before adding a new element to the list */ | |
4f23dbc1 | 1170 | wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; |
5dec94d4 WL |
1171 | |
1172 | list_add_tail(&waiter.list, &sem->wait_list); | |
1173 | ||
1174 | /* we're now waiting on the lock */ | |
4f23dbc1 | 1175 | if (wstate == WRITER_NOT_FIRST) { |
5dec94d4 WL |
1176 | count = atomic_long_read(&sem->count); |
1177 | ||
1178 | /* | |
4f23dbc1 WL |
1179 | * If there were already threads queued before us and: |
1180 | * 1) there are no active locks, wake the front |
1181 | * queued process(es) as the handoff bit might be set. | |
1182 | * 2) there are no active writers and some readers, the lock | |
1183 | * must be read owned; so we try to wake any read lock | |
1184 | * waiters that were queued ahead of us. | |
5dec94d4 | 1185 | */ |
4f23dbc1 WL |
1186 | if (count & RWSEM_WRITER_MASK) |
1187 | goto wait; | |
5dec94d4 | 1188 | |
4f23dbc1 WL |
1189 | rwsem_mark_wake(sem, (count & RWSEM_READER_MASK) |
1190 | ? RWSEM_WAKE_READERS | |
1191 | : RWSEM_WAKE_ANY, &wake_q); | |
5dec94d4 | 1192 | |
00f3c5a3 WL |
1193 | if (!wake_q_empty(&wake_q)) { |
1194 | /* | |
1195 | * We want to minimize wait_lock hold time especially | |
1196 | * when a large number of readers are to be woken up. | |
1197 | */ | |
1198 | raw_spin_unlock_irq(&sem->wait_lock); | |
1199 | wake_up_q(&wake_q); | |
1200 | wake_q_init(&wake_q); /* Used again, reinit */ | |
1201 | raw_spin_lock_irq(&sem->wait_lock); | |
1202 | } | |
5dec94d4 | 1203 | } else { |
00f3c5a3 | 1204 | atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); |
5dec94d4 WL |
1205 | } |
1206 | ||
4f23dbc1 | 1207 | wait: |
5dec94d4 WL |
1208 | /* wait until we successfully acquire the lock */ |
1209 | set_current_state(state); | |
6ffddfb9 PZ |
1210 | for (;;) { |
1211 | if (rwsem_try_write_lock(sem, wstate)) { | |
1212 | /* rwsem_try_write_lock() implies ACQUIRE on success */ | |
5dec94d4 | 1213 | break; |
6ffddfb9 | 1214 | } |
4f23dbc1 | 1215 | |
5dec94d4 WL |
1216 | raw_spin_unlock_irq(&sem->wait_lock); |
1217 | ||
91d2a812 WL |
1218 | /* |
1219 | * After setting the handoff bit and failing to acquire | |
1220 | * the lock, attempt to spin on owner to accelerate lock | |
1221 | * transfer. If the previous owner is an on-cpu writer and it |
1222 | * has just released the lock, OWNER_NULL will be returned. | |
1223 | * In this case, we attempt to acquire the lock again | |
1224 | * without sleeping. | |
1225 | */ | |
39e7234f WL |
1226 | if (wstate == WRITER_HANDOFF && |
1227 | rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL) | |
91d2a812 WL |
1228 | goto trylock_again; |
1229 | ||
5dec94d4 | 1230 | /* Block until there are no active lockers. */ |
4f23dbc1 | 1231 | for (;;) { |
5dec94d4 WL |
1232 | if (signal_pending_state(state, current)) |
1233 | goto out_nolock; | |
1234 | ||
1235 | schedule(); | |
1236 | lockevent_inc(rwsem_sleep_writer); | |
1237 | set_current_state(state); | |
4f23dbc1 WL |
1238 | /* |
1239 | * If HANDOFF bit is set, unconditionally do | |
1240 | * a trylock. | |
1241 | */ | |
1242 | if (wstate == WRITER_HANDOFF) | |
1243 | break; | |
1244 | ||
1245 | if ((wstate == WRITER_NOT_FIRST) && | |
1246 | (rwsem_first_waiter(sem) == &waiter)) | |
1247 | wstate = WRITER_FIRST; | |
1248 | ||
5dec94d4 | 1249 | count = atomic_long_read(&sem->count); |
4f23dbc1 WL |
1250 | if (!(count & RWSEM_LOCK_MASK)) |
1251 | break; | |
1252 | ||
1253 | /* | |
1254 | * The setting of the handoff bit is deferred | |
1255 | * until rwsem_try_write_lock() is called. | |
1256 | */ | |
1257 | if ((wstate == WRITER_FIRST) && (rt_task(current) || | |
1258 | time_after(jiffies, waiter.timeout))) { | |
1259 | wstate = WRITER_HANDOFF; | |
1260 | lockevent_inc(rwsem_wlock_handoff); | |
1261 | break; | |
1262 | } | |
1263 | } | |
91d2a812 | 1264 | trylock_again: |
5dec94d4 WL |
1265 | raw_spin_lock_irq(&sem->wait_lock); |
1266 | } | |
1267 | __set_current_state(TASK_RUNNING); | |
1268 | list_del(&waiter.list); | |
5cfd92e1 | 1269 | rwsem_disable_reader_optspin(sem, disable_rspin); |
5dec94d4 WL |
1270 | raw_spin_unlock_irq(&sem->wait_lock); |
1271 | lockevent_inc(rwsem_wlock); | |
1272 | ||
1273 | return ret; | |
1274 | ||
1275 | out_nolock: | |
1276 | __set_current_state(TASK_RUNNING); | |
1277 | raw_spin_lock_irq(&sem->wait_lock); | |
1278 | list_del(&waiter.list); | |
4f23dbc1 WL |
1279 | |
1280 | if (unlikely(wstate == WRITER_HANDOFF)) | |
1281 | atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); | |
1282 | ||
5dec94d4 WL |
1283 | if (list_empty(&sem->wait_list)) |
1284 | atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); | |
1285 | else | |
6cef7ff6 | 1286 | rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
5dec94d4 WL |
1287 | raw_spin_unlock_irq(&sem->wait_lock); |
1288 | wake_up_q(&wake_q); | |
1289 | lockevent_inc(rwsem_wlock_fail); | |
1290 | ||
1291 | return ERR_PTR(-EINTR); | |
1292 | } | |
1293 | ||
5dec94d4 WL |
1294 | /* |
1295 | * handle waking up a waiter on the semaphore | |
1296 | * - up_read/up_write has decremented the active part of count if we come here | |
1297 | */ | |
4f23dbc1 | 1298 | static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count) |
5dec94d4 WL |
1299 | { |
1300 | unsigned long flags; | |
1301 | DEFINE_WAKE_Q(wake_q); | |
1302 | ||
1303 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | |
1304 | ||
1305 | if (!list_empty(&sem->wait_list)) | |
6cef7ff6 | 1306 | rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
5dec94d4 WL |
1307 | |
1308 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | |
1309 | wake_up_q(&wake_q); | |
1310 | ||
1311 | return sem; | |
1312 | } | |
5dec94d4 WL |
1313 | |
1314 | /* | |
1315 | * downgrade a write lock into a read lock | |
1316 | * - caller incremented waiting part of count and discovered it still negative | |
1317 | * - just wake up any readers at the front of the queue | |
1318 | */ | |
6cef7ff6 | 1319 | static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) |
5dec94d4 WL |
1320 | { |
1321 | unsigned long flags; | |
1322 | DEFINE_WAKE_Q(wake_q); | |
1323 | ||
1324 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | |
1325 | ||
1326 | if (!list_empty(&sem->wait_list)) | |
6cef7ff6 | 1327 | rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); |
5dec94d4 WL |
1328 | |
1329 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | |
1330 | wake_up_q(&wake_q); | |
1331 | ||
1332 | return sem; | |
1333 | } | |
5dec94d4 WL |
1334 | |
1335 | /* | |
1336 | * lock for reading | |
1337 | */ | |
7f26482a | 1338 | static inline void __down_read(struct rw_semaphore *sem) |
5dec94d4 | 1339 | { |
a15ea1a3 | 1340 | if (!rwsem_read_trylock(sem)) { |
6cef7ff6 | 1341 | rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); |
94a9717b | 1342 | DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); |
5dec94d4 WL |
1343 | } else { |
1344 | rwsem_set_reader_owned(sem); | |
1345 | } | |
1346 | } | |
1347 | ||
1348 | static inline int __down_read_killable(struct rw_semaphore *sem) | |
1349 | { | |
a15ea1a3 | 1350 | if (!rwsem_read_trylock(sem)) { |
6cef7ff6 | 1351 | if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) |
5dec94d4 | 1352 | return -EINTR; |
94a9717b | 1353 | DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); |
5dec94d4 WL |
1354 | } else { |
1355 | rwsem_set_reader_owned(sem); | |
1356 | } | |
1357 | return 0; | |
1358 | } | |
1359 | ||
1360 | static inline int __down_read_trylock(struct rw_semaphore *sem) | |
1361 | { | |
fce45cd4 DB |
1362 | long tmp; |
1363 | ||
1364 | DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); | |
1365 | ||
5dec94d4 WL |
1366 | /* |
1367 | * Optimize for the case when the rwsem is not locked at all. | |
1368 | */ | |
fce45cd4 | 1369 | tmp = RWSEM_UNLOCKED_VALUE; |
5dec94d4 WL |
1370 | do { |
1371 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, | |
1372 | tmp + RWSEM_READER_BIAS)) { | |
1373 | rwsem_set_reader_owned(sem); | |
1374 | return 1; | |
1375 | } | |
1376 | } while (!(tmp & RWSEM_READ_FAILED_MASK)); | |
1377 | return 0; | |
1378 | } | |
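/*
 * A minimal user-space sketch of the reader-trylock fast path above,
 * shown with C11 <stdatomic.h> rather than the kernel atomic_long API.
 * The SKETCH_* bit values are illustrative assumptions, not the real
 * RWSEM_* constants; the point is the pattern: start from the fully
 * unlocked value and keep retrying the cmpxchg until a "read failed"
 * bit is observed.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define SKETCH_WRITER_LOCKED	(1UL << 0)
#define SKETCH_FLAG_WAITERS	(1UL << 1)
#define SKETCH_FLAG_HANDOFF	(1UL << 2)
#define SKETCH_READER_BIAS	(1UL << 8)
#define SKETCH_READ_FAILED_MASK	\
	(SKETCH_WRITER_LOCKED | SKETCH_FLAG_WAITERS | SKETCH_FLAG_HANDOFF)

static bool sketch_read_trylock(atomic_ulong *count)
{
	unsigned long old = 0;	/* optimize for the unlocked case */

	do {
		/* Try to add one reader on top of the value we last saw. */
		if (atomic_compare_exchange_weak_explicit(count, &old,
				old + SKETCH_READER_BIAS,
				memory_order_acquire, memory_order_relaxed))
			return true;
		/* "old" now holds the current count; give up once any
		 * writer/waiter/handoff bit makes a reader grab fail. */
	} while (!(old & SKETCH_READ_FAILED_MASK));

	return false;
}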
1379 | ||
1380 | /* | |
1381 | * lock for writing | |
1382 | */ | |
7f26482a | 1383 | static inline void __down_write(struct rw_semaphore *sem) |
5dec94d4 | 1384 | { |
6cef7ff6 WL |
1385 | long tmp = RWSEM_UNLOCKED_VALUE; |
1386 | ||
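	/*
	 * Fast path: the unlocked -> write-locked transition can only
	 * succeed while count is exactly RWSEM_UNLOCKED_VALUE, i.e. no
	 * readers, no writer and no queued waiters; any other value
	 * sends us to the slowpath.
	 */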
1387 | if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, | |
1388 | RWSEM_WRITER_LOCKED))) | |
1389 | rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); | |
5cfd92e1 WL |
1390 | else |
1391 | rwsem_set_owner(sem); | |
5dec94d4 WL |
1392 | } |
1393 | ||
1394 | static inline int __down_write_killable(struct rw_semaphore *sem) | |
1395 | { | |
6cef7ff6 WL |
1396 | long tmp = RWSEM_UNLOCKED_VALUE; |
1397 | ||
1398 | if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, | |
1399 | RWSEM_WRITER_LOCKED))) { | |
1400 | if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) | |
5dec94d4 | 1401 | return -EINTR; |
5cfd92e1 WL |
1402 | } else { |
1403 | rwsem_set_owner(sem); | |
6cef7ff6 | 1404 | } |
5dec94d4 WL |
1405 | return 0; |
1406 | } | |
1407 | ||
1408 | static inline int __down_write_trylock(struct rw_semaphore *sem) | |
1409 | { | |
fce45cd4 | 1410 | long tmp; |
5dec94d4 | 1411 | |
fce45cd4 DB |
1412 | DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); |
1413 | ||
1414 | tmp = RWSEM_UNLOCKED_VALUE; | |
6cef7ff6 WL |
1415 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, |
1416 | RWSEM_WRITER_LOCKED)) { | |
5dec94d4 WL |
1417 | rwsem_set_owner(sem); |
1418 | return true; | |
1419 | } | |
1420 | return false; | |
1421 | } | |
1422 | ||
1423 | /* | |
1424 | * unlock after reading | |
1425 | */ | |
7f26482a | 1426 | static inline void __up_read(struct rw_semaphore *sem) |
5dec94d4 WL |
1427 | { |
1428 | long tmp; | |
1429 | ||
fce45cd4 | 1430 | DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); |
94a9717b | 1431 | DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); |
fce45cd4 | 1432 | |
5dec94d4 WL |
1433 | rwsem_clear_reader_owned(sem); |
1434 | tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); | |
a15ea1a3 | 1435 | DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); |
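	/*
	 * Wake waiters only when this was the last lock holder: every
	 * bit in RWSEM_LOCK_MASK is now clear while RWSEM_FLAG_WAITERS
	 * is still set.
	 */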
6cef7ff6 | 1436 | if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == |
7d43f1ce WL |
1437 | RWSEM_FLAG_WAITERS)) { |
1438 | clear_wr_nonspinnable(sem); | |
4f23dbc1 | 1439 | rwsem_wake(sem, tmp); |
7d43f1ce | 1440 | } |
5dec94d4 WL |
1441 | } |
1442 | ||
1443 | /* | |
1444 | * unlock after writing | |
1445 | */ | |
7f26482a | 1446 | static inline void __up_write(struct rw_semaphore *sem) |
5dec94d4 | 1447 | { |
6cef7ff6 WL |
1448 | long tmp; |
1449 | ||
fce45cd4 | 1450 | DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); |
02f1082b WL |
1451 | /* |
1452 | * sem->owner may differ from current if the ownership is transferred | |
1453 | * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. | |
1454 | */ | |
94a9717b WL |
1455 | DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && |
1456 | !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); | |
fce45cd4 | 1457 | |
5dec94d4 | 1458 | rwsem_clear_owner(sem); |
6cef7ff6 WL |
1459 | tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); |
1460 | if (unlikely(tmp & RWSEM_FLAG_WAITERS)) | |
4f23dbc1 | 1461 | rwsem_wake(sem, tmp); |
5dec94d4 WL |
1462 | } |
1463 | ||
1464 | /* | |
1465 | * downgrade write lock to read lock | |
1466 | */ | |
1467 | static inline void __downgrade_write(struct rw_semaphore *sem) | |
1468 | { | |
1469 | long tmp; | |
1470 | ||
1471 | /* | |
1472 | * When downgrading from exclusive to shared ownership, | |
1473 | * anything inside the write-locked region cannot leak | |
1474 | * into the read side. In contrast, anything in the | |
1475 | * read-locked region is ok to be re-ordered into the | |
1476 | * write side. As such, rely on RELEASE semantics. | |
1477 | */ | |
94a9717b | 1478 | DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); |
5dec94d4 WL |
1479 | tmp = atomic_long_fetch_add_release( |
1480 | -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); | |
1481 | rwsem_set_reader_owned(sem); | |
1482 | if (tmp & RWSEM_FLAG_WAITERS) | |
1483 | rwsem_downgrade_wake(sem); | |
1484 | } | |
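/*
 * Worked example of the transition above (bit positions are illustrative
 * assumptions, not the real RWSEM_* values): with one waiter queued, a
 * write-locked count looks like WRITER_LOCKED | FLAG_WAITERS.  Adding
 * -WRITER_LOCKED + READER_BIAS in a single atomic op turns it into
 * READER_BIAS | FLAG_WAITERS, so the lock is never observed free in
 * between, and the rwsem_downgrade_wake() call in __downgrade_write()
 * hands the lock to queued readers only.
 */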
4fc828e2 | 1485 | |
c4e05116 IM |
1486 | /* |
1487 | * lock for reading | |
1488 | */ | |
c7af77b5 | 1489 | void __sched down_read(struct rw_semaphore *sem) |
c4e05116 IM |
1490 | { |
1491 | might_sleep(); | |
1492 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); | |
1493 | ||
4fe87745 | 1494 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
c4e05116 | 1495 | } |
c4e05116 IM |
1496 | EXPORT_SYMBOL(down_read); |
1497 | ||
76f8507f KT |
1498 | int __sched down_read_killable(struct rw_semaphore *sem) |
1499 | { | |
1500 | might_sleep(); | |
1501 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); | |
1502 | ||
1503 | if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { | |
5facae4f | 1504 | rwsem_release(&sem->dep_map, _RET_IP_); |
76f8507f KT |
1505 | return -EINTR; |
1506 | } | |
1507 | ||
76f8507f KT |
1508 | return 0; |
1509 | } | |
76f8507f KT |
1510 | EXPORT_SYMBOL(down_read_killable); |
1511 | ||
c4e05116 IM |
1512 | /* |
1513 | * trylock for reading -- returns 1 if successful, 0 if contention | |
1514 | */ | |
1515 | int down_read_trylock(struct rw_semaphore *sem) | |
1516 | { | |
1517 | int ret = __down_read_trylock(sem); | |
1518 | ||
c7580c1e | 1519 | if (ret == 1) |
c4e05116 IM |
1520 | rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); |
1521 | return ret; | |
1522 | } | |
c4e05116 IM |
1523 | EXPORT_SYMBOL(down_read_trylock); |
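/*
 * Hedged usage sketch for the reader-side API above: a caller that
 * must not wait uses down_read_trylock() and backs off on contention.
 * The names "stats_sem" and "stats_snapshot" are hypothetical and
 * exist only for this illustration.
 */
static DECLARE_RWSEM(stats_sem);
static u64 stats_snapshot;

static bool sketch_try_read_stats(u64 *out)
{
	if (!down_read_trylock(&stats_sem))
		return false;		/* contended: let the caller retry later */

	*out = stats_snapshot;		/* shared, read-only access */
	up_read(&stats_sem);
	return true;
}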
1524 | ||
1525 | /* | |
1526 | * lock for writing | |
1527 | */ | |
c7af77b5 | 1528 | void __sched down_write(struct rw_semaphore *sem) |
c4e05116 IM |
1529 | { |
1530 | might_sleep(); | |
1531 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | |
4fe87745 | 1532 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
c4e05116 | 1533 | } |
c4e05116 IM |
1534 | EXPORT_SYMBOL(down_write); |
1535 | ||
916633a4 MH |
1536 | /* |
1537 | * lock for writing | |
1538 | */ | |
1539 | int __sched down_write_killable(struct rw_semaphore *sem) | |
1540 | { | |
1541 | might_sleep(); | |
1542 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | |
1543 | ||
6cef7ff6 WL |
1544 | if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, |
1545 | __down_write_killable)) { | |
5facae4f | 1546 | rwsem_release(&sem->dep_map, _RET_IP_); |
916633a4 MH |
1547 | return -EINTR; |
1548 | } | |
1549 | ||
916633a4 MH |
1550 | return 0; |
1551 | } | |
916633a4 MH |
1552 | EXPORT_SYMBOL(down_write_killable); |
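/*
 * Hedged usage sketch for the killable variants: the caller must be
 * prepared for -EINTR and bail out without touching the protected
 * data, since the lock was never acquired.  "my_rwsem" and "my_data"
 * are hypothetical names used only for illustration.
 */
static DECLARE_RWSEM(my_rwsem);
static unsigned long my_data;

static int sketch_update_data(void)
{
	if (down_write_killable(&my_rwsem))
		return -EINTR;		/* fatal signal pending, lock not held */

	my_data++;			/* exclusive access */
	up_write(&my_rwsem);
	return 0;
}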
1553 | ||
c4e05116 IM |
1554 | /* |
1555 | * trylock for writing -- returns 1 if successful, 0 if contention | |
1556 | */ | |
1557 | int down_write_trylock(struct rw_semaphore *sem) | |
1558 | { | |
1559 | int ret = __down_write_trylock(sem); | |
1560 | ||
c7580c1e | 1561 | if (ret == 1) |
428e6ce0 | 1562 | rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); |
4fc828e2 | 1563 | |
c4e05116 IM |
1564 | return ret; |
1565 | } | |
c4e05116 IM |
1566 | EXPORT_SYMBOL(down_write_trylock); |
1567 | ||
1568 | /* | |
1569 | * release a read lock | |
1570 | */ | |
1571 | void up_read(struct rw_semaphore *sem) | |
1572 | { | |
5facae4f | 1573 | rwsem_release(&sem->dep_map, _RET_IP_); |
c4e05116 IM |
1574 | __up_read(sem); |
1575 | } | |
c4e05116 IM |
1576 | EXPORT_SYMBOL(up_read); |
1577 | ||
1578 | /* | |
1579 | * release a write lock | |
1580 | */ | |
1581 | void up_write(struct rw_semaphore *sem) | |
1582 | { | |
5facae4f | 1583 | rwsem_release(&sem->dep_map, _RET_IP_); |
c4e05116 IM |
1584 | __up_write(sem); |
1585 | } | |
c4e05116 IM |
1586 | EXPORT_SYMBOL(up_write); |
1587 | ||
1588 | /* | |
1589 | * downgrade write lock to read lock | |
1590 | */ | |
1591 | void downgrade_write(struct rw_semaphore *sem) | |
1592 | { | |
6419c4af | 1593 | lock_downgrade(&sem->dep_map, _RET_IP_); |
c4e05116 IM |
1594 | __downgrade_write(sem); |
1595 | } | |
c4e05116 | 1596 | EXPORT_SYMBOL(downgrade_write); |
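/*
 * Hedged usage sketch for downgrade_write(): a writer publishes new
 * state, then keeps a read lock so other readers may proceed while it
 * finishes consuming what it just wrote.  "cache_sem" and
 * "cache_generation" are illustrative names only.
 */
static DECLARE_RWSEM(cache_sem);
static unsigned long cache_generation;

static unsigned long sketch_refresh_cache(void)
{
	unsigned long gen;

	down_write(&cache_sem);
	cache_generation++;		/* exclusive update */
	downgrade_write(&cache_sem);	/* atomically become a reader */

	gen = cache_generation;		/* other readers may run concurrently */
	up_read(&cache_sem);
	return gen;
}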
4ea2176d IM |
1597 | |
1598 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | |
1599 | ||
1600 | void down_read_nested(struct rw_semaphore *sem, int subclass) | |
1601 | { | |
1602 | might_sleep(); | |
1603 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); | |
4fe87745 | 1604 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
4ea2176d | 1605 | } |
4ea2176d IM |
1606 | EXPORT_SYMBOL(down_read_nested); |
1607 | ||
1b963c81 JK |
1608 | void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) |
1609 | { | |
1610 | might_sleep(); | |
1611 | rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); | |
1b963c81 JK |
1612 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
1613 | } | |
1b963c81 JK |
1614 | EXPORT_SYMBOL(_down_write_nest_lock); |
1615 | ||
84759c6d KO |
1616 | void down_read_non_owner(struct rw_semaphore *sem) |
1617 | { | |
1618 | might_sleep(); | |
84759c6d | 1619 | __down_read(sem); |
925b9cd1 | 1620 | __rwsem_set_reader_owned(sem, NULL); |
84759c6d | 1621 | } |
84759c6d KO |
1622 | EXPORT_SYMBOL(down_read_non_owner); |
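/*
 * Hedged sketch of the non-owner API: the read lock is taken when an
 * operation is started and released later from a different task or
 * context, so the usual "only the owner may release" checking is
 * bypassed.  "struct sketch_op" and its users are illustrative
 * assumptions, not an existing kernel interface.
 */
struct sketch_op {
	struct rw_semaphore *sem;	/* rwsem pinned for the whole operation */
};

static void sketch_op_start(struct sketch_op *op, struct rw_semaphore *sem)
{
	op->sem = sem;
	down_read_non_owner(sem);	/* this task will not do the release */
}

static void sketch_op_finish(struct sketch_op *op)
{
	/* may run from a different task, e.g. a completion handler */
	up_read_non_owner(op->sem);
}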
1623 | ||
4ea2176d IM |
1624 | void down_write_nested(struct rw_semaphore *sem, int subclass) |
1625 | { | |
1626 | might_sleep(); | |
1627 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); | |
4fe87745 | 1628 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
4ea2176d | 1629 | } |
4ea2176d IM |
1630 | EXPORT_SYMBOL(down_write_nested); |
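/*
 * Hedged sketch of why the _nested() variants exist: when two rwsems
 * of the same lock class must be held together (say a parent and a
 * child object), lockdep would normally report a possible deadlock.
 * Annotating the inner acquisition with a non-zero subclass tells
 * lockdep the ordering is intentional.  "struct sketch_obj" and the
 * parent/child relationship are illustrative assumptions.
 */
struct sketch_obj {
	struct rw_semaphore sem;
};

static void sketch_lock_parent_child(struct sketch_obj *parent,
				     struct sketch_obj *child)
{
	down_write(&parent->sem);				/* subclass 0 */
	down_write_nested(&child->sem, SINGLE_DEPTH_NESTING);	/* subclass 1 */

	/* ... operate on both objects while holding both locks ... */

	up_write(&child->sem);
	up_write(&parent->sem);
}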
1631 | ||
887bddfa AV |
1632 | int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) |
1633 | { | |
1634 | might_sleep(); | |
1635 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); | |
1636 | ||
6cef7ff6 WL |
1637 | if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, |
1638 | __down_write_killable)) { | |
5facae4f | 1639 | rwsem_release(&sem->dep_map, _RET_IP_); |
887bddfa AV |
1640 | return -EINTR; |
1641 | } | |
1642 | ||
887bddfa AV |
1643 | return 0; |
1644 | } | |
887bddfa AV |
1645 | EXPORT_SYMBOL(down_write_killable_nested); |
1646 | ||
84759c6d KO |
1647 | void up_read_non_owner(struct rw_semaphore *sem) |
1648 | { | |
94a9717b | 1649 | DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); |
84759c6d KO |
1650 | __up_read(sem); |
1651 | } | |
84759c6d KO |
1652 | EXPORT_SYMBOL(up_read_non_owner); |
1653 | ||
4ea2176d | 1654 | #endif |