Commit | Line | Data |
---|---|---|
a046f1a0 PZ |
1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | ||
3 | #include <linux/sched/task.h> | |
4 | #include <linux/sched/signal.h> | |
5 | #include <linux/freezer.h> | |
6 | ||
7 | #include "futex.h" | |
8 | ||
9 | /* | |
10 | * READ this before attempting to hack on futexes! | |
11 | * | |
12 | * Basic futex operation and ordering guarantees | |
13 | * ============================================= | |
14 | * | |
15 | * The waiter reads the futex value in user space and calls | |
16 | * futex_wait(). This function computes the hash bucket and acquires | |
17 | * the hash bucket lock. After that it reads the futex user space value | |
18 | * again and verifies that the data has not changed. If it has not changed | |
19 | * it enqueues itself into the hash bucket, releases the hash bucket lock | |
20 | * and schedules. | |
21 | * | |
22 | * The waker side modifies the user space value of the futex and calls | |
23 | * futex_wake(). This function computes the hash bucket and acquires the | |
24 | * hash bucket lock. Then it looks for waiters on that futex in the hash | |
25 | * bucket and wakes them. | |
26 | * | |
27 | * In futex wake up scenarios where no tasks are blocked on a futex, the waker | |
28 | * can avoid taking the hb spinlock and simply return. In order for this | |
29 | * optimization to work, ordering guarantees must exist so that the waiter | |
30 | * being added to the list is acknowledged when the list is concurrently being | |
31 | * checked by the waker, avoiding scenarios like the following: | |
32 | * | |
33 | * CPU 0                                   CPU 1 | |
34 | * val = *futex; | |
35 | * sys_futex(WAIT, futex, val); | |
36 | *   futex_wait(futex, val); | |
37 | *   uval = *futex; | |
38 | *                                         *futex = newval; | |
39 | *                                         sys_futex(WAKE, futex); | |
40 | *                                           futex_wake(futex); | |
41 | *                                           if (queue_empty()) | |
42 | *                                             return; | |
43 | *   if (uval == val) | |
44 | *      lock(hash_bucket(futex)); | |
45 | *      queue(); | |
46 | *      unlock(hash_bucket(futex)); | |
47 | *      schedule(); | |
48 | * | |
49 | * This would cause the waiter on CPU 0 to wait forever because it | |
50 | * missed the transition of the user space value from val to newval | |
51 | * and the waker did not find the waiter in the hash bucket queue. | |
52 | * | |
53 | * The correct serialization ensures that a waiter either observes | |
54 | * the changed user space value before blocking or is woken by a | |
55 | * concurrent waker: | |
56 | * | |
57 | * CPU 0                                       CPU 1 | |
58 | * val = *futex; | |
59 | * sys_futex(WAIT, futex, val); | |
60 | *   futex_wait(futex, val); | |
61 | * | |
62 | *   waiters++; (a) | |
63 | *   smp_mb(); (A) <-- paired with -. | |
64 | *                                  | | |
65 | *   lock(hash_bucket(futex));      | | |
66 | *                                  | | |
67 | *   uval = *futex;                 | | |
68 | *                                  |        *futex = newval; | |
69 | *                                  |        sys_futex(WAKE, futex); | |
70 | *                                  |          futex_wake(futex); | |
71 | *                                  | | |
72 | *                                  `--------> smp_mb(); (B) | |
73 | *   if (uval == val) | |
74 | *     queue(); | |
75 | *     unlock(hash_bucket(futex)); | |
76 | *     schedule();                           if (waiters) | |
77 | *                                             lock(hash_bucket(futex)); | |
78 | *   else                                      wake_waiters(futex); | |
79 | *     waiters--; (b)                          unlock(hash_bucket(futex)); | |
80 | * | |
81 | * Where (A) orders the waiters increment and the futex value read through | |
82 | * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write | |
83 | * to futex and the waiters read (see futex_hb_waiters_pending()). | |
84 | * | |
85 | * This yields the following case (where X:=waiters, Y:=futex): | |
86 | * | |
87 | * X = Y = 0 | |
88 | * | |
89 | * w[X]=1 w[Y]=1 | |
90 | * MB MB | |
91 | * r[Y]=y r[X]=x | |
92 | * | |
93 | * Which guarantees that x==0 && y==0 is impossible; which translates back into | |
94 | * the guarantee that we cannot both miss the futex variable change and the | |
95 | * enqueue. | |
96 | * | |
97 | * Note that a new waiter is accounted for in (a) even when it is possible that | |
98 | * the wait call can return error, in which case we backtrack from it in (b). | |
99 | * Refer to the comment in futex_q_lock(). | |
100 | * | |
101 | * Similarly, in order to account for waiters being requeued on another | |
102 | * address we always increment the waiters for the destination bucket before | |
103 | * acquiring the lock. It then decrements them again after releasing it - | |
104 | * the code that actually moves the futex(es) between hash buckets (requeue_futex) | |
105 | * will do the additional required waiter count housekeeping. This is done for | |
106 | * double_lock_hb() and double_unlock_hb(), respectively. | |
107 | */ | |
108 | ||
109 | /* | |
110 | * The hash bucket lock must be held when this is called. | |
111 | * Afterwards, the futex_q must not be accessed. Callers | |
112 | * must ensure to later call wake_up_q() for the actual | |
113 | * wakeups to occur. | |
114 | */ | |
115 | void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) | |
116 | { | |
117 | struct task_struct *p = q->task; | |
118 | ||
119 | if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) | |
120 | return; | |
121 | ||
122 | get_task_struct(p); | |
123 | __futex_unqueue(q); | |
124 | /* | |
125 | * The waiting task can free the futex_q as soon as q->lock_ptr = NULL | |
126 | * is written, without taking any locks. This is possible in the event | |
127 | * of a spurious wakeup, for example. A memory barrier is required here | |
128 | * to prevent the following store to lock_ptr from getting ahead of the | |
129 | * plist_del in __futex_unqueue(). | |
130 | */ | |
131 | smp_store_release(&q->lock_ptr, NULL); | |
132 | ||
133 | /* | |
134 | * Queue the task for later wakeup for after we've released | |
135 | * the hb->lock. | |
136 | */ | |
137 | wake_q_add_safe(wake_q, p); | |
138 | } | |
139 | ||
140 | /* | |
141 | * Wake up waiters matching bitset queued on this futex (uaddr). | |
142 | */ | |
143 | int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | |
144 | { | |
145 | struct futex_hash_bucket *hb; | |
146 | struct futex_q *this, *next; | |
147 | union futex_key key = FUTEX_KEY_INIT; | |
148 | int ret; | |
149 | DEFINE_WAKE_Q(wake_q); | |
150 | ||
151 | if (!bitset) | |
152 | return -EINVAL; | |
153 | ||
154 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); | |
155 | if (unlikely(ret != 0)) | |
156 | return ret; | |
157 | ||
158 | hb = futex_hash(&key); | |
159 | ||
160 | /* Make sure we really have tasks to wakeup */ | |
161 | if (!futex_hb_waiters_pending(hb)) | |
162 | return ret; | |
163 | ||
164 | spin_lock(&hb->lock); | |
165 | ||
166 | plist_for_each_entry_safe(this, next, &hb->chain, list) { | |
167 | if (futex_match (&this->key, &key)) { | |
168 | if (this->pi_state || this->rt_waiter) { | |
169 | ret = -EINVAL; | |
170 | break; | |
171 | } | |
172 | ||
173 | /* Check if one of the bits is set in both bitsets */ | |
174 | if (!(this->bitset & bitset)) | |
175 | continue; | |
176 | ||
177 | futex_wake_mark(&wake_q, this); | |
178 | if (++ret >= nr_wake) | |
179 | break; | |
180 | } | |
181 | } | |
182 | ||
183 | spin_unlock(&hb->lock); | |
184 | wake_up_q(&wake_q); | |
185 | return ret; | |
186 | } | |
187 | ||
188 | static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) | |
189 | { | |
190 | unsigned int op = (encoded_op & 0x70000000) >> 28; | |
191 | unsigned int cmp = (encoded_op & 0x0f000000) >> 24; | |
192 | int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); | |
193 | int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); | |
194 | int oldval, ret; | |
195 | ||
196 | if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { | |
197 | if (oparg < 0 || oparg > 31) { | |
198 | char comm[sizeof(current->comm)]; | |
199 | /* | |
200 | * kill this print and return -EINVAL when userspace | |
201 | * is sane again | |
202 | */ | |
203 | pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", | |
204 | get_task_comm(comm, current), oparg); | |
205 | oparg &= 31; | |
206 | } | |
207 | oparg = 1 << oparg; | |
208 | } | |
209 | ||
210 | pagefault_disable(); | |
211 | ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); | |
212 | pagefault_enable(); | |
213 | if (ret) | |
214 | return ret; | |
215 | ||
216 | switch (cmp) { | |
217 | case FUTEX_OP_CMP_EQ: | |
218 | return oldval == cmparg; | |
219 | case FUTEX_OP_CMP_NE: | |
220 | return oldval != cmparg; | |
221 | case FUTEX_OP_CMP_LT: | |
222 | return oldval < cmparg; | |
223 | case FUTEX_OP_CMP_GE: | |
224 | return oldval >= cmparg; | |
225 | case FUTEX_OP_CMP_LE: | |
226 | return oldval <= cmparg; | |
227 | case FUTEX_OP_CMP_GT: | |
228 | return oldval > cmparg; | |
229 | default: | |
230 | return -ENOSYS; | |
231 | } | |
232 | } | |
233 | ||
234 | /* | |
235 | * Wake up all waiters hashed on the physical page that is mapped | |
236 | * to this virtual address: | |
237 | */ | |
238 | int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, | |
239 | int nr_wake, int nr_wake2, int op) | |
240 | { | |
241 | union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; | |
242 | struct futex_hash_bucket *hb1, *hb2; | |
243 | struct futex_q *this, *next; | |
244 | int ret, op_ret; | |
245 | DEFINE_WAKE_Q(wake_q); | |
246 | ||
247 | retry: | |
248 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); | |
249 | if (unlikely(ret != 0)) | |
250 | return ret; | |
251 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); | |
252 | if (unlikely(ret != 0)) | |
253 | return ret; | |
254 | ||
255 | hb1 = futex_hash(&key1); | |
256 | hb2 = futex_hash(&key2); | |
257 | ||
258 | retry_private: | |
259 | double_lock_hb(hb1, hb2); | |
260 | op_ret = futex_atomic_op_inuser(op, uaddr2); | |
261 | if (unlikely(op_ret < 0)) { | |
262 | double_unlock_hb(hb1, hb2); | |
263 | ||
264 | if (!IS_ENABLED(CONFIG_MMU) || | |
265 | unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { | |
266 | /* | |
267 | * we don't get EFAULT from MMU faults if we don't have | |
268 | * an MMU, but we might get them from range checking | |
269 | */ | |
270 | ret = op_ret; | |
271 | return ret; | |
272 | } | |
273 | ||
274 | if (op_ret == -EFAULT) { | |
275 | ret = fault_in_user_writeable(uaddr2); | |
276 | if (ret) | |
277 | return ret; | |
278 | } | |
279 | ||
280 | cond_resched(); | |
281 | if (!(flags & FLAGS_SHARED)) | |
282 | goto retry_private; | |
283 | goto retry; | |
284 | } | |
285 | ||
286 | plist_for_each_entry_safe(this, next, &hb1->chain, list) { | |
287 | if (futex_match (&this->key, &key1)) { | |
288 | if (this->pi_state || this->rt_waiter) { | |
289 | ret = -EINVAL; | |
290 | goto out_unlock; | |
291 | } | |
292 | futex_wake_mark(&wake_q, this); | |
293 | if (++ret >= nr_wake) | |
294 | break; | |
295 | } | |
296 | } | |
297 | ||
298 | if (op_ret > 0) { | |
299 | op_ret = 0; | |
300 | plist_for_each_entry_safe(this, next, &hb2->chain, list) { | |
301 | if (futex_match (&this->key, &key2)) { | |
302 | if (this->pi_state || this->rt_waiter) { | |
303 | ret = -EINVAL; | |
304 | goto out_unlock; | |
305 | } | |
306 | futex_wake_mark(&wake_q, this); | |
307 | if (++op_ret >= nr_wake2) | |
308 | break; | |
309 | } | |
310 | } | |
311 | ret += op_ret; | |
312 | } | |
313 | ||
314 | out_unlock: | |
315 | double_unlock_hb(hb1, hb2); | |
316 | wake_up_q(&wake_q); | |
317 | return ret; | |
318 | } | |
319 | ||
320 | static long futex_wait_restart(struct restart_block *restart); | |
321 | ||
322 | /** | |
323 | * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal | |
324 | * @hb: the futex hash bucket, must be locked by the caller | |
325 | * @q: the futex_q to queue up on | |
326 | * @timeout: the prepared hrtimer_sleeper, or null for no timeout | |
327 | */ | |
328 | void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, | |
329 | struct hrtimer_sleeper *timeout) | |
330 | { | |
331 | /* | |
332 | * The task state is guaranteed to be set before another task can | |
333 | * wake it. set_current_state() is implemented using smp_store_mb() and | |
334 | * futex_queue() calls spin_unlock() upon completion, both serializing | |
335 | * access to the hash list and forcing another memory barrier. | |
336 | */ | |
337 | set_current_state(TASK_INTERRUPTIBLE); | |
338 | futex_queue(q, hb); | |
339 | ||
340 | /* Arm the timer */ | |
341 | if (timeout) | |
342 | hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); | |
343 | ||
344 | /* | |
345 | * If we have been removed from the hash list, then another task | |
346 | * has tried to wake us, and we can skip the call to schedule(). | |
347 | */ | |
348 | if (likely(!plist_node_empty(&q->list))) { | |
349 | /* | |
350 | * If the timer has already expired, current will already be | |
351 | * flagged for rescheduling. Only call schedule if there | |
352 | * is no timeout, or if it has yet to expire. | |
353 | */ | |
354 | if (!timeout || timeout->task) | |
355 | freezable_schedule(); | |
356 | } | |
357 | __set_current_state(TASK_RUNNING); | |
358 | } | |
359 | ||
bf69bad3 AA |
360 | /** |
361 | * unqueue_multiple - Remove various futexes from their hash bucket | |
362 | * @v: The list of futexes to unqueue | |
363 | * @count: Number of futexes in the list | |
364 | * | |
365 | * Helper to unqueue a list of futexes. This can't fail. | |
366 | * | |
367 | * Return: | |
368 | * - >=0 - Index of the last futex that was awoken; | |
369 | * - -1 - No futex was awoken | |
370 | */ | |
371 | static int unqueue_multiple(struct futex_vector *v, int count) | |
372 | { | |
373 | int ret = -1, i; | |
374 | ||
375 | for (i = 0; i < count; i++) { | |
376 | if (!futex_unqueue(&v[i].q)) | |
377 | ret = i; | |
378 | } | |
379 | ||
380 | return ret; | |
381 | } | |
382 | ||
383 | /** | |
384 | * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes | |
385 | * @vs: The futex list to wait on | |
386 | * @count: The size of the list | |
387 | * @woken: Index of the last woken futex, if any. Used to notify the | |
388 | * caller that it can return this index to userspace (return parameter) | |
389 | * | |
390 | * Prepare multiple futexes in a single step and enqueue them. This may fail if | |
391 | * the futex list is invalid or if any futex was already awoken. On success the | |
392 | * task is ready to interruptible sleep. | |
393 | * | |
394 | * Return: | |
395 | * - 1 - One of the futexes was woken by another thread | |
396 | * - 0 - Success | |
397 | * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL | |
398 | */ | |
399 | static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) | |
400 | { | |
401 | struct futex_hash_bucket *hb; | |
402 | bool retry = false; | |
403 | int ret, i; | |
404 | u32 uval; | |
405 | ||
406 | /* | |
407 | * Enqueuing multiple futexes is tricky, because we need to enqueue | |
408 | * each futex on the list before dealing with the next one to avoid | |
409 | * deadlocking on the hash bucket. But, before enqueuing, we need to | |
410 | * make sure that current->state is TASK_INTERRUPTIBLE, so we don't | |
411 | * lose any wake events, which cannot be done before the get_futex_key | |
412 | * of the next key, because it calls get_user_pages, which can sleep. | |
413 | * Thus, we fetch the list of futexes keys in two steps, by first | |
414 | * pinning all the memory keys in the futex key, and only then we read | |
415 | * each key and queue the corresponding futex. | |
416 | * | |
417 | * Private futexes doesn't need to recalculate hash in retry, so skip | |
418 | * get_futex_key() when retrying. | |
419 | */ | |
420 | retry: | |
421 | for (i = 0; i < count; i++) { | |
422 | if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) | |
423 | continue; | |
424 | ||
425 | ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), | |
426 | !(vs[i].w.flags & FUTEX_PRIVATE_FLAG), | |
427 | &vs[i].q.key, FUTEX_READ); | |
428 | ||
429 | if (unlikely(ret)) | |
430 | return ret; | |
431 | } | |
432 | ||
433 | set_current_state(TASK_INTERRUPTIBLE); | |
434 | ||
435 | for (i = 0; i < count; i++) { | |
436 | u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; | |
437 | struct futex_q *q = &vs[i].q; | |
438 | u32 val = (u32)vs[i].w.val; | |
439 | ||
440 | hb = futex_q_lock(q); | |
441 | ret = futex_get_value_locked(&uval, uaddr); | |
442 | ||
443 | if (!ret && uval == val) { | |
444 | /* | |
445 | * The bucket lock can't be held while dealing with the | |
446 | * next futex. Queue each futex at this moment so hb can | |
447 | * be unlocked. | |
448 | */ | |
449 | futex_queue(q, hb); | |
450 | continue; | |
451 | } | |
452 | ||
453 | futex_q_unlock(hb); | |
454 | __set_current_state(TASK_RUNNING); | |
455 | ||
456 | /* | |
457 | * Even if something went wrong, if we find out that a futex | |
458 | * was woken, we don't return error and return this index to | |
459 | * userspace | |
460 | */ | |
461 | *woken = unqueue_multiple(vs, i); | |
462 | if (*woken >= 0) | |
463 | return 1; | |
464 | ||
465 | if (ret) { | |
466 | /* | |
467 | * If we need to handle a page fault, we need to do so | |
468 | * without any lock and any enqueued futex (otherwise | |
469 | * we could lose some wakeup). So we do it here, after | |
470 | * undoing all the work done so far. In success, we | |
471 | * retry all the work. | |
472 | */ | |
473 | if (get_user(uval, uaddr)) | |
474 | return -EFAULT; | |
475 | ||
476 | retry = true; | |
477 | goto retry; | |
478 | } | |
479 | ||
480 | if (uval != val) | |
481 | return -EWOULDBLOCK; | |
482 | } | |
483 | ||
484 | return 0; | |
485 | } | |
486 | ||
487 | /** | |
488 | * futex_sleep_multiple - Check sleeping conditions and sleep | |
489 | * @vs: List of futexes to wait for | |
490 | * @count: Length of vs | |
491 | * @to: Timeout | |
492 | * | |
493 | * Sleep if and only if the timeout hasn't expired and no futex on the list has | |
494 | * been woken up. | |
495 | */ | |
496 | static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, | |
497 | struct hrtimer_sleeper *to) | |
498 | { | |
499 | if (to && !to->task) | |
500 | return; | |
501 | ||
502 | for (; count; count--, vs++) { | |
503 | if (!READ_ONCE(vs->q.lock_ptr)) | |
504 | return; | |
505 | } | |
506 | ||
507 | freezable_schedule(); | |
508 | } | |
509 | ||
510 | /** | |
511 | * futex_wait_multiple - Prepare to wait on and enqueue several futexes | |
512 | * @vs: The list of futexes to wait on | |
513 | * @count: The number of objects | |
514 | * @to: Timeout before giving up and returning to userspace | |
515 | * | |
516 | * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function | |
517 | * sleeps on a group of futexes and returns on the first futex that is | |
518 | * wake, or after the timeout has elapsed. | |
519 | * | |
520 | * Return: | |
521 | * - >=0 - Hint to the futex that was awoken | |
522 | * - <0 - On error | |
523 | */ | |
524 | int futex_wait_multiple(struct futex_vector *vs, unsigned int count, | |
525 | struct hrtimer_sleeper *to) | |
526 | { | |
527 | int ret, hint = 0; | |
528 | ||
529 | if (to) | |
530 | hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); | |
531 | ||
532 | while (1) { | |
533 | ret = futex_wait_multiple_setup(vs, count, &hint); | |
534 | if (ret) { | |
535 | if (ret > 0) { | |
536 | /* A futex was woken during setup */ | |
537 | ret = hint; | |
538 | } | |
539 | return ret; | |
540 | } | |
541 | ||
542 | futex_sleep_multiple(vs, count, to); | |
543 | ||
544 | __set_current_state(TASK_RUNNING); | |
545 | ||
546 | ret = unqueue_multiple(vs, count); | |
547 | if (ret >= 0) | |
548 | return ret; | |
549 | ||
550 | if (to && !to->task) | |
551 | return -ETIMEDOUT; | |
552 | else if (signal_pending(current)) | |
553 | return -ERESTARTSYS; | |
554 | /* | |
555 | * The final case is a spurious wakeup, for | |
556 | * which just retry. | |
557 | */ | |
558 | } | |
559 | } | |
560 | ||
a046f1a0 PZ |
561 | /** |
562 | * futex_wait_setup() - Prepare to wait on a futex | |
563 | * @uaddr: the futex userspace address | |
564 | * @val: the expected value | |
565 | * @flags: futex flags (FLAGS_SHARED, etc.) | |
566 | * @q: the associated futex_q | |
567 | * @hb: storage for hash_bucket pointer to be returned to caller | |
568 | * | |
569 | * Setup the futex_q and locate the hash_bucket. Get the futex value and | |
570 | * compare it with the expected value. Handle atomic faults internally. | |
571 | * Return with the hb lock held on success, and unlocked on failure. | |
572 | * | |
573 | * Return: | |
574 | * - 0 - uaddr contains val and hb has been locked; | |
575 | * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked | |
576 | */ | |
577 | int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, | |
578 | struct futex_q *q, struct futex_hash_bucket **hb) | |
579 | { | |
580 | u32 uval; | |
581 | int ret; | |
582 | ||
583 | /* | |
584 | * Access the page AFTER the hash-bucket is locked. | |
585 | * Order is important: | |
586 | * | |
587 | * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); | |
588 | * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } | |
589 | * | |
590 | * The basic logical guarantee of a futex is that it blocks ONLY | |
591 | * if cond(var) is known to be true at the time of blocking, for | |
592 | * any cond. If we locked the hash-bucket after testing *uaddr, that | |
593 | * would open a race condition where we could block indefinitely with | |
594 | * cond(var) false, which would violate the guarantee. | |
595 | * | |
596 | * On the other hand, we insert q and release the hash-bucket only | |
597 | * after testing *uaddr. This guarantees that futex_wait() will NOT | |
598 | * absorb a wakeup if *uaddr does not match the desired values | |
599 | * while the syscall executes. | |
600 | */ | |
601 | retry: | |
602 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); | |
603 | if (unlikely(ret != 0)) | |
604 | return ret; | |
605 | ||
606 | retry_private: | |
607 | *hb = futex_q_lock(q); | |
608 | ||
609 | ret = futex_get_value_locked(&uval, uaddr); | |
610 | ||
611 | if (ret) { | |
612 | futex_q_unlock(*hb); | |
613 | ||
614 | ret = get_user(uval, uaddr); | |
615 | if (ret) | |
616 | return ret; | |
617 | ||
618 | if (!(flags & FLAGS_SHARED)) | |
619 | goto retry_private; | |
620 | ||
621 | goto retry; | |
622 | } | |
623 | ||
624 | if (uval != val) { | |
625 | futex_q_unlock(*hb); | |
626 | ret = -EWOULDBLOCK; | |
627 | } | |
628 | ||
629 | return ret; | |
630 | } | |
631 | ||
632 | int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) | |
633 | { | |
634 | struct hrtimer_sleeper timeout, *to; | |
635 | struct restart_block *restart; | |
636 | struct futex_hash_bucket *hb; | |
637 | struct futex_q q = futex_q_init; | |
638 | int ret; | |
639 | ||
640 | if (!bitset) | |
641 | return -EINVAL; | |
642 | q.bitset = bitset; | |
643 | ||
644 | to = futex_setup_timer(abs_time, &timeout, flags, | |
645 | current->timer_slack_ns); | |
646 | retry: | |
647 | /* | |
648 | * Prepare to wait on uaddr. On success, it holds hb->lock and q | |
649 | * is initialized. | |
650 | */ | |
651 | ret = futex_wait_setup(uaddr, val, flags, &q, &hb); | |
652 | if (ret) | |
653 | goto out; | |
654 | ||
655 | /* futex_queue and wait for wakeup, timeout, or a signal. */ | |
656 | futex_wait_queue(hb, &q, to); | |
657 | ||
658 | /* If we were woken (and unqueued), we succeeded, whatever. */ | |
659 | ret = 0; | |
660 | if (!futex_unqueue(&q)) | |
661 | goto out; | |
662 | ret = -ETIMEDOUT; | |
663 | if (to && !to->task) | |
664 | goto out; | |
665 | ||
666 | /* | |
667 | * We expect signal_pending(current), but we might be the | |
668 | * victim of a spurious wakeup as well. | |
669 | */ | |
670 | if (!signal_pending(current)) | |
671 | goto retry; | |
672 | ||
673 | ret = -ERESTARTSYS; | |
674 | if (!abs_time) | |
675 | goto out; | |
676 | ||
677 | restart = ¤t->restart_block; | |
678 | restart->futex.uaddr = uaddr; | |
679 | restart->futex.val = val; | |
680 | restart->futex.time = *abs_time; | |
681 | restart->futex.bitset = bitset; | |
682 | restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; | |
683 | ||
684 | ret = set_restart_fn(restart, futex_wait_restart); | |
685 | ||
686 | out: | |
687 | if (to) { | |
688 | hrtimer_cancel(&to->timer); | |
689 | destroy_hrtimer_on_stack(&to->timer); | |
690 | } | |
691 | return ret; | |
692 | } | |
693 | ||
694 | static long futex_wait_restart(struct restart_block *restart) | |
695 | { | |
696 | u32 __user *uaddr = restart->futex.uaddr; | |
697 | ktime_t t, *tp = NULL; | |
698 | ||
699 | if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { | |
700 | t = restart->futex.time; | |
701 | tp = &t; | |
702 | } | |
703 | restart->fn = do_no_restart_syscall; | |
704 | ||
705 | return (long)futex_wait(uaddr, restart->futex.flags, | |
706 | restart->futex.val, tp, restart->futex.bitset); | |
707 | } | |
708 |