Commit | Line | Data |
---|---|---|
85dc28fa PZ |
1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | ||
3 | #include <linux/slab.h> | |
d14f9e93 | 4 | #include <linux/sched/rt.h> |
85dc28fa PZ |
5 | #include <linux/sched/task.h> |
6 | ||
7 | #include "futex.h" | |
8 | #include "../locking/rtmutex_common.h" | |
9 | ||
10 | /* | |
11 | * PI code: | |
12 | */ | |
13 | int refill_pi_state_cache(void) | |
14 | { | |
15 | struct futex_pi_state *pi_state; | |
16 | ||
17 | if (likely(current->pi_state_cache)) | |
18 | return 0; | |
19 | ||
20 | pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); | |
21 | ||
22 | if (!pi_state) | |
23 | return -ENOMEM; | |
24 | ||
25 | INIT_LIST_HEAD(&pi_state->list); | |
26 | /* pi_mutex gets initialized later */ | |
27 | pi_state->owner = NULL; | |
28 | refcount_set(&pi_state->refcount, 1); | |
29 | pi_state->key = FUTEX_KEY_INIT; | |
30 | ||
31 | current->pi_state_cache = pi_state; | |
32 | ||
33 | return 0; | |
34 | } | |
35 | ||
36 | static struct futex_pi_state *alloc_pi_state(void) | |
37 | { | |
38 | struct futex_pi_state *pi_state = current->pi_state_cache; | |
39 | ||
40 | WARN_ON(!pi_state); | |
41 | current->pi_state_cache = NULL; | |
42 | ||
43 | return pi_state; | |
44 | } | |
45 | ||
/*
 * Move @pi_state from the old owner's pi_state_list onto @new_owner's
 * list and record the new owner. Either task pointer may be NULL:
 * old_owner == NULL means the state was not enqueued anywhere yet,
 * new_owner == NULL merely unlinks the state (the caller is expected to
 * clear or reassign ->owner itself, see put_pi_state()).
 *
 * Caller must hold pi_state->pi_mutex.wait_lock; the per-task pi_lock
 * serializes the pi_state_list manipulation against exit_pi_state_list().
 */
static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}
68 | ||
69 | void get_pi_state(struct futex_pi_state *pi_state) | |
70 | { | |
71 | WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); | |
72 | } | |
73 | ||
/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	/* Other references are still outstanding, nothing to do. */
	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	/* Refill the per-task cache when empty, otherwise free the object. */
	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}
112 | ||
113 | /* | |
114 | * We need to check the following states: | |
115 | * | |
116 | * Waiter | pi_state | pi->owner | uTID | uODIED | ? | |
117 | * | |
118 | * [1] NULL | --- | --- | 0 | 0/1 | Valid | |
119 | * [2] NULL | --- | --- | >0 | 0/1 | Valid | |
120 | * | |
121 | * [3] Found | NULL | -- | Any | 0/1 | Invalid | |
122 | * | |
123 | * [4] Found | Found | NULL | 0 | 1 | Valid | |
124 | * [5] Found | Found | NULL | >0 | 1 | Invalid | |
125 | * | |
126 | * [6] Found | Found | task | 0 | 1 | Valid | |
127 | * | |
128 | * [7] Found | Found | NULL | Any | 0 | Invalid | |
129 | * | |
130 | * [8] Found | Found | task | ==taskTID | 0/1 | Valid | |
131 | * [9] Found | Found | task | 0 | 0 | Invalid | |
132 | * [10] Found | Found | task | !=taskTID | 0/1 | Invalid | |
133 | * | |
134 | * [1] Indicates that the kernel can acquire the futex atomically. We | |
135 | * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. | |
136 | * | |
137 | * [2] Valid, if TID does not belong to a kernel thread. If no matching | |
138 | * thread is found then it indicates that the owner TID has died. | |
139 | * | |
140 | * [3] Invalid. The waiter is queued on a non PI futex | |
141 | * | |
142 | * [4] Valid state after exit_robust_list(), which sets the user space | |
143 | * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. | |
144 | * | |
145 | * [5] The user space value got manipulated between exit_robust_list() | |
146 | * and exit_pi_state_list() | |
147 | * | |
148 | * [6] Valid state after exit_pi_state_list() which sets the new owner in | |
149 | * the pi_state but cannot access the user space value. | |
150 | * | |
151 | * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. | |
152 | * | |
153 | * [8] Owner and user space value match | |
154 | * | |
155 | * [9] There is no transient state which sets the user space TID to 0 | |
156 | * except exit_robust_list(), but this is indicated by the | |
157 | * FUTEX_OWNER_DIED bit. See [4] | |
158 | * | |
159 | * [10] There is no transient state which leaves owner and user space | |
160 | * TID out of sync. Except one error case where the kernel is denied | |
161 | * write access to the user address, see fixup_pi_state_owner(). | |
162 | * | |
163 | * | |
164 | * Serialization and lifetime rules: | |
165 | * | |
166 | * hb->lock: | |
167 | * | |
168 | * hb -> futex_q, relation | |
169 | * futex_q -> pi_state, relation | |
170 | * | |
171 | * (cannot be raw because hb can contain arbitrary amount | |
172 | * of futex_q's) | |
173 | * | |
174 | * pi_mutex->wait_lock: | |
175 | * | |
176 | * {uval, pi_state} | |
177 | * | |
178 | * (and pi_mutex 'obviously') | |
179 | * | |
180 | * p->pi_lock: | |
181 | * | |
182 | * p->pi_state_list -> pi_state->list, relation | |
183 | * pi_mutex->owner -> pi_state->owner, relation | |
184 | * | |
185 | * pi_state->refcount: | |
186 | * | |
187 | * pi_state lifetime | |
188 | * | |
189 | * | |
190 | * Lock order: | |
191 | * | |
192 | * hb->lock | |
193 | * pi_mutex->wait_lock | |
194 | * p->pi_lock | |
195 | * | |
196 | */ | |
197 | ||
/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 *
 * @uaddr:    user space futex address, reread under wait_lock
 * @uval:     futex value observed by the caller (may be stale)
 * @pi_state: pi_state of the found top waiter
 * @ps:       output: the validated, referenced pi_state
 *
 * Bracketed numbers in the comments below refer to the state table
 * documented above this function.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}
320 | ||
/*
 * Disambiguate the race between a waiter attaching to an owner task
 * which is concurrently exiting (or already gone, @tsk == NULL).
 *
 * Return: -EBUSY  when the owner has not reached FUTEX_STATE_DEAD yet
 *                 (caller must wait for the exit to complete and retry),
 *         -EFAULT when rereading the user space value faults,
 *         -EAGAIN when the user space value changed under us,
 *         -ESRCH  when the owner is genuinely gone and the value is bogus.
 */
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *                                futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *   tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *   ...			       attach();
	 *   tsk->futex_state =               } else {
	 *	FUTEX_STATE_DEAD;              if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				         return -EAGAIN;
	 *				       return -ESRCH; <--- FAIL
	 *				     }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}
376 | ||
/*
 * Allocate a fresh pi_state from the per-task cache, make @p the owner
 * of the embedded rtmutex and publish the result through @ps.
 *
 * Caller holds hb->lock and p->pi_lock (the list_add() below requires
 * the latter, see pi_state_update_owner()).
 */
static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}
/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 *
 * Return: 0 on success with *ps set, -EAGAIN/-EPERM/-EBUSY/-EFAULT/-ESRCH
 * on the various failure/retry conditions. On -EBUSY, *exiting holds a
 * task reference the caller must wait on and then drop.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	/* Kernel threads can never own a PI futex. */
	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out, whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}
474 | ||
475 | static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) | |
476 | { | |
477 | int err; | |
478 | u32 curval; | |
479 | ||
480 | if (unlikely(should_fail_futex(true))) | |
481 | return -EFAULT; | |
482 | ||
483 | err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); | |
484 | if (unlikely(err)) | |
485 | return err; | |
486 | ||
487 | /* If user space value changed, let the caller retry */ | |
488 | return curval != uval ? -EAGAIN : 0; | |
489 | } | |
490 | ||
/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks: @task already owns the futex in user space.
	 */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
		return -EDEADLK;

	if ((unlikely(should_fail_futex(true))))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set or called from
	 * requeue_cmp_pi or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourself to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}
610 | ||
/*
 * Caller must hold a reference on @pi_state.
 *
 * Hand the futex and the rtmutex over to the top waiter's task. The
 * function is entered with pi_state->pi_mutex.wait_lock held (it is
 * only ever unlocked here, never taken) and drops it on all paths.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval,
			 struct futex_pi_state *pi_state,
			 struct rt_mutex_waiter *top_waiter)
{
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We cleanup the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If a unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	/* Wakeups must happen after dropping wait_lock. */
	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}
670 | ||
/*
 * Fix up pi_state->owner and the user space TID after a lock steal in
 * either direction. Called with q->lock_ptr and pi_mutex.wait_lock held
 * (see fixup_pi_state_owner()); both locks may be dropped and reacquired
 * on the fault/retry path below.
 *
 * Return: 1 when current ends up owning the lock, 0 when it does not,
 * or a negative error code when user space could not be fixed up.
 */
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to PI state .
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	/* cmpxchg loop: install newtid while preserving the owner died bit. */
	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault in succeeded */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wreckaged state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}
842 | ||
843 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |
844 | struct task_struct *argowner) | |
845 | { | |
846 | struct futex_pi_state *pi_state = q->pi_state; | |
847 | int ret; | |
848 | ||
849 | lockdep_assert_held(q->lock_ptr); | |
850 | ||
851 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); | |
852 | ret = __fixup_pi_state_owner(uaddr, q, argowner); | |
853 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | |
854 | return ret; | |
855 | } | |
856 | ||
/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 *
		 * Speculative pi_state->owner read (we don't hold wait_lock);
		 * since we own the lock pi_state->owner == current is the
		 * stable state, anything else needs more attention.
		 */
		if (q->pi_state->owner != current)
			return fixup_pi_state_owner(uaddr, q, current);
		return 1;
	}

	/*
	 * If we didn't get the lock; check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);

	return 0;
}
908 | ||
/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.).
 *
 * Also serves as futex trylock_pi()'ing, and due semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	/* Ensure a pre-allocated pi_state is available before any locking. */
	if (refill_pi_state_cache())
		return -ENOMEM;

	to = futex_setup_timer(time, &timeout, flags, 0);

retry:
	ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	hb = futex_q_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
				   &exiting, 0);
	if (unlikely(ret)) {
		/*
		 * Atomic work succeeded and we got the lock,
		 * or failed. Either way, we do _not_ block.
		 */
		switch (ret) {
		case 1:
			/* We got the lock. */
			ret = 0;
			goto out_unlock_put_key;
		case -EFAULT:
			goto uaddr_faulted;
		case -EBUSY:
		case -EAGAIN:
			/*
			 * Two reasons for this:
			 * - EBUSY: Task is exiting and we just wait for the
			 *   exit to complete.
			 * - EAGAIN: The user space value changed.
			 */
			futex_q_unlock(hb);
			/*
			 * Handle the case where the owner is in the middle of
			 * exiting. Wait for the exit to complete otherwise
			 * this task might loop forever, aka. live lock.
			 */
			wait_for_owner_exiting(ret, exiting);
			cond_resched();
			goto retry;
		default:
			goto out_unlock_put_key;
		}
	}

	WARN_ON(!q.pi_state);

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__futex_queue(&q, hb);

	if (trylock) {
		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
		goto no_block;
	}

	/*
	 * Must be done before we enqueue the waiter, here is unfortunately
	 * under the hb lock, but that *should* work because it does nothing.
	 */
	rt_mutex_pre_schedule();

	rt_mutex_init_waiter(&rt_waiter);

	/*
	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
	 * hold it while doing rt_mutex_start_proxy(), because then it will
	 * include hb->lock in the blocking chain, even though we'll not in
	 * fact hold it while blocking. This will lead it to report -EDEADLK
	 * and BUG when futex_unlock_pi() interleaves with this.
	 *
	 * Therefore acquire wait_lock while holding hb->lock, but drop the
	 * latter before calling __rt_mutex_start_proxy_lock(). This
	 * interleaves with futex_unlock_pi() -- which does a similar lock
	 * handoff -- such that the latter can observe the futex_q::pi_state
	 * before __rt_mutex_start_proxy_lock() is done.
	 */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);
	/*
	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
	 * it sees the futex_q::pi_state.
	 */
	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

	if (ret) {
		/* ret == 1: we already own the lock; normalize to success. */
		if (ret == 1)
			ret = 0;
		goto cleanup;
	}

	if (unlikely(to))
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
	/*
	 * If we failed to acquire the lock (deadlock/signal/timeout), we
	 * must unwind the above, however we cannot lock hb->lock because
	 * rt_mutex already has a waiter enqueued and hb->lock can itself try
	 * and enqueue an rt_waiter through rtlock.
	 *
	 * Doing the cleanup without holding hb->lock can cause inconsistent
	 * state between hb and pi_state, but only in the direction of not
	 * seeing a waiter that is leaving.
	 *
	 * See futex_unlock_pi(), it deals with this inconsistency.
	 *
	 * There be dragons here, since we must deal with the inconsistency on
	 * the way out (here), it is impossible to detect/warn about the race
	 * the other way around (missing an incoming waiter).
	 *
	 * What could possibly go wrong...
	 */
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
		ret = 0;

	/*
	 * Now that the rt_waiter has been dequeued, it is safe to use
	 * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
	 * the pi_state below.
	 */
	spin_lock(q.lock_ptr);
	/*
	 * Waiter is unqueued.
	 */
	rt_mutex_post_schedule();
no_block:
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_pi_owner(uaddr, &q, !ret);
	/*
	 * If fixup_pi_owner() returned an error, propagate that. If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
	if (res)
		ret = (res < 0) ? res : 0;

	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);
	goto out;

out_unlock_put_key:
	futex_q_unlock(hb);

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	/* -EINTR must restart the syscall after signal delivery. */
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
	futex_q_unlock(hb);

	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out;

	/* Private futexes can skip re-resolving the key. */
	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	goto retry;
}
1106 | ||
/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb;
	struct futex_q *top_waiter;
	int ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
	if (ret)
		return ret;

	hb = futex_hash(&key);
	spin_lock(&hb->lock);
retry_hb:

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
	if (top_waiter) {
		struct futex_pi_state *pi_state = top_waiter->pi_state;
		struct rt_mutex_waiter *rt_waiter;

		ret = -EINVAL;
		if (!pi_state)
			goto out_unlock;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)
			goto out_unlock;

		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and thereby
		 * wake_futex_pi() must observe any new waiters.
		 *
		 * Since the cleanup: case in futex_lock_pi() removes the
		 * rt_waiter without holding hb->lock, it is possible for
		 * wake_futex_pi() to not find a waiter while the above does,
		 * in this case the waiter is on the way out and it can be
		 * ignored.
		 *
		 * In particular; this forces __rt_mutex_start_proxy() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter.
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

		/*
		 * Futex vs rt_mutex waiter state -- if there are no rt_mutex
		 * waiters even though futex thinks there are, then the waiter
		 * is leaving. The entry needs to be removed from the list so a
		 * new futex_lock_pi() is not using this stale PI-state while
		 * the futex is available in user space again.
		 * There can be more than one task on its way out so it needs
		 * to retry.
		 */
		rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
		if (!rt_waiter) {
			__futex_unqueue(top_waiter);
			raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
			goto retry_hb;
		}

		/* Hold a reference across wake_futex_pi(). */
		get_pi_state(pi_state);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		if (!ret)
			return ret;
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		if (ret == -EAGAIN)
			goto pi_retry;
		/*
		 * wake_futex_pi has detected invalid state. Tell user
		 * space.
		 */
		return ret;
	}

	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We do neither
	 * preserve the WAITERS bit nor the OWNER_DIED one. We are the
	 * owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);
		switch (ret) {
		case -EFAULT:
			goto pi_faulted;

		case -EAGAIN:
			goto pi_retry;

		default:
			WARN_ON_ONCE(1);
			return ret;
		}
	}

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
	spin_unlock(&hb->lock);
	return ret;

pi_retry:
	cond_resched();
	goto retry;

pi_faulted:

	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}
1269 |