arm64: mm: use ptep_clear() instead of pte_clear() in clear_flush()
[linux-2.6-block.git] / fs / userfaultfd.c
CommitLineData
20c8ccb1 1// SPDX-License-Identifier: GPL-2.0-only
86039bd3
AA
2/*
3 * fs/userfaultfd.c
4 *
5 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
6 * Copyright (C) 2008-2009 Red Hat, Inc.
7 * Copyright (C) 2015 Red Hat, Inc.
8 *
86039bd3
AA
9 * Some part derived from fs/eventfd.c (anon inode setup) and
10 * mm/ksm.c (mm hashing).
11 */
12
9cd75c3c 13#include <linux/list.h>
86039bd3 14#include <linux/hashtable.h>
174cd4b1 15#include <linux/sched/signal.h>
6e84f315 16#include <linux/sched/mm.h>
86039bd3 17#include <linux/mm.h>
17fca131 18#include <linux/mm_inline.h>
6dfeaff9 19#include <linux/mmu_notifier.h>
86039bd3
AA
20#include <linux/poll.h>
21#include <linux/slab.h>
22#include <linux/seq_file.h>
23#include <linux/file.h>
24#include <linux/bug.h>
25#include <linux/anon_inodes.h>
26#include <linux/syscalls.h>
27#include <linux/userfaultfd_k.h>
28#include <linux/mempolicy.h>
29#include <linux/ioctl.h>
30#include <linux/security.h>
cab350af 31#include <linux/hugetlb.h>
5c041f5d 32#include <linux/swapops.h>
2d5de004 33#include <linux/miscdevice.h>
86039bd3 34
2d337b71
Z
35static int sysctl_unprivileged_userfaultfd __read_mostly;
36
37#ifdef CONFIG_SYSCTL
38static struct ctl_table vm_userfaultfd_table[] = {
39 {
40 .procname = "unprivileged_userfaultfd",
41 .data = &sysctl_unprivileged_userfaultfd,
42 .maxlen = sizeof(sysctl_unprivileged_userfaultfd),
43 .mode = 0644,
44 .proc_handler = proc_dointvec_minmax,
45 .extra1 = SYSCTL_ZERO,
46 .extra2 = SYSCTL_ONE,
47 },
48 { }
49};
50#endif
cefdca0a 51
3004ec9c
AA
52static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
53
3004ec9c
AA
54/*
55 * Start with fault_pending_wqh and fault_wqh so they're more likely
56 * to be in the same cacheline.
cbcfa130
EB
57 *
58 * Locking order:
59 * fd_wqh.lock
60 * fault_pending_wqh.lock
61 * fault_wqh.lock
62 * event_wqh.lock
63 *
64 * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
65 * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
66 * also taken in IRQ context.
3004ec9c 67 */
86039bd3 68struct userfaultfd_ctx {
15b726ef
AA
69 /* waitqueue head for the pending (i.e. not read) userfaults */
70 wait_queue_head_t fault_pending_wqh;
71 /* waitqueue head for the userfaults */
86039bd3
AA
72 wait_queue_head_t fault_wqh;
73 /* waitqueue head for the pseudo fd to wakeup poll/read */
74 wait_queue_head_t fd_wqh;
9cd75c3c
PE
75 /* waitqueue head for events */
76 wait_queue_head_t event_wqh;
2c5b7e1b 77 /* a refile sequence protected by fault_pending_wqh lock */
2ca97ac8 78 seqcount_spinlock_t refile_seq;
3004ec9c 79 /* pseudo fd refcounting */
ca880420 80 refcount_t refcount;
86039bd3
AA
81 /* userfaultfd syscall flags */
82 unsigned int flags;
9cd75c3c
PE
83 /* features requested from the userspace */
84 unsigned int features;
86039bd3
AA
85 /* released */
86 bool released;
df2cc96e 87 /* memory mappings are changing because of non-cooperative event */
a759a909 88 atomic_t mmap_changing;
86039bd3
AA
89 /* mm with one ore more vmas attached to this userfaultfd_ctx */
90 struct mm_struct *mm;
91};
92
893e26e6
PE
93struct userfaultfd_fork_ctx {
94 struct userfaultfd_ctx *orig;
95 struct userfaultfd_ctx *new;
96 struct list_head list;
97};
98
897ab3e0
MR
99struct userfaultfd_unmap_ctx {
100 struct userfaultfd_ctx *ctx;
101 unsigned long start;
102 unsigned long end;
103 struct list_head list;
104};
105
86039bd3 106struct userfaultfd_wait_queue {
a9b85f94 107 struct uffd_msg msg;
ac6424b9 108 wait_queue_entry_t wq;
86039bd3 109 struct userfaultfd_ctx *ctx;
15a77c6f 110 bool waken;
86039bd3
AA
111};
112
113struct userfaultfd_wake_range {
114 unsigned long start;
115 unsigned long len;
116};
117
22e5fe2a
NA
118/* internal indication that UFFD_API ioctl was successfully executed */
119#define UFFD_FEATURE_INITIALIZED (1u << 31)
120
121static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
122{
123 return ctx->features & UFFD_FEATURE_INITIALIZED;
124}
125
2bad466c
PX
126/*
127 * Whether WP_UNPOPULATED is enabled on the uffd context. It is only
128 * meaningful when userfaultfd_wp()==true on the vma and when it's
129 * anonymous.
130 */
131bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
132{
133 struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
134
135 if (!ctx)
136 return false;
137
138 return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
139}
140
51d3d5eb
DH
141static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
142 vm_flags_t flags)
143{
144 const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
145
1c71222e 146 vm_flags_reset(vma, flags);
51d3d5eb
DH
147 /*
148 * For shared mappings, we want to enable writenotify while
149 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
150 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
151 */
152 if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
153 vma_set_page_prot(vma);
154}
155
ac6424b9 156static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
86039bd3
AA
157 int wake_flags, void *key)
158{
159 struct userfaultfd_wake_range *range = key;
160 int ret;
161 struct userfaultfd_wait_queue *uwq;
162 unsigned long start, len;
163
164 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
165 ret = 0;
86039bd3
AA
166 /* len == 0 means wake all */
167 start = range->start;
168 len = range->len;
a9b85f94
AA
169 if (len && (start > uwq->msg.arg.pagefault.address ||
170 start + len <= uwq->msg.arg.pagefault.address))
86039bd3 171 goto out;
15a77c6f
AA
172 WRITE_ONCE(uwq->waken, true);
173 /*
a9668cd6
PZ
174 * The Program-Order guarantees provided by the scheduler
175 * ensure uwq->waken is visible before the task is woken.
15a77c6f 176 */
86039bd3 177 ret = wake_up_state(wq->private, mode);
a9668cd6 178 if (ret) {
86039bd3
AA
179 /*
180 * Wake only once, autoremove behavior.
181 *
a9668cd6
PZ
182 * After the effect of list_del_init is visible to the other
183 * CPUs, the waitqueue may disappear from under us, see the
184 * !list_empty_careful() in handle_userfault().
185 *
186 * try_to_wake_up() has an implicit smp_mb(), and the
187 * wq->private is read before calling the extern function
188 * "wake_up_state" (which in turns calls try_to_wake_up).
86039bd3 189 */
2055da97 190 list_del_init(&wq->entry);
a9668cd6 191 }
86039bd3
AA
192out:
193 return ret;
194}
195
196/**
197 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
198 * context.
199 * @ctx: [in] Pointer to the userfaultfd context.
86039bd3
AA
200 */
201static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
202{
ca880420 203 refcount_inc(&ctx->refcount);
86039bd3
AA
204}
205
206/**
207 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
208 * context.
209 * @ctx: [in] Pointer to userfaultfd context.
210 *
211 * The userfaultfd context reference must have been previously acquired either
212 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
213 */
214static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
215{
ca880420 216 if (refcount_dec_and_test(&ctx->refcount)) {
86039bd3
AA
217 VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
218 VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
219 VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
220 VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
9cd75c3c
PE
221 VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
222 VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
86039bd3
AA
223 VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
224 VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
d2005e3f 225 mmdrop(ctx->mm);
3004ec9c 226 kmem_cache_free(userfaultfd_ctx_cachep, ctx);
86039bd3
AA
227 }
228}
229
a9b85f94 230static inline void msg_init(struct uffd_msg *msg)
86039bd3 231{
a9b85f94
AA
232 BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
233 /*
234 * Must use memset to zero out the paddings or kernel data is
235 * leaked to userland.
236 */
237 memset(msg, 0, sizeof(struct uffd_msg));
238}
239
240static inline struct uffd_msg userfault_msg(unsigned long address,
d172b1a3 241 unsigned long real_address,
a9b85f94 242 unsigned int flags,
9d4ac934
AP
243 unsigned long reason,
244 unsigned int features)
a9b85f94
AA
245{
246 struct uffd_msg msg;
d172b1a3 247
a9b85f94
AA
248 msg_init(&msg);
249 msg.event = UFFD_EVENT_PAGEFAULT;
824ddc60 250
d172b1a3
NA
251 msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
252 real_address : address;
253
7677f7fd
AR
254 /*
255 * These flags indicate why the userfault occurred:
256 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
257 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
258 * - Neither of these flags being set indicates a MISSING fault.
259 *
260 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
261 * fault. Otherwise, it was a read fault.
262 */
86039bd3 263 if (flags & FAULT_FLAG_WRITE)
a9b85f94 264 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
86039bd3 265 if (reason & VM_UFFD_WP)
a9b85f94 266 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
7677f7fd
AR
267 if (reason & VM_UFFD_MINOR)
268 msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
9d4ac934 269 if (features & UFFD_FEATURE_THREAD_ID)
a36985d3 270 msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
a9b85f94 271 return msg;
86039bd3
AA
272}
273
369cd212
MK
274#ifdef CONFIG_HUGETLB_PAGE
275/*
276 * Same functionality as userfaultfd_must_wait below with modifications for
277 * hugepmd ranges.
278 */
279static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
29a22b9e
SB
280 struct vm_fault *vmf,
281 unsigned long reason)
369cd212 282{
29a22b9e 283 struct vm_area_struct *vma = vmf->vma;
1e2c0436 284 pte_t *ptep, pte;
369cd212
MK
285 bool ret = true;
286
29a22b9e 287 assert_fault_locked(vmf);
1e2c0436 288
29a22b9e 289 ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
1e2c0436 290 if (!ptep)
369cd212
MK
291 goto out;
292
293 ret = false;
1e2c0436 294 pte = huge_ptep_get(ptep);
369cd212
MK
295
296 /*
297 * Lockless access: we're in a wait_event so it's ok if it
5c041f5d
PX
298 * changes under us. PTE markers should be handled the same as none
299 * ptes here.
369cd212 300 */
5c041f5d 301 if (huge_pte_none_mostly(pte))
369cd212 302 ret = true;
1e2c0436 303 if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
369cd212
MK
304 ret = true;
305out:
306 return ret;
307}
308#else
309static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
29a22b9e
SB
310 struct vm_fault *vmf,
311 unsigned long reason)
369cd212
MK
312{
313 return false; /* should never get here */
314}
315#endif /* CONFIG_HUGETLB_PAGE */
316
8d2afd96
AA
317/*
318 * Verify the pagetables are still not ok after having reigstered into
319 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
320 * userfault that has already been resolved, if userfaultfd_read and
321 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
322 * threads.
323 */
324static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
29a22b9e 325 struct vm_fault *vmf,
8d2afd96
AA
326 unsigned long reason)
327{
328 struct mm_struct *mm = ctx->mm;
29a22b9e 329 unsigned long address = vmf->address;
8d2afd96 330 pgd_t *pgd;
c2febafc 331 p4d_t *p4d;
8d2afd96
AA
332 pud_t *pud;
333 pmd_t *pmd, _pmd;
334 pte_t *pte;
c33c7948 335 pte_t ptent;
8d2afd96
AA
336 bool ret = true;
337
29a22b9e 338 assert_fault_locked(vmf);
8d2afd96
AA
339
340 pgd = pgd_offset(mm, address);
341 if (!pgd_present(*pgd))
342 goto out;
c2febafc
KS
343 p4d = p4d_offset(pgd, address);
344 if (!p4d_present(*p4d))
345 goto out;
346 pud = pud_offset(p4d, address);
8d2afd96
AA
347 if (!pud_present(*pud))
348 goto out;
349 pmd = pmd_offset(pud, address);
2b683a4f 350again:
26e1a0c3 351 _pmd = pmdp_get_lockless(pmd);
a365ac09 352 if (pmd_none(_pmd))
8d2afd96
AA
353 goto out;
354
355 ret = false;
2b683a4f 356 if (!pmd_present(_pmd) || pmd_devmap(_pmd))
a365ac09
HY
357 goto out;
358
63b2d417
AA
359 if (pmd_trans_huge(_pmd)) {
360 if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
361 ret = true;
8d2afd96 362 goto out;
63b2d417 363 }
8d2afd96 364
8d2afd96 365 pte = pte_offset_map(pmd, address);
2b683a4f
HD
366 if (!pte) {
367 ret = true;
368 goto again;
369 }
8d2afd96
AA
370 /*
371 * Lockless access: we're in a wait_event so it's ok if it
5c041f5d
PX
372 * changes under us. PTE markers should be handled the same as none
373 * ptes here.
8d2afd96 374 */
c33c7948
RR
375 ptent = ptep_get(pte);
376 if (pte_none_mostly(ptent))
8d2afd96 377 ret = true;
c33c7948 378 if (!pte_write(ptent) && (reason & VM_UFFD_WP))
63b2d417 379 ret = true;
8d2afd96
AA
380 pte_unmap(pte);
381
382out:
383 return ret;
384}
385
2f064a59 386static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
3e69ad08
PX
387{
388 if (flags & FAULT_FLAG_INTERRUPTIBLE)
389 return TASK_INTERRUPTIBLE;
390
391 if (flags & FAULT_FLAG_KILLABLE)
392 return TASK_KILLABLE;
393
394 return TASK_UNINTERRUPTIBLE;
395}
396
86039bd3
AA
397/*
398 * The locking rules involved in returning VM_FAULT_RETRY depending on
399 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
400 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
401 * recommendation in __lock_page_or_retry is not an understatement.
402 *
c1e8d7c6 403 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
86039bd3
AA
404 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
405 * not set.
406 *
407 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
408 * set, VM_FAULT_RETRY can still be returned if and only if there are
c1e8d7c6 409 * fatal_signal_pending()s, and the mmap_lock must be released before
86039bd3
AA
410 * returning it.
411 */
2b740303 412vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
86039bd3 413{
b8da2e46
PX
414 struct vm_area_struct *vma = vmf->vma;
415 struct mm_struct *mm = vma->vm_mm;
86039bd3
AA
416 struct userfaultfd_ctx *ctx;
417 struct userfaultfd_wait_queue uwq;
2b740303 418 vm_fault_t ret = VM_FAULT_SIGBUS;
3e69ad08 419 bool must_wait;
2f064a59 420 unsigned int blocking_state;
86039bd3 421
64c2b203
AA
422 /*
423 * We don't do userfault handling for the final child pid update.
424 *
425 * We also don't do userfault handling during
426 * coredumping. hugetlbfs has the special
48498071 427 * hugetlb_follow_page_mask() to skip missing pages in the
64c2b203
AA
428 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
429 * the no_page_table() helper in follow_page_mask(), but the
430 * shmem_vm_ops->fault method is invoked even during
c1e8d7c6 431 * coredumping without mmap_lock and it ends up here.
64c2b203
AA
432 */
433 if (current->flags & (PF_EXITING|PF_DUMPCORE))
434 goto out;
435
436 /*
c1e8d7c6
ML
437 * Coredumping runs without mmap_lock so we can only check that
438 * the mmap_lock is held, if PF_DUMPCORE was not set.
64c2b203 439 */
29a22b9e 440 assert_fault_locked(vmf);
64c2b203 441
b8da2e46 442 ctx = vma->vm_userfaultfd_ctx.ctx;
86039bd3 443 if (!ctx)
ba85c702 444 goto out;
86039bd3
AA
445
446 BUG_ON(ctx->mm != mm);
447
7677f7fd
AR
448 /* Any unrecognized flag is a bug. */
449 VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
450 /* 0 or > 1 flags set is a bug; we expect exactly 1. */
451 VM_BUG_ON(!reason || (reason & (reason - 1)));
86039bd3 452
2d6d6f5a
PS
453 if (ctx->features & UFFD_FEATURE_SIGBUS)
454 goto out;
2d5de004 455 if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
37cd0575 456 goto out;
2d6d6f5a 457
86039bd3
AA
458 /*
459 * If it's already released don't get it. This avoids to loop
460 * in __get_user_pages if userfaultfd_release waits on the
c1e8d7c6 461 * caller of handle_userfault to release the mmap_lock.
86039bd3 462 */
6aa7de05 463 if (unlikely(READ_ONCE(ctx->released))) {
656710a6
AA
464 /*
465 * Don't return VM_FAULT_SIGBUS in this case, so a non
466 * cooperative manager can close the uffd after the
467 * last UFFDIO_COPY, without risking to trigger an
468 * involuntary SIGBUS if the process was starting the
469 * userfaultfd while the userfaultfd was still armed
470 * (but after the last UFFDIO_COPY). If the uffd
471 * wasn't already closed when the userfault reached
472 * this point, that would normally be solved by
473 * userfaultfd_must_wait returning 'false'.
474 *
475 * If we were to return VM_FAULT_SIGBUS here, the non
476 * cooperative manager would be instead forced to
477 * always call UFFDIO_UNREGISTER before it can safely
478 * close the uffd.
479 */
480 ret = VM_FAULT_NOPAGE;
ba85c702 481 goto out;
656710a6 482 }
86039bd3
AA
483
484 /*
485 * Check that we can return VM_FAULT_RETRY.
486 *
487 * NOTE: it should become possible to return VM_FAULT_RETRY
488 * even if FAULT_FLAG_TRIED is set without leading to gup()
489 * -EBUSY failures, if the userfaultfd is to be extended for
490 * VM_UFFD_WP tracking and we intend to arm the userfault
491 * without first stopping userland access to the memory. For
492 * VM_UFFD_MISSING userfaults this is enough for now.
493 */
82b0f8c3 494 if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
86039bd3
AA
495 /*
496 * Validate the invariant that nowait must allow retry
497 * to be sure not to return SIGBUS erroneously on
498 * nowait invocations.
499 */
82b0f8c3 500 BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
86039bd3
AA
501#ifdef CONFIG_DEBUG_VM
502 if (printk_ratelimit()) {
503 printk(KERN_WARNING
82b0f8c3
JK
504 "FAULT_FLAG_ALLOW_RETRY missing %x\n",
505 vmf->flags);
86039bd3
AA
506 dump_stack();
507 }
508#endif
ba85c702 509 goto out;
86039bd3
AA
510 }
511
512 /*
513 * Handle nowait, not much to do other than tell it to retry
514 * and wait.
515 */
ba85c702 516 ret = VM_FAULT_RETRY;
82b0f8c3 517 if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
ba85c702 518 goto out;
86039bd3 519
c1e8d7c6 520 /* take the reference before dropping the mmap_lock */
86039bd3
AA
521 userfaultfd_ctx_get(ctx);
522
86039bd3
AA
523 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
524 uwq.wq.private = current;
d172b1a3
NA
525 uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
526 reason, ctx->features);
86039bd3 527 uwq.ctx = ctx;
15a77c6f 528 uwq.waken = false;
86039bd3 529
3e69ad08 530 blocking_state = userfaultfd_get_blocking_state(vmf->flags);
dfa37dc3 531
b8da2e46
PX
532 /*
533 * Take the vma lock now, in order to safely call
534 * userfaultfd_huge_must_wait() later. Since acquiring the
535 * (sleepable) vma lock can modify the current task state, that
536 * must be before explicitly calling set_current_state().
537 */
538 if (is_vm_hugetlb_page(vma))
539 hugetlb_vma_lock_read(vma);
540
cbcfa130 541 spin_lock_irq(&ctx->fault_pending_wqh.lock);
86039bd3
AA
542 /*
543 * After the __add_wait_queue the uwq is visible to userland
544 * through poll/read().
545 */
15b726ef
AA
546 __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
547 /*
548 * The smp_mb() after __set_current_state prevents the reads
549 * following the spin_unlock to happen before the list_add in
550 * __add_wait_queue.
551 */
15a77c6f 552 set_current_state(blocking_state);
cbcfa130 553 spin_unlock_irq(&ctx->fault_pending_wqh.lock);
86039bd3 554
b8da2e46 555 if (!is_vm_hugetlb_page(vma))
29a22b9e 556 must_wait = userfaultfd_must_wait(ctx, vmf, reason);
369cd212 557 else
29a22b9e 558 must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
b8da2e46
PX
559 if (is_vm_hugetlb_page(vma))
560 hugetlb_vma_unlock_read(vma);
29a22b9e 561 release_fault_lock(vmf);
8d2afd96 562
f9bf3522 563 if (likely(must_wait && !READ_ONCE(ctx->released))) {
a9a08845 564 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
86039bd3 565 schedule();
ba85c702 566 }
86039bd3 567
ba85c702 568 __set_current_state(TASK_RUNNING);
15b726ef
AA
569
570 /*
571 * Here we race with the list_del; list_add in
572 * userfaultfd_ctx_read(), however because we don't ever run
573 * list_del_init() to refile across the two lists, the prev
574 * and next pointers will never point to self. list_add also
575 * would never let any of the two pointers to point to
576 * self. So list_empty_careful won't risk to see both pointers
577 * pointing to self at any time during the list refile. The
578 * only case where list_del_init() is called is the full
579 * removal in the wake function and there we don't re-list_add
580 * and it's fine not to block on the spinlock. The uwq on this
581 * kernel stack can be released after the list_del_init.
582 */
2055da97 583 if (!list_empty_careful(&uwq.wq.entry)) {
cbcfa130 584 spin_lock_irq(&ctx->fault_pending_wqh.lock);
15b726ef
AA
585 /*
586 * No need of list_del_init(), the uwq on the stack
587 * will be freed shortly anyway.
588 */
2055da97 589 list_del(&uwq.wq.entry);
cbcfa130 590 spin_unlock_irq(&ctx->fault_pending_wqh.lock);
86039bd3 591 }
86039bd3
AA
592
593 /*
594 * ctx may go away after this if the userfault pseudo fd is
595 * already released.
596 */
597 userfaultfd_ctx_put(ctx);
598
ba85c702
AA
599out:
600 return ret;
86039bd3
AA
601}
602
8c9e7bb7
AA
603static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
604 struct userfaultfd_wait_queue *ewq)
9cd75c3c 605{
0cbb4b4f
AA
606 struct userfaultfd_ctx *release_new_ctx;
607
9a69a829
AA
608 if (WARN_ON_ONCE(current->flags & PF_EXITING))
609 goto out;
9cd75c3c
PE
610
611 ewq->ctx = ctx;
612 init_waitqueue_entry(&ewq->wq, current);
0cbb4b4f 613 release_new_ctx = NULL;
9cd75c3c 614
cbcfa130 615 spin_lock_irq(&ctx->event_wqh.lock);
9cd75c3c
PE
616 /*
617 * After the __add_wait_queue the uwq is visible to userland
618 * through poll/read().
619 */
620 __add_wait_queue(&ctx->event_wqh, &ewq->wq);
621 for (;;) {
622 set_current_state(TASK_KILLABLE);
623 if (ewq->msg.event == 0)
624 break;
6aa7de05 625 if (READ_ONCE(ctx->released) ||
9cd75c3c 626 fatal_signal_pending(current)) {
384632e6
AA
627 /*
628 * &ewq->wq may be queued in fork_event, but
629 * __remove_wait_queue ignores the head
630 * parameter. It would be a problem if it
631 * didn't.
632 */
9cd75c3c 633 __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
7eb76d45
MR
634 if (ewq->msg.event == UFFD_EVENT_FORK) {
635 struct userfaultfd_ctx *new;
636
637 new = (struct userfaultfd_ctx *)
638 (unsigned long)
639 ewq->msg.arg.reserved.reserved1;
0cbb4b4f 640 release_new_ctx = new;
7eb76d45 641 }
9cd75c3c
PE
642 break;
643 }
644
cbcfa130 645 spin_unlock_irq(&ctx->event_wqh.lock);
9cd75c3c 646
a9a08845 647 wake_up_poll(&ctx->fd_wqh, EPOLLIN);
9cd75c3c
PE
648 schedule();
649
cbcfa130 650 spin_lock_irq(&ctx->event_wqh.lock);
9cd75c3c
PE
651 }
652 __set_current_state(TASK_RUNNING);
cbcfa130 653 spin_unlock_irq(&ctx->event_wqh.lock);
9cd75c3c 654
0cbb4b4f
AA
655 if (release_new_ctx) {
656 struct vm_area_struct *vma;
657 struct mm_struct *mm = release_new_ctx->mm;
69dbe6da 658 VMA_ITERATOR(vmi, mm, 0);
0cbb4b4f
AA
659
660 /* the various vma->vm_userfaultfd_ctx still points to it */
d8ed45c5 661 mmap_write_lock(mm);
69dbe6da 662 for_each_vma(vmi, vma) {
31e810aa 663 if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
60081bf1 664 vma_start_write(vma);
0cbb4b4f 665 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
51d3d5eb
DH
666 userfaultfd_set_vm_flags(vma,
667 vma->vm_flags & ~__VM_UFFD_FLAGS);
31e810aa 668 }
69dbe6da 669 }
d8ed45c5 670 mmap_write_unlock(mm);
0cbb4b4f
AA
671
672 userfaultfd_ctx_put(release_new_ctx);
673 }
674
9cd75c3c
PE
675 /*
676 * ctx may go away after this if the userfault pseudo fd is
677 * already released.
678 */
9a69a829 679out:
a759a909
NA
680 atomic_dec(&ctx->mmap_changing);
681 VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
9cd75c3c 682 userfaultfd_ctx_put(ctx);
9cd75c3c
PE
683}
684
685static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
686 struct userfaultfd_wait_queue *ewq)
687{
688 ewq->msg.event = 0;
689 wake_up_locked(&ctx->event_wqh);
690 __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
691}
692
893e26e6
PE
693int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
694{
695 struct userfaultfd_ctx *ctx = NULL, *octx;
696 struct userfaultfd_fork_ctx *fctx;
697
698 octx = vma->vm_userfaultfd_ctx.ctx;
699 if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
60081bf1 700 vma_start_write(vma);
893e26e6 701 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
51d3d5eb 702 userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
893e26e6
PE
703 return 0;
704 }
705
706 list_for_each_entry(fctx, fcs, list)
707 if (fctx->orig == octx) {
708 ctx = fctx->new;
709 break;
710 }
711
712 if (!ctx) {
713 fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
714 if (!fctx)
715 return -ENOMEM;
716
717 ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
718 if (!ctx) {
719 kfree(fctx);
720 return -ENOMEM;
721 }
722
ca880420 723 refcount_set(&ctx->refcount, 1);
893e26e6 724 ctx->flags = octx->flags;
893e26e6
PE
725 ctx->features = octx->features;
726 ctx->released = false;
a759a909 727 atomic_set(&ctx->mmap_changing, 0);
893e26e6 728 ctx->mm = vma->vm_mm;
00bb31fa 729 mmgrab(ctx->mm);
893e26e6
PE
730
731 userfaultfd_ctx_get(octx);
a759a909 732 atomic_inc(&octx->mmap_changing);
893e26e6
PE
733 fctx->orig = octx;
734 fctx->new = ctx;
735 list_add_tail(&fctx->list, fcs);
736 }
737
738 vma->vm_userfaultfd_ctx.ctx = ctx;
739 return 0;
740}
741
8c9e7bb7 742static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
893e26e6
PE
743{
744 struct userfaultfd_ctx *ctx = fctx->orig;
745 struct userfaultfd_wait_queue ewq;
746
747 msg_init(&ewq.msg);
748
749 ewq.msg.event = UFFD_EVENT_FORK;
750 ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
751
8c9e7bb7 752 userfaultfd_event_wait_completion(ctx, &ewq);
893e26e6
PE
753}
754
755void dup_userfaultfd_complete(struct list_head *fcs)
756{
893e26e6
PE
757 struct userfaultfd_fork_ctx *fctx, *n;
758
759 list_for_each_entry_safe(fctx, n, fcs, list) {
8c9e7bb7 760 dup_fctx(fctx);
893e26e6
PE
761 list_del(&fctx->list);
762 kfree(fctx);
763 }
764}
765
72f87654
PE
766void mremap_userfaultfd_prep(struct vm_area_struct *vma,
767 struct vm_userfaultfd_ctx *vm_ctx)
768{
769 struct userfaultfd_ctx *ctx;
770
771 ctx = vma->vm_userfaultfd_ctx.ctx;
3cfd22be
PX
772
773 if (!ctx)
774 return;
775
776 if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
72f87654
PE
777 vm_ctx->ctx = ctx;
778 userfaultfd_ctx_get(ctx);
a759a909 779 atomic_inc(&ctx->mmap_changing);
3cfd22be
PX
780 } else {
781 /* Drop uffd context if remap feature not enabled */
60081bf1 782 vma_start_write(vma);
3cfd22be 783 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
51d3d5eb 784 userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
72f87654
PE
785 }
786}
787
90794bf1 788void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
72f87654
PE
789 unsigned long from, unsigned long to,
790 unsigned long len)
791{
90794bf1 792 struct userfaultfd_ctx *ctx = vm_ctx->ctx;
72f87654
PE
793 struct userfaultfd_wait_queue ewq;
794
795 if (!ctx)
796 return;
797
798 if (to & ~PAGE_MASK) {
799 userfaultfd_ctx_put(ctx);
800 return;
801 }
802
803 msg_init(&ewq.msg);
804
805 ewq.msg.event = UFFD_EVENT_REMAP;
806 ewq.msg.arg.remap.from = from;
807 ewq.msg.arg.remap.to = to;
808 ewq.msg.arg.remap.len = len;
809
810 userfaultfd_event_wait_completion(ctx, &ewq);
811}
812
70ccb92f 813bool userfaultfd_remove(struct vm_area_struct *vma,
d811914d 814 unsigned long start, unsigned long end)
05ce7724
PE
815{
816 struct mm_struct *mm = vma->vm_mm;
817 struct userfaultfd_ctx *ctx;
818 struct userfaultfd_wait_queue ewq;
819
820 ctx = vma->vm_userfaultfd_ctx.ctx;
d811914d 821 if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
70ccb92f 822 return true;
05ce7724
PE
823
824 userfaultfd_ctx_get(ctx);
a759a909 825 atomic_inc(&ctx->mmap_changing);
d8ed45c5 826 mmap_read_unlock(mm);
05ce7724 827
05ce7724
PE
828 msg_init(&ewq.msg);
829
d811914d
MR
830 ewq.msg.event = UFFD_EVENT_REMOVE;
831 ewq.msg.arg.remove.start = start;
832 ewq.msg.arg.remove.end = end;
05ce7724
PE
833
834 userfaultfd_event_wait_completion(ctx, &ewq);
835
70ccb92f 836 return false;
05ce7724
PE
837}
838
897ab3e0
MR
839static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
840 unsigned long start, unsigned long end)
841{
842 struct userfaultfd_unmap_ctx *unmap_ctx;
843
844 list_for_each_entry(unmap_ctx, unmaps, list)
845 if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
846 unmap_ctx->end == end)
847 return true;
848
849 return false;
850}
851
65ac1320 852int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
69dbe6da 853 unsigned long end, struct list_head *unmaps)
897ab3e0 854{
65ac1320
LH
855 struct userfaultfd_unmap_ctx *unmap_ctx;
856 struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
897ab3e0 857
65ac1320
LH
858 if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
859 has_unmap_ctx(ctx, unmaps, start, end))
860 return 0;
897ab3e0 861
65ac1320
LH
862 unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
863 if (!unmap_ctx)
864 return -ENOMEM;
897ab3e0 865
65ac1320
LH
866 userfaultfd_ctx_get(ctx);
867 atomic_inc(&ctx->mmap_changing);
868 unmap_ctx->ctx = ctx;
869 unmap_ctx->start = start;
870 unmap_ctx->end = end;
871 list_add_tail(&unmap_ctx->list, unmaps);
897ab3e0
MR
872
873 return 0;
874}
875
876void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
877{
878 struct userfaultfd_unmap_ctx *ctx, *n;
879 struct userfaultfd_wait_queue ewq;
880
881 list_for_each_entry_safe(ctx, n, uf, list) {
882 msg_init(&ewq.msg);
883
884 ewq.msg.event = UFFD_EVENT_UNMAP;
885 ewq.msg.arg.remove.start = ctx->start;
886 ewq.msg.arg.remove.end = ctx->end;
887
888 userfaultfd_event_wait_completion(ctx->ctx, &ewq);
889
890 list_del(&ctx->list);
891 kfree(ctx);
892 }
893}
894
86039bd3
AA
895static int userfaultfd_release(struct inode *inode, struct file *file)
896{
897 struct userfaultfd_ctx *ctx = file->private_data;
898 struct mm_struct *mm = ctx->mm;
899 struct vm_area_struct *vma, *prev;
900 /* len == 0 means wake all */
901 struct userfaultfd_wake_range range = { .len = 0, };
902 unsigned long new_flags;
11a9b902 903 VMA_ITERATOR(vmi, mm, 0);
86039bd3 904
6aa7de05 905 WRITE_ONCE(ctx->released, true);
86039bd3 906
d2005e3f
ON
907 if (!mmget_not_zero(mm))
908 goto wakeup;
909
86039bd3
AA
910 /*
911 * Flush page faults out of all CPUs. NOTE: all page faults
912 * must be retried without returning VM_FAULT_SIGBUS if
913 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
c1e8d7c6 914 * changes while handle_userfault released the mmap_lock. So
86039bd3 915 * it's critical that released is set to true (above), before
c1e8d7c6 916 * taking the mmap_lock for writing.
86039bd3 917 */
d8ed45c5 918 mmap_write_lock(mm);
86039bd3 919 prev = NULL;
11a9b902 920 for_each_vma(vmi, vma) {
86039bd3
AA
921 cond_resched();
922 BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
7677f7fd 923 !!(vma->vm_flags & __VM_UFFD_FLAGS));
86039bd3
AA
924 if (vma->vm_userfaultfd_ctx.ctx != ctx) {
925 prev = vma;
926 continue;
927 }
7677f7fd 928 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
9760ebff 929 prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
4d45e75a
JH
930 new_flags, vma->anon_vma,
931 vma->vm_file, vma->vm_pgoff,
932 vma_policy(vma),
5c26f6ac 933 NULL_VM_UFFD_CTX, anon_vma_name(vma));
69dbe6da 934 if (prev) {
4d45e75a 935 vma = prev;
69dbe6da 936 } else {
4d45e75a 937 prev = vma;
69dbe6da
LH
938 }
939
60081bf1 940 vma_start_write(vma);
51d3d5eb 941 userfaultfd_set_vm_flags(vma, new_flags);
86039bd3
AA
942 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
943 }
d8ed45c5 944 mmap_write_unlock(mm);
d2005e3f
ON
945 mmput(mm);
946wakeup:
86039bd3 947 /*
15b726ef 948 * After no new page faults can wait on this fault_*wqh, flush
86039bd3 949 * the last page faults that may have been already waiting on
15b726ef 950 * the fault_*wqh.
86039bd3 951 */
cbcfa130 952 spin_lock_irq(&ctx->fault_pending_wqh.lock);
ac5be6b4 953 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
c430d1e8 954 __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
cbcfa130 955 spin_unlock_irq(&ctx->fault_pending_wqh.lock);
86039bd3 956
5a18b64e
MR
957 /* Flush pending events that may still wait on event_wqh */
958 wake_up_all(&ctx->event_wqh);
959
a9a08845 960 wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
86039bd3
AA
961 userfaultfd_ctx_put(ctx);
962 return 0;
963}
964
15b726ef 965/* fault_pending_wqh.lock must be hold by the caller */
6dcc27fd
PE
966static inline struct userfaultfd_wait_queue *find_userfault_in(
967 wait_queue_head_t *wqh)
86039bd3 968{
ac6424b9 969 wait_queue_entry_t *wq;
15b726ef 970 struct userfaultfd_wait_queue *uwq;
86039bd3 971
456a7378 972 lockdep_assert_held(&wqh->lock);
86039bd3 973
15b726ef 974 uwq = NULL;
6dcc27fd 975 if (!waitqueue_active(wqh))
15b726ef
AA
976 goto out;
977 /* walk in reverse to provide FIFO behavior to read userfaults */
2055da97 978 wq = list_last_entry(&wqh->head, typeof(*wq), entry);
15b726ef
AA
979 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
980out:
981 return uwq;
86039bd3 982}
6dcc27fd
PE
983
984static inline struct userfaultfd_wait_queue *find_userfault(
985 struct userfaultfd_ctx *ctx)
986{
987 return find_userfault_in(&ctx->fault_pending_wqh);
988}
86039bd3 989
9cd75c3c
PE
990static inline struct userfaultfd_wait_queue *find_userfault_evt(
991 struct userfaultfd_ctx *ctx)
992{
993 return find_userfault_in(&ctx->event_wqh);
994}
995
076ccb76 996static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
86039bd3
AA
997{
998 struct userfaultfd_ctx *ctx = file->private_data;
076ccb76 999 __poll_t ret;
86039bd3
AA
1000
1001 poll_wait(file, &ctx->fd_wqh, wait);
1002
22e5fe2a 1003 if (!userfaultfd_is_initialized(ctx))
a9a08845 1004 return EPOLLERR;
9cd75c3c 1005
22e5fe2a
NA
1006 /*
1007 * poll() never guarantees that read won't block.
1008 * userfaults can be waken before they're read().
1009 */
1010 if (unlikely(!(file->f_flags & O_NONBLOCK)))
a9a08845 1011 return EPOLLERR;
22e5fe2a
NA
1012 /*
1013 * lockless access to see if there are pending faults
1014 * __pollwait last action is the add_wait_queue but
1015 * the spin_unlock would allow the waitqueue_active to
1016 * pass above the actual list_add inside
1017 * add_wait_queue critical section. So use a full
1018 * memory barrier to serialize the list_add write of
1019 * add_wait_queue() with the waitqueue_active read
1020 * below.
1021 */
1022 ret = 0;
1023 smp_mb();
1024 if (waitqueue_active(&ctx->fault_pending_wqh))
1025 ret = EPOLLIN;
1026 else if (waitqueue_active(&ctx->event_wqh))
1027 ret = EPOLLIN;
1028
1029 return ret;
86039bd3
AA
1030}
1031
893e26e6
PE
1032static const struct file_operations userfaultfd_fops;
1033
b537900f
DC
1034static int resolve_userfault_fork(struct userfaultfd_ctx *new,
1035 struct inode *inode,
893e26e6
PE
1036 struct uffd_msg *msg)
1037{
1038 int fd;
893e26e6 1039
b537900f 1040 fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
abec3d01 1041 O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
893e26e6
PE
1042 if (fd < 0)
1043 return fd;
1044
893e26e6
PE
1045 msg->arg.reserved.reserved1 = 0;
1046 msg->arg.fork.ufd = fd;
893e26e6
PE
1047 return 0;
1048}
1049
86039bd3 1050static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
b537900f 1051 struct uffd_msg *msg, struct inode *inode)
86039bd3
AA
1052{
1053 ssize_t ret;
1054 DECLARE_WAITQUEUE(wait, current);
15b726ef 1055 struct userfaultfd_wait_queue *uwq;
893e26e6
PE
1056 /*
1057 * Handling fork event requires sleeping operations, so
1058 * we drop the event_wqh lock, then do these ops, then
1059 * lock it back and wake up the waiter. While the lock is
1060 * dropped the ewq may go away so we keep track of it
1061 * carefully.
1062 */
1063 LIST_HEAD(fork_event);
1064 struct userfaultfd_ctx *fork_nctx = NULL;
86039bd3 1065
15b726ef 1066 /* always take the fd_wqh lock before the fault_pending_wqh lock */
ae62c16e 1067 spin_lock_irq(&ctx->fd_wqh.lock);
86039bd3
AA
1068 __add_wait_queue(&ctx->fd_wqh, &wait);
1069 for (;;) {
1070 set_current_state(TASK_INTERRUPTIBLE);
15b726ef
AA
1071 spin_lock(&ctx->fault_pending_wqh.lock);
1072 uwq = find_userfault(ctx);
1073 if (uwq) {
2c5b7e1b
AA
1074 /*
1075 * Use a seqcount to repeat the lockless check
1076 * in wake_userfault() to avoid missing
1077 * wakeups because during the refile both
1078 * waitqueue could become empty if this is the
1079 * only userfault.
1080 */
1081 write_seqcount_begin(&ctx->refile_seq);
1082
86039bd3 1083 /*
15b726ef
AA
1084 * The fault_pending_wqh.lock prevents the uwq
1085 * to disappear from under us.
1086 *
1087 * Refile this userfault from
1088 * fault_pending_wqh to fault_wqh, it's not
1089 * pending anymore after we read it.
1090 *
1091 * Use list_del() by hand (as
1092 * userfaultfd_wake_function also uses
1093 * list_del_init() by hand) to be sure nobody
1094 * changes __remove_wait_queue() to use
1095 * list_del_init() in turn breaking the
1096 * !list_empty_careful() check in
2055da97 1097 * handle_userfault(). The uwq->wq.head list
15b726ef
AA
1098 * must never be empty at any time during the
1099 * refile, or the waitqueue could disappear
1100 * from under us. The "wait_queue_head_t"
1101 * parameter of __remove_wait_queue() is unused
1102 * anyway.
86039bd3 1103 */
2055da97 1104 list_del(&uwq->wq.entry);
c430d1e8 1105 add_wait_queue(&ctx->fault_wqh, &uwq->wq);
15b726ef 1106
2c5b7e1b
AA
1107 write_seqcount_end(&ctx->refile_seq);
1108
a9b85f94
AA
1109 /* careful to always initialize msg if ret == 0 */
1110 *msg = uwq->msg;
15b726ef 1111 spin_unlock(&ctx->fault_pending_wqh.lock);
86039bd3
AA
1112 ret = 0;
1113 break;
1114 }
15b726ef 1115 spin_unlock(&ctx->fault_pending_wqh.lock);
9cd75c3c
PE
1116
1117 spin_lock(&ctx->event_wqh.lock);
1118 uwq = find_userfault_evt(ctx);
1119 if (uwq) {
1120 *msg = uwq->msg;
1121
893e26e6
PE
1122 if (uwq->msg.event == UFFD_EVENT_FORK) {
1123 fork_nctx = (struct userfaultfd_ctx *)
1124 (unsigned long)
1125 uwq->msg.arg.reserved.reserved1;
2055da97 1126 list_move(&uwq->wq.entry, &fork_event);
384632e6
AA
1127 /*
1128 * fork_nctx can be freed as soon as
1129 * we drop the lock, unless we take a
1130 * reference on it.
1131 */
1132 userfaultfd_ctx_get(fork_nctx);
893e26e6
PE
1133 spin_unlock(&ctx->event_wqh.lock);
1134 ret = 0;
1135 break;
1136 }
1137
9cd75c3c
PE
1138 userfaultfd_event_complete(ctx, uwq);
1139 spin_unlock(&ctx->event_wqh.lock);
1140 ret = 0;
1141 break;
1142 }
1143 spin_unlock(&ctx->event_wqh.lock);
1144
86039bd3
AA
1145 if (signal_pending(current)) {
1146 ret = -ERESTARTSYS;
1147 break;
1148 }
1149 if (no_wait) {
1150 ret = -EAGAIN;
1151 break;
1152 }
ae62c16e 1153 spin_unlock_irq(&ctx->fd_wqh.lock);
86039bd3 1154 schedule();
ae62c16e 1155 spin_lock_irq(&ctx->fd_wqh.lock);
86039bd3
AA
1156 }
1157 __remove_wait_queue(&ctx->fd_wqh, &wait);
1158 __set_current_state(TASK_RUNNING);
ae62c16e 1159 spin_unlock_irq(&ctx->fd_wqh.lock);
86039bd3 1160
893e26e6 1161 if (!ret && msg->event == UFFD_EVENT_FORK) {
b537900f 1162 ret = resolve_userfault_fork(fork_nctx, inode, msg);
cbcfa130 1163 spin_lock_irq(&ctx->event_wqh.lock);
384632e6
AA
1164 if (!list_empty(&fork_event)) {
1165 /*
1166 * The fork thread didn't abort, so we can
1167 * drop the temporary refcount.
1168 */
1169 userfaultfd_ctx_put(fork_nctx);
1170
1171 uwq = list_first_entry(&fork_event,
1172 typeof(*uwq),
1173 wq.entry);
1174 /*
1175 * If fork_event list wasn't empty and in turn
1176 * the event wasn't already released by fork
1177 * (the event is allocated on fork kernel
1178 * stack), put the event back to its place in
1179 * the event_wq. fork_event head will be freed
1180 * as soon as we return so the event cannot
1181 * stay queued there no matter the current
1182 * "ret" value.
1183 */
1184 list_del(&uwq->wq.entry);
1185 __add_wait_queue(&ctx->event_wqh, &uwq->wq);
893e26e6 1186
384632e6
AA
1187 /*
1188 * Leave the event in the waitqueue and report
1189 * error to userland if we failed to resolve
1190 * the userfault fork.
1191 */
1192 if (likely(!ret))
893e26e6 1193 userfaultfd_event_complete(ctx, uwq);
384632e6
AA
1194 } else {
1195 /*
1196 * Here the fork thread aborted and the
1197 * refcount from the fork thread on fork_nctx
1198 * has already been released. We still hold
1199 * the reference we took before releasing the
1200 * lock above. If resolve_userfault_fork
1201 * failed we've to drop it because the
1202 * fork_nctx has to be freed in such case. If
1203 * it succeeded we'll hold it because the new
1204 * uffd references it.
1205 */
1206 if (ret)
1207 userfaultfd_ctx_put(fork_nctx);
893e26e6 1208 }
cbcfa130 1209 spin_unlock_irq(&ctx->event_wqh.lock);
893e26e6
PE
1210 }
1211
86039bd3
AA
1212 return ret;
1213}
1214
1215static ssize_t userfaultfd_read(struct file *file, char __user *buf,
1216 size_t count, loff_t *ppos)
1217{
1218 struct userfaultfd_ctx *ctx = file->private_data;
1219 ssize_t _ret, ret = 0;
a9b85f94 1220 struct uffd_msg msg;
86039bd3 1221 int no_wait = file->f_flags & O_NONBLOCK;
b537900f 1222 struct inode *inode = file_inode(file);
86039bd3 1223
22e5fe2a 1224 if (!userfaultfd_is_initialized(ctx))
86039bd3 1225 return -EINVAL;
86039bd3
AA
1226
1227 for (;;) {
a9b85f94 1228 if (count < sizeof(msg))
86039bd3 1229 return ret ? ret : -EINVAL;
b537900f 1230 _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
86039bd3
AA
1231 if (_ret < 0)
1232 return ret ? ret : _ret;
a9b85f94 1233 if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
86039bd3 1234 return ret ? ret : -EFAULT;
a9b85f94
AA
1235 ret += sizeof(msg);
1236 buf += sizeof(msg);
1237 count -= sizeof(msg);
86039bd3
AA
1238 /*
1239 * Allow to read more than one fault at time but only
1240 * block if waiting for the very first one.
1241 */
1242 no_wait = O_NONBLOCK;
1243 }
1244}
1245
1246static void __wake_userfault(struct userfaultfd_ctx *ctx,
1247 struct userfaultfd_wake_range *range)
1248{
cbcfa130 1249 spin_lock_irq(&ctx->fault_pending_wqh.lock);
86039bd3 1250 /* wake all in the range and autoremove */
15b726ef 1251 if (waitqueue_active(&ctx->fault_pending_wqh))
ac5be6b4 1252 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
15b726ef
AA
1253 range);
1254 if (waitqueue_active(&ctx->fault_wqh))
c430d1e8 1255 __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
cbcfa130 1256 spin_unlock_irq(&ctx->fault_pending_wqh.lock);
86039bd3
AA
1257}
1258
1259static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
1260 struct userfaultfd_wake_range *range)
1261{
2c5b7e1b
AA
1262 unsigned seq;
1263 bool need_wakeup;
1264
86039bd3
AA
1265 /*
1266 * To be sure waitqueue_active() is not reordered by the CPU
1267 * before the pagetable update, use an explicit SMP memory
3e4e28c5 1268 * barrier here. PT lock release or mmap_read_unlock(mm) still
86039bd3
AA
1269 * have release semantics that can allow the
1270 * waitqueue_active() to be reordered before the pte update.
1271 */
1272 smp_mb();
1273
1274 /*
1275 * Use waitqueue_active because it's very frequent to
1276 * change the address space atomically even if there are no
1277 * userfaults yet. So we take the spinlock only when we're
1278 * sure we've userfaults to wake.
1279 */
2c5b7e1b
AA
1280 do {
1281 seq = read_seqcount_begin(&ctx->refile_seq);
1282 need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
1283 waitqueue_active(&ctx->fault_wqh);
1284 cond_resched();
1285 } while (read_seqcount_retry(&ctx->refile_seq, seq));
1286 if (need_wakeup)
86039bd3
AA
1287 __wake_userfault(ctx, range);
1288}
1289
2ef5d724
AR
1290static __always_inline int validate_unaligned_range(
1291 struct mm_struct *mm, __u64 start, __u64 len)
86039bd3
AA
1292{
1293 __u64 task_size = mm->task_size;
1294
86039bd3
AA
1295 if (len & ~PAGE_MASK)
1296 return -EINVAL;
1297 if (!len)
1298 return -EINVAL;
e71e2ace 1299 if (start < mmap_min_addr)
86039bd3 1300 return -EINVAL;
e71e2ace 1301 if (start >= task_size)
86039bd3 1302 return -EINVAL;
e71e2ace 1303 if (len > task_size - start)
86039bd3 1304 return -EINVAL;
2ef5d724
AR
1305 if (start + len <= start)
1306 return -EINVAL;
86039bd3
AA
1307 return 0;
1308}
1309
2ef5d724
AR
1310static __always_inline int validate_range(struct mm_struct *mm,
1311 __u64 start, __u64 len)
1312{
1313 if (start & ~PAGE_MASK)
1314 return -EINVAL;
1315
1316 return validate_unaligned_range(mm, start, len);
1317}
1318
86039bd3
AA
1319static int userfaultfd_register(struct userfaultfd_ctx *ctx,
1320 unsigned long arg)
1321{
1322 struct mm_struct *mm = ctx->mm;
1323 struct vm_area_struct *vma, *prev, *cur;
1324 int ret;
1325 struct uffdio_register uffdio_register;
1326 struct uffdio_register __user *user_uffdio_register;
1327 unsigned long vm_flags, new_flags;
1328 bool found;
ce53e8e6 1329 bool basic_ioctls;
86039bd3 1330 unsigned long start, end, vma_end;
11a9b902 1331 struct vma_iterator vmi;
5543d3c4 1332 pgoff_t pgoff;
86039bd3
AA
1333
1334 user_uffdio_register = (struct uffdio_register __user *) arg;
1335
1336 ret = -EFAULT;
1337 if (copy_from_user(&uffdio_register, user_uffdio_register,
1338 sizeof(uffdio_register)-sizeof(__u64)))
1339 goto out;
1340
1341 ret = -EINVAL;
1342 if (!uffdio_register.mode)
1343 goto out;
7677f7fd 1344 if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
86039bd3
AA
1345 goto out;
1346 vm_flags = 0;
1347 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
1348 vm_flags |= VM_UFFD_MISSING;
00b151f2
PX
1349 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
1350#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
1351 goto out;
1352#endif
86039bd3 1353 vm_flags |= VM_UFFD_WP;
00b151f2 1354 }
7677f7fd
AR
1355 if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
1356#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
1357 goto out;
1358#endif
1359 vm_flags |= VM_UFFD_MINOR;
1360 }
86039bd3 1361
e71e2ace 1362 ret = validate_range(mm, uffdio_register.range.start,
86039bd3
AA
1363 uffdio_register.range.len);
1364 if (ret)
1365 goto out;
1366
1367 start = uffdio_register.range.start;
1368 end = start + uffdio_register.range.len;
1369
d2005e3f
ON
1370 ret = -ENOMEM;
1371 if (!mmget_not_zero(mm))
1372 goto out;
1373
11a9b902 1374 ret = -EINVAL;
d8ed45c5 1375 mmap_write_lock(mm);
11a9b902
LH
1376 vma_iter_init(&vmi, mm, start);
1377 vma = vma_find(&vmi, end);
86039bd3
AA
1378 if (!vma)
1379 goto out_unlock;
1380
cab350af
MK
1381 /*
1382 * If the first vma contains huge pages, make sure start address
1383 * is aligned to huge page size.
1384 */
1385 if (is_vm_hugetlb_page(vma)) {
1386 unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1387
1388 if (start & (vma_hpagesize - 1))
1389 goto out_unlock;
1390 }
1391
86039bd3
AA
1392 /*
1393 * Search for not compatible vmas.
86039bd3
AA
1394 */
1395 found = false;
ce53e8e6 1396 basic_ioctls = false;
11a9b902
LH
1397 cur = vma;
1398 do {
86039bd3
AA
1399 cond_resched();
1400
1401 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
7677f7fd 1402 !!(cur->vm_flags & __VM_UFFD_FLAGS));
86039bd3
AA
1403
1404 /* check not compatible vmas */
1405 ret = -EINVAL;
63b2d417 1406 if (!vma_can_userfault(cur, vm_flags))
86039bd3 1407 goto out_unlock;
29ec9066
AA
1408
1409 /*
1410 * UFFDIO_COPY will fill file holes even without
1411 * PROT_WRITE. This check enforces that if this is a
1412 * MAP_SHARED, the process has write permission to the backing
1413 * file. If VM_MAYWRITE is set it also enforces that on a
1414 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
1415 * F_WRITE_SEAL can be taken until the vma is destroyed.
1416 */
1417 ret = -EPERM;
1418 if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
1419 goto out_unlock;
1420
cab350af
MK
1421 /*
1422 * If this vma contains ending address, and huge pages
1423 * check alignment.
1424 */
1425 if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
1426 end > cur->vm_start) {
1427 unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
1428
1429 ret = -EINVAL;
1430
1431 if (end & (vma_hpagesize - 1))
1432 goto out_unlock;
1433 }
63b2d417
AA
1434 if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
1435 goto out_unlock;
86039bd3
AA
1436
1437 /*
1438 * Check that this vma isn't already owned by a
1439 * different userfaultfd. We can't allow more than one
1440 * userfaultfd to own a single vma simultaneously or we
1441 * wouldn't know which one to deliver the userfaults to.
1442 */
1443 ret = -EBUSY;
1444 if (cur->vm_userfaultfd_ctx.ctx &&
1445 cur->vm_userfaultfd_ctx.ctx != ctx)
1446 goto out_unlock;
1447
cab350af
MK
1448 /*
1449 * Note vmas containing huge pages
1450 */
ce53e8e6
MR
1451 if (is_vm_hugetlb_page(cur))
1452 basic_ioctls = true;
cab350af 1453
86039bd3 1454 found = true;
11a9b902 1455 } for_each_vma_range(vmi, cur, end);
86039bd3
AA
1456 BUG_ON(!found);
1457
11a9b902
LH
1458 vma_iter_set(&vmi, start);
1459 prev = vma_prev(&vmi);
270aa010
PX
1460 if (vma->vm_start < start)
1461 prev = vma;
86039bd3
AA
1462
1463 ret = 0;
11a9b902 1464 for_each_vma_range(vmi, vma, end) {
86039bd3
AA
1465 cond_resched();
1466
63b2d417 1467 BUG_ON(!vma_can_userfault(vma, vm_flags));
86039bd3
AA
1468 BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
1469 vma->vm_userfaultfd_ctx.ctx != ctx);
29ec9066 1470 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
86039bd3
AA
1471
1472 /*
1473 * Nothing to do: this vma is already registered into this
1474 * userfaultfd and with the right tracking mode too.
1475 */
1476 if (vma->vm_userfaultfd_ctx.ctx == ctx &&
1477 (vma->vm_flags & vm_flags) == vm_flags)
1478 goto skip;
1479
1480 if (vma->vm_start > start)
1481 start = vma->vm_start;
1482 vma_end = min(end, vma->vm_end);
1483
7677f7fd 1484 new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
5543d3c4 1485 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
9760ebff 1486 prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
5543d3c4 1487 vma->anon_vma, vma->vm_file, pgoff,
86039bd3 1488 vma_policy(vma),
9a10064f 1489 ((struct vm_userfaultfd_ctx){ ctx }),
5c26f6ac 1490 anon_vma_name(vma));
86039bd3 1491 if (prev) {
69dbe6da 1492 /* vma_merge() invalidated the mas */
86039bd3
AA
1493 vma = prev;
1494 goto next;
1495 }
1496 if (vma->vm_start < start) {
9760ebff 1497 ret = split_vma(&vmi, vma, start, 1);
86039bd3
AA
1498 if (ret)
1499 break;
1500 }
1501 if (vma->vm_end > end) {
9760ebff 1502 ret = split_vma(&vmi, vma, end, 0);
86039bd3
AA
1503 if (ret)
1504 break;
1505 }
1506 next:
1507 /*
1508 * In the vma_merge() successful mprotect-like case 8:
1509 * the next vma was merged into the current one and
1510 * the current one has not been updated yet.
1511 */
60081bf1 1512 vma_start_write(vma);
51d3d5eb 1513 userfaultfd_set_vm_flags(vma, new_flags);
86039bd3
AA
1514 vma->vm_userfaultfd_ctx.ctx = ctx;
1515
6dfeaff9
PX
1516 if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
1517 hugetlb_unshare_all_pmds(vma);
1518
86039bd3
AA
1519 skip:
1520 prev = vma;
1521 start = vma->vm_end;
11a9b902
LH
1522 }
1523
86039bd3 1524out_unlock:
d8ed45c5 1525 mmap_write_unlock(mm);
d2005e3f 1526 mmput(mm);
86039bd3 1527 if (!ret) {
14819305
PX
1528 __u64 ioctls_out;
1529
1530 ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
1531 UFFD_API_RANGE_IOCTLS;
1532
1533 /*
1534 * Declare the WP ioctl only if the WP mode is
1535 * specified and all checks passed with the range
1536 */
1537 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
1538 ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
1539
f6191471
AR
1540 /* CONTINUE ioctl is only supported for MINOR ranges. */
1541 if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
1542 ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
1543
86039bd3
AA
1544 /*
1545 * Now that we scanned all vmas we can already tell
1546 * userland which ioctls methods are guaranteed to
1547 * succeed on this range.
1548 */
14819305 1549 if (put_user(ioctls_out, &user_uffdio_register->ioctls))
86039bd3
AA
1550 ret = -EFAULT;
1551 }
1552out:
1553 return ret;
1554}
1555
1556static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
1557 unsigned long arg)
1558{
1559 struct mm_struct *mm = ctx->mm;
1560 struct vm_area_struct *vma, *prev, *cur;
1561 int ret;
1562 struct uffdio_range uffdio_unregister;
1563 unsigned long new_flags;
1564 bool found;
1565 unsigned long start, end, vma_end;
1566 const void __user *buf = (void __user *)arg;
11a9b902 1567 struct vma_iterator vmi;
5543d3c4 1568 pgoff_t pgoff;
86039bd3
AA
1569
1570 ret = -EFAULT;
1571 if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
1572 goto out;
1573
e71e2ace 1574 ret = validate_range(mm, uffdio_unregister.start,
86039bd3
AA
1575 uffdio_unregister.len);
1576 if (ret)
1577 goto out;
1578
1579 start = uffdio_unregister.start;
1580 end = start + uffdio_unregister.len;
1581
d2005e3f
ON
1582 ret = -ENOMEM;
1583 if (!mmget_not_zero(mm))
1584 goto out;
1585
d8ed45c5 1586 mmap_write_lock(mm);
86039bd3 1587 ret = -EINVAL;
11a9b902
LH
1588 vma_iter_init(&vmi, mm, start);
1589 vma = vma_find(&vmi, end);
1590 if (!vma)
86039bd3
AA
1591 goto out_unlock;
1592
cab350af
MK
1593 /*
1594 * If the first vma contains huge pages, make sure start address
1595 * is aligned to huge page size.
1596 */
1597 if (is_vm_hugetlb_page(vma)) {
1598 unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
1599
1600 if (start & (vma_hpagesize - 1))
1601 goto out_unlock;
1602 }
1603
86039bd3
AA
1604 /*
1605 * Search for not compatible vmas.
86039bd3
AA
1606 */
1607 found = false;
11a9b902
LH
1608 cur = vma;
1609 do {
86039bd3
AA
1610 cond_resched();
1611
1612 BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
7677f7fd 1613 !!(cur->vm_flags & __VM_UFFD_FLAGS));
86039bd3
AA
1614
1615 /*
1616 * Check not compatible vmas, not strictly required
1617 * here as not compatible vmas cannot have an
1618 * userfaultfd_ctx registered on them, but this
1619 * provides for more strict behavior to notice
1620 * unregistration errors.
1621 */
63b2d417 1622 if (!vma_can_userfault(cur, cur->vm_flags))
86039bd3
AA
1623 goto out_unlock;
1624
1625 found = true;
11a9b902 1626 } for_each_vma_range(vmi, cur, end);
86039bd3
AA
1627 BUG_ON(!found);
1628
11a9b902
LH
1629 vma_iter_set(&vmi, start);
1630 prev = vma_prev(&vmi);
270aa010
PX
1631 if (vma->vm_start < start)
1632 prev = vma;
1633
86039bd3 1634 ret = 0;
11a9b902 1635 for_each_vma_range(vmi, vma, end) {
86039bd3
AA
1636 cond_resched();
1637
63b2d417 1638 BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
86039bd3
AA
1639
1640 /*
1641 * Nothing to do: this vma is already registered into this
1642 * userfaultfd and with the right tracking mode too.
1643 */
1644 if (!vma->vm_userfaultfd_ctx.ctx)
1645 goto skip;
1646
01e881f5
AA
1647 WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
1648
86039bd3
AA
1649 if (vma->vm_start > start)
1650 start = vma->vm_start;
1651 vma_end = min(end, vma->vm_end);
1652
09fa5296
AA
1653 if (userfaultfd_missing(vma)) {
1654 /*
1655 * Wake any concurrent pending userfault while
1656 * we unregister, so they will not hang
1657 * permanently and it avoids userland to call
1658 * UFFDIO_WAKE explicitly.
1659 */
1660 struct userfaultfd_wake_range range;
1661 range.start = start;
1662 range.len = vma_end - start;
1663 wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
1664 }
1665
f369b07c
PX
1666 /* Reset ptes for the whole vma range if wr-protected */
1667 if (userfaultfd_wp(vma))
61c50040 1668 uffd_wp_range(vma, start, vma_end - start, false);
f369b07c 1669
7677f7fd 1670 new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
5543d3c4 1671 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
9760ebff 1672 prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
5543d3c4 1673 vma->anon_vma, vma->vm_file, pgoff,
86039bd3 1674 vma_policy(vma),
5c26f6ac 1675 NULL_VM_UFFD_CTX, anon_vma_name(vma));
86039bd3
AA
1676 if (prev) {
1677 vma = prev;
1678 goto next;
1679 }
1680 if (vma->vm_start < start) {
9760ebff 1681 ret = split_vma(&vmi, vma, start, 1);
86039bd3
AA
1682 if (ret)
1683 break;
1684 }
1685 if (vma->vm_end > end) {
9760ebff 1686 ret = split_vma(&vmi, vma, end, 0);
86039bd3
AA
1687 if (ret)
1688 break;
1689 }
1690 next:
1691 /*
1692 * In the vma_merge() successful mprotect-like case 8:
1693 * the next vma was merged into the current one and
1694 * the current one has not been updated yet.
1695 */
60081bf1 1696 vma_start_write(vma);
51d3d5eb 1697 userfaultfd_set_vm_flags(vma, new_flags);
86039bd3
AA
1698 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
1699
1700 skip:
1701 prev = vma;
1702 start = vma->vm_end;
11a9b902
LH
1703 }
1704
86039bd3 1705out_unlock:
d8ed45c5 1706 mmap_write_unlock(mm);
d2005e3f 1707 mmput(mm);
86039bd3
AA
1708out:
1709 return ret;
1710}
1711
1712/*
ba85c702
AA
1713 * userfaultfd_wake may be used in combination with the
1714 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
86039bd3
AA
1715 */
1716static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
1717 unsigned long arg)
1718{
1719 int ret;
1720 struct uffdio_range uffdio_wake;
1721 struct userfaultfd_wake_range range;
1722 const void __user *buf = (void __user *)arg;
1723
1724 ret = -EFAULT;
1725 if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
1726 goto out;
1727
e71e2ace 1728 ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
1729 if (ret)
1730 goto out;
1731
1732 range.start = uffdio_wake.start;
1733 range.len = uffdio_wake.len;
1734
1735 /*
1736 * len == 0 means wake all and we don't want to wake all here,
1737 * so check it again to be sure.
1738 */
1739 VM_BUG_ON(!range.len);
1740
1741 wake_userfault(ctx, &range);
1742 ret = 0;
1743
1744out:
1745 return ret;
1746}
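/*
 * Illustrative userspace sketch (not part of this file): using the
 * UFFDIO_*_MODE_DONTWAKE modes together with UFFDIO_WAKE to resolve a
 * batch of faults and wake them in one go, as the comment above
 * describes.  uffd, dst[], src[], nr and page_size are hypothetical
 * monitor-side variables; <linux/userfaultfd.h> and <sys/ioctl.h>
 * provide the definitions used here.
 *
 *	struct uffdio_copy copy;
 *	struct uffdio_range wake;
 *	int i;
 *
 *	for (i = 0; i < nr; i++) {
 *		copy.dst  = dst[i];
 *		copy.src  = src[i];
 *		copy.len  = page_size;
 *		copy.mode = UFFDIO_COPY_MODE_DONTWAKE;
 *		copy.copy = 0;
 *		if (ioctl(uffd, UFFDIO_COPY, &copy))
 *			break;				// handle errno (e.g. EAGAIN)
 *	}
 *	wake.start = dst[0];				// assumes a contiguous batch
 *	wake.len   = (__u64)i * page_size;
 *	if (wake.len)
 *		ioctl(uffd, UFFDIO_WAKE, &wake);	// one wakeup for the whole range
 */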
1747
1748static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
1749 unsigned long arg)
1750{
1751 __s64 ret;
1752 struct uffdio_copy uffdio_copy;
1753 struct uffdio_copy __user *user_uffdio_copy;
1754 struct userfaultfd_wake_range range;
d9712937 1755 uffd_flags_t flags = 0;
1756
1757 user_uffdio_copy = (struct uffdio_copy __user *) arg;
1758
df2cc96e 1759 ret = -EAGAIN;
a759a909 1760 if (atomic_read(&ctx->mmap_changing))
1761 goto out;
1762
1763 ret = -EFAULT;
1764 if (copy_from_user(&uffdio_copy, user_uffdio_copy,
 1765			   /* don't copy the last field "copy" (output only) */
1766 sizeof(uffdio_copy)-sizeof(__s64)))
1767 goto out;
1768
1769 ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
1770 uffdio_copy.len);
1771 if (ret)
1772 goto out;
e71e2ace 1773 ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
1774 if (ret)
1775 goto out;
2ef5d724 1776
ad465cae 1777 ret = -EINVAL;
72981e0e 1778 if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
ad465cae 1779 goto out;
1780 if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
1781 flags |= MFILL_ATOMIC_WP;
d2005e3f 1782 if (mmget_not_zero(ctx->mm)) {
1783 ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
1784 uffdio_copy.len, &ctx->mmap_changing,
d9712937 1785 flags);
d2005e3f 1786 mmput(ctx->mm);
96333187 1787 } else {
e86b298b 1788 return -ESRCH;
d2005e3f 1789 }
1790 if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1791 return -EFAULT;
1792 if (ret < 0)
1793 goto out;
1794 BUG_ON(!ret);
1795 /* len == 0 would wake all */
1796 range.len = ret;
1797 if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
1798 range.start = uffdio_copy.dst;
1799 wake_userfault(ctx, &range);
1800 }
1801 ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
1802out:
1803 return ret;
1804}
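/*
 * Illustrative userspace sketch (not part of this file): the partial-copy
 * and -EAGAIN semantics implemented above mean a caller should inspect
 * uffdio_copy.copy and retry the remainder.  uffd, fault_addr, src_buf
 * and page_size are hypothetical.
 *
 *	struct uffdio_copy copy = {
 *		.dst  = fault_addr & ~(page_size - 1),
 *		.src  = (unsigned long)src_buf,
 *		.len  = page_size,
 *		.mode = 0,
 *	};
 *
 *	while (ioctl(uffd, UFFDIO_COPY, &copy) && errno == EAGAIN) {
 *		if (copy.copy > 0) {			// partially done: advance
 *			copy.dst += copy.copy;
 *			copy.src += copy.copy;
 *			copy.len -= copy.copy;
 *		}
 *		copy.copy = 0;				// and retry the rest
 *	}
 */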
1805
1806static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
1807 unsigned long arg)
1808{
1809 __s64 ret;
1810 struct uffdio_zeropage uffdio_zeropage;
1811 struct uffdio_zeropage __user *user_uffdio_zeropage;
1812 struct userfaultfd_wake_range range;
1813
1814 user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1815
df2cc96e 1816 ret = -EAGAIN;
a759a909 1817 if (atomic_read(&ctx->mmap_changing))
1818 goto out;
1819
1820 ret = -EFAULT;
1821 if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
 1822			   /* don't copy the last field "zeropage" (output only) */
1823 sizeof(uffdio_zeropage)-sizeof(__s64)))
1824 goto out;
1825
e71e2ace 1826 ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
1827 uffdio_zeropage.range.len);
1828 if (ret)
1829 goto out;
1830 ret = -EINVAL;
1831 if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
1832 goto out;
1833
d2005e3f 1834 if (mmget_not_zero(ctx->mm)) {
1835 ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
1836 uffdio_zeropage.range.len,
1837 &ctx->mmap_changing);
d2005e3f 1838 mmput(ctx->mm);
9d95aa4b 1839 } else {
e86b298b 1840 return -ESRCH;
d2005e3f 1841 }
1842 if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1843 return -EFAULT;
1844 if (ret < 0)
1845 goto out;
1846 /* len == 0 would wake all */
1847 BUG_ON(!ret);
1848 range.len = ret;
1849 if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
1850 range.start = uffdio_zeropage.range.start;
1851 wake_userfault(ctx, &range);
1852 }
1853 ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
1854out:
1855 return ret;
1856}
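/*
 * Illustrative userspace sketch (not part of this file): resolving a
 * missing fault with the zero page instead of copied data.  uffd,
 * fault_addr and page_size are hypothetical.
 *
 *	struct uffdio_zeropage zp = {
 *		.range.start = fault_addr & ~(page_size - 1),
 *		.range.len   = page_size,
 *		.mode        = 0,
 *	};
 *
 *	if (ioctl(uffd, UFFDIO_ZEROPAGE, &zp))
 *		perror("UFFDIO_ZEROPAGE");	// e.g. EAGAIN while mmap_changing
 */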
1857
1858static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
1859 unsigned long arg)
1860{
1861 int ret;
1862 struct uffdio_writeprotect uffdio_wp;
1863 struct uffdio_writeprotect __user *user_uffdio_wp;
1864 struct userfaultfd_wake_range range;
23080e27 1865 bool mode_wp, mode_dontwake;
63b2d417 1866
a759a909 1867 if (atomic_read(&ctx->mmap_changing))
1868 return -EAGAIN;
1869
1870 user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
1871
1872 if (copy_from_user(&uffdio_wp, user_uffdio_wp,
1873 sizeof(struct uffdio_writeprotect)))
1874 return -EFAULT;
1875
e71e2ace 1876 ret = validate_range(ctx->mm, uffdio_wp.range.start,
1877 uffdio_wp.range.len);
1878 if (ret)
1879 return ret;
1880
1881 if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
1882 UFFDIO_WRITEPROTECT_MODE_WP))
1883 return -EINVAL;
1884
1885 mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
1886 mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
1887
1888 if (mode_wp && mode_dontwake)
1889 return -EINVAL;
1890
1891 if (mmget_not_zero(ctx->mm)) {
1892 ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
1893 uffdio_wp.range.len, mode_wp,
1894 &ctx->mmap_changing);
1895 mmput(ctx->mm);
1896 } else {
1897 return -ESRCH;
1898 }
1899
1900 if (ret)
1901 return ret;
1902
23080e27 1903 if (!mode_wp && !mode_dontwake) {
1904 range.start = uffdio_wp.range.start;
1905 range.len = uffdio_wp.range.len;
1906 wake_userfault(ctx, &range);
1907 }
1908 return ret;
1909}
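/*
 * Illustrative userspace sketch (not part of this file): arming write
 * protection on a range and later dropping it.  Clearing _MODE_WP
 * without _MODE_DONTWAKE also wakes faults blocked on the range, which
 * matches the wake_userfault() call above.  uffd, addr and len are
 * hypothetical.
 *
 *	struct uffdio_writeprotect wp = {
 *		.range.start = addr,
 *		.range.len   = len,
 *		.mode        = UFFDIO_WRITEPROTECT_MODE_WP,
 *	};
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);	// arm write protection
 *
 *	// ... later, after a write-protect fault has been read and handled ...
 *
 *	wp.mode = 0;				// un-protect and wake the waiter
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
 */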
1910
1911static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
1912{
1913 __s64 ret;
1914 struct uffdio_continue uffdio_continue;
1915 struct uffdio_continue __user *user_uffdio_continue;
1916 struct userfaultfd_wake_range range;
02891844 1917 uffd_flags_t flags = 0;
1918
1919 user_uffdio_continue = (struct uffdio_continue __user *)arg;
1920
1921 ret = -EAGAIN;
a759a909 1922 if (atomic_read(&ctx->mmap_changing))
1923 goto out;
1924
1925 ret = -EFAULT;
1926 if (copy_from_user(&uffdio_continue, user_uffdio_continue,
1927 /* don't copy the output fields */
1928 sizeof(uffdio_continue) - (sizeof(__s64))))
1929 goto out;
1930
e71e2ace 1931 ret = validate_range(ctx->mm, uffdio_continue.range.start,
1932 uffdio_continue.range.len);
1933 if (ret)
1934 goto out;
1935
1936 ret = -EINVAL;
1937 if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
1938 UFFDIO_CONTINUE_MODE_WP))
f6191471 1939 goto out;
1940 if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
1941 flags |= MFILL_ATOMIC_WP;
1942
1943 if (mmget_not_zero(ctx->mm)) {
1944 ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
1945 uffdio_continue.range.len,
02891844 1946 &ctx->mmap_changing, flags);
1947 mmput(ctx->mm);
1948 } else {
1949 return -ESRCH;
1950 }
1951
1952 if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1953 return -EFAULT;
1954 if (ret < 0)
1955 goto out;
1956
1957 /* len == 0 would wake all */
1958 BUG_ON(!ret);
1959 range.len = ret;
1960 if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
1961 range.start = uffdio_continue.range.start;
1962 wake_userfault(ctx, &range);
1963 }
1964 ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
1965
1966out:
1967 return ret;
1968}
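/*
 * Illustrative userspace sketch (not part of this file): resolving a
 * minor fault with UFFDIO_CONTINUE once the page cache already holds the
 * right contents, e.g. after populating the backing shmem or hugetlbfs
 * file through a second mapping.  uffd, fault_addr and page_size are
 * hypothetical.
 *
 *	struct uffdio_continue cont = {
 *		.range.start = fault_addr & ~(page_size - 1),
 *		.range.len   = page_size,
 *		.mode        = 0,
 *	};
 *
 *	if (ioctl(uffd, UFFDIO_CONTINUE, &cont))
 *		perror("UFFDIO_CONTINUE");	// cont.mapped reports progress
 */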
1969
1970static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
1971{
1972 __s64 ret;
1973 struct uffdio_poison uffdio_poison;
1974 struct uffdio_poison __user *user_uffdio_poison;
1975 struct userfaultfd_wake_range range;
1976
1977 user_uffdio_poison = (struct uffdio_poison __user *)arg;
1978
1979 ret = -EAGAIN;
1980 if (atomic_read(&ctx->mmap_changing))
1981 goto out;
1982
1983 ret = -EFAULT;
1984 if (copy_from_user(&uffdio_poison, user_uffdio_poison,
1985 /* don't copy the output fields */
1986 sizeof(uffdio_poison) - (sizeof(__s64))))
1987 goto out;
1988
1989 ret = validate_range(ctx->mm, uffdio_poison.range.start,
1990 uffdio_poison.range.len);
1991 if (ret)
1992 goto out;
1993
1994 ret = -EINVAL;
1995 if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
1996 goto out;
1997
1998 if (mmget_not_zero(ctx->mm)) {
1999 ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
2000 uffdio_poison.range.len,
2001 &ctx->mmap_changing, 0);
2002 mmput(ctx->mm);
2003 } else {
2004 return -ESRCH;
2005 }
2006
2007 if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
2008 return -EFAULT;
2009 if (ret < 0)
2010 goto out;
2011
2012 /* len == 0 would wake all */
2013 BUG_ON(!ret);
2014 range.len = ret;
2015 if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
2016 range.start = uffdio_poison.range.start;
2017 wake_userfault(ctx, &range);
2018 }
2019 ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
2020
2021out:
2022 return ret;
2023}
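/*
 * Illustrative userspace sketch (not part of this file): marking a page
 * as poisoned so that future accesses raise SIGBUS instead of allocating
 * memory, e.g. when the monitor learns the source page was lost during
 * live migration.  uffd, addr and page_size are hypothetical.
 *
 *	struct uffdio_poison poison = {
 *		.range.start = addr,
 *		.range.len   = page_size,
 *		.mode        = 0,
 *	};
 *
 *	if (ioctl(uffd, UFFDIO_POISON, &poison))
 *		perror("UFFDIO_POISON");	// poison.updated reports progress
 */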
2024
2025static inline unsigned int uffd_ctx_features(__u64 user_features)
2026{
2027 /*
2028 * For the current set of features the bits just coincide. Set
2029 * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
9cd75c3c 2030 */
22e5fe2a 2031 return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
2032}
2033
2034/*
2035 * userland asks for a certain API version and we return which bits
 2036 * and ioctl commands are implemented in this kernel for that API
 2037 * version, or -EINVAL if the version is unknown.
2038 */
2039static int userfaultfd_api(struct userfaultfd_ctx *ctx,
2040 unsigned long arg)
2041{
2042 struct uffdio_api uffdio_api;
2043 void __user *buf = (void __user *)arg;
22e5fe2a 2044 unsigned int ctx_features;
86039bd3 2045 int ret;
65603144 2046 __u64 features;
86039bd3 2047
86039bd3 2048 ret = -EFAULT;
a9b85f94 2049 if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
86039bd3 2050 goto out;
2051 features = uffdio_api.features;
2052 ret = -EINVAL;
2053 if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
2054 goto err_out;
2055 ret = -EPERM;
2056 if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
2057 goto err_out;
2058 /* report all available features and ioctls to userland */
2059 uffdio_api.features = UFFD_API_FEATURES;
7677f7fd 2060#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
2061 uffdio_api.features &=
2062 ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
2063#endif
2064#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
2065 uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
2066#endif
2067#ifndef CONFIG_PTE_MARKER_UFFD_WP
2068 uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
2bad466c 2069 uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
7677f7fd 2070#endif
2071 uffdio_api.ioctls = UFFD_API_IOCTLS;
2072 ret = -EFAULT;
2073 if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
2074 goto out;
22e5fe2a 2075
65603144 2076 /* only enable the requested features for this uffd context */
2077 ctx_features = uffd_ctx_features(features);
2078 ret = -EINVAL;
2079 if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
2080 goto err_out;
2081
2082 ret = 0;
2083out:
2084 return ret;
2085err_out:
2086 memset(&uffdio_api, 0, sizeof(uffdio_api));
2087 if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
2088 ret = -EFAULT;
2089 goto out;
2090}
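/*
 * Illustrative userspace sketch (not part of this file): the UFFDIO_API
 * handshake that must precede any other ioctl on a freshly created
 * userfaultfd.  uffd is hypothetical; requesting features = 0 and then
 * inspecting the returned bits is the conservative pattern.
 *
 *	struct uffdio_api api = {
 *		.api      = UFFD_API,
 *		.features = 0,		// or a subset of the advertised features
 *	};
 *
 *	if (ioctl(uffd, UFFDIO_API, &api))
 *		perror("UFFDIO_API");	// e.g. EINVAL for an unknown api/feature
 *	// on success, api.features and api.ioctls describe what this kernel supports
 */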
2091
2092static long userfaultfd_ioctl(struct file *file, unsigned cmd,
2093 unsigned long arg)
2094{
2095 int ret = -EINVAL;
2096 struct userfaultfd_ctx *ctx = file->private_data;
2097
22e5fe2a 2098 if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
2099 return -EINVAL;
2100
2101 switch(cmd) {
2102 case UFFDIO_API:
2103 ret = userfaultfd_api(ctx, arg);
2104 break;
2105 case UFFDIO_REGISTER:
2106 ret = userfaultfd_register(ctx, arg);
2107 break;
2108 case UFFDIO_UNREGISTER:
2109 ret = userfaultfd_unregister(ctx, arg);
2110 break;
2111 case UFFDIO_WAKE:
2112 ret = userfaultfd_wake(ctx, arg);
2113 break;
2114 case UFFDIO_COPY:
2115 ret = userfaultfd_copy(ctx, arg);
2116 break;
2117 case UFFDIO_ZEROPAGE:
2118 ret = userfaultfd_zeropage(ctx, arg);
2119 break;
2120 case UFFDIO_WRITEPROTECT:
2121 ret = userfaultfd_writeprotect(ctx, arg);
2122 break;
2123 case UFFDIO_CONTINUE:
2124 ret = userfaultfd_continue(ctx, arg);
2125 break;
2126 case UFFDIO_POISON:
2127 ret = userfaultfd_poison(ctx, arg);
2128 break;
2129 }
2130 return ret;
2131}
2132
2133#ifdef CONFIG_PROC_FS
2134static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
2135{
2136 struct userfaultfd_ctx *ctx = f->private_data;
ac6424b9 2137 wait_queue_entry_t *wq;
2138 unsigned long pending = 0, total = 0;
2139
cbcfa130 2140 spin_lock_irq(&ctx->fault_pending_wqh.lock);
2055da97 2141 list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
2142 pending++;
2143 total++;
2144 }
2055da97 2145 list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
2146 total++;
2147 }
cbcfa130 2148 spin_unlock_irq(&ctx->fault_pending_wqh.lock);
2149
2150 /*
 2151	 * If more protocols are added, they will all be shown
 2152	 * separated by a space, like this:
2153 * protocols: aa:... bb:...
2154 */
2155 seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
045098e9 2156 pending, total, UFFD_API, ctx->features,
2157 UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
2158}
2159#endif
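/*
 * With CONFIG_PROC_FS, the handler above exposes the context state in
 * /proc/<pid>/fdinfo/<uffd-fd>.  A hypothetical dump (values are
 * illustrative) looks like:
 *
 *	pending:	0
 *	total:		0
 *	API:		aa:<features>:<ioctl bitmask>
 *
 * i.e. the pending and total fault counts, then UFFD_API in hex, the
 * enabled context features and the API-wide ioctl mask.
 */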
2160
2161static const struct file_operations userfaultfd_fops = {
2162#ifdef CONFIG_PROC_FS
2163 .show_fdinfo = userfaultfd_show_fdinfo,
2164#endif
2165 .release = userfaultfd_release,
2166 .poll = userfaultfd_poll,
2167 .read = userfaultfd_read,
2168 .unlocked_ioctl = userfaultfd_ioctl,
1832f2d8 2169 .compat_ioctl = compat_ptr_ioctl,
2170 .llseek = noop_llseek,
2171};
2172
2173static void init_once_userfaultfd_ctx(void *mem)
2174{
2175 struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
2176
2177 init_waitqueue_head(&ctx->fault_pending_wqh);
2178 init_waitqueue_head(&ctx->fault_wqh);
9cd75c3c 2179 init_waitqueue_head(&ctx->event_wqh);
3004ec9c 2180 init_waitqueue_head(&ctx->fd_wqh);
2ca97ac8 2181 seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
2182}
2183
2d5de004 2184static int new_userfaultfd(int flags)
86039bd3 2185{
86039bd3 2186 struct userfaultfd_ctx *ctx;
284cd241 2187 int fd;
2188
2189 BUG_ON(!current->mm);
2190
2191 /* Check the UFFD_* constants for consistency. */
37cd0575 2192 BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
2193 BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
2194 BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
2195
37cd0575 2196 if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
284cd241 2197 return -EINVAL;
86039bd3 2198
3004ec9c 2199 ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
86039bd3 2200 if (!ctx)
284cd241 2201 return -ENOMEM;
86039bd3 2202
ca880420 2203 refcount_set(&ctx->refcount, 1);
86039bd3 2204 ctx->flags = flags;
9cd75c3c 2205 ctx->features = 0;
86039bd3 2206 ctx->released = false;
a759a909 2207 atomic_set(&ctx->mmap_changing, 0);
2208 ctx->mm = current->mm;
 2209	/* prevent the mm struct from being freed */
f1f10076 2210 mmgrab(ctx->mm);
86039bd3 2211
b537900f 2212 fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
abec3d01 2213 O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
284cd241 2214 if (fd < 0) {
d2005e3f 2215 mmdrop(ctx->mm);
3004ec9c 2216 kmem_cache_free(userfaultfd_ctx_cachep, ctx);
c03e946f 2217 }
86039bd3 2218 return fd;
86039bd3 2219}
3004ec9c 2220
2221static inline bool userfaultfd_syscall_allowed(int flags)
2222{
2223 /* Userspace-only page faults are always allowed */
2224 if (flags & UFFD_USER_MODE_ONLY)
2225 return true;
2226
2227 /*
2228 * The user is requesting a userfaultfd which can handle kernel faults.
2229 * Privileged users are always allowed to do this.
2230 */
2231 if (capable(CAP_SYS_PTRACE))
2232 return true;
2233
2234 /* Otherwise, access to kernel fault handling is sysctl controlled. */
2235 return sysctl_unprivileged_userfaultfd;
2236}
2237
2238SYSCALL_DEFINE1(userfaultfd, int, flags)
2239{
2240 if (!userfaultfd_syscall_allowed(flags))
2241 return -EPERM;
2242
2243 return new_userfaultfd(flags);
2244}
2245
2246static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
2247{
2248 if (cmd != USERFAULTFD_IOC_NEW)
2249 return -EINVAL;
2250
2251 return new_userfaultfd(flags);
2252}
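/*
 * Illustrative userspace sketch (not part of this file): the two ways to
 * obtain a userfaultfd handled above, the userfaultfd(2) syscall and the
 * USERFAULTFD_IOC_NEW ioctl on the /dev/userfaultfd node registered
 * below.  Access to the device is governed by its file permissions
 * rather than by the syscall's capability/sysctl checks.
 *
 *	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *	if (uffd < 0 && errno == EPERM) {
 *		int dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
 *		if (dev >= 0) {
 *			uffd = ioctl(dev, USERFAULTFD_IOC_NEW,
 *				     O_CLOEXEC | O_NONBLOCK);
 *			close(dev);
 *		}
 *	}
 */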
2253
2254static const struct file_operations userfaultfd_dev_fops = {
2255 .unlocked_ioctl = userfaultfd_dev_ioctl,
2256 .compat_ioctl = userfaultfd_dev_ioctl,
2257 .owner = THIS_MODULE,
2258 .llseek = noop_llseek,
2259};
2260
2261static struct miscdevice userfaultfd_misc = {
2262 .minor = MISC_DYNAMIC_MINOR,
2263 .name = "userfaultfd",
2264 .fops = &userfaultfd_dev_fops
2265};
2266
2267static int __init userfaultfd_init(void)
2268{
2269 int ret;
2270
2271 ret = misc_register(&userfaultfd_misc);
2272 if (ret)
2273 return ret;
2274
2275 userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
2276 sizeof(struct userfaultfd_ctx),
2277 0,
2278 SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2279 init_once_userfaultfd_ctx);
2280#ifdef CONFIG_SYSCTL
2281 register_sysctl_init("vm", vm_userfaultfd_table);
2282#endif
2283 return 0;
2284}
2285__initcall(userfaultfd_init);