/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  include/linux/userfaultfd_k.h
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 */

#ifndef _LINUX_USERFAULTFD_K_H
#define _LINUX_USERFAULTFD_K_H

#ifdef CONFIG_USERFAULTFD

#include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */

#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <asm-generic/pgtable_uffd.h>
#include <linux/hugetlb_inline.h>

/* The set of all possible UFFD-related VM flags. */
#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR)

/*
 * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
 * new flags, since they might collide with O_* ones. We want
 * to re-use O_* flags that couldn't possibly have a meaning
 * from userfaultfd, in order to leave a free define-space for
 * shared O_* flags.
 */
#define UFFD_CLOEXEC O_CLOEXEC
#define UFFD_NONBLOCK O_NONBLOCK

#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)

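/*
 * A minimal userspace sketch (illustrative, not part of this header):
 * because the UFFD_* flags alias their O_* counterparts, a non-blocking,
 * close-on-exec userfaultfd can be requested directly at syscall time:
 *
 *	int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *	if (ufd < 0)
 *		perror("userfaultfd");
 */
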
/*
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 *
 * Locking order:
 *	fd_wqh.lock
 *	fault_pending_wqh.lock
 *	fault_wqh.lock
 *	event_wqh.lock
 *
 * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
 * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
 * also taken in IRQ context.
 */
struct userfaultfd_ctx {
	/* waitqueue head for the pending (i.e. not read) userfaults */
	wait_queue_head_t fault_pending_wqh;
	/* waitqueue head for the userfaults */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
	/* waitqueue head for events */
	wait_queue_head_t event_wqh;
	/* a refile sequence protected by fault_pending_wqh lock */
	seqcount_spinlock_t refile_seq;
	/* pseudo fd refcounting */
	refcount_t refcount;
	/* userfaultfd syscall flags */
	unsigned int flags;
	/* features requested from the userspace */
	unsigned int features;
	/* released */
	bool released;
	/*
	 * Prevents userfaultfd operations (fill/move/wp) from happening while
	 * some non-cooperative event(s) is taking place. Increments are done
	 * in write-mode, whereas userfaultfd operations, which include
	 * reading mmap_changing, are done in read-mode.
	 */
	struct rw_semaphore map_changing_lock;
	/* memory mappings are changing because of non-cooperative event */
	atomic_t mmap_changing;
	/* mm with one or more vmas attached to this userfaultfd_ctx */
	struct mm_struct *mm;
};
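
/*
 * Illustrative sketch of the locking order above (not code from this
 * header): a reader such as fs/userfaultfd.c disables IRQs at the
 * outermost lock and nests inward, e.g.:
 *
 *	spin_lock_irq(&ctx->fd_wqh.lock);
 *	...
 *	spin_lock(&ctx->fault_pending_wqh.lock);
 *	...
 *	spin_unlock(&ctx->fault_pending_wqh.lock);
 *	...
 *	spin_unlock_irq(&ctx->fd_wqh.lock);
 */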

extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);

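/*
 * Illustrative caller pattern (a sketch of what the fault handlers in
 * mm/memory.c do, not a definition from this header): when a fault hits
 * a registered range, the handler returns to notify userspace:
 *
 *	if (userfaultfd_missing(vma))
 *		return handle_userfault(vmf, VM_UFFD_MISSING);
 */
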
/* A combined operation mode + behavior flags. */
typedef unsigned int __bitwise uffd_flags_t;

/* Mutually exclusive modes of operation. */
enum mfill_atomic_mode {
	MFILL_ATOMIC_COPY,
	MFILL_ATOMIC_ZEROPAGE,
	MFILL_ATOMIC_CONTINUE,
	MFILL_ATOMIC_POISON,
	NR_MFILL_ATOMIC_MODES,
};

#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)
#define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr))
#define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
#define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1))

static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected)
{
	return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected);
}

static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode)
{
	flags &= ~MFILL_ATOMIC_MODE_MASK;
	return flags | ((__force uffd_flags_t) mode);
}

/* Flags controlling behavior. These behavior changes are mode-independent. */
#define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0)

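/*
 * Worked example of the encoding above (illustrative): with the four
 * modes currently defined, NR_MFILL_ATOMIC_MODES == 4, so
 * MFILL_ATOMIC_MODE_BITS == const_ilog2(3) + 1 == 2. The mode occupies
 * bits 0-1 and behavior flags start at BIT(2), so MFILL_ATOMIC_WP ==
 * BIT(2). A write-protected copy is then requested as:
 *
 *	uffd_flags_t flags = uffd_flags_set_mode(0, MFILL_ATOMIC_COPY) |
 *			     MFILL_ATOMIC_WP;
 *
 * after which uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY) is true.
 */
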
extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
				    struct vm_area_struct *dst_vma,
				    unsigned long dst_addr, struct page *page,
				    bool newly_allocated, uffd_flags_t flags);

extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
				 unsigned long src_start, unsigned long len,
				 uffd_flags_t flags);
extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
				     unsigned long dst_start,
				     unsigned long len);
extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
				     unsigned long len, uffd_flags_t flags);
extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
				   unsigned long len, uffd_flags_t flags);
extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
			       unsigned long len, bool enable_wp);
extern long uffd_wp_range(struct vm_area_struct *vma,
			  unsigned long start, unsigned long len, bool enable_wp);

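/*
 * These helpers back the UFFDIO_* ioctls. An illustrative userspace
 * counterpart (not part of this header; ufd, fault_addr, src_buf and
 * page_size are the caller's own) that resolves a missing fault by
 * copying a prepared page, which ends up in mfill_atomic_copy():
 *
 *	struct uffdio_copy copy = {
 *		.dst = fault_addr & ~(page_size - 1),
 *		.src = (unsigned long) src_buf,
 *		.len = page_size,
 *		.mode = 0,
 *	};
 *	if (ioctl(ufd, UFFDIO_COPY, &copy) < 0)
 *		perror("UFFDIO_COPY");
 */
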
/* move_pages */
void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
		   unsigned long src_start, unsigned long len, __u64 flags);
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
			struct vm_area_struct *dst_vma,
			struct vm_area_struct *src_vma,
			unsigned long dst_addr, unsigned long src_addr);

/* mm helpers */
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
						   struct vm_userfaultfd_ctx vm_ctx)
{
	return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
}

/*
 * Never enable huge pmd sharing on some uffd registered vmas:
 *
 * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry.
 *
 * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for
 *   VMAs which share huge pmds.  (If you have two mappings to the same
 *   underlying pages, and fault in the non-UFFD-registered one with a write,
 *   with huge pmd sharing this would *also* setup the second UFFD-registered
 *   mapping, and we'd not get minor faults.)
 */
static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma)
{
	return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR);
}

/*
 * Don't do fault around for either WP or MINOR registered uffd range.  For
 * MINOR registered range, fault around will be a total disaster and ptes can
 * be installed without notifications; for WP it should mostly be fine as long
 * as the fault around checks for pte_none() before the installation, however
 * to be super safe we just forbid it.
 */
static inline bool uffd_disable_fault_around(struct vm_area_struct *vma)
{
	return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR);
}

static inline bool userfaultfd_missing(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_UFFD_MISSING;
}

static inline bool userfaultfd_wp(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_UFFD_WP;
}

static inline bool userfaultfd_minor(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_UFFD_MINOR;
}

static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
				      pte_t pte)
{
	return userfaultfd_wp(vma) && pte_uffd_wp(pte);
}

static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
					   pmd_t pmd)
{
	return userfaultfd_wp(vma) && pmd_uffd_wp(pmd);
}

static inline bool userfaultfd_armed(struct vm_area_struct *vma)
{
	return vma->vm_flags & __VM_UFFD_FLAGS;
}

static inline bool vma_can_userfault(struct vm_area_struct *vma,
				     unsigned long vm_flags,
				     bool wp_async)
{
	vm_flags &= __VM_UFFD_FLAGS;

	if (vm_flags & VM_DROPPABLE)
		return false;

	if ((vm_flags & VM_UFFD_MINOR) &&
	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
		return false;

	/*
	 * If wp async is enabled and WP is the only mode requested,
	 * allow any memory type.
	 */
	if (wp_async && (vm_flags == VM_UFFD_WP))
		return true;

#ifndef CONFIG_PTE_MARKER_UFFD_WP
	/*
	 * If the user requested uffd-wp but pte markers are not enabled
	 * for uffd-wp, then only anonymous memory is supported, not
	 * shmem or hugetlbfs.
	 */
	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
		return false;
#endif

	/* By default, allow any of anon|shmem|hugetlb */
	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
	       vma_is_shmem(vma);
}

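/*
 * Illustrative use (a sketch of the UFFDIO_REGISTER path in
 * fs/userfaultfd.c, not a definition from this header): every VMA in
 * the requested range is vetted before any uffd flags are applied:
 *
 *	if (!vma_can_userfault(vma, vm_flags, wp_async))
 *		return -EINVAL;
 */
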
static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
{
	struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;

	return uffd_ctx && (uffd_ctx->features & UFFD_FEATURE_EVENT_REMAP) == 0;
}

extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
extern void dup_userfaultfd_complete(struct list_head *);
void dup_userfaultfd_fail(struct list_head *);

extern void mremap_userfaultfd_prep(struct vm_area_struct *,
				    struct vm_userfaultfd_ctx *);
extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *,
					unsigned long from, unsigned long to,
					unsigned long len);

extern bool userfaultfd_remove(struct vm_area_struct *vma,
			       unsigned long start,
			       unsigned long end);

extern int userfaultfd_unmap_prep(struct vm_area_struct *vma,
				  unsigned long start, unsigned long end,
				  struct list_head *uf);
extern void userfaultfd_unmap_complete(struct mm_struct *mm,
				       struct list_head *uf);
extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
extern bool userfaultfd_wp_async(struct vm_area_struct *vma);

void userfaultfd_reset_ctx(struct vm_area_struct *vma);

struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
					     struct vm_area_struct *prev,
					     struct vm_area_struct *vma,
					     unsigned long start,
					     unsigned long end);

int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
			       struct vm_area_struct *vma,
			       unsigned long vm_flags,
			       unsigned long start, unsigned long end,
			       bool wp_async);

void userfaultfd_release_new(struct userfaultfd_ctx *ctx);

void userfaultfd_release_all(struct mm_struct *mm,
			     struct userfaultfd_ctx *ctx);

#else /* CONFIG_USERFAULTFD */

/* mm helpers */
static inline vm_fault_t handle_userfault(struct vm_fault *vmf,
					  unsigned long reason)
{
	return VM_FAULT_SIGBUS;
}

static inline long uffd_wp_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long len,
				 bool enable_wp)
{
	return 0;
}

static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
						   struct vm_userfaultfd_ctx vm_ctx)
{
	return true;
}

static inline bool userfaultfd_missing(struct vm_area_struct *vma)
{
	return false;
}

static inline bool userfaultfd_wp(struct vm_area_struct *vma)
{
	return false;
}

static inline bool userfaultfd_minor(struct vm_area_struct *vma)
{
	return false;
}

static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
				      pte_t pte)
{
	return false;
}

static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
					   pmd_t pmd)
{
	return false;
}

static inline bool userfaultfd_armed(struct vm_area_struct *vma)
{
	return false;
}

static inline int dup_userfaultfd(struct vm_area_struct *vma,
				  struct list_head *l)
{
	return 0;
}

static inline void dup_userfaultfd_complete(struct list_head *l)
{
}

static inline void dup_userfaultfd_fail(struct list_head *l)
{
}

static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma,
					   struct vm_userfaultfd_ctx *ctx)
{
}

static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx,
					       unsigned long from,
					       unsigned long to,
					       unsigned long len)
{
}

static inline bool userfaultfd_remove(struct vm_area_struct *vma,
				      unsigned long start,
				      unsigned long end)
{
	return true;
}

static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma,
					 unsigned long start, unsigned long end,
					 struct list_head *uf)
{
	return 0;
}

static inline void userfaultfd_unmap_complete(struct mm_struct *mm,
					      struct list_head *uf)
{
}

static inline bool uffd_disable_fault_around(struct vm_area_struct *vma)
{
	return false;
}

static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
{
	return false;
}

static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
{
	return false;
}

static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
{
	return false;
}

#endif /* CONFIG_USERFAULTFD */

static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
{
	/* Only wr-protect mode uses pte markers */
	if (!userfaultfd_wp(vma))
		return false;

	/* File-based uffd-wp always needs markers */
	if (!vma_is_anonymous(vma))
		return true;

	/*
	 * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED is
	 * enabled (to apply markers on zero pages).
	 */
	return userfaultfd_wp_unpopulated(vma);
}

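/*
 * Decision summary for userfaultfd_wp_use_markers() (illustrative):
 *
 *	!userfaultfd_wp(vma)		-> false (markers are wp-only)
 *	wp and file-backed		-> true  (always needs markers)
 *	wp and anonymous		-> userfaultfd_wp_unpopulated(vma)
 */
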
static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
	return is_pte_marker_entry(entry) &&
	    (pte_marker_get(entry) & PTE_MARKER_UFFD_WP);
#else
	return false;
#endif
}

static inline bool pte_marker_uffd_wp(pte_t pte)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
	swp_entry_t entry;

	if (!is_swap_pte(pte))
		return false;

	entry = pte_to_swp_entry(pte);

	return pte_marker_entry_uffd_wp(entry);
#else
	return false;
#endif
}

/*
 * Returns true if this is a swap pte and was uffd-wp wr-protected in
 * either form (pte marker or a normal swap pte), false otherwise.
 */
static inline bool pte_swp_uffd_wp_any(pte_t pte)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
	if (!is_swap_pte(pte))
		return false;

	if (pte_swp_uffd_wp(pte))
		return true;

	if (pte_marker_uffd_wp(pte))
		return true;
#endif
	return false;
}

#endif /* _LINUX_USERFAULTFD_K_H */