/*
 * memfd_create system call and file sealing support
 *
 * Code was originally included in shmem.c, and broken out to facilitate
 * use by hugetlbfs as well as tmpfs.
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/khugepaged.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
#include "swap.h"

/*
 * We need a tag: a new tag would expand every xa_node by 8 bytes,
 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
 * or hugetlbfs because they are memory only filesystems.
 */
#define MEMFD_TAG_PINNED	PAGECACHE_TAG_TOWRITE
#define LAST_SCAN		4	/* about 150ms max */

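/*
 * The page cache holds one reference per page of a folio, and each page
 * table mapping is accounted for by the mapcount (with a matching
 * reference). Anything beyond that means someone else still holds a pin,
 * e.g. a GUP reference taken for direct-IO or AIO.
 */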
static bool memfd_folio_has_extra_refs(struct folio *folio)
{
	return folio_ref_count(folio) - folio_mapcount(folio) !=
	       folio_nr_pages(folio);
}

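/*
 * Walk the whole mapping and tag every folio that still has extra
 * references, so memfd_wait_for_pins() only has to re-check the tagged
 * entries. The xa_lock is dropped every XA_CHECK_SCHED iterations to
 * keep IRQ-disabled latency bounded.
 */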
static void memfd_tag_pins(struct xa_state *xas)
{
	struct folio *folio;
	int latency = 0;

	lru_add_drain();

	xas_lock_irq(xas);
	xas_for_each(xas, folio, ULONG_MAX) {
		if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
			xas_set_mark(xas, MEMFD_TAG_PINNED);

		if (++latency < XA_CHECK_SCHED)
			continue;
		latency = 0;

		xas_pause(xas);
		xas_unlock_irq(xas);
		cond_resched();
		xas_lock_irq(xas);
	}
	xas_unlock_irq(xas);
}

/*
 * This is a helper function used by memfd_pin_folios() in GUP (gup.c).
 * It is mainly called to allocate a folio in a memfd when the caller
 * (memfd_pin_folios()) cannot find a folio in the page cache at a given
 * index in the mapping.
 */
struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct folio *folio;
	gfp_t gfp_mask;
	int err;

	if (is_file_hugepages(memfd)) {
		/*
		 * The folio would most likely be accessed by a DMA driver,
		 * therefore, we have zone memory constraints where we can
		 * alloc from. Also, the folio will be pinned for an indefinite
		 * amount of time, so it is not expected to be migrated away.
		 */
		struct hstate *h = hstate_file(memfd);

		gfp_mask = htlb_alloc_mask(h);
		gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
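		/*
		 * Convert from a PAGE_SIZE-based index to huge page units;
		 * the hugetlb page cache is indexed by huge page.
		 */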
		idx >>= huge_page_order(h);

		folio = alloc_hugetlb_folio_reserve(h,
						    numa_node_id(),
						    NULL,
						    gfp_mask);
		if (folio) {
			err = hugetlb_add_to_page_cache(folio,
							memfd->f_mapping,
							idx);
			if (err) {
				folio_put(folio);
				return ERR_PTR(err);
			}
			folio_unlock(folio);
			return folio;
		}
		return ERR_PTR(-ENOMEM);
	}
#endif
	return shmem_read_folio(memfd->f_mapping, idx);
}

/*
 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
 * via get_user_pages(), drivers might have some pending I/O without any active
 * user-space mappings (e.g., direct-IO, AIO). Therefore, we look at all folios
 * and see whether they have an elevated ref-count. If so, we tag them and wait
 * for them to be dropped.
 * The caller must guarantee that no new user will acquire writable references
 * to those folios to avoid races.
 */
static int memfd_wait_for_pins(struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, 0);
	struct folio *folio;
	int error, scan;

	memfd_tag_pins(&xas);

	error = 0;
	for (scan = 0; scan <= LAST_SCAN; scan++) {
		int latency = 0;

		if (!xas_marked(&xas, MEMFD_TAG_PINNED))
			break;

		if (!scan)
			lru_add_drain_all();
		else if (schedule_timeout_killable((HZ << scan) / 200))
			scan = LAST_SCAN;

		xas_set(&xas, 0);
		xas_lock_irq(&xas);
		xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
			bool clear = true;

			if (!xa_is_value(folio) &&
			    memfd_folio_has_extra_refs(folio)) {
				/*
				 * On the last scan, we clean up all those tags
				 * we inserted; but make a note that we still
				 * found folios pinned.
				 */
				if (scan == LAST_SCAN)
					error = -EBUSY;
				else
					clear = false;
			}
			if (clear)
				xas_clear_mark(&xas, MEMFD_TAG_PINNED);

			if (++latency < XA_CHECK_SCHED)
				continue;
			latency = 0;

			xas_pause(&xas);
			xas_unlock_irq(&xas);
			cond_resched();
			xas_lock_irq(&xas);
		}
		xas_unlock_irq(&xas);
	}

	return error;
}

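/*
 * Return a pointer to the inode's seal mask for memfd-backed (tmpfs or
 * hugetlbfs) files, or NULL for files that do not support sealing.
 */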
static unsigned int *memfd_file_seals_ptr(struct file *file)
{
	if (shmem_file(file))
		return &SHMEM_I(file_inode(file))->seals;

#ifdef CONFIG_HUGETLBFS
	if (is_file_hugepages(file))
		return &HUGETLBFS_I(file_inode(file))->seals;
#endif

	return NULL;
}

#define F_ALL_SEALS (F_SEAL_SEAL | \
		     F_SEAL_EXEC | \
		     F_SEAL_SHRINK | \
		     F_SEAL_GROW | \
		     F_SEAL_WRITE | \
		     F_SEAL_FUTURE_WRITE)

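/*
 * Illustrative userspace flow (a sketch, not part of this file): create a
 * sealable memfd, populate it, then seal it before handing it to an
 * untrusted peer. Any shared writable mappings must be unmapped first,
 * or adding F_SEAL_WRITE fails with -EBUSY:
 *
 *	int fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);
 *	ftruncate(fd, size);
 *	... fill via write() or a (since unmapped) MAP_SHARED mapping ...
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE);
 */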
static int memfd_add_seals(struct file *file, unsigned int seals)
{
	struct inode *inode = file_inode(file);
	unsigned int *file_seals;
	int error;

	/*
	 * SEALING
	 * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
	 * but restrict access to a specific subset of file operations. Seals
	 * can only be added, but never removed. This way, mutually untrusted
	 * parties can share common memory regions with a well-defined policy.
	 * A malicious peer can thus never perform unwanted operations on a
	 * shared object.
	 *
	 * Seals are only supported on special tmpfs or hugetlbfs files and
	 * always affect the whole underlying inode. Once a seal is set, it
	 * may prevent some kinds of access to the file. Currently, the
	 * following seals are defined:
	 *   SEAL_SEAL: Prevent further seals from being set on this file
	 *   SEAL_SHRINK: Prevent the file from shrinking
	 *   SEAL_GROW: Prevent the file from growing
	 *   SEAL_WRITE: Prevent write access to the file
	 *   SEAL_EXEC: Prevent modification of the exec bits in the file mode
	 *
	 * As we don't require any trust relationship between two parties, we
	 * must prevent seals from being removed. Therefore, sealing a file
	 * only adds a given set of seals to the file, it never touches
	 * existing seals. Furthermore, the "setting seals"-operation can be
	 * sealed itself, which basically prevents any further seal from being
	 * added.
	 *
	 * Semantics of sealing are only defined on volatile files. Only
	 * anonymous tmpfs and hugetlbfs files support sealing. More
	 * importantly, seals are never written to disk. Therefore, there's
	 * no plan to support it on other file types.
	 */

	if (!(file->f_mode & FMODE_WRITE))
		return -EPERM;
	if (seals & ~(unsigned int)F_ALL_SEALS)
		return -EINVAL;

	inode_lock(inode);

	file_seals = memfd_file_seals_ptr(file);
	if (!file_seals) {
		error = -EINVAL;
		goto unlock;
	}

	if (*file_seals & F_SEAL_SEAL) {
		error = -EPERM;
		goto unlock;
	}

	if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
		error = mapping_deny_writable(file->f_mapping);
		if (error)
			goto unlock;

		error = memfd_wait_for_pins(file->f_mapping);
		if (error) {
			mapping_allow_writable(file->f_mapping);
			goto unlock;
		}
	}

	/*
	 * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
	 */
	if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
		seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;

	*file_seals |= seals;
	error = 0;

unlock:
	inode_unlock(inode);
	return error;
}

static int memfd_get_seals(struct file *file)
{
	unsigned int *seals = memfd_file_seals_ptr(file);

	return seals ? *seals : -EINVAL;
}

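/*
 * Entry point from the fcntl(2) path for the F_ADD_SEALS and F_GET_SEALS
 * commands.
 */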
long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
	long error;

	switch (cmd) {
	case F_ADD_SEALS:
		error = memfd_add_seals(file, arg);
		break;
	case F_GET_SEALS:
		error = memfd_get_seals(file);
		break;
	default:
		error = -EINVAL;
		break;
	}

	return error;
}

#define MFD_NAME_PREFIX "memfd:"
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)

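/*
 * vm.memfd_noexec is a per-pid-namespace sysctl:
 *   0 (MEMFD_NOEXEC_SCOPE_EXEC): calls without an exec flag behave as if
 *     MFD_EXEC had been passed.
 *   1 (MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL): calls without an exec flag behave
 *     as if MFD_NOEXEC_SEAL had been passed.
 *   2 (MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED): MFD_NOEXEC_SEAL is mandatory;
 *     anything else fails with -EACCES.
 */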
static int check_sysctl_memfd_noexec(unsigned int *flags)
{
#ifdef CONFIG_SYSCTL
	struct pid_namespace *ns = task_active_pid_ns(current);
	int sysctl = pidns_memfd_noexec_scope(ns);

	if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
		if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)
			*flags |= MFD_NOEXEC_SEAL;
		else
			*flags |= MFD_EXEC;
	}

	if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) {
		pr_err_ratelimited(
			"%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
			current->comm, task_pid_nr(current), sysctl);
		return -EACCES;
	}
#endif
	return 0;
}

static inline bool is_write_sealed(unsigned int seals)
{
	return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
}

static int check_write_seal(unsigned long *vm_flags_ptr)
{
	unsigned long vm_flags = *vm_flags_ptr;
	unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE);

	/* If a private mapping then writability is irrelevant. */
	if (!(mask & VM_SHARED))
		return 0;

	/*
	 * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
	 * write seals are active.
	 */
	if (mask & VM_WRITE)
		return -EPERM;

	/*
	 * This is a read-only mapping, disallow mprotect() from making a
	 * write-sealed mapping writable in future.
	 */
	*vm_flags_ptr &= ~VM_MAYWRITE;

	return 0;
}

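/*
 * Illustrative effect on a write-sealed memfd (a sketch of the resulting
 * userspace-visible behaviour):
 *
 *	mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);              OK
 *	mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);  OK (CoW)
 *	mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);   -EPERM
 *	mprotect(ro_shared_map, len, PROT_WRITE);                   -EACCES
 */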
int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr)
{
	int err = 0;
	unsigned int *seals_ptr = memfd_file_seals_ptr(file);
	unsigned int seals = seals_ptr ? *seals_ptr : 0;

	if (is_write_sealed(seals))
		err = check_write_seal(vm_flags_ptr);

	return err;
}

static int sanitize_flags(unsigned int *flags_ptr)
{
	unsigned int flags = *flags_ptr;

	if (!(flags & MFD_HUGETLB)) {
		if (flags & ~(unsigned int)MFD_ALL_FLAGS)
			return -EINVAL;
	} else {
		/* Allow huge page size encoding in flags. */
		if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
				(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
			return -EINVAL;
	}

	/* Invalid if both EXEC and NOEXEC_SEAL are set. */
	if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
		return -EINVAL;

	return check_sysctl_memfd_noexec(flags_ptr);
}

static char *alloc_name(const char __user *uname)
{
	int error;
	char *name;
	long len;

	name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
	if (!name)
		return ERR_PTR(-ENOMEM);

	strcpy(name, MFD_NAME_PREFIX);
	/* returned length does not include terminating zero */
	len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1);
	if (len < 0) {
		error = -EFAULT;
		goto err_name;
	} else if (len > MFD_NAME_MAX_LEN) {
		error = -EINVAL;
		goto err_name;
	}

	return name;

err_name:
	kfree(name);
	return ERR_PTR(error);
}

static struct file *alloc_file(const char *name, unsigned int flags)
{
	unsigned int *file_seals;
	struct file *file;

	if (flags & MFD_HUGETLB) {
		file = hugetlb_file_setup(name, 0, VM_NORESERVE,
					  HUGETLB_ANONHUGE_INODE,
					  (flags >> MFD_HUGE_SHIFT) &
					  MFD_HUGE_MASK);
	} else {
		file = shmem_file_setup(name, 0, VM_NORESERVE);
	}
	if (IS_ERR(file))
		return file;
	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
	file->f_flags |= O_LARGEFILE;

	if (flags & MFD_NOEXEC_SEAL) {
		struct inode *inode = file_inode(file);

		inode->i_mode &= ~0111;
		file_seals = memfd_file_seals_ptr(file);
		if (file_seals) {
			*file_seals &= ~F_SEAL_SEAL;
			*file_seals |= F_SEAL_EXEC;
		}
	} else if (flags & MFD_ALLOW_SEALING) {
		/* MFD_EXEC and MFD_ALLOW_SEALING are set */
		file_seals = memfd_file_seals_ptr(file);
		if (file_seals)
			*file_seals &= ~F_SEAL_SEAL;
	}

	return file;
}

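/*
 * memfd_create() creates an anonymous volatile file in tmpfs (or in
 * hugetlbfs when MFD_HUGETLB is passed) and returns a file descriptor
 * for it. The name is used only for debugging output (e.g. the
 * /proc/self/fd/ symlink target); multiple memfds may share a name.
 */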
SYSCALL_DEFINE2(memfd_create,
		const char __user *, uname,
		unsigned int, flags)
{
	struct file *file;
	int fd, error;
	char *name;

	error = sanitize_flags(&flags);
	if (error < 0)
		return error;

	name = alloc_name(uname);
	if (IS_ERR(name))
		return PTR_ERR(name);

	fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
	if (fd < 0) {
		error = fd;
		goto err_name;
	}

	file = alloc_file(name, flags);
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto err_fd;
	}

	fd_install(fd, file);
	kfree(name);
	return fd;

err_fd:
	put_unused_fd(fd);
err_name:
	kfree(name);
	return error;
}