mm/memfd.c

   1 /*
   2  * memfd_create system call and file sealing support
   3  *
   4  * Code was originally included in shmem.c, and broken out to facilitate
   5  * use by hugetlbfs as well as tmpfs.
   6  *
   7  * This file is released under the GPL.
   8  */
   9
  10 #include <linux/fs.h>
  11 #include <linux/vfs.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/file.h>
  14 #include <linux/mm.h>
  15 #include <linux/sched/signal.h>
  16 #include <linux/khugepaged.h>
  17 #include <linux/syscalls.h>
  18 #include <linux/hugetlb.h>
  19 #include <linux/shmem_fs.h>
  20 #include <linux/memfd.h>
  21 #include <linux/pid_namespace.h>
  22 #include <uapi/linux/memfd.h>
  23 #include "swap.h"
  24
/*
 * Mark used on page cache entries whose folio was found with extra
 * references while a write seal is being applied (set by memfd_tag_pins(),
 * re-checked and cleared by memfd_wait_for_pins()).
 *
 * We need a tag: a new tag would expand every xa_node by 8 bytes,
 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
 * or hugetlbfs because they are memory only filesystems.
 */
#define MEMFD_TAG_PINNED        PAGECACHE_TAG_TOWRITE

/*
 * Index of the final rescan in memfd_wait_for_pins(). Scans 1..LAST_SCAN
 * each sleep for (HZ << scan) / 200 jiffies first, i.e. roughly
 * 10 + 20 + 40 + 80 ms: about 150ms max.
 */
#define LAST_SCAN               4
  32
  33 static bool memfd_folio_has_extra_refs(struct folio *folio)
  34 {
  35         return folio_ref_count(folio) - folio_mapcount(folio) !=
  36                folio_nr_pages(folio);
  37 }
  38
  39 static void memfd_tag_pins(struct xa_state *xas)
  40 {
  41         struct folio *folio;
  42         int latency = 0;
  43
  44         lru_add_drain();
  45
  46         xas_lock_irq(xas);
  47         xas_for_each(xas, folio, ULONG_MAX) {
  48                 if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
  49                         xas_set_mark(xas, MEMFD_TAG_PINNED);
  50
  51                 if (++latency < XA_CHECK_SCHED)
  52                         continue;
  53                 latency = 0;
  54
  55                 xas_pause(xas);
  56                 xas_unlock_irq(xas);
  57                 cond_resched();
  58                 xas_lock_irq(xas);
  59         }
  60         xas_unlock_irq(xas);
  61 }
  62
  63 /*
  64  * This is a helper function used by memfd_pin_user_pages() in GUP (gup.c).
  65  * It is mainly called to allocate a folio in a memfd when the caller
  66  * (memfd_pin_folios()) cannot find a folio in the page cache at a given
  67  * index in the mapping.
  68  */
  69 struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
  70 {
  71 #ifdef CONFIG_HUGETLB_PAGE
  72         struct folio *folio;
  73         gfp_t gfp_mask;
  74         int err;
  75
  76         if (is_file_hugepages(memfd)) {
  77                 /*
  78                  * The folio would most likely be accessed by a DMA driver,
  79                  * therefore, we have zone memory constraints where we can
  80                  * alloc from. Also, the folio will be pinned for an indefinite
  81                  * amount of time, so it is not expected to be migrated away.
  82                  */
  83                 struct hstate *h = hstate_file(memfd);
  84
  85                 gfp_mask = htlb_alloc_mask(h);
  86                 gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
  87                 idx >>= huge_page_order(h);
  88
  89                 folio = alloc_hugetlb_folio_reserve(h,
  90                                                     numa_node_id(),
  91                                                     NULL,
  92                                                     gfp_mask);
  93                 if (folio) {
  94                         err = hugetlb_add_to_page_cache(folio,
  95                                                         memfd->f_mapping,
  96                                                         idx);
  97                         if (err) {
  98                                 folio_put(folio);
  99                                 return ERR_PTR(err);
 100                         }
 101                         folio_unlock(folio);
 102                         return folio;
 103                 }
 104                 return ERR_PTR(-ENOMEM);
 105         }
 106 #endif
 107         return shmem_read_folio(memfd->f_mapping, idx);
 108 }
 109
 110 /*
 111  * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
 112  * via get_user_pages(), drivers might have some pending I/O without any active
 113  * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios
 114  * and see whether it has an elevated ref-count. If so, we tag them and wait for
 115  * them to be dropped.
 116  * The caller must guarantee that no new user will acquire writable references
 117  * to those folios to avoid races.
 118  */
 119 static int memfd_wait_for_pins(struct address_space *mapping)
 120 {
 121         XA_STATE(xas, &mapping->i_pages, 0);
 122         struct folio *folio;
 123         int error, scan;
 124
 125         memfd_tag_pins(&xas);
 126
 127         error = 0;
 128         for (scan = 0; scan <= LAST_SCAN; scan++) {
 129                 int latency = 0;
 130
 131                 if (!xas_marked(&xas, MEMFD_TAG_PINNED))
 132                         break;
 133
 134                 if (!scan)
 135                         lru_add_drain_all();
 136                 else if (schedule_timeout_killable((HZ << scan) / 200))
 137                         scan = LAST_SCAN;
 138
 139                 xas_set(&xas, 0);
 140                 xas_lock_irq(&xas);
 141                 xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
 142                         bool clear = true;
 143
 144                         if (!xa_is_value(folio) &&
 145                             memfd_folio_has_extra_refs(folio)) {
 146                                 /*
 147                                  * On the last scan, we clean up all those tags
 148                                  * we inserted; but make a note that we still
 149                                  * found folios pinned.
 150                                  */
 151                                 if (scan == LAST_SCAN)
 152                                         error = -EBUSY;
 153                                 else
 154                                         clear = false;
 155                         }
 156                         if (clear)
 157                                 xas_clear_mark(&xas, MEMFD_TAG_PINNED);
 158
 159                         if (++latency < XA_CHECK_SCHED)
 160                                 continue;
 161                         latency = 0;
 162
 163                         xas_pause(&xas);
 164                         xas_unlock_irq(&xas);
 165                         cond_resched();
 166                         xas_lock_irq(&xas);
 167                 }
 168                 xas_unlock_irq(&xas);
 169         }
 170
 171         return error;
 172 }
 173
 174 static unsigned int *memfd_file_seals_ptr(struct file *file)
 175 {
 176         if (shmem_file(file))
 177                 return &SHMEM_I(file_inode(file))->seals;
 178
 179 #ifdef CONFIG_HUGETLBFS
 180         if (is_file_hugepages(file))
 181                 return &HUGETLBFS_I(file_inode(file))->seals;
 182 #endif
 183
 184         return NULL;
 185 }
 186
/*
 * Every seal bit that memfd_add_seals() accepts; a request containing any
 * other bit is rejected with -EINVAL.
 */
#define F_ALL_SEALS (F_SEAL_SEAL | \
                     F_SEAL_EXEC | \
                     F_SEAL_SHRINK | \
                     F_SEAL_GROW | \
                     F_SEAL_WRITE | \
                     F_SEAL_FUTURE_WRITE)
 193
 194 static int memfd_add_seals(struct file *file, unsigned int seals)
 195 {
 196         struct inode *inode = file_inode(file);
 197         unsigned int *file_seals;
 198         int error;
 199
 200         /*
 201          * SEALING
 202          * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
 203          * but restrict access to a specific subset of file operations. Seals
 204          * can only be added, but never removed. This way, mutually untrusted
 205          * parties can share common memory regions with a well-defined policy.
 206          * A malicious peer can thus never perform unwanted operations on a
 207          * shared object.
 208          *
 209          * Seals are only supported on special tmpfs or hugetlbfs files and
 210          * always affect the whole underlying inode. Once a seal is set, it
 211          * may prevent some kinds of access to the file. Currently, the
 212          * following seals are defined:
 213          *   SEAL_SEAL: Prevent further seals from being set on this file
 214          *   SEAL_SHRINK: Prevent the file from shrinking
 215          *   SEAL_GROW: Prevent the file from growing
 216          *   SEAL_WRITE: Prevent write access to the file
 217          *   SEAL_EXEC: Prevent modification of the exec bits in the file mode
 218          *
 219          * As we don't require any trust relationship between two parties, we
 220          * must prevent seals from being removed. Therefore, sealing a file
 221          * only adds a given set of seals to the file, it never touches
 222          * existing seals. Furthermore, the "setting seals"-operation can be
 223          * sealed itself, which basically prevents any further seal from being
 224          * added.
 225          *
 226          * Semantics of sealing are only defined on volatile files. Only
 227          * anonymous tmpfs and hugetlbfs files support sealing. More
 228          * importantly, seals are never written to disk. Therefore, there's
 229          * no plan to support it on other file types.
 230          */
 231
 232         if (!(file->f_mode & FMODE_WRITE))
 233                 return -EPERM;
 234         if (seals & ~(unsigned int)F_ALL_SEALS)
 235                 return -EINVAL;
 236
 237         inode_lock(inode);
 238
 239         file_seals = memfd_file_seals_ptr(file);
 240         if (!file_seals) {
 241                 error = -EINVAL;
 242                 goto unlock;
 243         }
 244
 245         if (*file_seals & F_SEAL_SEAL) {
 246                 error = -EPERM;
 247                 goto unlock;
 248         }
 249
 250         if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
 251                 error = mapping_deny_writable(file->f_mapping);
 252                 if (error)
 253                         goto unlock;
 254
 255                 error = memfd_wait_for_pins(file->f_mapping);
 256                 if (error) {
 257                         mapping_allow_writable(file->f_mapping);
 258                         goto unlock;
 259                 }
 260         }
 261
 262         /*
 263          * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
 264          */
 265         if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
 266                 seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;
 267
 268         *file_seals |= seals;
 269         error = 0;
 270
 271 unlock:
 272         inode_unlock(inode);
 273         return error;
 274 }
 275
 276 static int memfd_get_seals(struct file *file)
 277 {
 278         unsigned int *seals = memfd_file_seals_ptr(file);
 279
 280         return seals ? *seals : -EINVAL;
 281 }
 282
 283 long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
 284 {
 285         long error;
 286
 287         switch (cmd) {
 288         case F_ADD_SEALS:
 289                 error = memfd_add_seals(file, arg);
 290                 break;
 291         case F_GET_SEALS:
 292                 error = memfd_get_seals(file);
 293                 break;
 294         default:
 295                 error = -EINVAL;
 296                 break;
 297         }
 298
 299         return error;
 300 }
 301
/* Prefix that alloc_name() puts in front of every user-supplied memfd name. */
#define MFD_NAME_PREFIX "memfd:"
/* Length of the prefix, not counting its terminating NUL. */
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
/* Longest user-supplied name such that prefix + name still fits NAME_MAX. */
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

/*
 * Flag bits accepted by memfd_create(). With MFD_HUGETLB, sanitize_flags()
 * additionally allows the huge page size encoding bits.
 */
#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
 307
 308 static int check_sysctl_memfd_noexec(unsigned int *flags)
 309 {
 310 #ifdef CONFIG_SYSCTL
 311         struct pid_namespace *ns = task_active_pid_ns(current);
 312         int sysctl = pidns_memfd_noexec_scope(ns);
 313
 314         if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
 315                 if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)
 316                         *flags |= MFD_NOEXEC_SEAL;
 317                 else
 318                         *flags |= MFD_EXEC;
 319         }
 320
 321         if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) {
 322                 pr_err_ratelimited(
 323                         "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
 324                         current->comm, task_pid_nr(current), sysctl);
 325                 return -EACCES;
 326         }
 327 #endif
 328         return 0;
 329 }
 330
 331 static inline bool is_write_sealed(unsigned int seals)
 332 {
 333         return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
 334 }
 335
 336 static int check_write_seal(unsigned long *vm_flags_ptr)
 337 {
 338         unsigned long vm_flags = *vm_flags_ptr;
 339         unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE);
 340
 341         /* If a private mapping then writability is irrelevant. */
 342         if (!(mask & VM_SHARED))
 343                 return 0;
 344
 345         /*
 346          * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
 347          * write seals are active.
 348          */
 349         if (mask & VM_WRITE)
 350                 return -EPERM;
 351
 352         /*
 353          * This is a read-only mapping, disallow mprotect() from making a
 354          * write-sealed mapping writable in future.
 355          */
 356         *vm_flags_ptr &= ~VM_MAYWRITE;
 357
 358         return 0;
 359 }
 360
 361 int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr)
 362 {
 363         int err = 0;
 364         unsigned int *seals_ptr = memfd_file_seals_ptr(file);
 365         unsigned int seals = seals_ptr ? *seals_ptr : 0;
 366
 367         if (is_write_sealed(seals))
 368                 err = check_write_seal(vm_flags_ptr);
 369
 370         return err;
 371 }
 372
 373 static int sanitize_flags(unsigned int *flags_ptr)
 374 {
 375         unsigned int flags = *flags_ptr;
 376
 377         if (!(flags & MFD_HUGETLB)) {
 378                 if (flags & ~(unsigned int)MFD_ALL_FLAGS)
 379                         return -EINVAL;
 380         } else {
 381                 /* Allow huge page size encoding in flags. */
 382                 if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
 383                                 (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
 384                         return -EINVAL;
 385         }
 386
 387         /* Invalid if both EXEC and NOEXEC_SEAL are set.*/
 388         if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
 389                 return -EINVAL;
 390
 391         return check_sysctl_memfd_noexec(flags_ptr);
 392 }
 393
 394 static char *alloc_name(const char __user *uname)
 395 {
 396         int error;
 397         char *name;
 398         long len;
 399
 400         name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
 401         if (!name)
 402                 return ERR_PTR(-ENOMEM);
 403
 404         strcpy(name, MFD_NAME_PREFIX);
 405         /* returned length does not include terminating zero */
 406         len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1);
 407         if (len < 0) {
 408                 error = -EFAULT;
 409                 goto err_name;
 410         } else if (len > MFD_NAME_MAX_LEN) {
 411                 error = -EINVAL;
 412                 goto err_name;
 413         }
 414
 415         return name;
 416
 417 err_name:
 418         kfree(name);
 419         return ERR_PTR(error);
 420 }
 421
 422 static struct file *alloc_file(const char *name, unsigned int flags)
 423 {
 424         unsigned int *file_seals;
 425         struct file *file;
 426
 427         if (flags & MFD_HUGETLB) {
 428                 file = hugetlb_file_setup(name, 0, VM_NORESERVE,
 429                                         HUGETLB_ANONHUGE_INODE,
 430                                         (flags >> MFD_HUGE_SHIFT) &
 431                                         MFD_HUGE_MASK);
 432         } else {
 433                 file = shmem_file_setup(name, 0, VM_NORESERVE);
 434         }
 435         if (IS_ERR(file))
 436                 return file;
 437         file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
 438         file->f_flags |= O_LARGEFILE;
 439
 440         if (flags & MFD_NOEXEC_SEAL) {
 441                 struct inode *inode = file_inode(file);
 442
 443                 inode->i_mode &= ~0111;
 444                 file_seals = memfd_file_seals_ptr(file);
 445                 if (file_seals) {
 446                         *file_seals &= ~F_SEAL_SEAL;
 447                         *file_seals |= F_SEAL_EXEC;
 448                 }
 449         } else if (flags & MFD_ALLOW_SEALING) {
 450                 /* MFD_EXEC and MFD_ALLOW_SEALING are set */
 451                 file_seals = memfd_file_seals_ptr(file);
 452                 if (file_seals)
 453                         *file_seals &= ~F_SEAL_SEAL;
 454         }
 455
 456         return file;
 457 }
 458
 459 SYSCALL_DEFINE2(memfd_create,
 460                 const char __user *, uname,
 461                 unsigned int, flags)
 462 {
 463         struct file *file;
 464         int fd, error;
 465         char *name;
 466
 467         error = sanitize_flags(&flags);
 468         if (error < 0)
 469                 return error;
 470
 471         name = alloc_name(uname);
 472         if (IS_ERR(name))
 473                 return PTR_ERR(name);
 474
 475         fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
 476         if (fd < 0) {
 477                 error = fd;
 478                 goto err_name;
 479         }
 480
 481         file = alloc_file(name, flags);
 482         if (IS_ERR(file)) {
 483                 error = PTR_ERR(file);
 484                 goto err_fd;
 485         }
 486
 487         fd_install(fd, file);
 488         kfree(name);
 489         return fd;
 490
 491 err_fd:
 492         put_unused_fd(fd);
 493 err_name:
 494         kfree(name);
 495         return error;
 496 }