Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-block.git] / mm / shmem.c
index b76521ed372dfdfc03117b2773b39f104b9901a6..e40a08c5c6d78ac629b52e4956b4cb5c13948b6c 100644 (file)
@@ -76,7 +76,6 @@ static struct vfsmount *shm_mnt;
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
 #include <uapi/linux/memfd.h>
-#include <linux/userfaultfd_k.h>
 #include <linux/rmap.h>
 #include <linux/uuid.h>
 
@@ -116,10 +115,12 @@ struct shmem_options {
        bool full_inums;
        int huge;
        int seen;
+       bool noswap;
 #define SHMEM_SEEN_BLOCKS 1
 #define SHMEM_SEEN_INODES 2
 #define SHMEM_SEEN_HUGE 4
 #define SHMEM_SEEN_INUMS 8
+#define SHMEM_SEEN_NOSWAP 16
 };
 
 #ifdef CONFIG_TMPFS
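The new "noswap" flag travels in struct shmem_options next to a SHMEM_SEEN_NOSWAP bit, so later code (notably shmem_reconfigure() further down) can tell "noswap was never mentioned" apart from "noswap was explicitly requested". A minimal stand-alone C sketch of that value-plus-seen-bit pattern, with hypothetical names rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define EXAMPLE_SEEN_NOSWAP 16		/* mirrors SHMEM_SEEN_NOSWAP */

struct example_options {
	bool noswap;	/* the value that was asked for */
	int seen;	/* which options were mentioned at all */
};

static void example_parse_noswap(struct example_options *opts)
{
	opts->noswap = true;
	opts->seen |= EXAMPLE_SEEN_NOSWAP;
}

int main(void)
{
	struct example_options opts = { 0 };

	example_parse_noswap(&opts);
	printf("noswap mentioned: %d, value: %d\n",
	       !!(opts.seen & EXAMPLE_SEEN_NOSWAP), opts.noswap);
	return 0;
}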
@@ -603,7 +604,7 @@ next:
 
                index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
                folio = filemap_get_folio(inode->i_mapping, index);
-               if (!folio)
+               if (IS_ERR(folio))
                        goto drop;
 
                /* No huge page at the end of the file: nothing to split */
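This hunk (and the matching one in shmem_get_link() further down) adapts to filemap_get_folio() now returning an ERR_PTR on a cache miss instead of NULL, so callers test with IS_ERR() rather than a NULL check. A stand-alone illustration of that ERR_PTR convention, using simplified re-implementations of the helpers that live in <linux/err.h> (the demo_ names are mine):

#include <errno.h>
#include <stdio.h>

#define DEMO_MAX_ERRNO 4095

static inline void *demo_err_ptr(long error)
{
	return (void *)error;
}

static inline long demo_ptr_err(const void *ptr)
{
	return (long)ptr;
}

static inline int demo_is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-DEMO_MAX_ERRNO;
}

/* Stand-in for a cache lookup: encode "not found" in the pointer itself. */
static void *demo_lookup(int hit)
{
	static int value = 42;

	return hit ? (void *)&value : demo_err_ptr(-ENOENT);
}

int main(void)
{
	void *p = demo_lookup(0);

	if (demo_is_err(p))
		printf("miss, error %ld\n", demo_ptr_err(p));
	else
		printf("hit: %d\n", *(int *)p);
	return 0;
}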
@@ -883,14 +884,21 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
 
        /*
         * At first avoid shmem_get_folio(,,,SGP_READ): that fails
-        * beyond i_size, and reports fallocated pages as holes.
+        * beyond i_size, and reports fallocated folios as holes.
         */
-       folio = __filemap_get_folio(inode->i_mapping, index,
-                                       FGP_ENTRY | FGP_LOCK, 0);
-       if (!xa_is_value(folio))
+       folio = filemap_get_entry(inode->i_mapping, index);
+       if (!folio)
                return folio;
+       if (!xa_is_value(folio)) {
+               folio_lock(folio);
+               if (folio->mapping == inode->i_mapping)
+                       return folio;
+               /* The folio has been swapped out */
+               folio_unlock(folio);
+               folio_put(folio);
+       }
        /*
-        * But read a page back from swap if any of it is within i_size
+        * But read a folio back from swap if any of it is within i_size
         * (although in some cases this is just a waste of time).
         */
        folio = NULL;
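shmem_get_partial_folio() previously got a locked entry from __filemap_get_folio(FGP_ENTRY | FGP_LOCK); filemap_get_entry() returns the entry unlocked (holding only a reference when it is a real folio), so the function now takes the folio lock itself and revalidates folio->mapping before trusting the result. A hedged kernel-context sketch of that lock-and-revalidate step as a helper (the helper name is mine, not an existing API; <linux/pagemap.h> assumed):

/*
 * Lock a folio obtained from an unlocked lookup and check that it is
 * still attached to the expected mapping.  Returns the locked folio, or
 * NULL after dropping lock and reference if it was truncated or swapped
 * out in the meantime.
 */
static struct folio *demo_lock_if_still_attached(struct folio *folio,
						 struct address_space *mapping)
{
	folio_lock(folio);
	if (folio->mapping == mapping)
		return folio;		/* keep the lock and the reference */
	folio_unlock(folio);
	folio_put(folio);
	return NULL;
}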
@@ -1331,12 +1339,29 @@ int shmem_unuse(unsigned int type)
 static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 {
        struct folio *folio = page_folio(page);
-       struct shmem_inode_info *info;
-       struct address_space *mapping;
-       struct inode *inode;
+       struct address_space *mapping = folio->mapping;
+       struct inode *inode = mapping->host;
+       struct shmem_inode_info *info = SHMEM_I(inode);
+       struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        swp_entry_t swap;
        pgoff_t index;
 
+       /*
+        * Our capabilities prevent regular writeback or sync from ever calling
+        * shmem_writepage; but a stacking filesystem might use ->writepage of
+        * its underlying filesystem, in which case tmpfs should write out to
+        * swap only in response to memory pressure, and not for the writeback
+        * threads or sync.
+        */
+       if (WARN_ON_ONCE(!wbc->for_reclaim))
+               goto redirty;
+
+       if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
+               goto redirty;
+
+       if (!total_swap_pages)
+               goto redirty;
+
        /*
         * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
         * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
@@ -1351,27 +1376,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
                folio_clear_dirty(folio);
        }
 
-       BUG_ON(!folio_test_locked(folio));
-       mapping = folio->mapping;
        index = folio->index;
-       inode = mapping->host;
-       info = SHMEM_I(inode);
-       if (info->flags & VM_LOCKED)
-               goto redirty;
-       if (!total_swap_pages)
-               goto redirty;
-
-       /*
-        * Our capabilities prevent regular writeback or sync from ever calling
-        * shmem_writepage; but a stacking filesystem might use ->writepage of
-        * its underlying filesystem, in which case tmpfs should write out to
-        * swap only in response to memory pressure, and not for the writeback
-        * threads or sync.
-        */
-       if (!wbc->for_reclaim) {
-               WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
-               goto redirty;
-       }
 
        /*
         * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
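The checks that used to sit in the middle of shmem_writepage() are gathered at the top, converted to WARN_ON_ONCE() so unexpected callers get reported once, and extended with the new sbinfo->noswap gate. A condensed restatement of those gates as one helper (the helper name is hypothetical; the individual tests are exactly the ones added above, and shmem_sb_info gains its noswap field elsewhere in this series):

static bool demo_shmem_may_write_to_swap(struct writeback_control *wbc,
					 struct shmem_inode_info *info,
					 struct shmem_sb_info *sbinfo)
{
	/* Only memory reclaim should reach shmem_writepage(). */
	if (WARN_ON_ONCE(!wbc->for_reclaim))
		return false;
	/* Locked-in-memory inodes and noswap mounts never go to swap. */
	if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
		return false;
	/* No swap configured at all: nothing to write to. */
	if (!total_swap_pages)
		return false;
	return true;
}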
@@ -1874,12 +1879,10 @@ repeat:
        sbinfo = SHMEM_SB(inode->i_sb);
        charge_mm = vma ? vma->vm_mm : NULL;
 
-       folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0);
+       folio = filemap_get_entry(mapping, index);
        if (folio && vma && userfaultfd_minor(vma)) {
-               if (!xa_is_value(folio)) {
-                       folio_unlock(folio);
+               if (!xa_is_value(folio))
                        folio_put(folio);
-               }
                *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
                return 0;
        }
@@ -1895,6 +1898,14 @@ repeat:
        }
 
        if (folio) {
+               folio_lock(folio);
+
+               /* Has the folio been truncated or swapped out? */
+               if (unlikely(folio->mapping != mapping)) {
+                       folio_unlock(folio);
+                       folio_put(folio);
+                       goto repeat;
+               }
                if (sgp == SGP_WRITE)
                        folio_mark_accessed(folio);
                if (folio_test_uptodate(folio))
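Together with the previous hunk, the lookup in shmem_get_folio_gfp() moves from a locked FGP_ENTRY lookup to filemap_get_entry(): the userfaultfd-minor path now only has a reference to drop, and a folio found in the cache is locked afterwards and rechecked against the mapping, restarting the lookup if a truncation or swap-out raced with the unlocked lookup. A hedged sketch of that optimistic-lookup pattern, kernel context assumed and simplified from the real flow above:

repeat:
	folio = filemap_get_entry(mapping, index);
	if (folio && !xa_is_value(folio)) {
		folio_lock(folio);
		if (unlikely(folio->mapping != mapping)) {
			/* Changed identity while unlocked: try again. */
			folio_unlock(folio);
			folio_put(folio);
			goto repeat;
		}
	}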
@@ -2376,6 +2387,8 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
                        shmem_set_inode_flags(inode, info->fsflags);
                INIT_LIST_HEAD(&info->shrinklist);
                INIT_LIST_HEAD(&info->swaplist);
+               if (sbinfo->noswap)
+                       mapping_set_unevictable(inode->i_mapping);
                simple_xattrs_init(&info->xattrs);
                cache_no_acl(inode);
                mapping_set_large_folios(inode->i_mapping);
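On a noswap superblock every new inode's mapping is flagged unevictable, so its folios are kept on the unevictable LRU and reclaim does not even try to push them to swap; this is the proactive counterpart of the sbinfo->noswap bail-out added to shmem_writepage() above. A short kernel-context sketch of how such a mapping can be recognized (the helper name is mine; mapping_unevictable() and folio_mapping() are existing pagemap helpers):

/*
 * True if the folio belongs to a mapping that was marked unevictable,
 * e.g. a tmpfs inode created on a noswap mount.
 */
static bool demo_folio_on_noswap_mapping(struct folio *folio)
{
	struct address_space *mapping = folio_mapping(folio);

	return mapping && mapping_unevictable(mapping);
}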
@@ -2415,13 +2428,12 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
 }
 
 #ifdef CONFIG_USERFAULTFD
-int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
-                          pmd_t *dst_pmd,
+int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
                           struct vm_area_struct *dst_vma,
                           unsigned long dst_addr,
                           unsigned long src_addr,
-                          bool zeropage, bool wp_copy,
-                          struct page **pagep)
+                          uffd_flags_t flags,
+                          struct folio **foliop)
 {
        struct inode *inode = file_inode(dst_vma->vm_file);
        struct shmem_inode_info *info = SHMEM_I(inode);
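The shmem_mfill_atomic_pte() signature follows the wider userfaultfd cleanup in this series: dst_mm is dropped because it is always dst_vma->vm_mm, the separate zeropage/wp_copy booleans are folded into one uffd_flags_t, and the page handed back for the copy retry becomes a folio. A stand-alone sketch of the "operation mode plus modifier bits in one flags word" idea; the names and bit layout below are made up for illustration and differ from the kernel's real uffd_flags_t encoding:

#include <stdbool.h>
#include <stdio.h>

enum demo_mode { DEMO_COPY, DEMO_ZEROPAGE, DEMO_CONTINUE };

#define DEMO_MODE_MASK	0x3u
#define DEMO_FLAG_WP	0x4u	/* replaces a separate wp_copy bool */

typedef unsigned int demo_flags_t;

static bool demo_mode_is(demo_flags_t flags, enum demo_mode mode)
{
	return (flags & DEMO_MODE_MASK) == (demo_flags_t)mode;
}

int main(void)
{
	demo_flags_t flags = DEMO_COPY | DEMO_FLAG_WP;

	printf("copy: %d, zeropage: %d, write-protect: %d\n",
	       demo_mode_is(flags, DEMO_COPY),
	       demo_mode_is(flags, DEMO_ZEROPAGE),
	       !!(flags & DEMO_FLAG_WP));
	return 0;
}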
@@ -2439,20 +2451,20 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
                 * and now we find ourselves with -ENOMEM. Release the page, to
                 * avoid a BUG_ON in our caller.
                 */
-               if (unlikely(*pagep)) {
-                       put_page(*pagep);
-                       *pagep = NULL;
+               if (unlikely(*foliop)) {
+                       folio_put(*foliop);
+                       *foliop = NULL;
                }
                return -ENOMEM;
        }
 
-       if (!*pagep) {
+       if (!*foliop) {
                ret = -ENOMEM;
                folio = shmem_alloc_folio(gfp, info, pgoff);
                if (!folio)
                        goto out_unacct_blocks;
 
-               if (!zeropage) {        /* COPY */
+               if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
                        page_kaddr = kmap_local_folio(folio, 0);
                        /*
                         * The read mmap_lock is held here.  Despite the
@@ -2478,7 +2490,7 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 
                        /* fallback to copy_from_user outside mmap_lock */
                        if (unlikely(ret)) {
-                               *pagep = &folio->page;
+                               *foliop = folio;
                                ret = -ENOENT;
                                /* don't free the page */
                                goto out_unacct_blocks;
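With the switch to struct folio **foliop, the -ENOENT fallback hands the whole folio back to the caller instead of &folio->page; the caller is expected to copy the user data into it without mmap_lock held and call back in with *foliop still set, which is what the else branch below then consumes. A heavily simplified sketch of that caller-side protocol (modelled on the retry loop in mm/userfaultfd.c; error handling and the VMA revalidation after re-taking mmap_lock are omitted):

if (err == -ENOENT) {
	void *kaddr;

	mmap_read_unlock(dst_mm);
	kaddr = kmap_local_folio(folio, 0);
	if (copy_from_user(kaddr, (const void __user *)src_addr, PAGE_SIZE))
		err = -EFAULT;		/* caller then drops the folio */
	kunmap_local(kaddr);
	flush_dcache_folio(folio);
	/* re-take mmap_lock, revalidate the VMA, then retry the call */
}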
@@ -2489,9 +2501,9 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
                        clear_user_highpage(&folio->page, dst_addr);
                }
        } else {
-               folio = page_folio(*pagep);
+               folio = *foliop;
                VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-               *pagep = NULL;
+               *foliop = NULL;
        }
 
        VM_BUG_ON(folio_test_locked(folio));
@@ -2506,12 +2518,12 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
                goto out_release;
 
        ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
-                                     gfp & GFP_RECLAIM_MASK, dst_mm);
+                                     gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
        if (ret)
                goto out_release;
 
-       ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
-                                      &folio->page, true, wp_copy);
+       ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
+                                      &folio->page, true, flags);
        if (ret)
                goto out_delete_from_cache;
 
@@ -3200,7 +3212,7 @@ static const char *shmem_get_link(struct dentry *dentry,
 
        if (!dentry) {
                folio = filemap_get_folio(inode->i_mapping, 0);
-               if (!folio)
+               if (IS_ERR(folio))
                        return ERR_PTR(-ECHILD);
                if (PageHWPoison(folio_page(folio, 0)) ||
                    !folio_test_uptodate(folio)) {
@@ -3459,6 +3471,7 @@ enum shmem_param {
        Opt_uid,
        Opt_inode32,
        Opt_inode64,
+       Opt_noswap,
 };
 
 static const struct constant_table shmem_param_enums_huge[] = {
@@ -3480,6 +3493,7 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
        fsparam_u32   ("uid",           Opt_uid),
        fsparam_flag  ("inode32",       Opt_inode32),
        fsparam_flag  ("inode64",       Opt_inode64),
+       fsparam_flag  ("noswap",        Opt_noswap),
        {}
 };
 
@@ -3563,6 +3577,14 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
                ctx->full_inums = true;
                ctx->seen |= SHMEM_SEEN_INUMS;
                break;
+       case Opt_noswap:
+               if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
+                       return invalfc(fc,
+                                      "Turning off swap in unprivileged tmpfs mounts unsupported");
+               }
+               ctx->noswap = true;
+               ctx->seen |= SHMEM_SEEN_NOSWAP;
+               break;
        }
        return 0;
 
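Because swapping is a system-wide resource, the option is restricted to CAP_SYS_ADMIN in the initial user namespace, as checked just above. From user space the option is requested like any other tmpfs mount option; a stand-alone example against a kernel that carries this series (mount point and size are arbitrary, and the program must run with the privileges noted above):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("tmpfs", "/mnt/noswap-tmp", "tmpfs", 0,
		  "size=64m,noswap") != 0) {
		perror("mount");
		return 1;
	}
	puts("tmpfs mounted with swap disabled");
	return 0;
}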
@@ -3661,6 +3683,14 @@ static int shmem_reconfigure(struct fs_context *fc)
                err = "Current inum too high to switch to 32-bit inums";
                goto out;
        }
+       if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
+               err = "Cannot disable swap on remount";
+               goto out;
+       }
+       if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
+               err = "Cannot enable swap on remount if it was disabled on first mount";
+               goto out;
+       }
 
        if (ctx->seen & SHMEM_SEEN_HUGE)
                sbinfo->huge = ctx->huge;
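The two checks above make noswap a mount-time-only decision: swap cannot be disabled on a remount of a mount that allowed it, nor re-enabled on a mount created with noswap. A stand-alone user-space illustration of the first rule (paths and sizes are arbitrary; on a kernel with this series the remount is expected to fail with EINVAL):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("tmpfs", "/mnt/t", "tmpfs", 0, "size=16m") != 0) {
		perror("initial mount");
		return 1;
	}
	if (mount(NULL, "/mnt/t", NULL, MS_REMOUNT, "size=16m,noswap") != 0)
		printf("remount with noswap rejected: %s\n", strerror(errno));
	return 0;
}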
@@ -3681,6 +3711,10 @@ static int shmem_reconfigure(struct fs_context *fc)
                sbinfo->mpol = ctx->mpol;       /* transfers initial ref */
                ctx->mpol = NULL;
        }
+
+       if (ctx->noswap)
+               sbinfo->noswap = true;
+
        raw_spin_unlock(&sbinfo->stat_lock);
        mpol_put(mpol);
        return 0;
@@ -3735,6 +3769,8 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
                seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
 #endif
        shmem_show_mpol(seq, sbinfo->mpol);
+       if (sbinfo->noswap)
+               seq_printf(seq, ",noswap");
        return 0;
 }
 
@@ -3778,6 +3814,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
                        ctx->inodes = shmem_default_max_inodes();
                if (!(ctx->seen & SHMEM_SEEN_INUMS))
                        ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
+               sbinfo->noswap = ctx->noswap;
        } else {
                sb->s_flags |= SB_NOUSER;
        }