Merge tag 'vfs-6.9.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 11 Mar 2024 16:38:17 +0000 (09:38 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 11 Mar 2024 16:38:17 +0000 (09:38 -0700)
diff --combined fs/dcache.c

index 6ebccba333368d06667eb6c1ee433046bd0ab7d8,be9e10155c8dae3fe58062e46b44b30c12940e34..71a8e943a0fa506c93fd7f11400de9a5d7e23e01
--- 1/fs/dcache.c
--- 2/fs/dcache.c
+++ b/fs/dcache.c
@@@ -3061,10 -3061,7 +3061,10 @@@ static enum d_walk_ret d_genocide_kill(
                 if (d_unhashed(dentry) || !dentry->d_inode)
                         return D_WALK_SKIP;
   
- -              dentry->d_lockref.count--;
+ +              if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
+ +                      dentry->d_flags |= DCACHE_GENOCIDE;
+ +                      dentry->d_lockref.count--;
+ +              }
         }
         return D_WALK_CONTINUE;
   }
@@@ -3139,7 -3136,7 +3139,7 @@@ static void __init dcache_init(void
          * of the dcache.
          */
         dentry_cache = KMEM_CACHE_USERCOPY(dentry,
-               SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+               SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
                 d_iname);
   
         /* Hash may have been set up in dcache_init_early */
diff --combined fs/ext4/super.c

index 0f931d0c227daa8b00950667d8b8bb42a7a28a48,215b4614eb1583916ae3b37e528694651937e25b..d068d550d271e9eebe023f070eab6dadd51a01de
--- 1/fs/ext4/super.c
--- 2/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@@ -1525,7 -1525,7 +1525,7 @@@ void ext4_clear_inode(struct inode *ino
         ext4_fc_del(inode);
         invalidate_inode_buffers(inode);
         clear_inode(inode);
- -      ext4_discard_preallocations(inode, 0);
+ +      ext4_discard_preallocations(inode);
         ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
         dquot_drop(inode);
         if (EXT4_I(inode)->jinode) {
@@@ -5484,6 -5484,7 +5484,7 @@@ static int __ext4_fill_super(struct fs_
                 goto failed_mount4;
         }
   
+       generic_set_sb_d_ops(sb);
         sb->s_root = d_make_root(root);
         if (!sb->s_root) {
                 ext4_msg(sb, KERN_ERR, "get root dentry failed");
diff --combined fs/hugetlbfs/inode.c

index d746866ae3b6ba79a4ed1d8b6600c29cfc28e005,af82364c84d8d2860aae9fa8cfaf65b5d1f688b2..6502c7e776d195e1d004908964f74ce5a76d6db2
--- 1/fs/hugetlbfs/inode.c
--- 2/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@@ -100,7 -100,6 +100,7 @@@ static int hugetlbfs_file_mmap(struct f
         loff_t len, vma_len;
         int ret;
         struct hstate *h = hstate_file(file);
+ +      vm_flags_t vm_flags;
   
         /*
          * vma address alignment (but not the pgoff alignment) has
@@@ -142,20 -141,10 +142,20 @@@
         file_accessed(file);
   
         ret = -ENOMEM;
+ +
+ +      vm_flags = vma->vm_flags;
+ +      /*
+ +       * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
+ +       * reserving here. Note: only for SHM hugetlbfs file, the inode
+ +       * flag S_PRIVATE is set.
+ +       */
+ +      if (inode->i_flags & S_PRIVATE)
+ +              vm_flags |= VM_NORESERVE;
+ +
         if (!hugetlb_reserve_pages(inode,
                                 vma->vm_pgoff >> huge_page_order(h),
                                 len >> huge_page_shift(h), vma,
- -                              vma->vm_flags))
+ +                              vm_flags))
                 goto out;
   
         ret = 0;
@@@ -351,7 -340,7 +351,7 @@@ static ssize_t hugetlbfs_read_iter(stru
                 } else {
                         folio_unlock(folio);
   
- -                      if (!folio_test_has_hwpoisoned(folio))
+ +                      if (!folio_test_hwpoison(folio))
                                 want = nr;
                         else {
                                 /*
@@@ -933,7 -922,7 +933,7 @@@ static int hugetlbfs_setattr(struct mnt
         unsigned int ia_valid = attr->ia_valid;
         struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
   
-       error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
+       error = setattr_prepare(idmap, dentry, attr);
         if (error)
                 return error;
   
@@@ -950,7 -939,7 +950,7 @@@
                 hugetlb_vmtruncate(inode, newsize);
         }
   
-       setattr_copy(&nop_mnt_idmap, inode, attr);
+       setattr_copy(idmap, inode, attr);
         mark_inode_dirty(inode);
         return 0;
   }
@@@ -985,6 -974,7 +985,7 @@@ static struct inode *hugetlbfs_get_root
   static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
   
   static struct inode *hugetlbfs_get_inode(struct super_block *sb,
+                                       struct mnt_idmap *idmap,
                                         struct inode *dir,
                                         umode_t mode, dev_t dev)
   {
@@@ -1006,7 -996,7 +1007,7 @@@
                 struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
   
                 inode->i_ino = get_next_ino();
-               inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
+               inode_init_owner(idmap, inode, dir, mode);
                 lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
                                 &hugetlbfs_i_mmap_rwsem_key);
                 inode->i_mapping->a_ops = &hugetlbfs_aops;
@@@ -1050,7 -1040,7 +1051,7 @@@ static int hugetlbfs_mknod(struct mnt_i
   {
         struct inode *inode;
   
-       inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
+       inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev);
         if (!inode)
                 return -ENOSPC;
         inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@@ -1062,7 -1052,7 +1063,7 @@@
   static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                            struct dentry *dentry, umode_t mode)
   {
-       int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry,
+       int retval = hugetlbfs_mknod(idmap, dir, dentry,
                                      mode | S_IFDIR, 0);
         if (!retval)
                 inc_nlink(dir);
@@@ -1073,7 -1063,7 +1074,7 @@@ static int hugetlbfs_create(struct mnt_
                             struct inode *dir, struct dentry *dentry,
                             umode_t mode, bool excl)
   {
-       return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0);
+       return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
   }
   
   static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
@@@ -1082,7 -1072,7 +1083,7 @@@
   {
         struct inode *inode;
   
-       inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
+       inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0);
         if (!inode)
                 return -ENOSPC;
         inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@@ -1094,10 -1084,11 +1095,11 @@@ static int hugetlbfs_symlink(struct mnt
                              struct inode *dir, struct dentry *dentry,
                              const char *symname)
   {
+       const umode_t mode = S_IFLNK|S_IRWXUGO;
         struct inode *inode;
         int error = -ENOSPC;
   
-       inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
+       inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0);
         if (inode) {
                 int l = strlen(symname)+1;
                 error = page_symlink(inode, symname, l);
@@@ -1365,7 -1356,6 +1367,7 @@@ static int hugetlbfs_parse_param(struc
   {
         struct hugetlbfs_fs_context *ctx = fc->fs_private;
         struct fs_parse_result result;
+ +      struct hstate *h;
         char *rest;
         unsigned long ps;
         int opt;
@@@ -1410,12 -1400,11 +1412,12 @@@
   
         case Opt_pagesize:
                 ps = memparse(param->string, &rest);
- -              ctx->hstate = size_to_hstate(ps);
- -              if (!ctx->hstate) {
+ +              h = size_to_hstate(ps);
+ +              if (!h) {
                         pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
                         return -EINVAL;
                 }
+ +              ctx->hstate = h;
                 return 0;
   
         case Opt_min_size:
@@@ -1566,6 -1555,7 +1568,7 @@@ static struct file_system_type hugetlbf
         .init_fs_context        = hugetlbfs_init_fs_context,
         .parameters             = hugetlb_fs_parameters,
         .kill_sb                = kill_litter_super,
+       .fs_flags               = FS_ALLOW_IDMAP,
   };
   
   static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@@ -1619,7 -1609,9 +1622,9 @@@ struct file *hugetlb_file_setup(const c
         }
   
         file = ERR_PTR(-ENOSPC);
-       inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0);
+       /* hugetlbfs_vfsmount[] mounts do not use idmapped mounts.  */
+       inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL,
+                                   S_IFREG | S_IRWXUGO, 0);
         if (!inode)
                 goto out;
         if (creat_flags == HUGETLB_SHMFS_INODE)
diff --combined fs/namei.c

index 9342fa6a38c2bad85c13144b8d8ae4940e88e7e6,4c961e07157c090ba5e5fec157ffff0aeb6afe06..d0c4a3e9278e444d0fd6e504ba89a3ba335c7fcf
--- 1/fs/namei.c
--- 2/fs/namei.c
+++ b/fs/namei.c
@@@ -1717,11 -1717,7 +1717,11 @@@ static inline int may_lookup(struct mnt
   {
         if (nd->flags & LOOKUP_RCU) {
                 int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
- -              if (err != -ECHILD || !try_to_unlazy(nd))
+ +              if (!err)               // success, keep going
+ +                      return 0;
+ +              if (!try_to_unlazy(nd))
+ +                      return -ECHILD; // redo it all non-lazy
+ +              if (err != -ECHILD)     // hard error
                         return err;
         }
         return inode_permission(idmap, nd->inode, MAY_EXEC);
@@@ -2680,10 -2676,8 +2680,8 @@@ static int lookup_one_common(struct mnt
         if (!len)
                 return -EACCES;
   
-       if (unlikely(name[0] == '.')) {
-               if (len < 2 || (len == 2 && name[1] == '.'))
-                       return -EACCES;
-       }
+       if (is_dot_dotdot(name, len))
+               return -EACCES;
   
         while (len--) {
                 unsigned int c = *(const unsigned char *)name++;
diff --combined fs/ntfs3/namei.c

index cae41db0aaa7d13e1fb4e0132b79261156a39306,144aa80cca433e5fd3c3722b149b879cc32e3254..084d19d78397c9b6108b36b9730c59a806a66534
--- 1/fs/ntfs3/namei.c
--- 2/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@@ -181,9 -181,6 +181,9 @@@ static int ntfs_unlink(struct inode *di
         struct ntfs_inode *ni = ntfs_i(dir);
         int err;
   
+ +      if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
+ +              return -EIO;
+ +
         ni_lock_dir(ni);
   
         err = ntfs_unlink_inode(dir, dentry);
@@@ -202,9 -199,6 +202,9 @@@ static int ntfs_symlink(struct mnt_idma
         u32 size = strlen(symname);
         struct inode *inode;
   
+ +      if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
+ +              return -EIO;
+ +
         inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0,
                                   symname, size, NULL);
   
@@@ -233,9 -227,6 +233,9 @@@ static int ntfs_rmdir(struct inode *dir
         struct ntfs_inode *ni = ntfs_i(dir);
         int err;
   
+ +      if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
+ +              return -EIO;
+ +
         ni_lock_dir(ni);
   
         err = ntfs_unlink_inode(dir, dentry);
@@@ -273,9 -264,6 +273,9 @@@ static int ntfs_rename(struct mnt_idma
                       1024);
         static_assert(PATH_MAX >= 4 * 1024);
   
+ +      if (unlikely(ntfs3_forced_shutdown(sb)))
+ +              return -EIO;
+ +
         if (flags & ~RENAME_NOREPLACE)
                 return -EINVAL;
   
@@@ -431,7 -419,7 +431,7 @@@ static int ntfs_atomic_open(struct inod
          * fnd contains tree's path to insert to.
          * If fnd is not NULL then dir is locked.
          */
-       inode = ntfs_create_inode(mnt_idmap(file->f_path.mnt), dir, dentry, uni,
+       inode = ntfs_create_inode(file_mnt_idmap(file), dir, dentry, uni,
                                   mode, 0, NULL, 0, fnd);
         err = IS_ERR(inode) ? PTR_ERR(inode) :
                               finish_open(file, dentry, ntfs_file_open);
diff --combined fs/overlayfs/super.c

index 2eef6c70b2aed54027b9ec2b1b544101ea32aefc,df2ad2f6079829f0eee5566d782f215b28796acb..36d4b8b1f784462dffe665fb83f7a756eeba3262
--- 1/fs/overlayfs/super.c
--- 2/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@@ -28,41 -28,38 +28,38 @@@ MODULE_LICENSE("GPL")
   
   struct ovl_dir_cache;
   
- static struct dentry *ovl_d_real(struct dentry *dentry,
-                                const struct inode *inode)
+ static struct dentry *ovl_d_real(struct dentry *dentry, enum d_real_type type)
   {
-       struct dentry *real = NULL, *lower;
+       struct dentry *upper, *lower;
         int err;
   
-       /*
-        * vfs is only expected to call d_real() with NULL from d_real_inode()
-        * and with overlay inode from file_dentry() on an overlay file.
-        *
-        * TODO: remove @inode argument from d_real() API, remove code in this
-        * function that deals with non-NULL @inode and remove d_real() call
-        * from file_dentry().
-        */
-       if (inode && d_inode(dentry) == inode)
-               return dentry;
-       else if (inode)
+       switch (type) {
+       case D_REAL_DATA:
+       case D_REAL_METADATA:
+               break;
+       default:
                 goto bug;
+       }
   
         if (!d_is_reg(dentry)) {
                 /* d_real_inode() is only relevant for regular files */
                 return dentry;
         }
   
-       real = ovl_dentry_upper(dentry);
-       if (real && (inode == d_inode(real)))
-               return real;
+       upper = ovl_dentry_upper(dentry);
+       if (upper && (type == D_REAL_METADATA ||
+                     ovl_has_upperdata(d_inode(dentry))))
+               return upper;
   
-       if (real && !inode && ovl_has_upperdata(d_inode(dentry)))
-               return real;
+       if (type == D_REAL_METADATA) {
+               lower = ovl_dentry_lower(dentry);
+               goto real_lower;
+       }
   
         /*
-        * Best effort lazy lookup of lowerdata for !inode case to return
+        * Best effort lazy lookup of lowerdata for D_REAL_DATA case to return
          * the real lowerdata dentry.  The only current caller of d_real() with
-        * NULL inode is d_real_inode() from trace_uprobe and this caller is
+        * D_REAL_DATA is d_real_inode() from trace_uprobe and this caller is
          * likely going to be followed reading from the file, before placing
          * uprobes on offset within the file, so lowerdata should be available
          * when setting the uprobe.
@@@ -73,18 -70,13 +70,13 @@@
         lower = ovl_dentry_lowerdata(dentry);
         if (!lower)
                 goto bug;
-       real = lower;
   
-       /* Handle recursion */
-       real = d_real(real, inode);
+ real_lower:
+       /* Handle recursion into stacked lower fs */
+       return d_real(lower, type);
   
-       if (!inode || inode == d_inode(real))
-               return real;
   bug:
-       WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n",
-            __func__, dentry, inode ? inode->i_sb->s_id : "NULL",
-            inode ? inode->i_ino : 0, real,
-            real && d_inode(real) ? d_inode(real)->i_ino : 0);
+       WARN(1, "%s(%pd4, %d): real dentry not found\n", __func__, dentry, type);
         return dentry;
   }
   
@@@ -1249,7 -1241,6 +1241,7 @@@ static struct dentry *ovl_get_root(stru
                                    struct ovl_entry *oe)
   {
         struct dentry *root;
+ +      struct ovl_fs *ofs = OVL_FS(sb);
         struct ovl_path *lowerpath = ovl_lowerstack(oe);
         unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
         int fsid = lowerpath->layer->fsid;
@@@ -1271,20 -1262,6 +1263,20 @@@
                         ovl_set_flag(OVL_IMPURE, d_inode(root));
         }
   
+ +      /* Look for xwhiteouts marker except in the lowermost layer */
+ +      for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) {
+ +              struct path path = {
+ +                      .mnt = lowerpath->layer->mnt,
+ +                      .dentry = lowerpath->dentry,
+ +              };
+ +
+ +              /* overlay.opaque=x means xwhiteouts directory */
+ +              if (ovl_get_opaquedir_val(ofs, &path) == 'x') {
+ +                      ovl_layer_set_xwhiteouts(ofs, lowerpath->layer);
+ +                      ovl_dentry_set_xwhiteouts(root);
+ +              }
+ +      }
+ +
         /* Root is always merge -> can have whiteouts */
         ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
         ovl_dentry_set_flag(OVL_E_CONNECTED, root);
diff --combined fs/proc/inode.c

index 05350f3c2812c57562e9208da69d0e98835dadc9,b9c5cb63dd504ca661b0eab5e430d54bda679606..dcd513dccf55cbfa50d5abe965b7a636dcff8353
--- 1/fs/proc/inode.c
--- 2/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@@ -30,6 -30,7 +30,6 @@@
   
   static void proc_evict_inode(struct inode *inode)
   {
- -      struct proc_dir_entry *de;
         struct ctl_table_header *head;
         struct proc_inode *ei = PROC_I(inode);
   
@@@ -37,8 -38,17 +37,8 @@@
         clear_inode(inode);
   
         /* Stop tracking associated processes */
- -      if (ei->pid) {
+ +      if (ei->pid)
                 proc_pid_evict_inode(ei);
- -              ei->pid = NULL;
- -      }
- -
- -      /* Let go of any associated proc directory entry */
- -      de = ei->pde;
- -      if (de) {
- -              pde_put(de);
- -              ei->pde = NULL;
- -      }
   
         head = ei->sysctl;
         if (head) {
@@@ -70,13 -80,6 +70,13 @@@ static struct inode *proc_alloc_inode(s
   
   static void proc_free_inode(struct inode *inode)
   {
+ +      struct proc_inode *ei = PROC_I(inode);
+ +
+ +      if (ei->pid)
+ +              put_pid(ei->pid);
+ +      /* Let go of any associated proc directory entry */
+ +      if (ei->pde)
+ +              pde_put(ei->pde);
         kmem_cache_free(proc_inode_cachep, PROC_I(inode));
   }
   
@@@ -92,7 -95,7 +92,7 @@@ void __init proc_init_kmemcache(void
         proc_inode_cachep = kmem_cache_create("proc_inode_cache",
                                              sizeof(struct proc_inode),
                                              0, (SLAB_RECLAIM_ACCOUNT|
-                                               SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+                                               SLAB_ACCOUNT|
                                                 SLAB_PANIC),
                                              init_once);
         pde_opener_cache =
diff --combined include/linux/dcache.h

index d07cf2f1bb7db18c37333fd211b2e5b18657b254,d616a745a34c695ea3ea835d2e3bbf8dbf71d775..bf53e3894aae33ef15218463db3c9f85985b9e26
--- 1/include/linux/dcache.h
--- 2/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@@ -125,6 -125,11 +125,11 @@@ enum dentry_d_lock_clas
         DENTRY_D_LOCK_NESTED
   };
   
+ enum d_real_type {
+       D_REAL_DATA,
+       D_REAL_METADATA,
+ };
+ 
   struct dentry_operations {
         int (*d_revalidate)(struct dentry *, unsigned int);
         int (*d_weak_revalidate)(struct dentry *, unsigned int);
@@@ -139,7 -144,7 +144,7 @@@
         char *(*d_dname)(struct dentry *, char *, int);
         struct vfsmount *(*d_automount)(struct path *);
         int (*d_manage)(const struct path *, bool);
-       struct dentry *(*d_real)(struct dentry *, const struct inode *);
+       struct dentry *(*d_real)(struct dentry *, enum d_real_type type);
   } ____cacheline_aligned;
   
   /*
@@@ -173,7 -178,6 +178,7 @@@
   #define DCACHE_DONTCACHE              BIT(7) /* Purge from memory on final dput() */
   
   #define DCACHE_CANT_MOUNT             BIT(8)
+ +#define DCACHE_GENOCIDE                       BIT(9)
   #define DCACHE_SHRINK_LIST            BIT(10)
   
   #define DCACHE_OP_WEAK_REVALIDATE     BIT(11)
@@@ -547,24 -551,23 +552,23 @@@ static inline struct inode *d_backing_i
   /**
    * d_real - Return the real dentry
    * @dentry: the dentry to query
-  * @inode: inode to select the dentry from multiple layers (can be NULL)
+  * @type: the type of real dentry (data or metadata)
    *
    * If dentry is on a union/overlay, then return the underlying, real dentry.
    * Otherwise return the dentry itself.
    *
    * See also: Documentation/filesystems/vfs.rst
    */
- static inline struct dentry *d_real(struct dentry *dentry,
-                                   const struct inode *inode)
+ static inline struct dentry *d_real(struct dentry *dentry, enum d_real_type type)
   {
         if (unlikely(dentry->d_flags & DCACHE_OP_REAL))
-               return dentry->d_op->d_real(dentry, inode);
+               return dentry->d_op->d_real(dentry, type);
         else
                 return dentry;
   }
   
   /**
-  * d_real_inode - Return the real inode
+  * d_real_inode - Return the real inode hosting the data
    * @dentry: The dentry to query
    *
    * If dentry is on a union/overlay, then return the underlying, real inode.
@@@ -573,7 -576,7 +577,7 @@@
   static inline struct inode *d_real_inode(const struct dentry *dentry)
   {
         /* This usage of d_real() results in const dentry */
-       return d_backing_inode(d_real((struct dentry *) dentry, NULL));
+       return d_inode(d_real((struct dentry *) dentry, D_REAL_DATA));
   }
   
   struct name_snapshot {
diff --combined include/linux/fs.h

index 1fbc72c5f112c750b87e7d752e4d5871258ddabe,30323dc70b7a91d3ad42e072128eebf58f7ad953..2ba751d097c1d110d5a4a4fc2ca4b418765cd142
--- 1/include/linux/fs.h
--- 2/include/linux/fs.h
+++ b/include/linux/fs.h
@@@ -43,6 -43,7 +43,7 @@@
   #include <linux/cred.h>
   #include <linux/mnt_idmapping.h>
   #include <linux/slab.h>
+ #include <linux/maple_tree.h>
   
   #include <asm/byteorder.h>
   #include <uapi/linux/fs.h>
@@@ -352,8 -353,6 +353,8 @@@ enum rw_hint 
    * unrelated IO (like cache flushing, new IO generation, etc).
    */
   #define IOCB_DIO_CALLER_COMP  (1 << 22)
+ +/* kiocb is a read or write operation submitted by fs/aio.c. */
+ +#define IOCB_AIO_RW           (1 << 23)
   
   /* for use in trace events */
   #define TRACE_IOCB_STRINGS \
@@@ -484,10 -483,10 +485,10 @@@ struct address_space 
         pgoff_t                 writeback_index;
         const struct address_space_operations *a_ops;
         unsigned long           flags;
-       struct rw_semaphore     i_mmap_rwsem;
         errseq_t                wb_err;
         spinlock_t              i_private_lock;
         struct list_head        i_private_list;
+       struct rw_semaphore     i_mmap_rwsem;
         void *                  i_private_data;
   } __attribute__((aligned(sizeof(long)))) __randomize_layout;
         /*
@@@ -909,7 -908,8 +910,8 @@@ static inline loff_t i_size_read(const 
         preempt_enable();
         return i_size;
   #else
-       return inode->i_size;
+       /* Pairs with smp_store_release() in i_size_write() */
+       return smp_load_acquire(&inode->i_size);
   #endif
   }
   
@@@ -931,7 -931,12 +933,12 @@@ static inline void i_size_write(struct 
         inode->i_size = i_size;
         preempt_enable();
   #else
-       inode->i_size = i_size;
+       /*
+        * Pairs with smp_load_acquire() in i_size_read() to ensure
+        * changes related to inode size (such as page contents) are
+        * visible before we see the changed inode size.
+        */
+       smp_store_release(&inode->i_size, i_size);
   #endif
   }
   
@@@ -1080,9 -1085,20 +1087,20 @@@ static inline struct inode *file_inode(
         return f->f_inode;
   }
   
+ /*
+  * file_dentry() is a relic from the days that overlayfs was using files with a
+  * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs.
+  * In those days, file_dentry() was needed to get the underlying fs dentry that
+  * matches f_inode.
+  * Files with "fake" path should not exist nowadays, so use an assertion to make
+  * sure that file_dentry() was not papering over filesystem bugs.
+  */
   static inline struct dentry *file_dentry(const struct file *file)
   {
-       return d_real(file->f_path.dentry, file_inode(file));
+       struct dentry *dentry = file->f_path.dentry;
+ 
+       WARN_ON_ONCE(d_inode(dentry) != file_inode(file));
+       return dentry;
   }
   
   struct fasync_struct {
@@@ -2103,6 -2119,9 +2121,6 @@@ int __generic_remap_file_range_prep(str
   int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t *count, unsigned int remap_flags);
- -extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
- -                                struct file *file_out, loff_t pos_out,
- -                                loff_t len, unsigned int remap_flags);
   extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                                    struct file *file_out, loff_t pos_out,
                                    loff_t len, unsigned int remap_flags);
@@@ -2927,6 -2946,17 +2945,17 @@@ extern bool path_is_under(const struct 
   
   extern char *file_path(struct file *, char *, int);
   
+ /**
+  * is_dot_dotdot - returns true only if @name is "." or ".."
+  * @name: file name to check
+  * @len: length of file name, in bytes
+  */
+ static inline bool is_dot_dotdot(const char *name, size_t len)
+ {
+       return len && unlikely(name[0] == '.') &&
+               (len == 1 || (len == 2 && name[1] == '.'));
+ }
+ 
   #include <linux/err.h>
   
   /* needed for stackable file system support */
@@@ -3259,13 -3289,14 +3288,14 @@@ extern ssize_t simple_write_to_buffer(v
                 const void __user *from, size_t count);
   
   struct offset_ctx {
-       struct xarray           xa;
-       u32                     next_offset;
+       struct maple_tree       mt;
+       unsigned long           next_offset;
   };
   
   void simple_offset_init(struct offset_ctx *octx);
   int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
   void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
+ int simple_offset_empty(struct dentry *dentry);
   int simple_offset_rename_exchange(struct inode *old_dir,
                                   struct dentry *old_dentry,
                                   struct inode *new_dir,
@@@ -3279,7 -3310,16 +3309,16 @@@ extern int generic_file_fsync(struct fi
   
   extern int generic_check_addressable(unsigned, u64);
   
- extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry);
+ extern void generic_set_sb_d_ops(struct super_block *sb);
+ 
+ static inline bool sb_has_encoding(const struct super_block *sb)
+ {
+ #if IS_ENABLED(CONFIG_UNICODE)
+       return !!sb->s_encoding;
+ #else
+       return false;
+ #endif
+ }
   
   int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
                 unsigned int ia_valid);
@@@ -3334,6 -3374,8 +3373,8 @@@ static inline int kiocb_set_rw_flags(st
                 return 0;
         if (unlikely(flags & ~RWF_SUPPORTED))
                 return -EOPNOTSUPP;
+       if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND)))
+               return -EINVAL;
   
         if (flags & RWF_NOWAIT) {
                 if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
@@@ -3344,6 -3386,12 +3385,12 @@@
         if (flags & RWF_SYNC)
                 kiocb_flags |= IOCB_DSYNC;
   
+       if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) {
+               if (IS_APPEND(file_inode(ki->ki_filp)))
+                       return -EPERM;
+               ki->ki_flags &= ~IOCB_APPEND;
+       }
+ 
         ki->ki_flags |= kiocb_flags;
         return 0;
   }
diff --combined lib/iov_iter.c

index cf2eb2b2f983797190a4000f5a6a388ac23aae36,73715d10c812bfb1d7612408453792f2d8ab89f9..4a6a9f419bd7eb8cf1370eed5f42c98eb6914f3e
--- 1/lib/iov_iter.c
--- 2/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@@ -166,6 -166,7 +166,6 @@@ void iov_iter_init(struct iov_iter *i, 
         WARN_ON(direction & ~(READ | WRITE));
         *i = (struct iov_iter) {
                 .iter_type = ITER_IOVEC,
- -              .copy_mc = false,
                 .nofault = false,
                 .data_source = direction,
                 .__iov = iov,
@@@ -243,9 -244,27 +243,9 @@@ size_t _copy_mc_to_iter(const void *add
   EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
   #endif /* CONFIG_ARCH_HAS_COPY_MC */
   
- -static __always_inline
- -size_t memcpy_from_iter_mc(void *iter_from, size_t progress,
- -                         size_t len, void *to, void *priv2)
- -{
- -      return copy_mc_to_kernel(to + progress, iter_from, len);
- -}
- -
- -static size_t __copy_from_iter_mc(void *addr, size_t bytes, struct iov_iter *i)
- -{
- -      if (unlikely(i->count < bytes))
- -              bytes = i->count;
- -      if (unlikely(!bytes))
- -              return 0;
- -      return iterate_bvec(i, bytes, addr, NULL, memcpy_from_iter_mc);
- -}
- -
   static __always_inline
   size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
   {
- -      if (unlikely(iov_iter_is_copy_mc(i)))
- -              return __copy_from_iter_mc(addr, bytes, i);
         return iterate_and_advance(i, bytes, addr,
                                    copy_from_user_iter, memcpy_from_iter);
   }
@@@ -614,6 -633,7 +614,6 @@@ void iov_iter_kvec(struct iov_iter *i, 
         WARN_ON(direction & ~(READ | WRITE));
         *i = (struct iov_iter){
                 .iter_type = ITER_KVEC,
- -              .copy_mc = false,
                 .data_source = direction,
                 .kvec = kvec,
                 .nr_segs = nr_segs,
@@@ -630,6 -650,7 +630,6 @@@ void iov_iter_bvec(struct iov_iter *i, 
         WARN_ON(direction & ~(READ | WRITE));
         *i = (struct iov_iter){
                 .iter_type = ITER_BVEC,
- -              .copy_mc = false,
                 .data_source = direction,
                 .bvec = bvec,
                 .nr_segs = nr_segs,
@@@ -658,6 -679,7 +658,6 @@@ void iov_iter_xarray(struct iov_iter *i
         BUG_ON(direction & ~1);
         *i = (struct iov_iter) {
                 .iter_type = ITER_XARRAY,
- -              .copy_mc = false,
                 .data_source = direction,
                 .xarray = xarray,
                 .xarray_start = start,
@@@ -681,6 -703,7 +681,6 @@@ void iov_iter_discard(struct iov_iter *
         BUG_ON(direction != READ);
         *i = (struct iov_iter){
                 .iter_type = ITER_DISCARD,
- -              .copy_mc = false,
                 .data_source = false,
                 .count = count,
                 .iov_offset = 0
@@@ -691,12 -714,11 +691,11 @@@ EXPORT_SYMBOL(iov_iter_discard)
   static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
                                    unsigned len_mask)
   {
+       const struct iovec *iov = iter_iov(i);
         size_t size = i->count;
         size_t skip = i->iov_offset;
-       unsigned k;
   
-       for (k = 0; k < i->nr_segs; k++, skip = 0) {
-               const struct iovec *iov = iter_iov(i) + k;
+       do {
                 size_t len = iov->iov_len - skip;
   
                 if (len > size)
@@@ -706,34 -728,36 +705,36 @@@
                 if ((unsigned long)(iov->iov_base + skip) & addr_mask)
                         return false;
   
+               iov++;
                 size -= len;
-               if (!size)
-                       break;
-       }
+               skip = 0;
+       } while (size);
+ 
         return true;
   }
   
   static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
                                   unsigned len_mask)
   {
-       size_t size = i->count;
+       const struct bio_vec *bvec = i->bvec;
         unsigned skip = i->iov_offset;
-       unsigned k;
+       size_t size = i->count;
   
-       for (k = 0; k < i->nr_segs; k++, skip = 0) {
-               size_t len = i->bvec[k].bv_len - skip;
+       do {
+               size_t len = bvec->bv_len;
   
                 if (len > size)
                         len = size;
                 if (len & len_mask)
                         return false;
-               if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask)
+               if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
                         return false;
   
+               bvec++;
                 size -= len;
-               if (!size)
-                       break;
-       }
+               skip = 0;
+       } while (size);
+ 
         return true;
   }
   
@@@ -777,13 -801,12 +778,12 @@@ EXPORT_SYMBOL_GPL(iov_iter_is_aligned)
   
   static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
   {
+       const struct iovec *iov = iter_iov(i);
         unsigned long res = 0;
         size_t size = i->count;
         size_t skip = i->iov_offset;
-       unsigned k;
   
-       for (k = 0; k < i->nr_segs; k++, skip = 0) {
-               const struct iovec *iov = iter_iov(i) + k;
+       do {
                 size_t len = iov->iov_len - skip;
                 if (len) {
                         res |= (unsigned long)iov->iov_base + skip;
@@@ -791,30 -814,31 +791,31 @@@
                                 len = size;
                         res |= len;
                         size -= len;
-                       if (!size)
-                               break;
                 }
-       }
+               iov++;
+               skip = 0;
+       } while (size);
         return res;
   }
   
   static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
   {
+       const struct bio_vec *bvec = i->bvec;
         unsigned res = 0;
         size_t size = i->count;
         unsigned skip = i->iov_offset;
-       unsigned k;
   
-       for (k = 0; k < i->nr_segs; k++, skip = 0) {
-               size_t len = i->bvec[k].bv_len - skip;
-               res |= (unsigned long)i->bvec[k].bv_offset + skip;
+       do {
+               size_t len = bvec->bv_len - skip;
+               res |= (unsigned long)bvec->bv_offset + skip;
                 if (len > size)
                         len = size;
                 res |= len;
+               bvec++;
                 size -= len;
-               if (!size)
-                       break;
-       }
+               skip = 0;
+       } while (size);
+ 
         return res;
   }
   
@@@ -1143,11 -1167,12 +1144,12 @@@ const void *dup_iter(struct iov_iter *n
   EXPORT_SYMBOL(dup_iter);
   
   static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
-               const struct iovec __user *uvec, unsigned long nr_segs)
+               const struct iovec __user *uvec, u32 nr_segs)
   {
         const struct compat_iovec __user *uiov =
                 (const struct compat_iovec __user *)uvec;
-       int ret = -EFAULT, i;
+       int ret = -EFAULT;
+       u32 i;
   
         if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
                 return -EFAULT;
diff --combined mm/backing-dev.c

index e039d05304dd9ca52da735962c0ef951fb448ec5,039dc74b505a850846afe681d40ad59855fb8bd1..5f2be8c8df11f1ba31a1c2ea78be76651eb98747
--- 1/mm/backing-dev.c
--- 2/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@@ -372,31 -372,6 +372,6 @@@ static int __init default_bdi_init(void
   }
   subsys_initcall(default_bdi_init);
   
- /*
-  * This function is used when the first inode for this wb is marked dirty. It
-  * wakes-up the corresponding bdi thread which should then take care of the
-  * periodic background write-out of dirty inodes. Since the write-out would
-  * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
-  * set up a timer which wakes the bdi thread up later.
-  *
-  * Note, we wouldn't bother setting up the timer, but this function is on the
-  * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
-  * by delaying the wake-up.
-  *
-  * We have to be careful not to postpone flush work if it is scheduled for
-  * earlier. Thus we use queue_delayed_work().
-  */
- void wb_wakeup_delayed(struct bdi_writeback *wb)
- {
-       unsigned long timeout;
- 
-       timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-       spin_lock_irq(&wb->work_lock);
-       if (test_bit(WB_registered, &wb->state))
-               queue_delayed_work(bdi_wq, &wb->dwork, timeout);
-       spin_unlock_irq(&wb->work_lock);
- }
- 
   static void wb_update_bandwidth_workfn(struct work_struct *work)
   {
         struct bdi_writeback *wb = container_of(to_delayed_work(work),
@@@ -436,6 -411,7 +411,6 @@@ static int wb_init(struct bdi_writebac
         INIT_LIST_HEAD(&wb->work_list);
         INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
         INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- -      wb->dirty_sleep = jiffies;
   
         err = fprop_local_init_percpu(&wb->completions, gfp);
         if (err)
@@@ -920,7 -896,6 +895,7 @@@ int bdi_init(struct backing_dev_info *b
         INIT_LIST_HEAD(&bdi->bdi_list);
         INIT_LIST_HEAD(&bdi->wb_list);
         init_waitqueue_head(&bdi->wb_waitq);
+ +      bdi->last_bdp_sleep = jiffies;
   
         return cgwb_bdi_init(bdi);
   }
diff --combined mm/filemap.c

index 4a30de98a8c75daec31d1d79d15a9d9514e9fd1d,a72dd2eafd5acec830946091c3e11ba0b13f75b1..8df4797c5287fa748aef1c26dd63f92dc24f03fd
--- 1/mm/filemap.c
--- 2/mm/filemap.c
+++ b/mm/filemap.c
@@@ -2608,15 -2608,6 +2608,6 @@@ ssize_t filemap_read(struct kiocb *iocb
                         goto put_folios;
                 end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
   
-               /*
-                * Pairs with a barrier in
-                * block_write_end()->mark_buffer_dirty() or other page
-                * dirtying routines like iomap_write_end() to ensure
-                * changes to page contents are visible before we see
-                * increased inode size.
-                */
-               smp_rmb();
- 
                 /*
                  * Once we start copying data, we don't want to be touching any
                  * cachelines that might be contended:
@@@ -4111,40 -4102,28 +4102,40 @@@ static void filemap_cachestat(struct ad
   
         rcu_read_lock();
         xas_for_each(&xas, folio, last_index) {
+ +              int order;
                 unsigned long nr_pages;
                 pgoff_t folio_first_index, folio_last_index;
   
+ +              /*
+ +               * Don't deref the folio. It is not pinned, and might
+ +               * get freed (and reused) underneath us.
+ +               *
+ +               * We *could* pin it, but that would be expensive for
+ +               * what should be a fast and lightweight syscall.
+ +               *
+ +               * Instead, derive all information of interest from
+ +               * the rcu-protected xarray.
+ +               */
+ +
                 if (xas_retry(&xas, folio))
                         continue;
   
+ +              order = xa_get_order(xas.xa, xas.xa_index);
+ +              nr_pages = 1 << order;
+ +              folio_first_index = round_down(xas.xa_index, 1 << order);
+ +              folio_last_index = folio_first_index + nr_pages - 1;
+ +
+ +              /* Folios might straddle the range boundaries, only count covered pages */
+ +              if (folio_first_index < first_index)
+ +                      nr_pages -= first_index - folio_first_index;
+ +
+ +              if (folio_last_index > last_index)
+ +                      nr_pages -= folio_last_index - last_index;
+ +
                 if (xa_is_value(folio)) {
                         /* page is evicted */
                         void *shadow = (void *)folio;
                         bool workingset; /* not used */
- -                      int order = xa_get_order(xas.xa, xas.xa_index);
- -
- -                      nr_pages = 1 << order;
- -                      folio_first_index = round_down(xas.xa_index, 1 << order);
- -                      folio_last_index = folio_first_index + nr_pages - 1;
- -
- -                      /* Folios might straddle the range boundaries, only count covered pages */
- -                      if (folio_first_index < first_index)
- -                              nr_pages -= first_index - folio_first_index;
- -
- -                      if (folio_last_index > last_index)
- -                              nr_pages -= folio_last_index - last_index;
   
                         cs->nr_evicted += nr_pages;
   
@@@ -4162,13 -4141,24 +4153,13 @@@
                         goto resched;
                 }
   
- -              nr_pages = folio_nr_pages(folio);
- -              folio_first_index = folio_pgoff(folio);
- -              folio_last_index = folio_first_index + nr_pages - 1;
- -
- -              /* Folios might straddle the range boundaries, only count covered pages */
- -              if (folio_first_index < first_index)
- -                      nr_pages -= first_index - folio_first_index;
- -
- -              if (folio_last_index > last_index)
- -                      nr_pages -= folio_last_index - last_index;
- -
                 /* page is in cache */
                 cs->nr_cache += nr_pages;
   
- -              if (folio_test_dirty(folio))
+ +              if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
                         cs->nr_dirty += nr_pages;
   
- -              if (folio_test_writeback(folio))
+ +              if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
                         cs->nr_writeback += nr_pages;
   
   resched:
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 11 Mar 2024 16:38:17 +0000 (09:38 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 11 Mar 2024 16:38:17 +0000 (09:38 -0700)
		1	2
fs/dcache.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ext4/super.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/hugetlbfs/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/namei.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/ntfs3/namei.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/overlayfs/super.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/proc/inode.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/dcache.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/fs.h	patch \|	diff1 \|	diff2 \|	blob \| history
lib/iov_iter.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/backing-dev.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/filemap.c	patch \|	diff1 \|	diff2 \|	blob \| history