pidfs: persist information

author Christian Brauner <brauner@kernel.org>

Wed, 18 Jun 2025 20:53:39 +0000 (22:53 +0200)

committer Christian Brauner <brauner@kernel.org>

Thu, 19 Jun 2025 12:28:24 +0000 (14:28 +0200)
author Christian Brauner <brauner@kernel.org>
Wed, 18 Jun 2025 20:53:39 +0000 (22:53 +0200)
committer Christian Brauner <brauner@kernel.org>
Thu, 19 Jun 2025 12:28:24 +0000 (14:28 +0200)
diff --git a/fs/pidfs.c b/fs/pidfs.c

index ff2560b34ed1794663d046e1f1f5c28ca51b6864..6a907457b1fe740133306d88e462a49e6fdc495d 100644 (file)
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -25,7 +25,10 @@
  #include "internal.h"
  #include "mount.h"
  
+#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
+
  static struct kmem_cache *pidfs_cachep __ro_after_init;
+static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
  
  /*
   * Stashes information that userspace needs to access even after the
@@ -37,6 +40,11 @@ struct pidfs_exit_info {
         __u32 coredump_mask;
  };
  
+struct pidfs_attr {
+       struct pidfs_exit_info __pei;
+       struct pidfs_exit_info *exit_info;
+};
+
  struct pidfs_inode {
         struct pidfs_exit_info __pei;
         struct pidfs_exit_info *exit_info;
@@ -125,6 +133,7 @@ void pidfs_add_pid(struct pid *pid)
  
         pid->ino = pidfs_ino_nr;
         pid->stashed = NULL;
+       pid->attr = NULL;
         pidfs_ino_nr++;
  
         write_seqcount_begin(&pidmap_lock_seq);
@@ -139,6 +148,18 @@ void pidfs_remove_pid(struct pid *pid)
         write_seqcount_end(&pidmap_lock_seq);
  }
  
+void pidfs_free_pid(struct pid *pid)
+{
+       /*
+        * Any dentry must've been wiped from the pid by now.
+        * Otherwise there's a reference count bug.
+        */
+       VFS_WARN_ON_ONCE(pid->stashed);
+
+       if (!IS_ERR(pid->attr))
+               kfree(pid->attr);
+}
+
  #ifdef CONFIG_PROC_FS
  /**
   * pidfd_show_fdinfo - print information about a pidfd
@@ -261,13 +282,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
  static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
  {
         struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
-       struct inode *inode = file_inode(file);
         struct pid *pid = pidfd_pid(file);
         size_t usize = _IOC_SIZE(cmd);
         struct pidfd_info kinfo = {};
         struct pidfs_exit_info *exit_info;
         struct user_namespace *user_ns;
         struct task_struct *task;
+       struct pidfs_attr *attr;
         const struct cred *c;
         __u64 mask;
  
@@ -286,8 +307,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
         if (!pid_in_current_pidns(pid))
                 return -ESRCH;
  
+       attr = READ_ONCE(pid->attr);
         if (mask & PIDFD_INFO_EXIT) {
-               exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+               exit_info = READ_ONCE(attr->exit_info);
                 if (exit_info) {
                         kinfo.mask |= PIDFD_INFO_EXIT;
  #ifdef CONFIG_CGROUPS
@@ -300,7 +322,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
  
         if (mask & PIDFD_INFO_COREDUMP) {
                 kinfo.mask |= PIDFD_INFO_COREDUMP;
-               kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
+               kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
         }
  
         task = get_pid_task(pid, PIDTYPE_PID);
@@ -552,41 +574,61 @@ struct pid *pidfd_pid(const struct file *file)
   * task has been reaped which cannot happen until we're out of
   * release_task().
   *
- * If this struct pid is referred to by a pidfd then
- * stashed_dentry_get() will return the dentry and inode for that struct
- * pid. Since we've taken a reference on it there's now an additional
- * reference from the exit path on it. Which is fine. We're going to put
- * it again in a second and we know that the pid is kept alive anyway.
+ * If this struct pid has at least once been referred to by a pidfd then
+ * pid->attr will be allocated. If not we mark the struct pid as dead so
+ * anyone who is trying to register it with pidfs will fail to do so.
+ * Otherwise we would hand out pidfs for reaped tasks without having
+ * exit information available.
   *
- * Worst case is that we've filled in the info and immediately free the
- * dentry and inode afterwards since the pidfd has been closed. Since
+ * Worst case is that we've filled in the info and the pid gets freed
+ * right away in free_pid() when no one holds a pidfd anymore. Since
   * pidfs_exit() currently is placed after exit_task_work() we know that
- * it cannot be us aka the exiting task holding a pidfd to ourselves.
+ * it cannot be us aka the exiting task holding a pidfd to itself.
   */
  void pidfs_exit(struct task_struct *tsk)
  {
-       struct dentry *dentry;
+       struct pid *pid = task_pid(tsk);
+       struct pidfs_attr *attr;
+       struct pidfs_exit_info *exit_info;
+#ifdef CONFIG_CGROUPS
+       struct cgroup *cgrp;
+#endif
  
         might_sleep();
  
-       dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
-       if (dentry) {
-               struct inode *inode = d_inode(dentry);
-               struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
-#ifdef CONFIG_CGROUPS
-               struct cgroup *cgrp;
+       guard(spinlock_irq)(&pid->wait_pidfd.lock);
+       attr = pid->attr;
+       if (!attr) {
+               /*
+                * No one ever held a pidfd for this struct pid.
+                * Mark it as dead so no one can add a pidfs
+                * entry anymore. We're about to be reaped and
+                * so no exit information would be available.
+                */
+               pid->attr = PIDFS_PID_DEAD;
+               return;
+       }
  
-               rcu_read_lock();
-               cgrp = task_dfl_cgroup(tsk);
-               exit_info->cgroupid = cgroup_id(cgrp);
-               rcu_read_unlock();
+       /*
+        * If @pid->attr is set someone might still legitimately hold a
+        * pidfd to @pid or someone might concurrently still be getting
+        * a reference to an already stashed dentry from @pid->stashed.
+        * So defer cleaning @pid->attr until the last reference to @pid
+        * is put
+        */
+
+       exit_info = &attr->__pei;
+
+#ifdef CONFIG_CGROUPS
+       rcu_read_lock();
+       cgrp = task_dfl_cgroup(tsk);
+       exit_info->cgroupid = cgroup_id(cgrp);
+       rcu_read_unlock();
  #endif
-               exit_info->exit_code = tsk->exit_code;
+       exit_info->exit_code = tsk->exit_code;
  
-               /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
-               smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
-               dput(dentry);
-       }
+       /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+       smp_store_release(&attr->exit_info, &attr->__pei);
  }
  
  #ifdef CONFIG_COREDUMP
@@ -594,16 +636,15 @@ void pidfs_coredump(const struct coredump_params *cprm)
  {
         struct pid *pid = cprm->pid;
         struct pidfs_exit_info *exit_info;
-       struct dentry *dentry;
-       struct inode *inode;
+       struct pidfs_attr *attr;
         __u32 coredump_mask = 0;
  
-       dentry = pid->stashed;
-       if (WARN_ON_ONCE(!dentry))
-               return;
+       attr = READ_ONCE(pid->attr);
  
-       inode = d_inode(dentry);
-       exit_info = &pidfs_i(inode)->__pei;
+       VFS_WARN_ON_ONCE(!attr);
+       VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
+
+       exit_info = &attr->__pei;
         /* Note how we were coredumped. */
         coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
         /* Note that we actually did coredump. */
@@ -663,7 +704,7 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb)
  
  static void pidfs_free_inode(struct inode *inode)
  {
-       kmem_cache_free(pidfs_cachep, pidfs_i(inode));
+       kfree(pidfs_i(inode));
  }
  
  static const struct super_operations pidfs_sops = {
@@ -831,8 +872,13 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
          * recorded and published can be handled correctly.
          */
         if (unlikely(!pid_has_task(pid, type))) {
-               struct inode *inode = d_inode(path->dentry);
-               return !!READ_ONCE(pidfs_i(inode)->exit_info);
+               struct pidfs_attr *attr;
+
+               attr = READ_ONCE(pid->attr);
+               if (!attr)
+                       return false;
+               if (!READ_ONCE(attr->exit_info))
+                       return false;
         }
  
         return true;
@@ -878,9 +924,67 @@ static void pidfs_put_data(void *data)
         put_pid(pid);
  }
  
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs. Needs to be paired with
+ * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+       struct pidfs_attr *new_attr __free(kfree) = NULL;
+       struct pidfs_attr *attr;
+
+       might_sleep();
+
+       if (!pid)
+               return 0;
+
+       attr = READ_ONCE(pid->attr);
+       if (unlikely(attr == PIDFS_PID_DEAD))
+               return PTR_ERR(PIDFS_PID_DEAD);
+       if (attr)
+               return 0;
+
+       new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
+       if (!new_attr)
+               return -ENOMEM;
+
+       /* Synchronize with pidfs_exit(). */
+       guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+       attr = pid->attr;
+       if (unlikely(attr == PIDFS_PID_DEAD))
+               return PTR_ERR(PIDFS_PID_DEAD);
+       if (unlikely(attr))
+               return 0;
+
+       pid->attr = no_free_ptr(new_attr);
+       return 0;
+}
+
+static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
+                                        struct dentry *dentry)
+{
+       int ret;
+       struct pid *pid = d_inode(dentry)->i_private;
+
+       VFS_WARN_ON_ONCE(stashed != &pid->stashed);
+
+       ret = pidfs_register_pid(pid);
+       if (ret)
+               return ERR_PTR(ret);
+
+       return stash_dentry(stashed, dentry);
+}
+
  static const struct stashed_operations pidfs_stashed_ops = {
-       .init_inode = pidfs_init_inode,
-       .put_data = pidfs_put_data,
+       .stash_dentry   = pidfs_stash_dentry,
+       .init_inode     = pidfs_init_inode,
+       .put_data       = pidfs_put_data,
  };
  
  static int pidfs_init_fs_context(struct fs_context *fc)
@@ -936,33 +1040,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
         return pidfd_file;
  }
  
-/**
- * pidfs_register_pid - register a struct pid in pidfs
- * @pid: pid to pin
- *
- * Register a struct pid in pidfs. Needs to be paired with
- * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
- *
- * Return: On success zero, on error a negative error code is returned.
- */
-int pidfs_register_pid(struct pid *pid)
-{
-       struct path path __free(path_put) = {};
-       int ret;
-
-       might_sleep();
-
-       if (!pid)
-               return 0;
-
-       ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
-       if (unlikely(ret))
-               return ret;
-       /* Keep the dentry and only put the reference to the mount. */
-       path.dentry = NULL;
-       return 0;
-}
-
  /**
   * pidfs_get_pid - pin a struct pid through pidfs
   * @pid: pid to pin
@@ -1008,6 +1085,9 @@ void __init pidfs_init(void)
                                          (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
                                           SLAB_ACCOUNT | SLAB_PANIC),
                                          pidfs_inode_init_once);
+       pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
+                                        (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+                                         SLAB_ACCOUNT | SLAB_PANIC), NULL);
         pidfs_mnt = kern_mount(&pidfs_type);
         if (IS_ERR(pidfs_mnt))
                 panic("Failed to mount pidfs pseudo filesystem");
diff --git a/include/linux/pid.h b/include/linux/pid.h

index 00646a692dd451971bb414826560ebe6f6ad9a6d..003a1027d219f08d91abc1a2939835731b824240 100644 (file)
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -47,6 +47,8 @@
  
  #define RESERVED_PIDS 300
  
+struct pidfs_attr;
+
  struct upid {
         int nr;
         struct pid_namespace *ns;
@@ -60,6 +62,7 @@ struct pid {
                 u64 ino;
                 struct rb_node pidfs_node;
                 struct dentry *stashed;
+               struct pidfs_attr *attr;
         };
         /* lists of tasks that use this pid */
         struct hlist_head tasks[PIDTYPE_MAX];
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h

index 77e7db194914601a519c4683e6a777e4b9c281d9..8f6ed59bb3fb69969d826a1592ac68fb0330f8f2 100644 (file)
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -16,5 +16,6 @@ extern const struct dentry_operations pidfs_dentry_operations;
  int pidfs_register_pid(struct pid *pid);
  void pidfs_get_pid(struct pid *pid);
  void pidfs_put_pid(struct pid *pid);
+void pidfs_free_pid(struct pid *pid);
  
  #endif /* _LINUX_PID_FS_H */
diff --git a/kernel/pid.c b/kernel/pid.c

index 8317bcbc7cf7daa88602bbd9ea912934ca2c4ca2..07db7d8d066c64f8cbf9833efd4f3a18b751b0d1 100644 (file)
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -100,7 +100,7 @@ void put_pid(struct pid *pid)
  
         ns = pid->numbers[pid->level].ns;
         if (refcount_dec_and_test(&pid->count)) {
-               WARN_ON_ONCE(pid->stashed);
+               pidfs_free_pid(pid);
                 kmem_cache_free(ns->pid_cachep, pid);
                 put_pid_ns(ns);
         }
author	Christian Brauner <brauner@kernel.org>
	Wed, 18 Jun 2025 20:53:39 +0000 (22:53 +0200)
committer	Christian Brauner <brauner@kernel.org>
	Thu, 19 Jun 2025 12:28:24 +0000 (14:28 +0200)
fs/pidfs.c		patch \| blob \| blame \| history
include/linux/pid.h		patch \| blob \| blame \| history
include/linux/pidfs.h		patch \| blob \| blame \| history
kernel/pid.c		patch \| blob \| blame \| history