#include "internal.h"
#include "mount.h"
+#define PIDFS_PID_DEAD ERR_PTR(-ESRCH) /* pid->attr marker: task exited before any attr was allocated */
+
static struct kmem_cache *pidfs_cachep __ro_after_init;
+static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
/*
* Stashes information that userspace needs to access even after the
__u32 coredump_mask;
};
+struct pidfs_attr { /* per-pid pidfs state, reached through pid->attr */
+ struct pidfs_exit_info __pei; /* backing storage for exit/coredump information */
+ struct pidfs_exit_info *exit_info; /* NULL until pidfs_exit() publishes &__pei */
+};
+
struct pidfs_inode {
struct pidfs_exit_info __pei;
struct pidfs_exit_info *exit_info;
pid->ino = pidfs_ino_nr;
pid->stashed = NULL;
+ pid->attr = NULL;
pidfs_ino_nr++;
write_seqcount_begin(&pidmap_lock_seq);
write_seqcount_end(&pidmap_lock_seq);
}
+/**
+ * pidfs_free_pid - release the pidfs attributes hanging off a struct pid
+ * @pid: struct pid whose pidfs state is torn down
+ *
+ * Hands @pid->attr to kfree() unless it holds an error pointer such as
+ * PIDFS_PID_DEAD, which was never allocated.
+ */
+void pidfs_free_pid(struct pid *pid)
+{
+	struct pidfs_attr *attr;
+
+	/*
+	 * A dentry that is still stashed on the pid at this point means
+	 * that a reference count went wrong somewhere.
+	 */
+	VFS_WARN_ON_ONCE(pid->stashed);
+
+	attr = pid->attr;
+	if (IS_ERR(attr))
+		return;
+
+	kfree(attr);
+}
+
#ifdef CONFIG_PROC_FS
/**
* pidfd_show_fdinfo - print information about a pidfd
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
- struct inode *inode = file_inode(file);
struct pid *pid = pidfd_pid(file);
size_t usize = _IOC_SIZE(cmd);
struct pidfd_info kinfo = {};
struct pidfs_exit_info *exit_info;
struct user_namespace *user_ns;
struct task_struct *task;
+ struct pidfs_attr *attr;
const struct cred *c;
__u64 mask;
if (!pid_in_current_pidns(pid))
return -ESRCH;
+ attr = READ_ONCE(pid->attr);
if (mask & PIDFD_INFO_EXIT) {
- exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+ exit_info = READ_ONCE(attr->exit_info);
if (exit_info) {
kinfo.mask |= PIDFD_INFO_EXIT;
#ifdef CONFIG_CGROUPS
if (mask & PIDFD_INFO_COREDUMP) {
kinfo.mask |= PIDFD_INFO_COREDUMP;
- kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
+ kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
}
task = get_pid_task(pid, PIDTYPE_PID);
* task has been reaped which cannot happen until we're out of
* release_task().
*
- * If this struct pid is referred to by a pidfd then
- * stashed_dentry_get() will return the dentry and inode for that struct
- * pid. Since we've taken a reference on it there's now an additional
- * reference from the exit path on it. Which is fine. We're going to put
- * it again in a second and we know that the pid is kept alive anyway.
+ * If this struct pid has at least once been referred to by a pidfd then
+ * pid->attr will be allocated. If not we mark the struct pid as dead so
+ * anyone who is trying to register it with pidfs will fail to do so.
+ * Otherwise we would hand out pidfds for reaped tasks without having
+ * exit information available.
*
- * Worst case is that we've filled in the info and immediately free the
- * dentry and inode afterwards since the pidfd has been closed. Since
+ * Worst case is that we've filled in the info and the pid gets freed
+ * right away in free_pid() when no one holds a pidfd anymore. Since
* pidfs_exit() currently is placed after exit_task_work() we know that
- * it cannot be us aka the exiting task holding a pidfd to ourselves.
+ * it cannot be us aka the exiting task holding a pidfd to itself.
*/
void pidfs_exit(struct task_struct *tsk)
{
- struct dentry *dentry;
+ struct pid *pid = task_pid(tsk);
+ struct pidfs_attr *attr;
+ struct pidfs_exit_info *exit_info;
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgrp;
+#endif
might_sleep();
- dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
- if (dentry) {
- struct inode *inode = d_inode(dentry);
- struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
-#ifdef CONFIG_CGROUPS
- struct cgroup *cgrp;
+ guard(spinlock_irq)(&pid->wait_pidfd.lock); /* same lock pidfs_register_pid() takes */
+ attr = pid->attr;
+ if (!attr) {
+ /*
+ * No one ever held a pidfd for this struct pid.
+ * Mark it as dead so no one can add a pidfs
+ * entry anymore. We're about to be reaped and
+ * so no exit information would be available.
+ */
+ pid->attr = PIDFS_PID_DEAD;
+ return;
+ }
- rcu_read_lock();
- cgrp = task_dfl_cgroup(tsk);
- exit_info->cgroupid = cgroup_id(cgrp);
- rcu_read_unlock();
+ /*
+ * If @pid->attr is set someone might still legitimately hold a
+ * pidfd to @pid or someone might concurrently still be getting
+ * a reference to an already stashed dentry from @pid->stashed.
+ * So defer cleaning @pid->attr until the last reference to @pid
+ * is put.
+ *
+ * Fill in the embedded @attr->__pei first; it only becomes
+ * visible to readers once @attr->exit_info is set below.
+ */
+
+ exit_info = &attr->__pei;
+
+#ifdef CONFIG_CGROUPS
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(tsk);
+ exit_info->cgroupid = cgroup_id(cgrp);
+ rcu_read_unlock();
#endif
- exit_info->exit_code = tsk->exit_code;
+ exit_info->exit_code = tsk->exit_code;
- /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
- smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
- dput(dentry);
- }
+ /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+ smp_store_release(&attr->exit_info, &attr->__pei);
}
#ifdef CONFIG_COREDUMP
{
struct pid *pid = cprm->pid;
struct pidfs_exit_info *exit_info;
- struct dentry *dentry;
- struct inode *inode;
+ struct pidfs_attr *attr;
__u32 coredump_mask = 0;
- dentry = pid->stashed;
- if (WARN_ON_ONCE(!dentry))
- return;
+ attr = READ_ONCE(pid->attr);
- inode = d_inode(dentry);
- exit_info = &pidfs_i(inode)->__pei;
+ VFS_WARN_ON_ONCE(!attr);
+ VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
+
+ exit_info = &attr->__pei;
/* Note how we were coredumped. */
coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
/* Note that we actually did coredump. */
static void pidfs_free_inode(struct inode *inode)
{
- kmem_cache_free(pidfs_cachep, pidfs_i(inode));
+ kfree(pidfs_i(inode)); /* NOTE(review): was freed to pidfs_cachep before; confirm kfree() is intended */
}
static const struct super_operations pidfs_sops = {
* recorded and published can be handled correctly.
*/
if (unlikely(!pid_has_task(pid, type))) {
- struct inode *inode = d_inode(path->dentry);
- return !!READ_ONCE(pidfs_i(inode)->exit_info);
+ struct pidfs_attr *attr;
+
+ attr = READ_ONCE(pid->attr);
+ if (!attr)
+ return false;
+ if (!READ_ONCE(attr->exit_info))
+ return false;
}
return true;
put_pid(pid);
}
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to register, may be NULL
+ *
+ * Make sure @pid->attr is allocated so that information about @pid can
+ * be recorded in it. If @pid->attr is already set nothing is allocated.
+ * If pidfs_exit() has marked @pid as PIDFS_PID_DEAD registration is
+ * refused.
+ *
+ * The unlocked check is repeated under @pid->wait_pidfd.lock because
+ * pidfs_exit() or a concurrent caller may have changed @pid->attr while
+ * the allocation was in progress.
+ *
+ * NOTE(review): this comment used to require pairing with
+ * pidfs_put_pid(). Nothing in this function takes a dentry or inode
+ * reference and @pid->attr is freed by pidfs_free_pid() - confirm that
+ * callers were adjusted accordingly.
+ *
+ * Return: Zero if @pid is NULL or @pid->attr is allocated on return,
+ * -ESRCH if @pid was marked PIDFS_PID_DEAD, -ENOMEM if the allocation
+ * failed.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+ struct pidfs_attr *new_attr __free(kfree) = NULL;
+ struct pidfs_attr *attr;
+
+ might_sleep();
+
+ if (!pid)
+ return 0;
+
+ attr = READ_ONCE(pid->attr);
+ if (unlikely(attr == PIDFS_PID_DEAD))
+ return PTR_ERR(PIDFS_PID_DEAD);
+ if (attr)
+ return 0;
+
+ new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
+ if (!new_attr)
+ return -ENOMEM;
+
+ /* Synchronize with pidfs_exit(). */
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+ attr = pid->attr;
+ if (unlikely(attr == PIDFS_PID_DEAD))
+ return PTR_ERR(PIDFS_PID_DEAD);
+ if (unlikely(attr))
+ return 0; /* someone else registered first; new_attr is dropped via __free(kfree) */
+
+ pid->attr = no_free_ptr(new_attr); /* ownership of the allocation moves to @pid */
+ return 0;
+}
+
+/*
+ * Stash hook for pidfs: make sure the pid behind @dentry is registered
+ * with pidfs before @dentry is stashed. If registration fails the error
+ * is returned as an error pointer and nothing is stashed.
+ */
+static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
+					 struct dentry *dentry)
+{
+	struct pid *pid = d_inode(dentry)->i_private;
+	int err;
+
+	/* The stash location handed in must be the one embedded in @pid. */
+	VFS_WARN_ON_ONCE(stashed != &pid->stashed);
+
+	err = pidfs_register_pid(pid);
+
+	return err ? ERR_PTR(err) : stash_dentry(stashed, dentry);
+}
+
static const struct stashed_operations pidfs_stashed_ops = {
- .init_inode = pidfs_init_inode,
- .put_data = pidfs_put_data,
+ .stash_dentry = pidfs_stash_dentry, /* registers the pid with pidfs before stashing */
+ .init_inode = pidfs_init_inode,
+ .put_data = pidfs_put_data,
};
static int pidfs_init_fs_context(struct fs_context *fc)
return pidfd_file;
}
-/**
- * pidfs_register_pid - register a struct pid in pidfs
- * @pid: pid to pin
- *
- * Register a struct pid in pidfs. Needs to be paired with
- * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
- *
- * Return: On success zero, on error a negative error code is returned.
- */
-int pidfs_register_pid(struct pid *pid)
-{
- struct path path __free(path_put) = {};
- int ret;
-
- might_sleep();
-
- if (!pid)
- return 0;
-
- ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
- if (unlikely(ret))
- return ret;
- /* Keep the dentry and only put the reference to the mount. */
- path.dentry = NULL;
- return 0;
-}
-
/**
* pidfs_get_pid - pin a struct pid through pidfs
* @pid: pid to pin
(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
SLAB_ACCOUNT | SLAB_PANIC),
pidfs_inode_init_once);
+ pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
+ (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT | SLAB_PANIC), NULL);
pidfs_mnt = kern_mount(&pidfs_type);
if (IS_ERR(pidfs_mnt))
panic("Failed to mount pidfs pseudo filesystem");