pidns: Make the pidns proc mount/umount logic obvious.
author: Eric W. Biederman <ebiederm@xmission.com>
Wed, 1 Aug 2012 17:33:47 +0000 (10:33 -0700)
committer: Eric W. Biederman <ebiederm@xmission.com>
Mon, 19 Nov 2012 13:59:10 +0000 (05:59 -0800)
Track the number of pids in the proc hash table.  When the number of
pids goes to 0 schedule work to unmount the kernel mount of proc.

Move the mount of proc into alloc_pid when we allocate the pid for
init.

Remove the surprising calls of pid_ns_release_proc in fork and
proc_flush_task.  Those code paths really shouldn't know about proc
namespace implementation details and people have demonstrated several
times that finding and understanding those code paths is difficult and
non-obvious.

Because of the call path, detach_pid is always called with the
rtnl_lock held, so free_pid is not allowed to sleep; the work of
unmounting proc is therefore moved to a work queue.  This has the side benefit
of not blocking the entire world waiting for the unnecessary
rcu_barrier in deactivate_locked_super.

In the process of making the code clear and obvious this fixes a bug
reported by Gao feng <gaofeng@cn.fujitsu.com> where we would leak a
mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns
succeeded and copy_net_ns failed.

Acked-by: "Serge E. Hallyn" <serge@hallyn.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
fs/proc/base.c
fs/proc/root.c
include/linux/pid_namespace.h
kernel/fork.c
kernel/pid.c
kernel/pid_namespace.c

index 6177fc238fdb0c9f13f2e492cf5e3eccb1646e15..7621dc51cff8c7feac70b5c93e275167bb3e1766 100644 (file)
@@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task)
                proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
                                        tgid->numbers[i].nr);
        }
-
-       upid = &pid->numbers[pid->level];
-       if (upid->nr == 1)
-               pid_ns_release_proc(upid->ns);
 }
 
 static struct dentry *proc_pid_instantiate(struct inode *dir,
index fc1609321a78ef971d9e8fa4f264d951530c8544..f2f251158d35526c3d4fe1d830bc2a316960ba8b 100644 (file)
@@ -155,11 +155,6 @@ void __init proc_root_init(void)
        err = register_filesystem(&proc_fs_type);
        if (err)
                return;
-       err = pid_ns_prepare_proc(&init_pid_ns);
-       if (err) {
-               unregister_filesystem(&proc_fs_type);
-               return;
-       }
 
        proc_self_init();
        proc_symlink("mounts", NULL, "self/mounts");
index c89c9cfcd247378f5f328d005d76fcedd50aab96..4c96acdb2489d986537894ba27981dde52b47584 100644 (file)
@@ -21,6 +21,7 @@ struct pid_namespace {
        struct kref kref;
        struct pidmap pidmap[PIDMAP_ENTRIES];
        int last_pid;
+       int nr_hashed;
        struct task_struct *child_reaper;
        struct kmem_cache *pid_cachep;
        unsigned int level;
@@ -32,6 +33,7 @@ struct pid_namespace {
        struct bsd_acct_struct *bacct;
 #endif
        struct user_namespace *user_ns;
+       struct work_struct proc_work;
        kgid_t pid_gid;
        int hide_pid;
        int reboot;     /* group exit code if this pidns was rebooted */
index 7798c247f4b9dda645b8f2e2276e4f34a5d508af..666dc8b06606b3ffb336fe495c73731b859b2635 100644 (file)
@@ -1476,8 +1476,6 @@ bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
 bad_fork_cleanup_namespaces:
-       if (unlikely(clone_flags & CLONE_NEWPID))
-               pid_ns_release_proc(p->nsproxy->pid_ns);
        exit_task_namespaces(p);
 bad_fork_cleanup_mm:
        if (p->mm)
index 3a5f238c1ca02ec74d1770dd359a137c1e98be6d..e957f8b091364b1f8e20196fceab6367d6cd2f0e 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/init_task.h>
 #include <linux/syscalls.h>
+#include <linux/proc_fs.h>
 
 #define pid_hashfn(nr, ns)     \
        hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -270,8 +271,12 @@ void free_pid(struct pid *pid)
        unsigned long flags;
 
        spin_lock_irqsave(&pidmap_lock, flags);
-       for (i = 0; i <= pid->level; i++)
-               hlist_del_rcu(&pid->numbers[i].pid_chain);
+       for (i = 0; i <= pid->level; i++) {
+               struct upid *upid = pid->numbers + i;
+               hlist_del_rcu(&upid->pid_chain);
+               if (--upid->ns->nr_hashed == 0)
+                       schedule_work(&upid->ns->proc_work);
+       }
        spin_unlock_irqrestore(&pidmap_lock, flags);
 
        for (i = 0; i <= pid->level; i++)
@@ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
                goto out;
 
        tmp = ns;
+       pid->level = ns->level;
        for (i = ns->level; i >= 0; i--) {
                nr = alloc_pidmap(tmp);
                if (nr < 0)
@@ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns)
                tmp = tmp->parent;
        }
 
+       if (unlikely(is_child_reaper(pid))) {
+               if (pid_ns_prepare_proc(ns))
+                       goto out_free;
+       }
+
        get_pid_ns(ns);
-       pid->level = ns->level;
        atomic_set(&pid->count, 1);
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);
 
        upid = pid->numbers + ns->level;
        spin_lock_irq(&pidmap_lock);
-       for ( ; upid >= pid->numbers; --upid)
+       for ( ; upid >= pid->numbers; --upid) {
                hlist_add_head_rcu(&upid->pid_chain,
                                &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+               upid->ns->nr_hashed++;
+       }
        spin_unlock_irq(&pidmap_lock);
 
 out:
@@ -570,6 +582,7 @@ void __init pidmap_init(void)
        /* Reserve PID 0. We never call free_pidmap(0) */
        set_bit(0, init_pid_ns.pidmap[0].page);
        atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+       init_pid_ns.nr_hashed = 1;
 
        init_pid_ns.pid_cachep = KMEM_CACHE(pid,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC);
index b2604950aa50fa6e0b768418cf8cfab33ba53bc0..84591cfeefc1b1165ca047d549d36ad9edc1beaf 100644 (file)
@@ -72,6 +72,12 @@ err_alloc:
        return NULL;
 }
 
+static void proc_cleanup_work(struct work_struct *work)
+{
+       struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
+       pid_ns_release_proc(ns);
+}
+
 /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
 #define MAX_PID_NS_LEVEL 32
 
@@ -105,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
        ns->level = level;
        ns->parent = get_pid_ns(parent_pid_ns);
        ns->user_ns = get_user_ns(user_ns);
+       INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
        set_bit(0, ns->pidmap[0].page);
        atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -112,15 +119,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
        for (i = 1; i < PIDMAP_ENTRIES; i++)
                atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
 
-       err = pid_ns_prepare_proc(ns);
-       if (err)
-               goto out_put_parent_pid_ns;
-
        return ns;
 
-out_put_parent_pid_ns:
-       put_pid_ns(parent_pid_ns);
-       put_user_ns(user_ns);
 out_free_map:
        kfree(ns->pidmap[0].page);
 out_free: