Merge branch 'for-4.6-ns' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

[linux-2.6-block.git] / kernel / cgroup.c
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index 3fe02c15279940716ca62b98caa386f654a79596..671dc05c0b0fd6b732cf03d6efec6a5dd5a3557d 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -59,6 +59,9 @@
  #include <linux/delay.h>
  #include <linux/atomic.h>
  #include <linux/cpuset.h>
+#include <linux/proc_ns.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
  #include <net/sock.h>
  
  /*
@@ -215,6 +218,15 @@ static u16 have_fork_callback __read_mostly;
  static u16 have_exit_callback __read_mostly;
  static u16 have_free_callback __read_mostly;
  
+/*
+ * cgroup namespace for init task.  Statically initialized to use
+ * init_user_ns as its user namespace and init_css_set as its root css_set.
+ * NOTE(review): the reference count starts at 2, not 1; the holders of those
+ * two references are not visible in this hunk - confirm.
+ */
+struct cgroup_namespace init_cgroup_ns = {
+       .count          = { .counter = 2, },
+       .user_ns        = &init_user_ns,
+       .ns.ops         = &cgroupns_operations,
+       .ns.inum        = PROC_CGROUP_INIT_INO,
+       .root_cset      = &init_css_set,
+};
+
  /* Ditto for the can_fork callback. */
  static u16 have_canfork_callback __read_mostly;
  
@@ -2002,6 +2014,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
  {
         bool is_v2 = fs_type == &cgroup2_fs_type;
         struct super_block *pinned_sb = NULL;
+       struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
         struct cgroup_subsys *ss;
         struct cgroup_root *root;
         struct cgroup_sb_opts opts;
@@ -2010,6 +2023,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
         int i;
         bool new_sb;
  
+       get_cgroup_ns(ns);
+
+       /* Check if the caller has permission to mount. */
+       if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+               put_cgroup_ns(ns);
+               return ERR_PTR(-EPERM);
+       }
+
         /*
          * The first time anyone tries to mount a cgroup, enable the list
          * linking each css_set to its tasks and fix up all existing tasks.
@@ -2020,6 +2041,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
         if (is_v2) {
                 if (data) {
                         pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+                       put_cgroup_ns(ns);
                         return ERR_PTR(-EINVAL);
                 }
                 cgrp_dfl_visible = true;
@@ -2125,6 +2147,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                 goto out_unlock;
         }
  
+       /*
+        * We know this subsystem has not yet been bound.  Users in a non-init
+        * user namespace may only mount hierarchies with no bound subsystems,
+        * i.e. 'none,name=user1'
+        */
+       if (!opts.none && !capable(CAP_SYS_ADMIN)) {
+               ret = -EPERM;
+               goto out_unlock;
+       }
+
         root = kzalloc(sizeof(*root), GFP_KERNEL);
         if (!root) {
                 ret = -ENOMEM;
@@ -2143,12 +2175,37 @@ out_free:
         kfree(opts.release_agent);
         kfree(opts.name);
  
-       if (ret)
+       if (ret) {
+               put_cgroup_ns(ns);
                 return ERR_PTR(ret);
+       }
  out_mount:
         dentry = kernfs_mount(fs_type, flags, root->kf_root,
                               is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
                               &new_sb);
+
+       /*
+        * In non-init cgroup namespace, instead of root cgroup's
+        * dentry, we return the dentry corresponding to the cgroup
+        * that ns->root_cset maps to in this hierarchy.
+        */
+       if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
+               struct dentry *nsdentry;
+               struct cgroup *cgrp;
+
+               mutex_lock(&cgroup_mutex);
+               spin_lock_bh(&css_set_lock);
+
+               cgrp = cset_cgroup_from_root(ns->root_cset, root);
+
+               spin_unlock_bh(&css_set_lock);
+               mutex_unlock(&cgroup_mutex);
+
+               nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
+               dput(dentry);
+               dentry = nsdentry;
+       }
+
         if (IS_ERR(dentry) || !new_sb)
                 cgroup_put(&root->cgrp);
  
@@ -2161,6 +2218,7 @@ out_mount:
                 deactivate_super(pinned_sb);
         }
  
+       put_cgroup_ns(ns);
         return dentry;
  }
  
@@ -2189,14 +2247,45 @@ static struct file_system_type cgroup_fs_type = {
         .name = "cgroup",
         .mount = cgroup_mount,
         .kill_sb = cgroup_kill_sb,
+       .fs_flags = FS_USERNS_MOUNT,
  };
  
  static struct file_system_type cgroup2_fs_type = {
         .name = "cgroup2",
         .mount = cgroup_mount,
         .kill_sb = cgroup_kill_sb,
+       /*
+        * NOTE(review): presumably allows mounting from a non-init user
+        * namespace; cgroup_mount() does its own ns_capable() check against
+        * the caller's cgroup namespace - confirm flag semantics in fs.h.
+        */
+       .fs_flags = FS_USERNS_MOUNT,
  };
  
+/**
+ * cgroup_path_ns_locked - path of a cgroup as seen from a cgroup namespace
+ * @cgrp: cgroup whose path is wanted
+ * @buf: buffer to write the path into
+ * @buflen: size of @buf
+ * @ns: cgroup namespace the path is made relative to
+ *
+ * The path is computed by kernfs_path_from_node() from @cgrp's kernfs node
+ * relative to the kernfs node of the cgroup that @ns->root_cset maps to in
+ * @cgrp's hierarchy.
+ *
+ * cgroup_path_ns() calls this with both cgroup_mutex and css_set_lock held.
+ * NOTE(review): presumably every caller must hold them for
+ * cset_cgroup_from_root() - confirm against that helper.
+ *
+ * Returns @buf when kernfs_path_from_node() yields a non-negative value
+ * smaller than @buflen, NULL otherwise.
+ */
+static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+                                  struct cgroup_namespace *ns)
+{
+       struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
+       int ret;
+
+       ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
+       /* the sign is tested first, so the size_t comparison only sees ret >= 0 */
+       if (ret < 0 || ret >= buflen)
+               return NULL;
+       return buf;
+}
+
+/**
+ * cgroup_path_ns - path of a cgroup as seen from a cgroup namespace
+ * @cgrp: cgroup whose path is wanted
+ * @buf: buffer to write the path into
+ * @buflen: size of @buf
+ * @ns: cgroup namespace the path is made relative to
+ *
+ * Locking wrapper around cgroup_path_ns_locked(): takes cgroup_mutex and
+ * then css_set_lock, computes the path, and releases both in reverse order.
+ *
+ * Returns whatever cgroup_path_ns_locked() returns: @buf on success, NULL
+ * on failure.
+ */
+char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+                    struct cgroup_namespace *ns)
+{
+       char *ret;
+
+       mutex_lock(&cgroup_mutex);
+       spin_lock_bh(&css_set_lock);
+
+       ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
+
+       spin_unlock_bh(&css_set_lock);
+       mutex_unlock(&cgroup_mutex);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(cgroup_path_ns);
+
  /**
   * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
   * @task: target task
@@ -2224,7 +2313,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
  
         if (root) {
                 cgrp = task_cgroup_from_root(task, root);
-               path = cgroup_path(cgrp, buf, buflen);
+               path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
         } else {
                 /* if no hierarchy exists, everyone is in "/" */
                 if (strlcpy(buf, "/", buflen) < buflen)
@@ -5450,6 +5539,8 @@ int __init cgroup_init(void)
         BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
         BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
  
+       get_user_ns(init_cgroup_ns.user_ns);
+
         mutex_lock(&cgroup_mutex);
  
         /*
@@ -5601,7 +5692,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                  * " (deleted)" is appended to the cgroup path.
                  */
                 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
-                       path = cgroup_path(cgrp, buf, PATH_MAX);
+                       path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
+                                               current->nsproxy->cgroup_ns);
                         if (!path) {
                                 retval = -ENAMETOOLONG;
                                 goto out_unlock;
@@ -5886,7 +5978,9 @@ static void cgroup_release_agent(struct work_struct *work)
         if (!pathbuf || !agentbuf)
                 goto out;
  
-       path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+       spin_lock_bh(&css_set_lock);
+       path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+       spin_unlock_bh(&css_set_lock);
         if (!path)
                 goto out;
  
@@ -6098,6 +6192,133 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
  
  #endif /* CONFIG_SOCK_CGROUP_DATA */
  
+/* cgroup namespaces */
+
+/*
+ * Allocate a zeroed cgroup_namespace, give it an inode number through
+ * ns_alloc_inum(), set its reference count to 1 and point ns.ops at
+ * cgroupns_operations.  ->user_ns and ->root_cset are left NULL (kzalloc)
+ * for the caller to fill in; copy_cgroup_ns() does so.
+ *
+ * Returns the new namespace, ERR_PTR(-ENOMEM) if the allocation fails, or
+ * ERR_PTR() of the ns_alloc_inum() error.
+ */
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+       struct cgroup_namespace *new_ns;
+       int ret;
+
+       new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+       if (!new_ns)
+               return ERR_PTR(-ENOMEM);
+       ret = ns_alloc_inum(&new_ns->ns);
+       if (ret) {
+               /* nothing else has been set up yet, so freeing the struct is enough */
+               kfree(new_ns);
+               return ERR_PTR(ret);
+       }
+       atomic_set(&new_ns->count, 1);
+       new_ns->ns.ops = &cgroupns_operations;
+       return new_ns;
+}
+
+/**
+ * free_cgroup_ns - release everything held by a cgroup namespace
+ * @ns: namespace to destroy
+ *
+ * Drops the references on @ns->root_cset and @ns->user_ns, returns the
+ * inode number with ns_free_inum() and frees @ns itself.
+ * NOTE(review): presumably invoked by put_cgroup_ns() once the last
+ * reference is gone; that helper is not visible here - confirm.
+ */
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+       put_css_set(ns->root_cset);
+       put_user_ns(ns->user_ns);
+       ns_free_inum(&ns->ns);
+       kfree(ns);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+/**
+ * copy_cgroup_ns - share or create a cgroup namespace
+ * @flags: clone flags; only CLONE_NEWCGROUP is examined
+ * @user_ns: user namespace that will own a newly created cgroup namespace
+ * @old_ns: the current cgroup namespace; must not be NULL (BUG_ON)
+ *
+ * Without CLONE_NEWCGROUP, takes a reference on @old_ns and returns it.
+ * With CLONE_NEWCGROUP, creates a new namespace whose root css_set is the
+ * css_set of %current at the time of the call and whose user namespace is
+ * @user_ns.
+ *
+ * Returns the namespace to use, ERR_PTR(-EPERM) if the caller lacks
+ * CAP_SYS_ADMIN in @user_ns, or the ERR_PTR() from alloc_cgroup_ns().
+ */
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+                                       struct user_namespace *user_ns,
+                                       struct cgroup_namespace *old_ns)
+{
+       struct cgroup_namespace *new_ns;
+       struct css_set *cset;
+
+       BUG_ON(!old_ns);
+
+       if (!(flags & CLONE_NEWCGROUP)) {
+               get_cgroup_ns(old_ns);
+               return old_ns;
+       }
+
+       /* Allow only sysadmin to create cgroup namespace. */
+       if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+               return ERR_PTR(-EPERM);
+
+       /* pin current's css_set under both locks; it becomes the new root */
+       mutex_lock(&cgroup_mutex);
+       spin_lock_bh(&css_set_lock);
+
+       cset = task_css_set(current);
+       get_css_set(cset);
+
+       spin_unlock_bh(&css_set_lock);
+       mutex_unlock(&cgroup_mutex);
+
+       new_ns = alloc_cgroup_ns();
+       if (IS_ERR(new_ns)) {
+               /* undo the css_set reference taken above */
+               put_css_set(cset);
+               return new_ns;
+       }
+
+       /* both references are dropped again in free_cgroup_ns() */
+       new_ns->user_ns = get_user_ns(user_ns);
+       new_ns->root_cset = cset;
+
+       return new_ns;
+}
+
+/* Map an embedded ns_common back to the cgroup_namespace that contains it. */
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+       return container_of(ns, struct cgroup_namespace, ns);
+}
+
+/*
+ * ->install callback of cgroupns_operations: make @nsproxy use the cgroup
+ * namespace that embeds @ns.
+ *
+ * The caller needs CAP_SYS_ADMIN both in its current user namespace and in
+ * the user namespace owning the target cgroup namespace; otherwise -EPERM
+ * is returned.  The check runs before the "already attached" shortcut, so
+ * even a no-op install requires the capability.
+ *
+ * Returns 0 on success.
+ */
+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+       struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
+
+       if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+           !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /* Don't need to do anything if we are attaching to our own cgroupns. */
+       if (cgroup_ns == nsproxy->cgroup_ns)
+               return 0;
+
+       /* take the new reference before dropping the old one */
+       get_cgroup_ns(cgroup_ns);
+       put_cgroup_ns(nsproxy->cgroup_ns);
+       nsproxy->cgroup_ns = cgroup_ns;
+
+       return 0;
+}
+
+/*
+ * ->get callback of cgroupns_operations: return @task's cgroup namespace
+ * with a reference taken on it.
+ *
+ * task->nsproxy is read under task_lock().  If it is NULL, no reference is
+ * taken and NULL is returned; otherwise the embedded ns_common of the
+ * task's cgroup namespace is returned.
+ */
+static struct ns_common *cgroupns_get(struct task_struct *task)
+{
+       struct cgroup_namespace *ns = NULL;
+       struct nsproxy *nsproxy;
+
+       task_lock(task);
+       nsproxy = task->nsproxy;
+       if (nsproxy) {
+               ns = nsproxy->cgroup_ns;
+               get_cgroup_ns(ns);
+       }
+       task_unlock(task);
+
+       return ns ? &ns->ns : NULL;
+}
+
+/*
+ * ->put callback of cgroupns_operations: drop one reference on the cgroup
+ * namespace that embeds @ns.
+ */
+static void cgroupns_put(struct ns_common *ns)
+{
+       put_cgroup_ns(to_cg_ns(ns));
+}
+
+/*
+ * proc namespace operations for cgroup namespaces.  Both init_cgroup_ns and
+ * every namespace created by alloc_cgroup_ns() point their ns.ops at this
+ * table.
+ */
+const struct proc_ns_operations cgroupns_operations = {
+       .name           = "cgroup",
+       .type           = CLONE_NEWCGROUP,
+       .get            = cgroupns_get,
+       .put            = cgroupns_put,
+       .install        = cgroupns_install,
+};
+
+/*
+ * Registered with subsys_initcall().  Currently does no work and always
+ * returns 0.
+ */
+static __init int cgroup_namespaces_init(void)
+{
+       return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
+
  #ifdef CONFIG_CGROUP_DEBUG
  static struct cgroup_subsys_state *
  debug_css_alloc(struct cgroup_subsys_state *parent_css)