kernel/cgroup.c

   1 /*
   2  *  Generic process-grouping system.
   3  *
   4  *  Based originally on the cpuset system, extracted by Paul Menage
   5  *  Copyright (C) 2006 Google, Inc
   6  *
   7  *  Notifications support
   8  *  Copyright (C) 2009 Nokia Corporation
   9  *  Author: Kirill A. Shutemov
  10  *
  11  *  Copyright notices from the original cpuset code:
  12  *  --------------------------------------------------
  13  *  Copyright (C) 2003 BULL SA.
  14  *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  15  *
  16  *  Portions derived from Patrick Mochel's sysfs code.
  17  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  18  *
  19  *  2003-10-10 Written by Simon Derr.
  20  *  2003-10-22 Updates by Stephen Hemminger.
  21  *  2004 May-July Rework by Paul Jackson.
  22  *  ---------------------------------------------------
  23  *
  24  *  This file is subject to the terms and conditions of the GNU General Public
  25  *  License.  See the file COPYING in the main directory of the Linux
  26  *  distribution for more details.
  27  */
  28
  29 #include <linux/cgroup.h>
  30 #include <linux/cred.h>
  31 #include <linux/ctype.h>
  32 #include <linux/errno.h>
  33 #include <linux/init_task.h>
  34 #include <linux/kernel.h>
  35 #include <linux/list.h>
  36 #include <linux/mm.h>
  37 #include <linux/mutex.h>
  38 #include <linux/mount.h>
  39 #include <linux/pagemap.h>
  40 #include <linux/proc_fs.h>
  41 #include <linux/rcupdate.h>
  42 #include <linux/sched.h>
  43 #include <linux/slab.h>
  44 #include <linux/spinlock.h>
  45 #include <linux/rwsem.h>
  46 #include <linux/string.h>
  47 #include <linux/sort.h>
  48 #include <linux/kmod.h>
  49 #include <linux/delayacct.h>
  50 #include <linux/cgroupstats.h>
  51 #include <linux/hashtable.h>
  52 #include <linux/pid_namespace.h>
  53 #include <linux/idr.h>
  54 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
  55 #include <linux/kthread.h>
  56 #include <linux/delay.h>
  57
  58 #include <linux/atomic.h>
  59
  60 /*
  61  * pidlists linger the following amount before being destroyed.  The goal
  62  * is avoiding frequent destruction in the middle of consecutive read calls.
  63  * Expiring in the middle is a performance problem, not a correctness one.
  64  * 1 sec should be enough (the delay is expressed in jiffies, hence HZ).
  65  */
  66 #define CGROUP_PIDLIST_DESTROY_DELAY    HZ
  67
     /*
      * Size bound built from the two name limits plus two extra bytes.
      * NOTE(review): the "+ 2" presumably covers a separator character and
      * the terminating NUL - confirm against cgroup_file_name().
      */
  68 #define CGROUP_FILE_NAME_MAX            (MAX_CGROUP_TYPE_NAMELEN +      \
  69                                          MAX_CFTYPE_NAME + 2)
  70
  71 /*
  72  * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
  73  * creation/removal and hierarchy changing operations including cgroup
  74  * creation, removal, css association and controller rebinding.  This outer
  75  * lock is needed mainly to resolve the circular dependency between kernfs
  76  * active ref and cgroup_mutex.  cgroup_tree_mutex nests above both.
  77  */
  78 static DEFINE_MUTEX(cgroup_tree_mutex);
  79
  80 /*
  81  * cgroup_mutex is the master lock.  Any modification to a cgroup or its
  82  * hierarchy must be performed while holding it.
  83  *
  84  * css_set_rwsem protects the task->cgroups pointer, the list of css_set
  85  * objects, and the chain of tasks off each css_set.
  86  *
  87  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  88  * cgroup.h can use them for lockdep annotations.  Without
      * CONFIG_PROVE_RCU both are defined static and stay private to this file.
  89  */
  90 #ifdef CONFIG_PROVE_RCU
  91 DEFINE_MUTEX(cgroup_mutex);
  92 DECLARE_RWSEM(css_set_rwsem);
  93 EXPORT_SYMBOL_GPL(cgroup_mutex);
  94 EXPORT_SYMBOL_GPL(css_set_rwsem);
  95 #else
  96 static DEFINE_MUTEX(cgroup_mutex);
  97 static DECLARE_RWSEM(css_set_rwsem);
  98 #endif
  99
 100 /*
 101  * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
 102  * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
 103  */
 104 static DEFINE_SPINLOCK(release_agent_path_lock);
 105
     /*
      * Lockdep assertion that at least one of the following holds: the caller
      * is in an RCU read-side critical section, holds cgroup_tree_mutex, or
      * holds cgroup_mutex.
      *
      * NOTE(review): the expansion already ends in ';', so the usual
      * "macro();" call form produces an extra empty statement.  That is
      * harmless as a plain statement but would break an unbraced if/else.
      */
 106 #define cgroup_assert_mutexes_or_rcu_locked()                           \
 107         rcu_lockdep_assert(rcu_read_lock_held() ||                      \
 108                            lockdep_is_held(&cgroup_tree_mutex) ||       \
 109                            lockdep_is_held(&cgroup_mutex),              \
 110                            "cgroup_[tree_]mutex or RCU read lock required");
 111
 112 /*
 113  * cgroup destruction makes heavy use of work items and there can be a lot
 114  * of concurrent destructions.  Use a separate workqueue so that cgroup
 115  * destruction work items don't end up filling up max_active of system_wq
 116  * which may lead to deadlock.
      *
      * Only the pointer is defined here; the workqueue itself is created
      * outside the part of the file shown alongside this definition.
 117  */
 118 static struct workqueue_struct *cgroup_destroy_wq;
 119
 120 /*
 121  * pidlist destructions need to be flushed on cgroup destruction.  Use a
 122  * separate workqueue as flush domain, so that flushing pidlist work does
      * not have to wait on unrelated work items.
 123  */
 124 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
 125
 126 /*
      * Generate an array of cgroup subsystem pointers.  SUBSYS() is defined
      * just for the duration of the include: every SUBSYS(x) line in
      * <linux/cgroup_subsys.h> expands to a designated initializer that
      * stores &x_cgrp_subsys at index x_cgrp_id.
      */
 127 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
 128 static struct cgroup_subsys *cgroup_subsys[] = {
 129 #include <linux/cgroup_subsys.h>
 130 };
 131 #undef SUBSYS
 132
 133 /*
      * Array of cgroup subsystem names, built the same way: SUBSYS(x)
      * stringifies x and stores it at index x_cgrp_id, so both arrays share
      * the same indexing.
      */
 134 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
 135 static const char *cgroup_subsys_name[] = {
 136 #include <linux/cgroup_subsys.h>
 137 };
 138 #undef SUBSYS
 139
 140 /*
 141  * The default hierarchy, reserved for the subsystems that are otherwise
 142  * unattached - it never has more than a single cgroup, and all tasks are
 143  * part of that cgroup.
 144  */
 145 struct cgroup_root cgrp_dfl_root;
 146
 147 /*
 148  * The default hierarchy always exists but is hidden until mounted for the
 149  * first time.  This is for backward compatibility.
 150  */
 151 static bool cgrp_dfl_root_visible;
 152
 153 /*
      * The list of hierarchy roots (linked through cgroup_root->root_list)
      * and the root count that is decremented whenever a root is unlinked
      * from that list.
      */
 154
 155 static LIST_HEAD(cgroup_roots);
 156 static int cgroup_root_count;
 157
 158 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
 159 static DEFINE_IDR(cgroup_hierarchy_idr);
 160
 161 /*
 162  * Assign a monotonically increasing serial number to cgroups.  It
 163  * guarantees cgroups with bigger numbers are newer than those with smaller
 164  * numbers.  Also, as cgroups are always appended to the parent's
 165  * ->children list, it guarantees that sibling cgroups are always sorted in
 166  * the ascending serial number order on the list.  Protected by
 167  * cgroup_mutex.
 168  */
 169 static u64 cgroup_serial_nr_next = 1;
 170
 171 /*
      * This flag indicates whether tasks in the fork and exit paths should
 172  * check for fork/exit handlers to call. This avoids us having to do
 173  * extra work in the fork/exit path if none of the subsystems need to
 174  * be called.
 175  */
 176 static int need_forkexit_callback __read_mostly;
 177
     /*
      * Forward declarations.  Everything below is static, so the matching
      * definitions must appear later in this translation unit.
      */
 178 static struct cftype cgroup_base_files[];
 179
 180 static void cgroup_put(struct cgroup *cgrp);
 181 static int rebind_subsystems(struct cgroup_root *dst_root,
 182                              unsigned long ss_mask);
 183 static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 184 static int cgroup_destroy_locked(struct cgroup *cgrp);
 185 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 186                               bool is_add);
 187 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 188
 189 /**
 190  * cgroup_css - obtain a cgroup's css for the specified subsystem
 191  * @cgrp: the cgroup of interest
 192  * @ss: the subsystem of interest (%NULL returns the dummy_css)
 193  *
 194  * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 195  * function must be called either under cgroup_mutex or rcu_read_lock() and
 196  * the caller is responsible for pinning the returned css if it wants to
 197  * keep accessing it outside the said locks.  This function may return
 198  * %NULL if @cgrp doesn't have @subsys_id enabled.
 199  */
 200 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 201                                               struct cgroup_subsys *ss)
 202 {
 203         if (ss)
 204                 return rcu_dereference_check(cgrp->subsys[ss->id],
 205                                         lockdep_is_held(&cgroup_tree_mutex) ||
 206                                         lockdep_is_held(&cgroup_mutex));
 207         else
 208                 return &cgrp->dummy_css;
 209 }
 210
 211 /* convenient tests for these bits */
 212 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 213 {
 214         return test_bit(CGRP_DEAD, &cgrp->flags);
 215 }
 216
 217 struct cgroup_subsys_state *seq_css(struct seq_file *seq)
 218 {
 219         struct kernfs_open_file *of = seq->private;
 220         struct cgroup *cgrp = of->kn->parent->priv;
 221         struct cftype *cft = seq_cft(seq);
 222
 223         /*
 224          * This is open and unprotected implementation of cgroup_css().
 225          * seq_css() is only called from a kernfs file operation which has
 226          * an active reference on the file.  Because all the subsystem
 227          * files are drained before a css is disassociated with a cgroup,
 228          * the matching css from the cgroup's subsys table is guaranteed to
 229          * be and stay valid until the enclosing operation is complete.
 230          */
 231         if (cft->ss)
 232                 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
 233         else
 234                 return &cgrp->dummy_css;
 235 }
 236 EXPORT_SYMBOL_GPL(seq_css);
 237
 238 /**
 239  * cgroup_is_descendant - test ancestry
 240  * @cgrp: the cgroup to be tested
 241  * @ancestor: possible ancestor of @cgrp
 242  *
 243  * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
 244  * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp
 245  * and @ancestor are accessible.
 246  */
 247 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
 248 {
 249         while (cgrp) {
 250                 if (cgrp == ancestor)
 251                         return true;
 252                 cgrp = cgrp->parent;
 253         }
 254         return false;
 255 }
 256
 257 static int cgroup_is_releasable(const struct cgroup *cgrp)
 258 {
 259         const int bits =
 260                 (1 << CGRP_RELEASABLE) |
 261                 (1 << CGRP_NOTIFY_ON_RELEASE);
 262         return (cgrp->flags & bits) == bits;
 263 }
 264
 265 static int notify_on_release(const struct cgroup *cgrp)
 266 {
 267         return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 268 }
 269
 270 /**
 271  * for_each_css - iterate all css's of a cgroup
 272  * @css: the iteration cursor
 273  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 274  * @cgrp: the target cgroup to iterate css's of
 275  *
 276  * Should be called under cgroup_tree_mutex or cgroup_mutex; the lockdep
      * condition passed to rcu_dereference_check() accepts either one.
      *
      * Slots of @cgrp->subsys[] that are %NULL are skipped: the statement
      * following the macro becomes the "else" branch of the NULL test, so
      * it only runs for non-NULL entries.
 277  */
 278 #define for_each_css(css, ssid, cgrp)                                   \
 279         for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
 280                 if (!((css) = rcu_dereference_check(                    \
 281                                 (cgrp)->subsys[(ssid)],                 \
 282                                 lockdep_is_held(&cgroup_tree_mutex) ||  \
 283                                 lockdep_is_held(&cgroup_mutex)))) { }   \
 284                 else
 285
 286 /**
 287  * for_each_subsys - iterate all enabled cgroup subsystems
 288  * @ss: the iteration cursor
 289  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
      *
      * @ss is assigned cgroup_subsys[@ssid] inside the loop condition.  The
      * "|| true" keeps the condition true whatever pointer was assigned, so
      * the loop always visits every index below CGROUP_SUBSYS_COUNT.
 290  */
 291 #define for_each_subsys(ss, ssid)                                       \
 292         for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&                \
 293              (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 294
 295 /*
      * Iterate across the hierarchies: @root visits every cgroup_root linked
      * on cgroup_roots through its root_list member.
      */
 296 #define for_each_root(root)                                             \
 297         list_for_each_entry((root), &cgroup_roots, root_list)
 298
 299 /**
 300  * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
 301  * @cgrp: the cgroup to be checked for liveness
 302  *
 303  * On success, returns true; the mutex should be later unlocked.  On
 304  * failure returns false with no lock held.
 305  */
 306 static bool cgroup_lock_live_group(struct cgroup *cgrp)
 307 {
 308         mutex_lock(&cgroup_mutex);
 309         if (cgroup_is_dead(cgrp)) {
 310                 mutex_unlock(&cgroup_mutex);
 311                 return false;
 312         }
 313         return true;
 314 }
 315
 316 /*
      * The list of cgroups eligible for automatic release.  Protected by
 317  * release_list_lock.  release_agent_work is the work item whose handler
      * is cgroup_release_agent(); check_for_release() is defined later in
      * this file.
      */
 318 static LIST_HEAD(release_list);
 319 static DEFINE_RAW_SPINLOCK(release_list_lock);
 320 static void cgroup_release_agent(struct work_struct *work);
 321 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 322 static void check_for_release(struct cgroup *cgrp);
 323
 324 /*
 325  * A cgroup can be associated with multiple css_sets as different tasks may
 326  * belong to different cgroups on different hierarchies.  In the other
 327  * direction, a css_set is naturally associated with multiple cgroups.
 328  * This M:N relationship is represented by the following link structure
 329  * which exists for each association and allows traversing the associations
 330  * from both sides.
 331  */
 332 struct cgrp_cset_link {
 333         /* the cgroup and css_set this link associates */
 334         struct cgroup           *cgrp;
 335         struct css_set          *cset;
 336
 337         /*
              * list of cgrp_cset_links anchored at cgrp->cset_links; also
              * used to chain freshly allocated, not yet attached links on a
              * temporary list
              */
 338         struct list_head        cset_link;
 339
 340         /* list of cgrp_cset_links anchored at css_set->cgrp_links */
 341         struct list_head        cgrp_link;
 342 };
 343
 344 /*
 345  * The default css_set - used by init and its children prior to any
 346  * hierarchies being mounted. It contains a pointer to the root state
 347  * for each subsystem. Also used to anchor the list of css_sets. Not
 348  * reference-counted, to improve performance when child cgroups
 349  * haven't been created.
      *
      * All list heads are statically initialized to empty; ->refcount starts
      * at 1.
 350  */
 351 static struct css_set init_css_set = {
 352         .refcount               = ATOMIC_INIT(1),
 353         .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
 354         .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
 355         .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
 356         .mg_preload_node        = LIST_HEAD_INIT(init_css_set.mg_preload_node),
 357         .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
 358 };
 359
 360 static int css_set_count        = 1;    /* 1 for init_css_set */
 361
 362 /*
 363  * Hash table of css_sets, keyed by css_set_hash() over the subsys[]
 364  * pointers.  This improves the performance to find an existing css_set.
 365  * This hash doesn't (currently) take into account cgroups in empty
      * hierarchies.  CSS_SET_HASH_BITS is the bit-width handed to
      * DEFINE_HASHTABLE().
 366  */
 367 #define CSS_SET_HASH_BITS       7
 368 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 369
 370 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 371 {
 372         unsigned long key = 0UL;
 373         struct cgroup_subsys *ss;
 374         int i;
 375
 376         for_each_subsys(ss, i)
 377                 key += (unsigned long)css[i];
 378         key = (key >> 16) ^ key;
 379
 380         return key;
 381 }
 382
 383 static void put_css_set_locked(struct css_set *cset, bool taskexit)
 384 {
 385         struct cgrp_cset_link *link, *tmp_link;
 386
 387         lockdep_assert_held(&css_set_rwsem);
 388
 389         if (!atomic_dec_and_test(&cset->refcount))
 390                 return;
 391
 392         /* This css_set is dead. unlink it and release cgroup refcounts */
 393         hash_del(&cset->hlist);
 394         css_set_count--;
 395
 396         list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 397                 struct cgroup *cgrp = link->cgrp;
 398
 399                 list_del(&link->cset_link);
 400                 list_del(&link->cgrp_link);
 401
 402                 /* @cgrp can't go away while we're holding css_set_rwsem */
 403                 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
 404                         if (taskexit)
 405                                 set_bit(CGRP_RELEASABLE, &cgrp->flags);
 406                         check_for_release(cgrp);
 407                 }
 408
 409                 kfree(link);
 410         }
 411
 412         kfree_rcu(cset, rcu_head);
 413 }
 414
 415 static void put_css_set(struct css_set *cset, bool taskexit)
 416 {
 417         /*
 418          * Ensure that the refcount doesn't hit zero while any readers
 419          * can see it. Similar to atomic_dec_and_lock(), but for an
 420          * rwlock
 421          */
 422         if (atomic_add_unless(&cset->refcount, -1, 1))
 423                 return;
 424
 425         down_write(&css_set_rwsem);
 426         put_css_set_locked(cset, taskexit);
 427         up_write(&css_set_rwsem);
 428 }
 429
 430 /*
 431  * refcounted get/put for css_set objects
      *
      * get_css_set() takes one additional reference by atomically
      * incrementing @cset->refcount; it does no locking of its own.
 432  */
 433 static inline void get_css_set(struct css_set *cset)
 434 {
 435         atomic_inc(&cset->refcount);
 436 }
 437
 438 /**
 439  * compare_css_sets - helper function for find_existing_css_set().
 440  * @cset: candidate css_set being tested
 441  * @old_cset: existing css_set for a task
 442  * @new_cgrp: cgroup that's being entered by the task
 443  * @template: desired set of css pointers in css_set (pre-calculated)
 444  *
 445  * Returns true if "cset" matches "old_cset" except for the hierarchy
 446  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 447  */
 448 static bool compare_css_sets(struct css_set *cset,
 449                              struct css_set *old_cset,
 450                              struct cgroup *new_cgrp,
 451                              struct cgroup_subsys_state *template[])
 452 {
 453         struct list_head *l1, *l2;
 454
 455         if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
 456                 /* Not all subsystems matched */
 457                 return false;
 458         }
 459
 460         /*
 461          * Compare cgroup pointers in order to distinguish between
 462          * different cgroups in heirarchies with no subsystems. We
 463          * could get by with just this check alone (and skip the
 464          * memcmp above) but on most setups the memcmp check will
 465          * avoid the need for this more expensive check on almost all
 466          * candidates.
 467          */
 468
 469         l1 = &cset->cgrp_links;
 470         l2 = &old_cset->cgrp_links;
 471         while (1) {
 472                 struct cgrp_cset_link *link1, *link2;
 473                 struct cgroup *cgrp1, *cgrp2;
 474
 475                 l1 = l1->next;
 476                 l2 = l2->next;
 477                 /* See if we reached the end - both lists are equal length. */
 478                 if (l1 == &cset->cgrp_links) {
 479                         BUG_ON(l2 != &old_cset->cgrp_links);
 480                         break;
 481                 } else {
 482                         BUG_ON(l2 == &old_cset->cgrp_links);
 483                 }
 484                 /* Locate the cgroups associated with these links. */
 485                 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
 486                 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
 487                 cgrp1 = link1->cgrp;
 488                 cgrp2 = link2->cgrp;
 489                 /* Hierarchies should be linked in the same order. */
 490                 BUG_ON(cgrp1->root != cgrp2->root);
 491
 492                 /*
 493                  * If this hierarchy is the hierarchy of the cgroup
 494                  * that's changing, then we need to check that this
 495                  * css_set points to the new cgroup; if it's any other
 496                  * hierarchy, then this css_set should point to the
 497                  * same cgroup as the old css_set.
 498                  */
 499                 if (cgrp1->root == new_cgrp->root) {
 500                         if (cgrp1 != new_cgrp)
 501                                 return false;
 502                 } else {
 503                         if (cgrp1 != cgrp2)
 504                                 return false;
 505                 }
 506         }
 507         return true;
 508 }
 509
 510 /**
 511  * find_existing_css_set - init css array and find the matching css_set
 512  * @old_cset: the css_set that we're using before the cgroup transition
 513  * @cgrp: the cgroup that we're moving into
 514  * @template: out param for the new set of csses, should be clear on entry
 515  */
 516 static struct css_set *find_existing_css_set(struct css_set *old_cset,
 517                                         struct cgroup *cgrp,
 518                                         struct cgroup_subsys_state *template[])
 519 {
 520         struct cgroup_root *root = cgrp->root;
 521         struct cgroup_subsys *ss;
 522         struct css_set *cset;
 523         unsigned long key;
 524         int i;
 525
 526         /*
 527          * Build the set of subsystem state objects that we want to see in the
 528          * new css_set. while subsystems can change globally, the entries here
 529          * won't change, so no need for locking.
 530          */
 531         for_each_subsys(ss, i) {
 532                 if (root->subsys_mask & (1UL << i)) {
 533                         /* Subsystem is in this hierarchy. So we want
 534                          * the subsystem state from the new
 535                          * cgroup */
 536                         template[i] = cgroup_css(cgrp, ss);
 537                 } else {
 538                         /* Subsystem is not in this hierarchy, so we
 539                          * don't want to change the subsystem state */
 540                         template[i] = old_cset->subsys[i];
 541                 }
 542         }
 543
 544         key = css_set_hash(template);
 545         hash_for_each_possible(css_set_table, cset, hlist, key) {
 546                 if (!compare_css_sets(cset, old_cset, cgrp, template))
 547                         continue;
 548
 549                 /* This css_set matches what we need */
 550                 return cset;
 551         }
 552
 553         /* No existing cgroup group matched */
 554         return NULL;
 555 }
 556
 557 static void free_cgrp_cset_links(struct list_head *links_to_free)
 558 {
 559         struct cgrp_cset_link *link, *tmp_link;
 560
 561         list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
 562                 list_del(&link->cset_link);
 563                 kfree(link);
 564         }
 565 }
 566
 567 /**
 568  * allocate_cgrp_cset_links - allocate cgrp_cset_links
 569  * @count: the number of links to allocate
 570  * @tmp_links: list_head the allocated links are put on
 571  *
 572  * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 573  * through ->cset_link.  Returns 0 on success or -errno.
 574  */
 575 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 576 {
 577         struct cgrp_cset_link *link;
 578         int i;
 579
 580         INIT_LIST_HEAD(tmp_links);
 581
 582         for (i = 0; i < count; i++) {
 583                 link = kzalloc(sizeof(*link), GFP_KERNEL);
 584                 if (!link) {
 585                         free_cgrp_cset_links(tmp_links);
 586                         return -ENOMEM;
 587                 }
 588                 list_add(&link->cset_link, tmp_links);
 589         }
 590         return 0;
 591 }
 592
 593 /**
 594  * link_css_set - a helper function to link a css_set to a cgroup
 595  * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 596  * @cset: the css_set to be linked
 597  * @cgrp: the destination cgroup
 598  */
 599 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 600                          struct cgroup *cgrp)
 601 {
 602         struct cgrp_cset_link *link;
 603
 604         BUG_ON(list_empty(tmp_links));
 605         link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
 606         link->cset = cset;
 607         link->cgrp = cgrp;
 608         list_move(&link->cset_link, &cgrp->cset_links);
 609         /*
 610          * Always add links to the tail of the list so that the list
 611          * is sorted by order of hierarchy creation
 612          */
 613         list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 614 }
 615
 616 /**
 617  * find_css_set - return a new css_set with one cgroup updated
 618  * @old_cset: the baseline css_set
 619  * @cgrp: the cgroup to be updated
 620  *
 621  * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 622  * substituted into the appropriate hierarchy.
 623  */
 624 static struct css_set *find_css_set(struct css_set *old_cset,
 625                                     struct cgroup *cgrp)
 626 {
 627         struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
 628         struct css_set *cset;
 629         struct list_head tmp_links;
 630         struct cgrp_cset_link *link;
 631         unsigned long key;
 632
 633         lockdep_assert_held(&cgroup_mutex);
 634
 635         /* First see if we already have a cgroup group that matches
 636          * the desired set */
 637         down_read(&css_set_rwsem);
 638         cset = find_existing_css_set(old_cset, cgrp, template);
 639         if (cset)
 640                 get_css_set(cset);
 641         up_read(&css_set_rwsem);
 642
 643         if (cset)
 644                 return cset;
 645
 646         cset = kzalloc(sizeof(*cset), GFP_KERNEL);
 647         if (!cset)
 648                 return NULL;
 649
 650         /* Allocate all the cgrp_cset_link objects that we'll need */
 651         if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
 652                 kfree(cset);
 653                 return NULL;
 654         }
 655
 656         atomic_set(&cset->refcount, 1);
 657         INIT_LIST_HEAD(&cset->cgrp_links);
 658         INIT_LIST_HEAD(&cset->tasks);
 659         INIT_LIST_HEAD(&cset->mg_tasks);
 660         INIT_LIST_HEAD(&cset->mg_preload_node);
 661         INIT_LIST_HEAD(&cset->mg_node);
 662         INIT_HLIST_NODE(&cset->hlist);
 663
 664         /* Copy the set of subsystem state objects generated in
 665          * find_existing_css_set() */
 666         memcpy(cset->subsys, template, sizeof(cset->subsys));
 667
 668         down_write(&css_set_rwsem);
 669         /* Add reference counts and links from the new css_set. */
 670         list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 671                 struct cgroup *c = link->cgrp;
 672
 673                 if (c->root == cgrp->root)
 674                         c = cgrp;
 675                 link_css_set(&tmp_links, cset, c);
 676         }
 677
 678         BUG_ON(!list_empty(&tmp_links));
 679
 680         css_set_count++;
 681
 682         /* Add this cgroup group to the hash table */
 683         key = css_set_hash(cset->subsys);
 684         hash_add(css_set_table, &cset->hlist, key);
 685
 686         up_write(&css_set_rwsem);
 687
 688         return cset;
 689 }
 690
 691 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 692 {
 693         struct cgroup *root_cgrp = kf_root->kn->priv;
 694
 695         return root_cgrp->root;
 696 }
 697
 698 static int cgroup_init_root_id(struct cgroup_root *root)
 699 {
 700         int id;
 701
 702         lockdep_assert_held(&cgroup_mutex);
 703
 704         id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
 705         if (id < 0)
 706                 return id;
 707
 708         root->hierarchy_id = id;
 709         return 0;
 710 }
 711
 712 static void cgroup_exit_root_id(struct cgroup_root *root)
 713 {
 714         lockdep_assert_held(&cgroup_mutex);
 715
 716         if (root->hierarchy_id) {
 717                 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 718                 root->hierarchy_id = 0;
 719         }
 720 }
 721
 722 static void cgroup_free_root(struct cgroup_root *root)
 723 {
 724         if (root) {
 725                 /* hierarhcy ID shoulid already have been released */
 726                 WARN_ON_ONCE(root->hierarchy_id);
 727
 728                 idr_destroy(&root->cgroup_idr);
 729                 kfree(root);
 730         }
 731 }
 732
 733 static void cgroup_destroy_root(struct cgroup_root *root)
 734 {
 735         struct cgroup *cgrp = &root->cgrp;
 736         struct cgrp_cset_link *link, *tmp_link;
 737
 738         mutex_lock(&cgroup_tree_mutex);
 739         mutex_lock(&cgroup_mutex);
 740
 741         BUG_ON(atomic_read(&root->nr_cgrps));
 742         BUG_ON(!list_empty(&cgrp->children));
 743
 744         /* Rebind all subsystems back to the default hierarchy */
 745         rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
 746
 747         /*
 748          * Release all the links from cset_links to this hierarchy's
 749          * root cgroup
 750          */
 751         down_write(&css_set_rwsem);
 752
 753         list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
 754                 list_del(&link->cset_link);
 755                 list_del(&link->cgrp_link);
 756                 kfree(link);
 757         }
 758         up_write(&css_set_rwsem);
 759
 760         if (!list_empty(&root->root_list)) {
 761                 list_del(&root->root_list);
 762                 cgroup_root_count--;
 763         }
 764
 765         cgroup_exit_root_id(root);
 766
 767         mutex_unlock(&cgroup_mutex);
 768         mutex_unlock(&cgroup_tree_mutex);
 769
 770         kernfs_destroy_root(root->kf_root);
 771         cgroup_free_root(root);
 772 }
 773
 774 /* look up cgroup associated with given css_set on the specified hierarchy */
 775 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 776                                             struct cgroup_root *root)
 777 {
 778         struct cgroup *res = NULL;
 779
 780         lockdep_assert_held(&cgroup_mutex);
 781         lockdep_assert_held(&css_set_rwsem);
 782
 783         if (cset == &init_css_set) {
 784                 res = &root->cgrp;
 785         } else {
 786                 struct cgrp_cset_link *link;
 787
 788                 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 789                         struct cgroup *c = link->cgrp;
 790
 791                         if (c->root == root) {
 792                                 res = c;
 793                                 break;
 794                         }
 795                 }
 796         }
 797
 798         BUG_ON(!res);
 799         return res;
 800 }
 801
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
  * called with cgroup_mutex and css_set_rwsem held.
  */
 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
                                             struct cgroup_root *root)
 {
         /*
          * No need to lock the task - since we hold cgroup_mutex the
          * task can't change groups, so the only thing that can happen
          * is that it exits and its css is set back to init_css_set.
          */
         struct css_set *cset = task_css_set(task);

         return cset_cgroup_from_root(cset, root);
 }
 816
 817 /*
 818  * A task must hold cgroup_mutex to modify cgroups.
 819  *
 820  * Any task can increment and decrement the count field without lock.
 821  * So in general, code holding cgroup_mutex can't rely on the count
 822  * field not changing.  However, if the count goes to zero, then only
 823  * cgroup_attach_task() can increment it again.  Because a count of zero
 824  * means that no tasks are currently attached, therefore there is no
 825  * way a task attached to that cgroup can fork (the other way to
 826  * increment the count).  So code holding cgroup_mutex can safely
 827  * assume that if the count is zero, it will stay zero. Similarly, if
 828  * a task holds cgroup_mutex on a cgroup with zero count, it
 829  * knows that the cgroup won't be removed, as cgroup_rmdir()
 830  * needs that mutex.
 831  *
 832  * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
 833  * (usually) take cgroup_mutex.  These are the two most performance
 834  * critical pieces of code here.  The exception occurs on cgroup_exit(),
 835  * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
 836  * is taken, and if the cgroup count is zero, a usermode call made
 837  * to the release agent with the name of the cgroup (path relative to
 838  * the root of cgroup file system) as the argument.
 839  *
 840  * A cgroup can only be deleted if both its 'count' of using tasks
 841  * is zero, and its list of 'children' cgroups is empty.  Since all
 842  * tasks in the system use _some_ cgroup, and since there is always at
 843  * least one task in the system (init, pid == 1), therefore, root cgroup
 844  * always has either children cgroups and/or using tasks.  So we don't
 845  * need a special hack to ensure that root cgroup cannot be deleted.
 846  *
 847  * P.S.  One more locking exception.  RCU is used to guard the
 848  * update of a task's cgroup pointer by cgroup_attach_task().
 849  */
 850
/*
 * Forward declarations: these symbols are referenced in this part of the
 * file before their definitions appear.
 */
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
 854
 855 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 856                               char *buf)
 857 {
 858         if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
 859             !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
 860                 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
 861                          cft->ss->name, cft->name);
 862         else
 863                 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 864         return buf;
 865 }
 866
 867 /**
 868  * cgroup_file_mode - deduce file mode of a control file
 869  * @cft: the control file in question
 870  *
 871  * returns cft->mode if ->mode is not 0
 872  * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
 873  * returns S_IRUGO if it has only a read handler
 874  * returns S_IWUSR if it has only a write hander
 875  */
 876 static umode_t cgroup_file_mode(const struct cftype *cft)
 877 {
 878         umode_t mode = 0;
 879
 880         if (cft->mode)
 881                 return cft->mode;
 882
 883         if (cft->read_u64 || cft->read_s64 || cft->seq_show)
 884                 mode |= S_IRUGO;
 885
 886         if (cft->write_u64 || cft->write_s64 || cft->write_string ||
 887             cft->trigger)
 888                 mode |= S_IWUSR;
 889
 890         return mode;
 891 }
 892
 893 static void cgroup_free_fn(struct work_struct *work)
 894 {
 895         struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
 896
 897         atomic_dec(&cgrp->root->nr_cgrps);
 898         cgroup_pidlist_destroy_all(cgrp);
 899
 900         if (cgrp->parent) {
 901                 /*
 902                  * We get a ref to the parent, and put the ref when this
 903                  * cgroup is being freed, so it's guaranteed that the
 904                  * parent won't be destroyed before its children.
 905                  */
 906                 cgroup_put(cgrp->parent);
 907                 kernfs_put(cgrp->kn);
 908                 kfree(cgrp);
 909         } else {
 910                 /*
 911                  * This is root cgroup's refcnt reaching zero, which
 912                  * indicates that the root should be released.
 913                  */
 914                 cgroup_destroy_root(cgrp->root);
 915         }
 916 }
 917
 918 static void cgroup_free_rcu(struct rcu_head *head)
 919 {
 920         struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 921
 922         INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
 923         queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
 924 }
 925
 926 static void cgroup_get(struct cgroup *cgrp)
 927 {
 928         WARN_ON_ONCE(cgroup_is_dead(cgrp));
 929         WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
 930         atomic_inc(&cgrp->refcnt);
 931 }
 932
 933 static void cgroup_put(struct cgroup *cgrp)
 934 {
 935         if (!atomic_dec_and_test(&cgrp->refcnt))
 936                 return;
 937         if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
 938                 return;
 939
 940         /*
 941          * XXX: cgrp->id is only used to look up css's.  As cgroup and
 942          * css's lifetimes will be decoupled, it should be made
 943          * per-subsystem and moved to css->id so that lookups are
 944          * successful until the target css is released.
 945          */
 946         mutex_lock(&cgroup_mutex);
 947         idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
 948         mutex_unlock(&cgroup_mutex);
 949         cgrp->id = -1;
 950
 951         call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
 952 }
 953
 954 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 955 {
 956         char name[CGROUP_FILE_NAME_MAX];
 957
 958         lockdep_assert_held(&cgroup_tree_mutex);
 959         kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
 960 }
 961
 962 /**
 963  * cgroup_clear_dir - remove subsys files in a cgroup directory
 964  * @cgrp: target cgroup
 965  * @subsys_mask: mask of the subsystem ids whose files should be removed
 966  */
 967 static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 968 {
 969         struct cgroup_subsys *ss;
 970         int i;
 971
 972         for_each_subsys(ss, i) {
 973                 struct cftype *cfts;
 974
 975                 if (!test_bit(i, &subsys_mask))
 976                         continue;
 977                 list_for_each_entry(cfts, &ss->cfts, node)
 978                         cgroup_addrm_files(cgrp, cfts, false);
 979         }
 980 }
 981
/*
 * rebind_subsystems - move subsystems onto another hierarchy
 * @dst_root: hierarchy the subsystems are moved to
 * @ss_mask: bitmask of the subsystem ids to move
 *
 * Must be called with cgroup_tree_mutex and cgroup_mutex held.  Note that
 * cgroup_mutex is dropped and re-acquired internally while the files of
 * the moved subsystems are removed from their current roots.
 *
 * Returns 0 on success, -EBUSY if a subsystem cannot be moved, or the
 * error from cgroup_populate_dir() when @dst_root is not the default
 * root.  A population failure on the default root is only warned about.
 */
static int rebind_subsystems(struct cgroup_root *dst_root,
                             unsigned long ss_mask)
{
        struct cgroup_subsys *ss;
        int ssid, ret;

        lockdep_assert_held(&cgroup_tree_mutex);
        lockdep_assert_held(&cgroup_mutex);

        /* First pass: verify that every requested subsystem may be moved. */
        for_each_subsys(ss, ssid) {
                if (!(ss_mask & (1 << ssid)))
                        continue;

                /* if @ss is on the default root, we can always move it */
                if (ss->root == &cgrp_dfl_root)
                        continue;

                /* if @ss has non-root cgroups attached to it, can't move */
                if (!list_empty(&ss->root->cgrp.children))
                        return -EBUSY;

                /* can't move between two non-default roots either */
                if (dst_root != &cgrp_dfl_root)
                        return -EBUSY;
        }

        /* Create the subsystems' files in the destination root directory. */
        ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
        if (ret) {
                if (dst_root != &cgrp_dfl_root)
                        return ret;

                /*
                 * Rebinding back to the default root is not allowed to
                 * fail.  Using both default and non-default roots should
                 * be rare.  Moving subsystems back and forth even more so.
                 * Just warn about it and continue.
                 */
                if (cgrp_dfl_root_visible) {
                        pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
                                   ret, ss_mask);
                        pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
                }
        }

        /*
         * Nothing can fail from this point on.  Remove files for the
         * removed subsystems and rebind each subsystem.
         *
         * NOTE(review): cgroup_mutex is released around the file removal;
         * presumably removal must not run under cgroup_mutex - confirm.
         */
        mutex_unlock(&cgroup_mutex);
        for_each_subsys(ss, ssid)
                if (ss_mask & (1 << ssid))
                        cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
        mutex_lock(&cgroup_mutex);

        /* Second pass: hand each subsystem's root css over to @dst_root. */
        for_each_subsys(ss, ssid) {
                struct cgroup_root *src_root;
                struct cgroup_subsys_state *css;

                if (!(ss_mask & (1 << ssid)))
                        continue;

                src_root = ss->root;
                css = cgroup_css(&src_root->cgrp, ss);

                /* source must have a css for @ss, destination must not */
                WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));

                /* unhook from the source root, publish on the destination */
                RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
                rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
                ss->root = dst_root;
                css->cgroup = &dst_root->cgrp;

                src_root->subsys_mask &= ~(1 << ssid);
                src_root->cgrp.child_subsys_mask &= ~(1 << ssid);

                dst_root->subsys_mask |= 1 << ssid;
                dst_root->cgrp.child_subsys_mask |= 1 << ssid;

                /* let the subsystem react to its new root, if it cares */
                if (ss->bind)
                        ss->bind(css);
        }

        kernfs_activate(dst_root->cgrp.kn);
        return 0;
}
1066
1067 static int cgroup_show_options(struct seq_file *seq,
1068                                struct kernfs_root *kf_root)
1069 {
1070         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1071         struct cgroup_subsys *ss;
1072         int ssid;
1073
1074         for_each_subsys(ss, ssid)
1075                 if (root->subsys_mask & (1 << ssid))
1076                         seq_printf(seq, ",%s", ss->name);
1077         if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1078                 seq_puts(seq, ",sane_behavior");
1079         if (root->flags & CGRP_ROOT_NOPREFIX)
1080                 seq_puts(seq, ",noprefix");
1081         if (root->flags & CGRP_ROOT_XATTR)
1082                 seq_puts(seq, ",xattr");
1083
1084         spin_lock(&release_agent_path_lock);
1085         if (strlen(root->release_agent_path))
1086                 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1087         spin_unlock(&release_agent_path_lock);
1088
1089         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1090                 seq_puts(seq, ",clone_children");
1091         if (strlen(root->name))
1092                 seq_printf(seq, ",name=%s", root->name);
1093         return 0;
1094 }
1095
/*
 * Parsed cgroupfs mount options, filled in by parse_cgroupfs_options().
 * @release_agent and @name are kstrndup()ed copies which the users of
 * this struct kfree() when they are done.
 */
struct cgroup_sb_opts {
        /* bitmask of the requested subsystem ids */
        unsigned long subsys_mask;
        /* CGRP_ROOT_* flags requested through mount options */
        unsigned long flags;
        /* "release_agent=" value, or NULL if not given */
        char *release_agent;
        /* "clone_children" was given */
        bool cpuset_clone_children;
        /* "name=" value, or NULL if not given */
        char *name;
        /* User explicitly requested empty subsystem */
        bool none;
};
1105
1106 /*
1107  * Convert a hierarchy specifier into a bitmask of subsystems and
1108  * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1109  * array. This function takes refcounts on subsystems to be used, unless it
1110  * returns error, in which case no refcounts are taken.
1111  */
1112 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1113 {
1114         char *token, *o = data;
1115         bool all_ss = false, one_ss = false;
1116         unsigned long mask = (unsigned long)-1;
1117         struct cgroup_subsys *ss;
1118         int i;
1119
1120         BUG_ON(!mutex_is_locked(&cgroup_mutex));
1121
1122 #ifdef CONFIG_CPUSETS
1123         mask = ~(1UL << cpuset_cgrp_id);
1124 #endif
1125
1126         memset(opts, 0, sizeof(*opts));
1127
1128         while ((token = strsep(&o, ",")) != NULL) {
1129                 if (!*token)
1130                         return -EINVAL;
1131                 if (!strcmp(token, "none")) {
1132                         /* Explicitly have no subsystems */
1133                         opts->none = true;
1134                         continue;
1135                 }
1136                 if (!strcmp(token, "all")) {
1137                         /* Mutually exclusive option 'all' + subsystem name */
1138                         if (one_ss)
1139                                 return -EINVAL;
1140                         all_ss = true;
1141                         continue;
1142                 }
1143                 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1144                         opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1145                         continue;
1146                 }
1147                 if (!strcmp(token, "noprefix")) {
1148                         opts->flags |= CGRP_ROOT_NOPREFIX;
1149                         continue;
1150                 }
1151                 if (!strcmp(token, "clone_children")) {
1152                         opts->cpuset_clone_children = true;
1153                         continue;
1154                 }
1155                 if (!strcmp(token, "xattr")) {
1156                         opts->flags |= CGRP_ROOT_XATTR;
1157                         continue;
1158                 }
1159                 if (!strncmp(token, "release_agent=", 14)) {
1160                         /* Specifying two release agents is forbidden */
1161                         if (opts->release_agent)
1162                                 return -EINVAL;
1163                         opts->release_agent =
1164                                 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1165                         if (!opts->release_agent)
1166                                 return -ENOMEM;
1167                         continue;
1168                 }
1169                 if (!strncmp(token, "name=", 5)) {
1170                         const char *name = token + 5;
1171                         /* Can't specify an empty name */
1172                         if (!strlen(name))
1173                                 return -EINVAL;
1174                         /* Must match [\w.-]+ */
1175                         for (i = 0; i < strlen(name); i++) {
1176                                 char c = name[i];
1177                                 if (isalnum(c))
1178                                         continue;
1179                                 if ((c == '.') || (c == '-') || (c == '_'))
1180                                         continue;
1181                                 return -EINVAL;
1182                         }
1183                         /* Specifying two names is forbidden */
1184                         if (opts->name)
1185                                 return -EINVAL;
1186                         opts->name = kstrndup(name,
1187                                               MAX_CGROUP_ROOT_NAMELEN - 1,
1188                                               GFP_KERNEL);
1189                         if (!opts->name)
1190                                 return -ENOMEM;
1191
1192                         continue;
1193                 }
1194
1195                 for_each_subsys(ss, i) {
1196                         if (strcmp(token, ss->name))
1197                                 continue;
1198                         if (ss->disabled)
1199                                 continue;
1200
1201                         /* Mutually exclusive option 'all' + subsystem name */
1202                         if (all_ss)
1203                                 return -EINVAL;
1204                         set_bit(i, &opts->subsys_mask);
1205                         one_ss = true;
1206
1207                         break;
1208                 }
1209                 if (i == CGROUP_SUBSYS_COUNT)
1210                         return -ENOENT;
1211         }
1212
1213         /* Consistency checks */
1214
1215         if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1216                 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1217
1218                 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1219                     opts->cpuset_clone_children || opts->release_agent ||
1220                     opts->name) {
1221                         pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1222                         return -EINVAL;
1223                 }
1224         } else {
1225                 /*
1226                  * If the 'all' option was specified select all the
1227                  * subsystems, otherwise if 'none', 'name=' and a subsystem
1228                  * name options were not specified, let's default to 'all'
1229                  */
1230                 if (all_ss || (!one_ss && !opts->none && !opts->name))
1231                         for_each_subsys(ss, i)
1232                                 if (!ss->disabled)
1233                                         set_bit(i, &opts->subsys_mask);
1234
1235                 /*
1236                  * We either have to specify by name or by subsystems. (So
1237                  * all empty hierarchies must have a name).
1238                  */
1239                 if (!opts->subsys_mask && !opts->name)
1240                         return -EINVAL;
1241         }
1242
1243         /*
1244          * Option noprefix was introduced just for backward compatibility
1245          * with the old cpuset, so we allow noprefix only if mounting just
1246          * the cpuset subsystem.
1247          */
1248         if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1249                 return -EINVAL;
1250
1251
1252         /* Can't specify "none" and some subsystems */
1253         if (opts->subsys_mask && opts->none)
1254                 return -EINVAL;
1255
1256         return 0;
1257 }
1258
1259 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1260 {
1261         int ret = 0;
1262         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1263         struct cgroup_sb_opts opts;
1264         unsigned long added_mask, removed_mask;
1265
1266         if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1267                 pr_err("cgroup: sane_behavior: remount is not allowed\n");
1268                 return -EINVAL;
1269         }
1270
1271         mutex_lock(&cgroup_tree_mutex);
1272         mutex_lock(&cgroup_mutex);
1273
1274         /* See what subsystems are wanted */
1275         ret = parse_cgroupfs_options(data, &opts);
1276         if (ret)
1277                 goto out_unlock;
1278
1279         if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1280                 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1281                            task_tgid_nr(current), current->comm);
1282
1283         added_mask = opts.subsys_mask & ~root->subsys_mask;
1284         removed_mask = root->subsys_mask & ~opts.subsys_mask;
1285
1286         /* Don't allow flags or name to change at remount */
1287         if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1288             (opts.name && strcmp(opts.name, root->name))) {
1289                 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
1290                        opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1291                        root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1292                 ret = -EINVAL;
1293                 goto out_unlock;
1294         }
1295
1296         /* remounting is not allowed for populated hierarchies */
1297         if (!list_empty(&root->cgrp.children)) {
1298                 ret = -EBUSY;
1299                 goto out_unlock;
1300         }
1301
1302         ret = rebind_subsystems(root, added_mask);
1303         if (ret)
1304                 goto out_unlock;
1305
1306         rebind_subsystems(&cgrp_dfl_root, removed_mask);
1307
1308         if (opts.release_agent) {
1309                 spin_lock(&release_agent_path_lock);
1310                 strcpy(root->release_agent_path, opts.release_agent);
1311                 spin_unlock(&release_agent_path_lock);
1312         }
1313  out_unlock:
1314         kfree(opts.release_agent);
1315         kfree(opts.name);
1316         mutex_unlock(&cgroup_mutex);
1317         mutex_unlock(&cgroup_tree_mutex);
1318         return ret;
1319 }
1320
/*
 * To reduce the fork() overhead for systems that are not actually using
 * their cgroups capability, we don't maintain the lists running through
 * each css_set to its tasks until we see the list actually used - in other
 * words after the first mount.
 *
 * The flag is set to true by cgroup_enable_task_cg_lists(), with
 * css_set_rwsem held for writing.
 */
static bool use_task_css_set_links __read_mostly;
1328
1329 static void cgroup_enable_task_cg_lists(void)
1330 {
1331         struct task_struct *p, *g;
1332
1333         down_write(&css_set_rwsem);
1334
1335         if (use_task_css_set_links)
1336                 goto out_unlock;
1337
1338         use_task_css_set_links = true;
1339
1340         /*
1341          * We need tasklist_lock because RCU is not safe against
1342          * while_each_thread(). Besides, a forking task that has passed
1343          * cgroup_post_fork() without seeing use_task_css_set_links = 1
1344          * is not guaranteed to have its child immediately visible in the
1345          * tasklist if we walk through it with RCU.
1346          */
1347         read_lock(&tasklist_lock);
1348         do_each_thread(g, p) {
1349                 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1350                              task_css_set(p) != &init_css_set);
1351
1352                 /*
1353                  * We should check if the process is exiting, otherwise
1354                  * it will race with cgroup_exit() in that the list
1355                  * entry won't be deleted though the process has exited.
1356                  * Do it while holding siglock so that we don't end up
1357                  * racing against cgroup_exit().
1358                  */
1359                 spin_lock_irq(&p->sighand->siglock);
1360                 if (!(p->flags & PF_EXITING)) {
1361                         struct css_set *cset = task_css_set(p);
1362
1363                         list_add(&p->cg_list, &cset->tasks);
1364                         get_css_set(cset);
1365                 }
1366                 spin_unlock_irq(&p->sighand->siglock);
1367         } while_each_thread(g, p);
1368         read_unlock(&tasklist_lock);
1369 out_unlock:
1370         up_write(&css_set_rwsem);
1371 }
1372
1373 static void init_cgroup_housekeeping(struct cgroup *cgrp)
1374 {
1375         atomic_set(&cgrp->refcnt, 1);
1376         INIT_LIST_HEAD(&cgrp->sibling);
1377         INIT_LIST_HEAD(&cgrp->children);
1378         INIT_LIST_HEAD(&cgrp->cset_links);
1379         INIT_LIST_HEAD(&cgrp->release_list);
1380         INIT_LIST_HEAD(&cgrp->pidlists);
1381         mutex_init(&cgrp->pidlist_mutex);
1382         cgrp->dummy_css.cgroup = cgrp;
1383 }
1384
1385 static void init_cgroup_root(struct cgroup_root *root,
1386                              struct cgroup_sb_opts *opts)
1387 {
1388         struct cgroup *cgrp = &root->cgrp;
1389
1390         INIT_LIST_HEAD(&root->root_list);
1391         atomic_set(&root->nr_cgrps, 1);
1392         cgrp->root = root;
1393         init_cgroup_housekeeping(cgrp);
1394         idr_init(&root->cgroup_idr);
1395
1396         root->flags = opts->flags;
1397         if (opts->release_agent)
1398                 strcpy(root->release_agent_path, opts->release_agent);
1399         if (opts->name)
1400                 strcpy(root->name, opts->name);
1401         if (opts->cpuset_clone_children)
1402                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1403 }
1404
/*
 * cgroup_setup_root - bring a freshly initialised hierarchy root online
 * @root: the root to set up
 * @ss_mask: bitmask of the subsystems to bind to @root
 *
 * Allocates the root cgroup's id and the hierarchy id, creates the kernfs
 * root (initially deactivated), adds the base files, binds the requested
 * subsystems, links the root cgroup into every existing css_set and
 * finally activates the kernfs nodes.
 *
 * Must be called with cgroup_tree_mutex and cgroup_mutex held.  Returns 0
 * on success or a negative errno; on failure the kernfs root and the
 * hierarchy id acquired here are released again.
 */
static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
{
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
        struct css_set *cset;
        int i, ret;

        lockdep_assert_held(&cgroup_tree_mutex);
        lockdep_assert_held(&cgroup_mutex);

        /* the range [0, 1) means the root cgroup can only get id 0 */
        ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
        if (ret < 0)
                goto out;
        root_cgrp->id = ret;

        /*
         * We're accessing css_set_count without locking css_set_rwsem here,
         * but that's OK - it can only be increased by someone holding
         * cgroup_mutex, and that's us. The worst that can happen is that we
         * have some link structures left over
         */
        ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
        if (ret)
                goto out;

        ret = cgroup_init_root_id(root);
        if (ret)
                goto out;

        /* created deactivated; activated at the end once fully set up */
        root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
                                           KERNFS_ROOT_CREATE_DEACTIVATED,
                                           root_cgrp);
        if (IS_ERR(root->kf_root)) {
                ret = PTR_ERR(root->kf_root);
                goto exit_root_id;
        }
        root_cgrp->kn = root->kf_root->kn;

        ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
        if (ret)
                goto destroy_root;

        ret = rebind_subsystems(root, ss_mask);
        if (ret)
                goto destroy_root;

        /*
         * There must be no failure case after here, since rebinding takes
         * care of subsystems' refcounts, which are explicitly dropped in
         * the failure exit path.
         *
         * NOTE(review): no refcount handling is visible in the failure
         * path below; this rationale may be stale - confirm.
         */
        list_add(&root->root_list, &cgroup_roots);
        cgroup_root_count++;

        /*
         * Link the root cgroup in this hierarchy into all the css_set
         * objects.
         */
        down_write(&css_set_rwsem);
        hash_for_each(css_set_table, i, cset, hlist)
                link_css_set(&tmp_links, cset, root_cgrp);
        up_write(&css_set_rwsem);

        /* a brand-new root has no children and exactly one cgroup */
        BUG_ON(!list_empty(&root_cgrp->children));
        BUG_ON(atomic_read(&root->nr_cgrps) != 1);

        kernfs_activate(root_cgrp->kn);
        ret = 0;
        /* success also goes through "out" to free any unused links */
        goto out;

destroy_root:
        kernfs_destroy_root(root->kf_root);
        root->kf_root = NULL;
exit_root_id:
        cgroup_exit_root_id(root);
out:
        free_cgrp_cset_links(&tmp_links);
        return ret;
}
1484
/*
 * cgroup_mount - mount callback of the "cgroup" filesystem type
 * @fs_type: filesystem type, passed on to kernfs_mount()
 * @flags: mount flags, passed on to kernfs_mount()
 * @unused_dev_name: ignored
 * @data: mount option string, parsed by parse_cgroupfs_options()
 *
 * Selects the hierarchy to mount: the default root when neither
 * subsystems, "none" nor a name were given, otherwise an existing root
 * matching the options, otherwise a newly created root.  A reference on
 * the chosen root's cgroup is held across the call to kernfs_mount().
 *
 * Returns the dentry from kernfs_mount() or an ERR_PTR() on failure.
 */
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                         int flags, const char *unused_dev_name,
                         void *data)
{
        struct cgroup_root *root;
        struct cgroup_sb_opts opts;
        struct dentry *dentry;
        int ret;
        bool new_sb;

        /*
         * The first time anyone tries to mount a cgroup, enable the list
         * linking each css_set to its tasks and fix up all existing tasks.
         */
        if (!use_task_css_set_links)
                cgroup_enable_task_cg_lists();

        mutex_lock(&cgroup_tree_mutex);
        mutex_lock(&cgroup_mutex);

        /* First find the desired set of subsystems */
        ret = parse_cgroupfs_options(data, &opts);
        if (ret)
                goto out_unlock;
retry:
        /* no subsystems, no "none" and no name selects the default root */
        if (!opts.subsys_mask && !opts.none && !opts.name) {
                cgrp_dfl_root_visible = true;
                root = &cgrp_dfl_root;
                cgroup_get(&root->cgrp);
                ret = 0;
                goto out_unlock;
        }

        /* look for a matching existing root */
        for_each_root(root) {
                bool name_match = false;

                if (root == &cgrp_dfl_root)
                        continue;

                /*
                 * If we asked for a name then it must match.  Also, if
                 * name matches but subsys_mask doesn't, we should fail.
                 * Remember whether name matched.
                 */
                if (opts.name) {
                        if (strcmp(opts.name, root->name))
                                continue;
                        name_match = true;
                }

                /*
                 * If we asked for subsystems (or explicitly for no
                 * subsystems) then they must match.
                 */
                if ((opts.subsys_mask || opts.none) &&
                    (opts.subsys_mask != root->subsys_mask)) {
                        if (!name_match)
                                continue;
                        ret = -EBUSY;
                        goto out_unlock;
                }

                /*
                 * Differing option flags are fatal only when sane_behavior
                 * is involved on either side; otherwise the new options
                 * are ignored with a warning.
                 */
                if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
                        if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
                                pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
                                ret = -EINVAL;
                                goto out_unlock;
                        } else {
                                pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
                        }
                }

                /*
                 * A root's lifetime is governed by its root cgroup.  Zero
                 * ref indicate that the root is being destroyed.  Wait for
                 * destruction to complete so that the subsystems are free.
                 * We can use wait_queue for the wait but this path is
                 * super cold.  Let's just sleep for a bit and retry.
                 */
                if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
                        /* both mutexes are dropped while sleeping */
                        mutex_unlock(&cgroup_mutex);
                        mutex_unlock(&cgroup_tree_mutex);
                        msleep(10);
                        mutex_lock(&cgroup_tree_mutex);
                        mutex_lock(&cgroup_mutex);
                        goto retry;
                }

                /* matching root found and a reference on it acquired */
                ret = 0;
                goto out_unlock;
        }

        /*
         * No such thing, create a new one.  name= matching without subsys
         * specification is allowed for already existing hierarchies but we
         * can't create new one without subsys specification.
         */
        if (!opts.subsys_mask && !opts.none) {
                ret = -EINVAL;
                goto out_unlock;
        }

        root = kzalloc(sizeof(*root), GFP_KERNEL);
        if (!root) {
                ret = -ENOMEM;
                goto out_unlock;
        }

        init_cgroup_root(root, &opts);

        ret = cgroup_setup_root(root, opts.subsys_mask);
        if (ret)
                cgroup_free_root(root);

out_unlock:
        mutex_unlock(&cgroup_mutex);
        mutex_unlock(&cgroup_tree_mutex);

        /* the option strings were allocated by parse_cgroupfs_options() */
        kfree(opts.release_agent);
        kfree(opts.name);

        if (ret)
                return ERR_PTR(ret);

        /*
         * Keep the reference taken above only when a new superblock was
         * created; cgroup_kill_sb() puts it again when that superblock
         * goes away.  On failure or superblock reuse, drop it here.
         */
        dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
        if (IS_ERR(dentry) || !new_sb)
                cgroup_put(&root->cgrp);
        return dentry;
}
1615
1616 static void cgroup_kill_sb(struct super_block *sb)
1617 {
1618         struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1619         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1620
1621         cgroup_put(&root->cgrp);
1622         kernfs_kill_sb(sb);
1623 }
1624
1625 static struct file_system_type cgroup_fs_type = {
1626         .name = "cgroup",
1627         .mount = cgroup_mount,
1628         .kill_sb = cgroup_kill_sb,
1629 };
1630
1631 static struct kobject *cgroup_kobj;
1632
1633 /**
1634  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1635  * @task: target task
1636  * @buf: the buffer to write the path into
1637  * @buflen: the length of the buffer
1638  *
1639  * Determine @task's cgroup on the first (the one with the lowest non-zero
1640  * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
1641  * function grabs cgroup_mutex and shouldn't be used inside locks used by
1642  * cgroup controller callbacks.
1643  *
1644  * Return value is the same as kernfs_path().
1645  */
1646 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1647 {
1648         struct cgroup_root *root;
1649         struct cgroup *cgrp;
1650         int hierarchy_id = 1;
1651         char *path = NULL;
1652
1653         mutex_lock(&cgroup_mutex);
1654         down_read(&css_set_rwsem);
1655
1656         root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1657
1658         if (root) {
1659                 cgrp = task_cgroup_from_root(task, root);
1660                 path = cgroup_path(cgrp, buf, buflen);
1661         } else {
1662                 /* if no hierarchy exists, everyone is in "/" */
1663                 if (strlcpy(buf, "/", buflen) < buflen)
1664                         path = buf;
1665         }
1666
1667         up_read(&css_set_rwsem);
1668         mutex_unlock(&cgroup_mutex);
1669         return path;
1670 }
1671 EXPORT_SYMBOL_GPL(task_cgroup_path);
1672
1673 /* used to track tasks and other necessary states during migration */
1674 struct cgroup_taskset {
1675         /* the src and dst cset list running through cset->mg_node */
1676         struct list_head        src_csets;
1677         struct list_head        dst_csets;
1678
1679         /*
1680          * Fields for cgroup_taskset_*() iteration.
1681          *
1682          * Before migration is committed, the target migration tasks are on
1683          * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
1684          * the csets on ->dst_csets.  ->csets point to either ->src_csets
1685          * or ->dst_csets depending on whether migration is committed.
1686          *
1687          * ->cur_csets and ->cur_task point to the current task position
1688          * during iteration.
1689          */
1690         struct list_head        *csets;
1691         struct css_set          *cur_cset;
1692         struct task_struct      *cur_task;
1693 };
1694
1695 /**
1696  * cgroup_taskset_first - reset taskset and return the first task
1697  * @tset: taskset of interest
1698  *
1699  * @tset iteration is initialized and the first task is returned.
1700  */
1701 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1702 {
1703         tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1704         tset->cur_task = NULL;
1705
1706         return cgroup_taskset_next(tset);
1707 }
1708
1709 /**
1710  * cgroup_taskset_next - iterate to the next task in taskset
1711  * @tset: taskset of interest
1712  *
1713  * Return the next task in @tset.  Iteration must have been initialized
1714  * with cgroup_taskset_first().
1715  */
1716 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1717 {
1718         struct css_set *cset = tset->cur_cset;
1719         struct task_struct *task = tset->cur_task;
1720
1721         while (&cset->mg_node != tset->csets) {
1722                 if (!task)
1723                         task = list_first_entry(&cset->mg_tasks,
1724                                                 struct task_struct, cg_list);
1725                 else
1726                         task = list_next_entry(task, cg_list);
1727
1728                 if (&task->cg_list != &cset->mg_tasks) {
1729                         tset->cur_cset = cset;
1730                         tset->cur_task = task;
1731                         return task;
1732                 }
1733
1734                 cset = list_next_entry(cset, mg_node);
1735                 task = NULL;
1736         }
1737
1738         return NULL;
1739 }
1740
1741 /**
1742  * cgroup_task_migrate - move a task from one cgroup to another.
1743  * @old_cgrp; the cgroup @tsk is being migrated from
1744  * @tsk: the task being migrated
1745  * @new_cset: the new css_set @tsk is being attached to
1746  *
1747  * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
1748  */
1749 static void cgroup_task_migrate(struct cgroup *old_cgrp,
1750                                 struct task_struct *tsk,
1751                                 struct css_set *new_cset)
1752 {
1753         struct css_set *old_cset;
1754
1755         lockdep_assert_held(&cgroup_mutex);
1756         lockdep_assert_held(&css_set_rwsem);
1757
1758         /*
1759          * We are synchronized through threadgroup_lock() against PF_EXITING
1760          * setting such that we can't race against cgroup_exit() changing the
1761          * css_set to init_css_set and dropping the old one.
1762          */
1763         WARN_ON_ONCE(tsk->flags & PF_EXITING);
1764         old_cset = task_css_set(tsk);
1765
1766         get_css_set(new_cset);
1767         rcu_assign_pointer(tsk->cgroups, new_cset);
1768
1769         /*
1770          * Use move_tail so that cgroup_taskset_first() still returns the
1771          * leader after migration.  This works because cgroup_migrate()
1772          * ensures that the dst_cset of the leader is the first on the
1773          * tset's dst_csets list.
1774          */
1775         list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
1776
1777         /*
1778          * We just gained a reference on old_cset by taking it from the
1779          * task. As trading it for new_cset is protected by cgroup_mutex,
1780          * we're safe to drop it here; it will be freed under RCU.
1781          */
1782         set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1783         put_css_set_locked(old_cset, false);
1784 }
1785
1786 /**
1787  * cgroup_migrate_finish - cleanup after attach
1788  * @preloaded_csets: list of preloaded css_sets
1789  *
1790  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
1791  * those functions for details.
1792  */
1793 static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1794 {
1795         struct css_set *cset, *tmp_cset;
1796
1797         lockdep_assert_held(&cgroup_mutex);
1798
1799         down_write(&css_set_rwsem);
1800         list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1801                 cset->mg_src_cgrp = NULL;
1802                 cset->mg_dst_cset = NULL;
1803                 list_del_init(&cset->mg_preload_node);
1804                 put_css_set_locked(cset, false);
1805         }
1806         up_write(&css_set_rwsem);
1807 }
1808
1809 /**
1810  * cgroup_migrate_add_src - add a migration source css_set
1811  * @src_cset: the source css_set to add
1812  * @dst_cgrp: the destination cgroup
1813  * @preloaded_csets: list of preloaded css_sets
1814  *
1815  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
1816  * @src_cset and add it to @preloaded_csets, which should later be cleaned
1817  * up by cgroup_migrate_finish().
1818  *
1819  * This function may be called without holding threadgroup_lock even if the
1820  * target is a process.  Threads may be created and destroyed but as long
1821  * as cgroup_mutex is not dropped, no new css_set can be put into play and
1822  * the preloaded css_sets are guaranteed to cover all migrations.
1823  */
1824 static void cgroup_migrate_add_src(struct css_set *src_cset,
1825                                    struct cgroup *dst_cgrp,
1826                                    struct list_head *preloaded_csets)
1827 {
1828         struct cgroup *src_cgrp;
1829
1830         lockdep_assert_held(&cgroup_mutex);
1831         lockdep_assert_held(&css_set_rwsem);
1832
1833         src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1834
1835         /* nothing to do if this cset already belongs to the cgroup */
1836         if (src_cgrp == dst_cgrp)
1837                 return;
1838
1839         if (!list_empty(&src_cset->mg_preload_node))
1840                 return;
1841
1842         WARN_ON(src_cset->mg_src_cgrp);
1843         WARN_ON(!list_empty(&src_cset->mg_tasks));
1844         WARN_ON(!list_empty(&src_cset->mg_node));
1845
1846         src_cset->mg_src_cgrp = src_cgrp;
1847         get_css_set(src_cset);
1848         list_add(&src_cset->mg_preload_node, preloaded_csets);
1849 }
1850
1851 /**
1852  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1853  * @dst_cgrp: the destination cgroup
1854  * @preloaded_csets: list of preloaded source css_sets
1855  *
1856  * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1857  * have been preloaded to @preloaded_csets.  This function looks up and
1858  * pins all destination css_sets, links each to its source, and put them on
1859  * @preloaded_csets.
1860  *
1861  * This function must be called after cgroup_migrate_add_src() has been
1862  * called on each migration source css_set.  After migration is performed
1863  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
1864  * @preloaded_csets.
1865  */
1866 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1867                                       struct list_head *preloaded_csets)
1868 {
1869         LIST_HEAD(csets);
1870         struct css_set *src_cset;
1871
1872         lockdep_assert_held(&cgroup_mutex);
1873
1874         /* look up the dst cset for each src cset and link it to src */
1875         list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
1876                 struct css_set *dst_cset;
1877
1878                 dst_cset = find_css_set(src_cset, dst_cgrp);
1879                 if (!dst_cset)
1880                         goto err;
1881
1882                 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
1883                 src_cset->mg_dst_cset = dst_cset;
1884
1885                 if (list_empty(&dst_cset->mg_preload_node))
1886                         list_add(&dst_cset->mg_preload_node, &csets);
1887                 else
1888                         put_css_set(dst_cset, false);
1889         }
1890
1891         list_splice(&csets, preloaded_csets);
1892         return 0;
1893 err:
1894         cgroup_migrate_finish(&csets);
1895         return -ENOMEM;
1896 }
1897
1898 /**
1899  * cgroup_migrate - migrate a process or task to a cgroup
1900  * @cgrp: the destination cgroup
1901  * @leader: the leader of the process or the task to migrate
1902  * @threadgroup: whether @leader points to the whole process or a single task
1903  *
1904  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
1905  * process, the caller must be holding threadgroup_lock of @leader.  The
1906  * caller is also responsible for invoking cgroup_migrate_add_src() and
1907  * cgroup_migrate_prepare_dst() on the targets before invoking this
1908  * function and following up with cgroup_migrate_finish().
1909  *
1910  * As long as a controller's ->can_attach() doesn't fail, this function is
1911  * guaranteed to succeed.  This means that, excluding ->can_attach()
1912  * failure, when migrating multiple targets, the success or failure can be
1913  * decided for all targets by invoking group_migrate_prepare_dst() before
1914  * actually starting migrating.
1915  */
1916 static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1917                           bool threadgroup)
1918 {
1919         struct cgroup_taskset tset = {
1920                 .src_csets      = LIST_HEAD_INIT(tset.src_csets),
1921                 .dst_csets      = LIST_HEAD_INIT(tset.dst_csets),
1922                 .csets          = &tset.src_csets,
1923         };
1924         struct cgroup_subsys_state *css, *failed_css = NULL;
1925         struct css_set *cset, *tmp_cset;
1926         struct task_struct *task, *tmp_task;
1927         int i, ret;
1928
1929         /*
1930          * Prevent freeing of tasks while we take a snapshot. Tasks that are
1931          * already PF_EXITING could be freed from underneath us unless we
1932          * take an rcu_read_lock.
1933          */
1934         down_write(&css_set_rwsem);
1935         rcu_read_lock();
1936         task = leader;
1937         do {
1938                 /* @task either already exited or can't exit until the end */
1939                 if (task->flags & PF_EXITING)
1940                         goto next;
1941
1942                 /* leave @task alone if post_fork() hasn't linked it yet */
1943                 if (list_empty(&task->cg_list))
1944                         goto next;
1945
1946                 cset = task_css_set(task);
1947                 if (!cset->mg_src_cgrp)
1948                         goto next;
1949
1950                 /*
1951                  * cgroup_taskset_first() must always return the leader.
1952                  * Take care to avoid disturbing the ordering.
1953                  */
1954                 list_move_tail(&task->cg_list, &cset->mg_tasks);
1955                 if (list_empty(&cset->mg_node))
1956                         list_add_tail(&cset->mg_node, &tset.src_csets);
1957                 if (list_empty(&cset->mg_dst_cset->mg_node))
1958                         list_move_tail(&cset->mg_dst_cset->mg_node,
1959                                        &tset.dst_csets);
1960         next:
1961                 if (!threadgroup)
1962                         break;
1963         } while_each_thread(leader, task);
1964         rcu_read_unlock();
1965         up_write(&css_set_rwsem);
1966
1967         /* methods shouldn't be called if no task is actually migrating */
1968         if (list_empty(&tset.src_csets))
1969                 return 0;
1970
1971         /* check that we can legitimately attach to the cgroup */
1972         for_each_css(css, i, cgrp) {
1973                 if (css->ss->can_attach) {
1974                         ret = css->ss->can_attach(css, &tset);
1975                         if (ret) {
1976                                 failed_css = css;
1977                                 goto out_cancel_attach;
1978                         }
1979                 }
1980         }
1981
1982         /*
1983          * Now that we're guaranteed success, proceed to move all tasks to
1984          * the new cgroup.  There are no failure cases after here, so this
1985          * is the commit point.
1986          */
1987         down_write(&css_set_rwsem);
1988         list_for_each_entry(cset, &tset.src_csets, mg_node) {
1989                 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
1990                         cgroup_task_migrate(cset->mg_src_cgrp, task,
1991                                             cset->mg_dst_cset);
1992         }
1993         up_write(&css_set_rwsem);
1994
1995         /*
1996          * Migration is committed, all target tasks are now on dst_csets.
1997          * Nothing is sensitive to fork() after this point.  Notify
1998          * controllers that migration is complete.
1999          */
2000         tset.csets = &tset.dst_csets;
2001
2002         for_each_css(css, i, cgrp)
2003                 if (css->ss->attach)
2004                         css->ss->attach(css, &tset);
2005
2006         ret = 0;
2007         goto out_release_tset;
2008
2009 out_cancel_attach:
2010         for_each_css(css, i, cgrp) {
2011                 if (css == failed_css)
2012                         break;
2013                 if (css->ss->cancel_attach)
2014                         css->ss->cancel_attach(css, &tset);
2015         }
2016 out_release_tset:
2017         down_write(&css_set_rwsem);
2018         list_splice_init(&tset.dst_csets, &tset.src_csets);
2019         list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2020                 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2021                 list_del_init(&cset->mg_node);
2022         }
2023         up_write(&css_set_rwsem);
2024         return ret;
2025 }
2026
2027 /**
2028  * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2029  * @dst_cgrp: the cgroup to attach to
2030  * @leader: the task or the leader of the threadgroup to be attached
2031  * @threadgroup: attach the whole threadgroup?
2032  *
2033  * Call holding cgroup_mutex and threadgroup_lock of @leader.
2034  */
2035 static int cgroup_attach_task(struct cgroup *dst_cgrp,
2036                               struct task_struct *leader, bool threadgroup)
2037 {
2038         LIST_HEAD(preloaded_csets);
2039         struct task_struct *task;
2040         int ret;
2041
2042         /* look up all src csets */
2043         down_read(&css_set_rwsem);
2044         rcu_read_lock();
2045         task = leader;
2046         do {
2047                 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2048                                        &preloaded_csets);
2049                 if (!threadgroup)
2050                         break;
2051         } while_each_thread(leader, task);
2052         rcu_read_unlock();
2053         up_read(&css_set_rwsem);
2054
2055         /* prepare dst csets and commit */
2056         ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2057         if (!ret)
2058                 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2059
2060         cgroup_migrate_finish(&preloaded_csets);
2061         return ret;
2062 }
2063
2064 /*
2065  * Find the task_struct of the task to attach by vpid and pass it along to the
2066  * function to attach either it or all tasks in its threadgroup. Will lock
2067  * cgroup_mutex and threadgroup.
2068  */
2069 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2070 {
2071         struct task_struct *tsk;
2072         const struct cred *cred = current_cred(), *tcred;
2073         int ret;
2074
2075         if (!cgroup_lock_live_group(cgrp))
2076                 return -ENODEV;
2077
2078 retry_find_task:
2079         rcu_read_lock();
2080         if (pid) {
2081                 tsk = find_task_by_vpid(pid);
2082                 if (!tsk) {
2083                         rcu_read_unlock();
2084                         ret = -ESRCH;
2085                         goto out_unlock_cgroup;
2086                 }
2087                 /*
2088                  * even if we're attaching all tasks in the thread group, we
2089                  * only need to check permissions on one of them.
2090                  */
2091                 tcred = __task_cred(tsk);
2092                 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2093                     !uid_eq(cred->euid, tcred->uid) &&
2094                     !uid_eq(cred->euid, tcred->suid)) {
2095                         rcu_read_unlock();
2096                         ret = -EACCES;
2097                         goto out_unlock_cgroup;
2098                 }
2099         } else
2100                 tsk = current;
2101
2102         if (threadgroup)
2103                 tsk = tsk->group_leader;
2104
2105         /*
2106          * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2107          * trapped in a cpuset, or RT worker may be born in a cgroup
2108          * with no rt_runtime allocated.  Just say no.
2109          */
2110         if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2111                 ret = -EINVAL;
2112                 rcu_read_unlock();
2113                 goto out_unlock_cgroup;
2114         }
2115
2116         get_task_struct(tsk);
2117         rcu_read_unlock();
2118
2119         threadgroup_lock(tsk);
2120         if (threadgroup) {
2121                 if (!thread_group_leader(tsk)) {
2122                         /*
2123                          * a race with de_thread from another thread's exec()
2124                          * may strip us of our leadership, if this happens,
2125                          * there is no choice but to throw this task away and
2126                          * try again; this is
2127                          * "double-double-toil-and-trouble-check locking".
2128                          */
2129                         threadgroup_unlock(tsk);
2130                         put_task_struct(tsk);
2131                         goto retry_find_task;
2132                 }
2133         }
2134
2135         ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2136
2137         threadgroup_unlock(tsk);
2138
2139         put_task_struct(tsk);
2140 out_unlock_cgroup:
2141         mutex_unlock(&cgroup_mutex);
2142         return ret;
2143 }
2144
2145 /**
2146  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2147  * @from: attach to all cgroups of a given task
2148  * @tsk: the task to be attached
2149  */
2150 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2151 {
2152         struct cgroup_root *root;
2153         int retval = 0;
2154
2155         mutex_lock(&cgroup_mutex);
2156         for_each_root(root) {
2157                 struct cgroup *from_cgrp;
2158
2159                 if (root == &cgrp_dfl_root)
2160                         continue;
2161
2162                 down_read(&css_set_rwsem);
2163                 from_cgrp = task_cgroup_from_root(from, root);
2164                 up_read(&css_set_rwsem);
2165
2166                 retval = cgroup_attach_task(from_cgrp, tsk, false);
2167                 if (retval)
2168                         break;
2169         }
2170         mutex_unlock(&cgroup_mutex);
2171
2172         return retval;
2173 }
2174 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2175
2176 static int cgroup_tasks_write(struct cgroup_subsys_state *css,
2177                               struct cftype *cft, u64 pid)
2178 {
2179         return attach_task_by_pid(css->cgroup, pid, false);
2180 }
2181
2182 static int cgroup_procs_write(struct cgroup_subsys_state *css,
2183                               struct cftype *cft, u64 tgid)
2184 {
2185         return attach_task_by_pid(css->cgroup, tgid, true);
2186 }
2187
2188 static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2189                                       struct cftype *cft, char *buffer)
2190 {
2191         struct cgroup_root *root = css->cgroup->root;
2192
2193         BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
2194         if (!cgroup_lock_live_group(css->cgroup))
2195                 return -ENODEV;
2196         spin_lock(&release_agent_path_lock);
2197         strlcpy(root->release_agent_path, buffer,
2198                 sizeof(root->release_agent_path));
2199         spin_unlock(&release_agent_path_lock);
2200         mutex_unlock(&cgroup_mutex);
2201         return 0;
2202 }
2203
2204 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2205 {
2206         struct cgroup *cgrp = seq_css(seq)->cgroup;
2207
2208         if (!cgroup_lock_live_group(cgrp))
2209                 return -ENODEV;
2210         seq_puts(seq, cgrp->root->release_agent_path);
2211         seq_putc(seq, '\n');
2212         mutex_unlock(&cgroup_mutex);
2213         return 0;
2214 }
2215
2216 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2217 {
2218         struct cgroup *cgrp = seq_css(seq)->cgroup;
2219
2220         seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2221         return 0;
2222 }
2223
2224 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2225                                  size_t nbytes, loff_t off)
2226 {
2227         struct cgroup *cgrp = of->kn->parent->priv;
2228         struct cftype *cft = of->kn->priv;
2229         struct cgroup_subsys_state *css;
2230         int ret;
2231
2232         /*
2233          * kernfs guarantees that a file isn't deleted with operations in
2234          * flight, which means that the matching css is and stays alive and
2235          * doesn't need to be pinned.  The RCU locking is not necessary
2236          * either.  It's just for the convenience of using cgroup_css().
2237          */
2238         rcu_read_lock();
2239         css = cgroup_css(cgrp, cft->ss);
2240         rcu_read_unlock();
2241
2242         if (cft->write_string) {
2243                 ret = cft->write_string(css, cft, strstrip(buf));
2244         } else if (cft->write_u64) {
2245                 unsigned long long v;
2246                 ret = kstrtoull(buf, 0, &v);
2247                 if (!ret)
2248                         ret = cft->write_u64(css, cft, v);
2249         } else if (cft->write_s64) {
2250                 long long v;
2251                 ret = kstrtoll(buf, 0, &v);
2252                 if (!ret)
2253                         ret = cft->write_s64(css, cft, v);
2254         } else if (cft->trigger) {
2255                 ret = cft->trigger(css, (unsigned int)cft->private);
2256         } else {
2257                 ret = -EINVAL;
2258         }
2259
2260         return ret ?: nbytes;
2261 }
2262
2263 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2264 {
2265         return seq_cft(seq)->seq_start(seq, ppos);
2266 }
2267
2268 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2269 {
2270         return seq_cft(seq)->seq_next(seq, v, ppos);
2271 }
2272
2273 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2274 {
2275         seq_cft(seq)->seq_stop(seq, v);
2276 }
2277
2278 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2279 {
2280         struct cftype *cft = seq_cft(m);
2281         struct cgroup_subsys_state *css = seq_css(m);
2282
2283         if (cft->seq_show)
2284                 return cft->seq_show(m, arg);
2285
2286         if (cft->read_u64)
2287                 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2288         else if (cft->read_s64)
2289                 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2290         else
2291                 return -EINVAL;
2292         return 0;
2293 }
2294
2295 static struct kernfs_ops cgroup_kf_single_ops = {
2296         .atomic_write_len       = PAGE_SIZE,
2297         .write                  = cgroup_file_write,
2298         .seq_show               = cgroup_seqfile_show,
2299 };
2300
2301 static struct kernfs_ops cgroup_kf_ops = {
2302         .atomic_write_len       = PAGE_SIZE,
2303         .write                  = cgroup_file_write,
2304         .seq_start              = cgroup_seqfile_start,
2305         .seq_next               = cgroup_seqfile_next,
2306         .seq_stop               = cgroup_seqfile_stop,
2307         .seq_show               = cgroup_seqfile_show,
2308 };
2309
2310 /*
2311  * cgroup_rename - Only allow simple rename of directories in place.
2312  */
2313 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2314                          const char *new_name_str)
2315 {
2316         struct cgroup *cgrp = kn->priv;
2317         int ret;
2318
2319         if (kernfs_type(kn) != KERNFS_DIR)
2320                 return -ENOTDIR;
2321         if (kn->parent != new_parent)
2322                 return -EIO;
2323
2324         /*
2325          * This isn't a proper migration and its usefulness is very
2326          * limited.  Disallow if sane_behavior.
2327          */
2328         if (cgroup_sane_behavior(cgrp))
2329                 return -EPERM;
2330
2331         /*
2332          * We're gonna grab cgroup_tree_mutex which nests outside kernfs
2333          * active_ref.  kernfs_rename() doesn't require active_ref
2334          * protection.  Break them before grabbing cgroup_tree_mutex.
2335          */
2336         kernfs_break_active_protection(new_parent);
2337         kernfs_break_active_protection(kn);
2338
2339         mutex_lock(&cgroup_tree_mutex);
2340         mutex_lock(&cgroup_mutex);
2341
2342         ret = kernfs_rename(kn, new_parent, new_name_str);
2343
2344         mutex_unlock(&cgroup_mutex);
2345         mutex_unlock(&cgroup_tree_mutex);
2346
2347         kernfs_unbreak_active_protection(kn);
2348         kernfs_unbreak_active_protection(new_parent);
2349         return ret;
2350 }
2351
2352 /* set uid and gid of cgroup dirs and files to that of the creator */
2353 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
2354 {
2355         struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
2356                                .ia_uid = current_fsuid(),
2357                                .ia_gid = current_fsgid(), };
2358
2359         if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
2360             gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
2361                 return 0;
2362
2363         return kernfs_setattr(kn, &iattr);
2364 }
2365
2366 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2367 {
2368         char name[CGROUP_FILE_NAME_MAX];
2369         struct kernfs_node *kn;
2370         struct lock_class_key *key = NULL;
2371         int ret;
2372
2373 #ifdef CONFIG_DEBUG_LOCK_ALLOC
2374         key = &cft->lockdep_key;
2375 #endif
2376         kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2377                                   cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2378                                   NULL, false, key);
2379         if (IS_ERR(kn))
2380                 return PTR_ERR(kn);
2381
2382         ret = cgroup_kn_set_ugid(kn);
2383         if (ret)
2384                 kernfs_remove(kn);
2385         return ret;
2386 }
2387
2388 /**
2389  * cgroup_addrm_files - add or remove files to a cgroup directory
2390  * @cgrp: the target cgroup
2391  * @cfts: array of cftypes to be added
2392  * @is_add: whether to add or remove
2393  *
2394  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2395  * For removals, this function never fails.  If addition fails, this
2396  * function doesn't remove files already added.  The caller is responsible
2397  * for cleaning up.
2398  */
2399 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2400                               bool is_add)
2401 {
2402         struct cftype *cft;
2403         int ret;
2404
2405         lockdep_assert_held(&cgroup_tree_mutex);
2406
2407         for (cft = cfts; cft->name[0] != '\0'; cft++) {
2408                 /* does cft->flags tell us to skip this file on @cgrp? */
2409                 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2410                         continue;
2411                 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2412                         continue;
2413                 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2414                         continue;
2415                 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2416                         continue;
2417
2418                 if (is_add) {
2419                         ret = cgroup_add_file(cgrp, cft);
2420                         if (ret) {
2421                                 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2422                                         cft->name, ret);
2423                                 return ret;
2424                         }
2425                 } else {
2426                         cgroup_rm_file(cgrp, cft);
2427                 }
2428         }
2429         return 0;
2430 }
2431
2432 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2433 {
2434         LIST_HEAD(pending);
2435         struct cgroup_subsys *ss = cfts[0].ss;
2436         struct cgroup *root = &ss->root->cgrp;
2437         struct cgroup_subsys_state *css;
2438         int ret = 0;
2439
2440         lockdep_assert_held(&cgroup_tree_mutex);
2441
2442         /* add/rm files for all cgroups created before */
2443         css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2444                 struct cgroup *cgrp = css->cgroup;
2445
2446                 if (cgroup_is_dead(cgrp))
2447                         continue;
2448
2449                 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2450                 if (ret)
2451                         break;
2452         }
2453
2454         if (is_add && !ret)
2455                 kernfs_activate(root->kn);
2456         return ret;
2457 }
2458
2459 static void cgroup_exit_cftypes(struct cftype *cfts)
2460 {
2461         struct cftype *cft;
2462
2463         for (cft = cfts; cft->name[0] != '\0'; cft++) {
2464                 /* free copy for custom atomic_write_len, see init_cftypes() */
2465                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2466                         kfree(cft->kf_ops);
2467                 cft->kf_ops = NULL;
2468                 cft->ss = NULL;
2469         }
2470 }
2471
2472 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2473 {
2474         struct cftype *cft;
2475
2476         for (cft = cfts; cft->name[0] != '\0'; cft++) {
2477                 struct kernfs_ops *kf_ops;
2478
2479                 WARN_ON(cft->ss || cft->kf_ops);
2480
2481                 if (cft->seq_start)
2482                         kf_ops = &cgroup_kf_ops;
2483                 else
2484                         kf_ops = &cgroup_kf_single_ops;
2485
2486                 /*
2487                  * Ugh... if @cft wants a custom max_write_len, we need to
2488                  * make a copy of kf_ops to set its atomic_write_len.
2489                  */
2490                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
2491                         kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
2492                         if (!kf_ops) {
2493                                 cgroup_exit_cftypes(cfts);
2494                                 return -ENOMEM;
2495                         }
2496                         kf_ops->atomic_write_len = cft->max_write_len;
2497                 }
2498
2499                 cft->kf_ops = kf_ops;
2500                 cft->ss = ss;
2501         }
2502
2503         return 0;
2504 }
2505
2506 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2507 {
2508         lockdep_assert_held(&cgroup_tree_mutex);
2509
2510         if (!cfts || !cfts[0].ss)
2511                 return -ENOENT;
2512
2513         list_del(&cfts->node);
2514         cgroup_apply_cftypes(cfts, false);
2515         cgroup_exit_cftypes(cfts);
2516         return 0;
2517 }
2518
2519 /**
2520  * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2521  * @cfts: zero-length name terminated array of cftypes
2522  *
2523  * Unregister @cfts.  Files described by @cfts are removed from all
2524  * existing cgroups and all future cgroups won't have them either.  This
2525  * function can be called anytime whether @cfts' subsys is attached or not.
2526  *
2527  * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2528  * registered.
2529  */
2530 int cgroup_rm_cftypes(struct cftype *cfts)
2531 {
2532         int ret;
2533
2534         mutex_lock(&cgroup_tree_mutex);
2535         ret = cgroup_rm_cftypes_locked(cfts);
2536         mutex_unlock(&cgroup_tree_mutex);
2537         return ret;
2538 }
2539
2540 /**
2541  * cgroup_add_cftypes - add an array of cftypes to a subsystem
2542  * @ss: target cgroup subsystem
2543  * @cfts: zero-length name terminated array of cftypes
2544  *
2545  * Register @cfts to @ss.  Files described by @cfts are created for all
2546  * existing cgroups to which @ss is attached and all future cgroups will
2547  * have them too.  This function can be called anytime whether @ss is
2548  * attached or not.
2549  *
2550  * Returns 0 on successful registration, -errno on failure.  Note that this
2551  * function currently returns 0 as long as @cfts registration is successful
2552  * even if some file creation attempts on existing cgroups fail.
2553  */
2554 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2555 {
2556         int ret;
2557
2558         if (!cfts || cfts[0].name[0] == '\0')
2559                 return 0;
2560
2561         ret = cgroup_init_cftypes(ss, cfts);
2562         if (ret)
2563                 return ret;
2564
2565         mutex_lock(&cgroup_tree_mutex);
2566
2567         list_add_tail(&cfts->node, &ss->cfts);
2568         ret = cgroup_apply_cftypes(cfts, true);
2569         if (ret)
2570                 cgroup_rm_cftypes_locked(cfts);
2571
2572         mutex_unlock(&cgroup_tree_mutex);
2573         return ret;
2574 }
2575
2576 /**
2577  * cgroup_task_count - count the number of tasks in a cgroup.
2578  * @cgrp: the cgroup in question
2579  *
2580  * Return the number of tasks in the cgroup.
2581  */
2582 static int cgroup_task_count(const struct cgroup *cgrp)
2583 {
2584         int count = 0;
2585         struct cgrp_cset_link *link;
2586
2587         down_read(&css_set_rwsem);
2588         list_for_each_entry(link, &cgrp->cset_links, cset_link)
2589                 count += atomic_read(&link->cset->refcount);
2590         up_read(&css_set_rwsem);
2591         return count;
2592 }
2593
/**
 * css_next_child - find the next child of a given css
 * @pos_css: the current position (%NULL to initiate traversal)
 * @parent_css: css whose children to walk
 *
 * This function returns the next child of @parent_css and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent_css and @pos_css are accessible.  The next sibling is
 * guaranteed to be returned regardless of their states.
 *
 * Returns the css, for @parent_css's subsystem, of the next child cgroup,
 * or %NULL when the walk has reached the head of the parent's children
 * list.
 */
struct cgroup_subsys_state *
css_next_child(struct cgroup_subsys_state *pos_css,
               struct cgroup_subsys_state *parent_css)
{
        struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
        struct cgroup *cgrp = parent_css->cgroup;
        struct cgroup *next;

        cgroup_assert_mutexes_or_rcu_locked();

        /*
         * @pos could already have been removed.  Once a cgroup is removed,
         * its ->sibling.next is no longer updated when its next sibling
         * changes.  As CGRP_DEAD assertion is serialized and happens
         * before the cgroup is taken off the ->sibling list, if we see it
         * unasserted, it's guaranteed that the next sibling hasn't
         * finished its grace period even if it's already removed, and thus
         * safe to dereference from this RCU critical section.  If
         * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
         * to be visible as %true here.
         *
         * If @pos is dead, its next pointer can't be dereferenced;
         * however, as each cgroup is given a monotonically increasing
         * unique serial number and always appended to the sibling list,
         * the next one can be found by walking the parent's children until
         * we see a cgroup with higher serial number than @pos's.  While
         * this path can be slower, it's taken only when either the current
         * cgroup is removed or iteration and removal race.
         */
        if (!pos) {
                /* fresh walk: start from the first entry on ->children */
                next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
        } else if (likely(!cgroup_is_dead(pos))) {
                /* common case: @pos is alive, follow its sibling link */
                next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
        } else {
                /* @pos is dead: rescan for the first child with a higher serial */
                list_for_each_entry_rcu(next, &cgrp->children, sibling)
                        if (next->serial_nr > pos->serial_nr)
                                break;
        }

        /* @next resolving to the list head itself means no more children */
        if (&next->sibling == &cgrp->children)
                return NULL;

        return cgroup_css(next, parent_css->ss);
}
2648
/**
 * css_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * Helper for css_for_each_descendant_pre().  Returns the css to visit
 * after @pos in a pre-order traversal of @root's subtree; @root itself is
 * part of the walk and is visited first.  Returns %NULL when the walk is
 * complete.
 *
 * cgroup_mutex or RCU read locking is required, but the traversal as a
 * whole doesn't have to sit in one critical section.  The correct next
 * descendant is returned as long as both @pos and @root are accessible
 * and @pos is a descendant of @root.
 */
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
                        struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *found;

        cgroup_assert_mutexes_or_rcu_locked();

        /* the walk starts at @root */
        if (!pos)
                return root;

        /* descend to the first child when there is one */
        found = css_next_child(NULL, pos);
        if (found)
                return found;

        /* otherwise climb until some ancestor-or-self has a next sibling */
        for (; pos != root; pos = css_parent(pos)) {
                found = css_next_child(pos, css_parent(pos));
                if (found)
                        return found;
        }

        return NULL;
}
2690
2691 /**
2692  * css_rightmost_descendant - return the rightmost descendant of a css
2693  * @pos: css of interest
2694  *
2695  * Return the rightmost descendant of @pos.  If there's no descendant, @pos
2696  * is returned.  This can be used during pre-order traversal to skip
2697  * subtree of @pos.
2698  *
2699  * While this function requires cgroup_mutex or RCU read locking, it
2700  * doesn't require the whole traversal to be contained in a single critical
2701  * section.  This function will return the correct rightmost descendant as
2702  * long as @pos is accessible.
2703  */
2704 struct cgroup_subsys_state *
2705 css_rightmost_descendant(struct cgroup_subsys_state *pos)
2706 {
2707         struct cgroup_subsys_state *last, *tmp;
2708
2709         cgroup_assert_mutexes_or_rcu_locked();
2710
2711         do {
2712                 last = pos;
2713                 /* ->prev isn't RCU safe, walk ->next till the end */
2714                 pos = NULL;
2715                 css_for_each_child(tmp, last)
2716                         pos = tmp;
2717         } while (pos);
2718
2719         return last;
2720 }
2721
/*
 * Follow first-child links down from @pos and return the css at which no
 * further child exists; @pos itself is returned if it has no children.
 */
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
{
        struct cgroup_subsys_state *first;

        while ((first = css_next_child(NULL, pos)))
                pos = first;

        return pos;
}
2734
/**
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 *
 * Helper for css_for_each_descendant_post().  Returns the css to visit
 * after @pos in a post-order traversal of @root's subtree; @root itself
 * is part of the walk and is visited last.  Returns %NULL when the walk
 * is complete.
 *
 * cgroup_mutex or RCU read locking is required, but the traversal as a
 * whole doesn't have to sit in one critical section.  The correct next
 * descendant is returned as long as both @pos and @root are accessible
 * and @pos is a descendant of @root.
 */
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
                         struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *sibling;

        cgroup_assert_mutexes_or_rcu_locked();

        /* first step: the leftmost descendant, which may be @root itself */
        if (!pos)
                return css_leftmost_descendant(root);

        /* @root is visited last, so nothing follows it */
        if (pos == root)
                return NULL;

        /*
         * An unvisited sibling means its subtree comes next, starting at
         * its leftmost descendant; with no sibling left, the parent is due.
         */
        sibling = css_next_child(pos, css_parent(pos));
        return sibling ? css_leftmost_descendant(sibling) : css_parent(pos);
}
2774
2775 /**
2776  * css_advance_task_iter - advance a task itererator to the next css_set
2777  * @it: the iterator to advance
2778  *
2779  * Advance @it to the next css_set to walk.
2780  */
2781 static void css_advance_task_iter(struct css_task_iter *it)
2782 {
2783         struct list_head *l = it->cset_link;
2784         struct cgrp_cset_link *link;
2785         struct css_set *cset;
2786
2787         /* Advance to the next non-empty css_set */
2788         do {
2789                 l = l->next;
2790                 if (l == &it->origin_css->cgroup->cset_links) {
2791                         it->cset_link = NULL;
2792                         return;
2793                 }
2794                 link = list_entry(l, struct cgrp_cset_link, cset_link);
2795                 cset = link->cset;
2796         } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2797
2798         it->cset_link = l;
2799
2800         if (!list_empty(&cset->tasks))
2801                 it->task = cset->tasks.next;
2802         else
2803                 it->task = cset->mg_tasks.next;
2804 }
2805
2806 /**
2807  * css_task_iter_start - initiate task iteration
2808  * @css: the css to walk tasks of
2809  * @it: the task iterator to use
2810  *
2811  * Initiate iteration through the tasks of @css.  The caller can call
2812  * css_task_iter_next() to walk through the tasks until the function
2813  * returns NULL.  On completion of iteration, css_task_iter_end() must be
2814  * called.
2815  *
2816  * Note that this function acquires a lock which is released when the
2817  * iteration finishes.  The caller can't sleep while iteration is in
2818  * progress.
2819  */
2820 void css_task_iter_start(struct cgroup_subsys_state *css,
2821                          struct css_task_iter *it)
2822         __acquires(css_set_rwsem)
2823 {
2824         /* no one should try to iterate before mounting cgroups */
2825         WARN_ON_ONCE(!use_task_css_set_links);
2826
2827         down_read(&css_set_rwsem);
2828
2829         it->origin_css = css;
2830         it->cset_link = &css->cgroup->cset_links;
2831
2832         css_advance_task_iter(it);
2833 }
2834
2835 /**
2836  * css_task_iter_next - return the next task for the iterator
2837  * @it: the task iterator being iterated
2838  *
2839  * The "next" function for task iteration.  @it should have been
2840  * initialized via css_task_iter_start().  Returns NULL when the iteration
2841  * reaches the end.
2842  */
2843 struct task_struct *css_task_iter_next(struct css_task_iter *it)
2844 {
2845         struct task_struct *res;
2846         struct list_head *l = it->task;
2847         struct cgrp_cset_link *link = list_entry(it->cset_link,
2848                                         struct cgrp_cset_link, cset_link);
2849
2850         /* If the iterator cg is NULL, we have no tasks */
2851         if (!it->cset_link)
2852                 return NULL;
2853         res = list_entry(l, struct task_struct, cg_list);
2854
2855         /*
2856          * Advance iterator to find next entry.  cset->tasks is consumed
2857          * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
2858          * next cset.
2859          */
2860         l = l->next;
2861
2862         if (l == &link->cset->tasks)
2863                 l = link->cset->mg_tasks.next;
2864
2865         if (l == &link->cset->mg_tasks)
2866                 css_advance_task_iter(it);
2867         else
2868                 it->task = l;
2869
2870         return res;
2871 }
2872
2873 /**
2874  * css_task_iter_end - finish task iteration
2875  * @it: the task iterator to finish
2876  *
2877  * Finish task iteration started by css_task_iter_start().
2878  */
2879 void css_task_iter_end(struct css_task_iter *it)
2880         __releases(css_set_rwsem)
2881 {
2882         up_read(&css_set_rwsem);
2883 }
2884
2885 /**
2886  * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
2887  * @to: cgroup to which the tasks will be moved
2888  * @from: cgroup in which the tasks currently reside
2889  *
2890  * Locking rules between cgroup_post_fork() and the migration path
2891  * guarantee that, if a task is forking while being migrated, the new child
2892  * is guaranteed to be either visible in the source cgroup after the
2893  * parent's migration is complete or put into the target cgroup.  No task
2894  * can slip out of migration through forking.
2895  */
2896 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
2897 {
2898         LIST_HEAD(preloaded_csets);
2899         struct cgrp_cset_link *link;
2900         struct css_task_iter it;
2901         struct task_struct *task;
2902         int ret;
2903
2904         mutex_lock(&cgroup_mutex);
2905
2906         /* all tasks in @from are being moved, all csets are source */
2907         down_read(&css_set_rwsem);
2908         list_for_each_entry(link, &from->cset_links, cset_link)
2909                 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
2910         up_read(&css_set_rwsem);
2911
2912         ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
2913         if (ret)
2914                 goto out_err;
2915
2916         /*
2917          * Migrate tasks one-by-one until @form is empty.  This fails iff
2918          * ->can_attach() fails.
2919          */
2920         do {
2921                 css_task_iter_start(&from->dummy_css, &it);
2922                 task = css_task_iter_next(&it);
2923                 if (task)
2924                         get_task_struct(task);
2925                 css_task_iter_end(&it);
2926
2927                 if (task) {
2928                         ret = cgroup_migrate(to, task, false);
2929                         put_task_struct(task);
2930                 }
2931         } while (task && !ret);
2932 out_err:
2933         cgroup_migrate_finish(&preloaded_csets);
2934         mutex_unlock(&cgroup_mutex);
2935         return ret;
2936 }
2937
2938 /*
2939  * Stuff for reading the 'tasks'/'procs' files.
2940  *
2941  * Reading this file can return large amounts of data if a cgroup has
2942  * *lots* of attached tasks. So it may need several calls to read(),
2943  * but we cannot guarantee that the information we produce is correct
2944  * unless we produce it entirely atomically.
2945  *
2946  */
2947
/* which pidlist file are we talking about? */
enum cgroup_filetype {
        /* "procs" file: the pidlist is filled with thread-group ids (tgids) */
        CGROUP_FILE_PROCS,
        /* "tasks" file: the pidlist is filled with per-task pids */
        CGROUP_FILE_TASKS,
};
2953
/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.
 */
struct cgroup_pidlist {
        /*
         * used to find which pidlist is wanted. doesn't change as long as
         * this particular list stays in the list.
         */
        struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
        /* array of pids or tgids, depending on key.type */
        pid_t *list;
        /* how many elements the above list has */
        int length;
        /* each of these stored in a list by its cgroup */
        struct list_head links;
        /* pointer to the cgroup we belong to, for list removal purposes */
        struct cgroup *owner;
        /* for delayed destruction, see cgroup_pidlist_destroy_work_fn() */
        struct delayed_work destroy_dwork;
};
2977
2978 /*
2979  * The following two functions "fix" the issue where there are more pids
2980  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
2981  * TODO: replace with a kernel-wide solution to this problem
2982  */
2983 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
2984 static void *pidlist_allocate(int count)
2985 {
2986         if (PIDLIST_TOO_LARGE(count))
2987                 return vmalloc(count * sizeof(pid_t));
2988         else
2989                 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
2990 }
2991
/*
 * Release a pid array, using vfree() for vmalloc addresses and kfree()
 * for everything else.
 */
static void pidlist_free(void *p)
{
        if (!is_vmalloc_addr(p)) {
                kfree(p);
                return;
        }

        vfree(p);
}
2999
3000 /*
3001  * Used to destroy all pidlists lingering waiting for destroy timer.  None
3002  * should be left afterwards.
3003  */
3004 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3005 {
3006         struct cgroup_pidlist *l, *tmp_l;
3007
3008         mutex_lock(&cgrp->pidlist_mutex);
3009         list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3010                 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3011         mutex_unlock(&cgrp->pidlist_mutex);
3012
3013         flush_workqueue(cgroup_pidlist_destroy_wq);
3014         BUG_ON(!list_empty(&cgrp->pidlists));
3015 }
3016
3017 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3018 {
3019         struct delayed_work *dwork = to_delayed_work(work);
3020         struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3021                                                 destroy_dwork);
3022         struct cgroup_pidlist *tofree = NULL;
3023
3024         mutex_lock(&l->owner->pidlist_mutex);
3025
3026         /*
3027          * Destroy iff we didn't get queued again.  The state won't change
3028          * as destroy_dwork can only be queued while locked.
3029          */
3030         if (!delayed_work_pending(dwork)) {
3031                 list_del(&l->links);
3032                 pidlist_free(l->list);
3033                 put_pid_ns(l->key.ns);
3034                 tofree = l;
3035         }
3036
3037         mutex_unlock(&l->owner->pidlist_mutex);
3038         kfree(tofree);
3039 }
3040
/*
 * pidlist_uniq - compact a list in place by dropping adjacent duplicates
 *
 * Every element equal to its predecessor is removed; the survivors are
 * packed at the front of @list.  Returns the number of elements kept.
 */
static int pidlist_uniq(pid_t *list, int length)
{
        int rd, wr = 1;

        /* lists of zero or one element have nothing to strip */
        if (length == 0 || length == 1)
                return length;

        /*
         * Element 0 always stays.  @rd scans the rest of the input while
         * @wr marks the slot for the next element that gets kept.
         */
        for (rd = 1; rd < length; rd++) {
                if (list[rd] == list[rd - 1])
                        continue;
                list[wr++] = list[rd];
        }

        return wr;
}
3070
/*
 * The two pid files - tasks and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order is
 * different per namespace, each namespace needs a differently sorted list,
 * making it impossible to use, for example, a single rbtree of member
 * tasks sorted by task pointer.  As pidlists can be fairly large,
 * allocating one per open file is dangerous, so cgroup had to implement a
 * shared pool of pidlists keyed by cgroup and namespace.
 *
 * All this extra complexity was caused by the original implementation
 * committing to an entirely unnecessary property.  In the long term, we
 * want to do away with it.  Explicitly scramble sort order if
 * sane_behavior so that no such expectation exists in the new interface.
 *
 * Scrambling swaps each pair of adjacent bits (bit 0 with bit 1, bit 2
 * with bit 3, ...), a non-identity one-to-one mapping which disturbs sort
 * order sufficiently.
 */
static pid_t pid_fry(pid_t pid)
{
        unsigned even_bits = pid & 0x55555555;        /* bits 0, 2, 4, ... */
        unsigned odd_bits = pid & 0xAAAAAAAA;        /* bits 1, 3, 5, ... */

        return (odd_bits >> 1) | (even_bits << 1);
}
3095
/*
 * Return @pid scrambled through pid_fry() when @cgrp has sane_behavior
 * enabled, and unchanged otherwise.
 */
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
        return cgroup_sane_behavior(cgrp) ? pid_fry(pid) : pid;
}
3103
/*
 * sort() comparison callback ordering pid_t values ascending: the result
 * is the difference between the pid at @a and the pid at @b.
 */
static int cmppid(const void *a, const void *b)
{
        const pid_t *lhs = a;
        const pid_t *rhs = b;

        return *lhs - *rhs;
}
3108
/*
 * sort() comparison callback ordering pid_t values by their pid_fry()
 * images: the result is the difference between the two scrambled pids.
 */
static int fried_cmppid(const void *a, const void *b)
{
        const pid_t *lhs = a;
        const pid_t *rhs = b;

        return pid_fry(*lhs) - pid_fry(*rhs);
}
3113
3114 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3115                                                   enum cgroup_filetype type)
3116 {
3117         struct cgroup_pidlist *l;
3118         /* don't need task_nsproxy() if we're looking at ourself */
3119         struct pid_namespace *ns = task_active_pid_ns(current);
3120
3121         lockdep_assert_held(&cgrp->pidlist_mutex);
3122
3123         list_for_each_entry(l, &cgrp->pidlists, links)
3124                 if (l->key.type == type && l->key.ns == ns)
3125                         return l;
3126         return NULL;
3127 }
3128
3129 /*
3130  * find the appropriate pidlist for our purpose (given procs vs tasks)
3131  * returns with the lock on that pidlist already held, and takes care
3132  * of the use count, or returns NULL with no locks held if we're out of
3133  * memory.
3134  */
3135 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3136                                                 enum cgroup_filetype type)
3137 {
3138         struct cgroup_pidlist *l;
3139
3140         lockdep_assert_held(&cgrp->pidlist_mutex);
3141
3142         l = cgroup_pidlist_find(cgrp, type);
3143         if (l)
3144                 return l;
3145
3146         /* entry not found; create a new one */
3147         l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3148         if (!l)
3149                 return l;
3150
3151         INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3152         l->key.type = type;
3153         /* don't need task_nsproxy() if we're looking at ourself */
3154         l->key.ns = get_pid_ns(task_active_pid_ns(current));
3155         l->owner = cgrp;
3156         list_add(&l->links, &cgrp->pidlists);
3157         return l;
3158 }
3159
3160 /*
3161  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3162  */
3163 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3164                               struct cgroup_pidlist **lp)
3165 {
3166         pid_t *array;
3167         int length;
3168         int pid, n = 0; /* used for populating the array */
3169         struct css_task_iter it;
3170         struct task_struct *tsk;
3171         struct cgroup_pidlist *l;
3172
3173         lockdep_assert_held(&cgrp->pidlist_mutex);
3174
3175         /*
3176          * If cgroup gets more users after we read count, we won't have
3177          * enough space - tough.  This race is indistinguishable to the
3178          * caller from the case that the additional cgroup users didn't
3179          * show up until sometime later on.
3180          */
3181         length = cgroup_task_count(cgrp);
3182         array = pidlist_allocate(length);
3183         if (!array)
3184                 return -ENOMEM;
3185         /* now, populate the array */
3186         css_task_iter_start(&cgrp->dummy_css, &it);
3187         while ((tsk = css_task_iter_next(&it))) {
3188                 if (unlikely(n == length))
3189                         break;
3190                 /* get tgid or pid for procs or tasks file respectively */
3191                 if (type == CGROUP_FILE_PROCS)
3192                         pid = task_tgid_vnr(tsk);
3193                 else
3194                         pid = task_pid_vnr(tsk);
3195                 if (pid > 0) /* make sure to only use valid results */
3196                         array[n++] = pid;
3197         }
3198         css_task_iter_end(&it);
3199         length = n;
3200         /* now sort & (if procs) strip out duplicates */
3201         if (cgroup_sane_behavior(cgrp))
3202                 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3203         else
3204                 sort(array, length, sizeof(pid_t), cmppid, NULL);
3205         if (type == CGROUP_FILE_PROCS)
3206                 length = pidlist_uniq(array, length);
3207
3208         l = cgroup_pidlist_find_create(cgrp, type);
3209         if (!l) {
3210                 mutex_unlock(&cgrp->pidlist_mutex);
3211                 pidlist_free(array);
3212                 return -ENOMEM;
3213         }
3214
3215         /* store array, freeing old if necessary */
3216         pidlist_free(l->list);
3217         l->list = array;
3218         l->length = length;
3219         *lp = l;
3220         return 0;
3221 }
3222
3223 /**
3224  * cgroupstats_build - build and fill cgroupstats
3225  * @stats: cgroupstats to fill information into
3226  * @dentry: A dentry entry belonging to the cgroup for which stats have
3227  * been requested.
3228  *
3229  * Build and fill cgroupstats so that taskstats can export it to user
3230  * space.
3231  */
3232 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3233 {
3234         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3235         struct cgroup *cgrp;
3236         struct css_task_iter it;
3237         struct task_struct *tsk;
3238
3239         /* it should be kernfs_node belonging to cgroupfs and is a directory */
3240         if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3241             kernfs_type(kn) != KERNFS_DIR)
3242                 return -EINVAL;
3243
3244         mutex_lock(&cgroup_mutex);
3245
3246         /*
3247          * We aren't being called from kernfs and there's no guarantee on
3248          * @kn->priv's validity.  For this and css_tryget_from_dir(),
3249          * @kn->priv is RCU safe.  Let's do the RCU dancing.
3250          */
3251         rcu_read_lock();
3252         cgrp = rcu_dereference(kn->priv);
3253         if (!cgrp || cgroup_is_dead(cgrp)) {
3254                 rcu_read_unlock();
3255                 mutex_unlock(&cgroup_mutex);
3256                 return -ENOENT;
3257         }
3258         rcu_read_unlock();
3259
3260         css_task_iter_start(&cgrp->dummy_css, &it);
3261         while ((tsk = css_task_iter_next(&it))) {
3262                 switch (tsk->state) {
3263                 case TASK_RUNNING:
3264                         stats->nr_running++;
3265                         break;
3266                 case TASK_INTERRUPTIBLE:
3267                         stats->nr_sleeping++;
3268                         break;
3269                 case TASK_UNINTERRUPTIBLE:
3270                         stats->nr_uninterruptible++;
3271                         break;
3272                 case TASK_STOPPED:
3273                         stats->nr_stopped++;
3274                         break;
3275                 default:
3276                         if (delayacct_is_task_waiting_on_io(tsk))
3277                                 stats->nr_io_wait++;
3278                         break;
3279                 }
3280         }
3281         css_task_iter_end(&it);
3282
3283         mutex_unlock(&cgroup_mutex);
3284         return 0;
3285 }
3286
3287
3288 /*
3289  * seq_file methods for the tasks/procs files. The seq_file position is the
3290  * next pid to display; the seq_file iterator is a pointer to the pid
3291  * in the cgroup->l->list array.
3292  */
3293
3294 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3295 {
3296         /*
3297          * Initially we receive a position value that corresponds to
3298          * one more than the last pid shown (or 0 on the first call or
3299          * after a seek to the start). Use a binary-search to find the
3300          * next pid to display, if any
3301          */
3302         struct kernfs_open_file *of = s->private;
3303         struct cgroup *cgrp = seq_css(s)->cgroup;
3304         struct cgroup_pidlist *l;
3305         enum cgroup_filetype type = seq_cft(s)->private;
3306         int index = 0, pid = *pos;
3307         int *iter, ret;
3308
3309         mutex_lock(&cgrp->pidlist_mutex);
3310
3311         /*
3312          * !NULL @of->priv indicates that this isn't the first start()
3313          * after open.  If the matching pidlist is around, we can use that.
3314          * Look for it.  Note that @of->priv can't be used directly.  It
3315          * could already have been destroyed.
3316          */
3317         if (of->priv)
3318                 of->priv = cgroup_pidlist_find(cgrp, type);
3319
3320         /*
3321          * Either this is the first start() after open or the matching
3322          * pidlist has been destroyed inbetween.  Create a new one.
3323          */
3324         if (!of->priv) {
3325                 ret = pidlist_array_load(cgrp, type,
3326                                          (struct cgroup_pidlist **)&of->priv);
3327                 if (ret)
3328                         return ERR_PTR(ret);
3329         }
3330         l = of->priv;
3331
3332         if (pid) {
3333                 int end = l->length;
3334
3335                 while (index < end) {
3336                         int mid = (index + end) / 2;
3337                         if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3338                                 index = mid;
3339                                 break;
3340                         } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3341                                 index = mid + 1;
3342                         else
3343                                 end = mid;
3344                 }
3345         }
3346         /* If we're off the end of the array, we're done */
3347         if (index >= l->length)
3348                 return NULL;
3349         /* Update the abstract position to be the actual pid that we found */
3350         iter = l->list + index;
3351         *pos = cgroup_pid_fry(cgrp, *iter);
3352         return iter;
3353 }
3354
3355 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3356 {
3357         struct kernfs_open_file *of = s->private;
3358         struct cgroup_pidlist *l = of->priv;
3359
3360         if (l)
3361                 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3362                                  CGROUP_PIDLIST_DESTROY_DELAY);
3363         mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3364 }
3365
3366 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3367 {
3368         struct kernfs_open_file *of = s->private;
3369         struct cgroup_pidlist *l = of->priv;
3370         pid_t *p = v;
3371         pid_t *end = l->list + l->length;
3372         /*
3373          * Advance to the next pid in the array. If this goes off the
3374          * end, we're done
3375          */
3376         p++;
3377         if (p >= end) {
3378                 return NULL;
3379         } else {
3380                 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3381                 return p;
3382         }
3383 }
3384
/*
 * seq_file ->show() for the tasks/procs files: print the pid which the
 * iterator @v points at, one per line.
 */
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
        int *pid = v;

        return seq_printf(s, "%d\n", *pid);
}
3389
3390 /*
3391  * seq_operations functions for iterating on pidlists through seq_file -
3392  * independent of whether it's tasks or procs
3393  */
3394 static const struct seq_operations cgroup_pidlist_seq_operations = {
3395         .start = cgroup_pidlist_start,
3396         .stop = cgroup_pidlist_stop,
3397         .next = cgroup_pidlist_next,
3398         .show = cgroup_pidlist_show,
3399 };
3400
3401 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3402                                          struct cftype *cft)
3403 {
3404         return notify_on_release(css->cgroup);
3405 }
3406
3407 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3408                                           struct cftype *cft, u64 val)
3409 {
3410         clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3411         if (val)
3412                 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3413         else
3414                 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3415         return 0;
3416 }
3417
3418 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
3419                                       struct cftype *cft)
3420 {
3421         return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3422 }
3423
3424 static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
3425                                        struct cftype *cft, u64 val)
3426 {
3427         if (val)
3428                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3429         else
3430                 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3431         return 0;
3432 }
3433
3434 static struct cftype cgroup_base_files[] = {
3435         {
3436                 .name = "cgroup.procs",
3437                 .seq_start = cgroup_pidlist_start,
3438                 .seq_next = cgroup_pidlist_next,
3439                 .seq_stop = cgroup_pidlist_stop,
3440                 .seq_show = cgroup_pidlist_show,
3441                 .private = CGROUP_FILE_PROCS,
3442                 .write_u64 = cgroup_procs_write,
3443                 .mode = S_IRUGO | S_IWUSR,
3444         },
3445         {
3446                 .name = "cgroup.clone_children",
3447                 .flags = CFTYPE_INSANE,
3448                 .read_u64 = cgroup_clone_children_read,
3449                 .write_u64 = cgroup_clone_children_write,
3450         },
3451         {
3452                 .name = "cgroup.sane_behavior",
3453                 .flags = CFTYPE_ONLY_ON_ROOT,
3454                 .seq_show = cgroup_sane_behavior_show,
3455         },
3456
3457         /*
3458          * Historical crazy stuff.  These don't have "cgroup."  prefix and
3459          * don't exist if sane_behavior.  If you're depending on these, be
3460          * prepared to be burned.
3461          */
3462         {
3463                 .name = "tasks",
3464                 .flags = CFTYPE_INSANE,         /* use "procs" instead */
3465                 .seq_start = cgroup_pidlist_start,
3466                 .seq_next = cgroup_pidlist_next,
3467                 .seq_stop = cgroup_pidlist_stop,
3468                 .seq_show = cgroup_pidlist_show,
3469                 .private = CGROUP_FILE_TASKS,
3470                 .write_u64 = cgroup_tasks_write,
3471                 .mode = S_IRUGO | S_IWUSR,
3472         },
3473         {
3474                 .name = "notify_on_release",
3475                 .flags = CFTYPE_INSANE,
3476                 .read_u64 = cgroup_read_notify_on_release,
3477                 .write_u64 = cgroup_write_notify_on_release,
3478         },
3479         {
3480                 .name = "release_agent",
3481                 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3482                 .seq_show = cgroup_release_agent_show,
3483                 .write_string = cgroup_release_agent_write,
3484                 .max_write_len = PATH_MAX - 1,
3485         },
3486         { }     /* terminate */
3487 };
3488
3489 /**
3490  * cgroup_populate_dir - create subsys files in a cgroup directory
3491  * @cgrp: target cgroup
3492  * @subsys_mask: mask of the subsystem ids whose files should be added
3493  *
3494  * On failure, no file is added.
3495  */
3496 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3497 {
3498         struct cgroup_subsys *ss;
3499         int i, ret = 0;
3500
3501         /* process cftsets of each subsystem */
3502         for_each_subsys(ss, i) {
3503                 struct cftype *cfts;
3504
3505                 if (!test_bit(i, &subsys_mask))
3506                         continue;
3507
3508                 list_for_each_entry(cfts, &ss->cfts, node) {
3509                         ret = cgroup_addrm_files(cgrp, cfts, true);
3510                         if (ret < 0)
3511                                 goto err;
3512                 }
3513         }
3514         return 0;
3515 err:
3516         cgroup_clear_dir(cgrp, subsys_mask);
3517         return ret;
3518 }
3519
3520 /*
3521  * css destruction is four-stage process.
3522  *
3523  * 1. Destruction starts.  Killing of the percpu_ref is initiated.
3524  *    Implemented in kill_css().
3525  *
3526  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
3527  *    and thus css_tryget() is guaranteed to fail, the css can be offlined
3528  *    by invoking offline_css().  After offlining, the base ref is put.
3529  *    Implemented in css_killed_work_fn().
3530  *
3531  * 3. When the percpu_ref reaches zero, the only possible remaining
3532  *    accessors are inside RCU read sections.  css_release() schedules the
3533  *    RCU callback.
3534  *
3535  * 4. After the grace period, the css can be freed.  Implemented in
3536  *    css_free_work_fn().
3537  *
3538  * It is actually hairier because both step 2 and 4 require process context
3539  * and thus involve punting to css->destroy_work adding two additional
3540  * steps to the already complex sequence.
3541  */
3542 static void css_free_work_fn(struct work_struct *work)
3543 {
3544         struct cgroup_subsys_state *css =
3545                 container_of(work, struct cgroup_subsys_state, destroy_work);
3546         struct cgroup *cgrp = css->cgroup;
3547
3548         if (css->parent)
3549                 css_put(css->parent);
3550
3551         css->ss->css_free(css);
3552         cgroup_put(cgrp);
3553 }
3554
3555 static void css_free_rcu_fn(struct rcu_head *rcu_head)
3556 {
3557         struct cgroup_subsys_state *css =
3558                 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
3559
3560         INIT_WORK(&css->destroy_work, css_free_work_fn);
3561         queue_work(cgroup_destroy_wq, &css->destroy_work);
3562 }
3563
3564 static void css_release(struct percpu_ref *ref)
3565 {
3566         struct cgroup_subsys_state *css =
3567                 container_of(ref, struct cgroup_subsys_state, refcnt);
3568
3569         RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
3570         call_rcu(&css->rcu_head, css_free_rcu_fn);
3571 }
3572
3573 static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
3574                      struct cgroup *cgrp)
3575 {
3576         css->cgroup = cgrp;
3577         css->ss = ss;
3578         css->flags = 0;
3579
3580         if (cgrp->parent)
3581                 css->parent = cgroup_css(cgrp->parent, ss);
3582         else
3583                 css->flags |= CSS_ROOT;
3584
3585         BUG_ON(cgroup_css(cgrp, ss));
3586 }
3587
3588 /* invoke ->css_online() on a new CSS and mark it online if successful */
3589 static int online_css(struct cgroup_subsys_state *css)
3590 {
3591         struct cgroup_subsys *ss = css->ss;
3592         int ret = 0;
3593
3594         lockdep_assert_held(&cgroup_tree_mutex);
3595         lockdep_assert_held(&cgroup_mutex);
3596
3597         if (ss->css_online)
3598                 ret = ss->css_online(css);
3599         if (!ret) {
3600                 css->flags |= CSS_ONLINE;
3601                 css->cgroup->nr_css++;
3602                 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
3603         }
3604         return ret;
3605 }
3606
3607 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
3608 static void offline_css(struct cgroup_subsys_state *css)
3609 {
3610         struct cgroup_subsys *ss = css->ss;
3611
3612         lockdep_assert_held(&cgroup_tree_mutex);
3613         lockdep_assert_held(&cgroup_mutex);
3614
3615         if (!(css->flags & CSS_ONLINE))
3616                 return;
3617
3618         if (ss->css_offline)
3619                 ss->css_offline(css);
3620
3621         css->flags &= ~CSS_ONLINE;
3622         css->cgroup->nr_css--;
3623         RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
3624 }
3625
3626 /**
3627  * create_css - create a cgroup_subsys_state
3628  * @cgrp: the cgroup new css will be associated with
3629  * @ss: the subsys of new css
3630  *
3631  * Create a new css associated with @cgrp - @ss pair.  On success, the new
3632  * css is online and installed in @cgrp with all interface files created.
3633  * Returns 0 on success, -errno on failure.
3634  */
3635 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
3636 {
3637         struct cgroup *parent = cgrp->parent;
3638         struct cgroup_subsys_state *css;
3639         int err;
3640
3641         lockdep_assert_held(&cgroup_mutex);
3642
3643         css = ss->css_alloc(cgroup_css(parent, ss));
3644         if (IS_ERR(css))
3645                 return PTR_ERR(css);
3646
3647         err = percpu_ref_init(&css->refcnt, css_release);
3648         if (err)
3649                 goto err_free_css;
3650
3651         init_css(css, ss, cgrp);
3652
3653         err = cgroup_populate_dir(cgrp, 1 << ss->id);
3654         if (err)
3655                 goto err_free_percpu_ref;
3656
3657         err = online_css(css);
3658         if (err)
3659                 goto err_clear_dir;
3660
3661         cgroup_get(cgrp);
3662         css_get(css->parent);
3663
3664         if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
3665             parent->parent) {
3666                 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
3667                            current->comm, current->pid, ss->name);
3668                 if (!strcmp(ss->name, "memory"))
3669                         pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
3670                 ss->warned_broken_hierarchy = true;
3671         }
3672
3673         return 0;
3674
3675 err_clear_dir:
3676         cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
3677 err_free_percpu_ref:
3678         percpu_ref_cancel_init(&css->refcnt);
3679 err_free_css:
3680         ss->css_free(css);
3681         return err;
3682 }
3683
3684 /**
3685  * cgroup_create - create a cgroup
3686  * @parent: cgroup that will be parent of the new cgroup
3687  * @name: name of the new cgroup
3688  * @mode: mode to set on new cgroup
3689  */
3690 static long cgroup_create(struct cgroup *parent, const char *name,
3691                           umode_t mode)
3692 {
3693         struct cgroup *cgrp;
3694         struct cgroup_root *root = parent->root;
3695         int ssid, err;
3696         struct cgroup_subsys *ss;
3697         struct kernfs_node *kn;
3698
3699         /*
3700          * XXX: The default hierarchy isn't fully implemented yet.  Block
3701          * !root cgroup creation on it for now.
3702          */
3703         if (root == &cgrp_dfl_root)
3704                 return -EINVAL;
3705
3706         /* allocate the cgroup and its ID, 0 is reserved for the root */
3707         cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3708         if (!cgrp)
3709                 return -ENOMEM;
3710
3711         mutex_lock(&cgroup_tree_mutex);
3712
3713         /*
3714          * Only live parents can have children.  Note that the liveliness
3715          * check isn't strictly necessary because cgroup_mkdir() and
3716          * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
3717          * anyway so that locking is contained inside cgroup proper and we
3718          * don't get nasty surprises if we ever grow another caller.
3719          */
3720         if (!cgroup_lock_live_group(parent)) {
3721                 err = -ENODEV;
3722                 goto err_unlock_tree;
3723         }
3724
3725         /*
3726          * Temporarily set the pointer to NULL, so idr_find() won't return
3727          * a half-baked cgroup.
3728          */
3729         cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
3730         if (cgrp->id < 0) {
3731                 err = -ENOMEM;
3732                 goto err_unlock;
3733         }
3734
3735         init_cgroup_housekeeping(cgrp);
3736
3737         cgrp->parent = parent;
3738         cgrp->dummy_css.parent = &parent->dummy_css;
3739         cgrp->root = parent->root;
3740
3741         if (notify_on_release(parent))
3742                 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3743
3744         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
3745                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3746
3747         /* create the directory */
3748         kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3749         if (IS_ERR(kn)) {
3750                 err = PTR_ERR(kn);
3751                 goto err_free_id;
3752         }
3753         cgrp->kn = kn;
3754
3755         /*
3756          * This extra ref will be put in cgroup_free_fn() and guarantees
3757          * that @cgrp->kn is always accessible.
3758          */
3759         kernfs_get(kn);
3760
3761         cgrp->serial_nr = cgroup_serial_nr_next++;
3762
3763         /* allocation complete, commit to creation */
3764         list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
3765         atomic_inc(&root->nr_cgrps);
3766         cgroup_get(parent);
3767
3768         /*
3769          * @cgrp is now fully operational.  If something fails after this
3770          * point, it'll be released via the normal destruction path.
3771          */
3772         idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
3773
3774         err = cgroup_kn_set_ugid(kn);
3775         if (err)
3776                 goto err_destroy;
3777
3778         err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
3779         if (err)
3780                 goto err_destroy;
3781
3782         /* let's create and online css's */
3783         for_each_subsys(ss, ssid) {
3784                 if (parent->child_subsys_mask & (1 << ssid)) {
3785                         err = create_css(cgrp, ss);
3786                         if (err)
3787                                 goto err_destroy;
3788                 }
3789         }
3790
3791         cgrp->child_subsys_mask = parent->child_subsys_mask;
3792
3793         kernfs_activate(kn);
3794
3795         mutex_unlock(&cgroup_mutex);
3796         mutex_unlock(&cgroup_tree_mutex);
3797
3798         return 0;
3799
3800 err_free_id:
3801         idr_remove(&root->cgroup_idr, cgrp->id);
3802 err_unlock:
3803         mutex_unlock(&cgroup_mutex);
3804 err_unlock_tree:
3805         mutex_unlock(&cgroup_tree_mutex);
3806         kfree(cgrp);
3807         return err;
3808
3809 err_destroy:
3810         cgroup_destroy_locked(cgrp);
3811         mutex_unlock(&cgroup_mutex);
3812         mutex_unlock(&cgroup_tree_mutex);
3813         return err;
3814 }
3815
3816 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3817                         umode_t mode)
3818 {
3819         struct cgroup *parent = parent_kn->priv;
3820         int ret;
3821
3822         /*
3823          * cgroup_create() grabs cgroup_tree_mutex which nests outside
3824          * kernfs active_ref and cgroup_create() already synchronizes
3825          * properly against removal through cgroup_lock_live_group().
3826          * Break it before calling cgroup_create().
3827          */
3828         cgroup_get(parent);
3829         kernfs_break_active_protection(parent_kn);
3830
3831         ret = cgroup_create(parent, name, mode);
3832
3833         kernfs_unbreak_active_protection(parent_kn);
3834         cgroup_put(parent);
3835         return ret;
3836 }
3837
3838 /*
3839  * This is called when the refcnt of a css is confirmed to be killed.
3840  * css_tryget() is now guaranteed to fail.
3841  */
3842 static void css_killed_work_fn(struct work_struct *work)
3843 {
3844         struct cgroup_subsys_state *css =
3845                 container_of(work, struct cgroup_subsys_state, destroy_work);
3846         struct cgroup *cgrp = css->cgroup;
3847
3848         mutex_lock(&cgroup_tree_mutex);
3849         mutex_lock(&cgroup_mutex);
3850
3851         /*
3852          * css_tryget() is guaranteed to fail now.  Tell subsystems to
3853          * initate destruction.
3854          */
3855         offline_css(css);
3856
3857         /*
3858          * If @cgrp is marked dead, it's waiting for refs of all css's to
3859          * be disabled before proceeding to the second phase of cgroup
3860          * destruction.  If we are the last one, kick it off.
3861          */
3862         if (!cgrp->nr_css && cgroup_is_dead(cgrp))
3863                 cgroup_destroy_css_killed(cgrp);
3864
3865         mutex_unlock(&cgroup_mutex);
3866         mutex_unlock(&cgroup_tree_mutex);
3867
3868         /*
3869          * Put the css refs from kill_css().  Each css holds an extra
3870          * reference to the cgroup's dentry and cgroup removal proceeds
3871          * regardless of css refs.  On the last put of each css, whenever
3872          * that may be, the extra dentry ref is put so that dentry
3873          * destruction happens only after all css's are released.
3874          */
3875         css_put(css);
3876 }
3877
3878 /* css kill confirmation processing requires process context, bounce */
3879 static void css_killed_ref_fn(struct percpu_ref *ref)
3880 {
3881         struct cgroup_subsys_state *css =
3882                 container_of(ref, struct cgroup_subsys_state, refcnt);
3883
3884         INIT_WORK(&css->destroy_work, css_killed_work_fn);
3885         queue_work(cgroup_destroy_wq, &css->destroy_work);
3886 }
3887
3888 /**
3889  * kill_css - destroy a css
3890  * @css: css to destroy
3891  *
3892  * This function initiates destruction of @css by removing cgroup interface
3893  * files and putting its base reference.  ->css_offline() will be invoked
3894  * asynchronously once css_tryget() is guaranteed to fail and when the
3895  * reference count reaches zero, @css will be released.
3896  */
3897 static void kill_css(struct cgroup_subsys_state *css)
3898 {
3899         lockdep_assert_held(&cgroup_tree_mutex);
3900
3901         /*
3902          * This must happen before css is disassociated with its cgroup.
3903          * See seq_css() for details.
3904          */
3905         cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
3906
3907         /*
3908          * Killing would put the base ref, but we need to keep it alive
3909          * until after ->css_offline().
3910          */
3911         css_get(css);
3912
3913         /*
3914          * cgroup core guarantees that, by the time ->css_offline() is
3915          * invoked, no new css reference will be given out via
3916          * css_tryget().  We can't simply call percpu_ref_kill() and
3917          * proceed to offlining css's because percpu_ref_kill() doesn't
3918          * guarantee that the ref is seen as killed on all CPUs on return.
3919          *
3920          * Use percpu_ref_kill_and_confirm() to get notifications as each
3921          * css is confirmed to be seen as killed on all CPUs.
3922          */
3923         percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
3924 }
3925
3926 /**
3927  * cgroup_destroy_locked - the first stage of cgroup destruction
3928  * @cgrp: cgroup to be destroyed
3929  *
3930  * css's make use of percpu refcnts whose killing latency shouldn't be
3931  * exposed to userland and are RCU protected.  Also, cgroup core needs to
3932  * guarantee that css_tryget() won't succeed by the time ->css_offline() is
3933  * invoked.  To satisfy all the requirements, destruction is implemented in
3934  * the following two steps.
3935  *
3936  * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
3937  *     userland visible parts and start killing the percpu refcnts of
3938  *     css's.  Set up so that the next stage will be kicked off once all
3939  *     the percpu refcnts are confirmed to be killed.
3940  *
3941  * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
3942  *     rest of destruction.  Once all cgroup references are gone, the
3943  *     cgroup is RCU-freed.
3944  *
3945  * This function implements s1.  After this step, @cgrp is gone as far as
3946  * the userland is concerned and a new cgroup with the same name may be
3947  * created.  As cgroup doesn't care about the names internally, this
3948  * doesn't cause any problem.
3949  */
3950 static int cgroup_destroy_locked(struct cgroup *cgrp)
3951         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
3952 {
3953         struct cgroup *child;
3954         struct cgroup_subsys_state *css;
3955         bool empty;
3956         int ssid;
3957
3958         lockdep_assert_held(&cgroup_tree_mutex);
3959         lockdep_assert_held(&cgroup_mutex);
3960
3961         /*
3962          * css_set_rwsem synchronizes access to ->cset_links and prevents
3963          * @cgrp from being removed while put_css_set() is in progress.
3964          */
3965         down_read(&css_set_rwsem);
3966         empty = list_empty(&cgrp->cset_links);
3967         up_read(&css_set_rwsem);
3968         if (!empty)
3969                 return -EBUSY;
3970
3971         /*
3972          * Make sure there's no live children.  We can't test ->children
3973          * emptiness as dead children linger on it while being destroyed;
3974          * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
3975          */
3976         empty = true;
3977         rcu_read_lock();
3978         list_for_each_entry_rcu(child, &cgrp->children, sibling) {
3979                 empty = cgroup_is_dead(child);
3980                 if (!empty)
3981                         break;
3982         }
3983         rcu_read_unlock();
3984         if (!empty)
3985                 return -EBUSY;
3986
3987         /*
3988          * Mark @cgrp dead.  This prevents further task migration and child
3989          * creation by disabling cgroup_lock_live_group().  Note that
3990          * CGRP_DEAD assertion is depended upon by css_next_child() to
3991          * resume iteration after dropping RCU read lock.  See
3992          * css_next_child() for details.
3993          */
3994         set_bit(CGRP_DEAD, &cgrp->flags);
3995
3996         /*
3997          * Initiate massacre of all css's.  cgroup_destroy_css_killed()
3998          * will be invoked to perform the rest of destruction once the
3999          * percpu refs of all css's are confirmed to be killed.  This
4000          * involves removing the subsystem's files, drop cgroup_mutex.
4001          */
4002         mutex_unlock(&cgroup_mutex);
4003         for_each_css(css, ssid, cgrp)
4004                 kill_css(css);
4005         mutex_lock(&cgroup_mutex);
4006
4007         /* CGRP_DEAD is set, remove from ->release_list for the last time */
4008         raw_spin_lock(&release_list_lock);
4009         if (!list_empty(&cgrp->release_list))
4010                 list_del_init(&cgrp->release_list);
4011         raw_spin_unlock(&release_list_lock);
4012
4013         /*
4014          * If @cgrp has css's attached, the second stage of cgroup
4015          * destruction is kicked off from css_killed_work_fn() after the
4016          * refs of all attached css's are killed.  If @cgrp doesn't have
4017          * any css, we kick it off here.
4018          */
4019         if (!cgrp->nr_css)
4020                 cgroup_destroy_css_killed(cgrp);
4021
4022         /* remove @cgrp directory along with the base files */
4023         mutex_unlock(&cgroup_mutex);
4024
4025         /*
4026          * There are two control paths which try to determine cgroup from
4027          * dentry without going through kernfs - cgroupstats_build() and
4028          * css_tryget_from_dir().  Those are supported by RCU protecting
4029          * clearing of cgrp->kn->priv backpointer, which should happen
4030          * after all files under it have been removed.
4031          */
4032         kernfs_remove(cgrp->kn);        /* @cgrp has an extra ref on its kn */
4033         RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4034
4035         mutex_lock(&cgroup_mutex);
4036
4037         return 0;
4038 };
4039
4040 /**
4041  * cgroup_destroy_css_killed - the second step of cgroup destruction
4042  * @work: cgroup->destroy_free_work
4043  *
4044  * This function is invoked from a work item for a cgroup which is being
4045  * destroyed after all css's are offlined and performs the rest of
4046  * destruction.  This is the second step of destruction described in the
4047  * comment above cgroup_destroy_locked().
4048  */
4049 static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4050 {
4051         struct cgroup *parent = cgrp->parent;
4052
4053         lockdep_assert_held(&cgroup_tree_mutex);
4054         lockdep_assert_held(&cgroup_mutex);
4055
4056         /* delete this cgroup from parent->children */
4057         list_del_rcu(&cgrp->sibling);
4058
4059         cgroup_put(cgrp);
4060
4061         set_bit(CGRP_RELEASABLE, &parent->flags);
4062         check_for_release(parent);
4063 }
4064
4065 static int cgroup_rmdir(struct kernfs_node *kn)
4066 {
4067         struct cgroup *cgrp = kn->priv;
4068         int ret = 0;
4069
4070         /*
4071          * This is self-destruction but @kn can't be removed while this
4072          * callback is in progress.  Let's break active protection.  Once
4073          * the protection is broken, @cgrp can be destroyed at any point.
4074          * Pin it so that it stays accessible.
4075          */
4076         cgroup_get(cgrp);
4077         kernfs_break_active_protection(kn);
4078
4079         mutex_lock(&cgroup_tree_mutex);
4080         mutex_lock(&cgroup_mutex);
4081
4082         /*
4083          * @cgrp might already have been destroyed while we're trying to
4084          * grab the mutexes.
4085          */
4086         if (!cgroup_is_dead(cgrp))
4087                 ret = cgroup_destroy_locked(cgrp);
4088
4089         mutex_unlock(&cgroup_mutex);
4090         mutex_unlock(&cgroup_tree_mutex);
4091
4092         kernfs_unbreak_active_protection(kn);
4093         cgroup_put(cgrp);
4094         return ret;
4095 }
4096
4097 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4098         .remount_fs             = cgroup_remount,
4099         .show_options           = cgroup_show_options,
4100         .mkdir                  = cgroup_mkdir,
4101         .rmdir                  = cgroup_rmdir,
4102         .rename                 = cgroup_rename,
4103 };
4104
4105 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4106 {
4107         struct cgroup_subsys_state *css;
4108
4109         printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4110
4111         mutex_lock(&cgroup_tree_mutex);
4112         mutex_lock(&cgroup_mutex);
4113
4114         INIT_LIST_HEAD(&ss->cfts);
4115
4116         /* Create the root cgroup state for this subsystem */
4117         ss->root = &cgrp_dfl_root;
4118         css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4119         /* We don't handle early failures gracefully */
4120         BUG_ON(IS_ERR(css));
4121         init_css(css, ss, &cgrp_dfl_root.cgrp);
4122
4123         /* Update the init_css_set to contain a subsys
4124          * pointer to this state - since the subsystem is
4125          * newly registered, all tasks and hence the
4126          * init_css_set is in the subsystem's root cgroup. */
4127         init_css_set.subsys[ss->id] = css;
4128
4129         need_forkexit_callback |= ss->fork || ss->exit;
4130
4131         /* At system boot, before all subsystems have been
4132          * registered, no tasks have been forked, so we don't
4133          * need to invoke fork callbacks here. */
4134         BUG_ON(!list_empty(&init_task.tasks));
4135
4136         BUG_ON(online_css(css));
4137
4138         cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4139
4140         mutex_unlock(&cgroup_mutex);
4141         mutex_unlock(&cgroup_tree_mutex);
4142 }
4143
4144 /**
4145  * cgroup_init_early - cgroup initialization at system boot
4146  *
4147  * Initialize cgroups at system boot, and initialize any
4148  * subsystems that request early init.
4149  */
4150 int __init cgroup_init_early(void)
4151 {
4152         static struct cgroup_sb_opts __initdata opts =
4153                 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4154         struct cgroup_subsys *ss;
4155         int i;
4156
4157         init_cgroup_root(&cgrp_dfl_root, &opts);
4158         RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4159
4160         for_each_subsys(ss, i) {
4161                 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4162                      "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4163                      i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4164                      ss->id, ss->name);
4165                 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4166                      "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4167
4168                 ss->id = i;
4169                 ss->name = cgroup_subsys_name[i];
4170
4171                 if (ss->early_init)
4172                         cgroup_init_subsys(ss);
4173         }
4174         return 0;
4175 }
4176
4177 /**
4178  * cgroup_init - cgroup initialization
4179  *
4180  * Register cgroup filesystem and /proc file, and initialize
4181  * any subsystems that didn't request early init.
4182  */
4183 int __init cgroup_init(void)
4184 {
4185         struct cgroup_subsys *ss;
4186         unsigned long key;
4187         int ssid, err;
4188
4189         BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4190
4191         mutex_lock(&cgroup_tree_mutex);
4192         mutex_lock(&cgroup_mutex);
4193
4194         /* Add init_css_set to the hash table */
4195         key = css_set_hash(init_css_set.subsys);
4196         hash_add(css_set_table, &init_css_set.hlist, key);
4197
4198         BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4199
4200         mutex_unlock(&cgroup_mutex);
4201         mutex_unlock(&cgroup_tree_mutex);
4202
4203         for_each_subsys(ss, ssid) {
4204                 if (!ss->early_init)
4205                         cgroup_init_subsys(ss);
4206
4207                 /*
4208                  * cftype registration needs kmalloc and can't be done
4209                  * during early_init.  Register base cftypes separately.
4210                  */
4211                 if (ss->base_cftypes)
4212                         WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4213         }
4214
4215         cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4216         if (!cgroup_kobj)
4217                 return -ENOMEM;
4218
4219         err = register_filesystem(&cgroup_fs_type);
4220         if (err < 0) {
4221                 kobject_put(cgroup_kobj);
4222                 return err;
4223         }
4224
4225         proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4226         return 0;
4227 }
4228
4229 static int __init cgroup_wq_init(void)
4230 {
4231         /*
4232          * There isn't much point in executing destruction path in
4233          * parallel.  Good chunk is serialized with cgroup_mutex anyway.
4234          * Use 1 for @max_active.
4235          *
4236          * We would prefer to do this in cgroup_init() above, but that
4237          * is called before init_workqueues(): so leave this until after.
4238          */
4239         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
4240         BUG_ON(!cgroup_destroy_wq);
4241
4242         /*
4243          * Used to destroy pidlists and separate to serve as flush domain.
4244          * Cap @max_active to 1 too.
4245          */
4246         cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4247                                                     0, 1);
4248         BUG_ON(!cgroup_pidlist_destroy_wq);
4249
4250         return 0;
4251 }
4252 core_initcall(cgroup_wq_init);
4253
4254 /*
4255  * proc_cgroup_show()
4256  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
4257  *  - Used for /proc/<pid>/cgroup.
4258  */
4259
4260 /* TODO: Use a proper seq_file iterator */
4261 int proc_cgroup_show(struct seq_file *m, void *v)
4262 {
4263         struct pid *pid;
4264         struct task_struct *tsk;
4265         char *buf, *path;
4266         int retval;
4267         struct cgroup_root *root;
4268
4269         retval = -ENOMEM;
4270         buf = kmalloc(PATH_MAX, GFP_KERNEL);
4271         if (!buf)
4272                 goto out;
4273
4274         retval = -ESRCH;
4275         pid = m->private;
4276         tsk = get_pid_task(pid, PIDTYPE_PID);
4277         if (!tsk)
4278                 goto out_free;
4279
4280         retval = 0;
4281
4282         mutex_lock(&cgroup_mutex);
4283         down_read(&css_set_rwsem);
4284
4285         for_each_root(root) {
4286                 struct cgroup_subsys *ss;
4287                 struct cgroup *cgrp;
4288                 int ssid, count = 0;
4289
4290                 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
4291                         continue;
4292
4293                 seq_printf(m, "%d:", root->hierarchy_id);
4294                 for_each_subsys(ss, ssid)
4295                         if (root->subsys_mask & (1 << ssid))
4296                                 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4297                 if (strlen(root->name))
4298                         seq_printf(m, "%sname=%s", count ? "," : "",
4299                                    root->name);
4300                 seq_putc(m, ':');
4301                 cgrp = task_cgroup_from_root(tsk, root);
4302                 path = cgroup_path(cgrp, buf, PATH_MAX);
4303                 if (!path) {
4304                         retval = -ENAMETOOLONG;
4305                         goto out_unlock;
4306                 }
4307                 seq_puts(m, path);
4308                 seq_putc(m, '\n');
4309         }
4310
4311 out_unlock:
4312         up_read(&css_set_rwsem);
4313         mutex_unlock(&cgroup_mutex);
4314         put_task_struct(tsk);
4315 out_free:
4316         kfree(buf);
4317 out:
4318         return retval;
4319 }
4320
4321 /* Display information about each subsystem and each hierarchy */
4322 static int proc_cgroupstats_show(struct seq_file *m, void *v)
4323 {
4324         struct cgroup_subsys *ss;
4325         int i;
4326
4327         seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
4328         /*
4329          * ideally we don't want subsystems moving around while we do this.
4330          * cgroup_mutex is also necessary to guarantee an atomic snapshot of
4331          * subsys/hierarchy state.
4332          */
4333         mutex_lock(&cgroup_mutex);
4334
4335         for_each_subsys(ss, i)
4336                 seq_printf(m, "%s\t%d\t%d\t%d\n",
4337                            ss->name, ss->root->hierarchy_id,
4338                            atomic_read(&ss->root->nr_cgrps), !ss->disabled);
4339
4340         mutex_unlock(&cgroup_mutex);
4341         return 0;
4342 }
4343
4344 static int cgroupstats_open(struct inode *inode, struct file *file)
4345 {
4346         return single_open(file, proc_cgroupstats_show, NULL);
4347 }
4348
4349 static const struct file_operations proc_cgroupstats_operations = {
4350         .open = cgroupstats_open,
4351         .read = seq_read,
4352         .llseek = seq_lseek,
4353         .release = single_release,
4354 };
4355
4356 /**
4357  * cgroup_fork - initialize cgroup related fields during copy_process()
4358  * @child: pointer to task_struct of forking parent process.
4359  *
4360  * A task is associated with the init_css_set until cgroup_post_fork()
4361  * attaches it to the parent's css_set.  Empty cg_list indicates that
4362  * @child isn't holding reference to its css_set.
4363  */
4364 void cgroup_fork(struct task_struct *child)
4365 {
4366         RCU_INIT_POINTER(child->cgroups, &init_css_set);
4367         INIT_LIST_HEAD(&child->cg_list);
4368 }
4369
4370 /**
4371  * cgroup_post_fork - called on a new task after adding it to the task list
4372  * @child: the task in question
4373  *
4374  * Adds the task to the list running through its css_set if necessary and
4375  * call the subsystem fork() callbacks.  Has to be after the task is
4376  * visible on the task list in case we race with the first call to
4377  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
4378  * list.
4379  */
4380 void cgroup_post_fork(struct task_struct *child)
4381 {
4382         struct cgroup_subsys *ss;
4383         int i;
4384
4385         /*
4386          * This may race against cgroup_enable_task_cg_links().  As that
4387          * function sets use_task_css_set_links before grabbing
4388          * tasklist_lock and we just went through tasklist_lock to add
4389          * @child, it's guaranteed that either we see the set
4390          * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
4391          * @child during its iteration.
4392          *
4393          * If we won the race, @child is associated with %current's
4394          * css_set.  Grabbing css_set_rwsem guarantees both that the
4395          * association is stable, and, on completion of the parent's
4396          * migration, @child is visible in the source of migration or
4397          * already in the destination cgroup.  This guarantee is necessary
4398          * when implementing operations which need to migrate all tasks of
4399          * a cgroup to another.
4400          *
4401          * Note that if we lose to cgroup_enable_task_cg_links(), @child
4402          * will remain in init_css_set.  This is safe because all tasks are
4403          * in the init_css_set before cg_links is enabled and there's no
4404          * operation which transfers all tasks out of init_css_set.
4405          */
4406         if (use_task_css_set_links) {
4407                 struct css_set *cset;
4408
4409                 down_write(&css_set_rwsem);
4410                 cset = task_css_set(current);
4411                 if (list_empty(&child->cg_list)) {
4412                         rcu_assign_pointer(child->cgroups, cset);
4413                         list_add(&child->cg_list, &cset->tasks);
4414                         get_css_set(cset);
4415                 }
4416                 up_write(&css_set_rwsem);
4417         }
4418
4419         /*
4420          * Call ss->fork().  This must happen after @child is linked on
4421          * css_set; otherwise, @child might change state between ->fork()
4422          * and addition to css_set.
4423          */
4424         if (need_forkexit_callback) {
4425                 for_each_subsys(ss, i)
4426                         if (ss->fork)
4427                                 ss->fork(child);
4428         }
4429 }
4430
4431 /**
4432  * cgroup_exit - detach cgroup from exiting task
4433  * @tsk: pointer to task_struct of exiting process
4434  *
4435  * Description: Detach cgroup from @tsk and release it.
4436  *
4437  * Note that cgroups marked notify_on_release force every task in
4438  * them to take the global cgroup_mutex mutex when exiting.
4439  * This could impact scaling on very large systems.  Be reluctant to
4440  * use notify_on_release cgroups where very high task exit scaling
4441  * is required on large systems.
4442  *
4443  * We set the exiting tasks cgroup to the root cgroup (top_cgroup).  We
4444  * call cgroup_exit() while the task is still competent to handle
4445  * notify_on_release(), then leave the task attached to the root cgroup in
4446  * each hierarchy for the remainder of its exit.  No need to bother with
4447  * init_css_set refcnting.  init_css_set never goes away and we can't race
4448  * with migration path - PF_EXITING is visible to migration path.
4449  */
4450 void cgroup_exit(struct task_struct *tsk)
4451 {
4452         struct cgroup_subsys *ss;
4453         struct css_set *cset;
4454         bool put_cset = false;
4455         int i;
4456
4457         /*
4458          * Unlink from @tsk from its css_set.  As migration path can't race
4459          * with us, we can check cg_list without grabbing css_set_rwsem.
4460          */
4461         if (!list_empty(&tsk->cg_list)) {
4462                 down_write(&css_set_rwsem);
4463                 list_del_init(&tsk->cg_list);
4464                 up_write(&css_set_rwsem);
4465                 put_cset = true;
4466         }
4467
4468         /* Reassign the task to the init_css_set. */
4469         cset = task_css_set(tsk);
4470         RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
4471
4472         if (need_forkexit_callback) {
4473                 /* see cgroup_post_fork() for details */
4474                 for_each_subsys(ss, i) {
4475                         if (ss->exit) {
4476                                 struct cgroup_subsys_state *old_css = cset->subsys[i];
4477                                 struct cgroup_subsys_state *css = task_css(tsk, i);
4478
4479                                 ss->exit(css, old_css, tsk);
4480                         }
4481                 }
4482         }
4483
4484         if (put_cset)
4485                 put_css_set(cset, true);
4486 }
4487
4488 static void check_for_release(struct cgroup *cgrp)
4489 {
4490         if (cgroup_is_releasable(cgrp) &&
4491             list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
4492                 /*
4493                  * Control Group is currently removeable. If it's not
4494                  * already queued for a userspace notification, queue
4495                  * it now
4496                  */
4497                 int need_schedule_work = 0;
4498
4499                 raw_spin_lock(&release_list_lock);
4500                 if (!cgroup_is_dead(cgrp) &&
4501                     list_empty(&cgrp->release_list)) {
4502                         list_add(&cgrp->release_list, &release_list);
4503                         need_schedule_work = 1;
4504                 }
4505                 raw_spin_unlock(&release_list_lock);
4506                 if (need_schedule_work)
4507                         schedule_work(&release_agent_work);
4508         }
4509 }
4510
4511 /*
4512  * Notify userspace when a cgroup is released, by running the
4513  * configured release agent with the name of the cgroup (path
4514  * relative to the root of cgroup file system) as the argument.
4515  *
4516  * Most likely, this user command will try to rmdir this cgroup.
4517  *
4518  * This races with the possibility that some other task will be
4519  * attached to this cgroup before it is removed, or that some other
4520  * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
4521  * The presumed 'rmdir' will fail quietly if this cgroup is no longer
4522  * unused, and this cgroup will be reprieved from its death sentence,
4523  * to continue to serve a useful existence.  Next time it's released,
4524  * we will get notified again, if it still has 'notify_on_release' set.
4525  *
4526  * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
4527  * means only wait until the task is successfully execve()'d.  The
4528  * separate release agent task is forked by call_usermodehelper(),
4529  * then control in this thread returns here, without waiting for the
4530  * release agent task.  We don't bother to wait because the caller of
4531  * this routine has no use for the exit status of the release agent
4532  * task, so no sense holding our caller up for that.
4533  */
4534 static void cgroup_release_agent(struct work_struct *work)
4535 {
4536         BUG_ON(work != &release_agent_work);
4537         mutex_lock(&cgroup_mutex);
4538         raw_spin_lock(&release_list_lock);
4539         while (!list_empty(&release_list)) {
4540                 char *argv[3], *envp[3];
4541                 int i;
4542                 char *pathbuf = NULL, *agentbuf = NULL, *path;
4543                 struct cgroup *cgrp = list_entry(release_list.next,
4544                                                     struct cgroup,
4545                                                     release_list);
4546                 list_del_init(&cgrp->release_list);
4547                 raw_spin_unlock(&release_list_lock);
4548                 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
4549                 if (!pathbuf)
4550                         goto continue_free;
4551                 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
4552                 if (!path)
4553                         goto continue_free;
4554                 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
4555                 if (!agentbuf)
4556                         goto continue_free;
4557
4558                 i = 0;
4559                 argv[i++] = agentbuf;
4560                 argv[i++] = path;
4561                 argv[i] = NULL;
4562
4563                 i = 0;
4564                 /* minimal command environment */
4565                 envp[i++] = "HOME=/";
4566                 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
4567                 envp[i] = NULL;
4568
4569                 /* Drop the lock while we invoke the usermode helper,
4570                  * since the exec could involve hitting disk and hence
4571                  * be a slow process */
4572                 mutex_unlock(&cgroup_mutex);
4573                 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
4574                 mutex_lock(&cgroup_mutex);
4575  continue_free:
4576                 kfree(pathbuf);
4577                 kfree(agentbuf);
4578                 raw_spin_lock(&release_list_lock);
4579         }
4580         raw_spin_unlock(&release_list_lock);
4581         mutex_unlock(&cgroup_mutex);
4582 }
4583
4584 static int __init cgroup_disable(char *str)
4585 {
4586         struct cgroup_subsys *ss;
4587         char *token;
4588         int i;
4589
4590         while ((token = strsep(&str, ",")) != NULL) {
4591                 if (!*token)
4592                         continue;
4593
4594                 for_each_subsys(ss, i) {
4595                         if (!strcmp(token, ss->name)) {
4596                                 ss->disabled = 1;
4597                                 printk(KERN_INFO "Disabling %s control group"
4598                                         " subsystem\n", ss->name);
4599                                 break;
4600                         }
4601                 }
4602         }
4603         return 1;
4604 }
4605 __setup("cgroup_disable=", cgroup_disable);
4606
4607 /**
4608  * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
4609  * @dentry: directory dentry of interest
4610  * @ss: subsystem of interest
4611  *
4612  * If @dentry is a directory for a cgroup which has @ss enabled on it, try
4613  * to get the corresponding css and return it.  If such css doesn't exist
4614  * or can't be pinned, an ERR_PTR value is returned.
4615  */
4616 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4617                                                 struct cgroup_subsys *ss)
4618 {
4619         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4620         struct cgroup_subsys_state *css = NULL;
4621         struct cgroup *cgrp;
4622
4623         /* is @dentry a cgroup dir? */
4624         if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
4625             kernfs_type(kn) != KERNFS_DIR)
4626                 return ERR_PTR(-EBADF);
4627
4628         rcu_read_lock();
4629
4630         /*
4631          * This path doesn't originate from kernfs and @kn could already
4632          * have been or be removed at any point.  @kn->priv is RCU
4633          * protected for this access.  See destroy_locked() for details.
4634          */
4635         cgrp = rcu_dereference(kn->priv);
4636         if (cgrp)
4637                 css = cgroup_css(cgrp, ss);
4638
4639         if (!css || !css_tryget(css))
4640                 css = ERR_PTR(-ENOENT);
4641
4642         rcu_read_unlock();
4643         return css;
4644 }
4645
4646 /**
4647  * css_from_id - lookup css by id
4648  * @id: the cgroup id
4649  * @ss: cgroup subsys to be looked into
4650  *
4651  * Returns the css if there's valid one with @id, otherwise returns NULL.
4652  * Should be called under rcu_read_lock().
4653  */
4654 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
4655 {
4656         struct cgroup *cgrp;
4657
4658         cgroup_assert_mutexes_or_rcu_locked();
4659
4660         cgrp = idr_find(&ss->root->cgroup_idr, id);
4661         if (cgrp)
4662                 return cgroup_css(cgrp, ss);
4663         return NULL;
4664 }
4665
4666 #ifdef CONFIG_CGROUP_DEBUG
4667 static struct cgroup_subsys_state *
4668 debug_css_alloc(struct cgroup_subsys_state *parent_css)
4669 {
4670         struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
4671
4672         if (!css)
4673                 return ERR_PTR(-ENOMEM);
4674
4675         return css;
4676 }
4677
/* ->css_free() of the debug subsystem: frees what debug_css_alloc() allocated. */
static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}
4682
4683 static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
4684                                 struct cftype *cft)
4685 {
4686         return cgroup_task_count(css->cgroup);
4687 }
4688
4689 static u64 current_css_set_read(struct cgroup_subsys_state *css,
4690                                 struct cftype *cft)
4691 {
4692         return (u64)(unsigned long)current->cgroups;
4693 }
4694
4695 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
4696                                          struct cftype *cft)
4697 {
4698         u64 count;
4699
4700         rcu_read_lock();
4701         count = atomic_read(&task_css_set(current)->refcount);
4702         rcu_read_unlock();
4703         return count;
4704 }
4705
4706 static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
4707 {
4708         struct cgrp_cset_link *link;
4709         struct css_set *cset;
4710         char *name_buf;
4711
4712         name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
4713         if (!name_buf)
4714                 return -ENOMEM;
4715
4716         down_read(&css_set_rwsem);
4717         rcu_read_lock();
4718         cset = rcu_dereference(current->cgroups);
4719         list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
4720                 struct cgroup *c = link->cgrp;
4721
4722                 cgroup_name(c, name_buf, NAME_MAX + 1);
4723                 seq_printf(seq, "Root %d group %s\n",
4724                            c->root->hierarchy_id, name_buf);
4725         }
4726         rcu_read_unlock();
4727         up_read(&css_set_rwsem);
4728         kfree(name_buf);
4729         return 0;
4730 }
4731
4732 #define MAX_TASKS_SHOWN_PER_CSS 25
4733 static int cgroup_css_links_read(struct seq_file *seq, void *v)
4734 {
4735         struct cgroup_subsys_state *css = seq_css(seq);
4736         struct cgrp_cset_link *link;
4737
4738         down_read(&css_set_rwsem);
4739         list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
4740                 struct css_set *cset = link->cset;
4741                 struct task_struct *task;
4742                 int count = 0;
4743
4744                 seq_printf(seq, "css_set %p\n", cset);
4745
4746                 list_for_each_entry(task, &cset->tasks, cg_list) {
4747                         if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4748                                 goto overflow;
4749                         seq_printf(seq, "  task %d\n", task_pid_vnr(task));
4750                 }
4751
4752                 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
4753                         if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4754                                 goto overflow;
4755                         seq_printf(seq, "  task %d\n", task_pid_vnr(task));
4756                 }
4757                 continue;
4758         overflow:
4759                 seq_puts(seq, "  ...\n");
4760         }
4761         up_read(&css_set_rwsem);
4762         return 0;
4763 }
4764
4765 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
4766 {
4767         return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
4768 }
4769
4770 static struct cftype debug_files[] =  {
4771         {
4772                 .name = "taskcount",
4773                 .read_u64 = debug_taskcount_read,
4774         },
4775
4776         {
4777                 .name = "current_css_set",
4778                 .read_u64 = current_css_set_read,
4779         },
4780
4781         {
4782                 .name = "current_css_set_refcount",
4783                 .read_u64 = current_css_set_refcount_read,
4784         },
4785
4786         {
4787                 .name = "current_css_set_cg_links",
4788                 .seq_show = current_css_set_cg_links_read,
4789         },
4790
4791         {
4792                 .name = "cgroup_css_links",
4793                 .seq_show = cgroup_css_links_read,
4794         },
4795
4796         {
4797                 .name = "releasable",
4798                 .read_u64 = releasable_read,
4799         },
4800
4801         { }     /* terminate */
4802 };
4803
4804 struct cgroup_subsys debug_cgrp_subsys = {
4805         .css_alloc = debug_css_alloc,
4806         .css_free = debug_css_free,
4807         .base_cftypes = debug_files,
4808 };
4809 #endif /* CONFIG_CGROUP_DEBUG */