Merge branch 'work.mount0' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
[linux-2.6-block.git] / kernel / cgroup / cgroup.c
index 4a0eb465d17ec22c2134dd56778d8cbc02b8efab..753afbca549fdaeba9c133c192c1927d1d9c6f3c 100644 (file)
@@ -101,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
 
-struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
 
 #define cgroup_assert_mutex_or_rcu_locked()                            \
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
@@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[];
 
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
-static void css_task_iter_advance(struct css_task_iter *it);
+static void css_task_iter_skip(struct css_task_iter *it,
+                              struct task_struct *task);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
                                              struct cgroup_subsys *ss);
@@ -738,6 +739,7 @@ struct css_set init_css_set = {
        .dom_cset               = &init_css_set,
        .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
        .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
+       .dying_tasks            = LIST_HEAD_INIT(init_css_set.dying_tasks),
        .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
        .threaded_csets         = LIST_HEAD_INIT(init_css_set.threaded_csets),
        .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
@@ -843,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
                cgroup_update_populated(link->cgrp, populated);
 }
 
+/*
+ * @task is leaving, advance task iterators which are pointing to it so
+ * that they can resume at the next position.  Advancing an iterator might
+ * remove it from the list, use safe walk.  See css_task_iter_skip() for
+ * details.
+ */
+static void css_set_skip_task_iters(struct css_set *cset,
+                                   struct task_struct *task)
+{
+       struct css_task_iter *it, *pos;
+
+       list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
+               css_task_iter_skip(it, task);
+}
+
 /**
  * css_set_move_task - move a task from one css_set to another
  * @task: task being moved
@@ -868,22 +885,9 @@ static void css_set_move_task(struct task_struct *task,
                css_set_update_populated(to_cset, true);
 
        if (from_cset) {
-               struct css_task_iter *it, *pos;
-
                WARN_ON_ONCE(list_empty(&task->cg_list));
 
-               /*
-                * @task is leaving, advance task iterators which are
-                * pointing to it so that they can resume at the next
-                * position.  Advancing an iterator might remove it from
-                * the list, use safe walk.  See css_task_iter_advance*()
-                * for details.
-                */
-               list_for_each_entry_safe(it, pos, &from_cset->task_iters,
-                                        iters_node)
-                       if (it->task_pos == &task->cg_list)
-                               css_task_iter_advance(it);
-
+               css_set_skip_task_iters(from_cset, task);
                list_del_init(&task->cg_list);
                if (!css_set_populated(from_cset))
                        css_set_update_populated(from_cset, false);
@@ -1210,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
        cset->dom_cset = cset;
        INIT_LIST_HEAD(&cset->tasks);
        INIT_LIST_HEAD(&cset->mg_tasks);
+       INIT_LIST_HEAD(&cset->dying_tasks);
        INIT_LIST_HEAD(&cset->task_iters);
        INIT_LIST_HEAD(&cset->threaded_csets);
        INIT_HLIST_NODE(&cset->hlist);
@@ -1810,11 +1815,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 
 enum cgroup2_param {
        Opt_nsdelegate,
+       Opt_memory_localevents,
        nr__cgroup2_params
 };
 
 static const struct fs_parameter_spec cgroup2_param_specs[] = {
-       fsparam_flag  ("nsdelegate",            Opt_nsdelegate),
+       fsparam_flag("nsdelegate",              Opt_nsdelegate),
+       fsparam_flag("memory_localevents",      Opt_memory_localevents),
        {}
 };
 
@@ -1837,6 +1844,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
        case Opt_nsdelegate:
                ctx->flags |= CGRP_ROOT_NS_DELEGATE;
                return 0;
+       case Opt_memory_localevents:
+               ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+               return 0;
        }
        return -EINVAL;
 }
@@ -1848,6 +1858,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
                        cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
                else
                        cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+
+               if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+                       cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+               else
+                       cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
        }
 }
 
@@ -1855,6 +1870,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
 {
        if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
                seq_puts(seq, ",nsdelegate");
+       if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+               seq_puts(seq, ",memory_localevents");
        return 0;
 }
 
@@ -4252,6 +4269,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
 
        return NULL;
 }
+EXPORT_SYMBOL_GPL(css_next_descendant_pre);
 
 /**
  * css_rightmost_descendant - return the rightmost descendant of a css
@@ -4439,15 +4457,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
                        it->task_pos = NULL;
                        return;
                }
-       } while (!css_set_populated(cset));
+       } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
 
        if (!list_empty(&cset->tasks))
                it->task_pos = cset->tasks.next;
-       else
+       else if (!list_empty(&cset->mg_tasks))
                it->task_pos = cset->mg_tasks.next;
+       else
+               it->task_pos = cset->dying_tasks.next;
 
        it->tasks_head = &cset->tasks;
        it->mg_tasks_head = &cset->mg_tasks;
+       it->dying_tasks_head = &cset->dying_tasks;
 
        /*
         * We don't keep css_sets locked across iteration steps and thus
@@ -4473,9 +4494,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
        list_add(&it->iters_node, &cset->task_iters);
 }
 
+static void css_task_iter_skip(struct css_task_iter *it,
+                              struct task_struct *task)
+{
+       lockdep_assert_held(&css_set_lock);
+
+       if (it->task_pos == &task->cg_list) {
+               it->task_pos = it->task_pos->next;
+               it->flags |= CSS_TASK_ITER_SKIPPED;
+       }
+}
+
 static void css_task_iter_advance(struct css_task_iter *it)
 {
-       struct list_head *next;
+       struct task_struct *task;
 
        lockdep_assert_held(&css_set_lock);
 repeat:
@@ -4485,25 +4517,40 @@ repeat:
                 * consumed first and then ->mg_tasks.  After ->mg_tasks,
                 * we move onto the next cset.
                 */
-               next = it->task_pos->next;
-
-               if (next == it->tasks_head)
-                       next = it->mg_tasks_head->next;
+               if (it->flags & CSS_TASK_ITER_SKIPPED)
+                       it->flags &= ~CSS_TASK_ITER_SKIPPED;
+               else
+                       it->task_pos = it->task_pos->next;
 
-               if (next == it->mg_tasks_head)
+               if (it->task_pos == it->tasks_head)
+                       it->task_pos = it->mg_tasks_head->next;
+               if (it->task_pos == it->mg_tasks_head)
+                       it->task_pos = it->dying_tasks_head->next;
+               if (it->task_pos == it->dying_tasks_head)
                        css_task_iter_advance_css_set(it);
-               else
-                       it->task_pos = next;
        } else {
                /* called from start, proceed to the first cset */
                css_task_iter_advance_css_set(it);
        }
 
-       /* if PROCS, skip over tasks which aren't group leaders */
-       if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
-           !thread_group_leader(list_entry(it->task_pos, struct task_struct,
-                                           cg_list)))
-               goto repeat;
+       if (!it->task_pos)
+               return;
+
+       task = list_entry(it->task_pos, struct task_struct, cg_list);
+
+       if (it->flags & CSS_TASK_ITER_PROCS) {
+               /* if PROCS, skip over tasks which aren't group leaders */
+               if (!thread_group_leader(task))
+                       goto repeat;
+
+               /* and dying leaders w/o live member threads */
+               if (!atomic_read(&task->signal->live))
+                       goto repeat;
+       } else {
+               /* skip all dying ones */
+               if (task->flags & PF_EXITING)
+                       goto repeat;
+       }
 }
 
 /**
@@ -4559,6 +4606,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 
        spin_lock_irq(&css_set_lock);
 
+       /* @it may be half-advanced by skips, finish advancing */
+       if (it->flags & CSS_TASK_ITER_SKIPPED)
+               css_task_iter_advance(it);
+
        if (it->task_pos) {
                it->cur_task = list_entry(it->task_pos, struct task_struct,
                                          cg_list);
@@ -4998,8 +5049,6 @@ static void css_release_work_fn(struct work_struct *work)
                if (cgrp->kn)
                        RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
                                         NULL);
-
-               cgroup_bpf_put(cgrp);
        }
 
        mutex_unlock(&cgroup_mutex);
@@ -5525,6 +5574,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 
        cgroup1_check_for_release(parent);
 
+       cgroup_bpf_offline(cgrp);
+
        /* put the base reference */
        percpu_ref_kill(&cgrp->self.refcnt);
 
@@ -5659,7 +5710,6 @@ int __init cgroup_init(void)
        int ssid;
 
        BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
-       BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
@@ -6043,6 +6093,7 @@ void cgroup_exit(struct task_struct *tsk)
        if (!list_empty(&tsk->cg_list)) {
                spin_lock_irq(&css_set_lock);
                css_set_move_task(tsk, cset, NULL, false);
+               list_add_tail(&tsk->cg_list, &cset->dying_tasks);
                cset->nr_tasks--;
 
                WARN_ON_ONCE(cgroup_task_frozen(tsk));
@@ -6068,6 +6119,13 @@ void cgroup_release(struct task_struct *task)
        do_each_subsys_mask(ss, ssid, have_release_callback) {
                ss->release(task);
        } while_each_subsys_mask();
+
+       if (use_task_css_set_links) {
+               spin_lock_irq(&css_set_lock);
+               css_set_skip_task_iters(task_css_set(task), task);
+               list_del_init(&task->cg_list);
+               spin_unlock_irq(&css_set_lock);
+       }
 }
 
 void cgroup_free(struct task_struct *task)
@@ -6229,6 +6287,48 @@ struct cgroup *cgroup_get_from_fd(int fd)
 }
 EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
 
+static u64 power_of_ten(int power)
+{
+       u64 v = 1;
+       while (power--)
+               v *= 10;
+       return v;
+}
+
+/**
+ * cgroup_parse_float - parse a floating number
+ * @input: input string
+ * @dec_shift: number of decimal digits to shift
+ * @v: output
+ *
+ * Parse a decimal floating point number in @input and store the result in
+ * @v with decimal point right shifted @dec_shift times.  For example, if
+ * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.
+ * Returns 0 on success, -errno otherwise.
+ *
+ * There's nothing cgroup specific about this function except that it's
+ * currently the only user.
+ */
+int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
+{
+       s64 whole, frac = 0;
+       int fstart = 0, fend = 0, flen;
+
+       if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
+               return -EINVAL;
+       if (frac < 0)
+               return -EINVAL;
+
+       flen = fend > fstart ? fend - fstart : 0;
+       if (flen < dec_shift)
+               frac *= power_of_ten(dec_shift - flen);
+       else
+               frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
+
+       *v = whole * power_of_ten(dec_shift) + frac;
+       return 0;
+}
+
 /*
  * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
  * definition in cgroup-defs.h.
@@ -6267,6 +6367,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
                 * Don't use cgroup_get_live().
                 */
                cgroup_get(sock_cgroup_ptr(skcd));
+               cgroup_bpf_get(sock_cgroup_ptr(skcd));
                return;
        }
 
@@ -6278,6 +6379,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
                cset = task_css_set(current);
                if (likely(cgroup_tryget(cset->dfl_cgrp))) {
                        skcd->val = (unsigned long)cset->dfl_cgrp;
+                       cgroup_bpf_get(cset->dfl_cgrp);
                        break;
                }
                cpu_relax();
@@ -6288,7 +6390,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 
 void cgroup_sk_free(struct sock_cgroup_data *skcd)
 {
-       cgroup_put(sock_cgroup_ptr(skcd));
+       struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+       cgroup_bpf_put(cgrp);
+       cgroup_put(cgrp);
 }
 
 #endif /* CONFIG_SOCK_CGROUP_DATA */
@@ -6371,7 +6476,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
 static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
                             char *buf)
 {
-       return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+       return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
 }
 static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
 
@@ -6391,4 +6496,5 @@ static int __init cgroup_sysfs_init(void)
        return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
 }
 subsys_initcall(cgroup_sysfs_init);
+
 #endif /* CONFIG_SYSFS */