kernel/cgroup_pids.c

   1 /*
   2  * Process number limiting controller for cgroups.
   3  *
   4  * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
   5  * after a certain limit is reached.
   6  *
   7  * Since it is trivial to hit the task limit without hitting any kmemcg limits
   8  * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
   9  * preventable in the scope of a cgroup hierarchy by allowing resource limiting
  10  * of the number of tasks in a cgroup.
  11  *
  12  * In order to use the `pids` controller, set the maximum number of tasks in
  13  * pids.max (this is not available in the root cgroup for obvious reasons). The
  14  * number of processes currently in the cgroup is given by pids.current.
  15  * Organisational operations are not blocked by cgroup policies, so it is
  16  * possible to have pids.current > pids.max. However, it is not possible to
  17  * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
  18  * would cause a cgroup policy to be violated.
  19  *
  20  * To set a cgroup to have no limit, set pids.max to "max". This is the default
  21  * for all new cgroups (N.B. that PID limits are hierarchical, so the most
  22  * stringent limit in the hierarchy is followed).
  23  *
  24  * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
  25  * a superset of parent/child/pids.current.
  26  *
  27  * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
  28  *
  29  * This file is subject to the terms and conditions of version 2 of the GNU
  30  * General Public License.  See the file COPYING in the main directory of the
  31  * Linux distribution for more details.
  32  */
  33
  34 #include <linux/kernel.h>
  35 #include <linux/threads.h>
  36 #include <linux/atomic.h>
  37 #include <linux/cgroup.h>
  38 #include <linux/slab.h>
  39
  40 #define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
  41 #define PIDS_MAX_STR "max"
  42
  43 struct pids_cgroup {
  44         struct cgroup_subsys_state      css;
  45
  46         /*
  47          * Use 64-bit types so that we can safely represent "max" as
  48          * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
  49          */
  50         atomic64_t                      counter;
  51         int64_t                         limit;
  52 };
  53
  54 static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
  55 {
  56         return container_of(css, struct pids_cgroup, css);
  57 }
  58
  59 static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
  60 {
  61         return css_pids(pids->css.parent);
  62 }
  63
  64 static struct cgroup_subsys_state *
  65 pids_css_alloc(struct cgroup_subsys_state *parent)
  66 {
  67         struct pids_cgroup *pids;
  68
  69         pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
  70         if (!pids)
  71                 return ERR_PTR(-ENOMEM);
  72
  73         pids->limit = PIDS_MAX;
  74         atomic64_set(&pids->counter, 0);
  75         return &pids->css;
  76 }
  77
  78 static void pids_css_free(struct cgroup_subsys_state *css)
  79 {
  80         kfree(css_pids(css));
  81 }
  82
  83 /**
  84  * pids_cancel - uncharge the local pid count
  85  * @pids: the pid cgroup state
  86  * @num: the number of pids to cancel
  87  *
  88  * This function will WARN if the pid count goes under 0, because such a case is
  89  * a bug in the pids controller proper.
  90  */
  91 static void pids_cancel(struct pids_cgroup *pids, int num)
  92 {
  93         /*
  94          * A negative count (or overflow for that matter) is invalid,
  95          * and indicates a bug in the `pids` controller proper.
  96          */
  97         WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
  98 }
  99
 100 /**
 101  * pids_uncharge - hierarchically uncharge the pid count
 102  * @pids: the pid cgroup state
 103  * @num: the number of pids to uncharge
 104  */
 105 static void pids_uncharge(struct pids_cgroup *pids, int num)
 106 {
 107         struct pids_cgroup *p;
 108
 109         for (p = pids; p; p = parent_pids(p))
 110                 pids_cancel(p, num);
 111 }
 112
 113 /**
 114  * pids_charge - hierarchically charge the pid count
 115  * @pids: the pid cgroup state
 116  * @num: the number of pids to charge
 117  *
 118  * This function does *not* follow the pid limit set. It cannot fail and the new
 119  * pid count may exceed the limit. This is only used for reverting failed
 120  * attaches, where there is no other way out than violating the limit.
 121  */
 122 static void pids_charge(struct pids_cgroup *pids, int num)
 123 {
 124         struct pids_cgroup *p;
 125
 126         for (p = pids; p; p = parent_pids(p))
 127                 atomic64_add(num, &p->counter);
 128 }
 129
 130 /**
 131  * pids_try_charge - hierarchically try to charge the pid count
 132  * @pids: the pid cgroup state
 133  * @num: the number of pids to charge
 134  *
 135  * This function follows the set limit. It will fail if the charge would cause
 136  * the new value to exceed the hierarchical limit. Returns 0 if the charge
 137  * succeded, otherwise -EAGAIN.
 138  */
 139 static int pids_try_charge(struct pids_cgroup *pids, int num)
 140 {
 141         struct pids_cgroup *p, *q;
 142
 143         for (p = pids; p; p = parent_pids(p)) {
 144                 int64_t new = atomic64_add_return(num, &p->counter);
 145
 146                 /*
 147                  * Since new is capped to the maximum number of pid_t, if
 148                  * p->limit is %PIDS_MAX then we know that this test will never
 149                  * fail.
 150                  */
 151                 if (new > p->limit)
 152                         goto revert;
 153         }
 154
 155         return 0;
 156
 157 revert:
 158         for (q = pids; q != p; q = parent_pids(q))
 159                 pids_cancel(q, num);
 160         pids_cancel(p, num);
 161
 162         return -EAGAIN;
 163 }
 164
 165 static int pids_can_attach(struct cgroup_subsys_state *css,
 166                            struct cgroup_taskset *tset)
 167 {
 168         struct pids_cgroup *pids = css_pids(css);
 169         struct task_struct *task;
 170
 171         cgroup_taskset_for_each(task, tset) {
 172                 struct cgroup_subsys_state *old_css;
 173                 struct pids_cgroup *old_pids;
 174
 175                 /*
 176                  * Grab a ref to each task's css. We don't drop the ref until
 177                  * we either fail and hit ->cancel_attach() or succeed and hit
 178                  * ->attach().
 179                  */
 180                 old_css = task_get_css(task, pids_cgrp_id);
 181                 old_pids = css_pids(old_css);
 182
 183                 pids_charge(pids, 1);
 184                 pids_uncharge(old_pids, 1);
 185         }
 186
 187         return 0;
 188 }
 189
 190 static void pids_cancel_attach(struct cgroup_subsys_state *css,
 191                                struct cgroup_taskset *tset)
 192 {
 193         struct pids_cgroup *pids = css_pids(css);
 194         struct task_struct *task;
 195
 196         cgroup_taskset_for_each(task, tset) {
 197                 struct cgroup_subsys_state *old_css;
 198                 struct pids_cgroup *old_pids;
 199
 200                 old_css = task_css(task, pids_cgrp_id);
 201                 old_pids = css_pids(old_css);
 202
 203                 pids_charge(old_pids, 1);
 204                 pids_uncharge(pids, 1);
 205                 css_put(old_css);
 206         }
 207 }
 208
 209 static void pids_attach(struct cgroup_subsys_state *css,
 210                         struct cgroup_taskset *tset)
 211 {
 212         struct task_struct *task;
 213
 214         cgroup_taskset_for_each(task, tset)
 215                 css_put(task_css(task, pids_cgrp_id));
 216 }
 217
 218 static int pids_can_fork(struct task_struct *task, void **priv_p)
 219 {
 220         struct cgroup_subsys_state *css;
 221         struct pids_cgroup *pids;
 222         int err;
 223
 224         /*
 225          * Use the "current" task_css for the pids subsystem as the tentative
 226          * css. It is possible we will charge the wrong hierarchy, in which
 227          * case we will forcefully revert/reapply the charge on the right
 228          * hierarchy after it is committed to the task proper.
 229          */
 230         css = task_get_css(current, pids_cgrp_id);
 231         pids = css_pids(css);
 232
 233         err = pids_try_charge(pids, 1);
 234         if (err)
 235                 goto err_css_put;
 236
 237         *priv_p = css;
 238         return 0;
 239
 240 err_css_put:
 241         css_put(css);
 242         return err;
 243 }
 244
 245 static void pids_cancel_fork(struct task_struct *task, void *priv)
 246 {
 247         struct cgroup_subsys_state *css = priv;
 248         struct pids_cgroup *pids = css_pids(css);
 249
 250         pids_uncharge(pids, 1);
 251         css_put(css);
 252 }
 253
 254 static void pids_fork(struct task_struct *task, void *priv)
 255 {
 256         struct cgroup_subsys_state *css;
 257         struct cgroup_subsys_state *old_css = priv;
 258         struct pids_cgroup *pids;
 259         struct pids_cgroup *old_pids = css_pids(old_css);
 260
 261         css = task_get_css(task, pids_cgrp_id);
 262         pids = css_pids(css);
 263
 264         /*
 265          * If the association has changed, we have to revert and reapply the
 266          * charge/uncharge on the wrong hierarchy to the current one. Since
 267          * the association can only change due to an organisation event, its
 268          * okay for us to ignore the limit in this case.
 269          */
 270         if (pids != old_pids) {
 271                 pids_uncharge(old_pids, 1);
 272                 pids_charge(pids, 1);
 273         }
 274
 275         css_put(css);
 276         css_put(old_css);
 277 }
 278
 279 static void pids_exit(struct cgroup_subsys_state *css,
 280                       struct cgroup_subsys_state *old_css,
 281                       struct task_struct *task)
 282 {
 283         struct pids_cgroup *pids = css_pids(old_css);
 284
 285         pids_uncharge(pids, 1);
 286 }
 287
 288 static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
 289                               size_t nbytes, loff_t off)
 290 {
 291         struct cgroup_subsys_state *css = of_css(of);
 292         struct pids_cgroup *pids = css_pids(css);
 293         int64_t limit;
 294         int err;
 295
 296         buf = strstrip(buf);
 297         if (!strcmp(buf, PIDS_MAX_STR)) {
 298                 limit = PIDS_MAX;
 299                 goto set_limit;
 300         }
 301
 302         err = kstrtoll(buf, 0, &limit);
 303         if (err)
 304                 return err;
 305
 306         if (limit < 0 || limit >= PIDS_MAX)
 307                 return -EINVAL;
 308
 309 set_limit:
 310         /*
 311          * Limit updates don't need to be mutex'd, since it isn't
 312          * critical that any racing fork()s follow the new limit.
 313          */
 314         pids->limit = limit;
 315         return nbytes;
 316 }
 317
 318 static int pids_max_show(struct seq_file *sf, void *v)
 319 {
 320         struct cgroup_subsys_state *css = seq_css(sf);
 321         struct pids_cgroup *pids = css_pids(css);
 322         int64_t limit = pids->limit;
 323
 324         if (limit >= PIDS_MAX)
 325                 seq_printf(sf, "%s\n", PIDS_MAX_STR);
 326         else
 327                 seq_printf(sf, "%lld\n", limit);
 328
 329         return 0;
 330 }
 331
 332 static s64 pids_current_read(struct cgroup_subsys_state *css,
 333                              struct cftype *cft)
 334 {
 335         struct pids_cgroup *pids = css_pids(css);
 336
 337         return atomic64_read(&pids->counter);
 338 }
 339
 340 static struct cftype pids_files[] = {
 341         {
 342                 .name = "max",
 343                 .write = pids_max_write,
 344                 .seq_show = pids_max_show,
 345                 .flags = CFTYPE_NOT_ON_ROOT,
 346         },
 347         {
 348                 .name = "current",
 349                 .read_s64 = pids_current_read,
 350         },
 351         { }     /* terminate */
 352 };
 353
 354 struct cgroup_subsys pids_cgrp_subsys = {
 355         .css_alloc      = pids_css_alloc,
 356         .css_free       = pids_css_free,
 357         .attach         = pids_attach,
 358         .can_attach     = pids_can_attach,
 359         .cancel_attach  = pids_cancel_attach,
 360         .can_fork       = pids_can_fork,
 361         .cancel_fork    = pids_cancel_fork,
 362         .fork           = pids_fork,
 363         .exit           = pids_exit,
 364         .legacy_cftypes = pids_files,
 365         .dfl_cftypes    = pids_files,
 366 };