kernel/cpuset.c

   1 /*
   2  *  kernel/cpuset.c
   3  *
   4  *  Processor and Memory placement constraints for sets of tasks.
   5  *
   6  *  Copyright (C) 2003 BULL SA.
   7  *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
   8  *  Copyright (C) 2006 Google, Inc
   9  *
  10  *  Portions derived from Patrick Mochel's sysfs code.
  11  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  12  *
  13  *  2003-10-10 Written by Simon Derr.
  14  *  2003-10-22 Updates by Stephen Hemminger.
  15  *  2004 May-July Rework by Paul Jackson.
  16  *  2006 Rework by Paul Menage to use generic cgroups
  17  *
  18  *  This file is subject to the terms and conditions of the GNU General Public
  19  *  License.  See the file COPYING in the main directory of the Linux
  20  *  distribution for more details.
  21  */
  22
  23 #include <linux/cpu.h>
  24 #include <linux/cpumask.h>
  25 #include <linux/cpuset.h>
  26 #include <linux/err.h>
  27 #include <linux/errno.h>
  28 #include <linux/file.h>
  29 #include <linux/fs.h>
  30 #include <linux/init.h>
  31 #include <linux/interrupt.h>
  32 #include <linux/kernel.h>
  33 #include <linux/kmod.h>
  34 #include <linux/list.h>
  35 #include <linux/mempolicy.h>
  36 #include <linux/mm.h>
  37 #include <linux/module.h>
  38 #include <linux/mount.h>
  39 #include <linux/namei.h>
  40 #include <linux/pagemap.h>
  41 #include <linux/prio_heap.h>
  42 #include <linux/proc_fs.h>
  43 #include <linux/rcupdate.h>
  44 #include <linux/sched.h>
  45 #include <linux/seq_file.h>
  46 #include <linux/security.h>
  47 #include <linux/slab.h>
  48 #include <linux/spinlock.h>
  49 #include <linux/stat.h>
  50 #include <linux/string.h>
  51 #include <linux/time.h>
  52 #include <linux/backing-dev.h>
  53 #include <linux/sort.h>
  54
  55 #include <asm/uaccess.h>
  56 #include <asm/atomic.h>
  57 #include <linux/mutex.h>
  58 #include <linux/kfifo.h>
  59 #include <linux/workqueue.h>
  60 #include <linux/cgroup.h>
  61
  62 /*
  63  * Tracks how many cpusets are currently defined in system.
  64  * When there is only one cpuset (the root cpuset) we can
  65  * short circuit some hooks.
  66  */
  67 int number_of_cpusets __read_mostly;
  68
  69 /* Retrieve the cpuset from a cgroup */
  70 struct cgroup_subsys cpuset_subsys;
  71 struct cpuset;
  72
  73 /* See "Frequency meter" comments, below. */
  74
  75 struct fmeter {
  76         int cnt;                /* unprocessed events count */
  77         int val;                /* most recent output value */
  78         time_t time;            /* clock (secs) when val computed */
  79         spinlock_t lock;        /* guards read or write of above */
  80 };
  81
  82 struct cpuset {
  83         struct cgroup_subsys_state css;
  84
  85         unsigned long flags;            /* "unsigned long" so bitops work */
  86         cpumask_t cpus_allowed;         /* CPUs allowed to tasks in cpuset */
  87         nodemask_t mems_allowed;        /* Memory Nodes allowed to tasks */
  88
  89         struct cpuset *parent;          /* my parent */
  90
  91         /*
  92          * Copy of global cpuset_mems_generation as of the most
  93          * recent time this cpuset changed its mems_allowed.
  94          */
  95         int mems_generation;
  96
  97         struct fmeter fmeter;           /* memory_pressure filter */
  98
  99         /* partition number for rebuild_sched_domains() */
 100         int pn;
 101
 102         /* used for walking a cpuset heirarchy */
 103         struct list_head stack_list;
 104 };
 105
 106 /* Retrieve the cpuset for a cgroup */
 107 static inline struct cpuset *cgroup_cs(struct cgroup *cont)
 108 {
 109         return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
 110                             struct cpuset, css);
 111 }
 112
 113 /* Retrieve the cpuset for a task */
 114 static inline struct cpuset *task_cs(struct task_struct *task)
 115 {
 116         return container_of(task_subsys_state(task, cpuset_subsys_id),
 117                             struct cpuset, css);
 118 }
 119 struct cpuset_hotplug_scanner {
 120         struct cgroup_scanner scan;
 121         struct cgroup *to;
 122 };
 123
 124 /* bits in struct cpuset flags field */
 125 typedef enum {
 126         CS_CPU_EXCLUSIVE,
 127         CS_MEM_EXCLUSIVE,
 128         CS_MEMORY_MIGRATE,
 129         CS_SCHED_LOAD_BALANCE,
 130         CS_SPREAD_PAGE,
 131         CS_SPREAD_SLAB,
 132 } cpuset_flagbits_t;
 133
 134 /* convenient tests for these bits */
 135 static inline int is_cpu_exclusive(const struct cpuset *cs)
 136 {
 137         return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
 138 }
 139
 140 static inline int is_mem_exclusive(const struct cpuset *cs)
 141 {
 142         return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 143 }
 144
 145 static inline int is_sched_load_balance(const struct cpuset *cs)
 146 {
 147         return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 148 }
 149
 150 static inline int is_memory_migrate(const struct cpuset *cs)
 151 {
 152         return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
 153 }
 154
 155 static inline int is_spread_page(const struct cpuset *cs)
 156 {
 157         return test_bit(CS_SPREAD_PAGE, &cs->flags);
 158 }
 159
 160 static inline int is_spread_slab(const struct cpuset *cs)
 161 {
 162         return test_bit(CS_SPREAD_SLAB, &cs->flags);
 163 }
 164
 165 /*
 166  * Increment this integer everytime any cpuset changes its
 167  * mems_allowed value.  Users of cpusets can track this generation
 168  * number, and avoid having to lock and reload mems_allowed unless
 169  * the cpuset they're using changes generation.
 170  *
 171  * A single, global generation is needed because attach_task() could
 172  * reattach a task to a different cpuset, which must not have its
 173  * generation numbers aliased with those of that tasks previous cpuset.
 174  *
 175  * Generations are needed for mems_allowed because one task cannot
 176  * modify anothers memory placement.  So we must enable every task,
 177  * on every visit to __alloc_pages(), to efficiently check whether
 178  * its current->cpuset->mems_allowed has changed, requiring an update
 179  * of its current->mems_allowed.
 180  *
 181  * Since cpuset_mems_generation is guarded by manage_mutex,
 182  * there is no need to mark it atomic.
 183  */
 184 static int cpuset_mems_generation;
 185
 186 static struct cpuset top_cpuset = {
 187         .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
 188         .cpus_allowed = CPU_MASK_ALL,
 189         .mems_allowed = NODE_MASK_ALL,
 190 };
 191
 192 /*
 193  * We have two global cpuset mutexes below.  They can nest.
 194  * It is ok to first take manage_mutex, then nest callback_mutex.  We also
 195  * require taking task_lock() when dereferencing a tasks cpuset pointer.
 196  * See "The task_lock() exception", at the end of this comment.
 197  *
 198  * A task must hold both mutexes to modify cpusets.  If a task
 199  * holds manage_mutex, then it blocks others wanting that mutex,
 200  * ensuring that it is the only task able to also acquire callback_mutex
 201  * and be able to modify cpusets.  It can perform various checks on
 202  * the cpuset structure first, knowing nothing will change.  It can
 203  * also allocate memory while just holding manage_mutex.  While it is
 204  * performing these checks, various callback routines can briefly
 205  * acquire callback_mutex to query cpusets.  Once it is ready to make
 206  * the changes, it takes callback_mutex, blocking everyone else.
 207  *
 208  * Calls to the kernel memory allocator can not be made while holding
 209  * callback_mutex, as that would risk double tripping on callback_mutex
 210  * from one of the callbacks into the cpuset code from within
 211  * __alloc_pages().
 212  *
 213  * If a task is only holding callback_mutex, then it has read-only
 214  * access to cpusets.
 215  *
 216  * The task_struct fields mems_allowed and mems_generation may only
 217  * be accessed in the context of that task, so require no locks.
 218  *
 219  * Any task can increment and decrement the count field without lock.
 220  * So in general, code holding manage_mutex or callback_mutex can't rely
 221  * on the count field not changing.  However, if the count goes to
 222  * zero, then only attach_task(), which holds both mutexes, can
 223  * increment it again.  Because a count of zero means that no tasks
 224  * are currently attached, therefore there is no way a task attached
 225  * to that cpuset can fork (the other way to increment the count).
 226  * So code holding manage_mutex or callback_mutex can safely assume that
 227  * if the count is zero, it will stay zero.  Similarly, if a task
 228  * holds manage_mutex or callback_mutex on a cpuset with zero count, it
 229  * knows that the cpuset won't be removed, as cpuset_rmdir() needs
 230  * both of those mutexes.
 231  *
 232  * The cpuset_common_file_write handler for operations that modify
 233  * the cpuset hierarchy holds manage_mutex across the entire operation,
 234  * single threading all such cpuset modifications across the system.
 235  *
 236  * The cpuset_common_file_read() handlers only hold callback_mutex across
 237  * small pieces of code, such as when reading out possibly multi-word
 238  * cpumasks and nodemasks.
 239  *
 240  * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
 241  * (usually) take either mutex.  These are the two most performance
 242  * critical pieces of code here.  The exception occurs on cpuset_exit(),
 243  * when a task in a notify_on_release cpuset exits.  Then manage_mutex
 244  * is taken, and if the cpuset count is zero, a usermode call made
 245  * to /sbin/cpuset_release_agent with the name of the cpuset (path
 246  * relative to the root of cpuset file system) as the argument.
 247  *
 248  * A cpuset can only be deleted if both its 'count' of using tasks
 249  * is zero, and its list of 'children' cpusets is empty.  Since all
 250  * tasks in the system use _some_ cpuset, and since there is always at
 251  * least one task in the system (init), therefore, top_cpuset
 252  * always has either children cpusets and/or using tasks.  So we don't
 253  * need a special hack to ensure that top_cpuset cannot be deleted.
 254  *
 255  * The above "Tale of Two Semaphores" would be complete, but for:
 256  *
 257  *      The task_lock() exception
 258  *
 259  * The need for this exception arises from the action of attach_task(),
 260  * which overwrites one tasks cpuset pointer with another.  It does
 261  * so using both mutexes, however there are several performance
 262  * critical places that need to reference task->cpuset without the
 263  * expense of grabbing a system global mutex.  Therefore except as
 264  * noted below, when dereferencing or, as in attach_task(), modifying
 265  * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
 266  * (task->alloc_lock) already in the task_struct routinely used for
 267  * such matters.
 268  *
 269  * P.S.  One more locking exception.  RCU is used to guard the
 270  * update of a tasks cpuset pointer by attach_task() and the
 271  * access of task->cpuset->mems_generation via that pointer in
 272  * the routine cpuset_update_task_memory_state().
 273  */
 274
 275 static DEFINE_MUTEX(callback_mutex);
 276
 277 /* This is ugly, but preserves the userspace API for existing cpuset
 278  * users. If someone tries to mount the "cpuset" filesystem, we
 279  * silently switch it to mount "cgroup" instead */
 280 static int cpuset_get_sb(struct file_system_type *fs_type,
 281                          int flags, const char *unused_dev_name,
 282                          void *data, struct vfsmount *mnt)
 283 {
 284         struct file_system_type *cgroup_fs = get_fs_type("cgroup");
 285         int ret = -ENODEV;
 286         if (cgroup_fs) {
 287                 char mountopts[] =
 288                         "cpuset,noprefix,"
 289                         "release_agent=/sbin/cpuset_release_agent";
 290                 ret = cgroup_fs->get_sb(cgroup_fs, flags,
 291                                            unused_dev_name, mountopts, mnt);
 292                 put_filesystem(cgroup_fs);
 293         }
 294         return ret;
 295 }
 296
 297 static struct file_system_type cpuset_fs_type = {
 298         .name = "cpuset",
 299         .get_sb = cpuset_get_sb,
 300 };
 301
 302 /*
 303  * Return in *pmask the portion of a cpusets's cpus_allowed that
 304  * are online.  If none are online, walk up the cpuset hierarchy
 305  * until we find one that does have some online cpus.  If we get
 306  * all the way to the top and still haven't found any online cpus,
 307  * return cpu_online_map.  Or if passed a NULL cs from an exit'ing
 308  * task, return cpu_online_map.
 309  *
 310  * One way or another, we guarantee to return some non-empty subset
 311  * of cpu_online_map.
 312  *
 313  * Call with callback_mutex held.
 314  */
 315
 316 static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
 317 {
 318         while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
 319                 cs = cs->parent;
 320         if (cs)
 321                 cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
 322         else
 323                 *pmask = cpu_online_map;
 324         BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
 325 }
 326
 327 /*
 328  * Return in *pmask the portion of a cpusets's mems_allowed that
 329  * are online, with memory.  If none are online with memory, walk
 330  * up the cpuset hierarchy until we find one that does have some
 331  * online mems.  If we get all the way to the top and still haven't
 332  * found any online mems, return node_states[N_HIGH_MEMORY].
 333  *
 334  * One way or another, we guarantee to return some non-empty subset
 335  * of node_states[N_HIGH_MEMORY].
 336  *
 337  * Call with callback_mutex held.
 338  */
 339
 340 static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 341 {
 342         while (cs && !nodes_intersects(cs->mems_allowed,
 343                                         node_states[N_HIGH_MEMORY]))
 344                 cs = cs->parent;
 345         if (cs)
 346                 nodes_and(*pmask, cs->mems_allowed,
 347                                         node_states[N_HIGH_MEMORY]);
 348         else
 349                 *pmask = node_states[N_HIGH_MEMORY];
 350         BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
 351 }
 352
 353 /**
 354  * cpuset_update_task_memory_state - update task memory placement
 355  *
 356  * If the current tasks cpusets mems_allowed changed behind our
 357  * backs, update current->mems_allowed, mems_generation and task NUMA
 358  * mempolicy to the new value.
 359  *
 360  * Task mempolicy is updated by rebinding it relative to the
 361  * current->cpuset if a task has its memory placement changed.
 362  * Do not call this routine if in_interrupt().
 363  *
 364  * Call without callback_mutex or task_lock() held.  May be
 365  * called with or without manage_mutex held.  Thanks in part to
 366  * 'the_top_cpuset_hack', the tasks cpuset pointer will never
 367  * be NULL.  This routine also might acquire callback_mutex and
 368  * current->mm->mmap_sem during call.
 369  *
 370  * Reading current->cpuset->mems_generation doesn't need task_lock
 371  * to guard the current->cpuset derefence, because it is guarded
 372  * from concurrent freeing of current->cpuset by attach_task(),
 373  * using RCU.
 374  *
 375  * The rcu_dereference() is technically probably not needed,
 376  * as I don't actually mind if I see a new cpuset pointer but
 377  * an old value of mems_generation.  However this really only
 378  * matters on alpha systems using cpusets heavily.  If I dropped
 379  * that rcu_dereference(), it would save them a memory barrier.
 380  * For all other arch's, rcu_dereference is a no-op anyway, and for
 381  * alpha systems not using cpusets, another planned optimization,
 382  * avoiding the rcu critical section for tasks in the root cpuset
 383  * which is statically allocated, so can't vanish, will make this
 384  * irrelevant.  Better to use RCU as intended, than to engage in
 385  * some cute trick to save a memory barrier that is impossible to
 386  * test, for alpha systems using cpusets heavily, which might not
 387  * even exist.
 388  *
 389  * This routine is needed to update the per-task mems_allowed data,
 390  * within the tasks context, when it is trying to allocate memory
 391  * (in various mm/mempolicy.c routines) and notices that some other
 392  * task has been modifying its cpuset.
 393  */
 394
 395 void cpuset_update_task_memory_state(void)
 396 {
 397         int my_cpusets_mem_gen;
 398         struct task_struct *tsk = current;
 399         struct cpuset *cs;
 400
 401         if (task_cs(tsk) == &top_cpuset) {
 402                 /* Don't need rcu for top_cpuset.  It's never freed. */
 403                 my_cpusets_mem_gen = top_cpuset.mems_generation;
 404         } else {
 405                 rcu_read_lock();
 406                 my_cpusets_mem_gen = task_cs(current)->mems_generation;
 407                 rcu_read_unlock();
 408         }
 409
 410         if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
 411                 mutex_lock(&callback_mutex);
 412                 task_lock(tsk);
 413                 cs = task_cs(tsk); /* Maybe changed when task not locked */
 414                 guarantee_online_mems(cs, &tsk->mems_allowed);
 415                 tsk->cpuset_mems_generation = cs->mems_generation;
 416                 if (is_spread_page(cs))
 417                         tsk->flags |= PF_SPREAD_PAGE;
 418                 else
 419                         tsk->flags &= ~PF_SPREAD_PAGE;
 420                 if (is_spread_slab(cs))
 421                         tsk->flags |= PF_SPREAD_SLAB;
 422                 else
 423                         tsk->flags &= ~PF_SPREAD_SLAB;
 424                 task_unlock(tsk);
 425                 mutex_unlock(&callback_mutex);
 426                 mpol_rebind_task(tsk, &tsk->mems_allowed);
 427         }
 428 }
 429
 430 /*
 431  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 432  *
 433  * One cpuset is a subset of another if all its allowed CPUs and
 434  * Memory Nodes are a subset of the other, and its exclusive flags
 435  * are only set if the other's are set.  Call holding manage_mutex.
 436  */
 437
 438 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 439 {
 440         return  cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
 441                 nodes_subset(p->mems_allowed, q->mems_allowed) &&
 442                 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
 443                 is_mem_exclusive(p) <= is_mem_exclusive(q);
 444 }
 445
 446 /*
 447  * validate_change() - Used to validate that any proposed cpuset change
 448  *                     follows the structural rules for cpusets.
 449  *
 450  * If we replaced the flag and mask values of the current cpuset
 451  * (cur) with those values in the trial cpuset (trial), would
 452  * our various subset and exclusive rules still be valid?  Presumes
 453  * manage_mutex held.
 454  *
 455  * 'cur' is the address of an actual, in-use cpuset.  Operations
 456  * such as list traversal that depend on the actual address of the
 457  * cpuset in the list must use cur below, not trial.
 458  *
 459  * 'trial' is the address of bulk structure copy of cur, with
 460  * perhaps one or more of the fields cpus_allowed, mems_allowed,
 461  * or flags changed to new, trial values.
 462  *
 463  * Return 0 if valid, -errno if not.
 464  */
 465
 466 static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 467 {
 468         struct cgroup *cont;
 469         struct cpuset *c, *par;
 470
 471         /* Each of our child cpusets must be a subset of us */
 472         list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
 473                 if (!is_cpuset_subset(cgroup_cs(cont), trial))
 474                         return -EBUSY;
 475         }
 476
 477         /* Remaining checks don't apply to root cpuset */
 478         if (cur == &top_cpuset)
 479                 return 0;
 480
 481         par = cur->parent;
 482
 483         /* We must be a subset of our parent cpuset */
 484         if (!is_cpuset_subset(trial, par))
 485                 return -EACCES;
 486
 487         /* If either I or some sibling (!= me) is exclusive, we can't overlap */
 488         list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
 489                 c = cgroup_cs(cont);
 490                 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
 491                     c != cur &&
 492                     cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
 493                         return -EINVAL;
 494                 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
 495                     c != cur &&
 496                     nodes_intersects(trial->mems_allowed, c->mems_allowed))
 497                         return -EINVAL;
 498         }
 499
 500         /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
 501         if (cgroup_task_count(cur->css.cgroup)) {
 502                 if (cpus_empty(trial->cpus_allowed) ||
 503                     nodes_empty(trial->mems_allowed)) {
 504                         return -ENOSPC;
 505                 }
 506         }
 507
 508         return 0;
 509 }
 510
 511 /*
 512  * Helper routine for rebuild_sched_domains().
 513  * Do cpusets a, b have overlapping cpus_allowed masks?
 514  */
 515
 516 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 517 {
 518         return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
 519 }
 520
 521 /*
 522  * rebuild_sched_domains()
 523  *
 524  * If the flag 'sched_load_balance' of any cpuset with non-empty
 525  * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 526  * which has that flag enabled, or if any cpuset with a non-empty
 527  * 'cpus' is removed, then call this routine to rebuild the
 528  * scheduler's dynamic sched domains.
 529  *
 530  * This routine builds a partial partition of the systems CPUs
 531  * (the set of non-overlappping cpumask_t's in the array 'part'
 532  * below), and passes that partial partition to the kernel/sched.c
 533  * partition_sched_domains() routine, which will rebuild the
 534  * schedulers load balancing domains (sched domains) as specified
 535  * by that partial partition.  A 'partial partition' is a set of
 536  * non-overlapping subsets whose union is a subset of that set.
 537  *
 538  * See "What is sched_load_balance" in Documentation/cpusets.txt
 539  * for a background explanation of this.
 540  *
 541  * Does not return errors, on the theory that the callers of this
 542  * routine would rather not worry about failures to rebuild sched
 543  * domains when operating in the severe memory shortage situations
 544  * that could cause allocation failures below.
 545  *
 546  * Call with cgroup_mutex held.  May take callback_mutex during
 547  * call due to the kfifo_alloc() and kmalloc() calls.  May nest
 548  * a call to the get_online_cpus()/put_online_cpus() pair.
 549  * Must not be called holding callback_mutex, because we must not
 550  * call get_online_cpus() while holding callback_mutex.  Elsewhere
 551  * the kernel nests callback_mutex inside get_online_cpus() calls.
 552  * So the reverse nesting would risk an ABBA deadlock.
 553  *
 554  * The three key local variables below are:
 555  *    q  - a kfifo queue of cpuset pointers, used to implement a
 556  *         top-down scan of all cpusets.  This scan loads a pointer
 557  *         to each cpuset marked is_sched_load_balance into the
 558  *         array 'csa'.  For our purposes, rebuilding the schedulers
 559  *         sched domains, we can ignore !is_sched_load_balance cpusets.
 560  *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 561  *         that need to be load balanced, for convenient iterative
 562  *         access by the subsequent code that finds the best partition,
 563  *         i.e the set of domains (subsets) of CPUs such that the
 564  *         cpus_allowed of every cpuset marked is_sched_load_balance
 565  *         is a subset of one of these domains, while there are as
 566  *         many such domains as possible, each as small as possible.
 567  * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 568  *         the kernel/sched.c routine partition_sched_domains() in a
 569  *         convenient format, that can be easily compared to the prior
 570  *         value to determine what partition elements (sched domains)
 571  *         were changed (added or removed.)
 572  *
 573  * Finding the best partition (set of domains):
 574  *      The triple nested loops below over i, j, k scan over the
 575  *      load balanced cpusets (using the array of cpuset pointers in
 576  *      csa[]) looking for pairs of cpusets that have overlapping
 577  *      cpus_allowed, but which don't have the same 'pn' partition
 578  *      number and gives them in the same partition number.  It keeps
 579  *      looping on the 'restart' label until it can no longer find
 580  *      any such pairs.
 581  *
 582  *      The union of the cpus_allowed masks from the set of
 583  *      all cpusets having the same 'pn' value then form the one
 584  *      element of the partition (one sched domain) to be passed to
 585  *      partition_sched_domains().
 586  */
 587
 588 static void rebuild_sched_domains(void)
 589 {
 590         struct kfifo *q;        /* queue of cpusets to be scanned */
 591         struct cpuset *cp;      /* scans q */
 592         struct cpuset **csa;    /* array of all cpuset ptrs */
 593         int csn;                /* how many cpuset ptrs in csa so far */
 594         int i, j, k;            /* indices for partition finding loops */
 595         cpumask_t *doms;        /* resulting partition; i.e. sched domains */
 596         int ndoms;              /* number of sched domains in result */
 597         int nslot;              /* next empty doms[] cpumask_t slot */
 598
 599         q = NULL;
 600         csa = NULL;
 601         doms = NULL;
 602
 603         /* Special case for the 99% of systems with one, full, sched domain */
 604         if (is_sched_load_balance(&top_cpuset)) {
 605                 ndoms = 1;
 606                 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 607                 if (!doms)
 608                         goto rebuild;
 609                 *doms = top_cpuset.cpus_allowed;
 610                 goto rebuild;
 611         }
 612
 613         q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
 614         if (IS_ERR(q))
 615                 goto done;
 616         csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
 617         if (!csa)
 618                 goto done;
 619         csn = 0;
 620
 621         cp = &top_cpuset;
 622         __kfifo_put(q, (void *)&cp, sizeof(cp));
 623         while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
 624                 struct cgroup *cont;
 625                 struct cpuset *child;   /* scans child cpusets of cp */
 626                 if (is_sched_load_balance(cp))
 627                         csa[csn++] = cp;
 628                 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
 629                         child = cgroup_cs(cont);
 630                         __kfifo_put(q, (void *)&child, sizeof(cp));
 631                 }
 632         }
 633
 634         for (i = 0; i < csn; i++)
 635                 csa[i]->pn = i;
 636         ndoms = csn;
 637
 638 restart:
 639         /* Find the best partition (set of sched domains) */
 640         for (i = 0; i < csn; i++) {
 641                 struct cpuset *a = csa[i];
 642                 int apn = a->pn;
 643
 644                 for (j = 0; j < csn; j++) {
 645                         struct cpuset *b = csa[j];
 646                         int bpn = b->pn;
 647
 648                         if (apn != bpn && cpusets_overlap(a, b)) {
 649                                 for (k = 0; k < csn; k++) {
 650                                         struct cpuset *c = csa[k];
 651
 652                                         if (c->pn == bpn)
 653                                                 c->pn = apn;
 654                                 }
 655                                 ndoms--;        /* one less element */
 656                                 goto restart;
 657                         }
 658                 }
 659         }
 660
 661         /* Convert <csn, csa> to <ndoms, doms> */
 662         doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
 663         if (!doms)
 664                 goto rebuild;
 665
 666         for (nslot = 0, i = 0; i < csn; i++) {
 667                 struct cpuset *a = csa[i];
 668                 int apn = a->pn;
 669
 670                 if (apn >= 0) {
 671                         cpumask_t *dp = doms + nslot;
 672
 673                         if (nslot == ndoms) {
 674                                 static int warnings = 10;
 675                                 if (warnings) {
 676                                         printk(KERN_WARNING
 677                                          "rebuild_sched_domains confused:"
 678                                           " nslot %d, ndoms %d, csn %d, i %d,"
 679                                           " apn %d\n",
 680                                           nslot, ndoms, csn, i, apn);
 681                                         warnings--;
 682                                 }
 683                                 continue;
 684                         }
 685
 686                         cpus_clear(*dp);
 687                         for (j = i; j < csn; j++) {
 688                                 struct cpuset *b = csa[j];
 689
 690                                 if (apn == b->pn) {
 691                                         cpus_or(*dp, *dp, b->cpus_allowed);
 692                                         b->pn = -1;
 693                                 }
 694                         }
 695                         nslot++;
 696                 }
 697         }
 698         BUG_ON(nslot != ndoms);
 699
 700 rebuild:
 701         /* Have scheduler rebuild sched domains */
 702         get_online_cpus();
 703         partition_sched_domains(ndoms, doms);
 704         put_online_cpus();
 705
 706 done:
 707         if (q && !IS_ERR(q))
 708                 kfifo_free(q);
 709         kfree(csa);
 710         /* Don't kfree(doms) -- partition_sched_domains() does that. */
 711 }
 712
 713 static inline int started_after_time(struct task_struct *t1,
 714                                      struct timespec *time,
 715                                      struct task_struct *t2)
 716 {
 717         int start_diff = timespec_compare(&t1->start_time, time);
 718         if (start_diff > 0) {
 719                 return 1;
 720         } else if (start_diff < 0) {
 721                 return 0;
 722         } else {
 723                 /*
 724                  * Arbitrarily, if two processes started at the same
 725                  * time, we'll say that the lower pointer value
 726                  * started first. Note that t2 may have exited by now
 727                  * so this may not be a valid pointer any longer, but
 728                  * that's fine - it still serves to distinguish
 729                  * between two tasks started (effectively)
 730                  * simultaneously.
 731                  */
 732                 return t1 > t2;
 733         }
 734 }
 735
 736 static inline int started_after(void *p1, void *p2)
 737 {
 738         struct task_struct *t1 = p1;
 739         struct task_struct *t2 = p2;
 740         return started_after_time(t1, &t2->start_time, t2);
 741 }
 742
 743 /*
 744  * Call with manage_mutex held.  May take callback_mutex during call.
 745  */
 746
 747 static int update_cpumask(struct cpuset *cs, char *buf)
 748 {
 749         struct cpuset trialcs;
 750         int retval, i;
 751         int is_load_balanced;
 752         struct cgroup_iter it;
 753         struct cgroup *cgrp = cs->css.cgroup;
 754         struct task_struct *p, *dropped;
 755         /* Never dereference latest_task, since it's not refcounted */
 756         struct task_struct *latest_task = NULL;
 757         struct ptr_heap heap;
 758         struct timespec latest_time = { 0, 0 };
 759
 760         /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
 761         if (cs == &top_cpuset)
 762                 return -EACCES;
 763
 764         trialcs = *cs;
 765
 766         /*
 767          * An empty cpus_allowed is ok iff there are no tasks in the cpuset.
 768          * Since cpulist_parse() fails on an empty mask, we special case
 769          * that parsing.  The validate_change() call ensures that cpusets
 770          * with tasks have cpus.
 771          */
 772         buf = strstrip(buf);
 773         if (!*buf) {
 774                 cpus_clear(trialcs.cpus_allowed);
 775         } else {
 776                 retval = cpulist_parse(buf, trialcs.cpus_allowed);
 777                 if (retval < 0)
 778                         return retval;
 779         }
 780         cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
 781         retval = validate_change(cs, &trialcs);
 782         if (retval < 0)
 783                 return retval;
 784
 785         /* Nothing to do if the cpus didn't change */
 786         if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
 787                 return 0;
 788         retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
 789         if (retval)
 790                 return retval;
 791
 792         is_load_balanced = is_sched_load_balance(&trialcs);
 793
 794         mutex_lock(&callback_mutex);
 795         cs->cpus_allowed = trialcs.cpus_allowed;
 796         mutex_unlock(&callback_mutex);
 797
 798  again:
 799         /*
 800          * Scan tasks in the cpuset, and update the cpumasks of any
 801          * that need an update. Since we can't call set_cpus_allowed()
 802          * while holding tasklist_lock, gather tasks to be processed
 803          * in a heap structure. If the statically-sized heap fills up,
 804          * overflow tasks that started later, and in future iterations
 805          * only consider tasks that started after the latest task in
 806          * the previous pass. This guarantees forward progress and
 807          * that we don't miss any tasks
 808          */
 809         heap.size = 0;
 810         cgroup_iter_start(cgrp, &it);
 811         while ((p = cgroup_iter_next(cgrp, &it))) {
 812                 /* Only affect tasks that don't have the right cpus_allowed */
 813                 if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
 814                         continue;
 815                 /*
 816                  * Only process tasks that started after the last task
 817                  * we processed
 818                  */
 819                 if (!started_after_time(p, &latest_time, latest_task))
 820                         continue;
 821                 dropped = heap_insert(&heap, p);
 822                 if (dropped == NULL) {
 823                         get_task_struct(p);
 824                 } else if (dropped != p) {
 825                         get_task_struct(p);
 826                         put_task_struct(dropped);
 827                 }
 828         }
 829         cgroup_iter_end(cgrp, &it);
 830         if (heap.size) {
 831                 for (i = 0; i < heap.size; i++) {
 832                         struct task_struct *p = heap.ptrs[i];
 833                         if (i == 0) {
 834                                 latest_time = p->start_time;
 835                                 latest_task = p;
 836                         }
 837                         set_cpus_allowed(p, cs->cpus_allowed);
 838                         put_task_struct(p);
 839                 }
 840                 /*
 841                  * If we had to process any tasks at all, scan again
 842                  * in case some of them were in the middle of forking
 843                  * children that didn't notice the new cpumask
 844                  * restriction.  Not the most efficient way to do it,
 845                  * but it avoids having to take callback_mutex in the
 846                  * fork path
 847                  */
 848                 goto again;
 849         }
 850         heap_free(&heap);
 851         if (is_load_balanced)
 852                 rebuild_sched_domains();
 853
 854         return 0;
 855 }
 856
 857 /*
 858  * cpuset_migrate_mm
 859  *
 860  *    Migrate memory region from one set of nodes to another.
 861  *
 862  *    Temporarilly set tasks mems_allowed to target nodes of migration,
 863  *    so that the migration code can allocate pages on these nodes.
 864  *
 865  *    Call holding manage_mutex, so our current->cpuset won't change
 866  *    during this call, as manage_mutex holds off any attach_task()
 867  *    calls.  Therefore we don't need to take task_lock around the
 868  *    call to guarantee_online_mems(), as we know no one is changing
 869  *    our tasks cpuset.
 870  *
 871  *    Hold callback_mutex around the two modifications of our tasks
 872  *    mems_allowed to synchronize with cpuset_mems_allowed().
 873  *
 874  *    While the mm_struct we are migrating is typically from some
 875  *    other task, the task_struct mems_allowed that we are hacking
 876  *    is for our current task, which must allocate new pages for that
 877  *    migrating memory region.
 878  *
 879  *    We call cpuset_update_task_memory_state() before hacking
 880  *    our tasks mems_allowed, so that we are assured of being in
 881  *    sync with our tasks cpuset, and in particular, callbacks to
 882  *    cpuset_update_task_memory_state() from nested page allocations
 883  *    won't see any mismatch of our cpuset and task mems_generation
 884  *    values, so won't overwrite our hacked tasks mems_allowed
 885  *    nodemask.
 886  */
 887
 888 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 889                                                         const nodemask_t *to)
 890 {
 891         struct task_struct *tsk = current;
 892
 893         cpuset_update_task_memory_state();
 894
 895         mutex_lock(&callback_mutex);
 896         tsk->mems_allowed = *to;
 897         mutex_unlock(&callback_mutex);
 898
 899         do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 900
 901         mutex_lock(&callback_mutex);
 902         guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
 903         mutex_unlock(&callback_mutex);
 904 }
 905
 906 /*
 907  * Handle user request to change the 'mems' memory placement
 908  * of a cpuset.  Needs to validate the request, update the
 909  * cpusets mems_allowed and mems_generation, and for each
 910  * task in the cpuset, rebind any vma mempolicies and if
 911  * the cpuset is marked 'memory_migrate', migrate the tasks
 912  * pages to the new memory.
 913  *
 914  * Call with manage_mutex held.  May take callback_mutex during call.
 915  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 916  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 917  * their mempolicies to the cpusets new mems_allowed.
 918  */
 919
 920 static void *cpuset_being_rebound;
 921
 922 static int update_nodemask(struct cpuset *cs, char *buf)
 923 {
 924         struct cpuset trialcs;
 925         nodemask_t oldmem;
 926         struct task_struct *p;
 927         struct mm_struct **mmarray;
 928         int i, n, ntasks;
 929         int migrate;
 930         int fudge;
 931         int retval;
 932         struct cgroup_iter it;
 933
 934         /*
 935          * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
 936          * it's read-only
 937          */
 938         if (cs == &top_cpuset)
 939                 return -EACCES;
 940
 941         trialcs = *cs;
 942
 943         /*
 944          * An empty mems_allowed is ok iff there are no tasks in the cpuset.
 945          * Since nodelist_parse() fails on an empty mask, we special case
 946          * that parsing.  The validate_change() call ensures that cpusets
 947          * with tasks have memory.
 948          */
 949         buf = strstrip(buf);
 950         if (!*buf) {
 951                 nodes_clear(trialcs.mems_allowed);
 952         } else {
 953                 retval = nodelist_parse(buf, trialcs.mems_allowed);
 954                 if (retval < 0)
 955                         goto done;
 956         }
 957         nodes_and(trialcs.mems_allowed, trialcs.mems_allowed,
 958                                                 node_states[N_HIGH_MEMORY]);
 959         oldmem = cs->mems_allowed;
 960         if (nodes_equal(oldmem, trialcs.mems_allowed)) {
 961                 retval = 0;             /* Too easy - nothing to do */
 962                 goto done;
 963         }
 964         retval = validate_change(cs, &trialcs);
 965         if (retval < 0)
 966                 goto done;
 967
 968         mutex_lock(&callback_mutex);
 969         cs->mems_allowed = trialcs.mems_allowed;
 970         cs->mems_generation = cpuset_mems_generation++;
 971         mutex_unlock(&callback_mutex);
 972
 973         cpuset_being_rebound = cs;              /* causes mpol_copy() rebind */
 974
 975         fudge = 10;                             /* spare mmarray[] slots */
 976         fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
 977         retval = -ENOMEM;
 978
 979         /*
 980          * Allocate mmarray[] to hold mm reference for each task
 981          * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
 982          * tasklist_lock.  We could use GFP_ATOMIC, but with a
 983          * few more lines of code, we can retry until we get a big
 984          * enough mmarray[] w/o using GFP_ATOMIC.
 985          */
 986         while (1) {
 987                 ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
 988                 ntasks += fudge;
 989                 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
 990                 if (!mmarray)
 991                         goto done;
 992                 read_lock(&tasklist_lock);              /* block fork */
 993                 if (cgroup_task_count(cs->css.cgroup) <= ntasks)
 994                         break;                          /* got enough */
 995                 read_unlock(&tasklist_lock);            /* try again */
 996                 kfree(mmarray);
 997         }
 998
 999         n = 0;
1000
1001         /* Load up mmarray[] with mm reference for each task in cpuset. */
1002         cgroup_iter_start(cs->css.cgroup, &it);
1003         while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
1004                 struct mm_struct *mm;
1005
1006                 if (n >= ntasks) {
1007                         printk(KERN_WARNING
1008                                 "Cpuset mempolicy rebind incomplete.\n");
1009                         break;
1010                 }
1011                 mm = get_task_mm(p);
1012                 if (!mm)
1013                         continue;
1014                 mmarray[n++] = mm;
1015         }
1016         cgroup_iter_end(cs->css.cgroup, &it);
1017         read_unlock(&tasklist_lock);
1018
1019         /*
1020          * Now that we've dropped the tasklist spinlock, we can
1021          * rebind the vma mempolicies of each mm in mmarray[] to their
1022          * new cpuset, and release that mm.  The mpol_rebind_mm()
1023          * call takes mmap_sem, which we couldn't take while holding
1024          * tasklist_lock.  Forks can happen again now - the mpol_copy()
1025          * cpuset_being_rebound check will catch such forks, and rebind
1026          * their vma mempolicies too.  Because we still hold the global
1027          * cpuset manage_mutex, we know that no other rebind effort will
1028          * be contending for the global variable cpuset_being_rebound.
1029          * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1030          * is idempotent.  Also migrate pages in each mm to new nodes.
1031          */
1032         migrate = is_memory_migrate(cs);
1033         for (i = 0; i < n; i++) {
1034                 struct mm_struct *mm = mmarray[i];
1035
1036                 mpol_rebind_mm(mm, &cs->mems_allowed);
1037                 if (migrate)
1038                         cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
1039                 mmput(mm);
1040         }
1041
1042         /* We're done rebinding vma's to this cpusets new mems_allowed. */
1043         kfree(mmarray);
1044         cpuset_being_rebound = NULL;
1045         retval = 0;
1046 done:
1047         return retval;
1048 }
1049
1050 int current_cpuset_is_being_rebound(void)
1051 {
1052         return task_cs(current) == cpuset_being_rebound;
1053 }
1054
1055 /*
1056  * Call with manage_mutex held.
1057  */
1058
1059 static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1060 {
1061         if (simple_strtoul(buf, NULL, 10) != 0)
1062                 cpuset_memory_pressure_enabled = 1;
1063         else
1064                 cpuset_memory_pressure_enabled = 0;
1065         return 0;
1066 }
1067
1068 /*
1069  * update_flag - read a 0 or a 1 in a file and update associated flag
1070  * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
1071  *                              CS_SCHED_LOAD_BALANCE,
1072  *                              CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
1073  *                              CS_SPREAD_PAGE, CS_SPREAD_SLAB)
1074  * cs:  the cpuset to update
1075  * buf: the buffer where we read the 0 or 1
1076  *
1077  * Call with manage_mutex held.
1078  */
1079
1080 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1081 {
1082         int turning_on;
1083         struct cpuset trialcs;
1084         int err;
1085         int cpus_nonempty, balance_flag_changed;
1086
1087         turning_on = (simple_strtoul(buf, NULL, 10) != 0);
1088
1089         trialcs = *cs;
1090         if (turning_on)
1091                 set_bit(bit, &trialcs.flags);
1092         else
1093                 clear_bit(bit, &trialcs.flags);
1094
1095         err = validate_change(cs, &trialcs);
1096         if (err < 0)
1097                 return err;
1098
1099         cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
1100         balance_flag_changed = (is_sched_load_balance(cs) !=
1101                                         is_sched_load_balance(&trialcs));
1102
1103         mutex_lock(&callback_mutex);
1104         cs->flags = trialcs.flags;
1105         mutex_unlock(&callback_mutex);
1106
1107         if (cpus_nonempty && balance_flag_changed)
1108                 rebuild_sched_domains();
1109
1110         return 0;
1111 }
1112
1113 /*
1114  * Frequency meter - How fast is some event occurring?
1115  *
1116  * These routines manage a digitally filtered, constant time based,
1117  * event frequency meter.  There are four routines:
1118  *   fmeter_init() - initialize a frequency meter.
1119  *   fmeter_markevent() - called each time the event happens.
1120  *   fmeter_getrate() - returns the recent rate of such events.
1121  *   fmeter_update() - internal routine used to update fmeter.
1122  *
1123  * A common data structure is passed to each of these routines,
1124  * which is used to keep track of the state required to manage the
1125  * frequency meter and its digital filter.
1126  *
1127  * The filter works on the number of events marked per unit time.
1128  * The filter is single-pole low-pass recursive (IIR).  The time unit
1129  * is 1 second.  Arithmetic is done using 32-bit integers scaled to
1130  * simulate 3 decimal digits of precision (multiplied by 1000).
1131  *
1132  * With an FM_COEF of 933, and a time base of 1 second, the filter
1133  * has a half-life of 10 seconds, meaning that if the events quit
1134  * happening, then the rate returned from the fmeter_getrate()
1135  * will be cut in half each 10 seconds, until it converges to zero.
1136  *
1137  * It is not worth doing a real infinitely recursive filter.  If more
1138  * than FM_MAXTICKS ticks have elapsed since the last filter event,
1139  * just compute FM_MAXTICKS ticks worth, by which point the level
1140  * will be stable.
1141  *
1142  * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
1143  * arithmetic overflow in the fmeter_update() routine.
1144  *
1145  * Given the simple 32 bit integer arithmetic used, this meter works
1146  * best for reporting rates between one per millisecond (msec) and
1147  * one per 32 (approx) seconds.  At constant rates faster than one
1148  * per msec it maxes out at values just under 1,000,000.  At constant
1149  * rates between one per msec, and one per second it will stabilize
1150  * to a value N*1000, where N is the rate of events per second.
1151  * At constant rates between one per second and one per 32 seconds,
1152  * it will be choppy, moving up on the seconds that have an event,
1153  * and then decaying until the next event.  At rates slower than
1154  * about one in 32 seconds, it decays all the way back to zero between
1155  * each event.
1156  */
1157
1158 #define FM_COEF 933             /* coefficient for half-life of 10 secs */
1159 #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
1160 #define FM_MAXCNT 1000000       /* limit cnt to avoid overflow */
1161 #define FM_SCALE 1000           /* faux fixed point scale */
1162
1163 /* Initialize a frequency meter */
1164 static void fmeter_init(struct fmeter *fmp)
1165 {
1166         fmp->cnt = 0;
1167         fmp->val = 0;
1168         fmp->time = 0;
1169         spin_lock_init(&fmp->lock);
1170 }
1171
1172 /* Internal meter update - process cnt events and update value */
1173 static void fmeter_update(struct fmeter *fmp)
1174 {
1175         time_t now = get_seconds();
1176         time_t ticks = now - fmp->time;
1177
1178         if (ticks == 0)
1179                 return;
1180
1181         ticks = min(FM_MAXTICKS, ticks);
1182         while (ticks-- > 0)
1183                 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1184         fmp->time = now;
1185
1186         fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1187         fmp->cnt = 0;
1188 }
1189
1190 /* Process any previous ticks, then bump cnt by one (times scale). */
1191 static void fmeter_markevent(struct fmeter *fmp)
1192 {
1193         spin_lock(&fmp->lock);
1194         fmeter_update(fmp);
1195         fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1196         spin_unlock(&fmp->lock);
1197 }
1198
1199 /* Process any previous ticks, then return current value. */
1200 static int fmeter_getrate(struct fmeter *fmp)
1201 {
1202         int val;
1203
1204         spin_lock(&fmp->lock);
1205         fmeter_update(fmp);
1206         val = fmp->val;
1207         spin_unlock(&fmp->lock);
1208         return val;
1209 }
1210
1211 static int cpuset_can_attach(struct cgroup_subsys *ss,
1212                              struct cgroup *cont, struct task_struct *tsk)
1213 {
1214         struct cpuset *cs = cgroup_cs(cont);
1215
1216         if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1217                 return -ENOSPC;
1218
1219         return security_task_setscheduler(tsk, 0, NULL);
1220 }
1221
1222 static void cpuset_attach(struct cgroup_subsys *ss,
1223                           struct cgroup *cont, struct cgroup *oldcont,
1224                           struct task_struct *tsk)
1225 {
1226         cpumask_t cpus;
1227         nodemask_t from, to;
1228         struct mm_struct *mm;
1229         struct cpuset *cs = cgroup_cs(cont);
1230         struct cpuset *oldcs = cgroup_cs(oldcont);
1231
1232         mutex_lock(&callback_mutex);
1233         guarantee_online_cpus(cs, &cpus);
1234         set_cpus_allowed(tsk, cpus);
1235         mutex_unlock(&callback_mutex);
1236
1237         from = oldcs->mems_allowed;
1238         to = cs->mems_allowed;
1239         mm = get_task_mm(tsk);
1240         if (mm) {
1241                 mpol_rebind_mm(mm, &to);
1242                 if (is_memory_migrate(cs))
1243                         cpuset_migrate_mm(mm, &from, &to);
1244                 mmput(mm);
1245         }
1246
1247 }
1248
1249 /* The various types of files and directories in a cpuset file system */
1250
1251 typedef enum {
1252         FILE_MEMORY_MIGRATE,
1253         FILE_CPULIST,
1254         FILE_MEMLIST,
1255         FILE_CPU_EXCLUSIVE,
1256         FILE_MEM_EXCLUSIVE,
1257         FILE_SCHED_LOAD_BALANCE,
1258         FILE_MEMORY_PRESSURE_ENABLED,
1259         FILE_MEMORY_PRESSURE,
1260         FILE_SPREAD_PAGE,
1261         FILE_SPREAD_SLAB,
1262 } cpuset_filetype_t;
1263
1264 static ssize_t cpuset_common_file_write(struct cgroup *cont,
1265                                         struct cftype *cft,
1266                                         struct file *file,
1267                                         const char __user *userbuf,
1268                                         size_t nbytes, loff_t *unused_ppos)
1269 {
1270         struct cpuset *cs = cgroup_cs(cont);
1271         cpuset_filetype_t type = cft->private;
1272         char *buffer;
1273         int retval = 0;
1274
1275         /* Crude upper limit on largest legitimate cpulist user might write. */
1276         if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
1277                 return -E2BIG;
1278
1279         /* +1 for nul-terminator */
1280         if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
1281                 return -ENOMEM;
1282
1283         if (copy_from_user(buffer, userbuf, nbytes)) {
1284                 retval = -EFAULT;
1285                 goto out1;
1286         }
1287         buffer[nbytes] = 0;     /* nul-terminate */
1288
1289         cgroup_lock();
1290
1291         if (cgroup_is_removed(cont)) {
1292                 retval = -ENODEV;
1293                 goto out2;
1294         }
1295
1296         switch (type) {
1297         case FILE_CPULIST:
1298                 retval = update_cpumask(cs, buffer);
1299                 break;
1300         case FILE_MEMLIST:
1301                 retval = update_nodemask(cs, buffer);
1302                 break;
1303         case FILE_CPU_EXCLUSIVE:
1304                 retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
1305                 break;
1306         case FILE_MEM_EXCLUSIVE:
1307                 retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
1308                 break;
1309         case FILE_SCHED_LOAD_BALANCE:
1310                 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
1311                 break;
1312         case FILE_MEMORY_MIGRATE:
1313                 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
1314                 break;
1315         case FILE_MEMORY_PRESSURE_ENABLED:
1316                 retval = update_memory_pressure_enabled(cs, buffer);
1317                 break;
1318         case FILE_MEMORY_PRESSURE:
1319                 retval = -EACCES;
1320                 break;
1321         case FILE_SPREAD_PAGE:
1322                 retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
1323                 cs->mems_generation = cpuset_mems_generation++;
1324                 break;
1325         case FILE_SPREAD_SLAB:
1326                 retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
1327                 cs->mems_generation = cpuset_mems_generation++;
1328                 break;
1329         default:
1330                 retval = -EINVAL;
1331                 goto out2;
1332         }
1333
1334         if (retval == 0)
1335                 retval = nbytes;
1336 out2:
1337         cgroup_unlock();
1338 out1:
1339         kfree(buffer);
1340         return retval;
1341 }
1342
1343 /*
1344  * These ascii lists should be read in a single call, by using a user
1345  * buffer large enough to hold the entire map.  If read in smaller
1346  * chunks, there is no guarantee of atomicity.  Since the display format
1347  * used, list of ranges of sequential numbers, is variable length,
1348  * and since these maps can change value dynamically, one could read
1349  * gibberish by doing partial reads while a list was changing.
1350  * A single large read to a buffer that crosses a page boundary is
1351  * ok, because the result being copied to user land is not recomputed
1352  * across a page fault.
1353  */
1354
1355 static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1356 {
1357         cpumask_t mask;
1358
1359         mutex_lock(&callback_mutex);
1360         mask = cs->cpus_allowed;
1361         mutex_unlock(&callback_mutex);
1362
1363         return cpulist_scnprintf(page, PAGE_SIZE, mask);
1364 }
1365
1366 static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1367 {
1368         nodemask_t mask;
1369
1370         mutex_lock(&callback_mutex);
1371         mask = cs->mems_allowed;
1372         mutex_unlock(&callback_mutex);
1373
1374         return nodelist_scnprintf(page, PAGE_SIZE, mask);
1375 }
1376
1377 static ssize_t cpuset_common_file_read(struct cgroup *cont,
1378                                        struct cftype *cft,
1379                                        struct file *file,
1380                                        char __user *buf,
1381                                        size_t nbytes, loff_t *ppos)
1382 {
1383         struct cpuset *cs = cgroup_cs(cont);
1384         cpuset_filetype_t type = cft->private;
1385         char *page;
1386         ssize_t retval = 0;
1387         char *s;
1388
1389         if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1390                 return -ENOMEM;
1391
1392         s = page;
1393
1394         switch (type) {
1395         case FILE_CPULIST:
1396                 s += cpuset_sprintf_cpulist(s, cs);
1397                 break;
1398         case FILE_MEMLIST:
1399                 s += cpuset_sprintf_memlist(s, cs);
1400                 break;
1401         case FILE_CPU_EXCLUSIVE:
1402                 *s++ = is_cpu_exclusive(cs) ? '1' : '0';
1403                 break;
1404         case FILE_MEM_EXCLUSIVE:
1405                 *s++ = is_mem_exclusive(cs) ? '1' : '0';
1406                 break;
1407         case FILE_SCHED_LOAD_BALANCE:
1408                 *s++ = is_sched_load_balance(cs) ? '1' : '0';
1409                 break;
1410         case FILE_MEMORY_MIGRATE:
1411                 *s++ = is_memory_migrate(cs) ? '1' : '0';
1412                 break;
1413         case FILE_MEMORY_PRESSURE_ENABLED:
1414                 *s++ = cpuset_memory_pressure_enabled ? '1' : '0';
1415                 break;
1416         case FILE_MEMORY_PRESSURE:
1417                 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1418                 break;
1419         case FILE_SPREAD_PAGE:
1420                 *s++ = is_spread_page(cs) ? '1' : '0';
1421                 break;
1422         case FILE_SPREAD_SLAB:
1423                 *s++ = is_spread_slab(cs) ? '1' : '0';
1424                 break;
1425         default:
1426                 retval = -EINVAL;
1427                 goto out;
1428         }
1429         *s++ = '\n';
1430
1431         retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1432 out:
1433         free_page((unsigned long)page);
1434         return retval;
1435 }
1436
1437
1438
1439
1440
1441 /*
1442  * for the common functions, 'private' gives the type of file
1443  */
1444
1445 static struct cftype cft_cpus = {
1446         .name = "cpus",
1447         .read = cpuset_common_file_read,
1448         .write = cpuset_common_file_write,
1449         .private = FILE_CPULIST,
1450 };
1451
1452 static struct cftype cft_mems = {
1453         .name = "mems",
1454         .read = cpuset_common_file_read,
1455         .write = cpuset_common_file_write,
1456         .private = FILE_MEMLIST,
1457 };
1458
1459 static struct cftype cft_cpu_exclusive = {
1460         .name = "cpu_exclusive",
1461         .read = cpuset_common_file_read,
1462         .write = cpuset_common_file_write,
1463         .private = FILE_CPU_EXCLUSIVE,
1464 };
1465
1466 static struct cftype cft_mem_exclusive = {
1467         .name = "mem_exclusive",
1468         .read = cpuset_common_file_read,
1469         .write = cpuset_common_file_write,
1470         .private = FILE_MEM_EXCLUSIVE,
1471 };
1472
1473 static struct cftype cft_sched_load_balance = {
1474         .name = "sched_load_balance",
1475         .read = cpuset_common_file_read,
1476         .write = cpuset_common_file_write,
1477         .private = FILE_SCHED_LOAD_BALANCE,
1478 };
1479
1480 static struct cftype cft_memory_migrate = {
1481         .name = "memory_migrate",
1482         .read = cpuset_common_file_read,
1483         .write = cpuset_common_file_write,
1484         .private = FILE_MEMORY_MIGRATE,
1485 };
1486
1487 static struct cftype cft_memory_pressure_enabled = {
1488         .name = "memory_pressure_enabled",
1489         .read = cpuset_common_file_read,
1490         .write = cpuset_common_file_write,
1491         .private = FILE_MEMORY_PRESSURE_ENABLED,
1492 };
1493
1494 static struct cftype cft_memory_pressure = {
1495         .name = "memory_pressure",
1496         .read = cpuset_common_file_read,
1497         .write = cpuset_common_file_write,
1498         .private = FILE_MEMORY_PRESSURE,
1499 };
1500
1501 static struct cftype cft_spread_page = {
1502         .name = "memory_spread_page",
1503         .read = cpuset_common_file_read,
1504         .write = cpuset_common_file_write,
1505         .private = FILE_SPREAD_PAGE,
1506 };
1507
1508 static struct cftype cft_spread_slab = {
1509         .name = "memory_spread_slab",
1510         .read = cpuset_common_file_read,
1511         .write = cpuset_common_file_write,
1512         .private = FILE_SPREAD_SLAB,
1513 };
1514
1515 static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1516 {
1517         int err;
1518
1519         if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0)
1520                 return err;
1521         if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0)
1522                 return err;
1523         if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0)
1524                 return err;
1525         if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0)
1526                 return err;
1527         if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
1528                 return err;
1529         if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
1530                 return err;
1531         if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
1532                 return err;
1533         if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
1534                 return err;
1535         if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
1536                 return err;
1537         /* memory_pressure_enabled is in root cpuset only */
1538         if (err == 0 && !cont->parent)
1539                 err = cgroup_add_file(cont, ss,
1540                                          &cft_memory_pressure_enabled);
1541         return 0;
1542 }
1543
1544 /*
1545  * post_clone() is called at the end of cgroup_clone().
1546  * 'cgroup' was just created automatically as a result of
1547  * a cgroup_clone(), and the current task is about to
1548  * be moved into 'cgroup'.
1549  *
1550  * Currently we refuse to set up the cgroup - thereby
1551  * refusing the task to be entered, and as a result refusing
1552  * the sys_unshare() or clone() which initiated it - if any
1553  * sibling cpusets have exclusive cpus or mem.
1554  *
1555  * If this becomes a problem for some users who wish to
1556  * allow that scenario, then cpuset_post_clone() could be
1557  * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1558  * (and likewise for mems) to the new cgroup.
1559  */
1560 static void cpuset_post_clone(struct cgroup_subsys *ss,
1561                               struct cgroup *cgroup)
1562 {
1563         struct cgroup *parent, *child;
1564         struct cpuset *cs, *parent_cs;
1565
1566         parent = cgroup->parent;
1567         list_for_each_entry(child, &parent->children, sibling) {
1568                 cs = cgroup_cs(child);
1569                 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1570                         return;
1571         }
1572         cs = cgroup_cs(cgroup);
1573         parent_cs = cgroup_cs(parent);
1574
1575         cs->mems_allowed = parent_cs->mems_allowed;
1576         cs->cpus_allowed = parent_cs->cpus_allowed;
1577         return;
1578 }
1579
1580 /*
1581  *      cpuset_create - create a cpuset
1582  *      parent: cpuset that will be parent of the new cpuset.
1583  *      name:           name of the new cpuset. Will be strcpy'ed.
1584  *      mode:           mode to set on new inode
1585  *
1586  *      Must be called with the mutex on the parent inode held
1587  */
1588
1589 static struct cgroup_subsys_state *cpuset_create(
1590         struct cgroup_subsys *ss,
1591         struct cgroup *cont)
1592 {
1593         struct cpuset *cs;
1594         struct cpuset *parent;
1595
1596         if (!cont->parent) {
1597                 /* This is early initialization for the top cgroup */
1598                 top_cpuset.mems_generation = cpuset_mems_generation++;
1599                 return &top_cpuset.css;
1600         }
1601         parent = cgroup_cs(cont->parent);
1602         cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1603         if (!cs)
1604                 return ERR_PTR(-ENOMEM);
1605
1606         cpuset_update_task_memory_state();
1607         cs->flags = 0;
1608         if (is_spread_page(parent))
1609                 set_bit(CS_SPREAD_PAGE, &cs->flags);
1610         if (is_spread_slab(parent))
1611                 set_bit(CS_SPREAD_SLAB, &cs->flags);
1612         set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1613         cs->cpus_allowed = CPU_MASK_NONE;
1614         cs->mems_allowed = NODE_MASK_NONE;
1615         cs->mems_generation = cpuset_mems_generation++;
1616         fmeter_init(&cs->fmeter);
1617
1618         cs->parent = parent;
1619         number_of_cpusets++;
1620         return &cs->css ;
1621 }
1622
1623 /*
1624  * Locking note on the strange update_flag() call below:
1625  *
1626  * If the cpuset being removed has its flag 'sched_load_balance'
1627  * enabled, then simulate turning sched_load_balance off, which
1628  * will call rebuild_sched_domains().  The get_online_cpus()
1629  * call in rebuild_sched_domains() must not be made while holding
1630  * callback_mutex.  Elsewhere the kernel nests callback_mutex inside
1631  * get_online_cpus() calls.  So the reverse nesting would risk an
1632  * ABBA deadlock.
1633  */
1634
1635 static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1636 {
1637         struct cpuset *cs = cgroup_cs(cont);
1638
1639         cpuset_update_task_memory_state();
1640
1641         if (is_sched_load_balance(cs))
1642                 update_flag(CS_SCHED_LOAD_BALANCE, cs, "0");
1643
1644         number_of_cpusets--;
1645         kfree(cs);
1646 }
1647
1648 struct cgroup_subsys cpuset_subsys = {
1649         .name = "cpuset",
1650         .create = cpuset_create,
1651         .destroy  = cpuset_destroy,
1652         .can_attach = cpuset_can_attach,
1653         .attach = cpuset_attach,
1654         .populate = cpuset_populate,
1655         .post_clone = cpuset_post_clone,
1656         .subsys_id = cpuset_subsys_id,
1657         .early_init = 1,
1658 };
1659
1660 /*
1661  * cpuset_init_early - just enough so that the calls to
1662  * cpuset_update_task_memory_state() in early init code
1663  * are harmless.
1664  */
1665
1666 int __init cpuset_init_early(void)
1667 {
1668         top_cpuset.mems_generation = cpuset_mems_generation++;
1669         return 0;
1670 }
1671
1672
1673 /**
1674  * cpuset_init - initialize cpusets at system boot
1675  *
1676  * Description: Initialize top_cpuset and the cpuset internal file system,
1677  **/
1678
1679 int __init cpuset_init(void)
1680 {
1681         int err = 0;
1682
1683         top_cpuset.cpus_allowed = CPU_MASK_ALL;
1684         top_cpuset.mems_allowed = NODE_MASK_ALL;
1685
1686         fmeter_init(&top_cpuset.fmeter);
1687         top_cpuset.mems_generation = cpuset_mems_generation++;
1688         set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1689
1690         err = register_filesystem(&cpuset_fs_type);
1691         if (err < 0)
1692                 return err;
1693
1694         number_of_cpusets = 1;
1695         return 0;
1696 }
1697
1698 /**
1699  * cpuset_do_move_task - move a given task to another cpuset
1700  * @tsk: pointer to task_struct the task to move
1701  * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
1702  *
1703  * Called by cgroup_scan_tasks() for each task in a cgroup.
1704  * Return nonzero to stop the walk through the tasks.
1705  */
1706 void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
1707 {
1708         struct cpuset_hotplug_scanner *chsp;
1709
1710         chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
1711         cgroup_attach_task(chsp->to, tsk);
1712 }
1713
1714 /**
1715  * move_member_tasks_to_cpuset - move tasks from one cpuset to another
1716  * @from: cpuset in which the tasks currently reside
1717  * @to: cpuset to which the tasks will be moved
1718  *
1719  * Called with manage_sem held
1720  * callback_mutex must not be held, as attach_task() will take it.
1721  *
1722  * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1723  * calling callback functions for each.
1724  */
1725 static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1726 {
1727         struct cpuset_hotplug_scanner scan;
1728
1729         scan.scan.cg = from->css.cgroup;
1730         scan.scan.test_task = NULL; /* select all tasks in cgroup */
1731         scan.scan.process_task = cpuset_do_move_task;
1732         scan.scan.heap = NULL;
1733         scan.to = to->css.cgroup;
1734
1735         if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
1736                 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1737                                 "cgroup_scan_tasks failed\n");
1738 }
1739
1740 /*
1741  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
1742  * or memory nodes, we need to walk over the cpuset hierarchy,
1743  * removing that CPU or node from all cpusets.  If this removes the
1744  * last CPU or node from a cpuset, then move the tasks in the empty
1745  * cpuset to its next-highest non-empty parent.
1746  *
1747  * The parent cpuset has some superset of the 'mems' nodes that the
1748  * newly empty cpuset held, so no migration of memory is necessary.
1749  *
1750  * Called with both manage_sem and callback_sem held
1751  */
1752 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1753 {
1754         struct cpuset *parent;
1755
1756         /* the cgroup's css_sets list is in use if there are tasks
1757            in the cpuset; the list is empty if there are none;
1758            the cs->css.refcnt seems always 0 */
1759         if (list_empty(&cs->css.cgroup->css_sets))
1760                 return;
1761
1762         /*
1763          * Find its next-highest non-empty parent, (top cpuset
1764          * has online cpus, so can't be empty).
1765          */
1766         parent = cs->parent;
1767         while (cpus_empty(parent->cpus_allowed)) {
1768                 /*
1769                  * this empty cpuset should now be considered to
1770                  * have been used, and therefore eligible for
1771                  * release when empty (if it is notify_on_release)
1772                  */
1773                 parent = parent->parent;
1774         }
1775
1776         move_member_tasks_to_cpuset(cs, parent);
1777 }
1778
1779 /*
1780  * Walk the specified cpuset subtree and look for empty cpusets.
1781  * The tasks of such cpuset must be moved to a parent cpuset.
1782  *
1783  * Note that such a notify_on_release cpuset must have had, at some time,
1784  * member tasks or cpuset descendants and cpus and memory, before it can
1785  * be a candidate for release.
1786  *
1787  * Called with manage_mutex held.  We take callback_mutex to modify
1788  * cpus_allowed and mems_allowed.
1789  *
1790  * This walk processes the tree from top to bottom, completing one layer
1791  * before dropping down to the next.  It always processes a node before
1792  * any of its children.
1793  *
1794  * For now, since we lack memory hot unplug, we'll never see a cpuset
1795  * that has tasks along with an empty 'mems'.  But if we did see such
1796  * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1797  */
1798 static void scan_for_empty_cpusets(const struct cpuset *root)
1799 {
1800         struct cpuset *cp;      /* scans cpusets being updated */
1801         struct cpuset *child;   /* scans child cpusets of cp */
1802         struct list_head queue;
1803         struct cgroup *cont;
1804
1805         INIT_LIST_HEAD(&queue);
1806
1807         list_add_tail((struct list_head *)&root->stack_list, &queue);
1808
1809         mutex_lock(&callback_mutex);
1810         while (!list_empty(&queue)) {
1811                 cp = container_of(queue.next, struct cpuset, stack_list);
1812                 list_del(queue.next);
1813                 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
1814                         child = cgroup_cs(cont);
1815                         list_add_tail(&child->stack_list, &queue);
1816                 }
1817                 cont = cp->css.cgroup;
1818                 /* Remove offline cpus and mems from this cpuset. */
1819                 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
1820                 nodes_and(cp->mems_allowed, cp->mems_allowed,
1821                                                 node_states[N_HIGH_MEMORY]);
1822                 if ((cpus_empty(cp->cpus_allowed) ||
1823                      nodes_empty(cp->mems_allowed))) {
1824                         /* Move tasks from the empty cpuset to a parent */
1825                         mutex_unlock(&callback_mutex);
1826                         remove_tasks_in_empty_cpuset(cp);
1827                         mutex_lock(&callback_mutex);
1828                 }
1829         }
1830         mutex_unlock(&callback_mutex);
1831         return;
1832 }
1833
1834 /*
1835  * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1836  * cpu_online_map and node_states[N_HIGH_MEMORY].  Force the top cpuset to
1837  * track what's online after any CPU or memory node hotplug or unplug event.
1838  *
1839  * Since there are two callers of this routine, one for CPU hotplug
1840  * events and one for memory node hotplug events, we could have coded
1841  * two separate routines here.  We code it as a single common routine
1842  * in order to minimize text size.
1843  */
1844
1845 static void common_cpu_mem_hotplug_unplug(void)
1846 {
1847         cgroup_lock();
1848
1849         top_cpuset.cpus_allowed = cpu_online_map;
1850         top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1851         scan_for_empty_cpusets(&top_cpuset);
1852
1853         cgroup_unlock();
1854 }
1855
1856 /*
1857  * The top_cpuset tracks what CPUs and Memory Nodes are online,
1858  * period.  This is necessary in order to make cpusets transparent
1859  * (of no affect) on systems that are actively using CPU hotplug
1860  * but making no active use of cpusets.
1861  *
1862  * This routine ensures that top_cpuset.cpus_allowed tracks
1863  * cpu_online_map on each CPU hotplug (cpuhp) event.
1864  */
1865
1866 static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
1867                                 unsigned long phase, void *unused_cpu)
1868 {
1869         if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
1870                 return NOTIFY_DONE;
1871
1872         common_cpu_mem_hotplug_unplug();
1873         return 0;
1874 }
1875
1876 #ifdef CONFIG_MEMORY_HOTPLUG
1877 /*
1878  * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
1879  * Call this routine anytime after you change
1880  * node_states[N_HIGH_MEMORY].
1881  * See also the previous routine cpuset_handle_cpuhp().
1882  */
1883
1884 void cpuset_track_online_nodes(void)
1885 {
1886         common_cpu_mem_hotplug_unplug();
1887 }
1888 #endif
1889
1890 /**
1891  * cpuset_init_smp - initialize cpus_allowed
1892  *
1893  * Description: Finish top cpuset after cpu, node maps are initialized
1894  **/
1895
1896 void __init cpuset_init_smp(void)
1897 {
1898         top_cpuset.cpus_allowed = cpu_online_map;
1899         top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1900
1901         hotcpu_notifier(cpuset_handle_cpuhp, 0);
1902 }
1903
1904 /**
1905
1906  * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
1907  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
1908  *
1909  * Description: Returns the cpumask_t cpus_allowed of the cpuset
1910  * attached to the specified @tsk.  Guaranteed to return some non-empty
1911  * subset of cpu_online_map, even if this means going outside the
1912  * tasks cpuset.
1913  **/
1914
1915 cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
1916 {
1917         cpumask_t mask;
1918
1919         mutex_lock(&callback_mutex);
1920         mask = cpuset_cpus_allowed_locked(tsk);
1921         mutex_unlock(&callback_mutex);
1922
1923         return mask;
1924 }
1925
1926 /**
1927  * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
1928  * Must be  called with callback_mutex held.
1929  **/
1930 cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk)
1931 {
1932         cpumask_t mask;
1933
1934         task_lock(tsk);
1935         guarantee_online_cpus(task_cs(tsk), &mask);
1936         task_unlock(tsk);
1937
1938         return mask;
1939 }
1940
1941 void cpuset_init_current_mems_allowed(void)
1942 {
1943         current->mems_allowed = NODE_MASK_ALL;
1944 }
1945
1946 /**
1947  * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
1948  * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
1949  *
1950  * Description: Returns the nodemask_t mems_allowed of the cpuset
1951  * attached to the specified @tsk.  Guaranteed to return some non-empty
1952  * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
1953  * tasks cpuset.
1954  **/
1955
1956 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
1957 {
1958         nodemask_t mask;
1959
1960         mutex_lock(&callback_mutex);
1961         task_lock(tsk);
1962         guarantee_online_mems(task_cs(tsk), &mask);
1963         task_unlock(tsk);
1964         mutex_unlock(&callback_mutex);
1965
1966         return mask;
1967 }
1968
1969 /**
1970  * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
1971  * @zl: the zonelist to be checked
1972  *
1973  * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
1974  */
1975 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1976 {
1977         int i;
1978
1979         for (i = 0; zl->zones[i]; i++) {
1980                 int nid = zone_to_nid(zl->zones[i]);
1981
1982                 if (node_isset(nid, current->mems_allowed))
1983                         return 1;
1984         }
1985         return 0;
1986 }
1987
1988 /*
1989  * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
1990  * ancestor to the specified cpuset.  Call holding callback_mutex.
1991  * If no ancestor is mem_exclusive (an unusual configuration), then
1992  * returns the root cpuset.
1993  */
1994 static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1995 {
1996         while (!is_mem_exclusive(cs) && cs->parent)
1997                 cs = cs->parent;
1998         return cs;
1999 }
2000
2001 /**
2002  * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
2003  * @z: is this zone on an allowed node?
2004  * @gfp_mask: memory allocation flags
2005  *
2006  * If we're in interrupt, yes, we can always allocate.  If
2007  * __GFP_THISNODE is set, yes, we can always allocate.  If zone
2008  * z's node is in our tasks mems_allowed, yes.  If it's not a
2009  * __GFP_HARDWALL request and this zone's nodes is in the nearest
2010  * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
2011  * If the task has been OOM killed and has access to memory reserves
2012  * as specified by the TIF_MEMDIE flag, yes.
2013  * Otherwise, no.
2014  *
2015  * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
2016  * reduces to cpuset_zone_allowed_hardwall().  Otherwise,
2017  * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
2018  * from an enclosing cpuset.
2019  *
2020  * cpuset_zone_allowed_hardwall() only handles the simpler case of
2021  * hardwall cpusets, and never sleeps.
2022  *
2023  * The __GFP_THISNODE placement logic is really handled elsewhere,
2024  * by forcibly using a zonelist starting at a specified node, and by
2025  * (in get_page_from_freelist()) refusing to consider the zones for
2026  * any node on the zonelist except the first.  By the time any such
2027  * calls get to this routine, we should just shut up and say 'yes'.
2028  *
2029  * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
2030  * and do not allow allocations outside the current tasks cpuset
2031  * unless the task has been OOM killed as is marked TIF_MEMDIE.
2032  * GFP_KERNEL allocations are not so marked, so can escape to the
2033  * nearest enclosing mem_exclusive ancestor cpuset.
2034  *
2035  * Scanning up parent cpusets requires callback_mutex.  The
2036  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
2037  * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
2038  * current tasks mems_allowed came up empty on the first pass over
2039  * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
2040  * cpuset are short of memory, might require taking the callback_mutex
2041  * mutex.
2042  *
2043  * The first call here from mm/page_alloc:get_page_from_freelist()
2044  * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
2045  * so no allocation on a node outside the cpuset is allowed (unless
2046  * in interrupt, of course).
2047  *
2048  * The second pass through get_page_from_freelist() doesn't even call
2049  * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
2050  * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
2051  * in alloc_flags.  That logic and the checks below have the combined
2052  * affect that:
2053  *      in_interrupt - any node ok (current task context irrelevant)
2054  *      GFP_ATOMIC   - any node ok
2055  *      TIF_MEMDIE   - any node ok
2056  *      GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
2057  *      GFP_USER     - only nodes in current tasks mems allowed ok.
2058  *
2059  * Rule:
2060  *    Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
2061  *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2062  *    the code that might scan up ancestor cpusets and sleep.
2063  */
2064
2065 int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2066 {
2067         int node;                       /* node that zone z is on */
2068         const struct cpuset *cs;        /* current cpuset ancestors */
2069         int allowed;                    /* is allocation in zone z allowed? */
2070
2071         if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2072                 return 1;
2073         node = zone_to_nid(z);
2074         might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2075         if (node_isset(node, current->mems_allowed))
2076                 return 1;
2077         /*
2078          * Allow tasks that have access to memory reserves because they have
2079          * been OOM killed to get memory anywhere.
2080          */
2081         if (unlikely(test_thread_flag(TIF_MEMDIE)))
2082                 return 1;
2083         if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
2084                 return 0;
2085
2086         if (current->flags & PF_EXITING) /* Let dying task have memory */
2087                 return 1;
2088
2089         /* Not hardwall and node outside mems_allowed: scan up cpusets */
2090         mutex_lock(&callback_mutex);
2091
2092         task_lock(current);
2093         cs = nearest_exclusive_ancestor(task_cs(current));
2094         task_unlock(current);
2095
2096         allowed = node_isset(node, cs->mems_allowed);
2097         mutex_unlock(&callback_mutex);
2098         return allowed;
2099 }
2100
2101 /*
2102  * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
2103  * @z: is this zone on an allowed node?
2104  * @gfp_mask: memory allocation flags
2105  *
2106  * If we're in interrupt, yes, we can always allocate.
2107  * If __GFP_THISNODE is set, yes, we can always allocate.  If zone
2108  * z's node is in our tasks mems_allowed, yes.   If the task has been
2109  * OOM killed and has access to memory reserves as specified by the
2110  * TIF_MEMDIE flag, yes.  Otherwise, no.
2111  *
2112  * The __GFP_THISNODE placement logic is really handled elsewhere,
2113  * by forcibly using a zonelist starting at a specified node, and by
2114  * (in get_page_from_freelist()) refusing to consider the zones for
2115  * any node on the zonelist except the first.  By the time any such
2116  * calls get to this routine, we should just shut up and say 'yes'.
2117  *
2118  * Unlike the cpuset_zone_allowed_softwall() variant, above,
2119  * this variant requires that the zone be in the current tasks
2120  * mems_allowed or that we're in interrupt.  It does not scan up the
2121  * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2122  * It never sleeps.
2123  */
2124
2125 int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
2126 {
2127         int node;                       /* node that zone z is on */
2128
2129         if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2130                 return 1;
2131         node = zone_to_nid(z);
2132         if (node_isset(node, current->mems_allowed))
2133                 return 1;
2134         /*
2135          * Allow tasks that have access to memory reserves because they have
2136          * been OOM killed to get memory anywhere.
2137          */
2138         if (unlikely(test_thread_flag(TIF_MEMDIE)))
2139                 return 1;
2140         return 0;
2141 }
2142
2143 /**
2144  * cpuset_lock - lock out any changes to cpuset structures
2145  *
2146  * The out of memory (oom) code needs to mutex_lock cpusets
2147  * from being changed while it scans the tasklist looking for a
2148  * task in an overlapping cpuset.  Expose callback_mutex via this
2149  * cpuset_lock() routine, so the oom code can lock it, before
2150  * locking the task list.  The tasklist_lock is a spinlock, so
2151  * must be taken inside callback_mutex.
2152  */
2153
2154 void cpuset_lock(void)
2155 {
2156         mutex_lock(&callback_mutex);
2157 }
2158
2159 /**
2160  * cpuset_unlock - release lock on cpuset changes
2161  *
2162  * Undo the lock taken in a previous cpuset_lock() call.
2163  */
2164
2165 void cpuset_unlock(void)
2166 {
2167         mutex_unlock(&callback_mutex);
2168 }
2169
2170 /**
2171  * cpuset_mem_spread_node() - On which node to begin search for a page
2172  *
2173  * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2174  * tasks in a cpuset with is_spread_page or is_spread_slab set),
2175  * and if the memory allocation used cpuset_mem_spread_node()
2176  * to determine on which node to start looking, as it will for
2177  * certain page cache or slab cache pages such as used for file
2178  * system buffers and inode caches, then instead of starting on the
2179  * local node to look for a free page, rather spread the starting
2180  * node around the tasks mems_allowed nodes.
2181  *
2182  * We don't have to worry about the returned node being offline
2183  * because "it can't happen", and even if it did, it would be ok.
2184  *
2185  * The routines calling guarantee_online_mems() are careful to
2186  * only set nodes in task->mems_allowed that are online.  So it
2187  * should not be possible for the following code to return an
2188  * offline node.  But if it did, that would be ok, as this routine
2189  * is not returning the node where the allocation must be, only
2190  * the node where the search should start.  The zonelist passed to
2191  * __alloc_pages() will include all nodes.  If the slab allocator
2192  * is passed an offline node, it will fall back to the local node.
2193  * See kmem_cache_alloc_node().
2194  */
2195
2196 int cpuset_mem_spread_node(void)
2197 {
2198         int node;
2199
2200         node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
2201         if (node == MAX_NUMNODES)
2202                 node = first_node(current->mems_allowed);
2203         current->cpuset_mem_spread_rotor = node;
2204         return node;
2205 }
2206 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2207
2208 /**
2209  * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
2210  * @tsk1: pointer to task_struct of some task.
2211  * @tsk2: pointer to task_struct of some other task.
2212  *
2213  * Description: Return true if @tsk1's mems_allowed intersects the
2214  * mems_allowed of @tsk2.  Used by the OOM killer to determine if
2215  * one of the task's memory usage might impact the memory available
2216  * to the other.
2217  **/
2218
2219 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2220                                    const struct task_struct *tsk2)
2221 {
2222         return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2223 }
2224
2225 /*
2226  * Collection of memory_pressure is suppressed unless
2227  * this flag is enabled by writing "1" to the special
2228  * cpuset file 'memory_pressure_enabled' in the root cpuset.
2229  */
2230
2231 int cpuset_memory_pressure_enabled __read_mostly;
2232
2233 /**
2234  * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
2235  *
2236  * Keep a running average of the rate of synchronous (direct)
2237  * page reclaim efforts initiated by tasks in each cpuset.
2238  *
2239  * This represents the rate at which some task in the cpuset
2240  * ran low on memory on all nodes it was allowed to use, and
2241  * had to enter the kernels page reclaim code in an effort to
2242  * create more free memory by tossing clean pages or swapping
2243  * or writing dirty pages.
2244  *
2245  * Display to user space in the per-cpuset read-only file
2246  * "memory_pressure".  Value displayed is an integer
2247  * representing the recent rate of entry into the synchronous
2248  * (direct) page reclaim by any task attached to the cpuset.
2249  **/
2250
2251 void __cpuset_memory_pressure_bump(void)
2252 {
2253         task_lock(current);
2254         fmeter_markevent(&task_cs(current)->fmeter);
2255         task_unlock(current);
2256 }
2257
2258 #ifdef CONFIG_PROC_PID_CPUSET
2259 /*
2260  * proc_cpuset_show()
2261  *  - Print tasks cpuset path into seq_file.
2262  *  - Used for /proc/<pid>/cpuset.
2263  *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2264  *    doesn't really matter if tsk->cpuset changes after we read it,
2265  *    and we take manage_mutex, keeping attach_task() from changing it
2266  *    anyway.  No need to check that tsk->cpuset != NULL, thanks to
2267  *    the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
2268  *    cpuset to top_cpuset.
2269  */
2270 static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2271 {
2272         struct pid *pid;
2273         struct task_struct *tsk;
2274         char *buf;
2275         struct cgroup_subsys_state *css;
2276         int retval;
2277
2278         retval = -ENOMEM;
2279         buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2280         if (!buf)
2281                 goto out;
2282
2283         retval = -ESRCH;
2284         pid = m->private;
2285         tsk = get_pid_task(pid, PIDTYPE_PID);
2286         if (!tsk)
2287                 goto out_free;
2288
2289         retval = -EINVAL;
2290         cgroup_lock();
2291         css = task_subsys_state(tsk, cpuset_subsys_id);
2292         retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2293         if (retval < 0)
2294                 goto out_unlock;
2295         seq_puts(m, buf);
2296         seq_putc(m, '\n');
2297 out_unlock:
2298         cgroup_unlock();
2299         put_task_struct(tsk);
2300 out_free:
2301         kfree(buf);
2302 out:
2303         return retval;
2304 }
2305
2306 static int cpuset_open(struct inode *inode, struct file *file)
2307 {
2308         struct pid *pid = PROC_I(inode)->pid;
2309         return single_open(file, proc_cpuset_show, pid);
2310 }
2311
2312 const struct file_operations proc_cpuset_operations = {
2313         .open           = cpuset_open,
2314         .read           = seq_read,
2315         .llseek         = seq_lseek,
2316         .release        = single_release,
2317 };
2318 #endif /* CONFIG_PROC_PID_CPUSET */
2319
2320 /* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
2321 char *cpuset_task_status_allowed(struct task_struct *task, char *buffer)
2322 {
2323         buffer += sprintf(buffer, "Cpus_allowed:\t");
2324         buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed);
2325         buffer += sprintf(buffer, "\n");
2326         buffer += sprintf(buffer, "Mems_allowed:\t");
2327         buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed);
2328         buffer += sprintf(buffer, "\n");
2329         return buffer;
2330 }