mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy an process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocation don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89 #include <linux/migrate.h>
  90 #include <linux/rmap.h>
  91
  92 #include <asm/tlbflush.h>
  93 #include <asm/uaccess.h>
  94
  95 /* Internal flags */
  96 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  97 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  98 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  99
 100 static struct kmem_cache *policy_cache;
 101 static struct kmem_cache *sn_cache;
 102
 103 #define PDprintk(fmt...)
 104
 105 /* Highest zone. An specific allocation for a zone below that is not
 106    policied. */
 107 int policy_zone = ZONE_DMA;
 108
 109 struct mempolicy default_policy = {
 110         .refcnt = ATOMIC_INIT(1), /* never free it */
 111         .policy = MPOL_DEFAULT,
 112 };
 113
 114 /* Do sanity checking on a policy */
 115 static int mpol_check_policy(int mode, nodemask_t *nodes)
 116 {
 117         int empty = nodes_empty(*nodes);
 118
 119         switch (mode) {
 120         case MPOL_DEFAULT:
 121                 if (!empty)
 122                         return -EINVAL;
 123                 break;
 124         case MPOL_BIND:
 125         case MPOL_INTERLEAVE:
 126                 /* Preferred will only use the first bit, but allow
 127                    more for now. */
 128                 if (empty)
 129                         return -EINVAL;
 130                 break;
 131         }
 132         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 133 }
 134
 135 /* Generate a custom zonelist for the BIND policy. */
 136 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 137 {
 138         struct zonelist *zl;
 139         int num, max, nd, k;
 140
 141         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 142         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 143         if (!zl)
 144                 return NULL;
 145         num = 0;
 146         /* First put in the highest zones from all nodes, then all the next
 147            lower zones etc. Avoid empty zones because the memory allocator
 148            doesn't like them. If you implement node hot removal you
 149            have to fix that. */
 150         for (k = policy_zone; k >= 0; k--) {
 151                 for_each_node_mask(nd, *nodes) {
 152                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 153                         if (z->present_pages > 0)
 154                                 zl->zones[num++] = z;
 155                 }
 156         }
 157         zl->zones[num] = NULL;
 158         return zl;
 159 }
 160
 161 /* Create a new policy */
 162 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 163 {
 164         struct mempolicy *policy;
 165
 166         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 167         if (mode == MPOL_DEFAULT)
 168                 return NULL;
 169         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 170         if (!policy)
 171                 return ERR_PTR(-ENOMEM);
 172         atomic_set(&policy->refcnt, 1);
 173         switch (mode) {
 174         case MPOL_INTERLEAVE:
 175                 policy->v.nodes = *nodes;
 176                 if (nodes_weight(*nodes) == 0) {
 177                         kmem_cache_free(policy_cache, policy);
 178                         return ERR_PTR(-EINVAL);
 179                 }
 180                 break;
 181         case MPOL_PREFERRED:
 182                 policy->v.preferred_node = first_node(*nodes);
 183                 if (policy->v.preferred_node >= MAX_NUMNODES)
 184                         policy->v.preferred_node = -1;
 185                 break;
 186         case MPOL_BIND:
 187                 policy->v.zonelist = bind_zonelist(nodes);
 188                 if (policy->v.zonelist == NULL) {
 189                         kmem_cache_free(policy_cache, policy);
 190                         return ERR_PTR(-ENOMEM);
 191                 }
 192                 break;
 193         }
 194         policy->policy = mode;
 195         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 196         return policy;
 197 }
 198
 199 static void gather_stats(struct page *, void *, int pte_dirty);
 200 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 201                                 unsigned long flags);
 202
 203 /* Scan through pages checking if pages follow certain conditions. */
 204 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 205                 unsigned long addr, unsigned long end,
 206                 const nodemask_t *nodes, unsigned long flags,
 207                 void *private)
 208 {
 209         pte_t *orig_pte;
 210         pte_t *pte;
 211         spinlock_t *ptl;
 212
 213         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 214         do {
 215                 struct page *page;
 216                 unsigned int nid;
 217
 218                 if (!pte_present(*pte))
 219                         continue;
 220                 page = vm_normal_page(vma, addr, *pte);
 221                 if (!page)
 222                         continue;
 223                 /*
 224                  * The check for PageReserved here is important to avoid
 225                  * handling zero pages and other pages that may have been
 226                  * marked special by the system.
 227                  *
 228                  * If the PageReserved would not be checked here then f.e.
 229                  * the location of the zero page could have an influence
 230                  * on MPOL_MF_STRICT, zero pages would be counted for
 231                  * the per node stats, and there would be useless attempts
 232                  * to put zero pages on the migration list.
 233                  */
 234                 if (PageReserved(page))
 235                         continue;
 236                 nid = page_to_nid(page);
 237                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 238                         continue;
 239
 240                 if (flags & MPOL_MF_STATS)
 241                         gather_stats(page, private, pte_dirty(*pte));
 242                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 243                         migrate_page_add(page, private, flags);
 244                 else
 245                         break;
 246         } while (pte++, addr += PAGE_SIZE, addr != end);
 247         pte_unmap_unlock(orig_pte, ptl);
 248         return addr != end;
 249 }
 250
 251 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 252                 unsigned long addr, unsigned long end,
 253                 const nodemask_t *nodes, unsigned long flags,
 254                 void *private)
 255 {
 256         pmd_t *pmd;
 257         unsigned long next;
 258
 259         pmd = pmd_offset(pud, addr);
 260         do {
 261                 next = pmd_addr_end(addr, end);
 262                 if (pmd_none_or_clear_bad(pmd))
 263                         continue;
 264                 if (check_pte_range(vma, pmd, addr, next, nodes,
 265                                     flags, private))
 266                         return -EIO;
 267         } while (pmd++, addr = next, addr != end);
 268         return 0;
 269 }
 270
 271 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 272                 unsigned long addr, unsigned long end,
 273                 const nodemask_t *nodes, unsigned long flags,
 274                 void *private)
 275 {
 276         pud_t *pud;
 277         unsigned long next;
 278
 279         pud = pud_offset(pgd, addr);
 280         do {
 281                 next = pud_addr_end(addr, end);
 282                 if (pud_none_or_clear_bad(pud))
 283                         continue;
 284                 if (check_pmd_range(vma, pud, addr, next, nodes,
 285                                     flags, private))
 286                         return -EIO;
 287         } while (pud++, addr = next, addr != end);
 288         return 0;
 289 }
 290
 291 static inline int check_pgd_range(struct vm_area_struct *vma,
 292                 unsigned long addr, unsigned long end,
 293                 const nodemask_t *nodes, unsigned long flags,
 294                 void *private)
 295 {
 296         pgd_t *pgd;
 297         unsigned long next;
 298
 299         pgd = pgd_offset(vma->vm_mm, addr);
 300         do {
 301                 next = pgd_addr_end(addr, end);
 302                 if (pgd_none_or_clear_bad(pgd))
 303                         continue;
 304                 if (check_pud_range(vma, pgd, addr, next, nodes,
 305                                     flags, private))
 306                         return -EIO;
 307         } while (pgd++, addr = next, addr != end);
 308         return 0;
 309 }
 310
 311 /* Check if a vma is migratable */
 312 static inline int vma_migratable(struct vm_area_struct *vma)
 313 {
 314         if (vma->vm_flags & (
 315                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 316                 return 0;
 317         return 1;
 318 }
 319
 320 /*
 321  * Check if all pages in a range are on a set of nodes.
 322  * If pagelist != NULL then isolate pages from the LRU and
 323  * put them on the pagelist.
 324  */
 325 static struct vm_area_struct *
 326 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 327                 const nodemask_t *nodes, unsigned long flags, void *private)
 328 {
 329         int err;
 330         struct vm_area_struct *first, *vma, *prev;
 331
 332         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 333
 334                 err = migrate_prep();
 335                 if (err)
 336                         return ERR_PTR(err);
 337         }
 338
 339         first = find_vma(mm, start);
 340         if (!first)
 341                 return ERR_PTR(-EFAULT);
 342         prev = NULL;
 343         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 344                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 345                         if (!vma->vm_next && vma->vm_end < end)
 346                                 return ERR_PTR(-EFAULT);
 347                         if (prev && prev->vm_end < vma->vm_start)
 348                                 return ERR_PTR(-EFAULT);
 349                 }
 350                 if (!is_vm_hugetlb_page(vma) &&
 351                     ((flags & MPOL_MF_STRICT) ||
 352                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 353                                 vma_migratable(vma)))) {
 354                         unsigned long endvma = vma->vm_end;
 355
 356                         if (endvma > end)
 357                                 endvma = end;
 358                         if (vma->vm_start > start)
 359                                 start = vma->vm_start;
 360                         err = check_pgd_range(vma, start, endvma, nodes,
 361                                                 flags, private);
 362                         if (err) {
 363                                 first = ERR_PTR(err);
 364                                 break;
 365                         }
 366                 }
 367                 prev = vma;
 368         }
 369         return first;
 370 }
 371
 372 /* Apply policy to a single VMA */
 373 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 374 {
 375         int err = 0;
 376         struct mempolicy *old = vma->vm_policy;
 377
 378         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 379                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 380                  vma->vm_ops, vma->vm_file,
 381                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 382
 383         if (vma->vm_ops && vma->vm_ops->set_policy)
 384                 err = vma->vm_ops->set_policy(vma, new);
 385         if (!err) {
 386                 mpol_get(new);
 387                 vma->vm_policy = new;
 388                 mpol_free(old);
 389         }
 390         return err;
 391 }
 392
 393 /* Step 2: apply policy to a range and do splits. */
 394 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 395                        unsigned long end, struct mempolicy *new)
 396 {
 397         struct vm_area_struct *next;
 398         int err;
 399
 400         err = 0;
 401         for (; vma && vma->vm_start < end; vma = next) {
 402                 next = vma->vm_next;
 403                 if (vma->vm_start < start)
 404                         err = split_vma(vma->vm_mm, vma, start, 1);
 405                 if (!err && vma->vm_end > end)
 406                         err = split_vma(vma->vm_mm, vma, end, 0);
 407                 if (!err)
 408                         err = policy_vma(vma, new);
 409                 if (err)
 410                         break;
 411         }
 412         return err;
 413 }
 414
 415 static int contextualize_policy(int mode, nodemask_t *nodes)
 416 {
 417         if (!nodes)
 418                 return 0;
 419
 420         cpuset_update_task_memory_state();
 421         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 422                 return -EINVAL;
 423         return mpol_check_policy(mode, nodes);
 424 }
 425
 426
 427 /*
 428  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 429  * mempolicy.  Allows more rapid checking of this (combined perhaps
 430  * with other PF_* flag bits) on memory allocation hot code paths.
 431  *
 432  * If called from outside this file, the task 'p' should -only- be
 433  * a newly forked child not yet visible on the task list, because
 434  * manipulating the task flags of a visible task is not safe.
 435  *
 436  * The above limitation is why this routine has the funny name
 437  * mpol_fix_fork_child_flag().
 438  *
 439  * It is also safe to call this with a task pointer of current,
 440  * which the static wrapper mpol_set_task_struct_flag() does,
 441  * for use within this file.
 442  */
 443
 444 void mpol_fix_fork_child_flag(struct task_struct *p)
 445 {
 446         if (p->mempolicy)
 447                 p->flags |= PF_MEMPOLICY;
 448         else
 449                 p->flags &= ~PF_MEMPOLICY;
 450 }
 451
 452 static void mpol_set_task_struct_flag(void)
 453 {
 454         mpol_fix_fork_child_flag(current);
 455 }
 456
 457 /* Set the process memory policy */
 458 long do_set_mempolicy(int mode, nodemask_t *nodes)
 459 {
 460         struct mempolicy *new;
 461
 462         if (contextualize_policy(mode, nodes))
 463                 return -EINVAL;
 464         new = mpol_new(mode, nodes);
 465         if (IS_ERR(new))
 466                 return PTR_ERR(new);
 467         mpol_free(current->mempolicy);
 468         current->mempolicy = new;
 469         mpol_set_task_struct_flag();
 470         if (new && new->policy == MPOL_INTERLEAVE)
 471                 current->il_next = first_node(new->v.nodes);
 472         return 0;
 473 }
 474
 475 /* Fill a zone bitmap for a policy */
 476 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 477 {
 478         int i;
 479
 480         nodes_clear(*nodes);
 481         switch (p->policy) {
 482         case MPOL_BIND:
 483                 for (i = 0; p->v.zonelist->zones[i]; i++)
 484                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 485                                 *nodes);
 486                 break;
 487         case MPOL_DEFAULT:
 488                 break;
 489         case MPOL_INTERLEAVE:
 490                 *nodes = p->v.nodes;
 491                 break;
 492         case MPOL_PREFERRED:
 493                 /* or use current node instead of online map? */
 494                 if (p->v.preferred_node < 0)
 495                         *nodes = node_online_map;
 496                 else
 497                         node_set(p->v.preferred_node, *nodes);
 498                 break;
 499         default:
 500                 BUG();
 501         }
 502 }
 503
 504 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 505 {
 506         struct page *p;
 507         int err;
 508
 509         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 510         if (err >= 0) {
 511                 err = page_to_nid(p);
 512                 put_page(p);
 513         }
 514         return err;
 515 }
 516
 517 /* Retrieve NUMA policy */
 518 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 519                         unsigned long addr, unsigned long flags)
 520 {
 521         int err;
 522         struct mm_struct *mm = current->mm;
 523         struct vm_area_struct *vma = NULL;
 524         struct mempolicy *pol = current->mempolicy;
 525
 526         cpuset_update_task_memory_state();
 527         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 528                 return -EINVAL;
 529         if (flags & MPOL_F_ADDR) {
 530                 down_read(&mm->mmap_sem);
 531                 vma = find_vma_intersection(mm, addr, addr+1);
 532                 if (!vma) {
 533                         up_read(&mm->mmap_sem);
 534                         return -EFAULT;
 535                 }
 536                 if (vma->vm_ops && vma->vm_ops->get_policy)
 537                         pol = vma->vm_ops->get_policy(vma, addr);
 538                 else
 539                         pol = vma->vm_policy;
 540         } else if (addr)
 541                 return -EINVAL;
 542
 543         if (!pol)
 544                 pol = &default_policy;
 545
 546         if (flags & MPOL_F_NODE) {
 547                 if (flags & MPOL_F_ADDR) {
 548                         err = lookup_node(mm, addr);
 549                         if (err < 0)
 550                                 goto out;
 551                         *policy = err;
 552                 } else if (pol == current->mempolicy &&
 553                                 pol->policy == MPOL_INTERLEAVE) {
 554                         *policy = current->il_next;
 555                 } else {
 556                         err = -EINVAL;
 557                         goto out;
 558                 }
 559         } else
 560                 *policy = pol->policy;
 561
 562         if (vma) {
 563                 up_read(&current->mm->mmap_sem);
 564                 vma = NULL;
 565         }
 566
 567         err = 0;
 568         if (nmask)
 569                 get_zonemask(pol, nmask);
 570
 571  out:
 572         if (vma)
 573                 up_read(&current->mm->mmap_sem);
 574         return err;
 575 }
 576
 577 #ifdef CONFIG_MIGRATION
 578 /*
 579  * page migration
 580  */
 581 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 582                                 unsigned long flags)
 583 {
 584         /*
 585          * Avoid migrating a page that is shared with others.
 586          */
 587         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 588                 isolate_lru_page(page, pagelist);
 589 }
 590
 591 static struct page *new_node_page(struct page *page, unsigned long node)
 592 {
 593         return alloc_pages_node(node, GFP_HIGHUSER, 0);
 594 }
 595
 596 /*
 597  * Migrate pages from one node to a target node.
 598  * Returns error or the number of pages not migrated.
 599  */
 600 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 601 {
 602         nodemask_t nmask;
 603         LIST_HEAD(pagelist);
 604         int err = 0;
 605
 606         nodes_clear(nmask);
 607         node_set(source, nmask);
 608
 609         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 610                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 611
 612         if (!list_empty(&pagelist))
 613                 err = migrate_pages(&pagelist, new_node_page, dest);
 614
 615         return err;
 616 }
 617
 618 /*
 619  * Move pages between the two nodesets so as to preserve the physical
 620  * layout as much as possible.
 621  *
 622  * Returns the number of page that could not be moved.
 623  */
 624 int do_migrate_pages(struct mm_struct *mm,
 625         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 626 {
 627         LIST_HEAD(pagelist);
 628         int busy = 0;
 629         int err = 0;
 630         nodemask_t tmp;
 631
 632         down_read(&mm->mmap_sem);
 633
 634 /*
 635  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 636  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 637  * bit in 'tmp', and return that <source, dest> pair for migration.
 638  * The pair of nodemasks 'to' and 'from' define the map.
 639  *
 640  * If no pair of bits is found that way, fallback to picking some
 641  * pair of 'source' and 'dest' bits that are not the same.  If the
 642  * 'source' and 'dest' bits are the same, this represents a node
 643  * that will be migrating to itself, so no pages need move.
 644  *
 645  * If no bits are left in 'tmp', or if all remaining bits left
 646  * in 'tmp' correspond to the same bit in 'to', return false
 647  * (nothing left to migrate).
 648  *
 649  * This lets us pick a pair of nodes to migrate between, such that
 650  * if possible the dest node is not already occupied by some other
 651  * source node, minimizing the risk of overloading the memory on a
 652  * node that would happen if we migrated incoming memory to a node
 653  * before migrating outgoing memory source that same node.
 654  *
 655  * A single scan of tmp is sufficient.  As we go, we remember the
 656  * most recent <s, d> pair that moved (s != d).  If we find a pair
 657  * that not only moved, but what's better, moved to an empty slot
 658  * (d is not set in tmp), then we break out then, with that pair.
 659  * Otherwise when we finish scannng from_tmp, we at least have the
 660  * most recent <s, d> pair that moved.  If we get all the way through
 661  * the scan of tmp without finding any node that moved, much less
 662  * moved to an empty node, then there is nothing left worth migrating.
 663  */
 664
 665         tmp = *from_nodes;
 666         while (!nodes_empty(tmp)) {
 667                 int s,d;
 668                 int source = -1;
 669                 int dest = 0;
 670
 671                 for_each_node_mask(s, tmp) {
 672                         d = node_remap(s, *from_nodes, *to_nodes);
 673                         if (s == d)
 674                                 continue;
 675
 676                         source = s;     /* Node moved. Memorize */
 677                         dest = d;
 678
 679                         /* dest not in remaining from nodes? */
 680                         if (!node_isset(dest, tmp))
 681                                 break;
 682                 }
 683                 if (source == -1)
 684                         break;
 685
 686                 node_clear(source, tmp);
 687                 err = migrate_to_node(mm, source, dest, flags);
 688                 if (err > 0)
 689                         busy += err;
 690                 if (err < 0)
 691                         break;
 692         }
 693
 694         up_read(&mm->mmap_sem);
 695         if (err < 0)
 696                 return err;
 697         return busy;
 698
 699 }
 700
 701 static struct page *new_vma_page(struct page *page, unsigned long private)
 702 {
 703         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 704
 705         return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
 706 }
 707 #else
 708
 709 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 710                                 unsigned long flags)
 711 {
 712 }
 713
 714 int do_migrate_pages(struct mm_struct *mm,
 715         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 716 {
 717         return -ENOSYS;
 718 }
 719
 720 static struct page *new_vma_page(struct page *page, unsigned long private)
 721 {
 722         return NULL;
 723 }
 724 #endif
 725
 726 long do_mbind(unsigned long start, unsigned long len,
 727                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 728 {
 729         struct vm_area_struct *vma;
 730         struct mm_struct *mm = current->mm;
 731         struct mempolicy *new;
 732         unsigned long end;
 733         int err;
 734         LIST_HEAD(pagelist);
 735
 736         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 737                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 738             || mode > MPOL_MAX)
 739                 return -EINVAL;
 740         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 741                 return -EPERM;
 742
 743         if (start & ~PAGE_MASK)
 744                 return -EINVAL;
 745
 746         if (mode == MPOL_DEFAULT)
 747                 flags &= ~MPOL_MF_STRICT;
 748
 749         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 750         end = start + len;
 751
 752         if (end < start)
 753                 return -EINVAL;
 754         if (end == start)
 755                 return 0;
 756
 757         if (mpol_check_policy(mode, nmask))
 758                 return -EINVAL;
 759
 760         new = mpol_new(mode, nmask);
 761         if (IS_ERR(new))
 762                 return PTR_ERR(new);
 763
 764         /*
 765          * If we are using the default policy then operation
 766          * on discontinuous address spaces is okay after all
 767          */
 768         if (!new)
 769                 flags |= MPOL_MF_DISCONTIG_OK;
 770
 771         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 772                         mode,nodes_addr(nodes)[0]);
 773
 774         down_write(&mm->mmap_sem);
 775         vma = check_range(mm, start, end, nmask,
 776                           flags | MPOL_MF_INVERT, &pagelist);
 777
 778         err = PTR_ERR(vma);
 779         if (!IS_ERR(vma)) {
 780                 int nr_failed = 0;
 781
 782                 err = mbind_range(vma, start, end, new);
 783
 784                 if (!list_empty(&pagelist))
 785                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 786                                                 (unsigned long)vma);
 787
 788                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 789                         err = -EIO;
 790         }
 791
 792         up_write(&mm->mmap_sem);
 793         mpol_free(new);
 794         return err;
 795 }
 796
 797 /*
 798  * User space interface with variable sized bitmaps for nodelists.
 799  */
 800
 801 /* Copy a node mask from user space. */
 802 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 803                      unsigned long maxnode)
 804 {
 805         unsigned long k;
 806         unsigned long nlongs;
 807         unsigned long endmask;
 808
 809         --maxnode;
 810         nodes_clear(*nodes);
 811         if (maxnode == 0 || !nmask)
 812                 return 0;
 813         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 814                 return -EINVAL;
 815
 816         nlongs = BITS_TO_LONGS(maxnode);
 817         if ((maxnode % BITS_PER_LONG) == 0)
 818                 endmask = ~0UL;
 819         else
 820                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 821
 822         /* When the user specified more nodes than supported just check
 823            if the non supported part is all zero. */
 824         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 825                 if (nlongs > PAGE_SIZE/sizeof(long))
 826                         return -EINVAL;
 827                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 828                         unsigned long t;
 829                         if (get_user(t, nmask + k))
 830                                 return -EFAULT;
 831                         if (k == nlongs - 1) {
 832                                 if (t & endmask)
 833                                         return -EINVAL;
 834                         } else if (t)
 835                                 return -EINVAL;
 836                 }
 837                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 838                 endmask = ~0UL;
 839         }
 840
 841         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 842                 return -EFAULT;
 843         nodes_addr(*nodes)[nlongs-1] &= endmask;
 844         return 0;
 845 }
 846
 847 /* Copy a kernel node mask to user space */
 848 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 849                               nodemask_t *nodes)
 850 {
 851         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 852         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 853
 854         if (copy > nbytes) {
 855                 if (copy > PAGE_SIZE)
 856                         return -EINVAL;
 857                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 858                         return -EFAULT;
 859                 copy = nbytes;
 860         }
 861         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 862 }
 863
 864 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 865                         unsigned long mode,
 866                         unsigned long __user *nmask, unsigned long maxnode,
 867                         unsigned flags)
 868 {
 869         nodemask_t nodes;
 870         int err;
 871
 872         err = get_nodes(&nodes, nmask, maxnode);
 873         if (err)
 874                 return err;
 875         return do_mbind(start, len, mode, &nodes, flags);
 876 }
 877
 878 /* Set the process memory policy */
 879 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 880                 unsigned long maxnode)
 881 {
 882         int err;
 883         nodemask_t nodes;
 884
 885         if (mode < 0 || mode > MPOL_MAX)
 886                 return -EINVAL;
 887         err = get_nodes(&nodes, nmask, maxnode);
 888         if (err)
 889                 return err;
 890         return do_set_mempolicy(mode, &nodes);
 891 }
 892
 893 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 894                 const unsigned long __user *old_nodes,
 895                 const unsigned long __user *new_nodes)
 896 {
 897         struct mm_struct *mm;
 898         struct task_struct *task;
 899         nodemask_t old;
 900         nodemask_t new;
 901         nodemask_t task_nodes;
 902         int err;
 903
 904         err = get_nodes(&old, old_nodes, maxnode);
 905         if (err)
 906                 return err;
 907
 908         err = get_nodes(&new, new_nodes, maxnode);
 909         if (err)
 910                 return err;
 911
 912         /* Find the mm_struct */
 913         read_lock(&tasklist_lock);
 914         task = pid ? find_task_by_pid(pid) : current;
 915         if (!task) {
 916                 read_unlock(&tasklist_lock);
 917                 return -ESRCH;
 918         }
 919         mm = get_task_mm(task);
 920         read_unlock(&tasklist_lock);
 921
 922         if (!mm)
 923                 return -EINVAL;
 924
 925         /*
 926          * Check if this process has the right to modify the specified
 927          * process. The right exists if the process has administrative
 928          * capabilities, superuser privileges or the same
 929          * userid as the target process.
 930          */
 931         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 932             (current->uid != task->suid) && (current->uid != task->uid) &&
 933             !capable(CAP_SYS_NICE)) {
 934                 err = -EPERM;
 935                 goto out;
 936         }
 937
 938         task_nodes = cpuset_mems_allowed(task);
 939         /* Is the user allowed to access the target nodes? */
 940         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 941                 err = -EPERM;
 942                 goto out;
 943         }
 944
 945         err = do_migrate_pages(mm, &old, &new,
 946                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 947 out:
 948         mmput(mm);
 949         return err;
 950 }
 951
 952
 953 /* Retrieve NUMA policy */
 954 asmlinkage long sys_get_mempolicy(int __user *policy,
 955                                 unsigned long __user *nmask,
 956                                 unsigned long maxnode,
 957                                 unsigned long addr, unsigned long flags)
 958 {
 959         int err, pval;
 960         nodemask_t nodes;
 961
 962         if (nmask != NULL && maxnode < MAX_NUMNODES)
 963                 return -EINVAL;
 964
 965         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 966
 967         if (err)
 968                 return err;
 969
 970         if (policy && put_user(pval, policy))
 971                 return -EFAULT;
 972
 973         if (nmask)
 974                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 975
 976         return err;
 977 }
 978
 979 #ifdef CONFIG_COMPAT
 980
 981 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 982                                      compat_ulong_t __user *nmask,
 983                                      compat_ulong_t maxnode,
 984                                      compat_ulong_t addr, compat_ulong_t flags)
 985 {
 986         long err;
 987         unsigned long __user *nm = NULL;
 988         unsigned long nr_bits, alloc_size;
 989         DECLARE_BITMAP(bm, MAX_NUMNODES);
 990
 991         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 992         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 993
 994         if (nmask)
 995                 nm = compat_alloc_user_space(alloc_size);
 996
 997         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 998
 999         if (!err && nmask) {
1000                 err = copy_from_user(bm, nm, alloc_size);
1001                 /* ensure entire bitmap is zeroed */
1002                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1003                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1004         }
1005
1006         return err;
1007 }
1008
1009 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1010                                      compat_ulong_t maxnode)
1011 {
1012         long err = 0;
1013         unsigned long __user *nm = NULL;
1014         unsigned long nr_bits, alloc_size;
1015         DECLARE_BITMAP(bm, MAX_NUMNODES);
1016
1017         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1018         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1019
1020         if (nmask) {
1021                 err = compat_get_bitmap(bm, nmask, nr_bits);
1022                 nm = compat_alloc_user_space(alloc_size);
1023                 err |= copy_to_user(nm, bm, alloc_size);
1024         }
1025
1026         if (err)
1027                 return -EFAULT;
1028
1029         return sys_set_mempolicy(mode, nm, nr_bits+1);
1030 }
1031
1032 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1033                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1034                              compat_ulong_t maxnode, compat_ulong_t flags)
1035 {
1036         long err = 0;
1037         unsigned long __user *nm = NULL;
1038         unsigned long nr_bits, alloc_size;
1039         nodemask_t bm;
1040
1041         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1042         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1043
1044         if (nmask) {
1045                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1046                 nm = compat_alloc_user_space(alloc_size);
1047                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1048         }
1049
1050         if (err)
1051                 return -EFAULT;
1052
1053         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1054 }
1055
1056 #endif
1057
1058 /* Return effective policy for a VMA */
1059 static struct mempolicy * get_vma_policy(struct task_struct *task,
1060                 struct vm_area_struct *vma, unsigned long addr)
1061 {
1062         struct mempolicy *pol = task->mempolicy;
1063
1064         if (vma) {
1065                 if (vma->vm_ops && vma->vm_ops->get_policy)
1066                         pol = vma->vm_ops->get_policy(vma, addr);
1067                 else if (vma->vm_policy &&
1068                                 vma->vm_policy->policy != MPOL_DEFAULT)
1069                         pol = vma->vm_policy;
1070         }
1071         if (!pol)
1072                 pol = &default_policy;
1073         return pol;
1074 }
1075
1076 /* Return a zonelist representing a mempolicy */
1077 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1078 {
1079         int nd;
1080
1081         switch (policy->policy) {
1082         case MPOL_PREFERRED:
1083                 nd = policy->v.preferred_node;
1084                 if (nd < 0)
1085                         nd = numa_node_id();
1086                 break;
1087         case MPOL_BIND:
1088                 /* Lower zones don't get a policy applied */
1089                 /* Careful: current->mems_allowed might have moved */
1090                 if (gfp_zone(gfp) >= policy_zone)
1091                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1092                                 return policy->v.zonelist;
1093                 /*FALL THROUGH*/
1094         case MPOL_INTERLEAVE: /* should not happen */
1095         case MPOL_DEFAULT:
1096                 nd = numa_node_id();
1097                 break;
1098         default:
1099                 nd = 0;
1100                 BUG();
1101         }
1102         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1103 }
1104
1105 /* Do dynamic interleaving for a process */
1106 static unsigned interleave_nodes(struct mempolicy *policy)
1107 {
1108         unsigned nid, next;
1109         struct task_struct *me = current;
1110
1111         nid = me->il_next;
1112         next = next_node(nid, policy->v.nodes);
1113         if (next >= MAX_NUMNODES)
1114                 next = first_node(policy->v.nodes);
1115         me->il_next = next;
1116         return nid;
1117 }
1118
1119 /*
1120  * Depending on the memory policy provide a node from which to allocate the
1121  * next slab entry.
1122  */
1123 unsigned slab_node(struct mempolicy *policy)
1124 {
1125         switch (policy->policy) {
1126         case MPOL_INTERLEAVE:
1127                 return interleave_nodes(policy);
1128
1129         case MPOL_BIND:
1130                 /*
1131                  * Follow bind policy behavior and start allocation at the
1132                  * first node.
1133                  */
1134                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1135
1136         case MPOL_PREFERRED:
1137                 if (policy->v.preferred_node >= 0)
1138                         return policy->v.preferred_node;
1139                 /* Fall through */
1140
1141         default:
1142                 return numa_node_id();
1143         }
1144 }
1145
1146 /* Do static interleaving for a VMA with known offset. */
1147 static unsigned offset_il_node(struct mempolicy *pol,
1148                 struct vm_area_struct *vma, unsigned long off)
1149 {
1150         unsigned nnodes = nodes_weight(pol->v.nodes);
1151         unsigned target = (unsigned)off % nnodes;
1152         int c;
1153         int nid = -1;
1154
1155         c = 0;
1156         do {
1157                 nid = next_node(nid, pol->v.nodes);
1158                 c++;
1159         } while (c <= target);
1160         return nid;
1161 }
1162
1163 /* Determine a node number for interleave */
1164 static inline unsigned interleave_nid(struct mempolicy *pol,
1165                  struct vm_area_struct *vma, unsigned long addr, int shift)
1166 {
1167         if (vma) {
1168                 unsigned long off;
1169
1170                 off = vma->vm_pgoff;
1171                 off += (addr - vma->vm_start) >> shift;
1172                 return offset_il_node(pol, vma, off);
1173         } else
1174                 return interleave_nodes(pol);
1175 }
1176
1177 #ifdef CONFIG_HUGETLBFS
1178 /* Return a zonelist suitable for a huge page allocation. */
1179 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1180 {
1181         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1182
1183         if (pol->policy == MPOL_INTERLEAVE) {
1184                 unsigned nid;
1185
1186                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1187                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1188         }
1189         return zonelist_policy(GFP_HIGHUSER, pol);
1190 }
1191 #endif
1192
1193 /* Allocate a page in interleaved policy.
1194    Own path because it needs to do special accounting. */
1195 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1196                                         unsigned nid)
1197 {
1198         struct zonelist *zl;
1199         struct page *page;
1200
1201         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1202         page = __alloc_pages(gfp, order, zl);
1203         if (page && page_zone(page) == zl->zones[0]) {
1204                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1205                 put_cpu();
1206         }
1207         return page;
1208 }
1209
1210 /**
1211  *      alloc_page_vma  - Allocate a page for a VMA.
1212  *
1213  *      @gfp:
1214  *      %GFP_USER    user allocation.
1215  *      %GFP_KERNEL  kernel allocations,
1216  *      %GFP_HIGHMEM highmem/user allocations,
1217  *      %GFP_FS      allocation should not call back into a file system.
1218  *      %GFP_ATOMIC  don't sleep.
1219  *
1220  *      @vma:  Pointer to VMA or NULL if not available.
1221  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1222  *
1223  *      This function allocates a page from the kernel page pool and applies
1224  *      a NUMA policy associated with the VMA or the current process.
1225  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1226  *      mm_struct of the VMA to prevent it from going away. Should be used for
1227  *      all allocations for pages that will be mapped into
1228  *      user space. Returns NULL when no page can be allocated.
1229  *
1230  *      Should be called with the mm_sem of the vma hold.
1231  */
1232 struct page *
1233 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1234 {
1235         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1236
1237         cpuset_update_task_memory_state();
1238
1239         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1240                 unsigned nid;
1241
1242                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1243                 return alloc_page_interleave(gfp, 0, nid);
1244         }
1245         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1246 }
1247
1248 /**
1249  *      alloc_pages_current - Allocate pages.
1250  *
1251  *      @gfp:
1252  *              %GFP_USER   user allocation,
1253  *              %GFP_KERNEL kernel allocation,
1254  *              %GFP_HIGHMEM highmem allocation,
1255  *              %GFP_FS     don't call back into a file system.
1256  *              %GFP_ATOMIC don't sleep.
1257  *      @order: Power of two of allocation size in pages. 0 is a single page.
1258  *
1259  *      Allocate a page from the kernel page pool.  When not in
1260  *      interrupt context and apply the current process NUMA policy.
1261  *      Returns NULL when no page can be allocated.
1262  *
1263  *      Don't call cpuset_update_task_memory_state() unless
1264  *      1) it's ok to take cpuset_sem (can WAIT), and
1265  *      2) allocating for current task (not interrupt).
1266  */
1267 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1268 {
1269         struct mempolicy *pol = current->mempolicy;
1270
1271         if ((gfp & __GFP_WAIT) && !in_interrupt())
1272                 cpuset_update_task_memory_state();
1273         if (!pol || in_interrupt())
1274                 pol = &default_policy;
1275         if (pol->policy == MPOL_INTERLEAVE)
1276                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1277         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1278 }
1279 EXPORT_SYMBOL(alloc_pages_current);
1280
1281 /*
1282  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1283  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1284  * with the mems_allowed returned by cpuset_mems_allowed().  This
1285  * keeps mempolicies cpuset relative after its cpuset moves.  See
1286  * further kernel/cpuset.c update_nodemask().
1287  */
1288 void *cpuset_being_rebound;
1289
1290 /* Slow path of a mempolicy copy */
1291 struct mempolicy *__mpol_copy(struct mempolicy *old)
1292 {
1293         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1294
1295         if (!new)
1296                 return ERR_PTR(-ENOMEM);
1297         if (current_cpuset_is_being_rebound()) {
1298                 nodemask_t mems = cpuset_mems_allowed(current);
1299                 mpol_rebind_policy(old, &mems);
1300         }
1301         *new = *old;
1302         atomic_set(&new->refcnt, 1);
1303         if (new->policy == MPOL_BIND) {
1304                 int sz = ksize(old->v.zonelist);
1305                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1306                 if (!new->v.zonelist) {
1307                         kmem_cache_free(policy_cache, new);
1308                         return ERR_PTR(-ENOMEM);
1309                 }
1310                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1311         }
1312         return new;
1313 }
1314
1315 /* Slow path of a mempolicy comparison */
1316 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1317 {
1318         if (!a || !b)
1319                 return 0;
1320         if (a->policy != b->policy)
1321                 return 0;
1322         switch (a->policy) {
1323         case MPOL_DEFAULT:
1324                 return 1;
1325         case MPOL_INTERLEAVE:
1326                 return nodes_equal(a->v.nodes, b->v.nodes);
1327         case MPOL_PREFERRED:
1328                 return a->v.preferred_node == b->v.preferred_node;
1329         case MPOL_BIND: {
1330                 int i;
1331                 for (i = 0; a->v.zonelist->zones[i]; i++)
1332                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1333                                 return 0;
1334                 return b->v.zonelist->zones[i] == NULL;
1335         }
1336         default:
1337                 BUG();
1338                 return 0;
1339         }
1340 }
1341
1342 /* Slow path of a mpol destructor. */
1343 void __mpol_free(struct mempolicy *p)
1344 {
1345         if (!atomic_dec_and_test(&p->refcnt))
1346                 return;
1347         if (p->policy == MPOL_BIND)
1348                 kfree(p->v.zonelist);
1349         p->policy = MPOL_DEFAULT;
1350         kmem_cache_free(policy_cache, p);
1351 }
1352
1353 /*
1354  * Shared memory backing store policy support.
1355  *
1356  * Remember policies even when nobody has shared memory mapped.
1357  * The policies are kept in Red-Black tree linked from the inode.
1358  * They are protected by the sp->lock spinlock, which should be held
1359  * for any accesses to the tree.
1360  */
1361
1362 /* lookup first element intersecting start-end */
1363 /* Caller holds sp->lock */
1364 static struct sp_node *
1365 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1366 {
1367         struct rb_node *n = sp->root.rb_node;
1368
1369         while (n) {
1370                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1371
1372                 if (start >= p->end)
1373                         n = n->rb_right;
1374                 else if (end <= p->start)
1375                         n = n->rb_left;
1376                 else
1377                         break;
1378         }
1379         if (!n)
1380                 return NULL;
1381         for (;;) {
1382                 struct sp_node *w = NULL;
1383                 struct rb_node *prev = rb_prev(n);
1384                 if (!prev)
1385                         break;
1386                 w = rb_entry(prev, struct sp_node, nd);
1387                 if (w->end <= start)
1388                         break;
1389                 n = prev;
1390         }
1391         return rb_entry(n, struct sp_node, nd);
1392 }
1393
1394 /* Insert a new shared policy into the list. */
1395 /* Caller holds sp->lock */
1396 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1397 {
1398         struct rb_node **p = &sp->root.rb_node;
1399         struct rb_node *parent = NULL;
1400         struct sp_node *nd;
1401
1402         while (*p) {
1403                 parent = *p;
1404                 nd = rb_entry(parent, struct sp_node, nd);
1405                 if (new->start < nd->start)
1406                         p = &(*p)->rb_left;
1407                 else if (new->end > nd->end)
1408                         p = &(*p)->rb_right;
1409                 else
1410                         BUG();
1411         }
1412         rb_link_node(&new->nd, parent, p);
1413         rb_insert_color(&new->nd, &sp->root);
1414         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1415                  new->policy ? new->policy->policy : 0);
1416 }
1417
1418 /* Find shared policy intersecting idx */
1419 struct mempolicy *
1420 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1421 {
1422         struct mempolicy *pol = NULL;
1423         struct sp_node *sn;
1424
1425         if (!sp->root.rb_node)
1426                 return NULL;
1427         spin_lock(&sp->lock);
1428         sn = sp_lookup(sp, idx, idx+1);
1429         if (sn) {
1430                 mpol_get(sn->policy);
1431                 pol = sn->policy;
1432         }
1433         spin_unlock(&sp->lock);
1434         return pol;
1435 }
1436
1437 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1438 {
1439         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1440         rb_erase(&n->nd, &sp->root);
1441         mpol_free(n->policy);
1442         kmem_cache_free(sn_cache, n);
1443 }
1444
1445 struct sp_node *
1446 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1447 {
1448         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1449
1450         if (!n)
1451                 return NULL;
1452         n->start = start;
1453         n->end = end;
1454         mpol_get(pol);
1455         n->policy = pol;
1456         return n;
1457 }
1458
1459 /* Replace a policy range. */
1460 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1461                                  unsigned long end, struct sp_node *new)
1462 {
1463         struct sp_node *n, *new2 = NULL;
1464
1465 restart:
1466         spin_lock(&sp->lock);
1467         n = sp_lookup(sp, start, end);
1468         /* Take care of old policies in the same range. */
1469         while (n && n->start < end) {
1470                 struct rb_node *next = rb_next(&n->nd);
1471                 if (n->start >= start) {
1472                         if (n->end <= end)
1473                                 sp_delete(sp, n);
1474                         else
1475                                 n->start = end;
1476                 } else {
1477                         /* Old policy spanning whole new range. */
1478                         if (n->end > end) {
1479                                 if (!new2) {
1480                                         spin_unlock(&sp->lock);
1481                                         new2 = sp_alloc(end, n->end, n->policy);
1482                                         if (!new2)
1483                                                 return -ENOMEM;
1484                                         goto restart;
1485                                 }
1486                                 n->end = start;
1487                                 sp_insert(sp, new2);
1488                                 new2 = NULL;
1489                                 break;
1490                         } else
1491                                 n->end = start;
1492                 }
1493                 if (!next)
1494                         break;
1495                 n = rb_entry(next, struct sp_node, nd);
1496         }
1497         if (new)
1498                 sp_insert(sp, new);
1499         spin_unlock(&sp->lock);
1500         if (new2) {
1501                 mpol_free(new2->policy);
1502                 kmem_cache_free(sn_cache, new2);
1503         }
1504         return 0;
1505 }
1506
1507 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1508                                 nodemask_t *policy_nodes)
1509 {
1510         info->root = RB_ROOT;
1511         spin_lock_init(&info->lock);
1512
1513         if (policy != MPOL_DEFAULT) {
1514                 struct mempolicy *newpol;
1515
1516                 /* Falls back to MPOL_DEFAULT on any error */
1517                 newpol = mpol_new(policy, policy_nodes);
1518                 if (!IS_ERR(newpol)) {
1519                         /* Create pseudo-vma that contains just the policy */
1520                         struct vm_area_struct pvma;
1521
1522                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1523                         /* Policy covers entire file */
1524                         pvma.vm_end = TASK_SIZE;
1525                         mpol_set_shared_policy(info, &pvma, newpol);
1526                         mpol_free(newpol);
1527                 }
1528         }
1529 }
1530
1531 int mpol_set_shared_policy(struct shared_policy *info,
1532                         struct vm_area_struct *vma, struct mempolicy *npol)
1533 {
1534         int err;
1535         struct sp_node *new = NULL;
1536         unsigned long sz = vma_pages(vma);
1537
1538         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1539                  vma->vm_pgoff,
1540                  sz, npol? npol->policy : -1,
1541                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1542
1543         if (npol) {
1544                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1545                 if (!new)
1546                         return -ENOMEM;
1547         }
1548         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1549         if (err && new)
1550                 kmem_cache_free(sn_cache, new);
1551         return err;
1552 }
1553
1554 /* Free a backing policy store on inode delete. */
1555 void mpol_free_shared_policy(struct shared_policy *p)
1556 {
1557         struct sp_node *n;
1558         struct rb_node *next;
1559
1560         if (!p->root.rb_node)
1561                 return;
1562         spin_lock(&p->lock);
1563         next = rb_first(&p->root);
1564         while (next) {
1565                 n = rb_entry(next, struct sp_node, nd);
1566                 next = rb_next(&n->nd);
1567                 rb_erase(&n->nd, &p->root);
1568                 mpol_free(n->policy);
1569                 kmem_cache_free(sn_cache, n);
1570         }
1571         spin_unlock(&p->lock);
1572 }
1573
1574 /* assumes fs == KERNEL_DS */
1575 void __init numa_policy_init(void)
1576 {
1577         policy_cache = kmem_cache_create("numa_policy",
1578                                          sizeof(struct mempolicy),
1579                                          0, SLAB_PANIC, NULL, NULL);
1580
1581         sn_cache = kmem_cache_create("shared_policy_node",
1582                                      sizeof(struct sp_node),
1583                                      0, SLAB_PANIC, NULL, NULL);
1584
1585         /* Set interleaving policy for system init. This way not all
1586            the data structures allocated at system boot end up in node zero. */
1587
1588         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1589                 printk("numa_policy_init: interleaving failed\n");
1590 }
1591
1592 /* Reset policy of current process to default */
1593 void numa_default_policy(void)
1594 {
1595         do_set_mempolicy(MPOL_DEFAULT, NULL);
1596 }
1597
1598 /* Migrate a policy to a different set of nodes */
1599 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1600 {
1601         nodemask_t *mpolmask;
1602         nodemask_t tmp;
1603
1604         if (!pol)
1605                 return;
1606         mpolmask = &pol->cpuset_mems_allowed;
1607         if (nodes_equal(*mpolmask, *newmask))
1608                 return;
1609
1610         switch (pol->policy) {
1611         case MPOL_DEFAULT:
1612                 break;
1613         case MPOL_INTERLEAVE:
1614                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1615                 pol->v.nodes = tmp;
1616                 *mpolmask = *newmask;
1617                 current->il_next = node_remap(current->il_next,
1618                                                 *mpolmask, *newmask);
1619                 break;
1620         case MPOL_PREFERRED:
1621                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1622                                                 *mpolmask, *newmask);
1623                 *mpolmask = *newmask;
1624                 break;
1625         case MPOL_BIND: {
1626                 nodemask_t nodes;
1627                 struct zone **z;
1628                 struct zonelist *zonelist;
1629
1630                 nodes_clear(nodes);
1631                 for (z = pol->v.zonelist->zones; *z; z++)
1632                         node_set((*z)->zone_pgdat->node_id, nodes);
1633                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1634                 nodes = tmp;
1635
1636                 zonelist = bind_zonelist(&nodes);
1637
1638                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1639                  * If that old zonelist has no remaining mems_allowed nodes,
1640                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1641                  */
1642
1643                 if (zonelist) {
1644                         /* Good - got mem - substitute new zonelist */
1645                         kfree(pol->v.zonelist);
1646                         pol->v.zonelist = zonelist;
1647                 }
1648                 *mpolmask = *newmask;
1649                 break;
1650         }
1651         default:
1652                 BUG();
1653                 break;
1654         }
1655 }
1656
1657 /*
1658  * Wrapper for mpol_rebind_policy() that just requires task
1659  * pointer, and updates task mempolicy.
1660  */
1661
1662 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1663 {
1664         mpol_rebind_policy(tsk->mempolicy, new);
1665 }
1666
1667 /*
1668  * Rebind each vma in mm to new nodemask.
1669  *
1670  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1671  */
1672
1673 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1674 {
1675         struct vm_area_struct *vma;
1676
1677         down_write(&mm->mmap_sem);
1678         for (vma = mm->mmap; vma; vma = vma->vm_next)
1679                 mpol_rebind_policy(vma->vm_policy, new);
1680         up_write(&mm->mmap_sem);
1681 }
1682
1683 /*
1684  * Display pages allocated per node and memory policy via /proc.
1685  */
1686
1687 static const char *policy_types[] = { "default", "prefer", "bind",
1688                                       "interleave" };
1689
1690 /*
1691  * Convert a mempolicy into a string.
1692  * Returns the number of characters in buffer (if positive)
1693  * or an error (negative)
1694  */
1695 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1696 {
1697         char *p = buffer;
1698         int l;
1699         nodemask_t nodes;
1700         int mode = pol ? pol->policy : MPOL_DEFAULT;
1701
1702         switch (mode) {
1703         case MPOL_DEFAULT:
1704                 nodes_clear(nodes);
1705                 break;
1706
1707         case MPOL_PREFERRED:
1708                 nodes_clear(nodes);
1709                 node_set(pol->v.preferred_node, nodes);
1710                 break;
1711
1712         case MPOL_BIND:
1713                 get_zonemask(pol, &nodes);
1714                 break;
1715
1716         case MPOL_INTERLEAVE:
1717                 nodes = pol->v.nodes;
1718                 break;
1719
1720         default:
1721                 BUG();
1722                 return -EFAULT;
1723         }
1724
1725         l = strlen(policy_types[mode]);
1726         if (buffer + maxlen < p + l + 1)
1727                 return -ENOSPC;
1728
1729         strcpy(p, policy_types[mode]);
1730         p += l;
1731
1732         if (!nodes_empty(nodes)) {
1733                 if (buffer + maxlen < p + 2)
1734                         return -ENOSPC;
1735                 *p++ = '=';
1736                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1737         }
1738         return p - buffer;
1739 }
1740
1741 struct numa_maps {
1742         unsigned long pages;
1743         unsigned long anon;
1744         unsigned long active;
1745         unsigned long writeback;
1746         unsigned long mapcount_max;
1747         unsigned long dirty;
1748         unsigned long swapcache;
1749         unsigned long node[MAX_NUMNODES];
1750 };
1751
1752 static void gather_stats(struct page *page, void *private, int pte_dirty)
1753 {
1754         struct numa_maps *md = private;
1755         int count = page_mapcount(page);
1756
1757         md->pages++;
1758         if (pte_dirty || PageDirty(page))
1759                 md->dirty++;
1760
1761         if (PageSwapCache(page))
1762                 md->swapcache++;
1763
1764         if (PageActive(page))
1765                 md->active++;
1766
1767         if (PageWriteback(page))
1768                 md->writeback++;
1769
1770         if (PageAnon(page))
1771                 md->anon++;
1772
1773         if (count > md->mapcount_max)
1774                 md->mapcount_max = count;
1775
1776         md->node[page_to_nid(page)]++;
1777 }
1778
1779 #ifdef CONFIG_HUGETLB_PAGE
1780 static void check_huge_range(struct vm_area_struct *vma,
1781                 unsigned long start, unsigned long end,
1782                 struct numa_maps *md)
1783 {
1784         unsigned long addr;
1785         struct page *page;
1786
1787         for (addr = start; addr < end; addr += HPAGE_SIZE) {
1788                 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1789                 pte_t pte;
1790
1791                 if (!ptep)
1792                         continue;
1793
1794                 pte = *ptep;
1795                 if (pte_none(pte))
1796                         continue;
1797
1798                 page = pte_page(pte);
1799                 if (!page)
1800                         continue;
1801
1802                 gather_stats(page, md, pte_dirty(*ptep));
1803         }
1804 }
1805 #else
1806 static inline void check_huge_range(struct vm_area_struct *vma,
1807                 unsigned long start, unsigned long end,
1808                 struct numa_maps *md)
1809 {
1810 }
1811 #endif
1812
1813 int show_numa_map(struct seq_file *m, void *v)
1814 {
1815         struct task_struct *task = m->private;
1816         struct vm_area_struct *vma = v;
1817         struct numa_maps *md;
1818         struct file *file = vma->vm_file;
1819         struct mm_struct *mm = vma->vm_mm;
1820         int n;
1821         char buffer[50];
1822
1823         if (!mm)
1824                 return 0;
1825
1826         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1827         if (!md)
1828                 return 0;
1829
1830         mpol_to_str(buffer, sizeof(buffer),
1831                         get_vma_policy(task, vma, vma->vm_start));
1832
1833         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1834
1835         if (file) {
1836                 seq_printf(m, " file=");
1837                 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
1838         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1839                 seq_printf(m, " heap");
1840         } else if (vma->vm_start <= mm->start_stack &&
1841                         vma->vm_end >= mm->start_stack) {
1842                 seq_printf(m, " stack");
1843         }
1844
1845         if (is_vm_hugetlb_page(vma)) {
1846                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1847                 seq_printf(m, " huge");
1848         } else {
1849                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1850                                 &node_online_map, MPOL_MF_STATS, md);
1851         }
1852
1853         if (!md->pages)
1854                 goto out;
1855
1856         if (md->anon)
1857                 seq_printf(m," anon=%lu",md->anon);
1858
1859         if (md->dirty)
1860                 seq_printf(m," dirty=%lu",md->dirty);
1861
1862         if (md->pages != md->anon && md->pages != md->dirty)
1863                 seq_printf(m, " mapped=%lu", md->pages);
1864
1865         if (md->mapcount_max > 1)
1866                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1867
1868         if (md->swapcache)
1869                 seq_printf(m," swapcache=%lu", md->swapcache);
1870
1871         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1872                 seq_printf(m," active=%lu", md->active);
1873
1874         if (md->writeback)
1875                 seq_printf(m," writeback=%lu", md->writeback);
1876
1877         for_each_online_node(n)
1878                 if (md->node[n])
1879                         seq_printf(m, " N%d=%lu", n, md->node[n]);
1880 out:
1881         seq_putc(m, '\n');
1882         kfree(md);
1883
1884         if (m->count < m->size)
1885                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1886         return 0;
1887 }
1888