mm/mempolicy.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Simple NUMA memory policy for the Linux kernel.
   4  *
   5  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   6  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
   11  * Support five policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case NUMA_NO_NODE here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * preferred many Try a set of nodes first before normal fallback. This is
  35  *                similar to preferred without the special case.
  36  *
  37  * default        Allocate on the local node first, or when on a VMA
  38  *                use the process policy. This is what Linux always did
  39  *                in a NUMA aware kernel and still does by, ahem, default.
  40  *
  41  * The process policy is applied for most non interrupt memory allocations
  42  * in that process' context. Interrupts ignore the policies and always
  43  * try to allocate on the local CPU. The VMA policy is only applied for memory
  44  * allocations for a VMA in the VM.
  45  *
  46  * Currently there are a few corner cases in swapping where the policy
  47  * is not applied, but the majority should be handled. When process policy
  48  * is used it is not remembered over swap outs/swap ins.
  49  *
  50  * Only the highest zone in the zone hierarchy gets policied. Allocations
  51  * requesting a lower zone just use default policy. This implies that
   52  * on systems with highmem, kernel lowmem allocations don't get policied.
  53  * Same with GFP_DMA allocations.
  54  *
  55  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  56  * all users and remembered even when nobody has memory mapped.
  57  */
  58
  59 /* Notebook:
  60    fix mmap readahead to honour policy and enable policy for any page cache
  61    object
  62    statistics for bigpages
  63    global policy for page cache? currently it uses process policy. Requires
  64    first item above.
  65    handle mremap for shared memory (currently ignored for the policy)
  66    grows down?
  67    make bind policy root only? It can trigger oom much faster and the
  68    kernel is not always grateful with that.
  69 */
  70
  71 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  72
  73 #include <linux/mempolicy.h>
  74 #include <linux/pagewalk.h>
  75 #include <linux/highmem.h>
  76 #include <linux/hugetlb.h>
  77 #include <linux/kernel.h>
  78 #include <linux/sched.h>
  79 #include <linux/sched/mm.h>
  80 #include <linux/sched/numa_balancing.h>
  81 #include <linux/sched/task.h>
  82 #include <linux/nodemask.h>
  83 #include <linux/cpuset.h>
  84 #include <linux/slab.h>
  85 #include <linux/string.h>
  86 #include <linux/export.h>
  87 #include <linux/nsproxy.h>
  88 #include <linux/interrupt.h>
  89 #include <linux/init.h>
  90 #include <linux/compat.h>
  91 #include <linux/ptrace.h>
  92 #include <linux/swap.h>
  93 #include <linux/seq_file.h>
  94 #include <linux/proc_fs.h>
  95 #include <linux/migrate.h>
  96 #include <linux/ksm.h>
  97 #include <linux/rmap.h>
  98 #include <linux/security.h>
  99 #include <linux/syscalls.h>
 100 #include <linux/ctype.h>
 101 #include <linux/mm_inline.h>
 102 #include <linux/mmu_notifier.h>
 103 #include <linux/printk.h>
 104 #include <linux/swapops.h>
 105
 106 #include <asm/tlbflush.h>
 107 #include <asm/tlb.h>
 108 #include <linux/uaccess.h>
 109
 110 #include "internal.h"
 111
 112 /* Internal flags */
 113 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 114 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 115
 116 static struct kmem_cache *policy_cache;
 117 static struct kmem_cache *sn_cache;
 118
 119 /* Highest zone. An specific allocation for a zone below that is not
 120    policied. */
 121 enum zone_type policy_zone = 0;
 122
 123 /*
 124  * run-time system-wide default policy => local allocation
 125  */
 126 static struct mempolicy default_policy = {
 127         .refcnt = ATOMIC_INIT(1), /* never free it */
 128         .mode = MPOL_LOCAL,
 129 };
 130
 131 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 132
 133 /**
 134  * numa_map_to_online_node - Find closest online node
 135  * @node: Node id to start the search
 136  *
 137  * Lookup the next closest node by distance if @nid is not online.
 138  *
 139  * Return: this @node if it is online, otherwise the closest node by distance
 140  */
 141 int numa_map_to_online_node(int node)
 142 {
 143         int min_dist = INT_MAX, dist, n, min_node;
 144
 145         if (node == NUMA_NO_NODE || node_online(node))
 146                 return node;
 147
 148         min_node = node;
 149         for_each_online_node(n) {
 150                 dist = node_distance(node, n);
 151                 if (dist < min_dist) {
 152                         min_dist = dist;
 153                         min_node = n;
 154                 }
 155         }
 156
 157         return min_node;
 158 }
 159 EXPORT_SYMBOL_GPL(numa_map_to_online_node);
 160
 161 struct mempolicy *get_task_policy(struct task_struct *p)
 162 {
 163         struct mempolicy *pol = p->mempolicy;
 164         int node;
 165
 166         if (pol)
 167                 return pol;
 168
 169         node = numa_node_id();
 170         if (node != NUMA_NO_NODE) {
 171                 pol = &preferred_node_policy[node];
 172                 /* preferred_node_policy is not initialised early in boot */
 173                 if (pol->mode)
 174                         return pol;
 175         }
 176
 177         return &default_policy;
 178 }
 179
 180 static const struct mempolicy_operations {
 181         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 182         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 183 } mpol_ops[MPOL_MAX];
 184
 185 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 186 {
 187         return pol->flags & MPOL_MODE_FLAGS;
 188 }
 189
 190 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 191                                    const nodemask_t *rel)
 192 {
 193         nodemask_t tmp;
 194         nodes_fold(tmp, *orig, nodes_weight(*rel));
 195         nodes_onto(*ret, tmp, *rel);
 196 }
 197
 198 static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 199 {
 200         if (nodes_empty(*nodes))
 201                 return -EINVAL;
 202         pol->nodes = *nodes;
 203         return 0;
 204 }
 205
 206 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 207 {
 208         if (nodes_empty(*nodes))
 209                 return -EINVAL;
 210
 211         nodes_clear(pol->nodes);
 212         node_set(first_node(*nodes), pol->nodes);
 213         return 0;
 214 }
 215
 216 /*
 217  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 218  * any, for the new policy.  mpol_new() has already validated the nodes
 219  * parameter with respect to the policy mode and flags.
 220  *
 221  * Must be called holding task's alloc_lock to protect task's mems_allowed
 222  * and mempolicy.  May also be called holding the mmap_lock for write.
 223  */
 224 static int mpol_set_nodemask(struct mempolicy *pol,
 225                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 226 {
 227         int ret;
 228
 229         /*
 230          * Default (pol==NULL) resp. local memory policies are not a
 231          * subject of any remapping. They also do not need any special
 232          * constructor.
 233          */
 234         if (!pol || pol->mode == MPOL_LOCAL)
 235                 return 0;
 236
 237         /* Check N_MEMORY */
 238         nodes_and(nsc->mask1,
 239                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
 240
 241         VM_BUG_ON(!nodes);
 242
 243         if (pol->flags & MPOL_F_RELATIVE_NODES)
 244                 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
 245         else
 246                 nodes_and(nsc->mask2, *nodes, nsc->mask1);
 247
 248         if (mpol_store_user_nodemask(pol))
 249                 pol->w.user_nodemask = *nodes;
 250         else
 251                 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
 252
 253         ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 254         return ret;
 255 }
 256
 257 /*
 258  * This function just creates a new policy, does some check and simple
 259  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 260  */
 261 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 262                                   nodemask_t *nodes)
 263 {
 264         struct mempolicy *policy;
 265
 266         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 267                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 268
 269         if (mode == MPOL_DEFAULT) {
 270                 if (nodes && !nodes_empty(*nodes))
 271                         return ERR_PTR(-EINVAL);
 272                 return NULL;
 273         }
 274         VM_BUG_ON(!nodes);
 275
 276         /*
 277          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 278          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 279          * All other modes require a valid pointer to a non-empty nodemask.
 280          */
 281         if (mode == MPOL_PREFERRED) {
 282                 if (nodes_empty(*nodes)) {
 283                         if (((flags & MPOL_F_STATIC_NODES) ||
 284                              (flags & MPOL_F_RELATIVE_NODES)))
 285                                 return ERR_PTR(-EINVAL);
 286
 287                         mode = MPOL_LOCAL;
 288                 }
 289         } else if (mode == MPOL_LOCAL) {
 290                 if (!nodes_empty(*nodes) ||
 291                     (flags & MPOL_F_STATIC_NODES) ||
 292                     (flags & MPOL_F_RELATIVE_NODES))
 293                         return ERR_PTR(-EINVAL);
 294         } else if (nodes_empty(*nodes))
 295                 return ERR_PTR(-EINVAL);
 296         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 297         if (!policy)
 298                 return ERR_PTR(-ENOMEM);
 299         atomic_set(&policy->refcnt, 1);
 300         policy->mode = mode;
 301         policy->flags = flags;
 302         policy->home_node = NUMA_NO_NODE;
 303
 304         return policy;
 305 }
 306
 307 /* Slow path of a mpol destructor. */
 308 void __mpol_put(struct mempolicy *p)
 309 {
 310         if (!atomic_dec_and_test(&p->refcnt))
 311                 return;
 312         kmem_cache_free(policy_cache, p);
 313 }
 314
 315 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 316 {
 317 }
 318
 319 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
 320 {
 321         nodemask_t tmp;
 322
 323         if (pol->flags & MPOL_F_STATIC_NODES)
 324                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 325         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 326                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 327         else {
 328                 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
 329                                                                 *nodes);
 330                 pol->w.cpuset_mems_allowed = *nodes;
 331         }
 332
 333         if (nodes_empty(tmp))
 334                 tmp = *nodes;
 335
 336         pol->nodes = tmp;
 337 }
 338
 339 static void mpol_rebind_preferred(struct mempolicy *pol,
 340                                                 const nodemask_t *nodes)
 341 {
 342         pol->w.cpuset_mems_allowed = *nodes;
 343 }
 344
 345 /*
 346  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 347  *
 348  * Per-vma policies are protected by mmap_lock. Allocations using per-task
 349  * policies are protected by task->mems_allowed_seq to prevent a premature
 350  * OOM/allocation failure due to parallel nodemask modification.
 351  */
 352 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
 353 {
 354         if (!pol || pol->mode == MPOL_LOCAL)
 355                 return;
 356         if (!mpol_store_user_nodemask(pol) &&
 357             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 358                 return;
 359
 360         mpol_ops[pol->mode].rebind(pol, newmask);
 361 }
 362
 363 /*
 364  * Wrapper for mpol_rebind_policy() that just requires task
 365  * pointer, and updates task mempolicy.
 366  *
 367  * Called with task's alloc_lock held.
 368  */
 369
 370 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 371 {
 372         mpol_rebind_policy(tsk->mempolicy, new);
 373 }
 374
 375 /*
 376  * Rebind each vma in mm to new nodemask.
 377  *
 378  * Call holding a reference to mm.  Takes mm->mmap_lock during call.
 379  */
 380
 381 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 382 {
 383         struct vm_area_struct *vma;
 384         VMA_ITERATOR(vmi, mm, 0);
 385
 386         mmap_write_lock(mm);
 387         for_each_vma(vmi, vma)
 388                 mpol_rebind_policy(vma->vm_policy, new);
 389         mmap_write_unlock(mm);
 390 }
 391
 392 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 393         [MPOL_DEFAULT] = {
 394                 .rebind = mpol_rebind_default,
 395         },
 396         [MPOL_INTERLEAVE] = {
 397                 .create = mpol_new_nodemask,
 398                 .rebind = mpol_rebind_nodemask,
 399         },
 400         [MPOL_PREFERRED] = {
 401                 .create = mpol_new_preferred,
 402                 .rebind = mpol_rebind_preferred,
 403         },
 404         [MPOL_BIND] = {
 405                 .create = mpol_new_nodemask,
 406                 .rebind = mpol_rebind_nodemask,
 407         },
 408         [MPOL_LOCAL] = {
 409                 .rebind = mpol_rebind_default,
 410         },
 411         [MPOL_PREFERRED_MANY] = {
 412                 .create = mpol_new_nodemask,
 413                 .rebind = mpol_rebind_preferred,
 414         },
 415 };
 416
 417 static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
 418                                 unsigned long flags);
 419
 420 struct queue_pages {
 421         struct list_head *pagelist;
 422         unsigned long flags;
 423         nodemask_t *nmask;
 424         unsigned long start;
 425         unsigned long end;
 426         struct vm_area_struct *first;
 427 };
 428
 429 /*
 430  * Check if the folio's nid is in qp->nmask.
 431  *
 432  * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 433  * in the invert of qp->nmask.
 434  */
 435 static inline bool queue_folio_required(struct folio *folio,
 436                                         struct queue_pages *qp)
 437 {
 438         int nid = folio_nid(folio);
 439         unsigned long flags = qp->flags;
 440
 441         return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
 442 }
 443
 444 /*
 445  * queue_folios_pmd() has three possible return values:
 446  * 0 - folios are placed on the right node or queued successfully, or
 447  *     special page is met, i.e. huge zero page.
 448  * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 449  *     specified.
 450  * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 451  *        existing folio was already on a node that does not follow the
 452  *        policy.
 453  */
 454 static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
 455                                 unsigned long end, struct mm_walk *walk)
 456         __releases(ptl)
 457 {
 458         int ret = 0;
 459         struct folio *folio;
 460         struct queue_pages *qp = walk->private;
 461         unsigned long flags;
 462
 463         if (unlikely(is_pmd_migration_entry(*pmd))) {
 464                 ret = -EIO;
 465                 goto unlock;
 466         }
 467         folio = pfn_folio(pmd_pfn(*pmd));
 468         if (is_huge_zero_page(&folio->page)) {
 469                 walk->action = ACTION_CONTINUE;
 470                 goto unlock;
 471         }
 472         if (!queue_folio_required(folio, qp))
 473                 goto unlock;
 474
 475         flags = qp->flags;
 476         /* go to folio migration */
 477         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 478                 if (!vma_migratable(walk->vma) ||
 479                     migrate_folio_add(folio, qp->pagelist, flags)) {
 480                         ret = 1;
 481                         goto unlock;
 482                 }
 483         } else
 484                 ret = -EIO;
 485 unlock:
 486         spin_unlock(ptl);
 487         return ret;
 488 }
 489
 490 /*
 491  * Scan through pages checking if pages follow certain conditions,
 492  * and move them to the pagelist if they do.
 493  *
 494  * queue_folios_pte_range() has three possible return values:
 495  * 0 - folios are placed on the right node or queued successfully, or
 496  *     special page is met, i.e. zero page.
 497  * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 498  *     specified.
 499  * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
 500  *        on a node that does not follow the policy.
 501  */
 502 static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
 503                         unsigned long end, struct mm_walk *walk)
 504 {
 505         struct vm_area_struct *vma = walk->vma;
 506         struct folio *folio;
 507         struct queue_pages *qp = walk->private;
 508         unsigned long flags = qp->flags;
 509         bool has_unmovable = false;
 510         pte_t *pte, *mapped_pte;
 511         spinlock_t *ptl;
 512
 513         ptl = pmd_trans_huge_lock(pmd, vma);
 514         if (ptl)
 515                 return queue_folios_pmd(pmd, ptl, addr, end, walk);
 516
 517         if (pmd_trans_unstable(pmd))
 518                 return 0;
 519
 520         mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 521         for (; addr != end; pte++, addr += PAGE_SIZE) {
 522                 if (!pte_present(*pte))
 523                         continue;
 524                 folio = vm_normal_folio(vma, addr, *pte);
 525                 if (!folio || folio_is_zone_device(folio))
 526                         continue;
 527                 /*
 528                  * vm_normal_folio() filters out zero pages, but there might
 529                  * still be reserved folios to skip, perhaps in a VDSO.
 530                  */
 531                 if (folio_test_reserved(folio))
 532                         continue;
 533                 if (!queue_folio_required(folio, qp))
 534                         continue;
 535                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 536                         /* MPOL_MF_STRICT must be specified if we get here */
 537                         if (!vma_migratable(vma)) {
 538                                 has_unmovable = true;
 539                                 break;
 540                         }
 541
 542                         /*
 543                          * Do not abort immediately since there may be
 544                          * temporary off LRU pages in the range.  Still
 545                          * need migrate other LRU pages.
 546                          */
 547                         if (migrate_folio_add(folio, qp->pagelist, flags))
 548                                 has_unmovable = true;
 549                 } else
 550                         break;
 551         }
 552         pte_unmap_unlock(mapped_pte, ptl);
 553         cond_resched();
 554
 555         if (has_unmovable)
 556                 return 1;
 557
 558         return addr != end ? -EIO : 0;
 559 }
 560
 561 static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
 562                                unsigned long addr, unsigned long end,
 563                                struct mm_walk *walk)
 564 {
 565         int ret = 0;
 566 #ifdef CONFIG_HUGETLB_PAGE
 567         struct queue_pages *qp = walk->private;
 568         unsigned long flags = (qp->flags & MPOL_MF_VALID);
 569         struct folio *folio;
 570         spinlock_t *ptl;
 571         pte_t entry;
 572
 573         ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
 574         entry = huge_ptep_get(pte);
 575         if (!pte_present(entry))
 576                 goto unlock;
 577         folio = pfn_folio(pte_pfn(entry));
 578         if (!queue_folio_required(folio, qp))
 579                 goto unlock;
 580
 581         if (flags == MPOL_MF_STRICT) {
 582                 /*
 583                  * STRICT alone means only detecting misplaced folio and no
 584                  * need to further check other vma.
 585                  */
 586                 ret = -EIO;
 587                 goto unlock;
 588         }
 589
 590         if (!vma_migratable(walk->vma)) {
 591                 /*
 592                  * Must be STRICT with MOVE*, otherwise .test_walk() have
 593                  * stopped walking current vma.
 594                  * Detecting misplaced folio but allow migrating folios which
 595                  * have been queued.
 596                  */
 597                 ret = 1;
 598                 goto unlock;
 599         }
 600
 601         /*
 602          * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it
 603          * is shared it is likely not worth migrating.
 604          *
 605          * To check if the folio is shared, ideally we want to make sure
 606          * every page is mapped to the same process. Doing that is very
 607          * expensive, so check the estimated mapcount of the folio instead.
 608          */
 609         if (flags & (MPOL_MF_MOVE_ALL) ||
 610             (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 &&
 611              !hugetlb_pmd_shared(pte))) {
 612                 if (!isolate_hugetlb(folio, qp->pagelist) &&
 613                         (flags & MPOL_MF_STRICT))
 614                         /*
 615                          * Failed to isolate folio but allow migrating pages
 616                          * which have been queued.
 617                          */
 618                         ret = 1;
 619         }
 620 unlock:
 621         spin_unlock(ptl);
 622 #else
 623         BUG();
 624 #endif
 625         return ret;
 626 }
 627
 628 #ifdef CONFIG_NUMA_BALANCING
 629 /*
 630  * This is used to mark a range of virtual addresses to be inaccessible.
 631  * These are later cleared by a NUMA hinting fault. Depending on these
 632  * faults, pages may be migrated for better NUMA placement.
 633  *
 634  * This is assuming that NUMA faults are handled using PROT_NONE. If
 635  * an architecture makes a different choice, it will need further
 636  * changes to the core.
 637  */
 638 unsigned long change_prot_numa(struct vm_area_struct *vma,
 639                         unsigned long addr, unsigned long end)
 640 {
 641         struct mmu_gather tlb;
 642         long nr_updated;
 643
 644         tlb_gather_mmu(&tlb, vma->vm_mm);
 645
 646         nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
 647         if (nr_updated > 0)
 648                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 649
 650         tlb_finish_mmu(&tlb);
 651
 652         return nr_updated;
 653 }
 654 #else
 655 static unsigned long change_prot_numa(struct vm_area_struct *vma,
 656                         unsigned long addr, unsigned long end)
 657 {
 658         return 0;
 659 }
 660 #endif /* CONFIG_NUMA_BALANCING */
 661
 662 static int queue_pages_test_walk(unsigned long start, unsigned long end,
 663                                 struct mm_walk *walk)
 664 {
 665         struct vm_area_struct *next, *vma = walk->vma;
 666         struct queue_pages *qp = walk->private;
 667         unsigned long endvma = vma->vm_end;
 668         unsigned long flags = qp->flags;
 669
 670         /* range check first */
 671         VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
 672
 673         if (!qp->first) {
 674                 qp->first = vma;
 675                 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
 676                         (qp->start < vma->vm_start))
 677                         /* hole at head side of range */
 678                         return -EFAULT;
 679         }
 680         next = find_vma(vma->vm_mm, vma->vm_end);
 681         if (!(flags & MPOL_MF_DISCONTIG_OK) &&
 682                 ((vma->vm_end < qp->end) &&
 683                 (!next || vma->vm_end < next->vm_start)))
 684                 /* hole at middle or tail of range */
 685                 return -EFAULT;
 686
 687         /*
 688          * Need check MPOL_MF_STRICT to return -EIO if possible
 689          * regardless of vma_migratable
 690          */
 691         if (!vma_migratable(vma) &&
 692             !(flags & MPOL_MF_STRICT))
 693                 return 1;
 694
 695         if (endvma > end)
 696                 endvma = end;
 697
 698         if (flags & MPOL_MF_LAZY) {
 699                 /* Similar to task_numa_work, skip inaccessible VMAs */
 700                 if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
 701                         !(vma->vm_flags & VM_MIXEDMAP))
 702                         change_prot_numa(vma, start, endvma);
 703                 return 1;
 704         }
 705
 706         /* queue pages from current vma */
 707         if (flags & MPOL_MF_VALID)
 708                 return 0;
 709         return 1;
 710 }
 711
 712 static const struct mm_walk_ops queue_pages_walk_ops = {
 713         .hugetlb_entry          = queue_folios_hugetlb,
 714         .pmd_entry              = queue_folios_pte_range,
 715         .test_walk              = queue_pages_test_walk,
 716 };
 717
 718 /*
 719  * Walk through page tables and collect pages to be migrated.
 720  *
 721  * If pages found in a given range are on a set of nodes (determined by
 722  * @nodes and @flags,) it's isolated and queued to the pagelist which is
 723  * passed via @private.
 724  *
 725  * queue_pages_range() has three possible return values:
 726  * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 727  *     specified.
 728  * 0 - queue pages successfully or no misplaced page.
 729  * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 730  *         memory range specified by nodemask and maxnode points outside
 731  *         your accessible address space (-EFAULT)
 732  */
 733 static int
 734 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 735                 nodemask_t *nodes, unsigned long flags,
 736                 struct list_head *pagelist)
 737 {
 738         int err;
 739         struct queue_pages qp = {
 740                 .pagelist = pagelist,
 741                 .flags = flags,
 742                 .nmask = nodes,
 743                 .start = start,
 744                 .end = end,
 745                 .first = NULL,
 746         };
 747
 748         err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
 749
 750         if (!qp.first)
 751                 /* whole range in hole */
 752                 err = -EFAULT;
 753
 754         return err;
 755 }
 756
 757 /*
 758  * Apply policy to a single VMA
 759  * This must be called with the mmap_lock held for writing.
 760  */
 761 static int vma_replace_policy(struct vm_area_struct *vma,
 762                                                 struct mempolicy *pol)
 763 {
 764         int err;
 765         struct mempolicy *old;
 766         struct mempolicy *new;
 767
 768         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 769                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 770                  vma->vm_ops, vma->vm_file,
 771                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 772
 773         new = mpol_dup(pol);
 774         if (IS_ERR(new))
 775                 return PTR_ERR(new);
 776
 777         if (vma->vm_ops && vma->vm_ops->set_policy) {
 778                 err = vma->vm_ops->set_policy(vma, new);
 779                 if (err)
 780                         goto err_out;
 781         }
 782
 783         old = vma->vm_policy;
 784         vma->vm_policy = new; /* protected by mmap_lock */
 785         mpol_put(old);
 786
 787         return 0;
 788  err_out:
 789         mpol_put(new);
 790         return err;
 791 }
 792
 793 /* Step 2: apply policy to a range and do splits. */
 794 static int mbind_range(struct mm_struct *mm, unsigned long start,
 795                        unsigned long end, struct mempolicy *new_pol)
 796 {
 797         VMA_ITERATOR(vmi, mm, start);
 798         struct vm_area_struct *prev;
 799         struct vm_area_struct *vma;
 800         int err = 0;
 801         pgoff_t pgoff;
 802
 803         prev = vma_prev(&vmi);
 804         vma = vma_find(&vmi, end);
 805         if (WARN_ON(!vma))
 806                 return 0;
 807
 808         if (start > vma->vm_start)
 809                 prev = vma;
 810
 811         do {
 812                 unsigned long vmstart = max(start, vma->vm_start);
 813                 unsigned long vmend = min(end, vma->vm_end);
 814
 815                 if (mpol_equal(vma_policy(vma), new_pol))
 816                         goto next;
 817
 818                 pgoff = vma->vm_pgoff +
 819                         ((vmstart - vma->vm_start) >> PAGE_SHIFT);
 820                 prev = vma_merge(&vmi, mm, prev, vmstart, vmend, vma->vm_flags,
 821                                  vma->anon_vma, vma->vm_file, pgoff,
 822                                  new_pol, vma->vm_userfaultfd_ctx,
 823                                  anon_vma_name(vma));
 824                 if (prev) {
 825                         vma = prev;
 826                         goto replace;
 827                 }
 828                 if (vma->vm_start != vmstart) {
 829                         err = split_vma(&vmi, vma, vmstart, 1);
 830                         if (err)
 831                                 goto out;
 832                 }
 833                 if (vma->vm_end != vmend) {
 834                         err = split_vma(&vmi, vma, vmend, 0);
 835                         if (err)
 836                                 goto out;
 837                 }
 838 replace:
 839                 err = vma_replace_policy(vma, new_pol);
 840                 if (err)
 841                         goto out;
 842 next:
 843                 prev = vma;
 844         } for_each_vma_range(vmi, vma, end);
 845
 846 out:
 847         return err;
 848 }
 849
 850 /* Set the process memory policy */
 851 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 852                              nodemask_t *nodes)
 853 {
 854         struct mempolicy *new, *old;
 855         NODEMASK_SCRATCH(scratch);
 856         int ret;
 857
 858         if (!scratch)
 859                 return -ENOMEM;
 860
 861         new = mpol_new(mode, flags, nodes);
 862         if (IS_ERR(new)) {
 863                 ret = PTR_ERR(new);
 864                 goto out;
 865         }
 866
 867         task_lock(current);
 868         ret = mpol_set_nodemask(new, nodes, scratch);
 869         if (ret) {
 870                 task_unlock(current);
 871                 mpol_put(new);
 872                 goto out;
 873         }
 874
 875         old = current->mempolicy;
 876         current->mempolicy = new;
 877         if (new && new->mode == MPOL_INTERLEAVE)
 878                 current->il_prev = MAX_NUMNODES-1;
 879         task_unlock(current);
 880         mpol_put(old);
 881         ret = 0;
 882 out:
 883         NODEMASK_SCRATCH_FREE(scratch);
 884         return ret;
 885 }
 886
 887 /*
 888  * Return nodemask for policy for get_mempolicy() query
 889  *
 890  * Called with task's alloc_lock held
 891  */
 892 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 893 {
 894         nodes_clear(*nodes);
 895         if (p == &default_policy)
 896                 return;
 897
 898         switch (p->mode) {
 899         case MPOL_BIND:
 900         case MPOL_INTERLEAVE:
 901         case MPOL_PREFERRED:
 902         case MPOL_PREFERRED_MANY:
 903                 *nodes = p->nodes;
 904                 break;
 905         case MPOL_LOCAL:
 906                 /* return empty node mask for local allocation */
 907                 break;
 908         default:
 909                 BUG();
 910         }
 911 }
 912
 913 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 914 {
 915         struct page *p = NULL;
 916         int ret;
 917
 918         ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
 919         if (ret > 0) {
 920                 ret = page_to_nid(p);
 921                 put_page(p);
 922         }
 923         return ret;
 924 }
 925
 926 /* Retrieve NUMA policy */
 927 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 928                              unsigned long addr, unsigned long flags)
 929 {
 930         int err;
 931         struct mm_struct *mm = current->mm;
 932         struct vm_area_struct *vma = NULL;
 933         struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
 934
 935         if (flags &
 936                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 937                 return -EINVAL;
 938
 939         if (flags & MPOL_F_MEMS_ALLOWED) {
 940                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 941                         return -EINVAL;
 942                 *policy = 0;    /* just so it's initialized */
 943                 task_lock(current);
 944                 *nmask  = cpuset_current_mems_allowed;
 945                 task_unlock(current);
 946                 return 0;
 947         }
 948
 949         if (flags & MPOL_F_ADDR) {
 950                 /*
 951                  * Do NOT fall back to task policy if the
 952                  * vma/shared policy at addr is NULL.  We
 953                  * want to return MPOL_DEFAULT in this case.
 954                  */
 955                 mmap_read_lock(mm);
 956                 vma = vma_lookup(mm, addr);
 957                 if (!vma) {
 958                         mmap_read_unlock(mm);
 959                         return -EFAULT;
 960                 }
 961                 if (vma->vm_ops && vma->vm_ops->get_policy)
 962                         pol = vma->vm_ops->get_policy(vma, addr);
 963                 else
 964                         pol = vma->vm_policy;
 965         } else if (addr)
 966                 return -EINVAL;
 967
 968         if (!pol)
 969                 pol = &default_policy;  /* indicates default behavior */
 970
 971         if (flags & MPOL_F_NODE) {
 972                 if (flags & MPOL_F_ADDR) {
 973                         /*
 974                          * Take a refcount on the mpol, because we are about to
 975                          * drop the mmap_lock, after which only "pol" remains
 976                          * valid, "vma" is stale.
 977                          */
 978                         pol_refcount = pol;
 979                         vma = NULL;
 980                         mpol_get(pol);
 981                         mmap_read_unlock(mm);
 982                         err = lookup_node(mm, addr);
 983                         if (err < 0)
 984                                 goto out;
 985                         *policy = err;
 986                 } else if (pol == current->mempolicy &&
 987                                 pol->mode == MPOL_INTERLEAVE) {
 988                         *policy = next_node_in(current->il_prev, pol->nodes);
 989                 } else {
 990                         err = -EINVAL;
 991                         goto out;
 992                 }
 993         } else {
 994                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 995                                                 pol->mode;
 996                 /*
 997                  * Internal mempolicy flags must be masked off before exposing
 998                  * the policy to userspace.
 999                  */
1000                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
1001         }
1002
1003         err = 0;
1004         if (nmask) {
1005                 if (mpol_store_user_nodemask(pol)) {
1006                         *nmask = pol->w.user_nodemask;
1007                 } else {
1008                         task_lock(current);
1009                         get_policy_nodemask(pol, nmask);
1010                         task_unlock(current);
1011                 }
1012         }
1013
1014  out:
1015         mpol_cond_put(pol);
1016         if (vma)
1017                 mmap_read_unlock(mm);
1018         if (pol_refcount)
1019                 mpol_put(pol_refcount);
1020         return err;
1021 }
1022
1023 #ifdef CONFIG_MIGRATION
1024 static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1025                                 unsigned long flags)
1026 {
1027         /*
1028          * We try to migrate only unshared folios. If it is shared it
1029          * is likely not worth migrating.
1030          *
1031          * To check if the folio is shared, ideally we want to make sure
1032          * every page is mapped to the same process. Doing that is very
1033          * expensive, so check the estimated mapcount of the folio instead.
1034          */
1035         if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
1036                 if (folio_isolate_lru(folio)) {
1037                         list_add_tail(&folio->lru, foliolist);
1038                         node_stat_mod_folio(folio,
1039                                 NR_ISOLATED_ANON + folio_is_file_lru(folio),
1040                                 folio_nr_pages(folio));
1041                 } else if (flags & MPOL_MF_STRICT) {
1042                         /*
1043                          * Non-movable folio may reach here.  And, there may be
1044                          * temporary off LRU folios or non-LRU movable folios.
1045                          * Treat them as unmovable folios since they can't be
1046                          * isolated, so they can't be moved at the moment.  It
1047                          * should return -EIO for this case too.
1048                          */
1049                         return -EIO;
1050                 }
1051         }
1052
1053         return 0;
1054 }
1055
1056 /*
1057  * Migrate pages from one node to a target node.
1058  * Returns error or the number of pages not migrated.
1059  */
1060 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1061                            int flags)
1062 {
1063         nodemask_t nmask;
1064         struct vm_area_struct *vma;
1065         LIST_HEAD(pagelist);
1066         int err = 0;
1067         struct migration_target_control mtc = {
1068                 .nid = dest,
1069                 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1070         };
1071
1072         nodes_clear(nmask);
1073         node_set(source, nmask);
1074
1075         /*
1076          * This does not "check" the range but isolates all pages that
1077          * need migration.  Between passing in the full user address
1078          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1079          */
1080         vma = find_vma(mm, 0);
1081         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1082         queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1083                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1084
1085         if (!list_empty(&pagelist)) {
1086                 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1087                                 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
1088                 if (err)
1089                         putback_movable_pages(&pagelist);
1090         }
1091
1092         return err;
1093 }
1094
1095 /*
1096  * Move pages between the two nodesets so as to preserve the physical
1097  * layout as much as possible.
1098  *
1099  * Returns the number of page that could not be moved.
1100  */
1101 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1102                      const nodemask_t *to, int flags)
1103 {
1104         int busy = 0;
1105         int err = 0;
1106         nodemask_t tmp;
1107
1108         lru_cache_disable();
1109
1110         mmap_read_lock(mm);
1111
1112         /*
1113          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1114          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1115          * bit in 'tmp', and return that <source, dest> pair for migration.
1116          * The pair of nodemasks 'to' and 'from' define the map.
1117          *
1118          * If no pair of bits is found that way, fallback to picking some
1119          * pair of 'source' and 'dest' bits that are not the same.  If the
1120          * 'source' and 'dest' bits are the same, this represents a node
1121          * that will be migrating to itself, so no pages need move.
1122          *
1123          * If no bits are left in 'tmp', or if all remaining bits left
1124          * in 'tmp' correspond to the same bit in 'to', return false
1125          * (nothing left to migrate).
1126          *
1127          * This lets us pick a pair of nodes to migrate between, such that
1128          * if possible the dest node is not already occupied by some other
1129          * source node, minimizing the risk of overloading the memory on a
1130          * node that would happen if we migrated incoming memory to a node
1131          * before migrating outgoing memory source that same node.
1132          *
1133          * A single scan of tmp is sufficient.  As we go, we remember the
1134          * most recent <s, d> pair that moved (s != d).  If we find a pair
1135          * that not only moved, but what's better, moved to an empty slot
1136          * (d is not set in tmp), then we break out then, with that pair.
1137          * Otherwise when we finish scanning from_tmp, we at least have the
1138          * most recent <s, d> pair that moved.  If we get all the way through
1139          * the scan of tmp without finding any node that moved, much less
1140          * moved to an empty node, then there is nothing left worth migrating.
1141          */
1142
1143         tmp = *from;
1144         while (!nodes_empty(tmp)) {
1145                 int s, d;
1146                 int source = NUMA_NO_NODE;
1147                 int dest = 0;
1148
1149                 for_each_node_mask(s, tmp) {
1150
1151                         /*
1152                          * do_migrate_pages() tries to maintain the relative
1153                          * node relationship of the pages established between
1154                          * threads and memory areas.
1155                          *
1156                          * However if the number of source nodes is not equal to
1157                          * the number of destination nodes we can not preserve
1158                          * this node relative relationship.  In that case, skip
1159                          * copying memory from a node that is in the destination
1160                          * mask.
1161                          *
1162                          * Example: [2,3,4] -> [3,4,5] moves everything.
1163                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1164                          */
1165
1166                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1167                                                 (node_isset(s, *to)))
1168                                 continue;
1169
1170                         d = node_remap(s, *from, *to);
1171                         if (s == d)
1172                                 continue;
1173
1174                         source = s;     /* Node moved. Memorize */
1175                         dest = d;
1176
1177                         /* dest not in remaining from nodes? */
1178                         if (!node_isset(dest, tmp))
1179                                 break;
1180                 }
1181                 if (source == NUMA_NO_NODE)
1182                         break;
1183
1184                 node_clear(source, tmp);
1185                 err = migrate_to_node(mm, source, dest, flags);
1186                 if (err > 0)
1187                         busy += err;
1188                 if (err < 0)
1189                         break;
1190         }
1191         mmap_read_unlock(mm);
1192
1193         lru_cache_enable();
1194         if (err < 0)
1195                 return err;
1196         return busy;
1197
1198 }
1199
1200 /*
1201  * Allocate a new page for page migration based on vma policy.
1202  * Start by assuming the page is mapped by the same vma as contains @start.
1203  * Search forward from there, if not.  N.B., this assumes that the
1204  * list of pages handed to migrate_pages()--which is how we get here--
1205  * is in virtual address order.
1206  */
1207 static struct page *new_page(struct page *page, unsigned long start)
1208 {
1209         struct folio *dst, *src = page_folio(page);
1210         struct vm_area_struct *vma;
1211         unsigned long address;
1212         VMA_ITERATOR(vmi, current->mm, start);
1213         gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
1214
1215         for_each_vma(vmi, vma) {
1216                 address = page_address_in_vma(page, vma);
1217                 if (address != -EFAULT)
1218                         break;
1219         }
1220
1221         if (folio_test_hugetlb(src)) {
1222                 dst = alloc_hugetlb_folio_vma(folio_hstate(src),
1223                                 vma, address);
1224                 return &dst->page;
1225         }
1226
1227         if (folio_test_large(src))
1228                 gfp = GFP_TRANSHUGE;
1229
1230         /*
1231          * if !vma, vma_alloc_folio() will use task or system default policy
1232          */
1233         dst = vma_alloc_folio(gfp, folio_order(src), vma, address,
1234                         folio_test_large(src));
1235         return &dst->page;
1236 }
1237 #else
1238
1239 static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
1240                                 unsigned long flags)
1241 {
1242         return -EIO;
1243 }
1244
1245 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1246                      const nodemask_t *to, int flags)
1247 {
1248         return -ENOSYS;
1249 }
1250
1251 static struct page *new_page(struct page *page, unsigned long start)
1252 {
1253         return NULL;
1254 }
1255 #endif
1256
1257 static long do_mbind(unsigned long start, unsigned long len,
1258                      unsigned short mode, unsigned short mode_flags,
1259                      nodemask_t *nmask, unsigned long flags)
1260 {
1261         struct mm_struct *mm = current->mm;
1262         struct mempolicy *new;
1263         unsigned long end;
1264         int err;
1265         int ret;
1266         LIST_HEAD(pagelist);
1267
1268         if (flags & ~(unsigned long)MPOL_MF_VALID)
1269                 return -EINVAL;
1270         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1271                 return -EPERM;
1272
1273         if (start & ~PAGE_MASK)
1274                 return -EINVAL;
1275
1276         if (mode == MPOL_DEFAULT)
1277                 flags &= ~MPOL_MF_STRICT;
1278
1279         len = PAGE_ALIGN(len);
1280         end = start + len;
1281
1282         if (end < start)
1283                 return -EINVAL;
1284         if (end == start)
1285                 return 0;
1286
1287         new = mpol_new(mode, mode_flags, nmask);
1288         if (IS_ERR(new))
1289                 return PTR_ERR(new);
1290
1291         if (flags & MPOL_MF_LAZY)
1292                 new->flags |= MPOL_F_MOF;
1293
1294         /*
1295          * If we are using the default policy then operation
1296          * on discontinuous address spaces is okay after all
1297          */
1298         if (!new)
1299                 flags |= MPOL_MF_DISCONTIG_OK;
1300
1301         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1302                  start, start + len, mode, mode_flags,
1303                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1304
1305         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1306
1307                 lru_cache_disable();
1308         }
1309         {
1310                 NODEMASK_SCRATCH(scratch);
1311                 if (scratch) {
1312                         mmap_write_lock(mm);
1313                         err = mpol_set_nodemask(new, nmask, scratch);
1314                         if (err)
1315                                 mmap_write_unlock(mm);
1316                 } else
1317                         err = -ENOMEM;
1318                 NODEMASK_SCRATCH_FREE(scratch);
1319         }
1320         if (err)
1321                 goto mpol_out;
1322
1323         ret = queue_pages_range(mm, start, end, nmask,
1324                           flags | MPOL_MF_INVERT, &pagelist);
1325
1326         if (ret < 0) {
1327                 err = ret;
1328                 goto up_out;
1329         }
1330
1331         err = mbind_range(mm, start, end, new);
1332
1333         if (!err) {
1334                 int nr_failed = 0;
1335
1336                 if (!list_empty(&pagelist)) {
1337                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1338                         nr_failed = migrate_pages(&pagelist, new_page, NULL,
1339                                 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
1340                         if (nr_failed)
1341                                 putback_movable_pages(&pagelist);
1342                 }
1343
1344                 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1345                         err = -EIO;
1346         } else {
1347 up_out:
1348                 if (!list_empty(&pagelist))
1349                         putback_movable_pages(&pagelist);
1350         }
1351
1352         mmap_write_unlock(mm);
1353 mpol_out:
1354         mpol_put(new);
1355         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1356                 lru_cache_enable();
1357         return err;
1358 }
1359
1360 /*
1361  * User space interface with variable sized bitmaps for nodelists.
1362  */
1363 static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1364                       unsigned long maxnode)
1365 {
1366         unsigned long nlongs = BITS_TO_LONGS(maxnode);
1367         int ret;
1368
1369         if (in_compat_syscall())
1370                 ret = compat_get_bitmap(mask,
1371                                         (const compat_ulong_t __user *)nmask,
1372                                         maxnode);
1373         else
1374                 ret = copy_from_user(mask, nmask,
1375                                      nlongs * sizeof(unsigned long));
1376
1377         if (ret)
1378                 return -EFAULT;
1379
1380         if (maxnode % BITS_PER_LONG)
1381                 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1382
1383         return 0;
1384 }
1385
1386 /* Copy a node mask from user space. */
1387 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1388                      unsigned long maxnode)
1389 {
1390         --maxnode;
1391         nodes_clear(*nodes);
1392         if (maxnode == 0 || !nmask)
1393                 return 0;
1394         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1395                 return -EINVAL;
1396
1397         /*
1398          * When the user specified more nodes than supported just check
1399          * if the non supported part is all zero, one word at a time,
1400          * starting at the end.
1401          */
1402         while (maxnode > MAX_NUMNODES) {
1403                 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1404                 unsigned long t;
1405
1406                 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
1407                         return -EFAULT;
1408
1409                 if (maxnode - bits >= MAX_NUMNODES) {
1410                         maxnode -= bits;
1411                 } else {
1412                         maxnode = MAX_NUMNODES;
1413                         t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1414                 }
1415                 if (t)
1416                         return -EINVAL;
1417         }
1418
1419         return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
1420 }
1421
1422 /* Copy a kernel node mask to user space */
1423 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1424                               nodemask_t *nodes)
1425 {
1426         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1427         unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1428         bool compat = in_compat_syscall();
1429
1430         if (compat)
1431                 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
1432
1433         if (copy > nbytes) {
1434                 if (copy > PAGE_SIZE)
1435                         return -EINVAL;
1436                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1437                         return -EFAULT;
1438                 copy = nbytes;
1439                 maxnode = nr_node_ids;
1440         }
1441
1442         if (compat)
1443                 return compat_put_bitmap((compat_ulong_t __user *)mask,
1444                                          nodes_addr(*nodes), maxnode);
1445
1446         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1447 }
1448
1449 /* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1450 static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1451 {
1452         *flags = *mode & MPOL_MODE_FLAGS;
1453         *mode &= ~MPOL_MODE_FLAGS;
1454
1455         if ((unsigned int)(*mode) >=  MPOL_MAX)
1456                 return -EINVAL;
1457         if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1458                 return -EINVAL;
1459         if (*flags & MPOL_F_NUMA_BALANCING) {
1460                 if (*mode != MPOL_BIND)
1461                         return -EINVAL;
1462                 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1463         }
1464         return 0;
1465 }
1466
1467 static long kernel_mbind(unsigned long start, unsigned long len,
1468                          unsigned long mode, const unsigned long __user *nmask,
1469                          unsigned long maxnode, unsigned int flags)
1470 {
1471         unsigned short mode_flags;
1472         nodemask_t nodes;
1473         int lmode = mode;
1474         int err;
1475
1476         start = untagged_addr(start);
1477         err = sanitize_mpol_flags(&lmode, &mode_flags);
1478         if (err)
1479                 return err;
1480
1481         err = get_nodes(&nodes, nmask, maxnode);
1482         if (err)
1483                 return err;
1484
1485         return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
1486 }
1487
1488 SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1489                 unsigned long, home_node, unsigned long, flags)
1490 {
1491         struct mm_struct *mm = current->mm;
1492         struct vm_area_struct *vma;
1493         struct mempolicy *new, *old;
1494         unsigned long vmstart;
1495         unsigned long vmend;
1496         unsigned long end;
1497         int err = -ENOENT;
1498         VMA_ITERATOR(vmi, mm, start);
1499
1500         start = untagged_addr(start);
1501         if (start & ~PAGE_MASK)
1502                 return -EINVAL;
1503         /*
1504          * flags is used for future extension if any.
1505          */
1506         if (flags != 0)
1507                 return -EINVAL;
1508
1509         /*
1510          * Check home_node is online to avoid accessing uninitialized
1511          * NODE_DATA.
1512          */
1513         if (home_node >= MAX_NUMNODES || !node_online(home_node))
1514                 return -EINVAL;
1515
1516         len = PAGE_ALIGN(len);
1517         end = start + len;
1518
1519         if (end < start)
1520                 return -EINVAL;
1521         if (end == start)
1522                 return 0;
1523         mmap_write_lock(mm);
1524         for_each_vma_range(vmi, vma, end) {
1525                 /*
1526                  * If any vma in the range got policy other than MPOL_BIND
1527                  * or MPOL_PREFERRED_MANY we return error. We don't reset
1528                  * the home node for vmas we already updated before.
1529                  */
1530                 old = vma_policy(vma);
1531                 if (!old)
1532                         continue;
1533                 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
1534                         err = -EOPNOTSUPP;
1535                         break;
1536                 }
1537                 new = mpol_dup(old);
1538                 if (IS_ERR(new)) {
1539                         err = PTR_ERR(new);
1540                         break;
1541                 }
1542
1543                 new->home_node = home_node;
1544                 vmstart = max(start, vma->vm_start);
1545                 vmend   = min(end, vma->vm_end);
1546                 err = mbind_range(mm, vmstart, vmend, new);
1547                 mpol_put(new);
1548                 if (err)
1549                         break;
1550         }
1551         mmap_write_unlock(mm);
1552         return err;
1553 }
1554
1555 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1556                 unsigned long, mode, const unsigned long __user *, nmask,
1557                 unsigned long, maxnode, unsigned int, flags)
1558 {
1559         return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1560 }
1561
1562 /* Set the process memory policy */
1563 static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1564                                  unsigned long maxnode)
1565 {
1566         unsigned short mode_flags;
1567         nodemask_t nodes;
1568         int lmode = mode;
1569         int err;
1570
1571         err = sanitize_mpol_flags(&lmode, &mode_flags);
1572         if (err)
1573                 return err;
1574
1575         err = get_nodes(&nodes, nmask, maxnode);
1576         if (err)
1577                 return err;
1578
1579         return do_set_mempolicy(lmode, mode_flags, &nodes);
1580 }
1581
1582 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1583                 unsigned long, maxnode)
1584 {
1585         return kernel_set_mempolicy(mode, nmask, maxnode);
1586 }
1587
1588 static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1589                                 const unsigned long __user *old_nodes,
1590                                 const unsigned long __user *new_nodes)
1591 {
1592         struct mm_struct *mm = NULL;
1593         struct task_struct *task;
1594         nodemask_t task_nodes;
1595         int err;
1596         nodemask_t *old;
1597         nodemask_t *new;
1598         NODEMASK_SCRATCH(scratch);
1599
1600         if (!scratch)
1601                 return -ENOMEM;
1602
1603         old = &scratch->mask1;
1604         new = &scratch->mask2;
1605
1606         err = get_nodes(old, old_nodes, maxnode);
1607         if (err)
1608                 goto out;
1609
1610         err = get_nodes(new, new_nodes, maxnode);
1611         if (err)
1612                 goto out;
1613
1614         /* Find the mm_struct */
1615         rcu_read_lock();
1616         task = pid ? find_task_by_vpid(pid) : current;
1617         if (!task) {
1618                 rcu_read_unlock();
1619                 err = -ESRCH;
1620                 goto out;
1621         }
1622         get_task_struct(task);
1623
1624         err = -EINVAL;
1625
1626         /*
1627          * Check if this process has the right to modify the specified process.
1628          * Use the regular "ptrace_may_access()" checks.
1629          */
1630         if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1631                 rcu_read_unlock();
1632                 err = -EPERM;
1633                 goto out_put;
1634         }
1635         rcu_read_unlock();
1636
1637         task_nodes = cpuset_mems_allowed(task);
1638         /* Is the user allowed to access the target nodes? */
1639         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1640                 err = -EPERM;
1641                 goto out_put;
1642         }
1643
1644         task_nodes = cpuset_mems_allowed(current);
1645         nodes_and(*new, *new, task_nodes);
1646         if (nodes_empty(*new))
1647                 goto out_put;
1648
1649         err = security_task_movememory(task);
1650         if (err)
1651                 goto out_put;
1652
1653         mm = get_task_mm(task);
1654         put_task_struct(task);
1655
1656         if (!mm) {
1657                 err = -EINVAL;
1658                 goto out;
1659         }
1660
1661         err = do_migrate_pages(mm, old, new,
1662                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1663
1664         mmput(mm);
1665 out:
1666         NODEMASK_SCRATCH_FREE(scratch);
1667
1668         return err;
1669
1670 out_put:
1671         put_task_struct(task);
1672         goto out;
1673
1674 }
1675
1676 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1677                 const unsigned long __user *, old_nodes,
1678                 const unsigned long __user *, new_nodes)
1679 {
1680         return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1681 }
1682
1683
1684 /* Retrieve NUMA policy */
1685 static int kernel_get_mempolicy(int __user *policy,
1686                                 unsigned long __user *nmask,
1687                                 unsigned long maxnode,
1688                                 unsigned long addr,
1689                                 unsigned long flags)
1690 {
1691         int err;
1692         int pval;
1693         nodemask_t nodes;
1694
1695         if (nmask != NULL && maxnode < nr_node_ids)
1696                 return -EINVAL;
1697
1698         addr = untagged_addr(addr);
1699
1700         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1701
1702         if (err)
1703                 return err;
1704
1705         if (policy && put_user(pval, policy))
1706                 return -EFAULT;
1707
1708         if (nmask)
1709                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1710
1711         return err;
1712 }
1713
1714 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1715                 unsigned long __user *, nmask, unsigned long, maxnode,
1716                 unsigned long, addr, unsigned long, flags)
1717 {
1718         return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1719 }
1720
1721 bool vma_migratable(struct vm_area_struct *vma)
1722 {
1723         if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1724                 return false;
1725
1726         /*
1727          * DAX device mappings require predictable access latency, so avoid
1728          * incurring periodic faults.
1729          */
1730         if (vma_is_dax(vma))
1731                 return false;
1732
1733         if (is_vm_hugetlb_page(vma) &&
1734                 !hugepage_migration_supported(hstate_vma(vma)))
1735                 return false;
1736
1737         /*
1738          * Migration allocates pages in the highest zone. If we cannot
1739          * do so then migration (at least from node to node) is not
1740          * possible.
1741          */
1742         if (vma->vm_file &&
1743                 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1744                         < policy_zone)
1745                 return false;
1746         return true;
1747 }
1748
1749 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1750                                                 unsigned long addr)
1751 {
1752         struct mempolicy *pol = NULL;
1753
1754         if (vma) {
1755                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1756                         pol = vma->vm_ops->get_policy(vma, addr);
1757                 } else if (vma->vm_policy) {
1758                         pol = vma->vm_policy;
1759
1760                         /*
1761                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1762                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1763                          * count on these policies which will be dropped by
1764                          * mpol_cond_put() later
1765                          */
1766                         if (mpol_needs_cond_ref(pol))
1767                                 mpol_get(pol);
1768                 }
1769         }
1770
1771         return pol;
1772 }
1773
1774 /*
1775  * get_vma_policy(@vma, @addr)
1776  * @vma: virtual memory area whose policy is sought
1777  * @addr: address in @vma for shared policy lookup
1778  *
1779  * Returns effective policy for a VMA at specified address.
1780  * Falls back to current->mempolicy or system default policy, as necessary.
1781  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1782  * count--added by the get_policy() vm_op, as appropriate--to protect against
1783  * freeing by another task.  It is the caller's responsibility to free the
1784  * extra reference for shared policies.
1785  */
1786 static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1787                                                 unsigned long addr)
1788 {
1789         struct mempolicy *pol = __get_vma_policy(vma, addr);
1790
1791         if (!pol)
1792                 pol = get_task_policy(current);
1793
1794         return pol;
1795 }
1796
1797 bool vma_policy_mof(struct vm_area_struct *vma)
1798 {
1799         struct mempolicy *pol;
1800
1801         if (vma->vm_ops && vma->vm_ops->get_policy) {
1802                 bool ret = false;
1803
1804                 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1805                 if (pol && (pol->flags & MPOL_F_MOF))
1806                         ret = true;
1807                 mpol_cond_put(pol);
1808
1809                 return ret;
1810         }
1811
1812         pol = vma->vm_policy;
1813         if (!pol)
1814                 pol = get_task_policy(current);
1815
1816         return pol->flags & MPOL_F_MOF;
1817 }
1818
1819 bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1820 {
1821         enum zone_type dynamic_policy_zone = policy_zone;
1822
1823         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1824
1825         /*
1826          * if policy->nodes has movable memory only,
1827          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1828          *
1829          * policy->nodes is intersect with node_states[N_MEMORY].
1830          * so if the following test fails, it implies
1831          * policy->nodes has movable memory only.
1832          */
1833         if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
1834                 dynamic_policy_zone = ZONE_MOVABLE;
1835
1836         return zone >= dynamic_policy_zone;
1837 }
1838
1839 /*
1840  * Return a nodemask representing a mempolicy for filtering nodes for
1841  * page allocation
1842  */
1843 nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1844 {
1845         int mode = policy->mode;
1846
1847         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1848         if (unlikely(mode == MPOL_BIND) &&
1849                 apply_policy_zone(policy, gfp_zone(gfp)) &&
1850                 cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1851                 return &policy->nodes;
1852
1853         if (mode == MPOL_PREFERRED_MANY)
1854                 return &policy->nodes;
1855
1856         return NULL;
1857 }
1858
1859 /*
1860  * Return the  preferred node id for 'prefer' mempolicy, and return
1861  * the given id for all other policies.
1862  *
1863  * policy_node() is always coupled with policy_nodemask(), which
1864  * secures the nodemask limit for 'bind' and 'prefer-many' policy.
1865  */
1866 static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1867 {
1868         if (policy->mode == MPOL_PREFERRED) {
1869                 nd = first_node(policy->nodes);
1870         } else {
1871                 /*
1872                  * __GFP_THISNODE shouldn't even be used with the bind policy
1873                  * because we might easily break the expectation to stay on the
1874                  * requested node and not break the policy.
1875                  */
1876                 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1877         }
1878
1879         if ((policy->mode == MPOL_BIND ||
1880              policy->mode == MPOL_PREFERRED_MANY) &&
1881             policy->home_node != NUMA_NO_NODE)
1882                 return policy->home_node;
1883
1884         return nd;
1885 }
1886
1887 /* Do dynamic interleaving for a process */
1888 static unsigned interleave_nodes(struct mempolicy *policy)
1889 {
1890         unsigned next;
1891         struct task_struct *me = current;
1892
1893         next = next_node_in(me->il_prev, policy->nodes);
1894         if (next < MAX_NUMNODES)
1895                 me->il_prev = next;
1896         return next;
1897 }
1898
1899 /*
1900  * Depending on the memory policy provide a node from which to allocate the
1901  * next slab entry.
1902  */
1903 unsigned int mempolicy_slab_node(void)
1904 {
1905         struct mempolicy *policy;
1906         int node = numa_mem_id();
1907
1908         if (!in_task())
1909                 return node;
1910
1911         policy = current->mempolicy;
1912         if (!policy)
1913                 return node;
1914
1915         switch (policy->mode) {
1916         case MPOL_PREFERRED:
1917                 return first_node(policy->nodes);
1918
1919         case MPOL_INTERLEAVE:
1920                 return interleave_nodes(policy);
1921
1922         case MPOL_BIND:
1923         case MPOL_PREFERRED_MANY:
1924         {
1925                 struct zoneref *z;
1926
1927                 /*
1928                  * Follow bind policy behavior and start allocation at the
1929                  * first node.
1930                  */
1931                 struct zonelist *zonelist;
1932                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1933                 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1934                 z = first_zones_zonelist(zonelist, highest_zoneidx,
1935                                                         &policy->nodes);
1936                 return z->zone ? zone_to_nid(z->zone) : node;
1937         }
1938         case MPOL_LOCAL:
1939                 return node;
1940
1941         default:
1942                 BUG();
1943         }
1944 }
1945
1946 /*
1947  * Do static interleaving for a VMA with known offset @n.  Returns the n'th
1948  * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
1949  * number of present nodes.
1950  */
1951 static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1952 {
1953         nodemask_t nodemask = pol->nodes;
1954         unsigned int target, nnodes;
1955         int i;
1956         int nid;
1957         /*
1958          * The barrier will stabilize the nodemask in a register or on
1959          * the stack so that it will stop changing under the code.
1960          *
1961          * Between first_node() and next_node(), pol->nodes could be changed
1962          * by other threads. So we put pol->nodes in a local stack.
1963          */
1964         barrier();
1965
1966         nnodes = nodes_weight(nodemask);
1967         if (!nnodes)
1968                 return numa_node_id();
1969         target = (unsigned int)n % nnodes;
1970         nid = first_node(nodemask);
1971         for (i = 0; i < target; i++)
1972                 nid = next_node(nid, nodemask);
1973         return nid;
1974 }
1975
1976 /* Determine a node number for interleave */
1977 static inline unsigned interleave_nid(struct mempolicy *pol,
1978                  struct vm_area_struct *vma, unsigned long addr, int shift)
1979 {
1980         if (vma) {
1981                 unsigned long off;
1982
1983                 /*
1984                  * for small pages, there is no difference between
1985                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1986                  * for huge pages, since vm_pgoff is in units of small
1987                  * pages, we need to shift off the always 0 bits to get
1988                  * a useful offset.
1989                  */
1990                 BUG_ON(shift < PAGE_SHIFT);
1991                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1992                 off += (addr - vma->vm_start) >> shift;
1993                 return offset_il_node(pol, off);
1994         } else
1995                 return interleave_nodes(pol);
1996 }
1997
1998 #ifdef CONFIG_HUGETLBFS
1999 /*
2000  * huge_node(@vma, @addr, @gfp_flags, @mpol)
2001  * @vma: virtual memory area whose policy is sought
2002  * @addr: address in @vma for shared policy lookup and interleave policy
2003  * @gfp_flags: for requested zone
2004  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
2005  * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
2006  *
2007  * Returns a nid suitable for a huge page allocation and a pointer
2008  * to the struct mempolicy for conditional unref after allocation.
2009  * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2010  * to the mempolicy's @nodemask for filtering the zonelist.
2011  *
2012  * Must be protected by read_mems_allowed_begin()
2013  */
2014 int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2015                                 struct mempolicy **mpol, nodemask_t **nodemask)
2016 {
2017         int nid;
2018         int mode;
2019
2020         *mpol = get_vma_policy(vma, addr);
2021         *nodemask = NULL;
2022         mode = (*mpol)->mode;
2023
2024         if (unlikely(mode == MPOL_INTERLEAVE)) {
2025                 nid = interleave_nid(*mpol, vma, addr,
2026                                         huge_page_shift(hstate_vma(vma)));
2027         } else {
2028                 nid = policy_node(gfp_flags, *mpol, numa_node_id());
2029                 if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
2030                         *nodemask = &(*mpol)->nodes;
2031         }
2032         return nid;
2033 }
2034
2035 /*
2036  * init_nodemask_of_mempolicy
2037  *
2038  * If the current task's mempolicy is "default" [NULL], return 'false'
2039  * to indicate default policy.  Otherwise, extract the policy nodemask
2040  * for 'bind' or 'interleave' policy into the argument nodemask, or
2041  * initialize the argument nodemask to contain the single node for
2042  * 'preferred' or 'local' policy and return 'true' to indicate presence
2043  * of non-default mempolicy.
2044  *
2045  * We don't bother with reference counting the mempolicy [mpol_get/put]
2046  * because the current task is examining it's own mempolicy and a task's
2047  * mempolicy is only ever changed by the task itself.
2048  *
2049  * N.B., it is the caller's responsibility to free a returned nodemask.
2050  */
2051 bool init_nodemask_of_mempolicy(nodemask_t *mask)
2052 {
2053         struct mempolicy *mempolicy;
2054
2055         if (!(mask && current->mempolicy))
2056                 return false;
2057
2058         task_lock(current);
2059         mempolicy = current->mempolicy;
2060         switch (mempolicy->mode) {
2061         case MPOL_PREFERRED:
2062         case MPOL_PREFERRED_MANY:
2063         case MPOL_BIND:
2064         case MPOL_INTERLEAVE:
2065                 *mask = mempolicy->nodes;
2066                 break;
2067
2068         case MPOL_LOCAL:
2069                 init_nodemask_of_node(mask, numa_node_id());
2070                 break;
2071
2072         default:
2073                 BUG();
2074         }
2075         task_unlock(current);
2076
2077         return true;
2078 }
2079 #endif
2080
2081 /*
2082  * mempolicy_in_oom_domain
2083  *
2084  * If tsk's mempolicy is "bind", check for intersection between mask and
2085  * the policy nodemask. Otherwise, return true for all other policies
2086  * including "interleave", as a tsk with "interleave" policy may have
2087  * memory allocated from all nodes in system.
2088  *
2089  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2090  */
2091 bool mempolicy_in_oom_domain(struct task_struct *tsk,
2092                                         const nodemask_t *mask)
2093 {
2094         struct mempolicy *mempolicy;
2095         bool ret = true;
2096
2097         if (!mask)
2098                 return ret;
2099
2100         task_lock(tsk);
2101         mempolicy = tsk->mempolicy;
2102         if (mempolicy && mempolicy->mode == MPOL_BIND)
2103                 ret = nodes_intersects(mempolicy->nodes, *mask);
2104         task_unlock(tsk);
2105
2106         return ret;
2107 }
2108
2109 /* Allocate a page in interleaved policy.
2110    Own path because it needs to do special accounting. */
2111 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2112                                         unsigned nid)
2113 {
2114         struct page *page;
2115
2116         page = __alloc_pages(gfp, order, nid, NULL);
2117         /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2118         if (!static_branch_likely(&vm_numa_stat_key))
2119                 return page;
2120         if (page && page_to_nid(page) == nid) {
2121                 preempt_disable();
2122                 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2123                 preempt_enable();
2124         }
2125         return page;
2126 }
2127
2128 static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2129                                                 int nid, struct mempolicy *pol)
2130 {
2131         struct page *page;
2132         gfp_t preferred_gfp;
2133
2134         /*
2135          * This is a two pass approach. The first pass will only try the
2136          * preferred nodes but skip the direct reclaim and allow the
2137          * allocation to fail, while the second pass will try all the
2138          * nodes in system.
2139          */
2140         preferred_gfp = gfp | __GFP_NOWARN;
2141         preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2142         page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
2143         if (!page)
2144                 page = __alloc_pages(gfp, order, nid, NULL);
2145
2146         return page;
2147 }
2148
2149 /**
2150  * vma_alloc_folio - Allocate a folio for a VMA.
2151  * @gfp: GFP flags.
2152  * @order: Order of the folio.
2153  * @vma: Pointer to VMA or NULL if not available.
2154  * @addr: Virtual address of the allocation.  Must be inside @vma.
2155  * @hugepage: For hugepages try only the preferred node if possible.
2156  *
2157  * Allocate a folio for a specific address in @vma, using the appropriate
2158  * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock
2159  * of the mm_struct of the VMA to prevent it from going away.  Should be
2160  * used for all allocations for folios that will be mapped into user space.
2161  *
2162  * Return: The folio on success or NULL if allocation fails.
2163  */
2164 struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
2165                 unsigned long addr, bool hugepage)
2166 {
2167         struct mempolicy *pol;
2168         int node = numa_node_id();
2169         struct folio *folio;
2170         int preferred_nid;
2171         nodemask_t *nmask;
2172
2173         pol = get_vma_policy(vma, addr);
2174
2175         if (pol->mode == MPOL_INTERLEAVE) {
2176                 struct page *page;
2177                 unsigned nid;
2178
2179                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2180                 mpol_cond_put(pol);
2181                 gfp |= __GFP_COMP;
2182                 page = alloc_page_interleave(gfp, order, nid);
2183                 if (page && order > 1)
2184                         prep_transhuge_page(page);
2185                 folio = (struct folio *)page;
2186                 goto out;
2187         }
2188
2189         if (pol->mode == MPOL_PREFERRED_MANY) {
2190                 struct page *page;
2191
2192                 node = policy_node(gfp, pol, node);
2193                 gfp |= __GFP_COMP;
2194                 page = alloc_pages_preferred_many(gfp, order, node, pol);
2195                 mpol_cond_put(pol);
2196                 if (page && order > 1)
2197                         prep_transhuge_page(page);
2198                 folio = (struct folio *)page;
2199                 goto out;
2200         }
2201
2202         if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2203                 int hpage_node = node;
2204
2205                 /*
2206                  * For hugepage allocation and non-interleave policy which
2207                  * allows the current node (or other explicitly preferred
2208                  * node) we only try to allocate from the current/preferred
2209                  * node and don't fall back to other nodes, as the cost of
2210                  * remote accesses would likely offset THP benefits.
2211                  *
2212                  * If the policy is interleave or does not allow the current
2213                  * node in its nodemask, we allocate the standard way.
2214                  */
2215                 if (pol->mode == MPOL_PREFERRED)
2216                         hpage_node = first_node(pol->nodes);
2217
2218                 nmask = policy_nodemask(gfp, pol);
2219                 if (!nmask || node_isset(hpage_node, *nmask)) {
2220                         mpol_cond_put(pol);
2221                         /*
2222                          * First, try to allocate THP only on local node, but
2223                          * don't reclaim unnecessarily, just compact.
2224                          */
2225                         folio = __folio_alloc_node(gfp | __GFP_THISNODE |
2226                                         __GFP_NORETRY, order, hpage_node);
2227
2228                         /*
2229                          * If hugepage allocations are configured to always
2230                          * synchronous compact or the vma has been madvised
2231                          * to prefer hugepage backing, retry allowing remote
2232                          * memory with both reclaim and compact as well.
2233                          */
2234                         if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
2235                                 folio = __folio_alloc(gfp, order, hpage_node,
2236                                                       nmask);
2237
2238                         goto out;
2239                 }
2240         }
2241
2242         nmask = policy_nodemask(gfp, pol);
2243         preferred_nid = policy_node(gfp, pol, node);
2244         folio = __folio_alloc(gfp, order, preferred_nid, nmask);
2245         mpol_cond_put(pol);
2246 out:
2247         return folio;
2248 }
2249 EXPORT_SYMBOL(vma_alloc_folio);
2250
2251 /**
2252  * alloc_pages - Allocate pages.
2253  * @gfp: GFP flags.
2254  * @order: Power of two of number of pages to allocate.
2255  *
2256  * Allocate 1 << @order contiguous pages.  The physical address of the
2257  * first page is naturally aligned (eg an order-3 allocation will be aligned
2258  * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
2259  * process is honoured when in process context.
2260  *
2261  * Context: Can be called from any context, providing the appropriate GFP
2262  * flags are used.
2263  * Return: The page on success or NULL if allocation fails.
2264  */
2265 struct page *alloc_pages(gfp_t gfp, unsigned order)
2266 {
2267         struct mempolicy *pol = &default_policy;
2268         struct page *page;
2269
2270         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2271                 pol = get_task_policy(current);
2272
2273         /*
2274          * No reference counting needed for current->mempolicy
2275          * nor system default_policy
2276          */
2277         if (pol->mode == MPOL_INTERLEAVE)
2278                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2279         else if (pol->mode == MPOL_PREFERRED_MANY)
2280                 page = alloc_pages_preferred_many(gfp, order,
2281                                   policy_node(gfp, pol, numa_node_id()), pol);
2282         else
2283                 page = __alloc_pages(gfp, order,
2284                                 policy_node(gfp, pol, numa_node_id()),
2285                                 policy_nodemask(gfp, pol));
2286
2287         return page;
2288 }
2289 EXPORT_SYMBOL(alloc_pages);
2290
2291 struct folio *folio_alloc(gfp_t gfp, unsigned order)
2292 {
2293         struct page *page = alloc_pages(gfp | __GFP_COMP, order);
2294
2295         if (page && order > 1)
2296                 prep_transhuge_page(page);
2297         return (struct folio *)page;
2298 }
2299 EXPORT_SYMBOL(folio_alloc);
2300
2301 static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2302                 struct mempolicy *pol, unsigned long nr_pages,
2303                 struct page **page_array)
2304 {
2305         int nodes;
2306         unsigned long nr_pages_per_node;
2307         int delta;
2308         int i;
2309         unsigned long nr_allocated;
2310         unsigned long total_allocated = 0;
2311
2312         nodes = nodes_weight(pol->nodes);
2313         nr_pages_per_node = nr_pages / nodes;
2314         delta = nr_pages - nodes * nr_pages_per_node;
2315
2316         for (i = 0; i < nodes; i++) {
2317                 if (delta) {
2318                         nr_allocated = __alloc_pages_bulk(gfp,
2319                                         interleave_nodes(pol), NULL,
2320                                         nr_pages_per_node + 1, NULL,
2321                                         page_array);
2322                         delta--;
2323                 } else {
2324                         nr_allocated = __alloc_pages_bulk(gfp,
2325                                         interleave_nodes(pol), NULL,
2326                                         nr_pages_per_node, NULL, page_array);
2327                 }
2328
2329                 page_array += nr_allocated;
2330                 total_allocated += nr_allocated;
2331         }
2332
2333         return total_allocated;
2334 }
2335
2336 static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2337                 struct mempolicy *pol, unsigned long nr_pages,
2338                 struct page **page_array)
2339 {
2340         gfp_t preferred_gfp;
2341         unsigned long nr_allocated = 0;
2342
2343         preferred_gfp = gfp | __GFP_NOWARN;
2344         preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2345
2346         nr_allocated  = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2347                                            nr_pages, NULL, page_array);
2348
2349         if (nr_allocated < nr_pages)
2350                 nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2351                                 nr_pages - nr_allocated, NULL,
2352                                 page_array + nr_allocated);
2353         return nr_allocated;
2354 }
2355
2356 /* alloc pages bulk and mempolicy should be considered at the
2357  * same time in some situation such as vmalloc.
2358  *
2359  * It can accelerate memory allocation especially interleaving
2360  * allocate memory.
2361  */
2362 unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2363                 unsigned long nr_pages, struct page **page_array)
2364 {
2365         struct mempolicy *pol = &default_policy;
2366
2367         if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2368                 pol = get_task_policy(current);
2369
2370         if (pol->mode == MPOL_INTERLEAVE)
2371                 return alloc_pages_bulk_array_interleave(gfp, pol,
2372                                                          nr_pages, page_array);
2373
2374         if (pol->mode == MPOL_PREFERRED_MANY)
2375                 return alloc_pages_bulk_array_preferred_many(gfp,
2376                                 numa_node_id(), pol, nr_pages, page_array);
2377
2378         return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
2379                                   policy_nodemask(gfp, pol), nr_pages, NULL,
2380                                   page_array);
2381 }
2382
2383 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2384 {
2385         struct mempolicy *pol = mpol_dup(vma_policy(src));
2386
2387         if (IS_ERR(pol))
2388                 return PTR_ERR(pol);
2389         dst->vm_policy = pol;
2390         return 0;
2391 }
2392
2393 /*
2394  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2395  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2396  * with the mems_allowed returned by cpuset_mems_allowed().  This
2397  * keeps mempolicies cpuset relative after its cpuset moves.  See
2398  * further kernel/cpuset.c update_nodemask().
2399  *
2400  * current's mempolicy may be rebinded by the other task(the task that changes
2401  * cpuset's mems), so we needn't do rebind work for current task.
2402  */
2403
2404 /* Slow path of a mempolicy duplicate */
2405 struct mempolicy *__mpol_dup(struct mempolicy *old)
2406 {
2407         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2408
2409         if (!new)
2410                 return ERR_PTR(-ENOMEM);
2411
2412         /* task's mempolicy is protected by alloc_lock */
2413         if (old == current->mempolicy) {
2414                 task_lock(current);
2415                 *new = *old;
2416                 task_unlock(current);
2417         } else
2418                 *new = *old;
2419
2420         if (current_cpuset_is_being_rebound()) {
2421                 nodemask_t mems = cpuset_mems_allowed(current);
2422                 mpol_rebind_policy(new, &mems);
2423         }
2424         atomic_set(&new->refcnt, 1);
2425         return new;
2426 }
2427
2428 /* Slow path of a mempolicy comparison */
2429 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2430 {
2431         if (!a || !b)
2432                 return false;
2433         if (a->mode != b->mode)
2434                 return false;
2435         if (a->flags != b->flags)
2436                 return false;
2437         if (a->home_node != b->home_node)
2438                 return false;
2439         if (mpol_store_user_nodemask(a))
2440                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2441                         return false;
2442
2443         switch (a->mode) {
2444         case MPOL_BIND:
2445         case MPOL_INTERLEAVE:
2446         case MPOL_PREFERRED:
2447         case MPOL_PREFERRED_MANY:
2448                 return !!nodes_equal(a->nodes, b->nodes);
2449         case MPOL_LOCAL:
2450                 return true;
2451         default:
2452                 BUG();
2453                 return false;
2454         }
2455 }
2456
2457 /*
2458  * Shared memory backing store policy support.
2459  *
2460  * Remember policies even when nobody has shared memory mapped.
2461  * The policies are kept in Red-Black tree linked from the inode.
2462  * They are protected by the sp->lock rwlock, which should be held
2463  * for any accesses to the tree.
2464  */
2465
2466 /*
2467  * lookup first element intersecting start-end.  Caller holds sp->lock for
2468  * reading or for writing
2469  */
2470 static struct sp_node *
2471 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2472 {
2473         struct rb_node *n = sp->root.rb_node;
2474
2475         while (n) {
2476                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2477
2478                 if (start >= p->end)
2479                         n = n->rb_right;
2480                 else if (end <= p->start)
2481                         n = n->rb_left;
2482                 else
2483                         break;
2484         }
2485         if (!n)
2486                 return NULL;
2487         for (;;) {
2488                 struct sp_node *w = NULL;
2489                 struct rb_node *prev = rb_prev(n);
2490                 if (!prev)
2491                         break;
2492                 w = rb_entry(prev, struct sp_node, nd);
2493                 if (w->end <= start)
2494                         break;
2495                 n = prev;
2496         }
2497         return rb_entry(n, struct sp_node, nd);
2498 }
2499
2500 /*
2501  * Insert a new shared policy into the list.  Caller holds sp->lock for
2502  * writing.
2503  */
2504 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2505 {
2506         struct rb_node **p = &sp->root.rb_node;
2507         struct rb_node *parent = NULL;
2508         struct sp_node *nd;
2509
2510         while (*p) {
2511                 parent = *p;
2512                 nd = rb_entry(parent, struct sp_node, nd);
2513                 if (new->start < nd->start)
2514                         p = &(*p)->rb_left;
2515                 else if (new->end > nd->end)
2516                         p = &(*p)->rb_right;
2517                 else
2518                         BUG();
2519         }
2520         rb_link_node(&new->nd, parent, p);
2521         rb_insert_color(&new->nd, &sp->root);
2522         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2523                  new->policy ? new->policy->mode : 0);
2524 }
2525
2526 /* Find shared policy intersecting idx */
2527 struct mempolicy *
2528 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2529 {
2530         struct mempolicy *pol = NULL;
2531         struct sp_node *sn;
2532
2533         if (!sp->root.rb_node)
2534                 return NULL;
2535         read_lock(&sp->lock);
2536         sn = sp_lookup(sp, idx, idx+1);
2537         if (sn) {
2538                 mpol_get(sn->policy);
2539                 pol = sn->policy;
2540         }
2541         read_unlock(&sp->lock);
2542         return pol;
2543 }
2544
2545 static void sp_free(struct sp_node *n)
2546 {
2547         mpol_put(n->policy);
2548         kmem_cache_free(sn_cache, n);
2549 }
2550
/**
 * mpol_misplaced - check whether current page node is valid in policy
 *
 * @page: page to be checked
 * @vma: vm area where page mapped
 * @addr: virtual address where page mapped
 *
 * Lookup current policy node id for vma,addr and "compare to" page's
 * node id.  Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 *
 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
 * policy, or a suitable node ID to allocate a replacement page from.
 */
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol;
	struct zoneref *z;
	int curnid = page_to_nid(page);
	unsigned long pgoff;
	int thiscpu = raw_smp_processor_id();
	int thisnid = cpu_to_node(thiscpu);
	int polnid = NUMA_NO_NODE;
	int ret = NUMA_NO_NODE;

	pol = get_vma_policy(vma, addr);
	/* Without MPOL_F_MOF the page is always reported as not misplaced. */
	if (!(pol->flags & MPOL_F_MOF))
		goto out;

	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		/* Offset of @addr in pages: vm_pgoff plus distance from vm_start. */
		pgoff = vma->vm_pgoff;
		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
		polnid = offset_il_node(pol, pgoff);
		break;

	case MPOL_PREFERRED:
		/* Page already sits on a node in the policy mask: leave it. */
		if (node_isset(curnid, pol->nodes))
			goto out;
		polnid = first_node(pol->nodes);
		break;

	case MPOL_LOCAL:
		polnid = numa_node_id();
		break;

	case MPOL_BIND:
		/* Optimize placement among multiple nodes via NUMA balancing */
		if (pol->flags & MPOL_F_MORON) {
			/*
			 * Only continue to the MPOL_F_MORON handling after the
			 * switch when this CPU's node is in the bind mask;
			 * otherwise report the page as not misplaced.
			 */
			if (node_isset(thisnid, pol->nodes))
				break;
			goto out;
		}
		fallthrough;

	case MPOL_PREFERRED_MANY:
		/*
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->nodes))
			goto out;
		z = first_zones_zonelist(
				node_zonelist(numa_node_id(), GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->nodes);
		polnid = zone_to_nid(z->zone);
		break;

	default:
		BUG();
	}

	/* Migrate the page towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		/* Overrides whatever node the switch above selected. */
		polnid = thisnid;

		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
			goto out;
	}

	/* Only report a target node when it differs from the page's node. */
	if (curnid != polnid)
		ret = polnid;
out:
	mpol_cond_put(pol);

	return ret;
}
2640
2641 /*
2642  * Drop the (possibly final) reference to task->mempolicy.  It needs to be
2643  * dropped after task->mempolicy is set to NULL so that any allocation done as
2644  * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2645  * policy.
2646  */
2647 void mpol_put_task_policy(struct task_struct *task)
2648 {
2649         struct mempolicy *pol;
2650
2651         task_lock(task);
2652         pol = task->mempolicy;
2653         task->mempolicy = NULL;
2654         task_unlock(task);
2655         mpol_put(pol);
2656 }
2657
2658 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2659 {
2660         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2661         rb_erase(&n->nd, &sp->root);
2662         sp_free(n);
2663 }
2664
2665 static void sp_node_init(struct sp_node *node, unsigned long start,
2666                         unsigned long end, struct mempolicy *pol)
2667 {
2668         node->start = start;
2669         node->end = end;
2670         node->policy = pol;
2671 }
2672
2673 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2674                                 struct mempolicy *pol)
2675 {
2676         struct sp_node *n;
2677         struct mempolicy *newpol;
2678
2679         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2680         if (!n)
2681                 return NULL;
2682
2683         newpol = mpol_dup(pol);
2684         if (IS_ERR(newpol)) {
2685                 kmem_cache_free(sn_cache, n);
2686                 return NULL;
2687         }
2688         newpol->flags |= MPOL_F_SHARED;
2689         sp_node_init(n, start, end, newpol);
2690
2691         return n;
2692 }
2693
/*
 * Replace a policy range.
 *
 * Removes or trims every existing node overlapping [@start, @end) and then,
 * if @new is non-NULL, inserts it.  When one existing node spans the whole
 * new range it has to be split in two, which needs an extra sp_node and
 * mempolicy; those are allocated with the lock dropped and the walk is then
 * restarted.
 *
 * Returns 0 on success or -ENOMEM if that extra allocation fails (in which
 * case @new has not been inserted).
 */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;
	struct mempolicy *mpol_new = NULL;
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		/* Grab the successor first: sp_delete() below frees n. */
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			/* Old node begins inside the new range. */
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				/* Need preallocated pieces for the tail half. */
				if (!n_new)
					goto alloc_new;

				/* Structure copy of the old policy, own refcount. */
				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				/* Now owned by the tree; keep err_out from freeing them. */
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	/* Release preallocations that were not consumed by a split. */
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	/* Drop the lock before the GFP_KERNEL allocations, then start over. */
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	/* Gives the object a refcount so the mpol_put() at err_out can drop it. */
	atomic_set(&mpol_new->refcnt, 1);
	goto restart;
}
2760
/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol:  struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 *
 * Every failure below is silent: @sp is left as an initialized, empty tree
 * and the reference on @mpol is still dropped.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
	int ret;

	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
	rwlock_init(&sp->lock);

	if (mpol) {
		struct vm_area_struct pvma;
		struct mempolicy *new;
		NODEMASK_SCRATCH(scratch);

		if (!scratch)
			goto put_mpol;
		/* contextualize the tmpfs mount point mempolicy */
		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
		if (IS_ERR(new))
			goto free_scratch; /* no valid nodemask intersection */

		/* mpol_set_nodemask() is called with current's task_lock held. */
		task_lock(current);
		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
		task_unlock(current);
		if (ret)
			goto put_new;

		/* Create pseudo-vma that contains just the policy */
		vma_init(&pvma, NULL);
		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
		/* Return value is not checked; a failure leaves the tree empty. */
		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */

put_new:
		mpol_put(new);			/* drop initial ref */
free_scratch:
		NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
	}
}
2809
2810 int mpol_set_shared_policy(struct shared_policy *info,
2811                         struct vm_area_struct *vma, struct mempolicy *npol)
2812 {
2813         int err;
2814         struct sp_node *new = NULL;
2815         unsigned long sz = vma_pages(vma);
2816
2817         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2818                  vma->vm_pgoff,
2819                  sz, npol ? npol->mode : -1,
2820                  npol ? npol->flags : -1,
2821                  npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
2822
2823         if (npol) {
2824                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2825                 if (!new)
2826                         return -ENOMEM;
2827         }
2828         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2829         if (err && new)
2830                 sp_free(new);
2831         return err;
2832 }
2833
2834 /* Free a backing policy store on inode delete. */
2835 void mpol_free_shared_policy(struct shared_policy *p)
2836 {
2837         struct sp_node *n;
2838         struct rb_node *next;
2839
2840         if (!p->root.rb_node)
2841                 return;
2842         write_lock(&p->lock);
2843         next = rb_first(&p->root);
2844         while (next) {
2845                 n = rb_entry(next, struct sp_node, nd);
2846                 next = rb_next(&n->nd);
2847                 sp_delete(p, n);
2848         }
2849         write_unlock(&p->lock);
2850 }
2851
2852 #ifdef CONFIG_NUMA_BALANCING
2853 static int __initdata numabalancing_override;
2854
2855 static void __init check_numabalancing_enable(void)
2856 {
2857         bool numabalancing_default = false;
2858
2859         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2860                 numabalancing_default = true;
2861
2862         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2863         if (numabalancing_override)
2864                 set_numabalancing_state(numabalancing_override == 1);
2865
2866         if (num_online_nodes() > 1 && !numabalancing_override) {
2867                 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2868                         numabalancing_default ? "Enabling" : "Disabling");
2869                 set_numabalancing_state(numabalancing_default);
2870         }
2871 }
2872
2873 static int __init setup_numabalancing(char *str)
2874 {
2875         int ret = 0;
2876         if (!str)
2877                 goto out;
2878
2879         if (!strcmp(str, "enable")) {
2880                 numabalancing_override = 1;
2881                 ret = 1;
2882         } else if (!strcmp(str, "disable")) {
2883                 numabalancing_override = -1;
2884                 ret = 1;
2885         }
2886 out:
2887         if (!ret)
2888                 pr_warn("Unable to parse numa_balancing=\n");
2889
2890         return ret;
2891 }
2892 __setup("numa_balancing=", setup_numabalancing);
2893 #else
/* No-op stand-in used when CONFIG_NUMA_BALANCING is not set. */
static inline void __init check_numabalancing_enable(void)
{
}
2897 #endif /* CONFIG_NUMA_BALANCING */
2898
2899 /* assumes fs == KERNEL_DS */
2900 void __init numa_policy_init(void)
2901 {
2902         nodemask_t interleave_nodes;
2903         unsigned long largest = 0;
2904         int nid, prefer = 0;
2905
2906         policy_cache = kmem_cache_create("numa_policy",
2907                                          sizeof(struct mempolicy),
2908                                          0, SLAB_PANIC, NULL);
2909
2910         sn_cache = kmem_cache_create("shared_policy_node",
2911                                      sizeof(struct sp_node),
2912                                      0, SLAB_PANIC, NULL);
2913
2914         for_each_node(nid) {
2915                 preferred_node_policy[nid] = (struct mempolicy) {
2916                         .refcnt = ATOMIC_INIT(1),
2917                         .mode = MPOL_PREFERRED,
2918                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2919                         .nodes = nodemask_of_node(nid),
2920                 };
2921         }
2922
2923         /*
2924          * Set interleaving policy for system init. Interleaving is only
2925          * enabled across suitably sized nodes (default is >= 16MB), or
2926          * fall back to the largest node if they're all smaller.
2927          */
2928         nodes_clear(interleave_nodes);
2929         for_each_node_state(nid, N_MEMORY) {
2930                 unsigned long total_pages = node_present_pages(nid);
2931
2932                 /* Preserve the largest node */
2933                 if (largest < total_pages) {
2934                         largest = total_pages;
2935                         prefer = nid;
2936                 }
2937
2938                 /* Interleave this node? */
2939                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2940                         node_set(nid, interleave_nodes);
2941         }
2942
2943         /* All too small, use the largest */
2944         if (unlikely(nodes_empty(interleave_nodes)))
2945                 node_set(prefer, interleave_nodes);
2946
2947         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2948                 pr_err("%s: interleaving failed\n", __func__);
2949
2950         check_numabalancing_enable();
2951 }
2952
/*
 * Reset policy of current process to default.
 * The return value of do_set_mempolicy() is not checked.
 */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
2958
2959 /*
2960  * Parse and format mempolicy from/to strings
2961  */
2962
/*
 * Textual name of each policy mode, indexed by the MPOL_* mode value.
 * mpol_parse_str() matches the mode part of its input against these strings
 * and mpol_to_str() prints them.
 */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_LOCAL]      = "local",
	[MPOL_PREFERRED_MANY]  = "prefer (many)",
};
2972
2973
2974 #ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * @str is modified while parsing (the '=' and ':' separators are replaced
 * by NUL) and restored before returning.  For mode "default" without a
 * nodelist the call succeeds and *@mpol is set to NULL.
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode_flags;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int err = 1, mode;

	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		/* Reject any node outside node_states[N_MEMORY]. */
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	/* str now holds only the mode name. */
	mode = match_string(policy_modes, MPOL_MAX, str);
	if (mode < 0)
		goto out;

	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only, although later
		 * we use first_node(nodes) to grab a single node, so here
		 * nodelist (or nodes) cannot be empty.
		 */
		if (nodelist) {
			char *rest = nodelist;
			/* Anything but a plain run of digits is rejected. */
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
			if (nodes_empty(nodes))
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on a empty nodelist
		 */
		if (!nodelist)
			err = 0;
		/* Success here leaves new == NULL, so *mpol becomes NULL. */
		goto out;
	case MPOL_PREFERRED_MANY:
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode != MPOL_PREFERRED) {
		new->nodes = nodes;
	} else if (nodelist) {
		nodes_clear(new->nodes);
		node_set(first_node(nodes), new->nodes);
	} else {
		/* "prefer" without a nodelist is stored as MPOL_LOCAL. */
		new->mode = MPOL_LOCAL;
	}

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
3107 #endif /* CONFIG_TMPFS */
3108
3109 /**
3110  * mpol_to_str - format a mempolicy structure for printing
3111  * @buffer:  to contain formatted mempolicy string
3112  * @maxlen:  length of @buffer
3113  * @pol:  pointer to mempolicy to be formatted
3114  *
3115  * Convert @pol into a string.  If @buffer is too short, truncate the string.
3116  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3117  * longest flag, "relative", and to display at least a few node ids.
3118  */
3119 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
3120 {
3121         char *p = buffer;
3122         nodemask_t nodes = NODE_MASK_NONE;
3123         unsigned short mode = MPOL_DEFAULT;
3124         unsigned short flags = 0;
3125
3126         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
3127                 mode = pol->mode;
3128                 flags = pol->flags;
3129         }
3130
3131         switch (mode) {
3132         case MPOL_DEFAULT:
3133         case MPOL_LOCAL:
3134                 break;
3135         case MPOL_PREFERRED:
3136         case MPOL_PREFERRED_MANY:
3137         case MPOL_BIND:
3138         case MPOL_INTERLEAVE:
3139                 nodes = pol->nodes;
3140                 break;
3141         default:
3142                 WARN_ON_ONCE(1);
3143                 snprintf(p, maxlen, "unknown");
3144                 return;
3145         }
3146
3147         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3148
3149         if (flags & MPOL_MODE_FLAGS) {
3150                 p += snprintf(p, buffer + maxlen - p, "=");
3151
3152                 /*
3153                  * Currently, the only defined flags are mutually exclusive
3154                  */
3155                 if (flags & MPOL_F_STATIC_NODES)
3156                         p += snprintf(p, buffer + maxlen - p, "static");
3157                 else if (flags & MPOL_F_RELATIVE_NODES)
3158                         p += snprintf(p, buffer + maxlen - p, "relative");
3159         }
3160
3161         if (!nodes_empty(nodes))
3162                 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3163                                nodemask_pr_args(&nodes));
3164 }