Merge branch 'mana-shared-6.2' of https://git.kernel.org/pub/scm/linux/kernel/git...
[linux-2.6-block.git] / mm / mempolicy.c
CommitLineData
46aeb7e6 1// SPDX-License-Identifier: GPL-2.0-only
1da177e4
LT
2/*
3 * Simple NUMA memory policy for the Linux kernel.
4 *
5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
8bccd85f 6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
1da177e4
LT
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy an process counter
20 * is used.
8bccd85f 21 *
1da177e4
LT
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
8bccd85f
CL
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
1da177e4 28 * preferred Try a specific node first before normal fallback.
00ef2d2f 29 * As a special case NUMA_NO_NODE here means do the allocation
1da177e4
LT
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
8bccd85f 33 *
b27abacc
DH
34 * preferred many Try a set of nodes first before normal fallback. This is
35 * similar to preferred without the special case.
36 *
1da177e4
LT
37 * default Allocate on the local node first, or when on a VMA
38 * use the process policy. This is what Linux always did
39 * in a NUMA aware kernel and still does by, ahem, default.
40 *
41 * The process policy is applied for most non interrupt memory allocations
42 * in that process' context. Interrupts ignore the policies and always
43 * try to allocate on the local CPU. The VMA policy is only applied for memory
44 * allocations for a VMA in the VM.
45 *
46 * Currently there are a few corner cases in swapping where the policy
47 * is not applied, but the majority should be handled. When process policy
48 * is used it is not remembered over swap outs/swap ins.
49 *
50 * Only the highest zone in the zone hierarchy gets policied. Allocations
51 * requesting a lower zone just use default policy. This implies that
52 * on systems with highmem kernel lowmem allocation don't get policied.
53 * Same with GFP_DMA allocations.
54 *
55 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
56 * all users and remembered even when nobody has memory mapped.
57 */
58
59/* Notebook:
60 fix mmap readahead to honour policy and enable policy for any page cache
61 object
62 statistics for bigpages
63 global policy for page cache? currently it uses process policy. Requires
64 first item above.
65 handle mremap for shared memory (currently ignored for the policy)
66 grows down?
67 make bind policy root only? It can trigger oom much faster and the
68 kernel is not always grateful with that.
1da177e4
LT
69*/
70
b1de0d13
MH
71#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
72
1da177e4 73#include <linux/mempolicy.h>
a520110e 74#include <linux/pagewalk.h>
1da177e4
LT
75#include <linux/highmem.h>
76#include <linux/hugetlb.h>
77#include <linux/kernel.h>
78#include <linux/sched.h>
6e84f315 79#include <linux/sched/mm.h>
6a3827d7 80#include <linux/sched/numa_balancing.h>
f719ff9b 81#include <linux/sched/task.h>
1da177e4
LT
82#include <linux/nodemask.h>
83#include <linux/cpuset.h>
1da177e4
LT
84#include <linux/slab.h>
85#include <linux/string.h>
b95f1b31 86#include <linux/export.h>
b488893a 87#include <linux/nsproxy.h>
1da177e4
LT
88#include <linux/interrupt.h>
89#include <linux/init.h>
90#include <linux/compat.h>
31367466 91#include <linux/ptrace.h>
dc9aa5b9 92#include <linux/swap.h>
1a75a6c8
CL
93#include <linux/seq_file.h>
94#include <linux/proc_fs.h>
b20a3503 95#include <linux/migrate.h>
62b61f61 96#include <linux/ksm.h>
95a402c3 97#include <linux/rmap.h>
86c3a764 98#include <linux/security.h>
dbcb0f19 99#include <linux/syscalls.h>
095f1fc4 100#include <linux/ctype.h>
6d9c285a 101#include <linux/mm_inline.h>
b24f53a0 102#include <linux/mmu_notifier.h>
b1de0d13 103#include <linux/printk.h>
c8633798 104#include <linux/swapops.h>
dc9aa5b9 105
1da177e4 106#include <asm/tlbflush.h>
4a18419f 107#include <asm/tlb.h>
7c0f6ba6 108#include <linux/uaccess.h>
1da177e4 109
62695a84
NP
110#include "internal.h"
111
38e35860 112/* Internal flags */
dc9aa5b9 113#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
38e35860 114#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
dc9aa5b9 115
fcc234f8
PE
116static struct kmem_cache *policy_cache;
117static struct kmem_cache *sn_cache;
1da177e4 118
1da177e4
LT
119/* Highest zone. An specific allocation for a zone below that is not
120 policied. */
6267276f 121enum zone_type policy_zone = 0;
1da177e4 122
bea904d5
LS
123/*
124 * run-time system-wide default policy => local allocation
125 */
e754d79d 126static struct mempolicy default_policy = {
1da177e4 127 .refcnt = ATOMIC_INIT(1), /* never free it */
7858d7bc 128 .mode = MPOL_LOCAL,
1da177e4
LT
129};
130
5606e387
MG
131static struct mempolicy preferred_node_policy[MAX_NUMNODES];
132
b2ca916c
DW
133/**
134 * numa_map_to_online_node - Find closest online node
f6e92f40 135 * @node: Node id to start the search
b2ca916c
DW
136 *
137 * Lookup the next closest node by distance if @nid is not online.
dad5b023
RD
138 *
139 * Return: this @node if it is online, otherwise the closest node by distance
b2ca916c
DW
140 */
141int numa_map_to_online_node(int node)
142{
4fcbe96e 143 int min_dist = INT_MAX, dist, n, min_node;
b2ca916c 144
4fcbe96e
DW
145 if (node == NUMA_NO_NODE || node_online(node))
146 return node;
b2ca916c
DW
147
148 min_node = node;
4fcbe96e
DW
149 for_each_online_node(n) {
150 dist = node_distance(node, n);
151 if (dist < min_dist) {
152 min_dist = dist;
153 min_node = n;
b2ca916c
DW
154 }
155 }
156
157 return min_node;
158}
159EXPORT_SYMBOL_GPL(numa_map_to_online_node);
160
74d2c3a0 161struct mempolicy *get_task_policy(struct task_struct *p)
5606e387
MG
162{
163 struct mempolicy *pol = p->mempolicy;
f15ca78e 164 int node;
5606e387 165
f15ca78e
ON
166 if (pol)
167 return pol;
5606e387 168
f15ca78e
ON
169 node = numa_node_id();
170 if (node != NUMA_NO_NODE) {
171 pol = &preferred_node_policy[node];
172 /* preferred_node_policy is not initialised early in boot */
173 if (pol->mode)
174 return pol;
5606e387
MG
175 }
176
f15ca78e 177 return &default_policy;
5606e387
MG
178}
179
37012946
DR
180static const struct mempolicy_operations {
181 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
213980c0 182 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
37012946
DR
183} mpol_ops[MPOL_MAX];
184
f5b087b5
DR
185static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
186{
6d556294 187 return pol->flags & MPOL_MODE_FLAGS;
4c50bc01
DR
188}
189
190static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
191 const nodemask_t *rel)
192{
193 nodemask_t tmp;
194 nodes_fold(tmp, *orig, nodes_weight(*rel));
195 nodes_onto(*ret, tmp, *rel);
f5b087b5
DR
196}
197
be897d48 198static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
37012946
DR
199{
200 if (nodes_empty(*nodes))
201 return -EINVAL;
269fbe72 202 pol->nodes = *nodes;
37012946
DR
203 return 0;
204}
205
206static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
207{
7858d7bc
FT
208 if (nodes_empty(*nodes))
209 return -EINVAL;
269fbe72
BW
210
211 nodes_clear(pol->nodes);
212 node_set(first_node(*nodes), pol->nodes);
37012946
DR
213 return 0;
214}
215
58568d2a
MX
216/*
217 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
218 * any, for the new policy. mpol_new() has already validated the nodes
7858d7bc 219 * parameter with respect to the policy mode and flags.
58568d2a
MX
220 *
221 * Must be called holding task's alloc_lock to protect task's mems_allowed
c1e8d7c6 222 * and mempolicy. May also be called holding the mmap_lock for write.
58568d2a 223 */
4bfc4495
KH
224static int mpol_set_nodemask(struct mempolicy *pol,
225 const nodemask_t *nodes, struct nodemask_scratch *nsc)
58568d2a 226{
58568d2a
MX
227 int ret;
228
7858d7bc
FT
229 /*
230 * Default (pol==NULL) resp. local memory policies are not a
231 * subject of any remapping. They also do not need any special
232 * constructor.
233 */
234 if (!pol || pol->mode == MPOL_LOCAL)
58568d2a 235 return 0;
7858d7bc 236
01f13bd6 237 /* Check N_MEMORY */
4bfc4495 238 nodes_and(nsc->mask1,
01f13bd6 239 cpuset_current_mems_allowed, node_states[N_MEMORY]);
58568d2a
MX
240
241 VM_BUG_ON(!nodes);
4bfc4495 242
7858d7bc
FT
243 if (pol->flags & MPOL_F_RELATIVE_NODES)
244 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
245 else
246 nodes_and(nsc->mask2, *nodes, nsc->mask1);
58568d2a 247
7858d7bc
FT
248 if (mpol_store_user_nodemask(pol))
249 pol->w.user_nodemask = *nodes;
4bfc4495 250 else
7858d7bc
FT
251 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
252
253 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
58568d2a
MX
254 return ret;
255}
256
257/*
258 * This function just creates a new policy, does some check and simple
259 * initialization. You must invoke mpol_set_nodemask() to set nodes.
260 */
028fec41
DR
261static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
262 nodemask_t *nodes)
1da177e4
LT
263{
264 struct mempolicy *policy;
265
028fec41 266 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
00ef2d2f 267 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
140d5a49 268
3e1f0645
DR
269 if (mode == MPOL_DEFAULT) {
270 if (nodes && !nodes_empty(*nodes))
37012946 271 return ERR_PTR(-EINVAL);
d3a71033 272 return NULL;
37012946 273 }
3e1f0645
DR
274 VM_BUG_ON(!nodes);
275
276 /*
277 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
278 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
279 * All other modes require a valid pointer to a non-empty nodemask.
280 */
281 if (mode == MPOL_PREFERRED) {
282 if (nodes_empty(*nodes)) {
283 if (((flags & MPOL_F_STATIC_NODES) ||
284 (flags & MPOL_F_RELATIVE_NODES)))
285 return ERR_PTR(-EINVAL);
7858d7bc
FT
286
287 mode = MPOL_LOCAL;
3e1f0645 288 }
479e2802 289 } else if (mode == MPOL_LOCAL) {
8d303e44
PK
290 if (!nodes_empty(*nodes) ||
291 (flags & MPOL_F_STATIC_NODES) ||
292 (flags & MPOL_F_RELATIVE_NODES))
479e2802 293 return ERR_PTR(-EINVAL);
3e1f0645
DR
294 } else if (nodes_empty(*nodes))
295 return ERR_PTR(-EINVAL);
1da177e4
LT
296 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
297 if (!policy)
298 return ERR_PTR(-ENOMEM);
299 atomic_set(&policy->refcnt, 1);
45c4745a 300 policy->mode = mode;
3e1f0645 301 policy->flags = flags;
c6018b4b 302 policy->home_node = NUMA_NO_NODE;
37012946 303
1da177e4 304 return policy;
37012946
DR
305}
306
52cd3b07
LS
307/* Slow path of a mpol destructor. */
308void __mpol_put(struct mempolicy *p)
309{
310 if (!atomic_dec_and_test(&p->refcnt))
311 return;
52cd3b07
LS
312 kmem_cache_free(policy_cache, p);
313}
314
213980c0 315static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
37012946
DR
316{
317}
318
213980c0 319static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
37012946
DR
320{
321 nodemask_t tmp;
322
323 if (pol->flags & MPOL_F_STATIC_NODES)
324 nodes_and(tmp, pol->w.user_nodemask, *nodes);
325 else if (pol->flags & MPOL_F_RELATIVE_NODES)
326 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
327 else {
269fbe72 328 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
213980c0 329 *nodes);
29b190fa 330 pol->w.cpuset_mems_allowed = *nodes;
37012946 331 }
f5b087b5 332
708c1bbc
MX
333 if (nodes_empty(tmp))
334 tmp = *nodes;
335
269fbe72 336 pol->nodes = tmp;
37012946
DR
337}
338
339static void mpol_rebind_preferred(struct mempolicy *pol,
213980c0 340 const nodemask_t *nodes)
37012946 341{
7858d7bc 342 pol->w.cpuset_mems_allowed = *nodes;
1da177e4
LT
343}
344
708c1bbc
MX
345/*
346 * mpol_rebind_policy - Migrate a policy to a different set of nodes
347 *
c1e8d7c6 348 * Per-vma policies are protected by mmap_lock. Allocations using per-task
213980c0
VB
349 * policies are protected by task->mems_allowed_seq to prevent a premature
350 * OOM/allocation failure due to parallel nodemask modification.
708c1bbc 351 */
213980c0 352static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1d0d2680 353{
018160ad 354 if (!pol || pol->mode == MPOL_LOCAL)
1d0d2680 355 return;
7858d7bc 356 if (!mpol_store_user_nodemask(pol) &&
1d0d2680
DR
357 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
358 return;
708c1bbc 359
213980c0 360 mpol_ops[pol->mode].rebind(pol, newmask);
1d0d2680
DR
361}
362
363/*
364 * Wrapper for mpol_rebind_policy() that just requires task
365 * pointer, and updates task mempolicy.
58568d2a
MX
366 *
367 * Called with task's alloc_lock held.
1d0d2680
DR
368 */
369
213980c0 370void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1d0d2680 371{
213980c0 372 mpol_rebind_policy(tsk->mempolicy, new);
1d0d2680
DR
373}
374
375/*
376 * Rebind each vma in mm to new nodemask.
377 *
c1e8d7c6 378 * Call holding a reference to mm. Takes mm->mmap_lock during call.
1d0d2680
DR
379 */
380
381void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
382{
383 struct vm_area_struct *vma;
66850be5 384 VMA_ITERATOR(vmi, mm, 0);
1d0d2680 385
d8ed45c5 386 mmap_write_lock(mm);
66850be5 387 for_each_vma(vmi, vma)
213980c0 388 mpol_rebind_policy(vma->vm_policy, new);
d8ed45c5 389 mmap_write_unlock(mm);
1d0d2680
DR
390}
391
37012946
DR
392static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
393 [MPOL_DEFAULT] = {
394 .rebind = mpol_rebind_default,
395 },
396 [MPOL_INTERLEAVE] = {
be897d48 397 .create = mpol_new_nodemask,
37012946
DR
398 .rebind = mpol_rebind_nodemask,
399 },
400 [MPOL_PREFERRED] = {
401 .create = mpol_new_preferred,
402 .rebind = mpol_rebind_preferred,
403 },
404 [MPOL_BIND] = {
be897d48 405 .create = mpol_new_nodemask,
37012946
DR
406 .rebind = mpol_rebind_nodemask,
407 },
7858d7bc
FT
408 [MPOL_LOCAL] = {
409 .rebind = mpol_rebind_default,
410 },
b27abacc 411 [MPOL_PREFERRED_MANY] = {
be897d48 412 .create = mpol_new_nodemask,
b27abacc
DH
413 .rebind = mpol_rebind_preferred,
414 },
37012946
DR
415};
416
a53190a4 417static int migrate_page_add(struct page *page, struct list_head *pagelist,
fc301289 418 unsigned long flags);
1a75a6c8 419
6f4576e3
NH
420struct queue_pages {
421 struct list_head *pagelist;
422 unsigned long flags;
423 nodemask_t *nmask;
f18da660
LX
424 unsigned long start;
425 unsigned long end;
426 struct vm_area_struct *first;
6f4576e3
NH
427};
428
88aaa2a1
NH
429/*
430 * Check if the page's nid is in qp->nmask.
431 *
432 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
433 * in the invert of qp->nmask.
434 */
435static inline bool queue_pages_required(struct page *page,
436 struct queue_pages *qp)
437{
438 int nid = page_to_nid(page);
439 unsigned long flags = qp->flags;
440
441 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
442}
443
a7f40cfe 444/*
bc78b5ed 445 * queue_pages_pmd() has three possible return values:
e5947d23
YS
446 * 0 - pages are placed on the right node or queued successfully, or
447 * special page is met, i.e. huge zero page.
d8835445
YS
448 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
449 * specified.
d8835445
YS
450 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
451 * existing page was already on a node that does not follow the
452 * policy.
a7f40cfe 453 */
c8633798
NH
454static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
455 unsigned long end, struct mm_walk *walk)
959a7e13 456 __releases(ptl)
c8633798
NH
457{
458 int ret = 0;
459 struct page *page;
460 struct queue_pages *qp = walk->private;
461 unsigned long flags;
462
463 if (unlikely(is_pmd_migration_entry(*pmd))) {
a7f40cfe 464 ret = -EIO;
c8633798
NH
465 goto unlock;
466 }
467 page = pmd_page(*pmd);
468 if (is_huge_zero_page(page)) {
e5947d23 469 walk->action = ACTION_CONTINUE;
6d97cf88 470 goto unlock;
c8633798 471 }
d8835445 472 if (!queue_pages_required(page, qp))
c8633798 473 goto unlock;
c8633798 474
c8633798
NH
475 flags = qp->flags;
476 /* go to thp migration */
a7f40cfe 477 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
a53190a4
YS
478 if (!vma_migratable(walk->vma) ||
479 migrate_page_add(page, qp->pagelist, flags)) {
d8835445 480 ret = 1;
a7f40cfe
YS
481 goto unlock;
482 }
a7f40cfe
YS
483 } else
484 ret = -EIO;
c8633798
NH
485unlock:
486 spin_unlock(ptl);
c8633798
NH
487 return ret;
488}
489
98094945
NH
490/*
491 * Scan through pages checking if pages follow certain conditions,
492 * and move them to the pagelist if they do.
d8835445
YS
493 *
494 * queue_pages_pte_range() has three possible return values:
e5947d23
YS
495 * 0 - pages are placed on the right node or queued successfully, or
496 * special page is met, i.e. zero page.
d8835445
YS
497 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
498 * specified.
499 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
500 * on a node that does not follow the policy.
98094945 501 */
6f4576e3
NH
502static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
503 unsigned long end, struct mm_walk *walk)
1da177e4 504{
6f4576e3
NH
505 struct vm_area_struct *vma = walk->vma;
506 struct page *page;
507 struct queue_pages *qp = walk->private;
508 unsigned long flags = qp->flags;
d8835445 509 bool has_unmovable = false;
3f088420 510 pte_t *pte, *mapped_pte;
705e87c0 511 spinlock_t *ptl;
941150a3 512
c8633798 513 ptl = pmd_trans_huge_lock(pmd, vma);
bc78b5ed
ML
514 if (ptl)
515 return queue_pages_pmd(pmd, ptl, addr, end, walk);
91612e0d 516
337d9abf
NH
517 if (pmd_trans_unstable(pmd))
518 return 0;
94723aaf 519
3f088420 520 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
6f4576e3 521 for (; addr != end; pte++, addr += PAGE_SIZE) {
91612e0d 522 if (!pte_present(*pte))
1da177e4 523 continue;
6aab341e 524 page = vm_normal_page(vma, addr, *pte);
3218f871 525 if (!page || is_zone_device_page(page))
1da177e4 526 continue;
053837fc 527 /*
62b61f61
HD
528 * vm_normal_page() filters out zero pages, but there might
529 * still be PageReserved pages to skip, perhaps in a VDSO.
053837fc 530 */
b79bc0a0 531 if (PageReserved(page))
f4598c8b 532 continue;
88aaa2a1 533 if (!queue_pages_required(page, qp))
38e35860 534 continue;
a7f40cfe 535 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
d8835445
YS
536 /* MPOL_MF_STRICT must be specified if we get here */
537 if (!vma_migratable(vma)) {
538 has_unmovable = true;
a7f40cfe 539 break;
d8835445 540 }
a53190a4
YS
541
542 /*
543 * Do not abort immediately since there may be
544 * temporary off LRU pages in the range. Still
545 * need migrate other LRU pages.
546 */
547 if (migrate_page_add(page, qp->pagelist, flags))
548 has_unmovable = true;
a7f40cfe
YS
549 } else
550 break;
6f4576e3 551 }
3f088420 552 pte_unmap_unlock(mapped_pte, ptl);
6f4576e3 553 cond_resched();
d8835445
YS
554
555 if (has_unmovable)
556 return 1;
557
a7f40cfe 558 return addr != end ? -EIO : 0;
91612e0d
HD
559}
560
6f4576e3
NH
561static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
562 unsigned long addr, unsigned long end,
563 struct mm_walk *walk)
e2d8cf40 564{
dcf17635 565 int ret = 0;
e2d8cf40 566#ifdef CONFIG_HUGETLB_PAGE
6f4576e3 567 struct queue_pages *qp = walk->private;
dcf17635 568 unsigned long flags = (qp->flags & MPOL_MF_VALID);
e2d8cf40 569 struct page *page;
cb900f41 570 spinlock_t *ptl;
d4c54919 571 pte_t entry;
e2d8cf40 572
6f4576e3
NH
573 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
574 entry = huge_ptep_get(pte);
d4c54919
NH
575 if (!pte_present(entry))
576 goto unlock;
577 page = pte_page(entry);
88aaa2a1 578 if (!queue_pages_required(page, qp))
e2d8cf40 579 goto unlock;
dcf17635
LX
580
581 if (flags == MPOL_MF_STRICT) {
582 /*
583 * STRICT alone means only detecting misplaced page and no
584 * need to further check other vma.
585 */
586 ret = -EIO;
587 goto unlock;
588 }
589
590 if (!vma_migratable(walk->vma)) {
591 /*
592 * Must be STRICT with MOVE*, otherwise .test_walk() have
593 * stopped walking current vma.
594 * Detecting misplaced page but allow migrating pages which
595 * have been queued.
596 */
597 ret = 1;
598 goto unlock;
599 }
600
e2d8cf40
NH
601 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
602 if (flags & (MPOL_MF_MOVE_ALL) ||
dcf17635 603 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
7ce82f4c 604 if (isolate_hugetlb(page, qp->pagelist) &&
dcf17635
LX
605 (flags & MPOL_MF_STRICT))
606 /*
607 * Failed to isolate page but allow migrating pages
608 * which have been queued.
609 */
610 ret = 1;
611 }
e2d8cf40 612unlock:
cb900f41 613 spin_unlock(ptl);
e2d8cf40
NH
614#else
615 BUG();
616#endif
dcf17635 617 return ret;
1da177e4
LT
618}
619
5877231f 620#ifdef CONFIG_NUMA_BALANCING
b24f53a0 621/*
4b10e7d5
MG
622 * This is used to mark a range of virtual addresses to be inaccessible.
623 * These are later cleared by a NUMA hinting fault. Depending on these
624 * faults, pages may be migrated for better NUMA placement.
625 *
626 * This is assuming that NUMA faults are handled using PROT_NONE. If
627 * an architecture makes a different choice, it will need further
628 * changes to the core.
b24f53a0 629 */
4b10e7d5
MG
630unsigned long change_prot_numa(struct vm_area_struct *vma,
631 unsigned long addr, unsigned long end)
b24f53a0 632{
4a18419f 633 struct mmu_gather tlb;
4b10e7d5 634 int nr_updated;
b24f53a0 635
4a18419f
NA
636 tlb_gather_mmu(&tlb, vma->vm_mm);
637
638 nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE,
639 MM_CP_PROT_NUMA);
03c5a6e1
MG
640 if (nr_updated)
641 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
b24f53a0 642
4a18419f
NA
643 tlb_finish_mmu(&tlb);
644
4b10e7d5 645 return nr_updated;
b24f53a0
LS
646}
647#else
648static unsigned long change_prot_numa(struct vm_area_struct *vma,
649 unsigned long addr, unsigned long end)
650{
651 return 0;
652}
5877231f 653#endif /* CONFIG_NUMA_BALANCING */
b24f53a0 654
6f4576e3
NH
655static int queue_pages_test_walk(unsigned long start, unsigned long end,
656 struct mm_walk *walk)
657{
66850be5 658 struct vm_area_struct *next, *vma = walk->vma;
6f4576e3
NH
659 struct queue_pages *qp = walk->private;
660 unsigned long endvma = vma->vm_end;
661 unsigned long flags = qp->flags;
662
a18b3ac2 663 /* range check first */
ce33135c 664 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
f18da660
LX
665
666 if (!qp->first) {
667 qp->first = vma;
668 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
669 (qp->start < vma->vm_start))
670 /* hole at head side of range */
a18b3ac2
LX
671 return -EFAULT;
672 }
66850be5 673 next = find_vma(vma->vm_mm, vma->vm_end);
f18da660
LX
674 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
675 ((vma->vm_end < qp->end) &&
66850be5 676 (!next || vma->vm_end < next->vm_start)))
f18da660
LX
677 /* hole at middle or tail of range */
678 return -EFAULT;
a18b3ac2 679
a7f40cfe
YS
680 /*
681 * Need check MPOL_MF_STRICT to return -EIO if possible
682 * regardless of vma_migratable
683 */
684 if (!vma_migratable(vma) &&
685 !(flags & MPOL_MF_STRICT))
48684a65
NH
686 return 1;
687
6f4576e3
NH
688 if (endvma > end)
689 endvma = end;
6f4576e3 690
6f4576e3
NH
691 if (flags & MPOL_MF_LAZY) {
692 /* Similar to task_numa_work, skip inaccessible VMAs */
3122e80e 693 if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
4355c018 694 !(vma->vm_flags & VM_MIXEDMAP))
6f4576e3
NH
695 change_prot_numa(vma, start, endvma);
696 return 1;
697 }
698
77bf45e7 699 /* queue pages from current vma */
a7f40cfe 700 if (flags & MPOL_MF_VALID)
6f4576e3
NH
701 return 0;
702 return 1;
703}
704
7b86ac33
CH
705static const struct mm_walk_ops queue_pages_walk_ops = {
706 .hugetlb_entry = queue_pages_hugetlb,
707 .pmd_entry = queue_pages_pte_range,
708 .test_walk = queue_pages_test_walk,
709};
710
dc9aa5b9 711/*
98094945
NH
712 * Walk through page tables and collect pages to be migrated.
713 *
714 * If pages found in a given range are on a set of nodes (determined by
715 * @nodes and @flags,) it's isolated and queued to the pagelist which is
d8835445
YS
716 * passed via @private.
717 *
718 * queue_pages_range() has three possible return values:
719 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
720 * specified.
721 * 0 - queue pages successfully or no misplaced page.
a85dfc30
YS
722 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
723 * memory range specified by nodemask and maxnode points outside
724 * your accessible address space (-EFAULT)
dc9aa5b9 725 */
d05f0cdc 726static int
98094945 727queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
6f4576e3
NH
728 nodemask_t *nodes, unsigned long flags,
729 struct list_head *pagelist)
1da177e4 730{
f18da660 731 int err;
6f4576e3
NH
732 struct queue_pages qp = {
733 .pagelist = pagelist,
734 .flags = flags,
735 .nmask = nodes,
f18da660
LX
736 .start = start,
737 .end = end,
738 .first = NULL,
6f4576e3 739 };
6f4576e3 740
f18da660
LX
741 err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
742
743 if (!qp.first)
744 /* whole range in hole */
745 err = -EFAULT;
746
747 return err;
1da177e4
LT
748}
749
869833f2
KM
750/*
751 * Apply policy to a single VMA
c1e8d7c6 752 * This must be called with the mmap_lock held for writing.
869833f2
KM
753 */
754static int vma_replace_policy(struct vm_area_struct *vma,
755 struct mempolicy *pol)
8d34694c 756{
869833f2
KM
757 int err;
758 struct mempolicy *old;
759 struct mempolicy *new;
8d34694c
KM
760
761 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
762 vma->vm_start, vma->vm_end, vma->vm_pgoff,
763 vma->vm_ops, vma->vm_file,
764 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
765
869833f2
KM
766 new = mpol_dup(pol);
767 if (IS_ERR(new))
768 return PTR_ERR(new);
769
770 if (vma->vm_ops && vma->vm_ops->set_policy) {
8d34694c 771 err = vma->vm_ops->set_policy(vma, new);
869833f2
KM
772 if (err)
773 goto err_out;
8d34694c 774 }
869833f2
KM
775
776 old = vma->vm_policy;
c1e8d7c6 777 vma->vm_policy = new; /* protected by mmap_lock */
869833f2
KM
778 mpol_put(old);
779
780 return 0;
781 err_out:
782 mpol_put(new);
8d34694c
KM
783 return err;
784}
785
1da177e4 786/* Step 2: apply policy to a range and do splits. */
9d8cebd4
KM
787static int mbind_range(struct mm_struct *mm, unsigned long start,
788 unsigned long end, struct mempolicy *new_pol)
1da177e4 789{
7329e3eb 790 MA_STATE(mas, &mm->mm_mt, start, start);
9d8cebd4
KM
791 struct vm_area_struct *prev;
792 struct vm_area_struct *vma;
793 int err = 0;
e26a5114 794 pgoff_t pgoff;
9d8cebd4 795
7329e3eb
LH
796 prev = mas_prev(&mas, 0);
797 if (unlikely(!prev))
798 mas_set(&mas, start);
799
800 vma = mas_find(&mas, end - 1);
801 if (WARN_ON(!vma))
802 return 0;
803
804 if (start > vma->vm_start)
805 prev = vma;
e26a5114 806
66850be5
LH
807 for (; vma; vma = mas_next(&mas, end - 1)) {
808 unsigned long vmstart = max(start, vma->vm_start);
809 unsigned long vmend = min(end, vma->vm_end);
9d8cebd4 810
e26a5114 811 if (mpol_equal(vma_policy(vma), new_pol))
66850be5 812 goto next;
e26a5114
KM
813
814 pgoff = vma->vm_pgoff +
815 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
9d8cebd4 816 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
19a809af 817 vma->anon_vma, vma->vm_file, pgoff,
9a10064f 818 new_pol, vma->vm_userfaultfd_ctx,
5c26f6ac 819 anon_vma_name(vma));
9d8cebd4 820 if (prev) {
66850be5
LH
821 /* vma_merge() invalidated the mas */
822 mas_pause(&mas);
9d8cebd4 823 vma = prev;
3964acd0 824 goto replace;
9d8cebd4
KM
825 }
826 if (vma->vm_start != vmstart) {
827 err = split_vma(vma->vm_mm, vma, vmstart, 1);
828 if (err)
829 goto out;
66850be5
LH
830 /* split_vma() invalidated the mas */
831 mas_pause(&mas);
9d8cebd4
KM
832 }
833 if (vma->vm_end != vmend) {
834 err = split_vma(vma->vm_mm, vma, vmend, 0);
835 if (err)
836 goto out;
66850be5
LH
837 /* split_vma() invalidated the mas */
838 mas_pause(&mas);
9d8cebd4 839 }
66850be5 840replace:
869833f2 841 err = vma_replace_policy(vma, new_pol);
8d34694c
KM
842 if (err)
843 goto out;
66850be5
LH
844next:
845 prev = vma;
1da177e4 846 }
9d8cebd4 847
66850be5 848out:
1da177e4
LT
849 return err;
850}
851
1da177e4 852/* Set the process memory policy */
028fec41
DR
853static long do_set_mempolicy(unsigned short mode, unsigned short flags,
854 nodemask_t *nodes)
1da177e4 855{
58568d2a 856 struct mempolicy *new, *old;
4bfc4495 857 NODEMASK_SCRATCH(scratch);
58568d2a 858 int ret;
1da177e4 859
4bfc4495
KH
860 if (!scratch)
861 return -ENOMEM;
f4e53d91 862
4bfc4495
KH
863 new = mpol_new(mode, flags, nodes);
864 if (IS_ERR(new)) {
865 ret = PTR_ERR(new);
866 goto out;
867 }
2c7c3a7d 868
12c1dc8e 869 task_lock(current);
4bfc4495 870 ret = mpol_set_nodemask(new, nodes, scratch);
58568d2a 871 if (ret) {
12c1dc8e 872 task_unlock(current);
58568d2a 873 mpol_put(new);
4bfc4495 874 goto out;
58568d2a 875 }
12c1dc8e 876
58568d2a 877 old = current->mempolicy;
1da177e4 878 current->mempolicy = new;
45816682
VB
879 if (new && new->mode == MPOL_INTERLEAVE)
880 current->il_prev = MAX_NUMNODES-1;
58568d2a 881 task_unlock(current);
58568d2a 882 mpol_put(old);
4bfc4495
KH
883 ret = 0;
884out:
885 NODEMASK_SCRATCH_FREE(scratch);
886 return ret;
1da177e4
LT
887}
888
bea904d5
LS
889/*
890 * Return nodemask for policy for get_mempolicy() query
58568d2a
MX
891 *
892 * Called with task's alloc_lock held
bea904d5
LS
893 */
894static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
1da177e4 895{
dfcd3c0d 896 nodes_clear(*nodes);
bea904d5
LS
897 if (p == &default_policy)
898 return;
899
45c4745a 900 switch (p->mode) {
19770b32 901 case MPOL_BIND:
1da177e4 902 case MPOL_INTERLEAVE:
269fbe72 903 case MPOL_PREFERRED:
b27abacc 904 case MPOL_PREFERRED_MANY:
269fbe72 905 *nodes = p->nodes;
1da177e4 906 break;
7858d7bc
FT
907 case MPOL_LOCAL:
908 /* return empty node mask for local allocation */
909 break;
1da177e4
LT
910 default:
911 BUG();
912 }
913}
914
3b9aadf7 915static int lookup_node(struct mm_struct *mm, unsigned long addr)
1da177e4 916{
ba841078 917 struct page *p = NULL;
f728b9c4 918 int ret;
1da177e4 919
f728b9c4
JH
920 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
921 if (ret > 0) {
922 ret = page_to_nid(p);
1da177e4
LT
923 put_page(p);
924 }
f728b9c4 925 return ret;
1da177e4
LT
926}
927
1da177e4 928/* Retrieve NUMA policy */
dbcb0f19
AB
929static long do_get_mempolicy(int *policy, nodemask_t *nmask,
930 unsigned long addr, unsigned long flags)
1da177e4 931{
8bccd85f 932 int err;
1da177e4
LT
933 struct mm_struct *mm = current->mm;
934 struct vm_area_struct *vma = NULL;
3b9aadf7 935 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
1da177e4 936
754af6f5
LS
937 if (flags &
938 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
1da177e4 939 return -EINVAL;
754af6f5
LS
940
941 if (flags & MPOL_F_MEMS_ALLOWED) {
942 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
943 return -EINVAL;
944 *policy = 0; /* just so it's initialized */
58568d2a 945 task_lock(current);
754af6f5 946 *nmask = cpuset_current_mems_allowed;
58568d2a 947 task_unlock(current);
754af6f5
LS
948 return 0;
949 }
950
1da177e4 951 if (flags & MPOL_F_ADDR) {
bea904d5
LS
952 /*
953 * Do NOT fall back to task policy if the
954 * vma/shared policy at addr is NULL. We
955 * want to return MPOL_DEFAULT in this case.
956 */
d8ed45c5 957 mmap_read_lock(mm);
33e3575c 958 vma = vma_lookup(mm, addr);
1da177e4 959 if (!vma) {
d8ed45c5 960 mmap_read_unlock(mm);
1da177e4
LT
961 return -EFAULT;
962 }
963 if (vma->vm_ops && vma->vm_ops->get_policy)
964 pol = vma->vm_ops->get_policy(vma, addr);
965 else
966 pol = vma->vm_policy;
967 } else if (addr)
968 return -EINVAL;
969
970 if (!pol)
bea904d5 971 pol = &default_policy; /* indicates default behavior */
1da177e4
LT
972
973 if (flags & MPOL_F_NODE) {
974 if (flags & MPOL_F_ADDR) {
3b9aadf7 975 /*
f728b9c4
JH
976 * Take a refcount on the mpol, because we are about to
977 * drop the mmap_lock, after which only "pol" remains
978 * valid, "vma" is stale.
3b9aadf7
AA
979 */
980 pol_refcount = pol;
981 vma = NULL;
982 mpol_get(pol);
f728b9c4 983 mmap_read_unlock(mm);
3b9aadf7 984 err = lookup_node(mm, addr);
1da177e4
LT
985 if (err < 0)
986 goto out;
8bccd85f 987 *policy = err;
1da177e4 988 } else if (pol == current->mempolicy &&
45c4745a 989 pol->mode == MPOL_INTERLEAVE) {
269fbe72 990 *policy = next_node_in(current->il_prev, pol->nodes);
1da177e4
LT
991 } else {
992 err = -EINVAL;
993 goto out;
994 }
bea904d5
LS
995 } else {
996 *policy = pol == &default_policy ? MPOL_DEFAULT :
997 pol->mode;
d79df630
DR
998 /*
999 * Internal mempolicy flags must be masked off before exposing
1000 * the policy to userspace.
1001 */
1002 *policy |= (pol->flags & MPOL_MODE_FLAGS);
bea904d5 1003 }
1da177e4 1004
1da177e4 1005 err = 0;
58568d2a 1006 if (nmask) {
c6b6ef8b
LS
1007 if (mpol_store_user_nodemask(pol)) {
1008 *nmask = pol->w.user_nodemask;
1009 } else {
1010 task_lock(current);
1011 get_policy_nodemask(pol, nmask);
1012 task_unlock(current);
1013 }
58568d2a 1014 }
1da177e4
LT
1015
1016 out:
52cd3b07 1017 mpol_cond_put(pol);
1da177e4 1018 if (vma)
d8ed45c5 1019 mmap_read_unlock(mm);
3b9aadf7
AA
1020 if (pol_refcount)
1021 mpol_put(pol_refcount);
1da177e4
LT
1022 return err;
1023}
1024
b20a3503 1025#ifdef CONFIG_MIGRATION
6ce3c4c0 1026/*
c8633798 1027 * page migration, thp tail pages can be passed.
6ce3c4c0 1028 */
a53190a4 1029static int migrate_page_add(struct page *page, struct list_head *pagelist,
fc301289 1030 unsigned long flags)
6ce3c4c0 1031{
c8633798 1032 struct page *head = compound_head(page);
6ce3c4c0 1033 /*
fc301289 1034 * Avoid migrating a page that is shared with others.
6ce3c4c0 1035 */
c8633798
NH
1036 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1037 if (!isolate_lru_page(head)) {
1038 list_add_tail(&head->lru, pagelist);
1039 mod_node_page_state(page_pgdat(head),
9de4f22a 1040 NR_ISOLATED_ANON + page_is_file_lru(head),
6c357848 1041 thp_nr_pages(head));
a53190a4
YS
1042 } else if (flags & MPOL_MF_STRICT) {
1043 /*
1044 * Non-movable page may reach here. And, there may be
1045 * temporary off LRU pages or non-LRU movable pages.
1046 * Treat them as unmovable pages since they can't be
1047 * isolated, so they can't be moved at the moment. It
1048 * should return -EIO for this case too.
1049 */
1050 return -EIO;
62695a84
NP
1051 }
1052 }
a53190a4
YS
1053
1054 return 0;
7e2ab150 1055}
6ce3c4c0 1056
7e2ab150
CL
1057/*
1058 * Migrate pages from one node to a target node.
1059 * Returns error or the number of pages not migrated.
1060 */
dbcb0f19
AB
1061static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1062 int flags)
7e2ab150
CL
1063{
1064 nodemask_t nmask;
66850be5 1065 struct vm_area_struct *vma;
7e2ab150
CL
1066 LIST_HEAD(pagelist);
1067 int err = 0;
a0976311
JK
1068 struct migration_target_control mtc = {
1069 .nid = dest,
1070 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1071 };
7e2ab150
CL
1072
1073 nodes_clear(nmask);
1074 node_set(source, nmask);
6ce3c4c0 1075
08270807
MK
1076 /*
1077 * This does not "check" the range but isolates all pages that
1078 * need migration. Between passing in the full user address
1079 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1080 */
66850be5 1081 vma = find_vma(mm, 0);
08270807 1082 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
66850be5 1083 queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
7e2ab150
CL
1084 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1085
cf608ac1 1086 if (!list_empty(&pagelist)) {
a0976311 1087 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
5ac95884 1088 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
cf608ac1 1089 if (err)
e2d8cf40 1090 putback_movable_pages(&pagelist);
cf608ac1 1091 }
95a402c3 1092
7e2ab150 1093 return err;
6ce3c4c0
CL
1094}
1095
39743889 1096/*
7e2ab150
CL
1097 * Move pages between the two nodesets so as to preserve the physical
1098 * layout as much as possible.
39743889
CL
1099 *
1100 * Returns the number of page that could not be moved.
1101 */
0ce72d4f
AM
1102int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1103 const nodemask_t *to, int flags)
39743889 1104{
7e2ab150 1105 int busy = 0;
f555befd 1106 int err = 0;
7e2ab150 1107 nodemask_t tmp;
39743889 1108
361a2a22 1109 lru_cache_disable();
0aedadf9 1110
d8ed45c5 1111 mmap_read_lock(mm);
39743889 1112
da0aa138
KM
1113 /*
1114 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1115 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1116 * bit in 'tmp', and return that <source, dest> pair for migration.
1117 * The pair of nodemasks 'to' and 'from' define the map.
1118 *
1119 * If no pair of bits is found that way, fallback to picking some
1120 * pair of 'source' and 'dest' bits that are not the same. If the
1121 * 'source' and 'dest' bits are the same, this represents a node
1122 * that will be migrating to itself, so no pages need move.
1123 *
1124 * If no bits are left in 'tmp', or if all remaining bits left
1125 * in 'tmp' correspond to the same bit in 'to', return false
1126 * (nothing left to migrate).
1127 *
1128 * This lets us pick a pair of nodes to migrate between, such that
1129 * if possible the dest node is not already occupied by some other
1130 * source node, minimizing the risk of overloading the memory on a
1131 * node that would happen if we migrated incoming memory to a node
1132 * before migrating outgoing memory source that same node.
1133 *
1134 * A single scan of tmp is sufficient. As we go, we remember the
1135 * most recent <s, d> pair that moved (s != d). If we find a pair
1136 * that not only moved, but what's better, moved to an empty slot
1137 * (d is not set in tmp), then we break out then, with that pair.
ae0e47f0 1138 * Otherwise when we finish scanning from_tmp, we at least have the
da0aa138
KM
1139 * most recent <s, d> pair that moved. If we get all the way through
1140 * the scan of tmp without finding any node that moved, much less
1141 * moved to an empty node, then there is nothing left worth migrating.
1142 */
d4984711 1143
0ce72d4f 1144 tmp = *from;
7e2ab150 1145 while (!nodes_empty(tmp)) {
68d68ff6 1146 int s, d;
b76ac7e7 1147 int source = NUMA_NO_NODE;
7e2ab150
CL
1148 int dest = 0;
1149
1150 for_each_node_mask(s, tmp) {
4a5b18cc
LW
1151
1152 /*
1153 * do_migrate_pages() tries to maintain the relative
1154 * node relationship of the pages established between
1155 * threads and memory areas.
1156 *
1157 * However if the number of source nodes is not equal to
1158 * the number of destination nodes we can not preserve
1159 * this node relative relationship. In that case, skip
1160 * copying memory from a node that is in the destination
1161 * mask.
1162 *
1163 * Example: [2,3,4] -> [3,4,5] moves everything.
1164 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1165 */
1166
0ce72d4f
AM
1167 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1168 (node_isset(s, *to)))
4a5b18cc
LW
1169 continue;
1170
0ce72d4f 1171 d = node_remap(s, *from, *to);
7e2ab150
CL
1172 if (s == d)
1173 continue;
1174
1175 source = s; /* Node moved. Memorize */
1176 dest = d;
1177
1178 /* dest not in remaining from nodes? */
1179 if (!node_isset(dest, tmp))
1180 break;
1181 }
b76ac7e7 1182 if (source == NUMA_NO_NODE)
7e2ab150
CL
1183 break;
1184
1185 node_clear(source, tmp);
1186 err = migrate_to_node(mm, source, dest, flags);
1187 if (err > 0)
1188 busy += err;
1189 if (err < 0)
1190 break;
39743889 1191 }
d8ed45c5 1192 mmap_read_unlock(mm);
d479960e 1193
361a2a22 1194 lru_cache_enable();
7e2ab150
CL
1195 if (err < 0)
1196 return err;
1197 return busy;
b20a3503
CL
1198
1199}
1200
3ad33b24
LS
1201/*
1202 * Allocate a new page for page migration based on vma policy.
d05f0cdc 1203 * Start by assuming the page is mapped by the same vma as contains @start.
3ad33b24
LS
1204 * Search forward from there, if not. N.B., this assumes that the
1205 * list of pages handed to migrate_pages()--which is how we get here--
1206 * is in virtual address order.
1207 */
666feb21 1208static struct page *new_page(struct page *page, unsigned long start)
95a402c3 1209{
ec4858e0 1210 struct folio *dst, *src = page_folio(page);
d05f0cdc 1211 struct vm_area_struct *vma;
3f649ab7 1212 unsigned long address;
66850be5 1213 VMA_ITERATOR(vmi, current->mm, start);
ec4858e0 1214 gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
95a402c3 1215
66850be5 1216 for_each_vma(vmi, vma) {
3ad33b24
LS
1217 address = page_address_in_vma(page, vma);
1218 if (address != -EFAULT)
1219 break;
3ad33b24 1220 }
11c731e8 1221
ec4858e0
MWO
1222 if (folio_test_hugetlb(src))
1223 return alloc_huge_page_vma(page_hstate(&src->page),
389c8178 1224 vma, address);
ec4858e0
MWO
1225
1226 if (folio_test_large(src))
1227 gfp = GFP_TRANSHUGE;
1228
0bf598d8 1229 /*
ec4858e0 1230 * if !vma, vma_alloc_folio() will use task or system default policy
0bf598d8 1231 */
ec4858e0
MWO
1232 dst = vma_alloc_folio(gfp, folio_order(src), vma, address,
1233 folio_test_large(src));
1234 return &dst->page;
95a402c3 1235}
b20a3503
CL
1236#else
1237
a53190a4 1238static int migrate_page_add(struct page *page, struct list_head *pagelist,
b20a3503
CL
1239 unsigned long flags)
1240{
a53190a4 1241 return -EIO;
39743889
CL
1242}
1243
0ce72d4f
AM
1244int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1245 const nodemask_t *to, int flags)
b20a3503
CL
1246{
1247 return -ENOSYS;
1248}
95a402c3 1249
666feb21 1250static struct page *new_page(struct page *page, unsigned long start)
95a402c3
CL
1251{
1252 return NULL;
1253}
b20a3503
CL
1254#endif
1255
dbcb0f19 1256static long do_mbind(unsigned long start, unsigned long len,
028fec41
DR
1257 unsigned short mode, unsigned short mode_flags,
1258 nodemask_t *nmask, unsigned long flags)
6ce3c4c0 1259{
6ce3c4c0
CL
1260 struct mm_struct *mm = current->mm;
1261 struct mempolicy *new;
1262 unsigned long end;
1263 int err;
d8835445 1264 int ret;
6ce3c4c0
CL
1265 LIST_HEAD(pagelist);
1266
b24f53a0 1267 if (flags & ~(unsigned long)MPOL_MF_VALID)
6ce3c4c0 1268 return -EINVAL;
74c00241 1269 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
6ce3c4c0
CL
1270 return -EPERM;
1271
1272 if (start & ~PAGE_MASK)
1273 return -EINVAL;
1274
1275 if (mode == MPOL_DEFAULT)
1276 flags &= ~MPOL_MF_STRICT;
1277
aaa31e05 1278 len = PAGE_ALIGN(len);
6ce3c4c0
CL
1279 end = start + len;
1280
1281 if (end < start)
1282 return -EINVAL;
1283 if (end == start)
1284 return 0;
1285
028fec41 1286 new = mpol_new(mode, mode_flags, nmask);
6ce3c4c0
CL
1287 if (IS_ERR(new))
1288 return PTR_ERR(new);
1289
b24f53a0
LS
1290 if (flags & MPOL_MF_LAZY)
1291 new->flags |= MPOL_F_MOF;
1292
6ce3c4c0
CL
1293 /*
1294 * If we are using the default policy then operation
1295 * on discontinuous address spaces is okay after all
1296 */
1297 if (!new)
1298 flags |= MPOL_MF_DISCONTIG_OK;
1299
028fec41
DR
1300 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1301 start, start + len, mode, mode_flags,
00ef2d2f 1302 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
6ce3c4c0 1303
0aedadf9
CL
1304 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1305
361a2a22 1306 lru_cache_disable();
0aedadf9 1307 }
4bfc4495
KH
1308 {
1309 NODEMASK_SCRATCH(scratch);
1310 if (scratch) {
d8ed45c5 1311 mmap_write_lock(mm);
4bfc4495 1312 err = mpol_set_nodemask(new, nmask, scratch);
4bfc4495 1313 if (err)
d8ed45c5 1314 mmap_write_unlock(mm);
4bfc4495
KH
1315 } else
1316 err = -ENOMEM;
1317 NODEMASK_SCRATCH_FREE(scratch);
1318 }
b05ca738
KM
1319 if (err)
1320 goto mpol_out;
1321
d8835445 1322 ret = queue_pages_range(mm, start, end, nmask,
6ce3c4c0 1323 flags | MPOL_MF_INVERT, &pagelist);
d8835445
YS
1324
1325 if (ret < 0) {
a85dfc30 1326 err = ret;
d8835445
YS
1327 goto up_out;
1328 }
1329
1330 err = mbind_range(mm, start, end, new);
7e2ab150 1331
b24f53a0
LS
1332 if (!err) {
1333 int nr_failed = 0;
1334
cf608ac1 1335 if (!list_empty(&pagelist)) {
b24f53a0 1336 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
d05f0cdc 1337 nr_failed = migrate_pages(&pagelist, new_page, NULL,
5ac95884 1338 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
cf608ac1 1339 if (nr_failed)
74060e4d 1340 putback_movable_pages(&pagelist);
cf608ac1 1341 }
6ce3c4c0 1342
d8835445 1343 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
6ce3c4c0 1344 err = -EIO;
a85dfc30 1345 } else {
d8835445 1346up_out:
a85dfc30
YS
1347 if (!list_empty(&pagelist))
1348 putback_movable_pages(&pagelist);
1349 }
1350
d8ed45c5 1351 mmap_write_unlock(mm);
d8835445 1352mpol_out:
f0be3d32 1353 mpol_put(new);
d479960e 1354 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
361a2a22 1355 lru_cache_enable();
6ce3c4c0
CL
1356 return err;
1357}
1358
8bccd85f
CL
1359/*
1360 * User space interface with variable sized bitmaps for nodelists.
1361 */
e130242d
AB
1362static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1363 unsigned long maxnode)
1364{
1365 unsigned long nlongs = BITS_TO_LONGS(maxnode);
1366 int ret;
1367
1368 if (in_compat_syscall())
1369 ret = compat_get_bitmap(mask,
1370 (const compat_ulong_t __user *)nmask,
1371 maxnode);
1372 else
1373 ret = copy_from_user(mask, nmask,
1374 nlongs * sizeof(unsigned long));
1375
1376 if (ret)
1377 return -EFAULT;
1378
1379 if (maxnode % BITS_PER_LONG)
1380 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1381
1382 return 0;
1383}
8bccd85f
CL
1384
1385/* Copy a node mask from user space. */
39743889 1386static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
8bccd85f
CL
1387 unsigned long maxnode)
1388{
8bccd85f
CL
1389 --maxnode;
1390 nodes_clear(*nodes);
1391 if (maxnode == 0 || !nmask)
1392 return 0;
a9c930ba 1393 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
636f13c1 1394 return -EINVAL;
8bccd85f 1395
56521e7a
YX
1396 /*
1397 * When the user specified more nodes than supported just check
e130242d
AB
1398 * if the non supported part is all zero, one word at a time,
1399 * starting at the end.
56521e7a 1400 */
e130242d
AB
1401 while (maxnode > MAX_NUMNODES) {
1402 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1403 unsigned long t;
8bccd85f 1404
000eca5d 1405 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
56521e7a 1406 return -EFAULT;
e130242d
AB
1407
1408 if (maxnode - bits >= MAX_NUMNODES) {
1409 maxnode -= bits;
1410 } else {
1411 maxnode = MAX_NUMNODES;
1412 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1413 }
1414 if (t)
56521e7a
YX
1415 return -EINVAL;
1416 }
1417
e130242d 1418 return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
8bccd85f
CL
1419}
1420
1421/* Copy a kernel node mask to user space */
1422static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1423 nodemask_t *nodes)
1424{
1425 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
050c17f2 1426 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
e130242d
AB
1427 bool compat = in_compat_syscall();
1428
1429 if (compat)
1430 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
8bccd85f
CL
1431
1432 if (copy > nbytes) {
1433 if (copy > PAGE_SIZE)
1434 return -EINVAL;
1435 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1436 return -EFAULT;
1437 copy = nbytes;
e130242d 1438 maxnode = nr_node_ids;
8bccd85f 1439 }
e130242d
AB
1440
1441 if (compat)
1442 return compat_put_bitmap((compat_ulong_t __user *)mask,
1443 nodes_addr(*nodes), maxnode);
1444
8bccd85f
CL
1445 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1446}
1447
95837924
FT
1448/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1449static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1450{
1451 *flags = *mode & MPOL_MODE_FLAGS;
1452 *mode &= ~MPOL_MODE_FLAGS;
b27abacc 1453
a38a59fd 1454 if ((unsigned int)(*mode) >= MPOL_MAX)
95837924
FT
1455 return -EINVAL;
1456 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1457 return -EINVAL;
6d2aec9e
ED
1458 if (*flags & MPOL_F_NUMA_BALANCING) {
1459 if (*mode != MPOL_BIND)
1460 return -EINVAL;
1461 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1462 }
95837924
FT
1463 return 0;
1464}
1465
e7dc9ad6
DB
1466static long kernel_mbind(unsigned long start, unsigned long len,
1467 unsigned long mode, const unsigned long __user *nmask,
1468 unsigned long maxnode, unsigned int flags)
8bccd85f 1469{
95837924 1470 unsigned short mode_flags;
8bccd85f 1471 nodemask_t nodes;
95837924 1472 int lmode = mode;
8bccd85f
CL
1473 int err;
1474
057d3389 1475 start = untagged_addr(start);
95837924
FT
1476 err = sanitize_mpol_flags(&lmode, &mode_flags);
1477 if (err)
1478 return err;
1479
8bccd85f
CL
1480 err = get_nodes(&nodes, nmask, maxnode);
1481 if (err)
1482 return err;
95837924
FT
1483
1484 return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
8bccd85f
CL
1485}
1486
c6018b4b
AK
1487SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1488 unsigned long, home_node, unsigned long, flags)
1489{
1490 struct mm_struct *mm = current->mm;
1491 struct vm_area_struct *vma;
1492 struct mempolicy *new;
1493 unsigned long vmstart;
1494 unsigned long vmend;
1495 unsigned long end;
1496 int err = -ENOENT;
66850be5 1497 VMA_ITERATOR(vmi, mm, start);
c6018b4b
AK
1498
1499 start = untagged_addr(start);
1500 if (start & ~PAGE_MASK)
1501 return -EINVAL;
1502 /*
1503 * flags is used for future extension if any.
1504 */
1505 if (flags != 0)
1506 return -EINVAL;
1507
1508 /*
1509 * Check home_node is online to avoid accessing uninitialized
1510 * NODE_DATA.
1511 */
1512 if (home_node >= MAX_NUMNODES || !node_online(home_node))
1513 return -EINVAL;
1514
aaa31e05 1515 len = PAGE_ALIGN(len);
c6018b4b
AK
1516 end = start + len;
1517
1518 if (end < start)
1519 return -EINVAL;
1520 if (end == start)
1521 return 0;
1522 mmap_write_lock(mm);
66850be5 1523 for_each_vma_range(vmi, vma, end) {
c6018b4b
AK
1524 vmstart = max(start, vma->vm_start);
1525 vmend = min(end, vma->vm_end);
1526 new = mpol_dup(vma_policy(vma));
1527 if (IS_ERR(new)) {
1528 err = PTR_ERR(new);
1529 break;
1530 }
1531 /*
1532 * Only update home node if there is an existing vma policy
1533 */
1534 if (!new)
1535 continue;
1536
1537 /*
1538 * If any vma in the range got policy other than MPOL_BIND
1539 * or MPOL_PREFERRED_MANY we return error. We don't reset
1540 * the home node for vmas we already updated before.
1541 */
1542 if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) {
1543 err = -EOPNOTSUPP;
1544 break;
1545 }
1546
1547 new->home_node = home_node;
1548 err = mbind_range(mm, vmstart, vmend, new);
1549 mpol_put(new);
1550 if (err)
1551 break;
1552 }
1553 mmap_write_unlock(mm);
1554 return err;
1555}
1556
e7dc9ad6
DB
1557SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1558 unsigned long, mode, const unsigned long __user *, nmask,
1559 unsigned long, maxnode, unsigned int, flags)
1560{
1561 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1562}
1563
8bccd85f 1564/* Set the process memory policy */
af03c4ac
DB
1565static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1566 unsigned long maxnode)
8bccd85f 1567{
95837924 1568 unsigned short mode_flags;
8bccd85f 1569 nodemask_t nodes;
95837924
FT
1570 int lmode = mode;
1571 int err;
1572
1573 err = sanitize_mpol_flags(&lmode, &mode_flags);
1574 if (err)
1575 return err;
8bccd85f 1576
8bccd85f
CL
1577 err = get_nodes(&nodes, nmask, maxnode);
1578 if (err)
1579 return err;
95837924
FT
1580
1581 return do_set_mempolicy(lmode, mode_flags, &nodes);
8bccd85f
CL
1582}
1583
af03c4ac
DB
1584SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1585 unsigned long, maxnode)
1586{
1587 return kernel_set_mempolicy(mode, nmask, maxnode);
1588}
1589
b6e9b0ba
DB
1590static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1591 const unsigned long __user *old_nodes,
1592 const unsigned long __user *new_nodes)
39743889 1593{
596d7cfa 1594 struct mm_struct *mm = NULL;
39743889 1595 struct task_struct *task;
39743889
CL
1596 nodemask_t task_nodes;
1597 int err;
596d7cfa
KM
1598 nodemask_t *old;
1599 nodemask_t *new;
1600 NODEMASK_SCRATCH(scratch);
1601
1602 if (!scratch)
1603 return -ENOMEM;
39743889 1604
596d7cfa
KM
1605 old = &scratch->mask1;
1606 new = &scratch->mask2;
1607
1608 err = get_nodes(old, old_nodes, maxnode);
39743889 1609 if (err)
596d7cfa 1610 goto out;
39743889 1611
596d7cfa 1612 err = get_nodes(new, new_nodes, maxnode);
39743889 1613 if (err)
596d7cfa 1614 goto out;
39743889
CL
1615
1616 /* Find the mm_struct */
55cfaa3c 1617 rcu_read_lock();
228ebcbe 1618 task = pid ? find_task_by_vpid(pid) : current;
39743889 1619 if (!task) {
55cfaa3c 1620 rcu_read_unlock();
596d7cfa
KM
1621 err = -ESRCH;
1622 goto out;
39743889 1623 }
3268c63e 1624 get_task_struct(task);
39743889 1625
596d7cfa 1626 err = -EINVAL;
39743889
CL
1627
1628 /*
31367466
OE
1629 * Check if this process has the right to modify the specified process.
1630 * Use the regular "ptrace_may_access()" checks.
39743889 1631 */
31367466 1632 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
c69e8d9c 1633 rcu_read_unlock();
39743889 1634 err = -EPERM;
3268c63e 1635 goto out_put;
39743889 1636 }
c69e8d9c 1637 rcu_read_unlock();
39743889
CL
1638
1639 task_nodes = cpuset_mems_allowed(task);
1640 /* Is the user allowed to access the target nodes? */
596d7cfa 1641 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
39743889 1642 err = -EPERM;
3268c63e 1643 goto out_put;
39743889
CL
1644 }
1645
0486a38b
YX
1646 task_nodes = cpuset_mems_allowed(current);
1647 nodes_and(*new, *new, task_nodes);
1648 if (nodes_empty(*new))
1649 goto out_put;
1650
86c3a764
DQ
1651 err = security_task_movememory(task);
1652 if (err)
3268c63e 1653 goto out_put;
86c3a764 1654
3268c63e
CL
1655 mm = get_task_mm(task);
1656 put_task_struct(task);
f2a9ef88
SL
1657
1658 if (!mm) {
3268c63e 1659 err = -EINVAL;
f2a9ef88
SL
1660 goto out;
1661 }
1662
1663 err = do_migrate_pages(mm, old, new,
1664 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
3268c63e
CL
1665
1666 mmput(mm);
1667out:
596d7cfa
KM
1668 NODEMASK_SCRATCH_FREE(scratch);
1669
39743889 1670 return err;
3268c63e
CL
1671
1672out_put:
1673 put_task_struct(task);
1674 goto out;
1675
39743889
CL
1676}
1677
b6e9b0ba
DB
1678SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1679 const unsigned long __user *, old_nodes,
1680 const unsigned long __user *, new_nodes)
1681{
1682 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1683}
1684
39743889 1685
8bccd85f 1686/* Retrieve NUMA policy */
af03c4ac
DB
1687static int kernel_get_mempolicy(int __user *policy,
1688 unsigned long __user *nmask,
1689 unsigned long maxnode,
1690 unsigned long addr,
1691 unsigned long flags)
8bccd85f 1692{
dbcb0f19 1693 int err;
3f649ab7 1694 int pval;
8bccd85f
CL
1695 nodemask_t nodes;
1696
050c17f2 1697 if (nmask != NULL && maxnode < nr_node_ids)
8bccd85f
CL
1698 return -EINVAL;
1699
4605f057
WH
1700 addr = untagged_addr(addr);
1701
8bccd85f
CL
1702 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1703
1704 if (err)
1705 return err;
1706
1707 if (policy && put_user(pval, policy))
1708 return -EFAULT;
1709
1710 if (nmask)
1711 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1712
1713 return err;
1714}
1715
af03c4ac
DB
1716SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1717 unsigned long __user *, nmask, unsigned long, maxnode,
1718 unsigned long, addr, unsigned long, flags)
1719{
1720 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1721}
1722
20ca87f2
LX
1723bool vma_migratable(struct vm_area_struct *vma)
1724{
1725 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1726 return false;
1727
1728 /*
1729 * DAX device mappings require predictable access latency, so avoid
1730 * incurring periodic faults.
1731 */
1732 if (vma_is_dax(vma))
1733 return false;
1734
1735 if (is_vm_hugetlb_page(vma) &&
1736 !hugepage_migration_supported(hstate_vma(vma)))
1737 return false;
1738
1739 /*
1740 * Migration allocates pages in the highest zone. If we cannot
1741 * do so then migration (at least from node to node) is not
1742 * possible.
1743 */
1744 if (vma->vm_file &&
1745 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1746 < policy_zone)
1747 return false;
1748 return true;
1749}
1750
74d2c3a0
ON
1751struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1752 unsigned long addr)
1da177e4 1753{
8d90274b 1754 struct mempolicy *pol = NULL;
1da177e4
LT
1755
1756 if (vma) {
480eccf9 1757 if (vma->vm_ops && vma->vm_ops->get_policy) {
8d90274b 1758 pol = vma->vm_ops->get_policy(vma, addr);
00442ad0 1759 } else if (vma->vm_policy) {
1da177e4 1760 pol = vma->vm_policy;
00442ad0
MG
1761
1762 /*
1763 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1764 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1765 * count on these policies which will be dropped by
1766 * mpol_cond_put() later
1767 */
1768 if (mpol_needs_cond_ref(pol))
1769 mpol_get(pol);
1770 }
1da177e4 1771 }
f15ca78e 1772
74d2c3a0
ON
1773 return pol;
1774}
1775
1776/*
dd6eecb9 1777 * get_vma_policy(@vma, @addr)
74d2c3a0
ON
1778 * @vma: virtual memory area whose policy is sought
1779 * @addr: address in @vma for shared policy lookup
1780 *
1781 * Returns effective policy for a VMA at specified address.
dd6eecb9 1782 * Falls back to current->mempolicy or system default policy, as necessary.
74d2c3a0
ON
1783 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1784 * count--added by the get_policy() vm_op, as appropriate--to protect against
1785 * freeing by another task. It is the caller's responsibility to free the
1786 * extra reference for shared policies.
1787 */
ac79f78d 1788static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
dd6eecb9 1789 unsigned long addr)
74d2c3a0
ON
1790{
1791 struct mempolicy *pol = __get_vma_policy(vma, addr);
1792
8d90274b 1793 if (!pol)
dd6eecb9 1794 pol = get_task_policy(current);
8d90274b 1795
1da177e4
LT
1796 return pol;
1797}
1798
6b6482bb 1799bool vma_policy_mof(struct vm_area_struct *vma)
fc314724 1800{
6b6482bb 1801 struct mempolicy *pol;
fc314724 1802
6b6482bb
ON
1803 if (vma->vm_ops && vma->vm_ops->get_policy) {
1804 bool ret = false;
fc314724 1805
6b6482bb
ON
1806 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1807 if (pol && (pol->flags & MPOL_F_MOF))
1808 ret = true;
1809 mpol_cond_put(pol);
8d90274b 1810
6b6482bb 1811 return ret;
fc314724
MG
1812 }
1813
6b6482bb 1814 pol = vma->vm_policy;
8d90274b 1815 if (!pol)
6b6482bb 1816 pol = get_task_policy(current);
8d90274b 1817
fc314724
MG
1818 return pol->flags & MPOL_F_MOF;
1819}
1820
d2226ebd 1821bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
d3eb1570
LJ
1822{
1823 enum zone_type dynamic_policy_zone = policy_zone;
1824
1825 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1826
1827 /*
269fbe72 1828 * if policy->nodes has movable memory only,
d3eb1570
LJ
1829 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1830 *
269fbe72 1831	 * policy->nodes intersects with node_states[N_MEMORY],
f0953a1b 1832	 * so if the following test fails, it implies that
269fbe72 1833	 * policy->nodes has movable memory only.
d3eb1570 1834 */
269fbe72 1835 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
d3eb1570
LJ
1836 dynamic_policy_zone = ZONE_MOVABLE;
1837
1838 return zone >= dynamic_policy_zone;
1839}
1840
52cd3b07
LS
1841/*
1842 * Return a nodemask representing a mempolicy for filtering nodes for
1843 * page allocation
1844 */
8ca39e68 1845nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
19770b32 1846{
b27abacc
DH
1847 int mode = policy->mode;
1848
19770b32 1849 /* Lower zones don't get a nodemask applied for MPOL_BIND */
b27abacc
DH
1850 if (unlikely(mode == MPOL_BIND) &&
1851 apply_policy_zone(policy, gfp_zone(gfp)) &&
1852 cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1853 return &policy->nodes;
1854
1855 if (mode == MPOL_PREFERRED_MANY)
269fbe72 1856 return &policy->nodes;
19770b32
MG
1857
1858 return NULL;
1859}
1860
b27abacc
DH
1861/*
1862 * Return the preferred node id for 'prefer' mempolicy, and return
1863 * the given id for all other policies.
1864 *
1865 * policy_node() is always coupled with policy_nodemask(), which
1866 * secures the nodemask limit for 'bind' and 'prefer-many' policy.
1867 */
f8fd5253 1868static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1da177e4 1869{
7858d7bc 1870 if (policy->mode == MPOL_PREFERRED) {
269fbe72 1871 nd = first_node(policy->nodes);
7858d7bc 1872 } else {
19770b32 1873 /*
6d840958
MH
1874 * __GFP_THISNODE shouldn't even be used with the bind policy
 1875		 * because we might easily break the expectation to stay on the
 1876		 * requested node while still honouring the policy.
19770b32 1877 */
6d840958 1878 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1da177e4 1879 }
6d840958 1880
c6018b4b
AK
1881 if ((policy->mode == MPOL_BIND ||
1882 policy->mode == MPOL_PREFERRED_MANY) &&
1883 policy->home_node != NUMA_NO_NODE)
1884 return policy->home_node;
1885
04ec6264 1886 return nd;
1da177e4
LT
1887}
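/*
 * Illustrative examples (values assumed): for MPOL_PREFERRED over {2},
 * policy_node(gfp, pol, 0) returns 2 regardless of the passed nd; for
 * MPOL_BIND over {1,3} with home_node == 3 (e.g. set via
 * set_mempolicy_home_node(2)), it returns 3; otherwise the caller's nd
 * (typically numa_node_id()) is returned and the nodemask obtained from
 * policy_nodemask() does the actual restricting.
 */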
1888
1889/* Do dynamic interleaving for a process */
1890static unsigned interleave_nodes(struct mempolicy *policy)
1891{
45816682 1892 unsigned next;
1da177e4
LT
1893 struct task_struct *me = current;
1894
269fbe72 1895 next = next_node_in(me->il_prev, policy->nodes);
f5b087b5 1896 if (next < MAX_NUMNODES)
45816682
VB
1897 me->il_prev = next;
1898 return next;
1da177e4
LT
1899}
1900
dc85da15
CL
1901/*
 1902 * Depending on the memory policy, provide a node from which to allocate the
1903 * next slab entry.
1904 */
2a389610 1905unsigned int mempolicy_slab_node(void)
dc85da15 1906{
e7b691b0 1907 struct mempolicy *policy;
2a389610 1908 int node = numa_mem_id();
e7b691b0 1909
38b031dd 1910 if (!in_task())
2a389610 1911 return node;
e7b691b0
AK
1912
1913 policy = current->mempolicy;
7858d7bc 1914 if (!policy)
2a389610 1915 return node;
bea904d5
LS
1916
1917 switch (policy->mode) {
1918 case MPOL_PREFERRED:
269fbe72 1919 return first_node(policy->nodes);
765c4507 1920
dc85da15
CL
1921 case MPOL_INTERLEAVE:
1922 return interleave_nodes(policy);
1923
b27abacc
DH
1924 case MPOL_BIND:
1925 case MPOL_PREFERRED_MANY:
1926 {
c33d6c06
MG
1927 struct zoneref *z;
1928
dc85da15
CL
1929 /*
1930 * Follow bind policy behavior and start allocation at the
1931 * first node.
1932 */
19770b32 1933 struct zonelist *zonelist;
19770b32 1934 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
c9634cf0 1935 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
c33d6c06 1936 z = first_zones_zonelist(zonelist, highest_zoneidx,
269fbe72 1937 &policy->nodes);
c1093b74 1938 return z->zone ? zone_to_nid(z->zone) : node;
dd1a239f 1939 }
7858d7bc
FT
1940 case MPOL_LOCAL:
1941 return node;
dc85da15 1942
dc85da15 1943 default:
bea904d5 1944 BUG();
dc85da15
CL
1945 }
1946}
1947
fee83b3a
AM
1948/*
1949 * Do static interleaving for a VMA with known offset @n. Returns the n'th
269fbe72 1950 * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
fee83b3a
AM
1951 * number of present nodes.
1952 */
98c70baa 1953static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1da177e4 1954{
276aeee1 1955 nodemask_t nodemask = pol->nodes;
1956 unsigned int target, nnodes;
fee83b3a
AM
1957 int i;
1958 int nid;
276aeee1 1959 /*
1960 * The barrier will stabilize the nodemask in a register or on
1961 * the stack so that it will stop changing under the code.
1962 *
1963 * Between first_node() and next_node(), pol->nodes could be changed
 1964	 * by other threads. So we copy pol->nodes into a local variable on the stack.
1965 */
1966 barrier();
1da177e4 1967
276aeee1 1968 nnodes = nodes_weight(nodemask);
f5b087b5
DR
1969 if (!nnodes)
1970 return numa_node_id();
fee83b3a 1971 target = (unsigned int)n % nnodes;
276aeee1 1972 nid = first_node(nodemask);
fee83b3a 1973 for (i = 0; i < target; i++)
276aeee1 1974 nid = next_node(nid, nodemask);
1da177e4
LT
1975 return nid;
1976}
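/*
 * Worked example (values assumed): with pol->nodes = {0,2,5} and n = 7,
 * nnodes = 3 and target = 7 % 3 = 1, so we start at node 0 and advance
 * once, returning node 2.  For n = 9 the remainder is 0 and the result
 * wraps back to node 0.
 */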
1977
5da7ca86
CL
1978/* Determine a node number for interleave */
1979static inline unsigned interleave_nid(struct mempolicy *pol,
1980 struct vm_area_struct *vma, unsigned long addr, int shift)
1981{
1982 if (vma) {
1983 unsigned long off;
1984
3b98b087
NA
1985 /*
1986 * for small pages, there is no difference between
1987 * shift and PAGE_SHIFT, so the bit-shift is safe.
1988 * for huge pages, since vm_pgoff is in units of small
1989 * pages, we need to shift off the always 0 bits to get
1990 * a useful offset.
1991 */
1992 BUG_ON(shift < PAGE_SHIFT);
1993 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
5da7ca86 1994 off += (addr - vma->vm_start) >> shift;
98c70baa 1995 return offset_il_node(pol, off);
5da7ca86
CL
1996 } else
1997 return interleave_nodes(pol);
1998}
1999
00ac59ad 2000#ifdef CONFIG_HUGETLBFS
480eccf9 2001/*
04ec6264 2002 * huge_node(@vma, @addr, @gfp_flags, @mpol)
b46e14ac
FF
2003 * @vma: virtual memory area whose policy is sought
2004 * @addr: address in @vma for shared policy lookup and interleave policy
2005 * @gfp_flags: for requested zone
2006 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
b27abacc 2007 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
480eccf9 2008 *
04ec6264 2009 * Returns a nid suitable for a huge page allocation and a pointer
52cd3b07 2010 * to the struct mempolicy for conditional unref after allocation.
b27abacc
DH
2011 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2012 * to the mempolicy's @nodemask for filtering the zonelist.
c0ff7453 2013 *
d26914d1 2014 * Must be protected by read_mems_allowed_begin()
480eccf9 2015 */
04ec6264
VB
2016int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2017 struct mempolicy **mpol, nodemask_t **nodemask)
5da7ca86 2018{
04ec6264 2019 int nid;
b27abacc 2020 int mode;
5da7ca86 2021
dd6eecb9 2022 *mpol = get_vma_policy(vma, addr);
b27abacc
DH
2023 *nodemask = NULL;
2024 mode = (*mpol)->mode;
5da7ca86 2025
b27abacc 2026 if (unlikely(mode == MPOL_INTERLEAVE)) {
04ec6264
VB
2027 nid = interleave_nid(*mpol, vma, addr,
2028 huge_page_shift(hstate_vma(vma)));
52cd3b07 2029 } else {
04ec6264 2030 nid = policy_node(gfp_flags, *mpol, numa_node_id());
b27abacc 2031 if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
269fbe72 2032 *nodemask = &(*mpol)->nodes;
480eccf9 2033 }
04ec6264 2034 return nid;
5da7ca86 2035}
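/*
 * Hedged caller sketch (modelled on the hugetlb dequeue path; gfp_mask
 * and the surrounding locking are assumed): the mempolicy returned through
 * @mpol must be dropped with mpol_cond_put() once the allocation is done.
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	int nid;
 *
 *	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
 *	// ... allocate/dequeue a huge folio from @nid, filtered by @nodemask ...
 *	mpol_cond_put(mpol);
 */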
06808b08
LS
2036
2037/*
2038 * init_nodemask_of_mempolicy
2039 *
2040 * If the current task's mempolicy is "default" [NULL], return 'false'
2041 * to indicate default policy. Otherwise, extract the policy nodemask
2042 * for 'bind' or 'interleave' policy into the argument nodemask, or
2043 * initialize the argument nodemask to contain the single node for
2044 * 'preferred' or 'local' policy and return 'true' to indicate presence
2045 * of non-default mempolicy.
2046 *
2047 * We don't bother with reference counting the mempolicy [mpol_get/put]
 2048 * because the current task is examining its own mempolicy and a task's
2049 * mempolicy is only ever changed by the task itself.
2050 *
2051 * N.B., it is the caller's responsibility to free a returned nodemask.
2052 */
2053bool init_nodemask_of_mempolicy(nodemask_t *mask)
2054{
2055 struct mempolicy *mempolicy;
06808b08
LS
2056
2057 if (!(mask && current->mempolicy))
2058 return false;
2059
c0ff7453 2060 task_lock(current);
06808b08
LS
2061 mempolicy = current->mempolicy;
2062 switch (mempolicy->mode) {
2063 case MPOL_PREFERRED:
b27abacc 2064 case MPOL_PREFERRED_MANY:
06808b08 2065 case MPOL_BIND:
06808b08 2066 case MPOL_INTERLEAVE:
269fbe72 2067 *mask = mempolicy->nodes;
7858d7bc
FT
2068 break;
2069
2070 case MPOL_LOCAL:
269fbe72 2071 init_nodemask_of_node(mask, numa_node_id());
06808b08
LS
2072 break;
2073
2074 default:
2075 BUG();
2076 }
c0ff7453 2077 task_unlock(current);
06808b08
LS
2078
2079 return true;
2080}
00ac59ad 2081#endif
5da7ca86 2082
6f48d0eb 2083/*
b26e517a 2084 * mempolicy_in_oom_domain
6f48d0eb 2085 *
b26e517a
FT
2086 * If tsk's mempolicy is "bind", check for intersection between mask and
2087 * the policy nodemask. Otherwise, return true for all other policies
2088 * including "interleave", as a tsk with "interleave" policy may have
 2089 * memory allocated from all nodes in the system.
6f48d0eb
DR
2090 *
2091 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2092 */
b26e517a 2093bool mempolicy_in_oom_domain(struct task_struct *tsk,
6f48d0eb
DR
2094 const nodemask_t *mask)
2095{
2096 struct mempolicy *mempolicy;
2097 bool ret = true;
2098
2099 if (!mask)
2100 return ret;
b26e517a 2101
6f48d0eb
DR
2102 task_lock(tsk);
2103 mempolicy = tsk->mempolicy;
b26e517a 2104 if (mempolicy && mempolicy->mode == MPOL_BIND)
269fbe72 2105 ret = nodes_intersects(mempolicy->nodes, *mask);
6f48d0eb 2106 task_unlock(tsk);
b26e517a 2107
6f48d0eb
DR
2108 return ret;
2109}
2110
1da177e4
LT
2111/* Allocate a page in interleaved policy.
2112 Own path because it needs to do special accounting. */
662f3a0b
AK
2113static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2114 unsigned nid)
1da177e4 2115{
1da177e4
LT
2116 struct page *page;
2117
84172f4b 2118 page = __alloc_pages(gfp, order, nid, NULL);
4518085e
KW
2119 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2120 if (!static_branch_likely(&vm_numa_stat_key))
2121 return page;
de55c8b2
AR
2122 if (page && page_to_nid(page) == nid) {
2123 preempt_disable();
f19298b9 2124 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
de55c8b2
AR
2125 preempt_enable();
2126 }
1da177e4
LT
2127 return page;
2128}
2129
4c54d949
FT
2130static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2131 int nid, struct mempolicy *pol)
2132{
2133 struct page *page;
2134 gfp_t preferred_gfp;
2135
2136 /*
2137 * This is a two pass approach. The first pass will only try the
2138 * preferred nodes but skip the direct reclaim and allow the
2139 * allocation to fail, while the second pass will try all the
 2140	 * nodes in the system.
2141 */
2142 preferred_gfp = gfp | __GFP_NOWARN;
2143 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2144 page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
2145 if (!page)
c0455116 2146 page = __alloc_pages(gfp, order, nid, NULL);
4c54d949
FT
2147
2148 return page;
2149}
2150
1da177e4 2151/**
adf88aa8 2152 * vma_alloc_folio - Allocate a folio for a VMA.
eb350739 2153 * @gfp: GFP flags.
adf88aa8 2154 * @order: Order of the folio.
eb350739
MWO
2155 * @vma: Pointer to VMA or NULL if not available.
2156 * @addr: Virtual address of the allocation. Must be inside @vma.
eb350739 2157 * @hugepage: For hugepages try only the preferred node if possible.
1da177e4 2158 *
adf88aa8 2159 * Allocate a folio for a specific address in @vma, using the appropriate
eb350739
MWO
2160 * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
2161 * of the mm_struct of the VMA to prevent it from going away. Should be
adf88aa8 2162 * used for all allocations for folios that will be mapped into user space.
1da177e4 2163 *
adf88aa8 2164 * Return: The folio on success or NULL if allocation fails.
1da177e4 2165 */
adf88aa8 2166struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
be1a13eb 2167 unsigned long addr, bool hugepage)
1da177e4 2168{
cc9a6c87 2169 struct mempolicy *pol;
be1a13eb 2170 int node = numa_node_id();
adf88aa8 2171 struct folio *folio;
04ec6264 2172 int preferred_nid;
be97a41b 2173 nodemask_t *nmask;
cc9a6c87 2174
dd6eecb9 2175 pol = get_vma_policy(vma, addr);
1da177e4 2176
0867a57c 2177 if (pol->mode == MPOL_INTERLEAVE) {
adf88aa8 2178 struct page *page;
0867a57c
VB
2179 unsigned nid;
2180
2181 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2182 mpol_cond_put(pol);
adf88aa8 2183 gfp |= __GFP_COMP;
0867a57c 2184 page = alloc_page_interleave(gfp, order, nid);
adf88aa8
MWO
2185 if (page && order > 1)
2186 prep_transhuge_page(page);
2187 folio = (struct folio *)page;
0867a57c 2188 goto out;
19deb769
DR
2189 }
2190
4c54d949 2191 if (pol->mode == MPOL_PREFERRED_MANY) {
adf88aa8
MWO
2192 struct page *page;
2193
c0455116 2194 node = policy_node(gfp, pol, node);
adf88aa8 2195 gfp |= __GFP_COMP;
4c54d949
FT
2196 page = alloc_pages_preferred_many(gfp, order, node, pol);
2197 mpol_cond_put(pol);
adf88aa8
MWO
2198 if (page && order > 1)
2199 prep_transhuge_page(page);
2200 folio = (struct folio *)page;
4c54d949
FT
2201 goto out;
2202 }
2203
19deb769
DR
2204 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2205 int hpage_node = node;
2206
2207 /*
2208 * For hugepage allocation and non-interleave policy which
2209 * allows the current node (or other explicitly preferred
 2210		 * node), we only try to allocate from the current/preferred
2211 * node and don't fall back to other nodes, as the cost of
2212 * remote accesses would likely offset THP benefits.
2213 *
b27abacc 2214 * If the policy is interleave or does not allow the current
19deb769
DR
2215 * node in its nodemask, we allocate the standard way.
2216 */
7858d7bc 2217 if (pol->mode == MPOL_PREFERRED)
269fbe72 2218 hpage_node = first_node(pol->nodes);
19deb769
DR
2219
2220 nmask = policy_nodemask(gfp, pol);
2221 if (!nmask || node_isset(hpage_node, *nmask)) {
2222 mpol_cond_put(pol);
cc638f32
VB
2223 /*
2224 * First, try to allocate THP only on local node, but
2225 * don't reclaim unnecessarily, just compact.
2226 */
adf88aa8
MWO
2227 folio = __folio_alloc_node(gfp | __GFP_THISNODE |
2228 __GFP_NORETRY, order, hpage_node);
76e654cc
DR
2229
2230 /*
 2231			 * If hugepage allocations are configured to always use
 2232			 * synchronous compaction or the vma has been madvised
2233 * to prefer hugepage backing, retry allowing remote
cc638f32 2234 * memory with both reclaim and compact as well.
76e654cc 2235 */
adf88aa8
MWO
2236 if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
2237 folio = __folio_alloc(gfp, order, hpage_node,
2238 nmask);
76e654cc 2239
19deb769
DR
2240 goto out;
2241 }
356ff8a9
DR
2242 }
2243
be97a41b 2244 nmask = policy_nodemask(gfp, pol);
04ec6264 2245 preferred_nid = policy_node(gfp, pol, node);
adf88aa8 2246 folio = __folio_alloc(gfp, order, preferred_nid, nmask);
d51e9894 2247 mpol_cond_put(pol);
be97a41b 2248out:
f584b680
MWO
2249 return folio;
2250}
adf88aa8 2251EXPORT_SYMBOL(vma_alloc_folio);
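/*
 * Hedged caller sketch (assumed context: a fault handler that holds the
 * mmap_lock of @vma's mm):
 *
 *	struct folio *folio;
 *
 *	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
 *	if (!folio)
 *		return VM_FAULT_OOM;
 *	// ... map folio_page(folio, 0) at the faulting address ...
 */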
f584b680 2252
1da177e4 2253/**
6421ec76
MWO
2254 * alloc_pages - Allocate pages.
2255 * @gfp: GFP flags.
2256 * @order: Power of two of number of pages to allocate.
1da177e4 2257 *
6421ec76
MWO
2258 * Allocate 1 << @order contiguous pages. The physical address of the
2259 * first page is naturally aligned (eg an order-3 allocation will be aligned
2260 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2261 * process is honoured when in process context.
1da177e4 2262 *
6421ec76
MWO
2263 * Context: Can be called from any context, providing the appropriate GFP
2264 * flags are used.
2265 * Return: The page on success or NULL if allocation fails.
1da177e4 2266 */
d7f946d0 2267struct page *alloc_pages(gfp_t gfp, unsigned order)
1da177e4 2268{
8d90274b 2269 struct mempolicy *pol = &default_policy;
c0ff7453 2270 struct page *page;
1da177e4 2271
8d90274b
ON
2272 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2273 pol = get_task_policy(current);
52cd3b07
LS
2274
2275 /*
2276 * No reference counting needed for current->mempolicy
2277 * nor system default_policy
2278 */
45c4745a 2279 if (pol->mode == MPOL_INTERLEAVE)
c0ff7453 2280 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
4c54d949
FT
2281 else if (pol->mode == MPOL_PREFERRED_MANY)
2282 page = alloc_pages_preferred_many(gfp, order,
c0455116 2283 policy_node(gfp, pol, numa_node_id()), pol);
c0ff7453 2284 else
84172f4b 2285 page = __alloc_pages(gfp, order,
04ec6264 2286 policy_node(gfp, pol, numa_node_id()),
5c4b4be3 2287 policy_nodemask(gfp, pol));
cc9a6c87 2288
c0ff7453 2289 return page;
1da177e4 2290}
d7f946d0 2291EXPORT_SYMBOL(alloc_pages);
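/*
 * Hedged usage sketch (illustrative only): an order-2 allocation (four
 * contiguous pages) that honours the calling process' mempolicy.
 *
 *	struct page *pages = alloc_pages(GFP_KERNEL, 2);
 *
 *	if (!pages)
 *		return -ENOMEM;
 *	// ... use page_address(pages) ...
 *	__free_pages(pages, 2);
 */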
1da177e4 2292
cc09cb13
MWO
2293struct folio *folio_alloc(gfp_t gfp, unsigned order)
2294{
2295 struct page *page = alloc_pages(gfp | __GFP_COMP, order);
2296
2297 if (page && order > 1)
2298 prep_transhuge_page(page);
2299 return (struct folio *)page;
2300}
2301EXPORT_SYMBOL(folio_alloc);
2302
c00b6b96
CW
2303static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2304 struct mempolicy *pol, unsigned long nr_pages,
2305 struct page **page_array)
2306{
2307 int nodes;
2308 unsigned long nr_pages_per_node;
2309 int delta;
2310 int i;
2311 unsigned long nr_allocated;
2312 unsigned long total_allocated = 0;
2313
2314 nodes = nodes_weight(pol->nodes);
2315 nr_pages_per_node = nr_pages / nodes;
2316 delta = nr_pages - nodes * nr_pages_per_node;
2317
2318 for (i = 0; i < nodes; i++) {
2319 if (delta) {
2320 nr_allocated = __alloc_pages_bulk(gfp,
2321 interleave_nodes(pol), NULL,
2322 nr_pages_per_node + 1, NULL,
2323 page_array);
2324 delta--;
2325 } else {
2326 nr_allocated = __alloc_pages_bulk(gfp,
2327 interleave_nodes(pol), NULL,
2328 nr_pages_per_node, NULL, page_array);
2329 }
2330
2331 page_array += nr_allocated;
2332 total_allocated += nr_allocated;
2333 }
2334
2335 return total_allocated;
2336}
2337
2338static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2339 struct mempolicy *pol, unsigned long nr_pages,
2340 struct page **page_array)
2341{
2342 gfp_t preferred_gfp;
2343 unsigned long nr_allocated = 0;
2344
2345 preferred_gfp = gfp | __GFP_NOWARN;
2346 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2347
2348 nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2349 nr_pages, NULL, page_array);
2350
2351 if (nr_allocated < nr_pages)
2352 nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2353 nr_pages - nr_allocated, NULL,
2354 page_array + nr_allocated);
2355 return nr_allocated;
2356}
2357
 2358/* Bulk page allocation and mempolicy need to be considered at the
 2359 * same time in some situations, such as vmalloc.
 2360 *
 2361 * It can accelerate memory allocation, especially for interleaved
 2362 * allocations.
 2363 */
2364unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2365 unsigned long nr_pages, struct page **page_array)
2366{
2367 struct mempolicy *pol = &default_policy;
2368
2369 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2370 pol = get_task_policy(current);
2371
2372 if (pol->mode == MPOL_INTERLEAVE)
2373 return alloc_pages_bulk_array_interleave(gfp, pol,
2374 nr_pages, page_array);
2375
2376 if (pol->mode == MPOL_PREFERRED_MANY)
2377 return alloc_pages_bulk_array_preferred_many(gfp,
2378 numa_node_id(), pol, nr_pages, page_array);
2379
2380 return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
2381 policy_nodemask(gfp, pol), nr_pages, NULL,
2382 page_array);
2383}
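/*
 * Hedged usage sketch (modelled on the vmalloc caller; variable names are
 * assumptions): @pages must have room for @nr_pages entries and any
 * shortfall is the caller's problem.
 *
 *	unsigned long got;
 *
 *	got = alloc_pages_bulk_array_mempolicy(GFP_KERNEL, nr_pages, pages);
 *	if (got < nr_pages)
 *		// allocate the remaining pages one at a time
 */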
2384
ef0855d3
ON
2385int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2386{
2387 struct mempolicy *pol = mpol_dup(vma_policy(src));
2388
2389 if (IS_ERR(pol))
2390 return PTR_ERR(pol);
2391 dst->vm_policy = pol;
2392 return 0;
2393}
2394
4225399a 2395/*
846a16bf 2396 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
4225399a
PJ
 2397 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2398 * with the mems_allowed returned by cpuset_mems_allowed(). This
2399 * keeps mempolicies cpuset relative after its cpuset moves. See
2400 * further kernel/cpuset.c update_nodemask().
708c1bbc
MX
2401 *
 2402 * current's mempolicy may be rebound by another task (the task that changes
 2403 * cpuset's mems), so we needn't do rebind work for the current task.
4225399a 2404 */
4225399a 2405
846a16bf
LS
2406/* Slow path of a mempolicy duplicate */
2407struct mempolicy *__mpol_dup(struct mempolicy *old)
1da177e4
LT
2408{
2409 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2410
2411 if (!new)
2412 return ERR_PTR(-ENOMEM);
708c1bbc
MX
2413
2414 /* task's mempolicy is protected by alloc_lock */
2415 if (old == current->mempolicy) {
2416 task_lock(current);
2417 *new = *old;
2418 task_unlock(current);
2419 } else
2420 *new = *old;
2421
4225399a
PJ
2422 if (current_cpuset_is_being_rebound()) {
2423 nodemask_t mems = cpuset_mems_allowed(current);
213980c0 2424 mpol_rebind_policy(new, &mems);
4225399a 2425 }
1da177e4 2426 atomic_set(&new->refcnt, 1);
1da177e4
LT
2427 return new;
2428}
2429
2430/* Slow path of a mempolicy comparison */
fcfb4dcc 2431bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1da177e4
LT
2432{
2433 if (!a || !b)
fcfb4dcc 2434 return false;
45c4745a 2435 if (a->mode != b->mode)
fcfb4dcc 2436 return false;
19800502 2437 if (a->flags != b->flags)
fcfb4dcc 2438 return false;
c6018b4b
AK
2439 if (a->home_node != b->home_node)
2440 return false;
19800502
BL
2441 if (mpol_store_user_nodemask(a))
2442 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
fcfb4dcc 2443 return false;
19800502 2444
45c4745a 2445 switch (a->mode) {
19770b32 2446 case MPOL_BIND:
1da177e4 2447 case MPOL_INTERLEAVE:
1da177e4 2448 case MPOL_PREFERRED:
b27abacc 2449 case MPOL_PREFERRED_MANY:
269fbe72 2450 return !!nodes_equal(a->nodes, b->nodes);
7858d7bc
FT
2451 case MPOL_LOCAL:
2452 return true;
1da177e4
LT
2453 default:
2454 BUG();
fcfb4dcc 2455 return false;
1da177e4
LT
2456 }
2457}
2458
1da177e4
LT
2459/*
2460 * Shared memory backing store policy support.
2461 *
2462 * Remember policies even when nobody has shared memory mapped.
 2463 * The policies are kept in a red-black tree linked from the inode.
4a8c7bb5 2464 * They are protected by the sp->lock rwlock, which should be held
1da177e4
LT
2465 * for any accesses to the tree.
2466 */
2467
4a8c7bb5
NZ
2468/*
2469 * lookup first element intersecting start-end. Caller holds sp->lock for
2470 * reading or for writing
2471 */
1da177e4
LT
2472static struct sp_node *
2473sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2474{
2475 struct rb_node *n = sp->root.rb_node;
2476
2477 while (n) {
2478 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2479
2480 if (start >= p->end)
2481 n = n->rb_right;
2482 else if (end <= p->start)
2483 n = n->rb_left;
2484 else
2485 break;
2486 }
2487 if (!n)
2488 return NULL;
2489 for (;;) {
2490 struct sp_node *w = NULL;
2491 struct rb_node *prev = rb_prev(n);
2492 if (!prev)
2493 break;
2494 w = rb_entry(prev, struct sp_node, nd);
2495 if (w->end <= start)
2496 break;
2497 n = prev;
2498 }
2499 return rb_entry(n, struct sp_node, nd);
2500}
2501
4a8c7bb5
NZ
2502/*
 2503 * Insert a new shared policy into the tree. Caller holds sp->lock for
2504 * writing.
2505 */
1da177e4
LT
2506static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2507{
2508 struct rb_node **p = &sp->root.rb_node;
2509 struct rb_node *parent = NULL;
2510 struct sp_node *nd;
2511
2512 while (*p) {
2513 parent = *p;
2514 nd = rb_entry(parent, struct sp_node, nd);
2515 if (new->start < nd->start)
2516 p = &(*p)->rb_left;
2517 else if (new->end > nd->end)
2518 p = &(*p)->rb_right;
2519 else
2520 BUG();
2521 }
2522 rb_link_node(&new->nd, parent, p);
2523 rb_insert_color(&new->nd, &sp->root);
140d5a49 2524 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
45c4745a 2525 new->policy ? new->policy->mode : 0);
1da177e4
LT
2526}
2527
2528/* Find shared policy intersecting idx */
2529struct mempolicy *
2530mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2531{
2532 struct mempolicy *pol = NULL;
2533 struct sp_node *sn;
2534
2535 if (!sp->root.rb_node)
2536 return NULL;
4a8c7bb5 2537 read_lock(&sp->lock);
1da177e4
LT
2538 sn = sp_lookup(sp, idx, idx+1);
2539 if (sn) {
2540 mpol_get(sn->policy);
2541 pol = sn->policy;
2542 }
4a8c7bb5 2543 read_unlock(&sp->lock);
1da177e4
LT
2544 return pol;
2545}
2546
63f74ca2
KM
2547static void sp_free(struct sp_node *n)
2548{
2549 mpol_put(n->policy);
2550 kmem_cache_free(sn_cache, n);
2551}
2552
771fb4d8
LS
2553/**
2554 * mpol_misplaced - check whether current page node is valid in policy
2555 *
b46e14ac
FF
2556 * @page: page to be checked
2557 * @vma: vm area where page mapped
2558 * @addr: virtual address where page mapped
771fb4d8
LS
2559 *
2560 * Lookup current policy node id for vma,addr and "compare to" page's
5f076944 2561 * node id. Policy determination "mimics" alloc_page_vma().
771fb4d8 2562 * Called from fault path where we know the vma and faulting address.
5f076944 2563 *
062db293
BW
2564 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2565 * policy, or a suitable node ID to allocate a replacement page from.
771fb4d8
LS
2566 */
2567int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2568{
2569 struct mempolicy *pol;
c33d6c06 2570 struct zoneref *z;
771fb4d8
LS
2571 int curnid = page_to_nid(page);
2572 unsigned long pgoff;
90572890
PZ
2573 int thiscpu = raw_smp_processor_id();
2574 int thisnid = cpu_to_node(thiscpu);
98fa15f3 2575 int polnid = NUMA_NO_NODE;
062db293 2576 int ret = NUMA_NO_NODE;
771fb4d8 2577
dd6eecb9 2578 pol = get_vma_policy(vma, addr);
771fb4d8
LS
2579 if (!(pol->flags & MPOL_F_MOF))
2580 goto out;
2581
2582 switch (pol->mode) {
2583 case MPOL_INTERLEAVE:
771fb4d8
LS
2584 pgoff = vma->vm_pgoff;
2585 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
98c70baa 2586 polnid = offset_il_node(pol, pgoff);
771fb4d8
LS
2587 break;
2588
2589 case MPOL_PREFERRED:
b27abacc
DH
2590 if (node_isset(curnid, pol->nodes))
2591 goto out;
269fbe72 2592 polnid = first_node(pol->nodes);
7858d7bc
FT
2593 break;
2594
2595 case MPOL_LOCAL:
2596 polnid = numa_node_id();
771fb4d8
LS
2597 break;
2598
2599 case MPOL_BIND:
bda420b9
HY
2600 /* Optimize placement among multiple nodes via NUMA balancing */
2601 if (pol->flags & MPOL_F_MORON) {
269fbe72 2602 if (node_isset(thisnid, pol->nodes))
bda420b9
HY
2603 break;
2604 goto out;
2605 }
b27abacc 2606 fallthrough;
c33d6c06 2607
b27abacc 2608 case MPOL_PREFERRED_MANY:
771fb4d8 2609 /*
771fb4d8
LS
2610 * use current page if in policy nodemask,
2611 * else select nearest allowed node, if any.
2612 * If no allowed nodes, use current [!misplaced].
2613 */
269fbe72 2614 if (node_isset(curnid, pol->nodes))
771fb4d8 2615 goto out;
c33d6c06 2616 z = first_zones_zonelist(
771fb4d8
LS
2617 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2618 gfp_zone(GFP_HIGHUSER),
269fbe72 2619 &pol->nodes);
c1093b74 2620 polnid = zone_to_nid(z->zone);
771fb4d8
LS
2621 break;
2622
2623 default:
2624 BUG();
2625 }
5606e387
MG
2626
2627 /* Migrate the page towards the node whose CPU is referencing it */
e42c8ff2 2628 if (pol->flags & MPOL_F_MORON) {
90572890 2629 polnid = thisnid;
5606e387 2630
10f39042 2631 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
de1c9ce6 2632 goto out;
e42c8ff2
MG
2633 }
2634
771fb4d8
LS
2635 if (curnid != polnid)
2636 ret = polnid;
2637out:
2638 mpol_cond_put(pol);
2639
2640 return ret;
2641}
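/*
 * Hedged caller sketch (mirrors the NUMA hinting fault path; details
 * assumed): any return other than NUMA_NO_NODE means the page should be
 * migrated to that node.
 *
 *	int target = mpol_misplaced(page, vma, addr);
 *
 *	if (target != NUMA_NO_NODE)
 *		// queue @page for migration to @target,
 *		// e.g. via migrate_misplaced_page()
 */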
2642
c11600e4
DR
2643/*
2644 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2645 * dropped after task->mempolicy is set to NULL so that any allocation done as
2646 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2647 * policy.
2648 */
2649void mpol_put_task_policy(struct task_struct *task)
2650{
2651 struct mempolicy *pol;
2652
2653 task_lock(task);
2654 pol = task->mempolicy;
2655 task->mempolicy = NULL;
2656 task_unlock(task);
2657 mpol_put(pol);
2658}
2659
1da177e4
LT
2660static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2661{
140d5a49 2662	pr_debug("deleting %lx-%lx\n", n->start, n->end);
1da177e4 2663 rb_erase(&n->nd, &sp->root);
63f74ca2 2664 sp_free(n);
1da177e4
LT
2665}
2666
42288fe3
MG
2667static void sp_node_init(struct sp_node *node, unsigned long start,
2668 unsigned long end, struct mempolicy *pol)
2669{
2670 node->start = start;
2671 node->end = end;
2672 node->policy = pol;
2673}
2674
dbcb0f19
AB
2675static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2676 struct mempolicy *pol)
1da177e4 2677{
869833f2
KM
2678 struct sp_node *n;
2679 struct mempolicy *newpol;
1da177e4 2680
869833f2 2681 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1da177e4
LT
2682 if (!n)
2683 return NULL;
869833f2
KM
2684
2685 newpol = mpol_dup(pol);
2686 if (IS_ERR(newpol)) {
2687 kmem_cache_free(sn_cache, n);
2688 return NULL;
2689 }
2690 newpol->flags |= MPOL_F_SHARED;
42288fe3 2691 sp_node_init(n, start, end, newpol);
869833f2 2692
1da177e4
LT
2693 return n;
2694}
2695
2696/* Replace a policy range. */
2697static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2698 unsigned long end, struct sp_node *new)
2699{
b22d127a 2700 struct sp_node *n;
42288fe3
MG
2701 struct sp_node *n_new = NULL;
2702 struct mempolicy *mpol_new = NULL;
b22d127a 2703 int ret = 0;
1da177e4 2704
42288fe3 2705restart:
4a8c7bb5 2706 write_lock(&sp->lock);
1da177e4
LT
2707 n = sp_lookup(sp, start, end);
2708 /* Take care of old policies in the same range. */
2709 while (n && n->start < end) {
2710 struct rb_node *next = rb_next(&n->nd);
2711 if (n->start >= start) {
2712 if (n->end <= end)
2713 sp_delete(sp, n);
2714 else
2715 n->start = end;
2716 } else {
2717 /* Old policy spanning whole new range. */
2718 if (n->end > end) {
42288fe3
MG
2719 if (!n_new)
2720 goto alloc_new;
2721
2722 *mpol_new = *n->policy;
2723 atomic_set(&mpol_new->refcnt, 1);
7880639c 2724 sp_node_init(n_new, end, n->end, mpol_new);
1da177e4 2725 n->end = start;
5ca39575 2726 sp_insert(sp, n_new);
42288fe3
MG
2727 n_new = NULL;
2728 mpol_new = NULL;
1da177e4
LT
2729 break;
2730 } else
2731 n->end = start;
2732 }
2733 if (!next)
2734 break;
2735 n = rb_entry(next, struct sp_node, nd);
2736 }
2737 if (new)
2738 sp_insert(sp, new);
4a8c7bb5 2739 write_unlock(&sp->lock);
42288fe3
MG
2740 ret = 0;
2741
2742err_out:
2743 if (mpol_new)
2744 mpol_put(mpol_new);
2745 if (n_new)
2746 kmem_cache_free(sn_cache, n_new);
2747
b22d127a 2748 return ret;
42288fe3
MG
2749
2750alloc_new:
4a8c7bb5 2751 write_unlock(&sp->lock);
42288fe3
MG
2752 ret = -ENOMEM;
2753 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2754 if (!n_new)
2755 goto err_out;
2756 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2757 if (!mpol_new)
2758 goto err_out;
4ad09955 2759 atomic_set(&mpol_new->refcnt, 1);
42288fe3 2760 goto restart;
1da177e4
LT
2761}
2762
71fe804b
LS
2763/**
2764 * mpol_shared_policy_init - initialize shared policy for inode
2765 * @sp: pointer to inode shared policy
2766 * @mpol: struct mempolicy to install
2767 *
2768 * Install non-NULL @mpol in inode's shared policy rb-tree.
2769 * On entry, the current task has a reference on a non-NULL @mpol.
2770 * This must be released on exit.
4bfc4495 2771 * This is called during get_inode() calls, so we can use GFP_KERNEL.
71fe804b
LS
2772 */
2773void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2774{
58568d2a
MX
2775 int ret;
2776
71fe804b 2777 sp->root = RB_ROOT; /* empty tree == default mempolicy */
4a8c7bb5 2778 rwlock_init(&sp->lock);
71fe804b
LS
2779
2780 if (mpol) {
2781 struct vm_area_struct pvma;
2782 struct mempolicy *new;
4bfc4495 2783 NODEMASK_SCRATCH(scratch);
71fe804b 2784
4bfc4495 2785 if (!scratch)
5c0c1654 2786 goto put_mpol;
71fe804b
LS
2787 /* contextualize the tmpfs mount point mempolicy */
2788 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
15d77835 2789 if (IS_ERR(new))
0cae3457 2790 goto free_scratch; /* no valid nodemask intersection */
58568d2a
MX
2791
2792 task_lock(current);
4bfc4495 2793 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
58568d2a 2794 task_unlock(current);
15d77835 2795 if (ret)
5c0c1654 2796 goto put_new;
71fe804b
LS
2797
2798 /* Create pseudo-vma that contains just the policy */
2c4541e2 2799 vma_init(&pvma, NULL);
71fe804b
LS
2800 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2801 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
15d77835 2802
5c0c1654 2803put_new:
71fe804b 2804 mpol_put(new); /* drop initial ref */
0cae3457 2805free_scratch:
4bfc4495 2806 NODEMASK_SCRATCH_FREE(scratch);
5c0c1654
LS
2807put_mpol:
2808 mpol_put(mpol); /* drop our incoming ref on sb mpol */
7339ff83
RH
2809 }
2810}
2811
1da177e4
LT
2812int mpol_set_shared_policy(struct shared_policy *info,
2813 struct vm_area_struct *vma, struct mempolicy *npol)
2814{
2815 int err;
2816 struct sp_node *new = NULL;
2817 unsigned long sz = vma_pages(vma);
2818
028fec41 2819 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1da177e4 2820 vma->vm_pgoff,
45c4745a 2821 sz, npol ? npol->mode : -1,
028fec41 2822 npol ? npol->flags : -1,
269fbe72 2823 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
1da177e4
LT
2824
2825 if (npol) {
2826 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2827 if (!new)
2828 return -ENOMEM;
2829 }
2830 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2831 if (err && new)
63f74ca2 2832 sp_free(new);
1da177e4
LT
2833 return err;
2834}
2835
2836/* Free a backing policy store on inode delete. */
2837void mpol_free_shared_policy(struct shared_policy *p)
2838{
2839 struct sp_node *n;
2840 struct rb_node *next;
2841
2842 if (!p->root.rb_node)
2843 return;
4a8c7bb5 2844 write_lock(&p->lock);
1da177e4
LT
2845 next = rb_first(&p->root);
2846 while (next) {
2847 n = rb_entry(next, struct sp_node, nd);
2848 next = rb_next(&n->nd);
63f74ca2 2849 sp_delete(p, n);
1da177e4 2850 }
4a8c7bb5 2851 write_unlock(&p->lock);
1da177e4
LT
2852}
2853
1a687c2e 2854#ifdef CONFIG_NUMA_BALANCING
c297663c 2855static int __initdata numabalancing_override;
1a687c2e
MG
2856
2857static void __init check_numabalancing_enable(void)
2858{
2859 bool numabalancing_default = false;
2860
2861 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2862 numabalancing_default = true;
2863
c297663c
MG
2864 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2865 if (numabalancing_override)
2866 set_numabalancing_state(numabalancing_override == 1);
2867
b0dc2b9b 2868 if (num_online_nodes() > 1 && !numabalancing_override) {
756a025f 2869 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
c297663c 2870 numabalancing_default ? "Enabling" : "Disabling");
1a687c2e
MG
2871 set_numabalancing_state(numabalancing_default);
2872 }
2873}
2874
2875static int __init setup_numabalancing(char *str)
2876{
2877 int ret = 0;
2878 if (!str)
2879 goto out;
1a687c2e
MG
2880
2881 if (!strcmp(str, "enable")) {
c297663c 2882 numabalancing_override = 1;
1a687c2e
MG
2883 ret = 1;
2884 } else if (!strcmp(str, "disable")) {
c297663c 2885 numabalancing_override = -1;
1a687c2e
MG
2886 ret = 1;
2887 }
2888out:
2889 if (!ret)
4a404bea 2890 pr_warn("Unable to parse numa_balancing=\n");
1a687c2e
MG
2891
2892 return ret;
2893}
2894__setup("numa_balancing=", setup_numabalancing);
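/*
 * Illustrative boot-time usage (command line values assumed): booting with
 * "numa_balancing=disable" forces balancing off even if
 * CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is set, while
 * "numa_balancing=enable" forces it on.  Without the parameter,
 * check_numabalancing_enable() applies the Kconfig default on systems
 * with more than one online node.
 */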
2895#else
2896static inline void __init check_numabalancing_enable(void)
2897{
2898}
2899#endif /* CONFIG_NUMA_BALANCING */
2900
1da177e4
LT
2901/* assumes fs == KERNEL_DS */
2902void __init numa_policy_init(void)
2903{
b71636e2
PM
2904 nodemask_t interleave_nodes;
2905 unsigned long largest = 0;
2906 int nid, prefer = 0;
2907
1da177e4
LT
2908 policy_cache = kmem_cache_create("numa_policy",
2909 sizeof(struct mempolicy),
20c2df83 2910 0, SLAB_PANIC, NULL);
1da177e4
LT
2911
2912 sn_cache = kmem_cache_create("shared_policy_node",
2913 sizeof(struct sp_node),
20c2df83 2914 0, SLAB_PANIC, NULL);
1da177e4 2915
5606e387
MG
2916 for_each_node(nid) {
2917 preferred_node_policy[nid] = (struct mempolicy) {
2918 .refcnt = ATOMIC_INIT(1),
2919 .mode = MPOL_PREFERRED,
2920 .flags = MPOL_F_MOF | MPOL_F_MORON,
269fbe72 2921 .nodes = nodemask_of_node(nid),
5606e387
MG
2922 };
2923 }
2924
b71636e2
PM
2925 /*
2926 * Set interleaving policy for system init. Interleaving is only
 2927	 * enabled across suitably sized nodes (default is >= 16MB), falling
 2928	 * back to the largest node if they're all smaller.
2929 */
2930 nodes_clear(interleave_nodes);
01f13bd6 2931 for_each_node_state(nid, N_MEMORY) {
b71636e2
PM
2932 unsigned long total_pages = node_present_pages(nid);
2933
2934 /* Preserve the largest node */
2935 if (largest < total_pages) {
2936 largest = total_pages;
2937 prefer = nid;
2938 }
2939
2940 /* Interleave this node? */
2941 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2942 node_set(nid, interleave_nodes);
2943 }
2944
2945 /* All too small, use the largest */
2946 if (unlikely(nodes_empty(interleave_nodes)))
2947 node_set(prefer, interleave_nodes);
1da177e4 2948
028fec41 2949 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
b1de0d13 2950 pr_err("%s: interleaving failed\n", __func__);
1a687c2e
MG
2951
2952 check_numabalancing_enable();
1da177e4
LT
2953}
2954
8bccd85f 2955/* Reset policy of current process to default */
1da177e4
LT
2956void numa_default_policy(void)
2957{
028fec41 2958 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1da177e4 2959}
68860ec1 2960
095f1fc4
LS
2961/*
2962 * Parse and format mempolicy from/to strings
2963 */
2964
345ace9c
LS
2965static const char * const policy_modes[] =
2966{
2967 [MPOL_DEFAULT] = "default",
2968 [MPOL_PREFERRED] = "prefer",
2969 [MPOL_BIND] = "bind",
2970 [MPOL_INTERLEAVE] = "interleave",
d3a71033 2971 [MPOL_LOCAL] = "local",
b27abacc 2972 [MPOL_PREFERRED_MANY] = "prefer (many)",
345ace9c 2973};
1a75a6c8 2974
095f1fc4
LS
2975
2976#ifdef CONFIG_TMPFS
2977/**
f2a07f40 2978 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
095f1fc4 2979 * @str: string containing mempolicy to parse
71fe804b 2980 * @mpol: pointer to struct mempolicy pointer, returned on success.
095f1fc4
LS
2981 *
2982 * Format of input:
2983 * <mode>[=<flags>][:<nodelist>]
2984 *
dad5b023 2985 * Return: %0 on success, else %1
095f1fc4 2986 */
a7a88b23 2987int mpol_parse_str(char *str, struct mempolicy **mpol)
095f1fc4 2988{
71fe804b 2989 struct mempolicy *new = NULL;
f2a07f40 2990 unsigned short mode_flags;
71fe804b 2991 nodemask_t nodes;
095f1fc4
LS
2992 char *nodelist = strchr(str, ':');
2993 char *flags = strchr(str, '=');
dedf2c73 2994 int err = 1, mode;
095f1fc4 2995
c7a91bc7
DC
2996 if (flags)
2997 *flags++ = '\0'; /* terminate mode string */
2998
095f1fc4
LS
2999 if (nodelist) {
3000 /* NUL-terminate mode or flags string */
3001 *nodelist++ = '\0';
71fe804b 3002 if (nodelist_parse(nodelist, nodes))
095f1fc4 3003 goto out;
01f13bd6 3004 if (!nodes_subset(nodes, node_states[N_MEMORY]))
095f1fc4 3005 goto out;
71fe804b
LS
3006 } else
3007 nodes_clear(nodes);
3008
dedf2c73 3009 mode = match_string(policy_modes, MPOL_MAX, str);
3010 if (mode < 0)
095f1fc4
LS
3011 goto out;
3012
71fe804b 3013 switch (mode) {
095f1fc4 3014 case MPOL_PREFERRED:
71fe804b 3015 /*
aa9f7d51
RD
3016 * Insist on a nodelist of one node only, although later
3017 * we use first_node(nodes) to grab a single node, so here
3018 * nodelist (or nodes) cannot be empty.
71fe804b 3019 */
095f1fc4
LS
3020 if (nodelist) {
3021 char *rest = nodelist;
3022 while (isdigit(*rest))
3023 rest++;
926f2ae0
KM
3024 if (*rest)
3025 goto out;
aa9f7d51
RD
3026 if (nodes_empty(nodes))
3027 goto out;
095f1fc4
LS
3028 }
3029 break;
095f1fc4
LS
3030 case MPOL_INTERLEAVE:
3031 /*
3032 * Default to online nodes with memory if no nodelist
3033 */
3034 if (!nodelist)
01f13bd6 3035 nodes = node_states[N_MEMORY];
3f226aa1 3036 break;
71fe804b 3037 case MPOL_LOCAL:
3f226aa1 3038 /*
71fe804b 3039 * Don't allow a nodelist; mpol_new() checks flags
3f226aa1 3040 */
71fe804b 3041 if (nodelist)
3f226aa1 3042 goto out;
3f226aa1 3043 break;
413b43de
RT
3044 case MPOL_DEFAULT:
3045 /*
 3046		 * Insist on an empty nodelist
3047 */
3048 if (!nodelist)
3049 err = 0;
3050 goto out;
b27abacc 3051 case MPOL_PREFERRED_MANY:
d69b2e63
KM
3052 case MPOL_BIND:
3053 /*
3054 * Insist on a nodelist
3055 */
3056 if (!nodelist)
3057 goto out;
095f1fc4
LS
3058 }
3059
71fe804b 3060 mode_flags = 0;
095f1fc4
LS
3061 if (flags) {
3062 /*
3063 * Currently, we only support two mutually exclusive
3064 * mode flags.
3065 */
3066 if (!strcmp(flags, "static"))
71fe804b 3067 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4 3068 else if (!strcmp(flags, "relative"))
71fe804b 3069 mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4 3070 else
926f2ae0 3071 goto out;
095f1fc4 3072 }
71fe804b
LS
3073
3074 new = mpol_new(mode, mode_flags, &nodes);
3075 if (IS_ERR(new))
926f2ae0
KM
3076 goto out;
3077
f2a07f40
HD
3078 /*
3079 * Save nodes for mpol_to_str() to show the tmpfs mount options
3080 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3081 */
269fbe72
BW
3082 if (mode != MPOL_PREFERRED) {
3083 new->nodes = nodes;
3084 } else if (nodelist) {
3085 nodes_clear(new->nodes);
3086 node_set(first_node(nodes), new->nodes);
3087 } else {
7858d7bc 3088 new->mode = MPOL_LOCAL;
269fbe72 3089 }
f2a07f40
HD
3090
3091 /*
3092 * Save nodes for contextualization: this will be used to "clone"
3093 * the mempolicy in a specific context [cpuset] at a later time.
3094 */
3095 new->w.user_nodemask = nodes;
3096
926f2ae0 3097 err = 0;
71fe804b 3098
095f1fc4
LS
3099out:
3100 /* Restore string for error message */
3101 if (nodelist)
3102 *--nodelist = ':';
3103 if (flags)
3104 *--flags = '=';
71fe804b
LS
3105 if (!err)
3106 *mpol = new;
095f1fc4
LS
3107 return err;
3108}
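/*
 * Illustrative inputs (values assumed) accepted by mpol_parse_str(), e.g.
 * via "mount -t tmpfs -o mpol=<string> tmpfs /mnt":
 *
 *	"interleave:0-3"	MPOL_INTERLEAVE over nodes 0-3
 *	"bind=static:1,3"	MPOL_BIND with MPOL_F_STATIC_NODES
 *	"prefer:2"		MPOL_PREFERRED, single node 2
 *	"local"			MPOL_LOCAL, no nodelist allowed
 */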
3109#endif /* CONFIG_TMPFS */
3110
71fe804b
LS
3111/**
3112 * mpol_to_str - format a mempolicy structure for printing
3113 * @buffer: to contain formatted mempolicy string
3114 * @maxlen: length of @buffer
3115 * @pol: pointer to mempolicy to be formatted
71fe804b 3116 *
948927ee
DR
3117 * Convert @pol into a string. If @buffer is too short, truncate the string.
3118 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3119 * longest flag, "relative", and to display at least a few node ids.
1a75a6c8 3120 */
948927ee 3121void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1a75a6c8
CL
3122{
3123 char *p = buffer;
948927ee
DR
3124 nodemask_t nodes = NODE_MASK_NONE;
3125 unsigned short mode = MPOL_DEFAULT;
3126 unsigned short flags = 0;
2291990a 3127
8790c71a 3128 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
bea904d5 3129 mode = pol->mode;
948927ee
DR
3130 flags = pol->flags;
3131 }
bea904d5 3132
1a75a6c8
CL
3133 switch (mode) {
3134 case MPOL_DEFAULT:
7858d7bc 3135 case MPOL_LOCAL:
1a75a6c8 3136 break;
1a75a6c8 3137 case MPOL_PREFERRED:
b27abacc 3138 case MPOL_PREFERRED_MANY:
1a75a6c8 3139 case MPOL_BIND:
1a75a6c8 3140 case MPOL_INTERLEAVE:
269fbe72 3141 nodes = pol->nodes;
1a75a6c8 3142 break;
1a75a6c8 3143 default:
948927ee
DR
3144 WARN_ON_ONCE(1);
3145 snprintf(p, maxlen, "unknown");
3146 return;
1a75a6c8
CL
3147 }
3148
b7a9f420 3149 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
1a75a6c8 3150
fc36b8d3 3151 if (flags & MPOL_MODE_FLAGS) {
948927ee 3152 p += snprintf(p, buffer + maxlen - p, "=");
f5b087b5 3153
2291990a
LS
3154 /*
3155 * Currently, the only defined flags are mutually exclusive
3156 */
f5b087b5 3157 if (flags & MPOL_F_STATIC_NODES)
2291990a
LS
3158 p += snprintf(p, buffer + maxlen - p, "static");
3159 else if (flags & MPOL_F_RELATIVE_NODES)
3160 p += snprintf(p, buffer + maxlen - p, "relative");
f5b087b5
DR
3161 }
3162
9e763e0f
TH
3163 if (!nodes_empty(nodes))
3164 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3165 nodemask_pr_args(&nodes));
1a75a6c8 3166}