// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Supported policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
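/*
 * Editorial note (not part of the original file): a minimal user-space
 * sketch of the policies described above, using the set_mempolicy(2) and
 * mbind(2) wrappers from libnuma's <numaif.h>.  Node numbers and sizes
 * are illustrative assumptions for a machine with at least two nodes.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	int main(void)
 *	{
 *		unsigned long interleave_mask = 0x3;	// nodes 0 and 1
 *		unsigned long bind_mask = 0x1;		// node 0 only
 *		void *buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		// process policy: interleave new allocations over nodes 0-1
 *		set_mempolicy(MPOL_INTERLEAVE, &interleave_mask,
 *			      sizeof(interleave_mask) * 8);
 *
 *		// VMA policy: bind this mapping to node 0; MPOL_MF_STRICT
 *		// reports an error if pages already in the range sit elsewhere
 *		mbind(buf, 1 << 20, MPOL_BIND, &bind_mask,
 *		      sizeof(bind_mask) * 8, MPOL_MF_STRICT);
 *		return 0;
 *	}
 */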

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_map_to_online_node - Find closest online node
 * @node: Node id to start the search
 *
 * Lookup the next closest node by distance if @node is not online.
 *
 * Return: this @node if it is online, otherwise the closest node by distance.
 */
int numa_map_to_online_node(int node)
{
	int min_dist = INT_MAX, dist, n, min_node;

	if (node == NUMA_NO_NODE || node_online(node))
		return node;

	min_node = node;
	for_each_online_node(n) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);

struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
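/*
 * Editorial note (not part of the original file): a worked example of the
 * MPOL_F_RELATIVE_NODES remap done above.  If the user passes the relative
 * nodemask {0,2} and the task is currently allowed the nodes {4,5,6}
 * (weight 3), nodes_fold() keeps {0,2} within [0,3) and nodes_onto() then
 * maps bit 0 to node 4 and bit 2 to node 6, so the effective nodemask
 * becomes {4,6}.  The node numbers here are illustrative assumptions.
 */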

static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;

	nodes_clear(pol->nodes);
	node_set(first_node(*nodes), pol->nodes);
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy. mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy. May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) and local memory policies are not subject
	 * to any remapping. They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}

/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);

			mode = MPOL_LOCAL;
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;
	policy->home_node = NUMA_NO_NODE;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol || pol->mode == MPOL_LOCAL)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */

void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm. Takes mm->mmap_lock during call.
 */

void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);
	for_each_vma(vmi, vma)
		mpol_rebind_policy(vma->vm_policy, new);
	mmap_write_unlock(mm);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
};

static int migrate_page_add(struct page *page, struct list_head *pagelist,
			    unsigned long flags);

struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	unsigned long start;
	unsigned long end;
	struct vm_area_struct *first;
};

/*
 * Check if the page's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the invert of qp->nmask.
 */
static inline bool queue_pages_required(struct page *page,
					struct queue_pages *qp)
{
	int nid = page_to_nid(page);
	unsigned long flags = qp->flags;

	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}

/*
 * queue_pages_pmd() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully, or
 *     special page is met, i.e. huge zero page.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - the pmd is a migration entry, or only MPOL_MF_STRICT was specified
 *     and an existing page was already on a node that does not follow the
 *     policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)
	__releases(ptl)
{
	int ret = 0;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		ret = -EIO;
		goto unlock;
	}
	page = pmd_page(*pmd);
	if (is_huge_zero_page(page)) {
		walk->action = ACTION_CONTINUE;
		goto unlock;
	}
	if (!queue_pages_required(page, qp))
		goto unlock;

	flags = qp->flags;
	/* go to thp migration */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma) ||
		    migrate_page_add(page, qp->pagelist, flags)) {
			ret = 1;
			goto unlock;
		}
	} else
		ret = -EIO;
unlock:
	spin_unlock(ptl);
	return ret;
}

/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully, or
 *     special page is met, i.e. zero page.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *     on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
				 unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	bool has_unmovable = false;
	pte_t *pte, *mapped_pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl)
		return queue_pages_pmd(pmd, ptl, addr, end, walk);

	if (pmd_trans_unstable(pmd))
		return 0;

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page || is_zone_device_page(page))
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		if (!queue_pages_required(page, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/* MPOL_MF_STRICT must be specified if we get here */
			if (!vma_migratable(vma)) {
				has_unmovable = true;
				break;
			}

			/*
			 * Do not abort immediately since there may be
			 * temporary off LRU pages in the range. Still
			 * need to migrate other LRU pages.
			 */
			if (migrate_page_add(page, qp->pagelist, flags))
				has_unmovable = true;
		} else
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();

	if (has_unmovable)
		return 1;

	return addr != end ? -EIO : 0;
}

static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
	int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = (qp->flags & MPOL_MF_VALID);
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	if (!queue_pages_required(page, qp))
		goto unlock;

	if (flags == MPOL_MF_STRICT) {
		/*
		 * STRICT alone means only detecting misplaced page and no
		 * need to further check other vma.
		 */
		ret = -EIO;
		goto unlock;
	}

	if (!vma_migratable(walk->vma)) {
		/*
		 * Must be STRICT with MOVE*, otherwise .test_walk() would
		 * have stopped walking the current vma.
		 * Detecting misplaced page but allow migrating pages which
		 * have been queued.
		 */
		ret = 1;
		goto unlock;
	}

	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
		if (isolate_hugetlb(page, qp->pagelist) &&
		    (flags & MPOL_MF_STRICT))
			/*
			 * Failed to isolate page but allow migrating pages
			 * which have been queued.
			 */
			ret = 1;
	}
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return ret;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			       unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	int nr_updated;

	tlb_gather_mmu(&tlb, vma->vm_mm);

	nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE,
				       MM_CP_PROT_NUMA);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	tlb_finish_mmu(&tlb);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
				      unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
				 struct mm_walk *walk)
{
	struct vm_area_struct *next, *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long endvma = vma->vm_end;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		    (qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	next = find_vma(vma->vm_mm, vma->vm_end);
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
	    ((vma->vm_end < qp->end) &&
	     (!next || vma->vm_end < next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need to check MPOL_MF_STRICT to return -EIO if possible,
	 * regardless of vma_migratable.
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	if (endvma > end)
		endvma = end;

	if (flags & MPOL_MF_LAZY) {
		/* Similar to task_numa_work, skip inaccessible VMAs */
		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
		    !(vma->vm_flags & VM_MIXEDMAP))
			change_prot_numa(vma, start, endvma);
		return 1;
	}

	/* queue pages from current vma */
	if (flags & MPOL_MF_VALID)
		return 0;
	return 1;
}

static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry = queue_pages_hugetlb,
	.pmd_entry = queue_pages_pte_range,
	.test_walk = queue_pages_test_walk,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags), they are isolated and queued to the pagelist which is
 * passed via @private.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 *         memory range specified by nodemask and maxnode points outside
 *         your accessible address space (-EFAULT)
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		  nodemask_t *nodes, unsigned long flags,
		  struct list_head *pagelist)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
	};

	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);

	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err;
}

/*
 * Apply policy to a single VMA.
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
			      struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_lock */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	MA_STATE(mas, &mm->mm_mt, start - 1, start - 1);
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;

	prev = mas_find_rev(&mas, 0);
	if (prev && (start < prev->vm_end))
		vma = prev;
	else
		vma = mas_next(&mas, end - 1);

	for (; vma; vma = mas_next(&mas, end - 1)) {
		unsigned long vmstart = max(start, vma->vm_start);
		unsigned long vmend = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			goto next;

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol, vma->vm_userfaultfd_ctx,
				 anon_vma_name(vma));
		if (prev) {
			/* vma_merge() invalidated the mas */
			mas_pause(&mas);
			vma = prev;
			goto replace;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
			/* split_vma() invalidated the mas */
			mas_pause(&mas);
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
			/* split_vma() invalidated the mas */
			mas_pause(&mas);
		}
replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
next:
		prev = vma;
	}

out:
	return err;
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE)
		current->il_prev = MAX_NUMNODES-1;
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
		*nodes = p->nodes;
		break;
	case MPOL_LOCAL:
		/* return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p = NULL;
	int ret;

	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
	if (ret > 0) {
		ret = page_to_nid(p);
		put_page(p);
	}
	return ret;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL. We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						   pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}

#ifdef CONFIG_MIGRATION
/*
 * page migration, thp tail pages can be passed.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
			    unsigned long flags)
{
	struct page *head = compound_head(page);
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
		if (!isolate_lru_page(head)) {
			list_add_tail(&head->lru, pagelist);
			mod_node_page_state(page_pgdat(head),
				NR_ISOLATED_ANON + page_is_file_lru(head),
				thp_nr_pages(head));
		} else if (flags & MPOL_MF_STRICT) {
			/*
			 * Non-movable page may reach here. And, there may be
			 * temporary off LRU pages or non-LRU movable pages.
			 * Treat them as unmovable pages since they can't be
			 * isolated, so they can't be moved at the moment. It
			 * should return -EIO for this case too.
			 */
			return -EIO;
		}
	}

	return 0;
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	struct vm_area_struct *vma;
	LIST_HEAD(pagelist);
	int err = 0;
	struct migration_target_control mtc = {
		.nid = dest,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
	};

	nodes_clear(nmask);
	node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration. Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
	 */
	vma = find_vma(mm, 0);
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
			  flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err = 0;
	nodemask_t tmp;

	lru_cache_disable();

	mmap_read_lock(mm);

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same. If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source that same node.
	 *
	 * A single scan of tmp is sufficient. As we go, we remember the
	 * most recent <s, d> pair that moved (s != d). If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved. If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship. In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
			    (node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
	mmap_read_unlock(mm);

	lru_cache_enable();
	if (err < 0)
		return err;
	return busy;

}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start by assuming the page is mapped by the same vma as contains @start.
 * Search forward from there, if not. N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
	struct folio *dst, *src = page_folio(page);
	struct vm_area_struct *vma;
	unsigned long address;
	VMA_ITERATOR(vmi, current->mm, start);
	gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;

	for_each_vma(vmi, vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
	}

	if (folio_test_hugetlb(src))
		return alloc_huge_page_vma(page_hstate(&src->page),
					   vma, address);

	if (folio_test_large(src))
		gfp = GFP_TRANSHUGE;

	/*
	 * if !vma, vma_alloc_folio() will use task or system default policy
	 */
	dst = vma_alloc_folio(gfp, folio_order(src), vma, address,
			      folio_test_large(src));
	return &dst->page;
}
#else

static int migrate_page_add(struct page *page, struct list_head *pagelist,
			    unsigned long flags)
{
	return -EIO;
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

static struct page *new_page(struct page *page, unsigned long start)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	int ret;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (flags & MPOL_MF_LAZY)
		new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		lru_cache_disable();
	}
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	ret = queue_pages_range(mm, start, end, nmask,
				flags | MPOL_MF_INVERT, &pagelist);

	if (ret < 0) {
		err = ret;
		goto up_out;
	}

	err = mbind_range(mm, start, end, new);

	if (!err) {
		int nr_failed = 0;

		if (!list_empty(&pagelist)) {
			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
			nr_failed = migrate_pages(&pagelist, new_page, NULL,
				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
			if (nr_failed)
				putback_movable_pages(&pagelist);
		}

		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
			err = -EIO;
	} else {
up_out:
		if (!list_empty(&pagelist))
			putback_movable_pages(&pagelist);
	}

	mmap_write_unlock(mm);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
		      unsigned long maxnode)
{
	unsigned long nlongs = BITS_TO_LONGS(maxnode);
	int ret;

	if (in_compat_syscall())
		ret = compat_get_bitmap(mask,
					(const compat_ulong_t __user *)nmask,
					maxnode);
	else
		ret = copy_from_user(mask, nmask,
				     nlongs * sizeof(unsigned long));

	if (ret)
		return -EFAULT;

	if (maxnode % BITS_PER_LONG)
		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;

	return 0;
}

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specifies more nodes than supported, just check
	 * that the unsupported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			maxnode -= bits;
		} else {
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
	bool compat = in_compat_syscall();

	if (compat)
		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
		maxnode = nr_node_ids;
	}

	if (compat)
		return compat_put_bitmap((compat_ulong_t __user *)mask,
					 nodes_addr(*nodes), maxnode);

	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

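/*
 * Editorial note (not part of the original file): the user-space view of
 * the bitmaps parsed above is a plain array of unsigned long, with node n
 * stored at bit (n % BITS_PER_LONG) of word (n / BITS_PER_LONG), and
 * maxnode giving the number of bits the kernel should look at.  A caller
 * might build one like this (illustrative sketch only):
 *
 *	unsigned long mask[2] = { 0 };	// room for 128 nodes on 64-bit
 *	int node = 65;			// assumed example node
 *
 *	mask[node / (8 * sizeof(long))] |=
 *		1UL << (node % (8 * sizeof(long)));
 *	set_mempolicy(MPOL_PREFERRED, mask, 128);
 */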
/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
	*flags = *mode & MPOL_MODE_FLAGS;
	*mode &= ~MPOL_MODE_FLAGS;

	if ((unsigned int)(*mode) >= MPOL_MAX)
		return -EINVAL;
	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	if (*flags & MPOL_F_NUMA_BALANCING) {
		if (*mode != MPOL_BIND)
			return -EINVAL;
		*flags |= (MPOL_F_MOF | MPOL_F_MORON);
	}
	return 0;
}

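/*
 * Editorial note (not part of the original file): from user space the mode
 * flags split out above are simply OR-ed into the mode argument, e.g.
 * (illustrative sketch):
 *
 *	set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES, mask, maxnode);
 *
 * so sanitize_mpol_flags() separates "mode" (low bits) from "flags"
 * (MPOL_MODE_FLAGS) before either is validated.
 */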
static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	start = untagged_addr(start);
	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}

SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct mempolicy *new;
	unsigned long vmstart;
	unsigned long vmend;
	unsigned long end;
	int err = -ENOENT;
	VMA_ITERATOR(vmi, mm, start);

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	for_each_vma_range(vmi, vma, end) {
		vmstart = max(start, vma->vm_start);
		vmend = min(end, vma->vm_end);
		new = mpol_dup(vma_policy(vma));
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}
		/*
		 * Only update the home node if there is an existing vma policy.
		 */
		if (!new)
			continue;

		/*
		 * If any vma in the range has a policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return an error. We don't reset
		 * the home node for vmas we already updated before.
		 */
		if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) {
			err = -EOPNOTSUPP;
			break;
		}

		new->home_node = home_node;
		err = mbind_range(mm, vmstart, vmend, new);
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}

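/*
 * Editorial note (not part of the original file): set_mempolicy_home_node()
 * has no glibc wrapper, so user space typically invokes it through
 * syscall(2).  A hedged sketch, assuming a kernel/libc new enough to define
 * __NR_set_mempolicy_home_node and a range already covered by an
 * MPOL_BIND or MPOL_PREFERRED_MANY policy:
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	// prefer node 1 as the home node for allocations in [addr, addr+len)
 *	syscall(__NR_set_mempolicy_home_node,
 *		(unsigned long)addr, len, 1UL, 0UL);
 */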
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				 unsigned long maxnode)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_set_mempolicy(lmode, mode_flags, &nodes);
}

SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}

static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				const unsigned long __user *old_nodes,
				const unsigned long __user *new_nodes)
{
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified process.
	 * Use the regular "ptrace_may_access()" checks.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	task_nodes = cpuset_mems_allowed(current);
	nodes_and(*new, *new, task_nodes);
	if (nodes_empty(*new))
		goto out_put;

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;

}

SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}

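/*
 * Editorial note (not part of the original file): a minimal user-space
 * sketch of migrate_pages(2), moving the pages of the calling process from
 * node 0 to node 1 (node numbers are illustrative assumptions):
 *
 *	#include <numaif.h>
 *
 *	unsigned long old_nodes = 1UL << 0;
 *	unsigned long new_nodes = 1UL << 1;
 *
 *	// pid 0 means "the calling process"; returns the number of pages
 *	// that could not be moved, or -1 on error
 *	long left = migrate_pages(0, 8 * sizeof(unsigned long),
 *				  &old_nodes, &new_nodes);
 */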

/* Retrieve NUMA policy */
static int kernel_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr,
				unsigned long flags)
{
	int err;
	int pval;
	nodemask_t nodes;

	if (nmask != NULL && maxnode < nr_node_ids)
		return -EINVAL;

	addr = untagged_addr(addr);

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}

SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}

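/*
 * Editorial note (not part of the original file): two common user-space
 * queries against this syscall, sketched with the <numaif.h> wrapper
 * (the pointer "p" is an assumed, already-mapped address):
 *
 *	#include <numaif.h>
 *
 *	int mode;
 *	// which policy governs the VMA containing p?
 *	get_mempolicy(&mode, NULL, 0, p, MPOL_F_ADDR);
 *
 *	int node;
 *	// which node is the page at p currently allocated on?
 *	get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR);
 */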
bool vma_migratable(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
		return false;

	/*
	 * DAX device mappings require predictable access latency, so avoid
	 * incurring periodic faults.
	 */
	if (vma_is_dax(vma))
		return false;

	if (is_vm_hugetlb_page(vma) &&
	    !hugepage_migration_supported(hstate_vma(vma)))
		return false;

	/*
	 * Migration allocates pages in the highest zone. If we cannot
	 * do so then migration (at least from node to node) is not
	 * possible.
	 */
	if (vma->vm_file &&
	    gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
			< policy_zone)
		return false;
	return true;
}

74d2c3a0
ON
1746struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1747 unsigned long addr)
1da177e4 1748{
8d90274b 1749 struct mempolicy *pol = NULL;
1da177e4
LT
1750
1751 if (vma) {
480eccf9 1752 if (vma->vm_ops && vma->vm_ops->get_policy) {
8d90274b 1753 pol = vma->vm_ops->get_policy(vma, addr);
00442ad0 1754 } else if (vma->vm_policy) {
1da177e4 1755 pol = vma->vm_policy;
00442ad0
MG
1756
1757 /*
1758 * shmem_alloc_page() passes an MPOL_F_SHARED policy with
1759 * a pseudo-vma whose vma->vm_ops is NULL. Take a reference
1760 * on these policies, which will be dropped by
1761 * mpol_cond_put() later.
1762 */
1763 if (mpol_needs_cond_ref(pol))
1764 mpol_get(pol);
1765 }
1da177e4 1766 }
f15ca78e 1767
74d2c3a0
ON
1768 return pol;
1769}
1770
1771/*
dd6eecb9 1772 * get_vma_policy(@vma, @addr)
74d2c3a0
ON
1773 * @vma: virtual memory area whose policy is sought
1774 * @addr: address in @vma for shared policy lookup
1775 *
1776 * Returns effective policy for a VMA at specified address.
dd6eecb9 1777 * Falls back to current->mempolicy or system default policy, as necessary.
74d2c3a0
ON
1778 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1779 * count--added by the get_policy() vm_op, as appropriate--to protect against
1780 * freeing by another task. It is the caller's responsibility to free the
1781 * extra reference for shared policies.
1782 */
ac79f78d 1783static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
dd6eecb9 1784 unsigned long addr)
74d2c3a0
ON
1785{
1786 struct mempolicy *pol = __get_vma_policy(vma, addr);
1787
8d90274b 1788 if (!pol)
dd6eecb9 1789 pol = get_task_policy(current);
8d90274b 1790
1da177e4
LT
1791 return pol;
1792}
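/*
 * Illustrative sketch (hypothetical caller, not part of mempolicy.c) of the
 * reference-counting contract described above: only shared policies returned
 * through vm_ops->get_policy() carry an extra reference, and mpol_cond_put()
 * drops it only in that case, so this pairing is always safe.
 */
static int example_query_vma_mode(struct vm_area_struct *vma,
				  unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);
	int mode = pol->mode;

	mpol_cond_put(pol);	/* drops the ref only for MPOL_F_SHARED */
	return mode;
}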
1793
6b6482bb 1794bool vma_policy_mof(struct vm_area_struct *vma)
fc314724 1795{
6b6482bb 1796 struct mempolicy *pol;
fc314724 1797
6b6482bb
ON
1798 if (vma->vm_ops && vma->vm_ops->get_policy) {
1799 bool ret = false;
fc314724 1800
6b6482bb
ON
1801 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1802 if (pol && (pol->flags & MPOL_F_MOF))
1803 ret = true;
1804 mpol_cond_put(pol);
8d90274b 1805
6b6482bb 1806 return ret;
fc314724
MG
1807 }
1808
6b6482bb 1809 pol = vma->vm_policy;
8d90274b 1810 if (!pol)
6b6482bb 1811 pol = get_task_policy(current);
8d90274b 1812
fc314724
MG
1813 return pol->flags & MPOL_F_MOF;
1814}
1815
d2226ebd 1816bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
d3eb1570
LJ
1817{
1818 enum zone_type dynamic_policy_zone = policy_zone;
1819
1820 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1821
1822 /*
1823 * If policy->nodes contains only movable memory, we apply the policy
1824 * only when gfp_zone(gfp) == ZONE_MOVABLE.
1825 *
1826 * policy->nodes is intersected with node_states[N_MEMORY], so if the
1827 * following test fails, it implies policy->nodes contains movable
1828 * memory only.
1829 */
269fbe72 1830 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
d3eb1570
LJ
1831 dynamic_policy_zone = ZONE_MOVABLE;
1832
1833 return zone >= dynamic_policy_zone;
1834}
1835
52cd3b07
LS
1836/*
1837 * Return a nodemask representing a mempolicy for filtering nodes for
1838 * page allocation
1839 */
8ca39e68 1840nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
19770b32 1841{
b27abacc
DH
1842 int mode = policy->mode;
1843
19770b32 1844 /* Lower zones don't get a nodemask applied for MPOL_BIND */
b27abacc
DH
1845 if (unlikely(mode == MPOL_BIND) &&
1846 apply_policy_zone(policy, gfp_zone(gfp)) &&
1847 cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1848 return &policy->nodes;
1849
1850 if (mode == MPOL_PREFERRED_MANY)
269fbe72 1851 return &policy->nodes;
19770b32
MG
1852
1853 return NULL;
1854}
1855
b27abacc
DH
1856/*
1857 * Return the preferred node id for 'prefer' mempolicy, and return
1858 * the given id for all other policies.
1859 *
1860 * policy_node() is always coupled with policy_nodemask(), which
1861 * supplies the nodemask restriction for the 'bind' and 'prefer-many' policies.
1862 */
f8fd5253 1863static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1da177e4 1864{
7858d7bc 1865 if (policy->mode == MPOL_PREFERRED) {
269fbe72 1866 nd = first_node(policy->nodes);
7858d7bc 1867 } else {
19770b32 1868 /*
6d840958
MH
1869 * __GFP_THISNODE shouldn't even be used with the bind policy
1870 * because we might easily break the expectation to stay on the
1871 * requested node and not break the policy.
19770b32 1872 */
6d840958 1873 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1da177e4 1874 }
6d840958 1875
c6018b4b
AK
1876 if ((policy->mode == MPOL_BIND ||
1877 policy->mode == MPOL_PREFERRED_MANY) &&
1878 policy->home_node != NUMA_NO_NODE)
1879 return policy->home_node;
1880
04ec6264 1881 return nd;
1da177e4
LT
1882}
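/*
 * Illustrative sketch (not part of mempolicy.c): policy_node() and
 * policy_nodemask() are meant to be used together, exactly as the
 * allocation paths further down do, so that the preferred node and the
 * nodemask restriction stay consistent for a single allocation.
 */
static struct page *example_alloc_with_policy(gfp_t gfp, unsigned int order,
					      struct mempolicy *pol)
{
	int nid = policy_node(gfp, pol, numa_node_id());
	nodemask_t *nmask = policy_nodemask(gfp, pol);

	return __alloc_pages(gfp, order, nid, nmask);
}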
1883
1884/* Do dynamic interleaving for a process */
1885static unsigned interleave_nodes(struct mempolicy *policy)
1886{
45816682 1887 unsigned next;
1da177e4
LT
1888 struct task_struct *me = current;
1889
269fbe72 1890 next = next_node_in(me->il_prev, policy->nodes);
f5b087b5 1891 if (next < MAX_NUMNODES)
45816682
VB
1892 me->il_prev = next;
1893 return next;
1da177e4
LT
1894}
1895
dc85da15
CL
1896/*
1897 * Depending on the memory policy provide a node from which to allocate the
1898 * next slab entry.
1899 */
2a389610 1900unsigned int mempolicy_slab_node(void)
dc85da15 1901{
e7b691b0 1902 struct mempolicy *policy;
2a389610 1903 int node = numa_mem_id();
e7b691b0 1904
38b031dd 1905 if (!in_task())
2a389610 1906 return node;
e7b691b0
AK
1907
1908 policy = current->mempolicy;
7858d7bc 1909 if (!policy)
2a389610 1910 return node;
bea904d5
LS
1911
1912 switch (policy->mode) {
1913 case MPOL_PREFERRED:
269fbe72 1914 return first_node(policy->nodes);
765c4507 1915
dc85da15
CL
1916 case MPOL_INTERLEAVE:
1917 return interleave_nodes(policy);
1918
b27abacc
DH
1919 case MPOL_BIND:
1920 case MPOL_PREFERRED_MANY:
1921 {
c33d6c06
MG
1922 struct zoneref *z;
1923
dc85da15
CL
1924 /*
1925 * Follow bind policy behavior and start allocation at the
1926 * first node.
1927 */
19770b32 1928 struct zonelist *zonelist;
19770b32 1929 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
c9634cf0 1930 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
c33d6c06 1931 z = first_zones_zonelist(zonelist, highest_zoneidx,
269fbe72 1932 &policy->nodes);
c1093b74 1933 return z->zone ? zone_to_nid(z->zone) : node;
dd1a239f 1934 }
7858d7bc
FT
1935 case MPOL_LOCAL:
1936 return node;
dc85da15 1937
dc85da15 1938 default:
bea904d5 1939 BUG();
dc85da15
CL
1940 }
1941}
1942
fee83b3a
AM
1943/*
1944 * Do static interleaving for a VMA with known offset @n. Returns the n'th
269fbe72 1945 * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
fee83b3a
AM
1946 * number of present nodes.
1947 */
98c70baa 1948static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1da177e4 1949{
276aeee1 1950 nodemask_t nodemask = pol->nodes;
1951 unsigned int target, nnodes;
fee83b3a
AM
1952 int i;
1953 int nid;
276aeee1 1954 /*
1955 * The barrier will stabilize the nodemask in a register or on
1956 * the stack so that it will stop changing under the code.
1957 *
1958 * Between first_node() and next_node(), pol->nodes could be changed
1959 * by other threads, so we copy pol->nodes into a local variable on the stack.
1960 */
1961 barrier();
1da177e4 1962
276aeee1 1963 nnodes = nodes_weight(nodemask);
f5b087b5
DR
1964 if (!nnodes)
1965 return numa_node_id();
fee83b3a 1966 target = (unsigned int)n % nnodes;
276aeee1 1967 nid = first_node(nodemask);
fee83b3a 1968 for (i = 0; i < target; i++)
276aeee1 1969 nid = next_node(nid, nodemask);
1da177e4
LT
1970 return nid;
1971}
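/*
 * Worked example (illustrative): with pol->nodes = {0,2,5} and n = 4,
 * nnodes = 3 and target = 4 % 3 = 1, so the walk starts at node 0 and
 * takes one next_node() step, returning node 2. Page offsets n, n+3,
 * n+6, ... therefore always land on the same node.
 */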
1972
5da7ca86
CL
1973/* Determine a node number for interleave */
1974static inline unsigned interleave_nid(struct mempolicy *pol,
1975 struct vm_area_struct *vma, unsigned long addr, int shift)
1976{
1977 if (vma) {
1978 unsigned long off;
1979
3b98b087
NA
1980 /*
1981 * for small pages, there is no difference between
1982 * shift and PAGE_SHIFT, so the bit-shift is safe.
1983 * for huge pages, since vm_pgoff is in units of small
1984 * pages, we need to shift off the always 0 bits to get
1985 * a useful offset.
1986 */
1987 BUG_ON(shift < PAGE_SHIFT);
1988 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
5da7ca86 1989 off += (addr - vma->vm_start) >> shift;
98c70baa 1990 return offset_il_node(pol, off);
5da7ca86
CL
1991 } else
1992 return interleave_nodes(pol);
1993}
1994
00ac59ad 1995#ifdef CONFIG_HUGETLBFS
480eccf9 1996/*
04ec6264 1997 * huge_node(@vma, @addr, @gfp_flags, @mpol)
b46e14ac
FF
1998 * @vma: virtual memory area whose policy is sought
1999 * @addr: address in @vma for shared policy lookup and interleave policy
2000 * @gfp_flags: for requested zone
2001 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
b27abacc 2002 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
480eccf9 2003 *
04ec6264 2004 * Returns a nid suitable for a huge page allocation and a pointer
52cd3b07 2005 * to the struct mempolicy for conditional unref after allocation.
b27abacc
DH
2006 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2007 * to the mempolicy's @nodemask for filtering the zonelist.
c0ff7453 2008 *
d26914d1 2009 * Must be protected by read_mems_allowed_begin()
480eccf9 2010 */
04ec6264
VB
2011int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2012 struct mempolicy **mpol, nodemask_t **nodemask)
5da7ca86 2013{
04ec6264 2014 int nid;
b27abacc 2015 int mode;
5da7ca86 2016
dd6eecb9 2017 *mpol = get_vma_policy(vma, addr);
b27abacc
DH
2018 *nodemask = NULL;
2019 mode = (*mpol)->mode;
5da7ca86 2020
b27abacc 2021 if (unlikely(mode == MPOL_INTERLEAVE)) {
04ec6264
VB
2022 nid = interleave_nid(*mpol, vma, addr,
2023 huge_page_shift(hstate_vma(vma)));
52cd3b07 2024 } else {
04ec6264 2025 nid = policy_node(gfp_flags, *mpol, numa_node_id());
b27abacc 2026 if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
269fbe72 2027 *nodemask = &(*mpol)->nodes;
480eccf9 2028 }
04ec6264 2029 return nid;
5da7ca86 2030}
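/*
 * Illustrative sketch (hypothetical hugetlb-side caller, not part of
 * mempolicy.c): the expected calling pattern for huge_node(). The
 * read_mems_allowed_begin()/retry() pair guards against a concurrent
 * cpuset nodemask change, and mpol_cond_put() drops the conditional
 * reference taken by get_vma_policy().
 */
static int example_pick_huge_nid(struct vm_area_struct *vma,
				 unsigned long addr, gfp_t gfp)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	unsigned int cpuset_mems_cookie;
	int nid;

	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
		mpol_cond_put(mpol);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));

	return nid;
}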
06808b08
LS
2031
2032/*
2033 * init_nodemask_of_mempolicy
2034 *
2035 * If the current task's mempolicy is "default" [NULL], return 'false'
2036 * to indicate default policy. Otherwise, extract the policy nodemask
2037 * for 'bind' or 'interleave' policy into the argument nodemask, or
2038 * initialize the argument nodemask to contain the single node for
2039 * 'preferred' or 'local' policy and return 'true' to indicate presence
2040 * of non-default mempolicy.
2041 *
2042 * We don't bother with reference counting the mempolicy [mpol_get/put]
2043 * because the current task is examining its own mempolicy and a task's
2044 * mempolicy is only ever changed by the task itself.
2045 *
2046 * N.B., it is the caller's responsibility to free a returned nodemask.
2047 */
2048bool init_nodemask_of_mempolicy(nodemask_t *mask)
2049{
2050 struct mempolicy *mempolicy;
06808b08
LS
2051
2052 if (!(mask && current->mempolicy))
2053 return false;
2054
c0ff7453 2055 task_lock(current);
06808b08
LS
2056 mempolicy = current->mempolicy;
2057 switch (mempolicy->mode) {
2058 case MPOL_PREFERRED:
b27abacc 2059 case MPOL_PREFERRED_MANY:
06808b08 2060 case MPOL_BIND:
06808b08 2061 case MPOL_INTERLEAVE:
269fbe72 2062 *mask = mempolicy->nodes;
7858d7bc
FT
2063 break;
2064
2065 case MPOL_LOCAL:
269fbe72 2066 init_nodemask_of_node(mask, numa_node_id());
06808b08
LS
2067 break;
2068
2069 default:
2070 BUG();
2071 }
c0ff7453 2072 task_unlock(current);
06808b08
LS
2073
2074 return true;
2075}
00ac59ad 2076#endif
5da7ca86 2077
6f48d0eb 2078/*
b26e517a 2079 * mempolicy_in_oom_domain
6f48d0eb 2080 *
b26e517a
FT
2081 * If tsk's mempolicy is "bind", check for intersection between mask and
2082 * the policy nodemask. Otherwise, return true for all other policies
2083 * including "interleave", as a tsk with "interleave" policy may have
2084 * memory allocated from all nodes in system.
6f48d0eb
DR
2085 *
2086 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2087 */
b26e517a 2088bool mempolicy_in_oom_domain(struct task_struct *tsk,
6f48d0eb
DR
2089 const nodemask_t *mask)
2090{
2091 struct mempolicy *mempolicy;
2092 bool ret = true;
2093
2094 if (!mask)
2095 return ret;
b26e517a 2096
6f48d0eb
DR
2097 task_lock(tsk);
2098 mempolicy = tsk->mempolicy;
b26e517a 2099 if (mempolicy && mempolicy->mode == MPOL_BIND)
269fbe72 2100 ret = nodes_intersects(mempolicy->nodes, *mask);
6f48d0eb 2101 task_unlock(tsk);
b26e517a 2102
6f48d0eb
DR
2103 return ret;
2104}
2105
1da177e4
LT
2106/* Allocate a page using the interleave policy.
2107 Separate path because it needs to do special accounting. */
662f3a0b
AK
2108static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2109 unsigned nid)
1da177e4 2110{
1da177e4
LT
2111 struct page *page;
2112
84172f4b 2113 page = __alloc_pages(gfp, order, nid, NULL);
4518085e
KW
2114 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2115 if (!static_branch_likely(&vm_numa_stat_key))
2116 return page;
de55c8b2
AR
2117 if (page && page_to_nid(page) == nid) {
2118 preempt_disable();
f19298b9 2119 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
de55c8b2
AR
2120 preempt_enable();
2121 }
1da177e4
LT
2122 return page;
2123}
2124
4c54d949
FT
2125static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2126 int nid, struct mempolicy *pol)
2127{
2128 struct page *page;
2129 gfp_t preferred_gfp;
2130
2131 /*
2132 * This is a two pass approach. The first pass will only try the
2133 * preferred nodes but skip the direct reclaim and allow the
2134 * allocation to fail, while the second pass will try all the
2135 * nodes in system.
2136 */
2137 preferred_gfp = gfp | __GFP_NOWARN;
2138 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2139 page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
2140 if (!page)
c0455116 2141 page = __alloc_pages(gfp, order, nid, NULL);
4c54d949
FT
2142
2143 return page;
2144}
2145
1da177e4 2146/**
adf88aa8 2147 * vma_alloc_folio - Allocate a folio for a VMA.
eb350739 2148 * @gfp: GFP flags.
adf88aa8 2149 * @order: Order of the folio.
eb350739
MWO
2150 * @vma: Pointer to VMA or NULL if not available.
2151 * @addr: Virtual address of the allocation. Must be inside @vma.
eb350739 2152 * @hugepage: For hugepages try only the preferred node if possible.
1da177e4 2153 *
adf88aa8 2154 * Allocate a folio for a specific address in @vma, using the appropriate
eb350739
MWO
2155 * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
2156 * of the mm_struct of the VMA to prevent it from going away. Should be
adf88aa8 2157 * used for all allocations for folios that will be mapped into user space.
1da177e4 2158 *
adf88aa8 2159 * Return: The folio on success or NULL if allocation fails.
1da177e4 2160 */
adf88aa8 2161struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
be1a13eb 2162 unsigned long addr, bool hugepage)
1da177e4 2163{
cc9a6c87 2164 struct mempolicy *pol;
be1a13eb 2165 int node = numa_node_id();
adf88aa8 2166 struct folio *folio;
04ec6264 2167 int preferred_nid;
be97a41b 2168 nodemask_t *nmask;
cc9a6c87 2169
dd6eecb9 2170 pol = get_vma_policy(vma, addr);
1da177e4 2171
0867a57c 2172 if (pol->mode == MPOL_INTERLEAVE) {
adf88aa8 2173 struct page *page;
0867a57c
VB
2174 unsigned nid;
2175
2176 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2177 mpol_cond_put(pol);
adf88aa8 2178 gfp |= __GFP_COMP;
0867a57c 2179 page = alloc_page_interleave(gfp, order, nid);
adf88aa8
MWO
2180 if (page && order > 1)
2181 prep_transhuge_page(page);
2182 folio = (struct folio *)page;
0867a57c 2183 goto out;
19deb769
DR
2184 }
2185
4c54d949 2186 if (pol->mode == MPOL_PREFERRED_MANY) {
adf88aa8
MWO
2187 struct page *page;
2188
c0455116 2189 node = policy_node(gfp, pol, node);
adf88aa8 2190 gfp |= __GFP_COMP;
4c54d949
FT
2191 page = alloc_pages_preferred_many(gfp, order, node, pol);
2192 mpol_cond_put(pol);
adf88aa8
MWO
2193 if (page && order > 1)
2194 prep_transhuge_page(page);
2195 folio = (struct folio *)page;
4c54d949
FT
2196 goto out;
2197 }
2198
19deb769
DR
2199 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2200 int hpage_node = node;
2201
2202 /*
2203 * For hugepage allocation and non-interleave policy which
2204 * allows the current node (or other explicitly preferred
2205 * node) we only try to allocate from the current/preferred
2206 * node and don't fall back to other nodes, as the cost of
2207 * remote accesses would likely offset THP benefits.
2208 *
b27abacc 2209 * If the policy is interleave or does not allow the current
19deb769
DR
2210 * node in its nodemask, we allocate the standard way.
2211 */
7858d7bc 2212 if (pol->mode == MPOL_PREFERRED)
269fbe72 2213 hpage_node = first_node(pol->nodes);
19deb769
DR
2214
2215 nmask = policy_nodemask(gfp, pol);
2216 if (!nmask || node_isset(hpage_node, *nmask)) {
2217 mpol_cond_put(pol);
cc638f32
VB
2218 /*
2219 * First, try to allocate THP only on local node, but
2220 * don't reclaim unnecessarily, just compact.
2221 */
adf88aa8
MWO
2222 folio = __folio_alloc_node(gfp | __GFP_THISNODE |
2223 __GFP_NORETRY, order, hpage_node);
76e654cc
DR
2224
2225 /*
2226 * If hugepage allocations are configured to always
2227 * synchronous compact or the vma has been madvised
2228 * to prefer hugepage backing, retry allowing remote
cc638f32 2229 * memory with both reclaim and compact as well.
76e654cc 2230 */
adf88aa8
MWO
2231 if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
2232 folio = __folio_alloc(gfp, order, hpage_node,
2233 nmask);
76e654cc 2234
19deb769
DR
2235 goto out;
2236 }
356ff8a9
DR
2237 }
2238
be97a41b 2239 nmask = policy_nodemask(gfp, pol);
04ec6264 2240 preferred_nid = policy_node(gfp, pol, node);
adf88aa8 2241 folio = __folio_alloc(gfp, order, preferred_nid, nmask);
d51e9894 2242 mpol_cond_put(pol);
be97a41b 2243out:
f584b680
MWO
2244 return folio;
2245}
adf88aa8 2246EXPORT_SYMBOL(vma_alloc_folio);
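/*
 * Illustrative sketch (hypothetical caller, not part of mempolicy.c):
 * allocating an order-0 folio for a faulting user address. The caller is
 * assumed to hold the mmap_lock of vma->vm_mm, as required above.
 */
static struct folio *example_fault_alloc(struct vm_area_struct *vma,
					 unsigned long addr)
{
	return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
}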
f584b680 2247
1da177e4 2248/**
6421ec76
MWO
2249 * alloc_pages - Allocate pages.
2250 * @gfp: GFP flags.
2251 * @order: Power of two of number of pages to allocate.
1da177e4 2252 *
6421ec76
MWO
2253 * Allocate 1 << @order contiguous pages. The physical address of the
2254 * first page is naturally aligned (eg an order-3 allocation will be aligned
2255 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2256 * process is honoured when in process context.
1da177e4 2257 *
6421ec76
MWO
2258 * Context: Can be called from any context, providing the appropriate GFP
2259 * flags are used.
2260 * Return: The page on success or NULL if allocation fails.
1da177e4 2261 */
d7f946d0 2262struct page *alloc_pages(gfp_t gfp, unsigned order)
1da177e4 2263{
8d90274b 2264 struct mempolicy *pol = &default_policy;
c0ff7453 2265 struct page *page;
1da177e4 2266
8d90274b
ON
2267 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2268 pol = get_task_policy(current);
52cd3b07
LS
2269
2270 /*
2271 * No reference counting needed for current->mempolicy
2272 * nor system default_policy
2273 */
45c4745a 2274 if (pol->mode == MPOL_INTERLEAVE)
c0ff7453 2275 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
4c54d949
FT
2276 else if (pol->mode == MPOL_PREFERRED_MANY)
2277 page = alloc_pages_preferred_many(gfp, order,
c0455116 2278 policy_node(gfp, pol, numa_node_id()), pol);
c0ff7453 2279 else
84172f4b 2280 page = __alloc_pages(gfp, order,
04ec6264 2281 policy_node(gfp, pol, numa_node_id()),
5c4b4be3 2282 policy_nodemask(gfp, pol));
cc9a6c87 2283
c0ff7453 2284 return page;
1da177e4 2285}
d7f946d0 2286EXPORT_SYMBOL(alloc_pages);
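/*
 * Illustrative sketch (not part of mempolicy.c): a plain kernel allocation
 * through alloc_pages() is placed according to the calling task's mempolicy
 * (interleave, bind, preferred, ...), with no extra work by the caller.
 */
static struct page *example_alloc_order2(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);

	if (!page)
		return NULL;
	/* ... use the 4 contiguous pages ... */
	return page;	/* caller frees with __free_pages(page, 2) */
}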
1da177e4 2287
cc09cb13
MWO
2288struct folio *folio_alloc(gfp_t gfp, unsigned order)
2289{
2290 struct page *page = alloc_pages(gfp | __GFP_COMP, order);
2291
2292 if (page && order > 1)
2293 prep_transhuge_page(page);
2294 return (struct folio *)page;
2295}
2296EXPORT_SYMBOL(folio_alloc);
2297
c00b6b96
CW
2298static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2299 struct mempolicy *pol, unsigned long nr_pages,
2300 struct page **page_array)
2301{
2302 int nodes;
2303 unsigned long nr_pages_per_node;
2304 int delta;
2305 int i;
2306 unsigned long nr_allocated;
2307 unsigned long total_allocated = 0;
2308
2309 nodes = nodes_weight(pol->nodes);
2310 nr_pages_per_node = nr_pages / nodes;
2311 delta = nr_pages - nodes * nr_pages_per_node;
2312
2313 for (i = 0; i < nodes; i++) {
2314 if (delta) {
2315 nr_allocated = __alloc_pages_bulk(gfp,
2316 interleave_nodes(pol), NULL,
2317 nr_pages_per_node + 1, NULL,
2318 page_array);
2319 delta--;
2320 } else {
2321 nr_allocated = __alloc_pages_bulk(gfp,
2322 interleave_nodes(pol), NULL,
2323 nr_pages_per_node, NULL, page_array);
2324 }
2325
2326 page_array += nr_allocated;
2327 total_allocated += nr_allocated;
2328 }
2329
2330 return total_allocated;
2331}
2332
2333static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2334 struct mempolicy *pol, unsigned long nr_pages,
2335 struct page **page_array)
2336{
2337 gfp_t preferred_gfp;
2338 unsigned long nr_allocated = 0;
2339
2340 preferred_gfp = gfp | __GFP_NOWARN;
2341 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2342
2343 nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2344 nr_pages, NULL, page_array);
2345
2346 if (nr_allocated < nr_pages)
2347 nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2348 nr_pages - nr_allocated, NULL,
2349 page_array + nr_allocated);
2350 return nr_allocated;
2351}
2352
2353/* Bulk page allocation and the mempolicy should be considered together
2354 * in some situations, such as vmalloc.
2355 *
2356 * This can speed up memory allocation, especially for the interleave
2357 * policy.
2358 */
2359unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2360 unsigned long nr_pages, struct page **page_array)
2361{
2362 struct mempolicy *pol = &default_policy;
2363
2364 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2365 pol = get_task_policy(current);
2366
2367 if (pol->mode == MPOL_INTERLEAVE)
2368 return alloc_pages_bulk_array_interleave(gfp, pol,
2369 nr_pages, page_array);
2370
2371 if (pol->mode == MPOL_PREFERRED_MANY)
2372 return alloc_pages_bulk_array_preferred_many(gfp,
2373 numa_node_id(), pol, nr_pages, page_array);
2374
2375 return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
2376 policy_nodemask(gfp, pol), nr_pages, NULL,
2377 page_array);
2378}
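/*
 * Illustrative sketch (not part of mempolicy.c): filling a page array in
 * bulk while still honouring the task mempolicy, the way a vmalloc-style
 * caller might. The function may return fewer pages than requested, so
 * the caller must cope with a short count.
 */
static unsigned long example_bulk_fill(struct page **pages, unsigned long nr)
{
	unsigned long got;

	got = alloc_pages_bulk_array_mempolicy(GFP_KERNEL, nr, pages);
	if (got < nr)
		pr_debug("bulk alloc short: got %lu of %lu\n", got, nr);
	return got;
}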
2379
ef0855d3
ON
2380int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2381{
2382 struct mempolicy *pol = mpol_dup(vma_policy(src));
2383
2384 if (IS_ERR(pol))
2385 return PTR_ERR(pol);
2386 dst->vm_policy = pol;
2387 return 0;
2388}
2389
2390/*
2391 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2392 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2393 * with the mems_allowed returned by cpuset_mems_allowed(). This
2394 * keeps mempolicies cpuset-relative after the cpuset moves. See
2395 * further kernel/cpuset.c update_nodemask().
2396 *
2397 * current's mempolicy may be rebound by another task (the task that changes
2398 * the cpuset's mems), so we needn't do the rebind work for the current task.
2399 */
2400
846a16bf
LS
2401/* Slow path of a mempolicy duplicate */
2402struct mempolicy *__mpol_dup(struct mempolicy *old)
1da177e4
LT
2403{
2404 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2405
2406 if (!new)
2407 return ERR_PTR(-ENOMEM);
708c1bbc
MX
2408
2409 /* task's mempolicy is protected by alloc_lock */
2410 if (old == current->mempolicy) {
2411 task_lock(current);
2412 *new = *old;
2413 task_unlock(current);
2414 } else
2415 *new = *old;
2416
4225399a
PJ
2417 if (current_cpuset_is_being_rebound()) {
2418 nodemask_t mems = cpuset_mems_allowed(current);
213980c0 2419 mpol_rebind_policy(new, &mems);
4225399a 2420 }
1da177e4 2421 atomic_set(&new->refcnt, 1);
1da177e4
LT
2422 return new;
2423}
2424
2425/* Slow path of a mempolicy comparison */
fcfb4dcc 2426bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1da177e4
LT
2427{
2428 if (!a || !b)
fcfb4dcc 2429 return false;
45c4745a 2430 if (a->mode != b->mode)
fcfb4dcc 2431 return false;
19800502 2432 if (a->flags != b->flags)
fcfb4dcc 2433 return false;
c6018b4b
AK
2434 if (a->home_node != b->home_node)
2435 return false;
19800502
BL
2436 if (mpol_store_user_nodemask(a))
2437 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
fcfb4dcc 2438 return false;
19800502 2439
45c4745a 2440 switch (a->mode) {
19770b32 2441 case MPOL_BIND:
1da177e4 2442 case MPOL_INTERLEAVE:
1da177e4 2443 case MPOL_PREFERRED:
b27abacc 2444 case MPOL_PREFERRED_MANY:
269fbe72 2445 return !!nodes_equal(a->nodes, b->nodes);
7858d7bc
FT
2446 case MPOL_LOCAL:
2447 return true;
1da177e4
LT
2448 default:
2449 BUG();
fcfb4dcc 2450 return false;
1da177e4
LT
2451 }
2452}
2453
1da177e4
LT
2454/*
2455 * Shared memory backing store policy support.
2456 *
2457 * Remember policies even when nobody has shared memory mapped.
2458 * The policies are kept in Red-Black tree linked from the inode.
4a8c7bb5 2459 * They are protected by the sp->lock rwlock, which should be held
1da177e4
LT
2460 * for any accesses to the tree.
2461 */
2462
4a8c7bb5
NZ
2463/*
2464 * lookup first element intersecting start-end. Caller holds sp->lock for
2465 * reading or for writing
2466 */
1da177e4
LT
2467static struct sp_node *
2468sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2469{
2470 struct rb_node *n = sp->root.rb_node;
2471
2472 while (n) {
2473 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2474
2475 if (start >= p->end)
2476 n = n->rb_right;
2477 else if (end <= p->start)
2478 n = n->rb_left;
2479 else
2480 break;
2481 }
2482 if (!n)
2483 return NULL;
2484 for (;;) {
2485 struct sp_node *w = NULL;
2486 struct rb_node *prev = rb_prev(n);
2487 if (!prev)
2488 break;
2489 w = rb_entry(prev, struct sp_node, nd);
2490 if (w->end <= start)
2491 break;
2492 n = prev;
2493 }
2494 return rb_entry(n, struct sp_node, nd);
2495}
2496
4a8c7bb5
NZ
2497/*
2498 * Insert a new shared policy into the list. Caller holds sp->lock for
2499 * writing.
2500 */
1da177e4
LT
2501static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2502{
2503 struct rb_node **p = &sp->root.rb_node;
2504 struct rb_node *parent = NULL;
2505 struct sp_node *nd;
2506
2507 while (*p) {
2508 parent = *p;
2509 nd = rb_entry(parent, struct sp_node, nd);
2510 if (new->start < nd->start)
2511 p = &(*p)->rb_left;
2512 else if (new->end > nd->end)
2513 p = &(*p)->rb_right;
2514 else
2515 BUG();
2516 }
2517 rb_link_node(&new->nd, parent, p);
2518 rb_insert_color(&new->nd, &sp->root);
140d5a49 2519 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
45c4745a 2520 new->policy ? new->policy->mode : 0);
1da177e4
LT
2521}
2522
2523/* Find shared policy intersecting idx */
2524struct mempolicy *
2525mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2526{
2527 struct mempolicy *pol = NULL;
2528 struct sp_node *sn;
2529
2530 if (!sp->root.rb_node)
2531 return NULL;
4a8c7bb5 2532 read_lock(&sp->lock);
1da177e4
LT
2533 sn = sp_lookup(sp, idx, idx+1);
2534 if (sn) {
2535 mpol_get(sn->policy);
2536 pol = sn->policy;
2537 }
4a8c7bb5 2538 read_unlock(&sp->lock);
1da177e4
LT
2539 return pol;
2540}
2541
63f74ca2
KM
2542static void sp_free(struct sp_node *n)
2543{
2544 mpol_put(n->policy);
2545 kmem_cache_free(sn_cache, n);
2546}
2547
771fb4d8
LS
2548/**
2549 * mpol_misplaced - check whether current page node is valid in policy
2550 *
b46e14ac
FF
2551 * @page: page to be checked
2552 * @vma: vm area where page mapped
2553 * @addr: virtual address where page mapped
771fb4d8
LS
2554 *
2555 * Look up the current policy node id for vma, addr and "compare to" the
2556 * page's node id. Policy determination "mimics" alloc_page_vma().
771fb4d8 2557 * Called from fault path where we know the vma and faulting address.
5f076944 2558 *
062db293
BW
2559 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2560 * policy, or a suitable node ID to allocate a replacement page from.
771fb4d8
LS
2561 */
2562int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2563{
2564 struct mempolicy *pol;
c33d6c06 2565 struct zoneref *z;
771fb4d8
LS
2566 int curnid = page_to_nid(page);
2567 unsigned long pgoff;
90572890
PZ
2568 int thiscpu = raw_smp_processor_id();
2569 int thisnid = cpu_to_node(thiscpu);
98fa15f3 2570 int polnid = NUMA_NO_NODE;
062db293 2571 int ret = NUMA_NO_NODE;
771fb4d8 2572
dd6eecb9 2573 pol = get_vma_policy(vma, addr);
771fb4d8
LS
2574 if (!(pol->flags & MPOL_F_MOF))
2575 goto out;
2576
2577 switch (pol->mode) {
2578 case MPOL_INTERLEAVE:
771fb4d8
LS
2579 pgoff = vma->vm_pgoff;
2580 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
98c70baa 2581 polnid = offset_il_node(pol, pgoff);
771fb4d8
LS
2582 break;
2583
2584 case MPOL_PREFERRED:
b27abacc
DH
2585 if (node_isset(curnid, pol->nodes))
2586 goto out;
269fbe72 2587 polnid = first_node(pol->nodes);
7858d7bc
FT
2588 break;
2589
2590 case MPOL_LOCAL:
2591 polnid = numa_node_id();
771fb4d8
LS
2592 break;
2593
2594 case MPOL_BIND:
bda420b9
HY
2595 /* Optimize placement among multiple nodes via NUMA balancing */
2596 if (pol->flags & MPOL_F_MORON) {
269fbe72 2597 if (node_isset(thisnid, pol->nodes))
bda420b9
HY
2598 break;
2599 goto out;
2600 }
b27abacc 2601 fallthrough;
c33d6c06 2602
b27abacc 2603 case MPOL_PREFERRED_MANY:
771fb4d8 2604 /*
771fb4d8
LS
2605 * use current page if in policy nodemask,
2606 * else select nearest allowed node, if any.
2607 * If no allowed nodes, use current [!misplaced].
2608 */
269fbe72 2609 if (node_isset(curnid, pol->nodes))
771fb4d8 2610 goto out;
c33d6c06 2611 z = first_zones_zonelist(
771fb4d8
LS
2612 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2613 gfp_zone(GFP_HIGHUSER),
269fbe72 2614 &pol->nodes);
c1093b74 2615 polnid = zone_to_nid(z->zone);
771fb4d8
LS
2616 break;
2617
2618 default:
2619 BUG();
2620 }
5606e387
MG
2621
2622 /* Migrate the page towards the node whose CPU is referencing it */
e42c8ff2 2623 if (pol->flags & MPOL_F_MORON) {
90572890 2624 polnid = thisnid;
5606e387 2625
10f39042 2626 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
de1c9ce6 2627 goto out;
e42c8ff2
MG
2628 }
2629
771fb4d8
LS
2630 if (curnid != polnid)
2631 ret = polnid;
2632out:
2633 mpol_cond_put(pol);
2634
2635 return ret;
2636}
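/*
 * Illustrative sketch (hypothetical fault-path caller, not part of
 * mempolicy.c): how a NUMA hinting fault consumes the return value.
 * NUMA_NO_NODE means the page is already on an acceptable node; any other
 * value is a candidate target node for migration.
 */
static int example_numa_hint(struct page *page, struct vm_area_struct *vma,
			     unsigned long addr)
{
	int target = mpol_misplaced(page, vma, addr);

	if (target == NUMA_NO_NODE)
		return 0;	/* leave the page where it is */

	/* a migrate_misplaced_page(page, vma, target) call would go here */
	return target;
}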
2637
c11600e4
DR
2638/*
2639 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2640 * dropped after task->mempolicy is set to NULL so that any allocation done as
2641 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2642 * policy.
2643 */
2644void mpol_put_task_policy(struct task_struct *task)
2645{
2646 struct mempolicy *pol;
2647
2648 task_lock(task);
2649 pol = task->mempolicy;
2650 task->mempolicy = NULL;
2651 task_unlock(task);
2652 mpol_put(pol);
2653}
2654
1da177e4
LT
2655static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2656{
2657 pr_debug("deleting %lx-%lx\n", n->start, n->end);
1da177e4 2658 rb_erase(&n->nd, &sp->root);
63f74ca2 2659 sp_free(n);
1da177e4
LT
2660}
2661
42288fe3
MG
2662static void sp_node_init(struct sp_node *node, unsigned long start,
2663 unsigned long end, struct mempolicy *pol)
2664{
2665 node->start = start;
2666 node->end = end;
2667 node->policy = pol;
2668}
2669
dbcb0f19
AB
2670static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2671 struct mempolicy *pol)
1da177e4 2672{
869833f2
KM
2673 struct sp_node *n;
2674 struct mempolicy *newpol;
1da177e4 2675
869833f2 2676 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1da177e4
LT
2677 if (!n)
2678 return NULL;
869833f2
KM
2679
2680 newpol = mpol_dup(pol);
2681 if (IS_ERR(newpol)) {
2682 kmem_cache_free(sn_cache, n);
2683 return NULL;
2684 }
2685 newpol->flags |= MPOL_F_SHARED;
42288fe3 2686 sp_node_init(n, start, end, newpol);
869833f2 2687
1da177e4
LT
2688 return n;
2689}
2690
2691/* Replace a policy range. */
2692static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2693 unsigned long end, struct sp_node *new)
2694{
b22d127a 2695 struct sp_node *n;
42288fe3
MG
2696 struct sp_node *n_new = NULL;
2697 struct mempolicy *mpol_new = NULL;
b22d127a 2698 int ret = 0;
1da177e4 2699
42288fe3 2700restart:
4a8c7bb5 2701 write_lock(&sp->lock);
1da177e4
LT
2702 n = sp_lookup(sp, start, end);
2703 /* Take care of old policies in the same range. */
2704 while (n && n->start < end) {
2705 struct rb_node *next = rb_next(&n->nd);
2706 if (n->start >= start) {
2707 if (n->end <= end)
2708 sp_delete(sp, n);
2709 else
2710 n->start = end;
2711 } else {
2712 /* Old policy spanning whole new range. */
2713 if (n->end > end) {
42288fe3
MG
2714 if (!n_new)
2715 goto alloc_new;
2716
2717 *mpol_new = *n->policy;
2718 atomic_set(&mpol_new->refcnt, 1);
7880639c 2719 sp_node_init(n_new, end, n->end, mpol_new);
1da177e4 2720 n->end = start;
5ca39575 2721 sp_insert(sp, n_new);
42288fe3
MG
2722 n_new = NULL;
2723 mpol_new = NULL;
1da177e4
LT
2724 break;
2725 } else
2726 n->end = start;
2727 }
2728 if (!next)
2729 break;
2730 n = rb_entry(next, struct sp_node, nd);
2731 }
2732 if (new)
2733 sp_insert(sp, new);
4a8c7bb5 2734 write_unlock(&sp->lock);
42288fe3
MG
2735 ret = 0;
2736
2737err_out:
2738 if (mpol_new)
2739 mpol_put(mpol_new);
2740 if (n_new)
2741 kmem_cache_free(sn_cache, n_new);
2742
b22d127a 2743 return ret;
42288fe3
MG
2744
2745alloc_new:
4a8c7bb5 2746 write_unlock(&sp->lock);
42288fe3
MG
2747 ret = -ENOMEM;
2748 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2749 if (!n_new)
2750 goto err_out;
2751 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2752 if (!mpol_new)
2753 goto err_out;
4ad09955 2754 atomic_set(&mpol_new->refcnt, 1);
42288fe3 2755 goto restart;
1da177e4
LT
2756}
2757
71fe804b
LS
2758/**
2759 * mpol_shared_policy_init - initialize shared policy for inode
2760 * @sp: pointer to inode shared policy
2761 * @mpol: struct mempolicy to install
2762 *
2763 * Install non-NULL @mpol in inode's shared policy rb-tree.
2764 * On entry, the current task has a reference on a non-NULL @mpol.
2765 * This must be released on exit.
4bfc4495 2766 * This is called at get_inode() calls and we can use GFP_KERNEL.
71fe804b
LS
2767 */
2768void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2769{
58568d2a
MX
2770 int ret;
2771
71fe804b 2772 sp->root = RB_ROOT; /* empty tree == default mempolicy */
4a8c7bb5 2773 rwlock_init(&sp->lock);
71fe804b
LS
2774
2775 if (mpol) {
2776 struct vm_area_struct pvma;
2777 struct mempolicy *new;
4bfc4495 2778 NODEMASK_SCRATCH(scratch);
71fe804b 2779
4bfc4495 2780 if (!scratch)
5c0c1654 2781 goto put_mpol;
71fe804b
LS
2782 /* contextualize the tmpfs mount point mempolicy */
2783 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
15d77835 2784 if (IS_ERR(new))
0cae3457 2785 goto free_scratch; /* no valid nodemask intersection */
58568d2a
MX
2786
2787 task_lock(current);
4bfc4495 2788 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
58568d2a 2789 task_unlock(current);
15d77835 2790 if (ret)
5c0c1654 2791 goto put_new;
71fe804b
LS
2792
2793 /* Create pseudo-vma that contains just the policy */
2c4541e2 2794 vma_init(&pvma, NULL);
71fe804b
LS
2795 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2796 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
15d77835 2797
5c0c1654 2798put_new:
71fe804b 2799 mpol_put(new); /* drop initial ref */
0cae3457 2800free_scratch:
4bfc4495 2801 NODEMASK_SCRATCH_FREE(scratch);
5c0c1654
LS
2802put_mpol:
2803 mpol_put(mpol); /* drop our incoming ref on sb mpol */
7339ff83
RH
2804 }
2805}
2806
1da177e4
LT
2807int mpol_set_shared_policy(struct shared_policy *info,
2808 struct vm_area_struct *vma, struct mempolicy *npol)
2809{
2810 int err;
2811 struct sp_node *new = NULL;
2812 unsigned long sz = vma_pages(vma);
2813
028fec41 2814 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1da177e4 2815 vma->vm_pgoff,
45c4745a 2816 sz, npol ? npol->mode : -1,
028fec41 2817 npol ? npol->flags : -1,
269fbe72 2818 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
1da177e4
LT
2819
2820 if (npol) {
2821 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2822 if (!new)
2823 return -ENOMEM;
2824 }
2825 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2826 if (err && new)
63f74ca2 2827 sp_free(new);
1da177e4
LT
2828 return err;
2829}
2830
2831/* Free a backing policy store on inode delete. */
2832void mpol_free_shared_policy(struct shared_policy *p)
2833{
2834 struct sp_node *n;
2835 struct rb_node *next;
2836
2837 if (!p->root.rb_node)
2838 return;
4a8c7bb5 2839 write_lock(&p->lock);
1da177e4
LT
2840 next = rb_first(&p->root);
2841 while (next) {
2842 n = rb_entry(next, struct sp_node, nd);
2843 next = rb_next(&n->nd);
63f74ca2 2844 sp_delete(p, n);
1da177e4 2845 }
4a8c7bb5 2846 write_unlock(&p->lock);
1da177e4
LT
2847}
2848
1a687c2e 2849#ifdef CONFIG_NUMA_BALANCING
c297663c 2850static int __initdata numabalancing_override;
1a687c2e
MG
2851
2852static void __init check_numabalancing_enable(void)
2853{
2854 bool numabalancing_default = false;
2855
2856 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2857 numabalancing_default = true;
2858
c297663c
MG
2859 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2860 if (numabalancing_override)
2861 set_numabalancing_state(numabalancing_override == 1);
2862
b0dc2b9b 2863 if (num_online_nodes() > 1 && !numabalancing_override) {
756a025f 2864 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
c297663c 2865 numabalancing_default ? "Enabling" : "Disabling");
1a687c2e
MG
2866 set_numabalancing_state(numabalancing_default);
2867 }
2868}
2869
2870static int __init setup_numabalancing(char *str)
2871{
2872 int ret = 0;
2873 if (!str)
2874 goto out;
1a687c2e
MG
2875
2876 if (!strcmp(str, "enable")) {
c297663c 2877 numabalancing_override = 1;
1a687c2e
MG
2878 ret = 1;
2879 } else if (!strcmp(str, "disable")) {
c297663c 2880 numabalancing_override = -1;
1a687c2e
MG
2881 ret = 1;
2882 }
2883out:
2884 if (!ret)
4a404bea 2885 pr_warn("Unable to parse numa_balancing=\n");
1a687c2e
MG
2886
2887 return ret;
2888}
2889__setup("numa_balancing=", setup_numabalancing);
2890#else
2891static inline void __init check_numabalancing_enable(void)
2892{
2893}
2894#endif /* CONFIG_NUMA_BALANCING */
2895
1da177e4
LT
2896/* assumes fs == KERNEL_DS */
2897void __init numa_policy_init(void)
2898{
b71636e2
PM
2899 nodemask_t interleave_nodes;
2900 unsigned long largest = 0;
2901 int nid, prefer = 0;
2902
1da177e4
LT
2903 policy_cache = kmem_cache_create("numa_policy",
2904 sizeof(struct mempolicy),
20c2df83 2905 0, SLAB_PANIC, NULL);
1da177e4
LT
2906
2907 sn_cache = kmem_cache_create("shared_policy_node",
2908 sizeof(struct sp_node),
20c2df83 2909 0, SLAB_PANIC, NULL);
1da177e4 2910
5606e387
MG
2911 for_each_node(nid) {
2912 preferred_node_policy[nid] = (struct mempolicy) {
2913 .refcnt = ATOMIC_INIT(1),
2914 .mode = MPOL_PREFERRED,
2915 .flags = MPOL_F_MOF | MPOL_F_MORON,
269fbe72 2916 .nodes = nodemask_of_node(nid),
5606e387
MG
2917 };
2918 }
2919
b71636e2
PM
2920 /*
2921 * Set interleaving policy for system init. Interleaving is only
2922 * enabled across suitably sized nodes (default is >= 16MB), or
2923 * fall back to the largest node if they're all smaller.
2924 */
2925 nodes_clear(interleave_nodes);
01f13bd6 2926 for_each_node_state(nid, N_MEMORY) {
b71636e2
PM
2927 unsigned long total_pages = node_present_pages(nid);
2928
2929 /* Preserve the largest node */
2930 if (largest < total_pages) {
2931 largest = total_pages;
2932 prefer = nid;
2933 }
2934
2935 /* Interleave this node? */
2936 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2937 node_set(nid, interleave_nodes);
2938 }
2939
2940 /* All too small, use the largest */
2941 if (unlikely(nodes_empty(interleave_nodes)))
2942 node_set(prefer, interleave_nodes);
1da177e4 2943
028fec41 2944 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
b1de0d13 2945 pr_err("%s: interleaving failed\n", __func__);
1a687c2e
MG
2946
2947 check_numabalancing_enable();
1da177e4
LT
2948}
2949
8bccd85f 2950/* Reset policy of current process to default */
1da177e4
LT
2951void numa_default_policy(void)
2952{
028fec41 2953 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1da177e4 2954}
68860ec1 2955
095f1fc4
LS
2956/*
2957 * Parse and format mempolicy from/to strings
2958 */
2959
345ace9c
LS
2960static const char * const policy_modes[] =
2961{
2962 [MPOL_DEFAULT] = "default",
2963 [MPOL_PREFERRED] = "prefer",
2964 [MPOL_BIND] = "bind",
2965 [MPOL_INTERLEAVE] = "interleave",
d3a71033 2966 [MPOL_LOCAL] = "local",
b27abacc 2967 [MPOL_PREFERRED_MANY] = "prefer (many)",
345ace9c 2968};
1a75a6c8 2969
095f1fc4
LS
2970
2971#ifdef CONFIG_TMPFS
2972/**
f2a07f40 2973 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
095f1fc4 2974 * @str: string containing mempolicy to parse
71fe804b 2975 * @mpol: pointer to struct mempolicy pointer, returned on success.
095f1fc4
LS
2976 *
2977 * Format of input:
2978 * <mode>[=<flags>][:<nodelist>]
2979 *
dad5b023 2980 * Return: %0 on success, else %1
095f1fc4 2981 */
a7a88b23 2982int mpol_parse_str(char *str, struct mempolicy **mpol)
095f1fc4 2983{
71fe804b 2984 struct mempolicy *new = NULL;
f2a07f40 2985 unsigned short mode_flags;
71fe804b 2986 nodemask_t nodes;
095f1fc4
LS
2987 char *nodelist = strchr(str, ':');
2988 char *flags = strchr(str, '=');
dedf2c73 2989 int err = 1, mode;
095f1fc4 2990
c7a91bc7
DC
2991 if (flags)
2992 *flags++ = '\0'; /* terminate mode string */
2993
095f1fc4
LS
2994 if (nodelist) {
2995 /* NUL-terminate mode or flags string */
2996 *nodelist++ = '\0';
71fe804b 2997 if (nodelist_parse(nodelist, nodes))
095f1fc4 2998 goto out;
01f13bd6 2999 if (!nodes_subset(nodes, node_states[N_MEMORY]))
095f1fc4 3000 goto out;
71fe804b
LS
3001 } else
3002 nodes_clear(nodes);
3003
dedf2c73 3004 mode = match_string(policy_modes, MPOL_MAX, str);
3005 if (mode < 0)
095f1fc4
LS
3006 goto out;
3007
71fe804b 3008 switch (mode) {
095f1fc4 3009 case MPOL_PREFERRED:
71fe804b 3010 /*
aa9f7d51
RD
3011 * Insist on a nodelist of one node only, although later
3012 * we use first_node(nodes) to grab a single node, so here
3013 * nodelist (or nodes) cannot be empty.
71fe804b 3014 */
095f1fc4
LS
3015 if (nodelist) {
3016 char *rest = nodelist;
3017 while (isdigit(*rest))
3018 rest++;
926f2ae0
KM
3019 if (*rest)
3020 goto out;
aa9f7d51
RD
3021 if (nodes_empty(nodes))
3022 goto out;
095f1fc4
LS
3023 }
3024 break;
095f1fc4
LS
3025 case MPOL_INTERLEAVE:
3026 /*
3027 * Default to online nodes with memory if no nodelist
3028 */
3029 if (!nodelist)
01f13bd6 3030 nodes = node_states[N_MEMORY];
3f226aa1 3031 break;
71fe804b 3032 case MPOL_LOCAL:
3f226aa1 3033 /*
71fe804b 3034 * Don't allow a nodelist; mpol_new() checks flags
3f226aa1 3035 */
71fe804b 3036 if (nodelist)
3f226aa1 3037 goto out;
3f226aa1 3038 break;
413b43de
RT
3039 case MPOL_DEFAULT:
3040 /*
3041 * Insist on a empty nodelist
3042 */
3043 if (!nodelist)
3044 err = 0;
3045 goto out;
b27abacc 3046 case MPOL_PREFERRED_MANY:
d69b2e63
KM
3047 case MPOL_BIND:
3048 /*
3049 * Insist on a nodelist
3050 */
3051 if (!nodelist)
3052 goto out;
095f1fc4
LS
3053 }
3054
71fe804b 3055 mode_flags = 0;
095f1fc4
LS
3056 if (flags) {
3057 /*
3058 * Currently, we only support two mutually exclusive
3059 * mode flags.
3060 */
3061 if (!strcmp(flags, "static"))
71fe804b 3062 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4 3063 else if (!strcmp(flags, "relative"))
71fe804b 3064 mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4 3065 else
926f2ae0 3066 goto out;
095f1fc4 3067 }
71fe804b
LS
3068
3069 new = mpol_new(mode, mode_flags, &nodes);
3070 if (IS_ERR(new))
926f2ae0
KM
3071 goto out;
3072
f2a07f40
HD
3073 /*
3074 * Save nodes for mpol_to_str() to show the tmpfs mount options
3075 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3076 */
269fbe72
BW
3077 if (mode != MPOL_PREFERRED) {
3078 new->nodes = nodes;
3079 } else if (nodelist) {
3080 nodes_clear(new->nodes);
3081 node_set(first_node(nodes), new->nodes);
3082 } else {
7858d7bc 3083 new->mode = MPOL_LOCAL;
269fbe72 3084 }
f2a07f40
HD
3085
3086 /*
3087 * Save nodes for contextualization: this will be used to "clone"
3088 * the mempolicy in a specific context [cpuset] at a later time.
3089 */
3090 new->w.user_nodemask = nodes;
3091
926f2ae0 3092 err = 0;
71fe804b 3093
095f1fc4
LS
3094out:
3095 /* Restore string for error message */
3096 if (nodelist)
3097 *--nodelist = ':';
3098 if (flags)
3099 *--flags = '=';
71fe804b
LS
3100 if (!err)
3101 *mpol = new;
095f1fc4
LS
3102 return err;
3103}
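/*
 * Illustrative sketch (not part of mempolicy.c): parsing a tmpfs-style
 * "mpol=" mount option string. The string is modified in place during
 * parsing and restored before return, so it must be writable.
 */
static struct mempolicy *example_parse_mount_mpol(void)
{
	static char str[] = "interleave:0-3";	/* hypothetical option value */
	struct mempolicy *pol = NULL;

	if (mpol_parse_str(str, &pol))
		return NULL;	/* parse error */
	return pol;		/* caller must mpol_put() this eventually */
}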
3104#endif /* CONFIG_TMPFS */
3105
71fe804b
LS
3106/**
3107 * mpol_to_str - format a mempolicy structure for printing
3108 * @buffer: to contain formatted mempolicy string
3109 * @maxlen: length of @buffer
3110 * @pol: pointer to mempolicy to be formatted
71fe804b 3111 *
948927ee
DR
3112 * Convert @pol into a string. If @buffer is too short, truncate the string.
3113 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3114 * longest flag, "relative", and to display at least a few node ids.
1a75a6c8 3115 */
948927ee 3116void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1a75a6c8
CL
3117{
3118 char *p = buffer;
948927ee
DR
3119 nodemask_t nodes = NODE_MASK_NONE;
3120 unsigned short mode = MPOL_DEFAULT;
3121 unsigned short flags = 0;
2291990a 3122
8790c71a 3123 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
bea904d5 3124 mode = pol->mode;
948927ee
DR
3125 flags = pol->flags;
3126 }
bea904d5 3127
1a75a6c8
CL
3128 switch (mode) {
3129 case MPOL_DEFAULT:
7858d7bc 3130 case MPOL_LOCAL:
1a75a6c8 3131 break;
1a75a6c8 3132 case MPOL_PREFERRED:
b27abacc 3133 case MPOL_PREFERRED_MANY:
1a75a6c8 3134 case MPOL_BIND:
1a75a6c8 3135 case MPOL_INTERLEAVE:
269fbe72 3136 nodes = pol->nodes;
1a75a6c8 3137 break;
1a75a6c8 3138 default:
948927ee
DR
3139 WARN_ON_ONCE(1);
3140 snprintf(p, maxlen, "unknown");
3141 return;
1a75a6c8
CL
3142 }
3143
b7a9f420 3144 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
1a75a6c8 3145
fc36b8d3 3146 if (flags & MPOL_MODE_FLAGS) {
948927ee 3147 p += snprintf(p, buffer + maxlen - p, "=");
f5b087b5 3148
2291990a
LS
3149 /*
3150 * Currently, the only defined flags are mutually exclusive
3151 */
f5b087b5 3152 if (flags & MPOL_F_STATIC_NODES)
2291990a
LS
3153 p += snprintf(p, buffer + maxlen - p, "static");
3154 else if (flags & MPOL_F_RELATIVE_NODES)
3155 p += snprintf(p, buffer + maxlen - p, "relative");
f5b087b5
DR
3156 }
3157
9e763e0f
TH
3158 if (!nodes_empty(nodes))
3159 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3160 nodemask_pr_args(&nodes));
1a75a6c8 3161}
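/*
 * Illustrative sketch (not part of mempolicy.c): formatting a policy for
 * display, as /proc/<pid>/numa_maps and tmpfs mount options do. For an
 * interleave policy over nodes 0-3 with static nodes, the buffer below
 * would end up holding "interleave=static:0-3".
 */
static void example_show_policy(struct mempolicy *pol)
{
	char buf[64];

	mpol_to_str(buf, sizeof(buf), pol);
	pr_debug("policy: %s\n", buf);
}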