mempolicy: alloc_pages_mpol() for NUMA policy without vma
[linux-2.6-block.git] / mm / mempolicy.c
46aeb7e6 1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Simple NUMA memory policy for the Linux kernel.
4 *
5 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
8bccd85f 6 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter
20 * is used.
8bccd85f 21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
c36f6e6d 28 * preferred Try a specific node first before normal fallback.
00ef2d2f 29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
8bccd85f 33 *
34 * preferred many Try a set of nodes first before normal fallback. This is
35 * similar to preferred without the special case.
36 *
37 * default Allocate on the local node first, or when on a VMA
38 * use the process policy. This is what Linux always did
39 * in a NUMA aware kernel and still does by, ahem, default.
40 *
41 * The process policy is applied for most non-interrupt memory allocations
42 * in that process' context. Interrupts ignore the policies and always
43 * try to allocate on the local CPU. The VMA policy is only applied for memory
44 * allocations for a VMA in the VM.
45 *
46 * Currently there are a few corner cases in swapping where the policy
47 * is not applied, but the majority should be handled. When process policy
48 * is used it is not remembered over swap outs/swap ins.
49 *
50 * Only the highest zone in the zone hierarchy gets policied. Allocations
51 * requesting a lower zone just use default policy. This implies that
52 * on systems with highmem, kernel lowmem allocations don't get policied.
53 * Same with GFP_DMA allocations.
54 *
c36f6e6d 55 * For shmem/tmpfs shared memory the policy is shared between
56 * all users and remembered even when nobody has memory mapped.
57 */
58
59/* Notebook:
60 fix mmap readahead to honour policy and enable policy for any page cache
61 object
62 statistics for bigpages
63 global policy for page cache? currently it uses process policy. Requires
64 first item above.
65 handle mremap for shared memory (currently ignored for the policy)
66 grows down?
67 make bind policy root only? It can trigger oom much faster and the
68 kernel is not always grateful with that.
69*/
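The four modes described above map directly onto the set_mempolicy(2)/mbind(2) uapi. As a quick orientation, here is a minimal userspace sketch (not part of this file) that installs and then clears a task-wide interleave policy; it assumes a system with nodes 0 and 1 and links against libnuma (-lnuma) for the thin syscall wrappers declared in <numaif.h>.

    /* Illustrative only: choose a task-wide NUMA policy from userspace. */
    #include <numaif.h>          /* set_mempolicy(), MPOL_* constants */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            /* Bit 0 = node 0, bit 1 = node 1; assumes both nodes exist. */
            unsigned long nodes = (1UL << 0) | (1UL << 1);

            /* Interleave this task's future allocations across nodes 0-1. */
            if (set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes) + 1)) {
                    perror("set_mempolicy(MPOL_INTERLEAVE)");
                    return EXIT_FAILURE;
            }

            /* Back to default: local allocation, with any VMA policy taking
             * precedence, exactly as the comment above describes. */
            if (set_mempolicy(MPOL_DEFAULT, NULL, 0)) {
                    perror("set_mempolicy(MPOL_DEFAULT)");
                    return EXIT_FAILURE;
            }
            return EXIT_SUCCESS;
    }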
70
71#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
72
1da177e4 73#include <linux/mempolicy.h>
a520110e 74#include <linux/pagewalk.h>
75#include <linux/highmem.h>
76#include <linux/hugetlb.h>
77#include <linux/kernel.h>
78#include <linux/sched.h>
6e84f315 79#include <linux/sched/mm.h>
6a3827d7 80#include <linux/sched/numa_balancing.h>
f719ff9b 81#include <linux/sched/task.h>
82#include <linux/nodemask.h>
83#include <linux/cpuset.h>
84#include <linux/slab.h>
85#include <linux/string.h>
b95f1b31 86#include <linux/export.h>
b488893a 87#include <linux/nsproxy.h>
88#include <linux/interrupt.h>
89#include <linux/init.h>
90#include <linux/compat.h>
31367466 91#include <linux/ptrace.h>
dc9aa5b9 92#include <linux/swap.h>
93#include <linux/seq_file.h>
94#include <linux/proc_fs.h>
b20a3503 95#include <linux/migrate.h>
62b61f61 96#include <linux/ksm.h>
95a402c3 97#include <linux/rmap.h>
86c3a764 98#include <linux/security.h>
dbcb0f19 99#include <linux/syscalls.h>
095f1fc4 100#include <linux/ctype.h>
6d9c285a 101#include <linux/mm_inline.h>
b24f53a0 102#include <linux/mmu_notifier.h>
b1de0d13 103#include <linux/printk.h>
c8633798 104#include <linux/swapops.h>
dc9aa5b9 105
1da177e4 106#include <asm/tlbflush.h>
4a18419f 107#include <asm/tlb.h>
7c0f6ba6 108#include <linux/uaccess.h>
1da177e4 109
110#include "internal.h"
111
38e35860 112/* Internal flags */
dc9aa5b9 113#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
114#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
115#define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */
dc9aa5b9 116
117static struct kmem_cache *policy_cache;
118static struct kmem_cache *sn_cache;
1da177e4 119
120/* Highest zone. A specific allocation for a zone below that is not
 121 policied. */
6267276f 122enum zone_type policy_zone = 0;
1da177e4 123
124/*
125 * run-time system-wide default policy => local allocation
126 */
e754d79d 127static struct mempolicy default_policy = {
1da177e4 128 .refcnt = ATOMIC_INIT(1), /* never free it */
7858d7bc 129 .mode = MPOL_LOCAL,
130};
131
132static struct mempolicy preferred_node_policy[MAX_NUMNODES];
133
134/**
 135 * numa_map_to_online_node - Find closest online node
f6e92f40 136 * @node: Node id to start the search
 137 *
 138 * Lookup the next closest node by distance if @node is not online.
 139 *
 140 * Return: this @node if it is online, otherwise the closest node by distance
 141 */
142int numa_map_to_online_node(int node)
143{
4fcbe96e 144 int min_dist = INT_MAX, dist, n, min_node;
b2ca916c 145
146 if (node == NUMA_NO_NODE || node_online(node))
147 return node;
148
149 min_node = node;
150 for_each_online_node(n) {
151 dist = node_distance(node, n);
152 if (dist < min_dist) {
153 min_dist = dist;
154 min_node = n;
155 }
156 }
157
158 return min_node;
159}
160EXPORT_SYMBOL_GPL(numa_map_to_online_node);
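A hypothetical in-kernel caller (not taken from this file) showing how numa_map_to_online_node() is meant to be used: code handed a possibly offline or memoryless node id, for example from firmware tables, remaps it before allocating. The function name, kmalloc_node() call and GFP flag are illustrative assumptions only.

    /* Illustrative sketch, not from this file (assumes <linux/slab.h>). */
    static void *alloc_near_device(int dev_nid, size_t size)
    {
            /* dev_nid may name an offline node; fall back to the nearest
             * online node by distance before allocating. */
            int nid = numa_map_to_online_node(dev_nid);

            return kmalloc_node(size, GFP_KERNEL, nid);
    }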
161
74d2c3a0 162struct mempolicy *get_task_policy(struct task_struct *p)
163{
164 struct mempolicy *pol = p->mempolicy;
f15ca78e 165 int node;
5606e387 166
167 if (pol)
168 return pol;
5606e387 169
170 node = numa_node_id();
171 if (node != NUMA_NO_NODE) {
172 pol = &preferred_node_policy[node];
173 /* preferred_node_policy is not initialised early in boot */
174 if (pol->mode)
175 return pol;
176 }
177
f15ca78e 178 return &default_policy;
179}
180
181static const struct mempolicy_operations {
182 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
213980c0 183 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
184} mpol_ops[MPOL_MAX];
185
186static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
187{
6d556294 188 return pol->flags & MPOL_MODE_FLAGS;
189}
190
191static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
192 const nodemask_t *rel)
193{
194 nodemask_t tmp;
195 nodes_fold(tmp, *orig, nodes_weight(*rel));
196 nodes_onto(*ret, tmp, *rel);
197}
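The MPOL_F_RELATIVE_NODES remapping above (fold the user mask modulo the number of allowed nodes, then map each remaining bit onto the n-th allowed node) is easier to see with a worked example. The standalone sketch below mimics that arithmetic with a plain 64-bit mask; it only illustrates the semantics, since the kernel operates on nodemask_t via nodes_fold()/nodes_onto(), and it assumes a GCC/Clang-style __builtin_popcountl().

    /* Illustrative only: mimic mpol_relative_nodemask() with a 64-bit mask. */
    #include <stdio.h>

    static int nth_set_bit(unsigned long mask, unsigned int n)
    {
            for (int bit = 0; bit < 64; bit++) {
                    if (mask & (1UL << bit)) {
                            if (n == 0)
                                    return bit;
                            n--;
                    }
            }
            return -1;
    }

    static unsigned long relative_nodemask(unsigned long user, unsigned long allowed)
    {
            unsigned int weight = __builtin_popcountl(allowed);
            unsigned long folded = 0, out = 0;

            if (!weight)
                    return 0;

            /* nodes_fold(): bit n of the user mask becomes bit n % weight. */
            for (int bit = 0; bit < 64; bit++)
                    if (user & (1UL << bit))
                            folded |= 1UL << (bit % weight);

            /* nodes_onto(): bit n selects the n-th set bit of 'allowed'. */
            for (unsigned int n = 0; n < weight; n++)
                    if (folded & (1UL << n))
                            out |= 1UL << nth_set_bit(allowed, n);

            return out;
    }

    int main(void)
    {
            /* Relative nodes {0,2} inside a cpuset allowing nodes {4,5,6}
             * end up as physical nodes {4,6}. */
            printf("0x%lx\n", relative_nodemask(0x5UL, 0x70UL)); /* prints 0x50 */
            return 0;
    }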
198
be897d48 199static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
200{
201 if (nodes_empty(*nodes))
202 return -EINVAL;
269fbe72 203 pol->nodes = *nodes;
204 return 0;
205}
206
207static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
208{
209 if (nodes_empty(*nodes))
210 return -EINVAL;
211
212 nodes_clear(pol->nodes);
213 node_set(first_node(*nodes), pol->nodes);
214 return 0;
215}
216
217/*
218 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
219 * any, for the new policy. mpol_new() has already validated the nodes
7858d7bc 220 * parameter with respect to the policy mode and flags.
221 *
222 * Must be called holding task's alloc_lock to protect task's mems_allowed
c1e8d7c6 223 * and mempolicy. May also be called holding the mmap_lock for write.
58568d2a 224 */
225static int mpol_set_nodemask(struct mempolicy *pol,
226 const nodemask_t *nodes, struct nodemask_scratch *nsc)
58568d2a 227{
228 int ret;
229
230 /*
231 * Default (pol==NULL) resp. local memory policies are not a
232 * subject of any remapping. They also do not need any special
233 * constructor.
234 */
235 if (!pol || pol->mode == MPOL_LOCAL)
58568d2a 236 return 0;
7858d7bc 237
01f13bd6 238 /* Check N_MEMORY */
4bfc4495 239 nodes_and(nsc->mask1,
01f13bd6 240 cpuset_current_mems_allowed, node_states[N_MEMORY]);
58568d2a
MX
241
242 VM_BUG_ON(!nodes);
4bfc4495 243
7858d7bc
FT
244 if (pol->flags & MPOL_F_RELATIVE_NODES)
245 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
246 else
247 nodes_and(nsc->mask2, *nodes, nsc->mask1);
58568d2a 248
7858d7bc
FT
249 if (mpol_store_user_nodemask(pol))
250 pol->w.user_nodemask = *nodes;
4bfc4495 251 else
7858d7bc
FT
252 pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
253
254 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
58568d2a
MX
255 return ret;
256}
257
258/*
259 * This function just creates a new policy, does some checks and simple
260 * initialization. You must invoke mpol_set_nodemask() to set nodes.
261 */
028fec41
DR
262static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
263 nodemask_t *nodes)
1da177e4
LT
264{
265 struct mempolicy *policy;
266
3e1f0645
DR
267 if (mode == MPOL_DEFAULT) {
268 if (nodes && !nodes_empty(*nodes))
37012946 269 return ERR_PTR(-EINVAL);
d3a71033 270 return NULL;
37012946 271 }
3e1f0645
DR
272 VM_BUG_ON(!nodes);
273
274 /*
275 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
276 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
277 * All other modes require a valid pointer to a non-empty nodemask.
278 */
279 if (mode == MPOL_PREFERRED) {
280 if (nodes_empty(*nodes)) {
281 if (((flags & MPOL_F_STATIC_NODES) ||
282 (flags & MPOL_F_RELATIVE_NODES)))
283 return ERR_PTR(-EINVAL);
7858d7bc
FT
284
285 mode = MPOL_LOCAL;
3e1f0645 286 }
479e2802 287 } else if (mode == MPOL_LOCAL) {
8d303e44
PK
288 if (!nodes_empty(*nodes) ||
289 (flags & MPOL_F_STATIC_NODES) ||
290 (flags & MPOL_F_RELATIVE_NODES))
479e2802 291 return ERR_PTR(-EINVAL);
3e1f0645
DR
292 } else if (nodes_empty(*nodes))
293 return ERR_PTR(-EINVAL);
c36f6e6d 294
1da177e4
LT
295 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
296 if (!policy)
297 return ERR_PTR(-ENOMEM);
298 atomic_set(&policy->refcnt, 1);
45c4745a 299 policy->mode = mode;
3e1f0645 300 policy->flags = flags;
c6018b4b 301 policy->home_node = NUMA_NO_NODE;
37012946 302
1da177e4 303 return policy;
37012946
DR
304}
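Seen from userspace, the checks in mpol_new() turn into a handful of easy-to-hit EINVAL cases. A hedged sketch follows (it assumes <numaif.h>; MPOL_LOCAL and the MPOL_F_* flags may require recent uapi headers, and the chosen node number is arbitrary).

    /* Illustrative only: mode/nodemask combinations mpol_new() rejects. */
    #include <numaif.h>
    #include <errno.h>
    #include <assert.h>

    int main(void)
    {
            unsigned long node0 = 1UL << 0;
            unsigned long maxnode = 8 * sizeof(node0) + 1;

            /* MPOL_DEFAULT must not carry a non-empty nodemask. */
            assert(set_mempolicy(MPOL_DEFAULT, &node0, maxnode) == -1 &&
                   errno == EINVAL);

            /* MPOL_LOCAL likewise takes no nodes. */
            assert(set_mempolicy(MPOL_LOCAL, &node0, maxnode) == -1 &&
                   errno == EINVAL);

            /* An empty nodemask is only allowed for MPOL_PREFERRED (where it
             * degrades to MPOL_LOCAL), and not together with the STATIC or
             * RELATIVE node flags. */
            assert(set_mempolicy(MPOL_PREFERRED | MPOL_F_STATIC_NODES,
                                 NULL, 0) == -1 && errno == EINVAL);

            return 0;
    }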
305
52cd3b07 306/* Slow path of a mpol destructor. */
c36f6e6d 307void __mpol_put(struct mempolicy *pol)
52cd3b07 308{
c36f6e6d 309 if (!atomic_dec_and_test(&pol->refcnt))
52cd3b07 310 return;
c36f6e6d 311 kmem_cache_free(policy_cache, pol);
52cd3b07
LS
312}
313
213980c0 314static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
37012946
DR
315{
316}
317
213980c0 318static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
37012946
DR
319{
320 nodemask_t tmp;
321
322 if (pol->flags & MPOL_F_STATIC_NODES)
323 nodes_and(tmp, pol->w.user_nodemask, *nodes);
324 else if (pol->flags & MPOL_F_RELATIVE_NODES)
325 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
326 else {
269fbe72 327 nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
213980c0 328 *nodes);
29b190fa 329 pol->w.cpuset_mems_allowed = *nodes;
37012946 330 }
f5b087b5 331
708c1bbc
MX
332 if (nodes_empty(tmp))
333 tmp = *nodes;
334
269fbe72 335 pol->nodes = tmp;
37012946
DR
336}
337
338static void mpol_rebind_preferred(struct mempolicy *pol,
213980c0 339 const nodemask_t *nodes)
37012946 340{
7858d7bc 341 pol->w.cpuset_mems_allowed = *nodes;
1da177e4
LT
342}
343
708c1bbc
MX
344/*
345 * mpol_rebind_policy - Migrate a policy to a different set of nodes
346 *
c1e8d7c6 347 * Per-vma policies are protected by mmap_lock. Allocations using per-task
213980c0
VB
348 * policies are protected by task->mems_allowed_seq to prevent a premature
349 * OOM/allocation failure due to parallel nodemask modification.
708c1bbc 350 */
213980c0 351static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1d0d2680 352{
018160ad 353 if (!pol || pol->mode == MPOL_LOCAL)
1d0d2680 354 return;
7858d7bc 355 if (!mpol_store_user_nodemask(pol) &&
1d0d2680
DR
356 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
357 return;
708c1bbc 358
213980c0 359 mpol_ops[pol->mode].rebind(pol, newmask);
1d0d2680
DR
360}
361
362/*
363 * Wrapper for mpol_rebind_policy() that just requires task
364 * pointer, and updates task mempolicy.
58568d2a
MX
365 *
366 * Called with task's alloc_lock held.
1d0d2680 367 */
213980c0 368void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1d0d2680 369{
213980c0 370 mpol_rebind_policy(tsk->mempolicy, new);
1d0d2680
DR
371}
372
373/*
374 * Rebind each vma in mm to new nodemask.
375 *
c1e8d7c6 376 * Call holding a reference to mm. Takes mm->mmap_lock during call.
1d0d2680 377 */
1d0d2680
DR
378void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
379{
380 struct vm_area_struct *vma;
66850be5 381 VMA_ITERATOR(vmi, mm, 0);
1d0d2680 382
d8ed45c5 383 mmap_write_lock(mm);
6c21e066
JH
384 for_each_vma(vmi, vma) {
385 vma_start_write(vma);
213980c0 386 mpol_rebind_policy(vma->vm_policy, new);
6c21e066 387 }
d8ed45c5 388 mmap_write_unlock(mm);
1d0d2680
DR
389}
390
37012946
DR
391static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
392 [MPOL_DEFAULT] = {
393 .rebind = mpol_rebind_default,
394 },
395 [MPOL_INTERLEAVE] = {
be897d48 396 .create = mpol_new_nodemask,
37012946
DR
397 .rebind = mpol_rebind_nodemask,
398 },
399 [MPOL_PREFERRED] = {
400 .create = mpol_new_preferred,
401 .rebind = mpol_rebind_preferred,
402 },
403 [MPOL_BIND] = {
be897d48 404 .create = mpol_new_nodemask,
37012946
DR
405 .rebind = mpol_rebind_nodemask,
406 },
7858d7bc
FT
407 [MPOL_LOCAL] = {
408 .rebind = mpol_rebind_default,
409 },
b27abacc 410 [MPOL_PREFERRED_MANY] = {
be897d48 411 .create = mpol_new_nodemask,
b27abacc
DH
412 .rebind = mpol_rebind_preferred,
413 },
37012946
DR
414};
415
1cb5d11a 416static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
fc301289 417 unsigned long flags);
1a75a6c8 418
1cb5d11a
HD
419static bool strictly_unmovable(unsigned long flags)
420{
421 /*
422 * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO
423 * if any misplaced page is found.
424 */
425 return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
426 MPOL_MF_STRICT;
427}
428
6f4576e3
NH
429struct queue_pages {
430 struct list_head *pagelist;
431 unsigned long flags;
432 nodemask_t *nmask;
f18da660
LX
433 unsigned long start;
434 unsigned long end;
435 struct vm_area_struct *first;
1cb5d11a
HD
436 struct folio *large; /* note last large folio encountered */
437 long nr_failed; /* could not be isolated at this time */
6f4576e3
NH
438};
439
88aaa2a1 440/*
d451b89d 441 * Check if the folio's nid is in qp->nmask.
88aaa2a1
NH
442 *
443 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
444 * in the invert of qp->nmask.
445 */
d451b89d 446static inline bool queue_folio_required(struct folio *folio,
88aaa2a1
NH
447 struct queue_pages *qp)
448{
d451b89d 449 int nid = folio_nid(folio);
88aaa2a1
NH
450 unsigned long flags = qp->flags;
451
452 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
453}
454
1cb5d11a 455static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
c8633798 456{
de1f5055 457 struct folio *folio;
c8633798 458 struct queue_pages *qp = walk->private;
c8633798
NH
459
460 if (unlikely(is_pmd_migration_entry(*pmd))) {
1cb5d11a
HD
461 qp->nr_failed++;
462 return;
c8633798 463 }
de1f5055
VMO
464 folio = pfn_folio(pmd_pfn(*pmd));
465 if (is_huge_zero_page(&folio->page)) {
e5947d23 466 walk->action = ACTION_CONTINUE;
1cb5d11a 467 return;
c8633798 468 }
d451b89d 469 if (!queue_folio_required(folio, qp))
1cb5d11a
HD
470 return;
471 if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
472 !vma_migratable(walk->vma) ||
473 !migrate_folio_add(folio, qp->pagelist, qp->flags))
474 qp->nr_failed++;
c8633798
NH
475}
476
98094945 477/*
1cb5d11a
HD
478 * Scan through folios, checking if they satisfy the required conditions,
479 * moving them from LRU to local pagelist for migration if they do (or not).
d8835445 480 *
1cb5d11a
HD
481 * queue_folios_pte_range() has two possible return values:
482 * 0 - continue walking to scan for more, even if an existing folio on the
483 * wrong node could not be isolated and queued for migration.
484 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
485 * and an existing folio was on a node that does not follow the policy.
98094945 486 */
3dae02bb 487static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
6f4576e3 488 unsigned long end, struct mm_walk *walk)
1da177e4 489{
6f4576e3 490 struct vm_area_struct *vma = walk->vma;
3dae02bb 491 struct folio *folio;
6f4576e3
NH
492 struct queue_pages *qp = walk->private;
493 unsigned long flags = qp->flags;
3f088420 494 pte_t *pte, *mapped_pte;
c33c7948 495 pte_t ptent;
705e87c0 496 spinlock_t *ptl;
941150a3 497
c8633798 498 ptl = pmd_trans_huge_lock(pmd, vma);
1cb5d11a
HD
499 if (ptl) {
500 queue_folios_pmd(pmd, walk);
501 spin_unlock(ptl);
502 goto out;
503 }
91612e0d 504
3f088420 505 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
7780d040
HD
506 if (!pte) {
507 walk->action = ACTION_AGAIN;
508 return 0;
509 }
6f4576e3 510 for (; addr != end; pte++, addr += PAGE_SIZE) {
c33c7948 511 ptent = ptep_get(pte);
1cb5d11a 512 if (pte_none(ptent))
1da177e4 513 continue;
1cb5d11a
HD
514 if (!pte_present(ptent)) {
515 if (is_migration_entry(pte_to_swp_entry(ptent)))
516 qp->nr_failed++;
517 continue;
518 }
c33c7948 519 folio = vm_normal_folio(vma, addr, ptent);
3dae02bb 520 if (!folio || folio_is_zone_device(folio))
1da177e4 521 continue;
053837fc 522 /*
3dae02bb
VMO
523 * vm_normal_folio() filters out zero pages, but there might
524 * still be reserved folios to skip, perhaps in a VDSO.
053837fc 525 */
3dae02bb 526 if (folio_test_reserved(folio))
f4598c8b 527 continue;
d451b89d 528 if (!queue_folio_required(folio, qp))
38e35860 529 continue;
1cb5d11a 530 if (folio_test_large(folio)) {
a53190a4 531 /*
1cb5d11a
HD
532 * A large folio can only be isolated from LRU once,
533 * but may be mapped by many PTEs (and Copy-On-Write may
534 * intersperse PTEs of other, order 0, folios). This is
535 * a common case, so don't mistake it for failure (but
536 * there can be other cases of multi-mapped pages which
537 * this quick check does not help to filter out - and a
538 * search of the pagelist might grow to be prohibitive).
539 *
540 * migrate_pages(&pagelist) returns nr_failed folios, so
541 * check "large" now so that queue_pages_range() returns
542 * a comparable nr_failed folios. This does imply that
543 * if folio could not be isolated for some racy reason
544 * at its first PTE, later PTEs will not give it another
545 * chance of isolation; but keeps the accounting simple.
a53190a4 546 */
1cb5d11a
HD
547 if (folio == qp->large)
548 continue;
549 qp->large = folio;
550 }
551 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
552 !vma_migratable(vma) ||
553 !migrate_folio_add(folio, qp->pagelist, flags)) {
554 qp->nr_failed++;
555 if (strictly_unmovable(flags))
556 break;
557 }
6f4576e3 558 }
3f088420 559 pte_unmap_unlock(mapped_pte, ptl);
6f4576e3 560 cond_resched();
1cb5d11a
HD
561out:
562 if (qp->nr_failed && strictly_unmovable(flags))
563 return -EIO;
564 return 0;
91612e0d
HD
565}
566
0a2c1e81 567static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
6f4576e3
NH
568 unsigned long addr, unsigned long end,
569 struct mm_walk *walk)
e2d8cf40
NH
570{
571#ifdef CONFIG_HUGETLB_PAGE
6f4576e3 572 struct queue_pages *qp = walk->private;
1cb5d11a 573 unsigned long flags = qp->flags;
0a2c1e81 574 struct folio *folio;
cb900f41 575 spinlock_t *ptl;
d4c54919 576 pte_t entry;
e2d8cf40 577
6f4576e3
NH
578 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
579 entry = huge_ptep_get(pte);
1cb5d11a
HD
580 if (!pte_present(entry)) {
581 if (unlikely(is_hugetlb_entry_migration(entry)))
582 qp->nr_failed++;
d4c54919 583 goto unlock;
1cb5d11a 584 }
0a2c1e81 585 folio = pfn_folio(pte_pfn(entry));
d451b89d 586 if (!queue_folio_required(folio, qp))
e2d8cf40 587 goto unlock;
1cb5d11a
HD
588 if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
589 !vma_migratable(walk->vma)) {
590 qp->nr_failed++;
dcf17635
LX
591 goto unlock;
592 }
0a2c1e81 593 /*
1cb5d11a
HD
594 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
595 * Choosing not to migrate a shared folio is not counted as a failure.
0a2c1e81
VMO
596 *
597 * To check if the folio is shared, ideally we want to make sure
598 * every page is mapped to the same process. Doing that is very
1cb5d11a 599 * expensive, so check the estimated sharers of the folio instead.
0a2c1e81 600 */
1cb5d11a
HD
601 if ((flags & MPOL_MF_MOVE_ALL) ||
602 (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte)))
603 if (!isolate_hugetlb(folio, qp->pagelist))
604 qp->nr_failed++;
e2d8cf40 605unlock:
cb900f41 606 spin_unlock(ptl);
1cb5d11a
HD
607 if (qp->nr_failed && strictly_unmovable(flags))
608 return -EIO;
e2d8cf40 609#endif
1cb5d11a 610 return 0;
1da177e4
LT
611}
612
5877231f 613#ifdef CONFIG_NUMA_BALANCING
b24f53a0 614/*
4b10e7d5
MG
615 * This is used to mark a range of virtual addresses to be inaccessible.
616 * These are later cleared by a NUMA hinting fault. Depending on these
617 * faults, pages may be migrated for better NUMA placement.
618 *
619 * This is assuming that NUMA faults are handled using PROT_NONE. If
620 * an architecture makes a different choice, it will need further
621 * changes to the core.
b24f53a0 622 */
4b10e7d5
MG
623unsigned long change_prot_numa(struct vm_area_struct *vma,
624 unsigned long addr, unsigned long end)
b24f53a0 625{
4a18419f 626 struct mmu_gather tlb;
a79390f5 627 long nr_updated;
b24f53a0 628
4a18419f
NA
629 tlb_gather_mmu(&tlb, vma->vm_mm);
630
1ef488ed 631 nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
d1751118 632 if (nr_updated > 0)
03c5a6e1 633 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
b24f53a0 634
4a18419f
NA
635 tlb_finish_mmu(&tlb);
636
4b10e7d5 637 return nr_updated;
b24f53a0 638}
5877231f 639#endif /* CONFIG_NUMA_BALANCING */
b24f53a0 640
6f4576e3
NH
641static int queue_pages_test_walk(unsigned long start, unsigned long end,
642 struct mm_walk *walk)
643{
66850be5 644 struct vm_area_struct *next, *vma = walk->vma;
6f4576e3
NH
645 struct queue_pages *qp = walk->private;
646 unsigned long endvma = vma->vm_end;
647 unsigned long flags = qp->flags;
648
a18b3ac2 649 /* range check first */
ce33135c 650 VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
f18da660
LX
651
652 if (!qp->first) {
653 qp->first = vma;
654 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
655 (qp->start < vma->vm_start))
656 /* hole at head side of range */
a18b3ac2
LX
657 return -EFAULT;
658 }
66850be5 659 next = find_vma(vma->vm_mm, vma->vm_end);
f18da660
LX
660 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
661 ((vma->vm_end < qp->end) &&
66850be5 662 (!next || vma->vm_end < next->vm_start)))
f18da660
LX
663 /* hole at middle or tail of range */
664 return -EFAULT;
a18b3ac2 665
a7f40cfe
YS
666 /*
667 * Need check MPOL_MF_STRICT to return -EIO if possible
668 * regardless of vma_migratable
669 */
670 if (!vma_migratable(vma) &&
671 !(flags & MPOL_MF_STRICT))
48684a65
NH
672 return 1;
673
6f4576e3
NH
674 if (endvma > end)
675 endvma = end;
6f4576e3 676
1cb5d11a
HD
677 /*
678 * Check page nodes, and queue pages to move, in the current vma.
679 * But if no moving, and no strict checking, the scan can be skipped.
680 */
681 if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
6f4576e3
NH
682 return 0;
683 return 1;
684}
685
7b86ac33 686static const struct mm_walk_ops queue_pages_walk_ops = {
0a2c1e81 687 .hugetlb_entry = queue_folios_hugetlb,
3dae02bb 688 .pmd_entry = queue_folios_pte_range,
7b86ac33 689 .test_walk = queue_pages_test_walk,
49b06385
SB
690 .walk_lock = PGWALK_RDLOCK,
691};
692
693static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
694 .hugetlb_entry = queue_folios_hugetlb,
695 .pmd_entry = queue_folios_pte_range,
696 .test_walk = queue_pages_test_walk,
697 .walk_lock = PGWALK_WRLOCK,
7b86ac33
CH
698};
699
dc9aa5b9 700/*
98094945
NH
701 * Walk through page tables and collect pages to be migrated.
702 *
1cb5d11a
HD
703 * If pages found in a given range are not on the required set of @nodes,
704 * and migration is allowed, they are isolated and queued to @pagelist.
d8835445 705 *
1cb5d11a
HD
706 * queue_pages_range() may return:
707 * 0 - all pages already on the right node, or successfully queued for moving
708 * (or neither strict checking nor moving requested: only range checking).
709 * >0 - this number of misplaced folios could not be queued for moving
710 * (a hugetlbfs page or a transparent huge page being counted as 1).
711 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
712 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
dc9aa5b9 713 */
1cb5d11a 714static long
98094945 715queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
6f4576e3 716 nodemask_t *nodes, unsigned long flags,
1cb5d11a 717 struct list_head *pagelist)
1da177e4 718{
f18da660 719 int err;
6f4576e3
NH
720 struct queue_pages qp = {
721 .pagelist = pagelist,
722 .flags = flags,
723 .nmask = nodes,
f18da660
LX
724 .start = start,
725 .end = end,
726 .first = NULL,
6f4576e3 727 };
1cb5d11a 728 const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ?
49b06385 729 &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
6f4576e3 730
49b06385 731 err = walk_page_range(mm, start, end, ops, &qp);
f18da660
LX
732
733 if (!qp.first)
734 /* whole range in hole */
735 err = -EFAULT;
736
1cb5d11a 737 return err ? : qp.nr_failed;
1da177e4
LT
738}
739
869833f2
KM
740/*
741 * Apply policy to a single VMA
c1e8d7c6 742 * This must be called with the mmap_lock held for writing.
869833f2
KM
743 */
744static int vma_replace_policy(struct vm_area_struct *vma,
c36f6e6d 745 struct mempolicy *pol)
8d34694c 746{
869833f2
KM
747 int err;
748 struct mempolicy *old;
749 struct mempolicy *new;
8d34694c 750
6c21e066
JH
751 vma_assert_write_locked(vma);
752
869833f2
KM
753 new = mpol_dup(pol);
754 if (IS_ERR(new))
755 return PTR_ERR(new);
756
757 if (vma->vm_ops && vma->vm_ops->set_policy) {
8d34694c 758 err = vma->vm_ops->set_policy(vma, new);
869833f2
KM
759 if (err)
760 goto err_out;
8d34694c 761 }
869833f2
KM
762
763 old = vma->vm_policy;
c1e8d7c6 764 vma->vm_policy = new; /* protected by mmap_lock */
869833f2
KM
765 mpol_put(old);
766
767 return 0;
768 err_out:
769 mpol_put(new);
8d34694c
KM
770 return err;
771}
772
f4e9e0e6
LH
773/* Split or merge the VMA (if required) and apply the new policy */
774static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
775 struct vm_area_struct **prev, unsigned long start,
776 unsigned long end, struct mempolicy *new_pol)
1da177e4 777{
f4e9e0e6 778 unsigned long vmstart, vmend;
9d8cebd4 779
f4e9e0e6
LH
780 vmend = min(end, vma->vm_end);
781 if (start > vma->vm_start) {
782 *prev = vma;
783 vmstart = start;
784 } else {
785 vmstart = vma->vm_start;
786 }
787
c36f6e6d 788 if (mpol_equal(vma->vm_policy, new_pol)) {
00ca0f2e 789 *prev = vma;
7329e3eb 790 return 0;
00ca0f2e 791 }
7329e3eb 792
94d7d923
LS
793 vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
794 if (IS_ERR(vma))
795 return PTR_ERR(vma);
f4e9e0e6
LH
796
797 *prev = vma;
798 return vma_replace_policy(vma, new_pol);
1da177e4
LT
799}
800
1da177e4 801/* Set the process memory policy */
028fec41
DR
802static long do_set_mempolicy(unsigned short mode, unsigned short flags,
803 nodemask_t *nodes)
1da177e4 804{
58568d2a 805 struct mempolicy *new, *old;
4bfc4495 806 NODEMASK_SCRATCH(scratch);
58568d2a 807 int ret;
1da177e4 808
4bfc4495
KH
809 if (!scratch)
810 return -ENOMEM;
f4e53d91 811
4bfc4495
KH
812 new = mpol_new(mode, flags, nodes);
813 if (IS_ERR(new)) {
814 ret = PTR_ERR(new);
815 goto out;
816 }
2c7c3a7d 817
12c1dc8e 818 task_lock(current);
4bfc4495 819 ret = mpol_set_nodemask(new, nodes, scratch);
58568d2a 820 if (ret) {
12c1dc8e 821 task_unlock(current);
58568d2a 822 mpol_put(new);
4bfc4495 823 goto out;
58568d2a 824 }
12c1dc8e 825
58568d2a 826 old = current->mempolicy;
1da177e4 827 current->mempolicy = new;
45816682
VB
828 if (new && new->mode == MPOL_INTERLEAVE)
829 current->il_prev = MAX_NUMNODES-1;
58568d2a 830 task_unlock(current);
58568d2a 831 mpol_put(old);
4bfc4495
KH
832 ret = 0;
833out:
834 NODEMASK_SCRATCH_FREE(scratch);
835 return ret;
1da177e4
LT
836}
837
bea904d5
LS
838/*
839 * Return nodemask for policy for get_mempolicy() query
58568d2a
MX
840 *
841 * Called with task's alloc_lock held
bea904d5 842 */
c36f6e6d 843static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
1da177e4 844{
dfcd3c0d 845 nodes_clear(*nodes);
c36f6e6d 846 if (pol == &default_policy)
bea904d5
LS
847 return;
848
c36f6e6d 849 switch (pol->mode) {
19770b32 850 case MPOL_BIND:
1da177e4 851 case MPOL_INTERLEAVE:
269fbe72 852 case MPOL_PREFERRED:
b27abacc 853 case MPOL_PREFERRED_MANY:
c36f6e6d 854 *nodes = pol->nodes;
1da177e4 855 break;
7858d7bc
FT
856 case MPOL_LOCAL:
857 /* return empty node mask for local allocation */
858 break;
1da177e4
LT
859 default:
860 BUG();
861 }
862}
863
3b9aadf7 864static int lookup_node(struct mm_struct *mm, unsigned long addr)
1da177e4 865{
ba841078 866 struct page *p = NULL;
f728b9c4 867 int ret;
1da177e4 868
f728b9c4
JH
869 ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
870 if (ret > 0) {
871 ret = page_to_nid(p);
1da177e4
LT
872 put_page(p);
873 }
f728b9c4 874 return ret;
1da177e4
LT
875}
876
1da177e4 877/* Retrieve NUMA policy */
dbcb0f19
AB
878static long do_get_mempolicy(int *policy, nodemask_t *nmask,
879 unsigned long addr, unsigned long flags)
1da177e4 880{
8bccd85f 881 int err;
1da177e4
LT
882 struct mm_struct *mm = current->mm;
883 struct vm_area_struct *vma = NULL;
3b9aadf7 884 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
1da177e4 885
754af6f5
LS
886 if (flags &
887 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
1da177e4 888 return -EINVAL;
754af6f5
LS
889
890 if (flags & MPOL_F_MEMS_ALLOWED) {
891 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
892 return -EINVAL;
893 *policy = 0; /* just so it's initialized */
58568d2a 894 task_lock(current);
754af6f5 895 *nmask = cpuset_current_mems_allowed;
58568d2a 896 task_unlock(current);
754af6f5
LS
897 return 0;
898 }
899
1da177e4 900 if (flags & MPOL_F_ADDR) {
ddc1a5cb 901 pgoff_t ilx; /* ignored here */
bea904d5
LS
902 /*
903 * Do NOT fall back to task policy if the
904 * vma/shared policy at addr is NULL. We
905 * want to return MPOL_DEFAULT in this case.
906 */
d8ed45c5 907 mmap_read_lock(mm);
33e3575c 908 vma = vma_lookup(mm, addr);
1da177e4 909 if (!vma) {
d8ed45c5 910 mmap_read_unlock(mm);
1da177e4
LT
911 return -EFAULT;
912 }
ddc1a5cb 913 pol = __get_vma_policy(vma, addr, &ilx);
1da177e4
LT
914 } else if (addr)
915 return -EINVAL;
916
917 if (!pol)
bea904d5 918 pol = &default_policy; /* indicates default behavior */
1da177e4
LT
919
920 if (flags & MPOL_F_NODE) {
921 if (flags & MPOL_F_ADDR) {
3b9aadf7 922 /*
f728b9c4
JH
923 * Take a refcount on the mpol, because we are about to
924 * drop the mmap_lock, after which only "pol" remains
925 * valid, "vma" is stale.
3b9aadf7
AA
926 */
927 pol_refcount = pol;
928 vma = NULL;
929 mpol_get(pol);
f728b9c4 930 mmap_read_unlock(mm);
3b9aadf7 931 err = lookup_node(mm, addr);
1da177e4
LT
932 if (err < 0)
933 goto out;
8bccd85f 934 *policy = err;
1da177e4 935 } else if (pol == current->mempolicy &&
45c4745a 936 pol->mode == MPOL_INTERLEAVE) {
269fbe72 937 *policy = next_node_in(current->il_prev, pol->nodes);
1da177e4
LT
938 } else {
939 err = -EINVAL;
940 goto out;
941 }
bea904d5
LS
942 } else {
943 *policy = pol == &default_policy ? MPOL_DEFAULT :
944 pol->mode;
d79df630
DR
945 /*
946 * Internal mempolicy flags must be masked off before exposing
947 * the policy to userspace.
948 */
949 *policy |= (pol->flags & MPOL_MODE_FLAGS);
bea904d5 950 }
1da177e4 951
1da177e4 952 err = 0;
58568d2a 953 if (nmask) {
c6b6ef8b
LS
954 if (mpol_store_user_nodemask(pol)) {
955 *nmask = pol->w.user_nodemask;
956 } else {
957 task_lock(current);
958 get_policy_nodemask(pol, nmask);
959 task_unlock(current);
960 }
58568d2a 961 }
1da177e4
LT
962
963 out:
52cd3b07 964 mpol_cond_put(pol);
1da177e4 965 if (vma)
d8ed45c5 966 mmap_read_unlock(mm);
3b9aadf7
AA
967 if (pol_refcount)
968 mpol_put(pol_refcount);
1da177e4
LT
969 return err;
970}
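For reference, this is how the MPOL_F_ADDR / MPOL_F_NODE combinations handled above are typically driven from userspace. A small sketch, assuming <numaif.h>, libnuma's thin wrappers, and a heap address that has already been faulted in:

    /* Illustrative only: query which node currently backs an address. */
    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
            int node = -1;
            size_t len = 1 << 20;
            char *buf = malloc(len);

            memset(buf, 0, len);          /* fault the pages in first */

            /* MPOL_F_NODE | MPOL_F_ADDR: report the node of the page at buf. */
            if (get_mempolicy(&node, NULL, 0, buf, MPOL_F_NODE | MPOL_F_ADDR))
                    perror("get_mempolicy");
            else
                    printf("page at %p sits on node %d\n", (void *)buf, node);

            /* Without MPOL_F_ADDR, the same call reports the task policy mode. */
            if (get_mempolicy(&node, NULL, 0, NULL, 0) == 0)
                    printf("task policy mode: %d (MPOL_DEFAULT is %d)\n",
                           node, MPOL_DEFAULT);

            free(buf);
            return 0;
    }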
971
b20a3503 972#ifdef CONFIG_MIGRATION
1cb5d11a 973static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
fc301289 974 unsigned long flags)
6ce3c4c0
CL
975{
976 /*
1cb5d11a
HD
977 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
978 * Choosing not to migrate a shared folio is not counted as a failure.
4a64981d
VMO
979 *
980 * To check if the folio is shared, ideally we want to make sure
981 * every page is mapped to the same process. Doing that is very
1cb5d11a 982 * expensive, so check the estimated sharers of the folio instead.
6ce3c4c0 983 */
4a64981d 984 if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
be2d5756 985 if (folio_isolate_lru(folio)) {
4a64981d
VMO
986 list_add_tail(&folio->lru, foliolist);
987 node_stat_mod_folio(folio,
988 NR_ISOLATED_ANON + folio_is_file_lru(folio),
989 folio_nr_pages(folio));
1cb5d11a 990 } else {
a53190a4 991 /*
4a64981d
VMO
992 * Non-movable folio may reach here. And, there may be
993 * temporary off LRU folios or non-LRU movable folios.
994 * Treat them as unmovable folios since they can't be
1cb5d11a 995 * isolated, so they can't be moved at the moment.
a53190a4 996 */
1cb5d11a 997 return false;
62695a84
NP
998 }
999 }
1cb5d11a 1000 return true;
7e2ab150 1001}
6ce3c4c0 1002
7e2ab150
CL
1003/*
1004 * Migrate pages from one node to a target node.
1005 * Returns error or the number of pages not migrated.
1006 */
1cb5d11a
HD
1007static long migrate_to_node(struct mm_struct *mm, int source, int dest,
1008 int flags)
7e2ab150
CL
1009{
1010 nodemask_t nmask;
66850be5 1011 struct vm_area_struct *vma;
7e2ab150 1012 LIST_HEAD(pagelist);
1cb5d11a
HD
1013 long nr_failed;
1014 long err = 0;
a0976311
JK
1015 struct migration_target_control mtc = {
1016 .nid = dest,
1017 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1018 };
7e2ab150
CL
1019
1020 nodes_clear(nmask);
1021 node_set(source, nmask);
6ce3c4c0 1022
1cb5d11a
HD
1023 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1024 vma = find_vma(mm, 0);
1025
08270807 1026 /*
1cb5d11a 1027 * This does not migrate the range, but isolates all pages that
08270807 1028 * need migration. Between passing in the full user address
1cb5d11a
HD
1029 * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
1030 * but passes back the count of pages which could not be isolated.
08270807 1031 */
1cb5d11a
HD
1032 nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
1033 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
7e2ab150 1034
cf608ac1 1035 if (!list_empty(&pagelist)) {
a0976311 1036 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1cb5d11a 1037 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
cf608ac1 1038 if (err)
e2d8cf40 1039 putback_movable_pages(&pagelist);
cf608ac1 1040 }
95a402c3 1041
1cb5d11a
HD
1042 if (err >= 0)
1043 err += nr_failed;
7e2ab150 1044 return err;
6ce3c4c0
CL
1045}
1046
39743889 1047/*
7e2ab150
CL
1048 * Move pages between the two nodesets so as to preserve the physical
1049 * layout as much as possible.
1050 *
1051 * Returns the number of pages that could not be moved.
1052 */
0ce72d4f
AM
1053int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1054 const nodemask_t *to, int flags)
39743889 1055{
1cb5d11a
HD
1056 long nr_failed = 0;
1057 long err = 0;
7e2ab150 1058 nodemask_t tmp;
39743889 1059
361a2a22 1060 lru_cache_disable();
0aedadf9 1061
d8ed45c5 1062 mmap_read_lock(mm);
39743889 1063
da0aa138
KM
1064 /*
1065 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1066 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1067 * bit in 'tmp', and return that <source, dest> pair for migration.
1068 * The pair of nodemasks 'to' and 'from' define the map.
1069 *
1070 * If no pair of bits is found that way, fallback to picking some
1071 * pair of 'source' and 'dest' bits that are not the same. If the
1072 * 'source' and 'dest' bits are the same, this represents a node
1073 * that will be migrating to itself, so no pages need move.
1074 *
1075 * If no bits are left in 'tmp', or if all remaining bits left
1076 * in 'tmp' correspond to the same bit in 'to', return false
1077 * (nothing left to migrate).
1078 *
1079 * This lets us pick a pair of nodes to migrate between, such that
1080 * if possible the dest node is not already occupied by some other
1081 * source node, minimizing the risk of overloading the memory on a
1082 * node that would happen if we migrated incoming memory to a node
1083 * before migrating outgoing memory source that same node.
1084 *
1085 * A single scan of tmp is sufficient. As we go, we remember the
1086 * most recent <s, d> pair that moved (s != d). If we find a pair
1087 * that not only moved, but what's better, moved to an empty slot
1088 * (d is not set in tmp), then we break out then, with that pair.
ae0e47f0 1089 * Otherwise when we finish scanning from_tmp, we at least have the
da0aa138
KM
1090 * most recent <s, d> pair that moved. If we get all the way through
1091 * the scan of tmp without finding any node that moved, much less
1092 * moved to an empty node, then there is nothing left worth migrating.
1093 */
d4984711 1094
0ce72d4f 1095 tmp = *from;
7e2ab150 1096 while (!nodes_empty(tmp)) {
68d68ff6 1097 int s, d;
b76ac7e7 1098 int source = NUMA_NO_NODE;
7e2ab150
CL
1099 int dest = 0;
1100
1101 for_each_node_mask(s, tmp) {
4a5b18cc
LW
1102
1103 /*
1104 * do_migrate_pages() tries to maintain the relative
1105 * node relationship of the pages established between
1106 * threads and memory areas.
1107 *
1108 * However if the number of source nodes is not equal to
1109 * the number of destination nodes we can not preserve
1110 * this node relative relationship. In that case, skip
1111 * copying memory from a node that is in the destination
1112 * mask.
1113 *
1114 * Example: [2,3,4] -> [3,4,5] moves everything.
1115 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1116 */
1117
0ce72d4f
AM
1118 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1119 (node_isset(s, *to)))
4a5b18cc
LW
1120 continue;
1121
0ce72d4f 1122 d = node_remap(s, *from, *to);
7e2ab150
CL
1123 if (s == d)
1124 continue;
1125
1126 source = s; /* Node moved. Memorize */
1127 dest = d;
1128
1129 /* dest not in remaining from nodes? */
1130 if (!node_isset(dest, tmp))
1131 break;
1132 }
b76ac7e7 1133 if (source == NUMA_NO_NODE)
7e2ab150
CL
1134 break;
1135
1136 node_clear(source, tmp);
1137 err = migrate_to_node(mm, source, dest, flags);
1138 if (err > 0)
1cb5d11a 1139 nr_failed += err;
7e2ab150
CL
1140 if (err < 0)
1141 break;
39743889 1142 }
d8ed45c5 1143 mmap_read_unlock(mm);
d479960e 1144
361a2a22 1145 lru_cache_enable();
7e2ab150
CL
1146 if (err < 0)
1147 return err;
1cb5d11a 1148 return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
b20a3503
CL
1149}
1150
3ad33b24
LS
1151/*
1152 * Allocate a new page for page migration based on vma policy.
d05f0cdc 1153 * Start by assuming the page is mapped by the same vma as contains @start.
3ad33b24
LS
1154 * Search forward from there, if not. N.B., this assumes that the
1155 * list of pages handed to migrate_pages()--which is how we get here--
1156 * is in virtual address order.
1157 */
4e096ae1 1158static struct folio *new_folio(struct folio *src, unsigned long start)
95a402c3 1159{
d05f0cdc 1160 struct vm_area_struct *vma;
3f649ab7 1161 unsigned long address;
66850be5 1162 VMA_ITERATOR(vmi, current->mm, start);
ec4858e0 1163 gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
95a402c3 1164
66850be5 1165 for_each_vma(vmi, vma) {
4e096ae1 1166 address = page_address_in_vma(&src->page, vma);
3ad33b24
LS
1167 if (address != -EFAULT)
1168 break;
3ad33b24 1169 }
11c731e8 1170
ddc1a5cb
HD
1171 /*
1172 * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL
1173 * when the page can no longer be located in a vma: that is not ideal
1174 * (migrate_pages() will give up early, presuming ENOMEM), but good
1175 * enough to avoid a crash by syzkaller or concurrent holepunch.
1176 */
1177 if (!vma)
1178 return NULL;
1179
d0ce0e47 1180 if (folio_test_hugetlb(src)) {
4e096ae1 1181 return alloc_hugetlb_folio_vma(folio_hstate(src),
389c8178 1182 vma, address);
d0ce0e47 1183 }
ec4858e0
MWO
1184
1185 if (folio_test_large(src))
1186 gfp = GFP_TRANSHUGE;
1187
4e096ae1 1188 return vma_alloc_folio(gfp, folio_order(src), vma, address,
ec4858e0 1189 folio_test_large(src));
95a402c3 1190}
b20a3503
CL
1191#else
1192
1cb5d11a 1193static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
b20a3503
CL
1194 unsigned long flags)
1195{
1cb5d11a 1196 return false;
39743889
CL
1197}
1198
0ce72d4f
AM
1199int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1200 const nodemask_t *to, int flags)
b20a3503
CL
1201{
1202 return -ENOSYS;
1203}
95a402c3 1204
4e096ae1 1205static struct folio *new_folio(struct folio *src, unsigned long start)
95a402c3
CL
1206{
1207 return NULL;
1208}
b20a3503
CL
1209#endif
1210
dbcb0f19 1211static long do_mbind(unsigned long start, unsigned long len,
028fec41
DR
1212 unsigned short mode, unsigned short mode_flags,
1213 nodemask_t *nmask, unsigned long flags)
6ce3c4c0 1214{
6ce3c4c0 1215 struct mm_struct *mm = current->mm;
f4e9e0e6
LH
1216 struct vm_area_struct *vma, *prev;
1217 struct vma_iterator vmi;
6ce3c4c0
CL
1218 struct mempolicy *new;
1219 unsigned long end;
1cb5d11a
HD
1220 long err;
1221 long nr_failed;
6ce3c4c0
CL
1222 LIST_HEAD(pagelist);
1223
b24f53a0 1224 if (flags & ~(unsigned long)MPOL_MF_VALID)
6ce3c4c0 1225 return -EINVAL;
74c00241 1226 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
6ce3c4c0
CL
1227 return -EPERM;
1228
1229 if (start & ~PAGE_MASK)
1230 return -EINVAL;
1231
1232 if (mode == MPOL_DEFAULT)
1233 flags &= ~MPOL_MF_STRICT;
1234
aaa31e05 1235 len = PAGE_ALIGN(len);
6ce3c4c0
CL
1236 end = start + len;
1237
1238 if (end < start)
1239 return -EINVAL;
1240 if (end == start)
1241 return 0;
1242
028fec41 1243 new = mpol_new(mode, mode_flags, nmask);
6ce3c4c0
CL
1244 if (IS_ERR(new))
1245 return PTR_ERR(new);
1246
1247 /*
1248 * If we are using the default policy then operation
1249 * on discontinuous address spaces is okay after all
1250 */
1251 if (!new)
1252 flags |= MPOL_MF_DISCONTIG_OK;
1253
1cb5d11a 1254 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
361a2a22 1255 lru_cache_disable();
4bfc4495
KH
1256 {
1257 NODEMASK_SCRATCH(scratch);
1258 if (scratch) {
d8ed45c5 1259 mmap_write_lock(mm);
4bfc4495 1260 err = mpol_set_nodemask(new, nmask, scratch);
4bfc4495 1261 if (err)
d8ed45c5 1262 mmap_write_unlock(mm);
4bfc4495
KH
1263 } else
1264 err = -ENOMEM;
1265 NODEMASK_SCRATCH_FREE(scratch);
1266 }
b05ca738
KM
1267 if (err)
1268 goto mpol_out;
1269
6c21e066 1270 /*
1cb5d11a
HD
1271 * Lock the VMAs before scanning for pages to migrate,
1272 * to ensure we don't miss a concurrently inserted page.
6c21e066 1273 */
1cb5d11a
HD
1274 nr_failed = queue_pages_range(mm, start, end, nmask,
1275 flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);
d8835445 1276
1cb5d11a
HD
1277 if (nr_failed < 0) {
1278 err = nr_failed;
1279 } else {
1280 vma_iter_init(&vmi, mm, start);
1281 prev = vma_prev(&vmi);
1282 for_each_vma_range(vmi, vma, end) {
1283 err = mbind_range(&vmi, vma, &prev, start, end, new);
1284 if (err)
1285 break;
1286 }
f4e9e0e6 1287 }
7e2ab150 1288
b24f53a0 1289 if (!err) {
cf608ac1 1290 if (!list_empty(&pagelist)) {
1cb5d11a 1291 nr_failed |= migrate_pages(&pagelist, new_folio, NULL,
5ac95884 1292 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
cf608ac1 1293 }
1cb5d11a 1294 if (nr_failed && (flags & MPOL_MF_STRICT))
6ce3c4c0 1295 err = -EIO;
a85dfc30
YS
1296 }
1297
1cb5d11a
HD
1298 if (!list_empty(&pagelist))
1299 putback_movable_pages(&pagelist);
1300
d8ed45c5 1301 mmap_write_unlock(mm);
d8835445 1302mpol_out:
f0be3d32 1303 mpol_put(new);
d479960e 1304 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
361a2a22 1305 lru_cache_enable();
6ce3c4c0
CL
1306 return err;
1307}
1308
8bccd85f
CL
1309/*
1310 * User space interface with variable sized bitmaps for nodelists.
1311 */
e130242d
AB
1312static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
1313 unsigned long maxnode)
1314{
1315 unsigned long nlongs = BITS_TO_LONGS(maxnode);
1316 int ret;
1317
1318 if (in_compat_syscall())
1319 ret = compat_get_bitmap(mask,
1320 (const compat_ulong_t __user *)nmask,
1321 maxnode);
1322 else
1323 ret = copy_from_user(mask, nmask,
1324 nlongs * sizeof(unsigned long));
1325
1326 if (ret)
1327 return -EFAULT;
1328
1329 if (maxnode % BITS_PER_LONG)
1330 mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;
1331
1332 return 0;
1333}
8bccd85f
CL
1334
1335/* Copy a node mask from user space. */
39743889 1336static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
8bccd85f
CL
1337 unsigned long maxnode)
1338{
8bccd85f
CL
1339 --maxnode;
1340 nodes_clear(*nodes);
1341 if (maxnode == 0 || !nmask)
1342 return 0;
a9c930ba 1343 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
636f13c1 1344 return -EINVAL;
8bccd85f 1345
56521e7a
YX
1346 /*
1347 * When the user specified more nodes than supported just check
e130242d
AB
1348 * if the non supported part is all zero, one word at a time,
1349 * starting at the end.
56521e7a 1350 */
e130242d
AB
1351 while (maxnode > MAX_NUMNODES) {
1352 unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
1353 unsigned long t;
8bccd85f 1354
000eca5d 1355 if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
56521e7a 1356 return -EFAULT;
e130242d
AB
1357
1358 if (maxnode - bits >= MAX_NUMNODES) {
1359 maxnode -= bits;
1360 } else {
1361 maxnode = MAX_NUMNODES;
1362 t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1363 }
1364 if (t)
56521e7a
YX
1365 return -EINVAL;
1366 }
1367
e130242d 1368 return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
8bccd85f
CL
1369}
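For callers of the syscalls this file implements, the nodemask is simply an array of unsigned long words and maxnode is the number of bits to consider (note the --maxnode above: the kernel looks at maxnode - 1 bits, so userspace conventionally passes one more than the highest bit it cares about). A hedged sketch, with MAX_NODES chosen arbitrarily:

    /* Illustrative only: laying out a user-space nodemask. */
    #include <numaif.h>

    #define MAX_NODES 1024                     /* arbitrary for this sketch */
    #define BITS_PER_ULONG (8 * sizeof(unsigned long))

    long bind_to_node(int nid)
    {
            unsigned long mask[MAX_NODES / BITS_PER_ULONG] = { 0 };

            mask[nid / BITS_PER_ULONG] |= 1UL << (nid % BITS_PER_ULONG);

            /* maxnode = MAX_NODES + 1 so that all MAX_NODES bits are seen. */
            return set_mempolicy(MPOL_BIND, mask, MAX_NODES + 1);
    }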
1370
1371/* Copy a kernel node mask to user space */
1372static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1373 nodemask_t *nodes)
1374{
1375 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
050c17f2 1376 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
e130242d
AB
1377 bool compat = in_compat_syscall();
1378
1379 if (compat)
1380 nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
8bccd85f
CL
1381
1382 if (copy > nbytes) {
1383 if (copy > PAGE_SIZE)
1384 return -EINVAL;
1385 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1386 return -EFAULT;
1387 copy = nbytes;
e130242d 1388 maxnode = nr_node_ids;
8bccd85f 1389 }
e130242d
AB
1390
1391 if (compat)
1392 return compat_put_bitmap((compat_ulong_t __user *)mask,
1393 nodes_addr(*nodes), maxnode);
1394
8bccd85f
CL
1395 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1396}
1397
95837924
FT
1398/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
1399static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
1400{
1401 *flags = *mode & MPOL_MODE_FLAGS;
1402 *mode &= ~MPOL_MODE_FLAGS;
b27abacc 1403
a38a59fd 1404 if ((unsigned int)(*mode) >= MPOL_MAX)
95837924
FT
1405 return -EINVAL;
1406 if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
1407 return -EINVAL;
6d2aec9e
ED
1408 if (*flags & MPOL_F_NUMA_BALANCING) {
1409 if (*mode != MPOL_BIND)
1410 return -EINVAL;
1411 *flags |= (MPOL_F_MOF | MPOL_F_MORON);
1412 }
95837924
FT
1413 return 0;
1414}
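sanitize_mpol_flags() is the reason userspace passes the optional mode flags OR-ed into the mode argument itself. A hedged sketch of the two flag uses it accepts (MPOL_F_NUMA_BALANCING may need <linux/mempolicy.h> on older libnuma installs):

    /* Illustrative only: mode flags travel in the upper bits of 'mode'. */
    #include <numaif.h>

    long interleave_static(const unsigned long *mask, unsigned long maxnode)
    {
            /* MPOL_F_STATIC_NODES: treat the mask as physical node numbers
             * and do not remap it when the task's cpuset changes. */
            return set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES,
                                 mask, maxnode);
    }

    long bind_with_numa_balancing(const unsigned long *mask, unsigned long maxnode)
    {
            /* Per sanitize_mpol_flags() above, MPOL_F_NUMA_BALANCING is only
             * accepted together with MPOL_BIND; anything else gets EINVAL. */
            return set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING,
                                 mask, maxnode);
    }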
1415
e7dc9ad6
DB
1416static long kernel_mbind(unsigned long start, unsigned long len,
1417 unsigned long mode, const unsigned long __user *nmask,
1418 unsigned long maxnode, unsigned int flags)
8bccd85f 1419{
95837924 1420 unsigned short mode_flags;
8bccd85f 1421 nodemask_t nodes;
95837924 1422 int lmode = mode;
8bccd85f
CL
1423 int err;
1424
057d3389 1425 start = untagged_addr(start);
95837924
FT
1426 err = sanitize_mpol_flags(&lmode, &mode_flags);
1427 if (err)
1428 return err;
1429
8bccd85f
CL
1430 err = get_nodes(&nodes, nmask, maxnode);
1431 if (err)
1432 return err;
95837924
FT
1433
1434 return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
8bccd85f
CL
1435}
1436
c6018b4b
AK
1437SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
1438 unsigned long, home_node, unsigned long, flags)
1439{
1440 struct mm_struct *mm = current->mm;
f4e9e0e6 1441 struct vm_area_struct *vma, *prev;
e976936c 1442 struct mempolicy *new, *old;
c6018b4b
AK
1443 unsigned long end;
1444 int err = -ENOENT;
66850be5 1445 VMA_ITERATOR(vmi, mm, start);
c6018b4b
AK
1446
1447 start = untagged_addr(start);
1448 if (start & ~PAGE_MASK)
1449 return -EINVAL;
1450 /*
1451 * flags is used for future extension if any.
1452 */
1453 if (flags != 0)
1454 return -EINVAL;
1455
1456 /*
1457 * Check home_node is online to avoid accessing uninitialized
1458 * NODE_DATA.
1459 */
1460 if (home_node >= MAX_NUMNODES || !node_online(home_node))
1461 return -EINVAL;
1462
aaa31e05 1463 len = PAGE_ALIGN(len);
c6018b4b
AK
1464 end = start + len;
1465
1466 if (end < start)
1467 return -EINVAL;
1468 if (end == start)
1469 return 0;
1470 mmap_write_lock(mm);
f4e9e0e6 1471 prev = vma_prev(&vmi);
66850be5 1472 for_each_vma_range(vmi, vma, end) {
c6018b4b
AK
1473 /*
1474 * If any vma in the range got policy other than MPOL_BIND
1475 * or MPOL_PREFERRED_MANY we return error. We don't reset
1476 * the home node for vmas we already updated before.
1477 */
e976936c 1478 old = vma_policy(vma);
51f62537
LH
1479 if (!old) {
1480 prev = vma;
e976936c 1481 continue;
51f62537 1482 }
e976936c 1483 if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
c6018b4b
AK
1484 err = -EOPNOTSUPP;
1485 break;
1486 }
e976936c
MH
1487 new = mpol_dup(old);
1488 if (IS_ERR(new)) {
1489 err = PTR_ERR(new);
1490 break;
1491 }
c6018b4b 1492
6c21e066 1493 vma_start_write(vma);
c6018b4b 1494 new->home_node = home_node;
f4e9e0e6 1495 err = mbind_range(&vmi, vma, &prev, start, end, new);
c6018b4b
AK
1496 mpol_put(new);
1497 if (err)
1498 break;
1499 }
1500 mmap_write_unlock(mm);
1501 return err;
1502}
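There is no glibc wrapper for set_mempolicy_home_node() at the time of writing, so userspace usually goes through syscall(2). A hedged sketch that assumes headers recent enough to define __NR_set_mempolicy_home_node:

    /* Illustrative only: set the home node for an already-bound range. */
    #define _GNU_SOURCE
    #include <unistd.h>
    #include <sys/syscall.h>

    static long set_home_node(void *addr, unsigned long len, unsigned long node)
    {
            /* flags must currently be 0; the range must already carry an
             * MPOL_BIND or MPOL_PREFERRED_MANY policy, otherwise the kernel
             * returns EOPNOTSUPP (see the loop above). */
            return syscall(__NR_set_mempolicy_home_node, addr, len, node, 0UL);
    }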
1503
e7dc9ad6
DB
1504SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1505 unsigned long, mode, const unsigned long __user *, nmask,
1506 unsigned long, maxnode, unsigned int, flags)
1507{
1508 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1509}
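And the corresponding userspace view of mbind(2), which lands in do_mbind() above: a minimal sketch that binds one anonymous mapping to node 0, assuming <numaif.h> and a node 0 with memory.

    /* Illustrative only: give one mapping a VMA policy with mbind(2). */
    #include <numaif.h>
    #include <sys/mman.h>
    #include <stdio.h>

    #define SZ (16UL << 20)

    int main(void)
    {
            unsigned long node0 = 1UL << 0;
            void *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;

            /* Bind the range to node 0 before first touch; MPOL_MF_STRICT
             * makes mbind() fail if already-populated pages sit on the wrong
             * node, while MPOL_MF_MOVE would instead try to migrate them. */
            if (mbind(p, SZ, MPOL_BIND, &node0, 8 * sizeof(node0) + 1,
                      MPOL_MF_STRICT))
                    perror("mbind");

            munmap(p, SZ);
            return 0;
    }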
1510
8bccd85f 1511/* Set the process memory policy */
af03c4ac
DB
1512static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1513 unsigned long maxnode)
8bccd85f 1514{
95837924 1515 unsigned short mode_flags;
8bccd85f 1516 nodemask_t nodes;
95837924
FT
1517 int lmode = mode;
1518 int err;
1519
1520 err = sanitize_mpol_flags(&lmode, &mode_flags);
1521 if (err)
1522 return err;
8bccd85f 1523
8bccd85f
CL
1524 err = get_nodes(&nodes, nmask, maxnode);
1525 if (err)
1526 return err;
95837924
FT
1527
1528 return do_set_mempolicy(lmode, mode_flags, &nodes);
8bccd85f
CL
1529}
1530
af03c4ac
DB
1531SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1532 unsigned long, maxnode)
1533{
1534 return kernel_set_mempolicy(mode, nmask, maxnode);
1535}
1536
b6e9b0ba
DB
1537static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1538 const unsigned long __user *old_nodes,
1539 const unsigned long __user *new_nodes)
39743889 1540{
596d7cfa 1541 struct mm_struct *mm = NULL;
39743889 1542 struct task_struct *task;
39743889
CL
1543 nodemask_t task_nodes;
1544 int err;
596d7cfa
KM
1545 nodemask_t *old;
1546 nodemask_t *new;
1547 NODEMASK_SCRATCH(scratch);
1548
1549 if (!scratch)
1550 return -ENOMEM;
39743889 1551
596d7cfa
KM
1552 old = &scratch->mask1;
1553 new = &scratch->mask2;
1554
1555 err = get_nodes(old, old_nodes, maxnode);
39743889 1556 if (err)
596d7cfa 1557 goto out;
39743889 1558
596d7cfa 1559 err = get_nodes(new, new_nodes, maxnode);
39743889 1560 if (err)
596d7cfa 1561 goto out;
39743889
CL
1562
1563 /* Find the mm_struct */
55cfaa3c 1564 rcu_read_lock();
228ebcbe 1565 task = pid ? find_task_by_vpid(pid) : current;
39743889 1566 if (!task) {
55cfaa3c 1567 rcu_read_unlock();
596d7cfa
KM
1568 err = -ESRCH;
1569 goto out;
39743889 1570 }
3268c63e 1571 get_task_struct(task);
39743889 1572
596d7cfa 1573 err = -EINVAL;
39743889
CL
1574
1575 /*
31367466
OE
1576 * Check if this process has the right to modify the specified process.
1577 * Use the regular "ptrace_may_access()" checks.
39743889 1578 */
31367466 1579 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
c69e8d9c 1580 rcu_read_unlock();
39743889 1581 err = -EPERM;
3268c63e 1582 goto out_put;
39743889 1583 }
c69e8d9c 1584 rcu_read_unlock();
39743889
CL
1585
1586 task_nodes = cpuset_mems_allowed(task);
1587 /* Is the user allowed to access the target nodes? */
596d7cfa 1588 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
39743889 1589 err = -EPERM;
3268c63e 1590 goto out_put;
39743889
CL
1591 }
1592
0486a38b
YX
1593 task_nodes = cpuset_mems_allowed(current);
1594 nodes_and(*new, *new, task_nodes);
1595 if (nodes_empty(*new))
1596 goto out_put;
1597
86c3a764
DQ
1598 err = security_task_movememory(task);
1599 if (err)
3268c63e 1600 goto out_put;
86c3a764 1601
3268c63e
CL
1602 mm = get_task_mm(task);
1603 put_task_struct(task);
f2a9ef88
SL
1604
1605 if (!mm) {
3268c63e 1606 err = -EINVAL;
f2a9ef88
SL
1607 goto out;
1608 }
1609
1610 err = do_migrate_pages(mm, old, new,
1611 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
3268c63e
CL
1612
1613 mmput(mm);
1614out:
596d7cfa
KM
1615 NODEMASK_SCRATCH_FREE(scratch);
1616
39743889 1617 return err;
3268c63e
CL
1618
1619out_put:
1620 put_task_struct(task);
1621 goto out;
39743889
CL
1622}
1623
b6e9b0ba
DB
1624SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1625 const unsigned long __user *, old_nodes,
1626 const unsigned long __user *, new_nodes)
1627{
1628 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1629}
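For completeness, the userspace side of migrate_pages(2): a hedged sketch that asks the kernel to move a target task's pages from node 0 to node 1, subject to the ptrace and CAP_SYS_NICE checks in kernel_migrate_pages() above.

    /* Illustrative only: move another task's pages from node 0 to node 1. */
    #include <numaif.h>
    #include <stdio.h>

    long move_task_pages(int pid)
    {
            unsigned long from = 1UL << 0;      /* old_nodes = {0} */
            unsigned long to   = 1UL << 1;      /* new_nodes = {1} */
            long ret;

            /* Returns the number of pages that could not be moved, or -1
             * with errno set on error. */
            ret = migrate_pages(pid, 8 * sizeof(from) + 1, &from, &to);
            if (ret < 0)
                    perror("migrate_pages");
            return ret;
    }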
1630
8bccd85f 1631/* Retrieve NUMA policy */
af03c4ac
DB
1632static int kernel_get_mempolicy(int __user *policy,
1633 unsigned long __user *nmask,
1634 unsigned long maxnode,
1635 unsigned long addr,
1636 unsigned long flags)
8bccd85f 1637{
dbcb0f19 1638 int err;
3f649ab7 1639 int pval;
8bccd85f
CL
1640 nodemask_t nodes;
1641
050c17f2 1642 if (nmask != NULL && maxnode < nr_node_ids)
8bccd85f
CL
1643 return -EINVAL;
1644
4605f057
WH
1645 addr = untagged_addr(addr);
1646
8bccd85f
CL
1647 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1648
1649 if (err)
1650 return err;
1651
1652 if (policy && put_user(pval, policy))
1653 return -EFAULT;
1654
1655 if (nmask)
1656 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1657
1658 return err;
1659}
1660
af03c4ac
DB
1661SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1662 unsigned long __user *, nmask, unsigned long, maxnode,
1663 unsigned long, addr, unsigned long, flags)
1664{
1665 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1666}
1667
20ca87f2
LX
1668bool vma_migratable(struct vm_area_struct *vma)
1669{
1670 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1671 return false;
1672
1673 /*
1674 * DAX device mappings require predictable access latency, so avoid
1675 * incurring periodic faults.
1676 */
1677 if (vma_is_dax(vma))
1678 return false;
1679
1680 if (is_vm_hugetlb_page(vma) &&
1681 !hugepage_migration_supported(hstate_vma(vma)))
1682 return false;
1683
1684 /*
1685 * Migration allocates pages in the highest zone. If we cannot
1686 * do so then migration (at least from node to node) is not
1687 * possible.
1688 */
1689 if (vma->vm_file &&
1690 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1691 < policy_zone)
1692 return false;
1693 return true;
1694}
1695
74d2c3a0 1696struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
ddc1a5cb 1697 unsigned long addr, pgoff_t *ilx)
1da177e4 1698{
ddc1a5cb
HD
1699 *ilx = 0;
1700 return (vma->vm_ops && vma->vm_ops->get_policy) ?
1701 vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy;
74d2c3a0
ON
1702}
1703
1704/*
ddc1a5cb 1705 * get_vma_policy(@vma, @addr, @order, @ilx)
74d2c3a0
ON
1706 * @vma: virtual memory area whose policy is sought
1707 * @addr: address in @vma for shared policy lookup
ddc1a5cb
HD
1708 * @order: 0, or appropriate huge_page_order for interleaving
1709 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE
74d2c3a0
ON
1710 *
1711 * Returns effective policy for a VMA at specified address.
dd6eecb9 1712 * Falls back to current->mempolicy or system default policy, as necessary.
74d2c3a0
ON
1713 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1714 * count--added by the get_policy() vm_op, as appropriate--to protect against
1715 * freeing by another task. It is the caller's responsibility to free the
1716 * extra reference for shared policies.
1717 */
ddc1a5cb
HD
1718struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1719 unsigned long addr, int order, pgoff_t *ilx)
74d2c3a0 1720{
ddc1a5cb 1721 struct mempolicy *pol;
74d2c3a0 1722
ddc1a5cb 1723 pol = __get_vma_policy(vma, addr, ilx);
8d90274b 1724 if (!pol)
dd6eecb9 1725 pol = get_task_policy(current);
ddc1a5cb
HD
1726 if (pol->mode == MPOL_INTERLEAVE) {
1727 *ilx += vma->vm_pgoff >> order;
1728 *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
1729 }
1da177e4
LT
1730 return pol;
1731}
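/*
 * Hedged sketch of the caller pattern used elsewhere in this file (see
 * vma_alloc_folio() and mpol_misplaced()): look up the effective policy,
 * use it, then drop the conditional reference that protects shared
 * (MPOL_F_SHARED) policies. The helper name is illustrative.
 */
static int __maybe_unused example_effective_mode(struct vm_area_struct *vma,
						 unsigned long addr)
{
	struct mempolicy *pol;
	pgoff_t ilx;
	int mode;

	pol = get_vma_policy(vma, addr, 0, &ilx);	/* order 0: base pages */
	mode = pol->mode;
	mpol_cond_put(pol);	/* drops the ref only if MPOL_F_SHARED */
	return mode;
}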
1732
6b6482bb 1733bool vma_policy_mof(struct vm_area_struct *vma)
fc314724 1734{
6b6482bb 1735 struct mempolicy *pol;
fc314724 1736
6b6482bb
ON
1737 if (vma->vm_ops && vma->vm_ops->get_policy) {
1738 bool ret = false;
ddc1a5cb 1739 pgoff_t ilx; /* ignored here */
fc314724 1740
ddc1a5cb 1741 pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
6b6482bb
ON
1742 if (pol && (pol->flags & MPOL_F_MOF))
1743 ret = true;
1744 mpol_cond_put(pol);
8d90274b 1745
6b6482bb 1746 return ret;
fc314724
MG
1747 }
1748
6b6482bb 1749 pol = vma->vm_policy;
8d90274b 1750 if (!pol)
6b6482bb 1751 pol = get_task_policy(current);
8d90274b 1752
fc314724
MG
1753 return pol->flags & MPOL_F_MOF;
1754}
1755
d2226ebd 1756bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
d3eb1570
LJ
1757{
1758 enum zone_type dynamic_policy_zone = policy_zone;
1759
1760 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1761
1762 /*
269fbe72 1763 * if policy->nodes has movable memory only,
d3eb1570
LJ
1764 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1765 *
269fbe72 1766	 * policy->nodes intersects with node_states[N_MEMORY],
f0953a1b 1767	 * so if the following test fails, it implies that
269fbe72 1768	 * policy->nodes has movable memory only.
d3eb1570 1769 */
269fbe72 1770 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
d3eb1570
LJ
1771 dynamic_policy_zone = ZONE_MOVABLE;
1772
1773 return zone >= dynamic_policy_zone;
1774}
1775
1da177e4 1776/* Do dynamic interleaving for a process */
c36f6e6d 1777static unsigned int interleave_nodes(struct mempolicy *policy)
1da177e4 1778{
c36f6e6d 1779 unsigned int nid;
1da177e4 1780
c36f6e6d
HD
1781 nid = next_node_in(current->il_prev, policy->nodes);
1782 if (nid < MAX_NUMNODES)
1783 current->il_prev = nid;
1784 return nid;
1da177e4
LT
1785}
1786
dc85da15
CL
1787/*
1788 * Depending on the memory policy provide a node from which to allocate the
1789 * next slab entry.
1790 */
2a389610 1791unsigned int mempolicy_slab_node(void)
dc85da15 1792{
e7b691b0 1793 struct mempolicy *policy;
2a389610 1794 int node = numa_mem_id();
e7b691b0 1795
38b031dd 1796 if (!in_task())
2a389610 1797 return node;
e7b691b0
AK
1798
1799 policy = current->mempolicy;
7858d7bc 1800 if (!policy)
2a389610 1801 return node;
bea904d5
LS
1802
1803 switch (policy->mode) {
1804 case MPOL_PREFERRED:
269fbe72 1805 return first_node(policy->nodes);
765c4507 1806
dc85da15
CL
1807 case MPOL_INTERLEAVE:
1808 return interleave_nodes(policy);
1809
b27abacc
DH
1810 case MPOL_BIND:
1811 case MPOL_PREFERRED_MANY:
1812 {
c33d6c06
MG
1813 struct zoneref *z;
1814
dc85da15
CL
1815 /*
1816 * Follow bind policy behavior and start allocation at the
1817 * first node.
1818 */
19770b32 1819 struct zonelist *zonelist;
19770b32 1820 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
c9634cf0 1821 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
c33d6c06 1822 z = first_zones_zonelist(zonelist, highest_zoneidx,
269fbe72 1823 &policy->nodes);
c1093b74 1824 return z->zone ? zone_to_nid(z->zone) : node;
dd1a239f 1825 }
7858d7bc
FT
1826 case MPOL_LOCAL:
1827 return node;
dc85da15 1828
dc85da15 1829 default:
bea904d5 1830 BUG();
dc85da15
CL
1831 }
1832}
1833
fee83b3a 1834/*
ddc1a5cb
HD
1835 * Do static interleaving for interleave index @ilx. Returns the ilx'th
1836 * node in pol->nodes (starting from ilx=0), wrapping around if ilx
1837 * exceeds the number of present nodes.
fee83b3a 1838 */
ddc1a5cb 1839static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
1da177e4 1840{
276aeee1 1841 nodemask_t nodemask = pol->nodes;
1842 unsigned int target, nnodes;
fee83b3a
AM
1843 int i;
1844 int nid;
276aeee1 1845 /*
1846 * The barrier will stabilize the nodemask in a register or on
1847 * the stack so that it will stop changing under the code.
1848 *
1849 * Between first_node() and next_node(), pol->nodes could be changed
1850 * by other threads. So we put pol->nodes in a local stack.
1851 */
1852 barrier();
1da177e4 1853
276aeee1 1854 nnodes = nodes_weight(nodemask);
f5b087b5
DR
1855 if (!nnodes)
1856 return numa_node_id();
ddc1a5cb 1857 target = ilx % nnodes;
276aeee1 1858 nid = first_node(nodemask);
fee83b3a 1859 for (i = 0; i < target; i++)
276aeee1 1860 nid = next_node(nid, nodemask);
1da177e4
LT
1861 return nid;
1862}
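/*
 * Worked example (illustrative): with pol->nodes = {0,2,5} and ilx = 4,
 * nnodes = 3 and target = 4 % 3 = 1, so the loop steps once from
 * first_node() == 0 and interleave_nid() returns node 2.
 */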
1863
ddc1a5cb
HD
1864/*
1865 * Return a nodemask representing a mempolicy for filtering nodes for
1866 * page allocation, together with preferred node id (or the input node id).
1867 */
1868static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
1869 pgoff_t ilx, int *nid)
5da7ca86 1870{
ddc1a5cb 1871 nodemask_t *nodemask = NULL;
5da7ca86 1872
ddc1a5cb
HD
1873 switch (pol->mode) {
1874 case MPOL_PREFERRED:
1875 /* Override input node id */
1876 *nid = first_node(pol->nodes);
1877 break;
1878 case MPOL_PREFERRED_MANY:
1879 nodemask = &pol->nodes;
1880 if (pol->home_node != NUMA_NO_NODE)
1881 *nid = pol->home_node;
1882 break;
1883 case MPOL_BIND:
1884 /* Restrict to nodemask (but not on lower zones) */
1885 if (apply_policy_zone(pol, gfp_zone(gfp)) &&
1886 cpuset_nodemask_valid_mems_allowed(&pol->nodes))
1887 nodemask = &pol->nodes;
1888 if (pol->home_node != NUMA_NO_NODE)
1889 *nid = pol->home_node;
3b98b087 1890 /*
ddc1a5cb
HD
1891 * __GFP_THISNODE shouldn't even be used with the bind policy
1892 * because we might easily break the expectation to stay on the
1893 * requested node and not break the policy.
3b98b087 1894 */
ddc1a5cb
HD
1895 WARN_ON_ONCE(gfp & __GFP_THISNODE);
1896 break;
1897 case MPOL_INTERLEAVE:
1898 /* Override input node id */
1899 *nid = (ilx == NO_INTERLEAVE_INDEX) ?
1900 interleave_nodes(pol) : interleave_nid(pol, ilx);
1901 break;
1902 }
1903
1904 return nodemask;
5da7ca86
CL
1905}
1906
00ac59ad 1907#ifdef CONFIG_HUGETLBFS
480eccf9 1908/*
04ec6264 1909 * huge_node(@vma, @addr, @gfp_flags, @mpol)
b46e14ac
FF
1910 * @vma: virtual memory area whose policy is sought
1911 * @addr: address in @vma for shared policy lookup and interleave policy
1912 * @gfp_flags: for requested zone
1913 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
b27abacc 1914 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
480eccf9 1915 *
04ec6264 1916 * Returns a nid suitable for a huge page allocation and a pointer
52cd3b07 1917 * to the struct mempolicy for conditional unref after allocation.
b27abacc
DH
1918 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
1919 * to the mempolicy's @nodemask for filtering the zonelist.
480eccf9 1920 */
04ec6264 1921int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
ddc1a5cb 1922 struct mempolicy **mpol, nodemask_t **nodemask)
5da7ca86 1923{
ddc1a5cb 1924 pgoff_t ilx;
04ec6264 1925 int nid;
5da7ca86 1926
ddc1a5cb
HD
1927 nid = numa_node_id();
1928 *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
1929 *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);
04ec6264 1930 return nid;
5da7ca86 1931}
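/*
 * Hedged sketch (not the hugetlbfs implementation itself; helper name
 * illustrative): a hugetlb caller resolves the nid/nodemask pair once,
 * allocates a huge folio against them, and then drops the conditional
 * policy reference.
 */
static struct folio * __maybe_unused example_huge_alloc(struct hstate *h,
		struct vm_area_struct *vma, unsigned long addr, gfp_t gfp)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct folio *folio;
	int nid;

	nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
	folio = __folio_alloc(gfp, huge_page_order(h), nid, nodemask);
	mpol_cond_put(mpol);
	return folio;
}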
06808b08
LS
1932
1933/*
1934 * init_nodemask_of_mempolicy
1935 *
1936 * If the current task's mempolicy is "default" [NULL], return 'false'
1937 * to indicate default policy. Otherwise, extract the policy nodemask
1938 * for 'bind' or 'interleave' policy into the argument nodemask, or
1939 * initialize the argument nodemask to contain the single node for
1940 * 'preferred' or 'local' policy and return 'true' to indicate presence
1941 * of non-default mempolicy.
1942 *
1943 * We don't bother with reference counting the mempolicy [mpol_get/put]
1944 * because the current task is examining its own mempolicy and a task's
1945 * mempolicy is only ever changed by the task itself.
1946 *
1947 * N.B., it is the caller's responsibility to free a returned nodemask.
1948 */
1949bool init_nodemask_of_mempolicy(nodemask_t *mask)
1950{
1951 struct mempolicy *mempolicy;
06808b08
LS
1952
1953 if (!(mask && current->mempolicy))
1954 return false;
1955
c0ff7453 1956 task_lock(current);
06808b08
LS
1957 mempolicy = current->mempolicy;
1958 switch (mempolicy->mode) {
1959 case MPOL_PREFERRED:
b27abacc 1960 case MPOL_PREFERRED_MANY:
06808b08 1961 case MPOL_BIND:
06808b08 1962 case MPOL_INTERLEAVE:
269fbe72 1963 *mask = mempolicy->nodes;
7858d7bc
FT
1964 break;
1965
1966 case MPOL_LOCAL:
269fbe72 1967 init_nodemask_of_node(mask, numa_node_id());
06808b08
LS
1968 break;
1969
1970 default:
1971 BUG();
1972 }
c0ff7453 1973 task_unlock(current);
06808b08
LS
1974
1975 return true;
1976}
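/*
 * Hedged usage sketch (helper name illustrative): a caller such as the
 * hugetlb "nr_hugepages" handling restricts an operation to the current
 * task's policy nodes when a non-default mempolicy is in effect.
 */
static void __maybe_unused example_walk_policy_nodes(void)
{
	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
	int nid;

	if (mask && init_nodemask_of_mempolicy(mask))
		for_each_node_mask(nid, *mask)
			pr_debug("mempolicy node %d\n", nid);
	NODEMASK_FREE(mask);
}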
00ac59ad 1977#endif
5da7ca86 1978
6f48d0eb 1979/*
b26e517a 1980 * mempolicy_in_oom_domain
6f48d0eb 1981 *
b26e517a
FT
1982 * If tsk's mempolicy is "bind", check for intersection between mask and
1983 * the policy nodemask. Otherwise, return true for all other policies
1984 * including "interleave", as a tsk with "interleave" policy may have
1985 * memory allocated from all nodes in system.
6f48d0eb
DR
1986 *
1987 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1988 */
b26e517a 1989bool mempolicy_in_oom_domain(struct task_struct *tsk,
6f48d0eb
DR
1990 const nodemask_t *mask)
1991{
1992 struct mempolicy *mempolicy;
1993 bool ret = true;
1994
1995 if (!mask)
1996 return ret;
b26e517a 1997
6f48d0eb
DR
1998 task_lock(tsk);
1999 mempolicy = tsk->mempolicy;
b26e517a 2000 if (mempolicy && mempolicy->mode == MPOL_BIND)
269fbe72 2001 ret = nodes_intersects(mempolicy->nodes, *mask);
6f48d0eb 2002 task_unlock(tsk);
b26e517a 2003
6f48d0eb
DR
2004 return ret;
2005}
2006
4c54d949 2007static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
ddc1a5cb 2008 int nid, nodemask_t *nodemask)
4c54d949
FT
2009{
2010 struct page *page;
2011 gfp_t preferred_gfp;
2012
2013 /*
2014 * This is a two pass approach. The first pass will only try the
2015 * preferred nodes but skip the direct reclaim and allow the
2016 * allocation to fail, while the second pass will try all the
2017 * nodes in the system.
2018 */
2019 preferred_gfp = gfp | __GFP_NOWARN;
2020 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
ddc1a5cb 2021 page = __alloc_pages(preferred_gfp, order, nid, nodemask);
4c54d949 2022 if (!page)
c0455116 2023 page = __alloc_pages(gfp, order, nid, NULL);
4c54d949
FT
2024
2025 return page;
2026}
2027
1da177e4 2028/**
ddc1a5cb 2029 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
eb350739 2030 * @gfp: GFP flags.
ddc1a5cb
HD
2031 * @order: Order of the page allocation.
2032 * @pol: Pointer to the NUMA mempolicy.
2033 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
2034 * @nid: Preferred node (usually numa_node_id() but @mpol may override it).
1da177e4 2035 *
ddc1a5cb 2036 * Return: The page on success or NULL if allocation fails.
1da177e4 2037 */
ddc1a5cb
HD
2038struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
2039 struct mempolicy *pol, pgoff_t ilx, int nid)
1da177e4 2040{
ddc1a5cb
HD
2041 nodemask_t *nodemask;
2042 struct page *page;
adf88aa8 2043
ddc1a5cb 2044 nodemask = policy_nodemask(gfp, pol, ilx, &nid);
4c54d949 2045
ddc1a5cb
HD
2046 if (pol->mode == MPOL_PREFERRED_MANY)
2047 return alloc_pages_preferred_many(gfp, order, nid, nodemask);
19deb769 2048
ddc1a5cb
HD
2049 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
2050 /* filter "hugepage" allocation, unless from alloc_pages() */
2051 order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) {
19deb769
DR
2052 /*
2053 * For hugepage allocation and non-interleave policy which
2054 * allows the current node (or other explicitly preferred
2055 * node) we only try to allocate from the current/preferred
2056 * node and don't fall back to other nodes, as the cost of
2057 * remote accesses would likely offset THP benefits.
2058 *
b27abacc 2059 * If the policy is interleave or does not allow the current
19deb769
DR
2060 * node in its nodemask, we allocate the standard way.
2061 */
ddc1a5cb
HD
2062 if (pol->mode != MPOL_INTERLEAVE &&
2063 (!nodemask || node_isset(nid, *nodemask))) {
cc638f32
VB
2064 /*
2065 * First, try to allocate THP only on local node, but
2066 * don't reclaim unnecessarily, just compact.
2067 */
ddc1a5cb
HD
2068 page = __alloc_pages_node(nid,
2069 gfp | __GFP_THISNODE | __GFP_NORETRY, order);
2070 if (page || !(gfp & __GFP_DIRECT_RECLAIM))
2071 return page;
76e654cc
DR
2072 /*
2073 * If hugepage allocations are configured to always
2074 * synchronous compact or the vma has been madvised
2075 * to prefer hugepage backing, retry allowing remote
cc638f32 2076 * memory with both reclaim and compact as well.
76e654cc 2077 */
ddc1a5cb
HD
2078 }
2079 }
76e654cc 2080
ddc1a5cb
HD
2081 page = __alloc_pages(gfp, order, nid, nodemask);
2082
2083 if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
2084 /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
2085 if (static_branch_likely(&vm_numa_stat_key) &&
2086 page_to_nid(page) == nid) {
2087 preempt_disable();
2088 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
2089 preempt_enable();
19deb769 2090 }
356ff8a9
DR
2091 }
2092
ddc1a5cb
HD
2093 return page;
2094}
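/*
 * Hedged sketch of using alloc_pages_mpol() directly, without a VMA:
 * the caller supplies its own policy, an interleave index of its own
 * choosing (or NO_INTERLEAVE_INDEX), and a preferred node. The helper
 * name is illustrative.
 */
static struct page * __maybe_unused example_alloc_by_index(
		struct mempolicy *pol, pgoff_t index)
{
	return alloc_pages_mpol(GFP_HIGHUSER_MOVABLE, 0,
				pol, index, numa_node_id());
}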
2095
2096/**
2097 * vma_alloc_folio - Allocate a folio for a VMA.
2098 * @gfp: GFP flags.
2099 * @order: Order of the folio.
2100 * @vma: Pointer to VMA.
2101 * @addr: Virtual address of the allocation. Must be inside @vma.
2102 * @hugepage: Unused (was: For hugepages try only preferred node if possible).
2103 *
2104 * Allocate a folio for a specific address in @vma, using the appropriate
2105 * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
2106 * VMA to prevent it from going away. Should be used for all allocations
2107 * for folios that will be mapped into user space, excepting hugetlbfs, and
2108 * excepting where direct use of alloc_pages_mpol() is more appropriate.
2109 *
2110 * Return: The folio on success or NULL if allocation fails.
2111 */
2112struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
2113 unsigned long addr, bool hugepage)
2114{
2115 struct mempolicy *pol;
2116 pgoff_t ilx;
2117 struct page *page;
2118
2119 pol = get_vma_policy(vma, addr, order, &ilx);
2120 page = alloc_pages_mpol(gfp | __GFP_COMP, order,
2121 pol, ilx, numa_node_id());
d51e9894 2122 mpol_cond_put(pol);
ddc1a5cb 2123 return page_rmappable_folio(page);
f584b680 2124}
adf88aa8 2125EXPORT_SYMBOL(vma_alloc_folio);
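/*
 * Hedged sketch of a typical fault-path caller (helper name
 * illustrative): allocate an order-0 folio for a user address while
 * holding mmap_lock, so the VMA policy is honoured.
 */
static struct folio * __maybe_unused example_fault_alloc(
		struct vm_area_struct *vma, unsigned long addr)
{
	/* GFP_HIGHUSER_MOVABLE is the usual choice for anonymous user folios */
	return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
}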
f584b680 2126
1da177e4 2127/**
6421ec76
MWO
2128 * alloc_pages - Allocate pages.
2129 * @gfp: GFP flags.
2130 * @order: Power of two of number of pages to allocate.
1da177e4 2131 *
6421ec76
MWO
2132 * Allocate 1 << @order contiguous pages. The physical address of the
2133 * first page is naturally aligned (e.g. an order-3 allocation will be aligned
2134 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2135 * process is honoured when in process context.
1da177e4 2136 *
6421ec76
MWO
2137 * Context: Can be called from any context, providing the appropriate GFP
2138 * flags are used.
2139 * Return: The page on success or NULL if allocation fails.
1da177e4 2140 */
ddc1a5cb 2141struct page *alloc_pages(gfp_t gfp, unsigned int order)
1da177e4 2142{
8d90274b 2143 struct mempolicy *pol = &default_policy;
52cd3b07
LS
2144
2145 /*
2146 * No reference counting needed for current->mempolicy
2147 * nor system default_policy
2148 */
ddc1a5cb
HD
2149 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2150 pol = get_task_policy(current);
cc9a6c87 2151
ddc1a5cb
HD
2152 return alloc_pages_mpol(gfp, order,
2153 pol, NO_INTERLEAVE_INDEX, numa_node_id());
1da177e4 2154}
d7f946d0 2155EXPORT_SYMBOL(alloc_pages);
1da177e4 2156
ddc1a5cb 2157struct folio *folio_alloc(gfp_t gfp, unsigned int order)
cc09cb13 2158{
23e48832 2159 return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order));
cc09cb13
MWO
2160}
2161EXPORT_SYMBOL(folio_alloc);
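/*
 * Hedged sketch (helper name illustrative): a kernel buffer allocated
 * through alloc_pages() follows the caller's process policy; free it
 * with __free_pages() at the same order when done.
 */
static void * __maybe_unused example_get_buffer(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* 4 contiguous pages */

	return page ? page_address(page) : NULL;
}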
2162
c00b6b96
CW
2163static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2164 struct mempolicy *pol, unsigned long nr_pages,
2165 struct page **page_array)
2166{
2167 int nodes;
2168 unsigned long nr_pages_per_node;
2169 int delta;
2170 int i;
2171 unsigned long nr_allocated;
2172 unsigned long total_allocated = 0;
2173
2174 nodes = nodes_weight(pol->nodes);
2175 nr_pages_per_node = nr_pages / nodes;
2176 delta = nr_pages - nodes * nr_pages_per_node;
2177
2178 for (i = 0; i < nodes; i++) {
2179 if (delta) {
2180 nr_allocated = __alloc_pages_bulk(gfp,
2181 interleave_nodes(pol), NULL,
2182 nr_pages_per_node + 1, NULL,
2183 page_array);
2184 delta--;
2185 } else {
2186 nr_allocated = __alloc_pages_bulk(gfp,
2187 interleave_nodes(pol), NULL,
2188 nr_pages_per_node, NULL, page_array);
2189 }
2190
2191 page_array += nr_allocated;
2192 total_allocated += nr_allocated;
2193 }
2194
2195 return total_allocated;
2196}
2197
2198static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2199 struct mempolicy *pol, unsigned long nr_pages,
2200 struct page **page_array)
2201{
2202 gfp_t preferred_gfp;
2203 unsigned long nr_allocated = 0;
2204
2205 preferred_gfp = gfp | __GFP_NOWARN;
2206 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2207
2208 nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2209 nr_pages, NULL, page_array);
2210
2211 if (nr_allocated < nr_pages)
2212 nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2213 nr_pages - nr_allocated, NULL,
2214 page_array + nr_allocated);
2215 return nr_allocated;
2216}
2217
2218/* Bulk page allocation and the mempolicy need to be considered at the
2219 * same time in some situations, such as vmalloc.
2220 *
2221 * This can accelerate memory allocation, especially for interleaved
2222 * allocations.
2223 */
2224unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2225 unsigned long nr_pages, struct page **page_array)
2226{
2227 struct mempolicy *pol = &default_policy;
ddc1a5cb
HD
2228 nodemask_t *nodemask;
2229 int nid;
c00b6b96
CW
2230
2231 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2232 pol = get_task_policy(current);
2233
2234 if (pol->mode == MPOL_INTERLEAVE)
2235 return alloc_pages_bulk_array_interleave(gfp, pol,
2236 nr_pages, page_array);
2237
2238 if (pol->mode == MPOL_PREFERRED_MANY)
2239 return alloc_pages_bulk_array_preferred_many(gfp,
2240 numa_node_id(), pol, nr_pages, page_array);
2241
ddc1a5cb
HD
2242 nid = numa_node_id();
2243 nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
2244 return __alloc_pages_bulk(gfp, nid, nodemask,
2245 nr_pages, NULL, page_array);
c00b6b96
CW
2246}
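/*
 * Hedged sketch of a vmalloc-style bulk user (helper name illustrative):
 * @pages must start out zero-filled (NULL entries); the bulk call
 * populates a leading run of slots and the caller tops up one page at a
 * time for whatever the bulk path could not satisfy.
 */
static unsigned long __maybe_unused example_bulk_fill(struct page **pages,
						      unsigned long nr)
{
	unsigned long got;

	got = alloc_pages_bulk_array_mempolicy(GFP_KERNEL, nr, pages);
	while (got < nr) {
		pages[got] = alloc_pages(GFP_KERNEL, 0);
		if (!pages[got])
			break;
		got++;
	}
	return got;
}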
2247
ef0855d3
ON
2248int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2249{
c36f6e6d 2250 struct mempolicy *pol = mpol_dup(src->vm_policy);
ef0855d3
ON
2251
2252 if (IS_ERR(pol))
2253 return PTR_ERR(pol);
2254 dst->vm_policy = pol;
2255 return 0;
2256}
2257
4225399a 2258/*
846a16bf 2259 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
4225399a
PJ
2260 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2261 * with the mems_allowed returned by cpuset_mems_allowed(). This
2262 * keeps mempolicies cpuset relative after its cpuset moves. See
2263 * further kernel/cpuset.c update_nodemask().
708c1bbc
MX
2264 *
2265 * current's mempolicy may be rebound by another task (the task that changes
2266 * the cpuset's mems), so we needn't do the rebind work for the current task.
4225399a 2267 */
4225399a 2268
846a16bf
LS
2269/* Slow path of a mempolicy duplicate */
2270struct mempolicy *__mpol_dup(struct mempolicy *old)
1da177e4
LT
2271{
2272 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2273
2274 if (!new)
2275 return ERR_PTR(-ENOMEM);
708c1bbc
MX
2276
2277 /* task's mempolicy is protected by alloc_lock */
2278 if (old == current->mempolicy) {
2279 task_lock(current);
2280 *new = *old;
2281 task_unlock(current);
2282 } else
2283 *new = *old;
2284
4225399a
PJ
2285 if (current_cpuset_is_being_rebound()) {
2286 nodemask_t mems = cpuset_mems_allowed(current);
213980c0 2287 mpol_rebind_policy(new, &mems);
4225399a 2288 }
1da177e4 2289 atomic_set(&new->refcnt, 1);
1da177e4
LT
2290 return new;
2291}
2292
2293/* Slow path of a mempolicy comparison */
fcfb4dcc 2294bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1da177e4
LT
2295{
2296 if (!a || !b)
fcfb4dcc 2297 return false;
45c4745a 2298 if (a->mode != b->mode)
fcfb4dcc 2299 return false;
19800502 2300 if (a->flags != b->flags)
fcfb4dcc 2301 return false;
c6018b4b
AK
2302 if (a->home_node != b->home_node)
2303 return false;
19800502
BL
2304 if (mpol_store_user_nodemask(a))
2305 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
fcfb4dcc 2306 return false;
19800502 2307
45c4745a 2308 switch (a->mode) {
19770b32 2309 case MPOL_BIND:
1da177e4 2310 case MPOL_INTERLEAVE:
1da177e4 2311 case MPOL_PREFERRED:
b27abacc 2312 case MPOL_PREFERRED_MANY:
269fbe72 2313 return !!nodes_equal(a->nodes, b->nodes);
7858d7bc
FT
2314 case MPOL_LOCAL:
2315 return true;
1da177e4
LT
2316 default:
2317 BUG();
fcfb4dcc 2318 return false;
1da177e4
LT
2319 }
2320}
2321
1da177e4
LT
2322/*
2323 * Shared memory backing store policy support.
2324 *
2325 * Remember policies even when nobody has shared memory mapped.
2326 * The policies are kept in Red-Black tree linked from the inode.
4a8c7bb5 2327 * They are protected by the sp->lock rwlock, which should be held
1da177e4
LT
2328 * for any accesses to the tree.
2329 */
2330
4a8c7bb5
NZ
2331/*
2332 * lookup first element intersecting start-end. Caller holds sp->lock for
2333 * reading or for writing
2334 */
93397c3b
HD
2335static struct sp_node *sp_lookup(struct shared_policy *sp,
2336 pgoff_t start, pgoff_t end)
1da177e4
LT
2337{
2338 struct rb_node *n = sp->root.rb_node;
2339
2340 while (n) {
2341 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2342
2343 if (start >= p->end)
2344 n = n->rb_right;
2345 else if (end <= p->start)
2346 n = n->rb_left;
2347 else
2348 break;
2349 }
2350 if (!n)
2351 return NULL;
2352 for (;;) {
2353 struct sp_node *w = NULL;
2354 struct rb_node *prev = rb_prev(n);
2355 if (!prev)
2356 break;
2357 w = rb_entry(prev, struct sp_node, nd);
2358 if (w->end <= start)
2359 break;
2360 n = prev;
2361 }
2362 return rb_entry(n, struct sp_node, nd);
2363}
2364
4a8c7bb5
NZ
2365/*
2366 * Insert a new shared policy into the list. Caller holds sp->lock for
2367 * writing.
2368 */
1da177e4
LT
2369static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2370{
2371 struct rb_node **p = &sp->root.rb_node;
2372 struct rb_node *parent = NULL;
2373 struct sp_node *nd;
2374
2375 while (*p) {
2376 parent = *p;
2377 nd = rb_entry(parent, struct sp_node, nd);
2378 if (new->start < nd->start)
2379 p = &(*p)->rb_left;
2380 else if (new->end > nd->end)
2381 p = &(*p)->rb_right;
2382 else
2383 BUG();
2384 }
2385 rb_link_node(&new->nd, parent, p);
2386 rb_insert_color(&new->nd, &sp->root);
1da177e4
LT
2387}
2388
2389/* Find shared policy intersecting idx */
93397c3b
HD
2390struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
2391 pgoff_t idx)
1da177e4
LT
2392{
2393 struct mempolicy *pol = NULL;
2394 struct sp_node *sn;
2395
2396 if (!sp->root.rb_node)
2397 return NULL;
4a8c7bb5 2398 read_lock(&sp->lock);
1da177e4
LT
2399 sn = sp_lookup(sp, idx, idx+1);
2400 if (sn) {
2401 mpol_get(sn->policy);
2402 pol = sn->policy;
2403 }
4a8c7bb5 2404 read_unlock(&sp->lock);
1da177e4
LT
2405 return pol;
2406}
2407
63f74ca2
KM
2408static void sp_free(struct sp_node *n)
2409{
2410 mpol_put(n->policy);
2411 kmem_cache_free(sn_cache, n);
2412}
2413
771fb4d8 2414/**
75c70128 2415 * mpol_misplaced - check whether current folio node is valid in policy
771fb4d8 2416 *
75c70128
KW
2417 * @folio: folio to be checked
2418 * @vma: vm area where folio mapped
2419 * @addr: virtual address in @vma for shared policy lookup and interleave policy
771fb4d8 2420 *
75c70128 2421 * Lookup current policy node id for vma,addr and "compare to" folio's
5f076944 2422 * node id. Policy determination "mimics" alloc_page_vma().
771fb4d8 2423 * Called from fault path where we know the vma and faulting address.
5f076944 2424 *
062db293 2425 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
75c70128 2426 * policy, or a suitable node ID to allocate a replacement folio from.
771fb4d8 2427 */
75c70128
KW
2428int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
2429 unsigned long addr)
771fb4d8
LS
2430{
2431 struct mempolicy *pol;
ddc1a5cb 2432 pgoff_t ilx;
c33d6c06 2433 struct zoneref *z;
75c70128 2434 int curnid = folio_nid(folio);
90572890
PZ
2435 int thiscpu = raw_smp_processor_id();
2436 int thisnid = cpu_to_node(thiscpu);
98fa15f3 2437 int polnid = NUMA_NO_NODE;
062db293 2438 int ret = NUMA_NO_NODE;
771fb4d8 2439
ddc1a5cb 2440 pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
771fb4d8
LS
2441 if (!(pol->flags & MPOL_F_MOF))
2442 goto out;
2443
2444 switch (pol->mode) {
2445 case MPOL_INTERLEAVE:
ddc1a5cb 2446 polnid = interleave_nid(pol, ilx);
771fb4d8
LS
2447 break;
2448
2449 case MPOL_PREFERRED:
b27abacc
DH
2450 if (node_isset(curnid, pol->nodes))
2451 goto out;
269fbe72 2452 polnid = first_node(pol->nodes);
7858d7bc
FT
2453 break;
2454
2455 case MPOL_LOCAL:
2456 polnid = numa_node_id();
771fb4d8
LS
2457 break;
2458
2459 case MPOL_BIND:
bda420b9
HY
2460 /* Optimize placement among multiple nodes via NUMA balancing */
2461 if (pol->flags & MPOL_F_MORON) {
269fbe72 2462 if (node_isset(thisnid, pol->nodes))
bda420b9
HY
2463 break;
2464 goto out;
2465 }
b27abacc 2466 fallthrough;
c33d6c06 2467
b27abacc 2468 case MPOL_PREFERRED_MANY:
771fb4d8 2469 /*
771fb4d8
LS
2470 * use current page if in policy nodemask,
2471 * else select nearest allowed node, if any.
2472 * If no allowed nodes, use current [!misplaced].
2473 */
269fbe72 2474 if (node_isset(curnid, pol->nodes))
771fb4d8 2475 goto out;
c33d6c06 2476 z = first_zones_zonelist(
771fb4d8
LS
2477 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2478 gfp_zone(GFP_HIGHUSER),
269fbe72 2479 &pol->nodes);
c1093b74 2480 polnid = zone_to_nid(z->zone);
771fb4d8
LS
2481 break;
2482
2483 default:
2484 BUG();
2485 }
5606e387 2486
75c70128 2487 /* Migrate the folio towards the node whose CPU is referencing it */
e42c8ff2 2488 if (pol->flags & MPOL_F_MORON) {
90572890 2489 polnid = thisnid;
5606e387 2490
8c9ae56d 2491 if (!should_numa_migrate_memory(current, folio, curnid,
75c70128 2492 thiscpu))
de1c9ce6 2493 goto out;
e42c8ff2
MG
2494 }
2495
771fb4d8
LS
2496 if (curnid != polnid)
2497 ret = polnid;
2498out:
2499 mpol_cond_put(pol);
2500
2501 return ret;
2502}
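/*
 * Hedged sketch of the NUMA hinting fault usage (the real caller lives
 * in the fault path of mm/memory.c; helper name illustrative): if
 * mpol_misplaced() nominates another node, the caller goes on to migrate
 * the folio there; NUMA_NO_NODE means "leave it where it is".
 */
static int __maybe_unused example_numa_hint(struct folio *folio,
		struct vm_area_struct *vma, unsigned long addr)
{
	int target_nid = mpol_misplaced(folio, vma, addr);

	if (target_nid == NUMA_NO_NODE)
		return 0;	/* folio is already on an acceptable node */
	/* the caller would now attempt migration of @folio to @target_nid */
	return target_nid;
}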
2503
c11600e4
DR
2504/*
2505 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2506 * dropped after task->mempolicy is set to NULL so that any allocation done as
2507 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2508 * policy.
2509 */
2510void mpol_put_task_policy(struct task_struct *task)
2511{
2512 struct mempolicy *pol;
2513
2514 task_lock(task);
2515 pol = task->mempolicy;
2516 task->mempolicy = NULL;
2517 task_unlock(task);
2518 mpol_put(pol);
2519}
2520
1da177e4
LT
2521static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2522{
1da177e4 2523 rb_erase(&n->nd, &sp->root);
63f74ca2 2524 sp_free(n);
1da177e4
LT
2525}
2526
42288fe3
MG
2527static void sp_node_init(struct sp_node *node, unsigned long start,
2528 unsigned long end, struct mempolicy *pol)
2529{
2530 node->start = start;
2531 node->end = end;
2532 node->policy = pol;
2533}
2534
dbcb0f19
AB
2535static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2536 struct mempolicy *pol)
1da177e4 2537{
869833f2
KM
2538 struct sp_node *n;
2539 struct mempolicy *newpol;
1da177e4 2540
869833f2 2541 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1da177e4
LT
2542 if (!n)
2543 return NULL;
869833f2
KM
2544
2545 newpol = mpol_dup(pol);
2546 if (IS_ERR(newpol)) {
2547 kmem_cache_free(sn_cache, n);
2548 return NULL;
2549 }
2550 newpol->flags |= MPOL_F_SHARED;
42288fe3 2551 sp_node_init(n, start, end, newpol);
869833f2 2552
1da177e4
LT
2553 return n;
2554}
2555
2556/* Replace a policy range. */
93397c3b
HD
2557static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
2558 pgoff_t end, struct sp_node *new)
1da177e4 2559{
b22d127a 2560 struct sp_node *n;
42288fe3
MG
2561 struct sp_node *n_new = NULL;
2562 struct mempolicy *mpol_new = NULL;
b22d127a 2563 int ret = 0;
1da177e4 2564
42288fe3 2565restart:
4a8c7bb5 2566 write_lock(&sp->lock);
1da177e4
LT
2567 n = sp_lookup(sp, start, end);
2568 /* Take care of old policies in the same range. */
2569 while (n && n->start < end) {
2570 struct rb_node *next = rb_next(&n->nd);
2571 if (n->start >= start) {
2572 if (n->end <= end)
2573 sp_delete(sp, n);
2574 else
2575 n->start = end;
2576 } else {
2577 /* Old policy spanning whole new range. */
2578 if (n->end > end) {
42288fe3
MG
2579 if (!n_new)
2580 goto alloc_new;
2581
2582 *mpol_new = *n->policy;
2583 atomic_set(&mpol_new->refcnt, 1);
7880639c 2584 sp_node_init(n_new, end, n->end, mpol_new);
1da177e4 2585 n->end = start;
5ca39575 2586 sp_insert(sp, n_new);
42288fe3
MG
2587 n_new = NULL;
2588 mpol_new = NULL;
1da177e4
LT
2589 break;
2590 } else
2591 n->end = start;
2592 }
2593 if (!next)
2594 break;
2595 n = rb_entry(next, struct sp_node, nd);
2596 }
2597 if (new)
2598 sp_insert(sp, new);
4a8c7bb5 2599 write_unlock(&sp->lock);
42288fe3
MG
2600 ret = 0;
2601
2602err_out:
2603 if (mpol_new)
2604 mpol_put(mpol_new);
2605 if (n_new)
2606 kmem_cache_free(sn_cache, n_new);
2607
b22d127a 2608 return ret;
42288fe3
MG
2609
2610alloc_new:
4a8c7bb5 2611 write_unlock(&sp->lock);
42288fe3
MG
2612 ret = -ENOMEM;
2613 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2614 if (!n_new)
2615 goto err_out;
2616 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2617 if (!mpol_new)
2618 goto err_out;
4ad09955 2619 atomic_set(&mpol_new->refcnt, 1);
42288fe3 2620 goto restart;
1da177e4
LT
2621}
2622
71fe804b
LS
2623/**
2624 * mpol_shared_policy_init - initialize shared policy for inode
2625 * @sp: pointer to inode shared policy
2626 * @mpol: struct mempolicy to install
2627 *
2628 * Install non-NULL @mpol in inode's shared policy rb-tree.
2629 * On entry, the current task has a reference on a non-NULL @mpol.
2630 * This must be released on exit.
4bfc4495 2631 * This is called at get_inode() time, so GFP_KERNEL can be used.
71fe804b
LS
2632 */
2633void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2634{
58568d2a
MX
2635 int ret;
2636
71fe804b 2637 sp->root = RB_ROOT; /* empty tree == default mempolicy */
4a8c7bb5 2638 rwlock_init(&sp->lock);
71fe804b
LS
2639
2640 if (mpol) {
35ec8fa0
HD
2641 struct sp_node *sn;
2642 struct mempolicy *npol;
4bfc4495 2643 NODEMASK_SCRATCH(scratch);
71fe804b 2644
4bfc4495 2645 if (!scratch)
5c0c1654 2646 goto put_mpol;
35ec8fa0
HD
2647
2648 /* contextualize the tmpfs mount point mempolicy to this file */
2649 npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2650 if (IS_ERR(npol))
0cae3457 2651 goto free_scratch; /* no valid nodemask intersection */
58568d2a
MX
2652
2653 task_lock(current);
35ec8fa0 2654 ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
58568d2a 2655 task_unlock(current);
15d77835 2656 if (ret)
35ec8fa0
HD
2657 goto put_npol;
2658
2659 /* alloc node covering entire file; adds ref to file's npol */
2660 sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
2661 if (sn)
2662 sp_insert(sp, sn);
2663put_npol:
2664 mpol_put(npol); /* drop initial ref on file's npol */
0cae3457 2665free_scratch:
4bfc4495 2666 NODEMASK_SCRATCH_FREE(scratch);
5c0c1654
LS
2667put_mpol:
2668 mpol_put(mpol); /* drop our incoming ref on sb mpol */
7339ff83
RH
2669 }
2670}
2671
c36f6e6d
HD
2672int mpol_set_shared_policy(struct shared_policy *sp,
2673 struct vm_area_struct *vma, struct mempolicy *pol)
1da177e4
LT
2674{
2675 int err;
2676 struct sp_node *new = NULL;
2677 unsigned long sz = vma_pages(vma);
2678
c36f6e6d
HD
2679 if (pol) {
2680 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
1da177e4
LT
2681 if (!new)
2682 return -ENOMEM;
2683 }
c36f6e6d 2684 err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
1da177e4 2685 if (err && new)
63f74ca2 2686 sp_free(new);
1da177e4
LT
2687 return err;
2688}
2689
2690/* Free a backing policy store on inode delete. */
c36f6e6d 2691void mpol_free_shared_policy(struct shared_policy *sp)
1da177e4
LT
2692{
2693 struct sp_node *n;
2694 struct rb_node *next;
2695
c36f6e6d 2696 if (!sp->root.rb_node)
1da177e4 2697 return;
c36f6e6d
HD
2698 write_lock(&sp->lock);
2699 next = rb_first(&sp->root);
1da177e4
LT
2700 while (next) {
2701 n = rb_entry(next, struct sp_node, nd);
2702 next = rb_next(&n->nd);
c36f6e6d 2703 sp_delete(sp, n);
1da177e4 2704 }
c36f6e6d 2705 write_unlock(&sp->lock);
1da177e4
LT
2706}
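/*
 * Hedged sketch of the tmpfs-style life cycle of a shared policy tree
 * (helper and parameter names illustrative): initialise it at inode
 * creation with a referenced mount policy, install a per-range policy
 * when a mapping is mbind()ed, and tear the tree down on inode eviction.
 */
static void __maybe_unused example_shared_policy_lifetime(
		struct shared_policy *sp, struct mempolicy *sb_mpol,
		struct vm_area_struct *vma, struct mempolicy *vma_pol)
{
	mpol_shared_policy_init(sp, sb_mpol);	/* consumes the sb_mpol ref */
	if (mpol_set_shared_policy(sp, vma, vma_pol))
		pr_debug("shared policy install failed\n");
	mpol_free_shared_policy(sp);
}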
2707
1a687c2e 2708#ifdef CONFIG_NUMA_BALANCING
c297663c 2709static int __initdata numabalancing_override;
1a687c2e
MG
2710
2711static void __init check_numabalancing_enable(void)
2712{
2713 bool numabalancing_default = false;
2714
2715 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2716 numabalancing_default = true;
2717
c297663c
MG
2718 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2719 if (numabalancing_override)
2720 set_numabalancing_state(numabalancing_override == 1);
2721
b0dc2b9b 2722 if (num_online_nodes() > 1 && !numabalancing_override) {
756a025f 2723 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
c297663c 2724 numabalancing_default ? "Enabling" : "Disabling");
1a687c2e
MG
2725 set_numabalancing_state(numabalancing_default);
2726 }
2727}
2728
2729static int __init setup_numabalancing(char *str)
2730{
2731 int ret = 0;
2732 if (!str)
2733 goto out;
1a687c2e
MG
2734
2735 if (!strcmp(str, "enable")) {
c297663c 2736 numabalancing_override = 1;
1a687c2e
MG
2737 ret = 1;
2738 } else if (!strcmp(str, "disable")) {
c297663c 2739 numabalancing_override = -1;
1a687c2e
MG
2740 ret = 1;
2741 }
2742out:
2743 if (!ret)
4a404bea 2744 pr_warn("Unable to parse numa_balancing=\n");
1a687c2e
MG
2745
2746 return ret;
2747}
2748__setup("numa_balancing=", setup_numabalancing);
2749#else
2750static inline void __init check_numabalancing_enable(void)
2751{
2752}
2753#endif /* CONFIG_NUMA_BALANCING */
2754
1da177e4
LT
2755void __init numa_policy_init(void)
2756{
b71636e2
PM
2757 nodemask_t interleave_nodes;
2758 unsigned long largest = 0;
2759 int nid, prefer = 0;
2760
1da177e4
LT
2761 policy_cache = kmem_cache_create("numa_policy",
2762 sizeof(struct mempolicy),
20c2df83 2763 0, SLAB_PANIC, NULL);
1da177e4
LT
2764
2765 sn_cache = kmem_cache_create("shared_policy_node",
2766 sizeof(struct sp_node),
20c2df83 2767 0, SLAB_PANIC, NULL);
1da177e4 2768
5606e387
MG
2769 for_each_node(nid) {
2770 preferred_node_policy[nid] = (struct mempolicy) {
2771 .refcnt = ATOMIC_INIT(1),
2772 .mode = MPOL_PREFERRED,
2773 .flags = MPOL_F_MOF | MPOL_F_MORON,
269fbe72 2774 .nodes = nodemask_of_node(nid),
5606e387
MG
2775 };
2776 }
2777
b71636e2
PM
2778 /*
2779 * Set interleaving policy for system init. Interleaving is only
2780 * enabled across suitably sized nodes (default is >= 16MB), or
2781 * fall back to the largest node if they're all smaller.
2782 */
2783 nodes_clear(interleave_nodes);
01f13bd6 2784 for_each_node_state(nid, N_MEMORY) {
b71636e2
PM
2785 unsigned long total_pages = node_present_pages(nid);
2786
2787 /* Preserve the largest node */
2788 if (largest < total_pages) {
2789 largest = total_pages;
2790 prefer = nid;
2791 }
2792
2793 /* Interleave this node? */
2794 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2795 node_set(nid, interleave_nodes);
2796 }
2797
2798 /* All too small, use the largest */
2799 if (unlikely(nodes_empty(interleave_nodes)))
2800 node_set(prefer, interleave_nodes);
1da177e4 2801
028fec41 2802 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
b1de0d13 2803 pr_err("%s: interleaving failed\n", __func__);
1a687c2e
MG
2804
2805 check_numabalancing_enable();
1da177e4
LT
2806}
2807
8bccd85f 2808/* Reset policy of current process to default */
1da177e4
LT
2809void numa_default_policy(void)
2810{
028fec41 2811 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1da177e4 2812}
68860ec1 2813
095f1fc4
LS
2814/*
2815 * Parse and format mempolicy from/to strings
2816 */
345ace9c
LS
2817static const char * const policy_modes[] =
2818{
2819 [MPOL_DEFAULT] = "default",
2820 [MPOL_PREFERRED] = "prefer",
2821 [MPOL_BIND] = "bind",
2822 [MPOL_INTERLEAVE] = "interleave",
d3a71033 2823 [MPOL_LOCAL] = "local",
b27abacc 2824 [MPOL_PREFERRED_MANY] = "prefer (many)",
345ace9c 2825};
1a75a6c8 2826
095f1fc4
LS
2827#ifdef CONFIG_TMPFS
2828/**
f2a07f40 2829 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
095f1fc4 2830 * @str: string containing mempolicy to parse
71fe804b 2831 * @mpol: pointer to struct mempolicy pointer, returned on success.
095f1fc4
LS
2832 *
2833 * Format of input:
2834 * <mode>[=<flags>][:<nodelist>]
2835 *
dad5b023 2836 * Return: %0 on success, else %1
095f1fc4 2837 */
a7a88b23 2838int mpol_parse_str(char *str, struct mempolicy **mpol)
095f1fc4 2839{
71fe804b 2840 struct mempolicy *new = NULL;
f2a07f40 2841 unsigned short mode_flags;
71fe804b 2842 nodemask_t nodes;
095f1fc4
LS
2843 char *nodelist = strchr(str, ':');
2844 char *flags = strchr(str, '=');
dedf2c73 2845 int err = 1, mode;
095f1fc4 2846
c7a91bc7
DC
2847 if (flags)
2848 *flags++ = '\0'; /* terminate mode string */
2849
095f1fc4
LS
2850 if (nodelist) {
2851 /* NUL-terminate mode or flags string */
2852 *nodelist++ = '\0';
71fe804b 2853 if (nodelist_parse(nodelist, nodes))
095f1fc4 2854 goto out;
01f13bd6 2855 if (!nodes_subset(nodes, node_states[N_MEMORY]))
095f1fc4 2856 goto out;
71fe804b
LS
2857 } else
2858 nodes_clear(nodes);
2859
dedf2c73 2860 mode = match_string(policy_modes, MPOL_MAX, str);
2861 if (mode < 0)
095f1fc4
LS
2862 goto out;
2863
71fe804b 2864 switch (mode) {
095f1fc4 2865 case MPOL_PREFERRED:
71fe804b 2866 /*
aa9f7d51
RD
2867 * Insist on a nodelist of one node only, although later
2868 * we use first_node(nodes) to grab a single node, so here
2869 * nodelist (or nodes) cannot be empty.
71fe804b 2870 */
095f1fc4
LS
2871 if (nodelist) {
2872 char *rest = nodelist;
2873 while (isdigit(*rest))
2874 rest++;
926f2ae0
KM
2875 if (*rest)
2876 goto out;
aa9f7d51
RD
2877 if (nodes_empty(nodes))
2878 goto out;
095f1fc4
LS
2879 }
2880 break;
095f1fc4
LS
2881 case MPOL_INTERLEAVE:
2882 /*
2883 * Default to online nodes with memory if no nodelist
2884 */
2885 if (!nodelist)
01f13bd6 2886 nodes = node_states[N_MEMORY];
3f226aa1 2887 break;
71fe804b 2888 case MPOL_LOCAL:
3f226aa1 2889 /*
71fe804b 2890 * Don't allow a nodelist; mpol_new() checks flags
3f226aa1 2891 */
71fe804b 2892 if (nodelist)
3f226aa1 2893 goto out;
3f226aa1 2894 break;
413b43de
RT
2895 case MPOL_DEFAULT:
2896 /*
2897 * Insist on an empty nodelist
2898 */
2899 if (!nodelist)
2900 err = 0;
2901 goto out;
b27abacc 2902 case MPOL_PREFERRED_MANY:
d69b2e63
KM
2903 case MPOL_BIND:
2904 /*
2905 * Insist on a nodelist
2906 */
2907 if (!nodelist)
2908 goto out;
095f1fc4
LS
2909 }
2910
71fe804b 2911 mode_flags = 0;
095f1fc4
LS
2912 if (flags) {
2913 /*
2914 * Currently, we only support two mutually exclusive
2915 * mode flags.
2916 */
2917 if (!strcmp(flags, "static"))
71fe804b 2918 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4 2919 else if (!strcmp(flags, "relative"))
71fe804b 2920 mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4 2921 else
926f2ae0 2922 goto out;
095f1fc4 2923 }
71fe804b
LS
2924
2925 new = mpol_new(mode, mode_flags, &nodes);
2926 if (IS_ERR(new))
926f2ae0
KM
2927 goto out;
2928
f2a07f40
HD
2929 /*
2930 * Save nodes for mpol_to_str() to show the tmpfs mount options
2931 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2932 */
269fbe72
BW
2933 if (mode != MPOL_PREFERRED) {
2934 new->nodes = nodes;
2935 } else if (nodelist) {
2936 nodes_clear(new->nodes);
2937 node_set(first_node(nodes), new->nodes);
2938 } else {
7858d7bc 2939 new->mode = MPOL_LOCAL;
269fbe72 2940 }
f2a07f40
HD
2941
2942 /*
2943 * Save nodes for contextualization: this will be used to "clone"
2944 * the mempolicy in a specific context [cpuset] at a later time.
2945 */
2946 new->w.user_nodemask = nodes;
2947
926f2ae0 2948 err = 0;
71fe804b 2949
095f1fc4
LS
2950out:
2951 /* Restore string for error message */
2952 if (nodelist)
2953 *--nodelist = ':';
2954 if (flags)
2955 *--flags = '=';
71fe804b
LS
2956 if (!err)
2957 *mpol = new;
095f1fc4
LS
2958 return err;
2959}
2960#endif /* CONFIG_TMPFS */
2961
71fe804b
LS
2962/**
2963 * mpol_to_str - format a mempolicy structure for printing
2964 * @buffer: to contain formatted mempolicy string
2965 * @maxlen: length of @buffer
2966 * @pol: pointer to mempolicy to be formatted
71fe804b 2967 *
948927ee
DR
2968 * Convert @pol into a string. If @buffer is too short, truncate the string.
2969 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2970 * longest flag, "relative", and to display at least a few node ids.
1a75a6c8 2971 */
948927ee 2972void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1a75a6c8
CL
2973{
2974 char *p = buffer;
948927ee
DR
2975 nodemask_t nodes = NODE_MASK_NONE;
2976 unsigned short mode = MPOL_DEFAULT;
2977 unsigned short flags = 0;
2291990a 2978
8790c71a 2979 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
bea904d5 2980 mode = pol->mode;
948927ee
DR
2981 flags = pol->flags;
2982 }
bea904d5 2983
1a75a6c8
CL
2984 switch (mode) {
2985 case MPOL_DEFAULT:
7858d7bc 2986 case MPOL_LOCAL:
1a75a6c8 2987 break;
1a75a6c8 2988 case MPOL_PREFERRED:
b27abacc 2989 case MPOL_PREFERRED_MANY:
1a75a6c8 2990 case MPOL_BIND:
1a75a6c8 2991 case MPOL_INTERLEAVE:
269fbe72 2992 nodes = pol->nodes;
1a75a6c8 2993 break;
1a75a6c8 2994 default:
948927ee
DR
2995 WARN_ON_ONCE(1);
2996 snprintf(p, maxlen, "unknown");
2997 return;
1a75a6c8
CL
2998 }
2999
b7a9f420 3000 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
1a75a6c8 3001
fc36b8d3 3002 if (flags & MPOL_MODE_FLAGS) {
948927ee 3003 p += snprintf(p, buffer + maxlen - p, "=");
f5b087b5 3004
2291990a
LS
3005 /*
3006 * Currently, the only defined flags are mutually exclusive
3007 */
f5b087b5 3008 if (flags & MPOL_F_STATIC_NODES)
2291990a
LS
3009 p += snprintf(p, buffer + maxlen - p, "static");
3010 else if (flags & MPOL_F_RELATIVE_NODES)
3011 p += snprintf(p, buffer + maxlen - p, "relative");
f5b087b5
DR
3012 }
3013
9e763e0f
TH
3014 if (!nodes_empty(nodes))
3015 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3016 nodemask_pr_args(&nodes));
1a75a6c8 3017}
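/*
 * Hedged round-trip sketch (helper name illustrative, CONFIG_TMPFS only):
 * parse a tmpfs-style "mpol=" mount option such as "interleave:0-3" into
 * a mempolicy and format it back the way /proc/mounts displays it.
 */
#ifdef CONFIG_TMPFS
static void __maybe_unused example_mpol_round_trip(void)
{
	char str[] = "interleave:0-3";	/* mpol_parse_str() writes into it */
	char buf[64];
	struct mempolicy *mpol;

	if (!mpol_parse_str(str, &mpol)) {	/* returns 0 on success */
		mpol_to_str(buf, sizeof(buf), mpol);
		pr_debug("parsed mpol: %s\n", buf);
		mpol_put(mpol);
	}
}
#endif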