// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support the following policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave	Allocate memory interleaved over a set of nodes,
 *		with normal fallback if it fails.
 *		For VMA based allocations this interleaves based on the
 *		offset into the backing object or offset into the mapping
 *		for anonymous memory. For process policy a process counter
 *		is used.
 *
 * bind		Only allocate memory on a specific set of nodes,
 *		no fallback.
 *		FIXME: memory is allocated starting with the first node
 *		to the last. It would be better if bind would truly restrict
 *		the allocation to memory nodes instead
 *
 * preferred	Try a specific node first before normal fallback.
 *		As a special case NUMA_NO_NODE here means do the allocation
 *		on the local CPU. This is normally identical to default,
 *		but useful to set in a VMA when you have a non default
 *		process policy.
 *
 * preferred many	Try a set of nodes first before normal fallback. This is
 *		similar to preferred without the special case.
 *
 * default	Allocate on the local node first, or when on a VMA
 *		use the process policy. This is what Linux always did
 *		in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
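
/*
 * Illustrative userspace sketch (assumes <numaif.h> and <sys/mman.h>; not
 * part of this file): the policies described above are normally selected
 * through the mbind(2) and set_mempolicy(2) syscalls implemented further
 * down in this file, roughly like so:
 *
 *	unsigned long nodes = 0x3;	// nodes 0 and 1
 *	size_t len = 64 * 4096;
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	// get_nodes() below copies maxnode - 1 bits, so pass one more than
 *	// the width of the mask.
 *	mbind(p, len, MPOL_BIND, &nodes, 8 * sizeof(nodes) + 1, 0);
 *	// or make interleaving the task-wide default:
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes) + 1);
 */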

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_map_to_online_node - Find closest online node
 * @node: Node id to start the search
 *
 * Lookup the next closest node by distance if @node is not online.
 *
 * Return: this @node if it is online, otherwise the closest node by distance
 */
int numa_map_to_online_node(int node)
{
	int min_dist = INT_MAX, dist, n, min_node;

	if (node == NUMA_NO_NODE || node_online(node))
		return node;

	min_node = node;
	for_each_online_node(n) {
		dist = node_distance(node, n);
		if (dist < min_dist) {
			min_dist = dist;
			min_node = n;
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);
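
/*
 * Return the policy to use for a task allocation: the task's own
 * mempolicy if it has one, otherwise the per-node preferred_node_policy[]
 * entry for the local node (once it has been initialised), and finally
 * the system-wide default_policy.
 */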
struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}

static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;

	nodes_clear(pol->nodes);
	node_set(first_node(*nodes), pol->nodes);
	return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy. mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy. May also be called holding the mmap_lock for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/*
	 * Default (pol==NULL) and local memory policies are not subject
	 * to any remapping. They also do not need any special
	 * constructor.
	 */
	if (!pol || pol->mode == MPOL_LOCAL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	return ret;
}

/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);

			mode = MPOL_LOCAL;
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes) ||
		    (flags & MPOL_F_STATIC_NODES) ||
		    (flags & MPOL_F_RELATIVE_NODES))
			return ERR_PTR(-EINVAL);
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;
	policy->home_node = NUMA_NO_NODE;

	return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
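
/*
 * Rebind a nodemask-based policy (MPOL_INTERLEAVE/MPOL_BIND) to a new
 * cpuset mask: static user nodemasks are intersected with the new mask,
 * relative ones are re-folded onto it, and otherwise the current nodes
 * are remapped from the old cpuset_mems_allowed to the new one.
 */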
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	pol->nodes = tmp;
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	pol->w.cpuset_mems_allowed = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
	if (!pol || pol->mode == MPOL_LOCAL)
		return;
	if (!mpol_store_user_nodemask(pol) &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm. Takes mm->mmap_lock during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_write_lock(mm);
	for_each_vma(vmi, vma)
		mpol_rebind_policy(vma->vm_policy, new);
	mmap_write_unlock(mm);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_LOCAL] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_PREFERRED_MANY] = {
		.create = mpol_new_nodemask,
		.rebind = mpol_rebind_preferred,
	},
};

static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	unsigned long start;
	unsigned long end;
	struct vm_area_struct *first;
};

/*
 * Check if the page's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the nid is
 * in the invert of qp->nmask.
 */
static inline bool queue_pages_required(struct page *page,
					struct queue_pages *qp)
{
	int nid = page_to_nid(page);
	unsigned long flags = qp->flags;

	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}

/*
 * queue_pages_pmd() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully, or
 *     special page is met, i.e. huge zero page.
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 *        existing page was already on a node that does not follow the
 *        policy.
 */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
	__releases(ptl)
{
	int ret = 0;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags;

	if (unlikely(is_pmd_migration_entry(*pmd))) {
		ret = -EIO;
		goto unlock;
	}
	page = pmd_page(*pmd);
	if (is_huge_zero_page(page)) {
		walk->action = ACTION_CONTINUE;
		goto unlock;
	}
	if (!queue_pages_required(page, qp))
		goto unlock;

	flags = qp->flags;
	/* go to thp migration */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		if (!vma_migratable(walk->vma) ||
		    migrate_page_add(page, qp->pagelist, flags)) {
			ret = 1;
			goto unlock;
		}
	} else
		ret = -EIO;
unlock:
	spin_unlock(ptl);
	return ret;
}

/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully, or
 *     special page is met, i.e. zero page.
 * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	bool has_unmovable = false;
	pte_t *pte, *mapped_pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl)
		return queue_pages_pmd(pmd, ptl, addr, end, walk);

	if (pmd_trans_unstable(pmd))
		return 0;

	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page || is_zone_device_page(page))
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		if (!queue_pages_required(page, qp))
			continue;
		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
			/* MPOL_MF_STRICT must be specified if we get here */
			if (!vma_migratable(vma)) {
				has_unmovable = true;
				break;
			}

			/*
			 * Do not abort immediately since there may be
			 * temporary off LRU pages in the range. Still
			 * need migrate other LRU pages.
			 */
			if (migrate_page_add(page, qp->pagelist, flags))
				has_unmovable = true;
		} else
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
	cond_resched();

	if (has_unmovable)
		return 1;

	return addr != end ? -EIO : 0;
}
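
/*
 * queue_pages_hugetlb() follows the same return convention as
 * queue_pages_pte_range() above, but handles a single hugetlb entry:
 * a misplaced huge page is isolated via isolate_hugetlb() when the
 * MOVE/MOVE_ALL flags permit it.
 */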
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
	int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = (qp->flags & MPOL_MF_VALID);
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	if (!queue_pages_required(page, qp))
		goto unlock;

	if (flags == MPOL_MF_STRICT) {
		/*
		 * STRICT alone means only detecting misplaced page and no
		 * need to further check other vma.
		 */
		ret = -EIO;
		goto unlock;
	}

	if (!vma_migratable(walk->vma)) {
		/*
		 * Must be STRICT with MOVE*, otherwise .test_walk() would
		 * have stopped walking current vma.
		 * Detecting misplaced page but allow migrating pages which
		 * have been queued.
		 */
		ret = 1;
		goto unlock;
	}

	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
		if (isolate_hugetlb(page, qp->pagelist) &&
		    (flags & MPOL_MF_STRICT))
			/*
			 * Failed to isolate page but allow migrating pages
			 * which have been queued.
			 */
			ret = 1;
	}
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return ret;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	struct mmu_gather tlb;
	int nr_updated;

	tlb_gather_mmu(&tlb, vma->vm_mm);

	nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE,
				       MM_CP_PROT_NUMA);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	tlb_finish_mmu(&tlb);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
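
/*
 * .test_walk() callback for queue_pages_walk_ops: fail with -EFAULT on
 * unexpected holes in the range (unless MPOL_MF_DISCONTIG_OK), skip
 * unmigratable VMAs unless MPOL_MF_STRICT is set, and for MPOL_MF_LAZY
 * only mark the range PROT_NONE via change_prot_numa() instead of
 * queueing pages.
 */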
static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *next, *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long endvma = vma->vm_end;
	unsigned long flags = qp->flags;

	/* range check first */
	VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

	if (!qp->first) {
		qp->first = vma;
		if (!(flags & MPOL_MF_DISCONTIG_OK) &&
			(qp->start < vma->vm_start))
			/* hole at head side of range */
			return -EFAULT;
	}
	next = find_vma(vma->vm_mm, vma->vm_end);
	if (!(flags & MPOL_MF_DISCONTIG_OK) &&
		((vma->vm_end < qp->end) &&
		(!next || vma->vm_end < next->vm_start)))
		/* hole at middle or tail of range */
		return -EFAULT;

	/*
	 * Need check MPOL_MF_STRICT to return -EIO if possible
	 * regardless of vma_migratable
	 */
	if (!vma_migratable(vma) &&
	    !(flags & MPOL_MF_STRICT))
		return 1;

	if (endvma > end)
		endvma = end;

	if (flags & MPOL_MF_LAZY) {
		/* Similar to task_numa_work, skip inaccessible VMAs */
		if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
			!(vma->vm_flags & VM_MIXEDMAP))
			change_prot_numa(vma, start, endvma);
		return 1;
	}

	/* queue pages from current vma */
	if (flags & MPOL_MF_VALID)
		return 0;
	return 1;
}

static const struct mm_walk_ops queue_pages_walk_ops = {
	.hugetlb_entry = queue_pages_hugetlb,
	.pmd_entry = queue_pages_pte_range,
	.test_walk = queue_pages_test_walk,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags,) they are isolated and queued to the pagelist which is
 * passed via @private.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is an unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 *         memory range specified by nodemask and maxnode points outside
 *         your accessible address space (-EFAULT)
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	int err;
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.start = start,
		.end = end,
		.first = NULL,
	};

	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);

	if (!qp.first)
		/* whole range in hole */
		err = -EFAULT;

	return err;
}

/*
 * Apply policy to a single VMA
 * This must be called with the mmap_lock held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
						struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_lock */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	MA_STATE(mas, &mm->mm_mt, start, start);
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;

	prev = mas_prev(&mas, 0);
	if (unlikely(!prev))
		mas_set(&mas, start);

	vma = mas_find(&mas, end - 1);
	if (WARN_ON(!vma))
		return 0;

	if (start > vma->vm_start)
		prev = vma;

	for (; vma; vma = mas_next(&mas, end - 1)) {
		unsigned long vmstart = max(start, vma->vm_start);
		unsigned long vmend = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			goto next;

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol, vma->vm_userfaultfd_ctx,
				 anon_vma_name(vma));
		if (prev) {
			/* vma_merge() invalidated the mas */
			mas_pause(&mas);
			vma = prev;
			goto replace;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
			/* split_vma() invalidated the mas */
			mas_pause(&mas);
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
			/* split_vma() invalidated the mas */
			mas_pause(&mas);
		}
replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
next:
		prev = vma;
	}

out:
	return err;
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}

	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE)
		current->il_prev = MAX_NUMNODES-1;
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
	case MPOL_PREFERRED:
	case MPOL_PREFERRED_MANY:
		*nodes = p->nodes;
		break;
	case MPOL_LOCAL:
		/* return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}
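
/*
 * Look up the node id of the page currently backing @addr, pinning and
 * releasing the page via get_user_pages_fast(). Used for the
 * MPOL_F_NODE | MPOL_F_ADDR case of get_mempolicy().
 */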
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p = NULL;
	int ret;

	ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
	if (ret > 0) {
		ret = page_to_nid(p);
		put_page(p);
	}
	return ret;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL. We
		 * want to return MPOL_DEFAULT in this case.
		 */
		mmap_read_lock(mm);
		vma = vma_lookup(mm, addr);
		if (!vma) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, because we are about to
			 * drop the mmap_lock, after which only "pol" remains
			 * valid, "vma" is stale.
			 */
			pol_refcount = pol;
			vma = NULL;
			mpol_get(pol);
			mmap_read_unlock(mm);
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = next_node_in(current->il_prev, pol->nodes);
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		mmap_read_unlock(mm);
	if (pol_refcount)
		mpol_put(pol_refcount);
	return err;
}

#ifdef CONFIG_MIGRATION
/*
 * page migration, thp tail pages can be passed.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	struct page *head = compound_head(page);
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
		if (!isolate_lru_page(head)) {
			list_add_tail(&head->lru, pagelist);
			mod_node_page_state(page_pgdat(head),
				NR_ISOLATED_ANON + page_is_file_lru(head),
				thp_nr_pages(head));
		} else if (flags & MPOL_MF_STRICT) {
			/*
			 * Non-movable page may reach here. And, there may be
			 * temporary off LRU pages or non-LRU movable pages.
			 * Treat them as unmovable pages since they can't be
			 * isolated, so they can't be moved at the moment. It
			 * should return -EIO for this case too.
			 */
			return -EIO;
		}
	}

	return 0;
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	struct vm_area_struct *vma;
	LIST_HEAD(pagelist);
	int err = 0;
	struct migration_target_control mtc = {
		.nid = dest,
		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
	};

	nodes_clear(nmask);
	node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration. Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
	 */
	vma = find_vma(mm, 0);
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err = 0;
	nodemask_t tmp;

	lru_cache_disable();

	mmap_read_lock(mm);

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory away from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning from_tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
	mmap_read_unlock(mm);

	lru_cache_enable();
	if (err < 0)
		return err;
	return busy;
}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start by assuming the page is mapped by the same vma as contains @start.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
	struct folio *dst, *src = page_folio(page);
	struct vm_area_struct *vma;
	unsigned long address;
	VMA_ITERATOR(vmi, current->mm, start);
	gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;

	for_each_vma(vmi, vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
	}

	if (folio_test_hugetlb(src))
		return alloc_huge_page_vma(page_hstate(&src->page),
				vma, address);

	if (folio_test_large(src))
		gfp = GFP_TRANSHUGE;

	/*
	 * if !vma, vma_alloc_folio() will use task or system default policy
	 */
	dst = vma_alloc_folio(gfp, folio_order(src), vma, address,
			folio_test_large(src));
	return &dst->page;
}
#else

static int migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	return -EIO;
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

static struct page *new_page(struct page *page, unsigned long start)
{
	return NULL;
}
#endif
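
/*
 * do_mbind() is the common implementation behind mbind(2): create the new
 * policy, queue the misplaced pages in [start, start + len), apply the
 * policy to the range with mbind_range() and, when MPOL_MF_MOVE* is set,
 * migrate the queued pages so they follow it.
 */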
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	int ret;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (flags & MPOL_MF_LAZY)
		new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		lru_cache_disable();
	}
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			mmap_write_lock(mm);
			err = mpol_set_nodemask(new, nmask, scratch);
			if (err)
				mmap_write_unlock(mm);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	ret = queue_pages_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	if (ret < 0) {
		err = ret;
		goto up_out;
	}

	err = mbind_range(mm, start, end, new);

	if (!err) {
		int nr_failed = 0;

		if (!list_empty(&pagelist)) {
			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
			nr_failed = migrate_pages(&pagelist, new_page, NULL,
				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
			if (nr_failed)
				putback_movable_pages(&pagelist);
		}

		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
			err = -EIO;
	} else {
up_out:
		if (!list_empty(&pagelist))
			putback_movable_pages(&pagelist);
	}

	mmap_write_unlock(mm);
mpol_out:
	mpol_put(new);
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		lru_cache_enable();
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */
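
/*
 * Copy a user bitmap of @maxnode bits into @mask, handling both native
 * and compat layouts, and mask off any bits beyond @maxnode in the last
 * word.
 */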
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
		      unsigned long maxnode)
{
	unsigned long nlongs = BITS_TO_LONGS(maxnode);
	int ret;

	if (in_compat_syscall())
		ret = compat_get_bitmap(mask,
					(const compat_ulong_t __user *)nmask,
					maxnode);
	else
		ret = copy_from_user(mask, nmask,
				     nlongs * sizeof(unsigned long));

	if (ret)
		return -EFAULT;

	if (maxnode % BITS_PER_LONG)
		mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1;

	return 0;
}

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero, one word at a time,
	 * starting at the end.
	 */
	while (maxnode > MAX_NUMNODES) {
		unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
		unsigned long t;

		if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
			return -EFAULT;

		if (maxnode - bits >= MAX_NUMNODES) {
			maxnode -= bits;
		} else {
			maxnode = MAX_NUMNODES;
			t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
		}
		if (t)
			return -EINVAL;
	}

	return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
	bool compat = in_compat_syscall();

	if (compat)
		nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
		maxnode = nr_node_ids;
	}

	if (compat)
		return compat_put_bitmap((compat_ulong_t __user *)mask,
					 nodes_addr(*nodes), maxnode);

	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
	*flags = *mode & MPOL_MODE_FLAGS;
	*mode &= ~MPOL_MODE_FLAGS;

	if ((unsigned int)(*mode) >= MPOL_MAX)
		return -EINVAL;
	if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	if (*flags & MPOL_F_NUMA_BALANCING) {
		if (*mode != MPOL_BIND)
			return -EINVAL;
		*flags |= (MPOL_F_MOF | MPOL_F_MORON);
	}
	return 0;
}

static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	start = untagged_addr(start);
	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
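
/*
 * set_mempolicy_home_node(2): set the preferred ("home") node for the
 * MPOL_BIND / MPOL_PREFERRED_MANY ranges covering [start, start + len).
 */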
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct mempolicy *new;
	unsigned long vmstart;
	unsigned long vmend;
	unsigned long end;
	int err = -ENOENT;
	VMA_ITERATOR(vmi, mm, start);

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = PAGE_ALIGN(len);
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	mmap_write_lock(mm);
	for_each_vma_range(vmi, vma, end) {
		vmstart = max(start, vma->vm_start);
		vmend = min(end, vma->vm_end);
		new = mpol_dup(vma_policy(vma));
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}
		/*
		 * Only update home node if there is an existing vma policy
		 */
		if (!new)
			continue;

		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error. We don't reset
		 * the home node for vmas we already updated before.
		 */
		if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) {
			mpol_put(new);
			err = -EOPNOTSUPP;
			break;
		}

		new->home_node = home_node;
		err = mbind_range(mm, vmstart, vmend, new);
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}

SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned int, flags)
{
	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/* Set the process memory policy */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				 unsigned long maxnode)
{
	unsigned short mode_flags;
	nodemask_t nodes;
	int lmode = mode;
	int err;

	err = sanitize_mpol_flags(&lmode, &mode_flags);
	if (err)
		return err;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_set_mempolicy(lmode, mode_flags, &nodes);
}

SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	return kernel_set_mempolicy(mode, nmask, maxnode);
}
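
/*
 * Implementation of migrate_pages(2): move the pages of task @pid from
 * @old_nodes to @new_nodes, subject to the ptrace, cpuset and
 * CAP_SYS_NICE checks below.
 */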
b6e9b0ba
DB
1591static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1592 const unsigned long __user *old_nodes,
1593 const unsigned long __user *new_nodes)
39743889 1594{
596d7cfa 1595 struct mm_struct *mm = NULL;
39743889 1596 struct task_struct *task;
39743889
CL
1597 nodemask_t task_nodes;
1598 int err;
596d7cfa
KM
1599 nodemask_t *old;
1600 nodemask_t *new;
1601 NODEMASK_SCRATCH(scratch);
1602
1603 if (!scratch)
1604 return -ENOMEM;
39743889 1605
596d7cfa
KM
1606 old = &scratch->mask1;
1607 new = &scratch->mask2;
1608
1609 err = get_nodes(old, old_nodes, maxnode);
39743889 1610 if (err)
596d7cfa 1611 goto out;
39743889 1612
596d7cfa 1613 err = get_nodes(new, new_nodes, maxnode);
39743889 1614 if (err)
596d7cfa 1615 goto out;
39743889
CL
1616
1617 /* Find the mm_struct */
55cfaa3c 1618 rcu_read_lock();
228ebcbe 1619 task = pid ? find_task_by_vpid(pid) : current;
39743889 1620 if (!task) {
55cfaa3c 1621 rcu_read_unlock();
596d7cfa
KM
1622 err = -ESRCH;
1623 goto out;
39743889 1624 }
3268c63e 1625 get_task_struct(task);
39743889 1626
596d7cfa 1627 err = -EINVAL;
39743889
CL
1628
1629 /*
31367466
OE
1630 * Check if this process has the right to modify the specified process.
1631 * Use the regular "ptrace_may_access()" checks.
39743889 1632 */
31367466 1633 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
c69e8d9c 1634 rcu_read_unlock();
39743889 1635 err = -EPERM;
3268c63e 1636 goto out_put;
39743889 1637 }
c69e8d9c 1638 rcu_read_unlock();
39743889
CL
1639
1640 task_nodes = cpuset_mems_allowed(task);
1641 /* Is the user allowed to access the target nodes? */
596d7cfa 1642 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
39743889 1643 err = -EPERM;
3268c63e 1644 goto out_put;
39743889
CL
1645 }
1646
0486a38b
YX
1647 task_nodes = cpuset_mems_allowed(current);
1648 nodes_and(*new, *new, task_nodes);
1649 if (nodes_empty(*new))
1650 goto out_put;
1651
86c3a764
DQ
1652 err = security_task_movememory(task);
1653 if (err)
3268c63e 1654 goto out_put;
86c3a764 1655
3268c63e
CL
1656 mm = get_task_mm(task);
1657 put_task_struct(task);
f2a9ef88
SL
1658
1659 if (!mm) {
3268c63e 1660 err = -EINVAL;
f2a9ef88
SL
1661 goto out;
1662 }
1663
1664 err = do_migrate_pages(mm, old, new,
1665 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
3268c63e
CL
1666
1667 mmput(mm);
1668out:
596d7cfa
KM
1669 NODEMASK_SCRATCH_FREE(scratch);
1670
39743889 1671 return err;
3268c63e
CL
1672
1673out_put:
1674 put_task_struct(task);
1675 goto out;
1676
39743889
CL
1677}
1678
b6e9b0ba
DB
1679SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1680 const unsigned long __user *, old_nodes,
1681 const unsigned long __user *, new_nodes)
1682{
1683 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1684}
1685
39743889 1686
8bccd85f 1687/* Retrieve NUMA policy */
af03c4ac
DB
1688static int kernel_get_mempolicy(int __user *policy,
1689 unsigned long __user *nmask,
1690 unsigned long maxnode,
1691 unsigned long addr,
1692 unsigned long flags)
8bccd85f 1693{
dbcb0f19 1694 int err;
3f649ab7 1695 int pval;
8bccd85f
CL
1696 nodemask_t nodes;
1697
050c17f2 1698 if (nmask != NULL && maxnode < nr_node_ids)
8bccd85f
CL
1699 return -EINVAL;
1700
4605f057
WH
1701 addr = untagged_addr(addr);
1702
8bccd85f
CL
1703 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1704
1705 if (err)
1706 return err;
1707
1708 if (policy && put_user(pval, policy))
1709 return -EFAULT;
1710
1711 if (nmask)
1712 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1713
1714 return err;
1715}
1716
af03c4ac
DB
1717SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1718 unsigned long __user *, nmask, unsigned long, maxnode,
1719 unsigned long, addr, unsigned long, flags)
1720{
1721 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1722}
1723
20ca87f2
LX
1724bool vma_migratable(struct vm_area_struct *vma)
1725{
1726 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
1727 return false;
1728
1729 /*
1730 * DAX device mappings require predictable access latency, so avoid
1731 * incurring periodic faults.
1732 */
1733 if (vma_is_dax(vma))
1734 return false;
1735
1736 if (is_vm_hugetlb_page(vma) &&
1737 !hugepage_migration_supported(hstate_vma(vma)))
1738 return false;
1739
1740 /*
1741 * Migration allocates pages in the highest zone. If we cannot
1742 * do so then migration (at least from node to node) is not
1743 * possible.
1744 */
1745 if (vma->vm_file &&
1746 gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
1747 < policy_zone)
1748 return false;
1749 return true;
1750}
1751
74d2c3a0
ON
1752struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1753 unsigned long addr)
1da177e4 1754{
8d90274b 1755 struct mempolicy *pol = NULL;
1da177e4
LT
1756
1757 if (vma) {
480eccf9 1758 if (vma->vm_ops && vma->vm_ops->get_policy) {
8d90274b 1759 pol = vma->vm_ops->get_policy(vma, addr);
00442ad0 1760 } else if (vma->vm_policy) {
1da177e4 1761 pol = vma->vm_policy;
00442ad0
MG
1762
1763 /*
1764 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1765 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1766 * count on these policies which will be dropped by
1767 * mpol_cond_put() later
1768 */
1769 if (mpol_needs_cond_ref(pol))
1770 mpol_get(pol);
1771 }
1da177e4 1772 }
f15ca78e 1773
74d2c3a0
ON
1774 return pol;
1775}
1776
1777/*
dd6eecb9 1778 * get_vma_policy(@vma, @addr)
74d2c3a0
ON
1779 * @vma: virtual memory area whose policy is sought
1780 * @addr: address in @vma for shared policy lookup
1781 *
1782 * Returns effective policy for a VMA at specified address.
dd6eecb9 1783 * Falls back to current->mempolicy or system default policy, as necessary.
74d2c3a0
ON
1784 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1785 * count--added by the get_policy() vm_op, as appropriate--to protect against
1786 * freeing by another task. It is the caller's responsibility to free the
1787 * extra reference for shared policies.
1788 */
ac79f78d 1789static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
dd6eecb9 1790 unsigned long addr)
74d2c3a0
ON
1791{
1792 struct mempolicy *pol = __get_vma_policy(vma, addr);
1793
8d90274b 1794 if (!pol)
dd6eecb9 1795 pol = get_task_policy(current);
8d90274b 1796
1da177e4
LT
1797 return pol;
1798}
1799
6b6482bb 1800bool vma_policy_mof(struct vm_area_struct *vma)
fc314724 1801{
6b6482bb 1802 struct mempolicy *pol;
fc314724 1803
6b6482bb
ON
1804 if (vma->vm_ops && vma->vm_ops->get_policy) {
1805 bool ret = false;
fc314724 1806
6b6482bb
ON
1807 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1808 if (pol && (pol->flags & MPOL_F_MOF))
1809 ret = true;
1810 mpol_cond_put(pol);
8d90274b 1811
6b6482bb 1812 return ret;
fc314724
MG
1813 }
1814
6b6482bb 1815 pol = vma->vm_policy;
8d90274b 1816 if (!pol)
6b6482bb 1817 pol = get_task_policy(current);
8d90274b 1818
fc314724
MG
1819 return pol->flags & MPOL_F_MOF;
1820}
1821
d2226ebd 1822bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
d3eb1570
LJ
1823{
1824 enum zone_type dynamic_policy_zone = policy_zone;
1825
1826 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1827
1828 /*
269fbe72 1829 * If policy->nodes has movable memory only,
d3eb1570
LJ
1830 * we apply the policy only when gfp_zone(gfp) is ZONE_MOVABLE.
1831 *
269fbe72 1832 * policy->nodes intersects with node_states[N_MEMORY],
f0953a1b 1833 * so if the following test fails, it implies that
269fbe72 1834 * policy->nodes has movable memory only.
d3eb1570 1835 */
269fbe72 1836 if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
d3eb1570
LJ
1837 dynamic_policy_zone = ZONE_MOVABLE;
1838
1839 return zone >= dynamic_policy_zone;
1840}
1841
52cd3b07
LS
1842/*
1843 * Return a nodemask representing a mempolicy for filtering nodes for
1844 * page allocation
1845 */
8ca39e68 1846nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
19770b32 1847{
b27abacc
DH
1848 int mode = policy->mode;
1849
19770b32 1850 /* Lower zones don't get a nodemask applied for MPOL_BIND */
b27abacc
DH
1851 if (unlikely(mode == MPOL_BIND) &&
1852 apply_policy_zone(policy, gfp_zone(gfp)) &&
1853 cpuset_nodemask_valid_mems_allowed(&policy->nodes))
1854 return &policy->nodes;
1855
1856 if (mode == MPOL_PREFERRED_MANY)
269fbe72 1857 return &policy->nodes;
19770b32
MG
1858
1859 return NULL;
1860}
1861
b27abacc
DH
1862/*
1863 * Return the preferred node id for 'prefer' mempolicy, and return
1864 * the given id for all other policies.
1865 *
1866 * policy_node() is always coupled with policy_nodemask(), which
1867 * supplies the limiting nodemask for the 'bind' and 'prefer-many' policies.
1868 */
f8fd5253 1869static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
1da177e4 1870{
7858d7bc 1871 if (policy->mode == MPOL_PREFERRED) {
269fbe72 1872 nd = first_node(policy->nodes);
7858d7bc 1873 } else {
19770b32 1874 /*
6d840958
MH
1875 * __GFP_THISNODE shouldn't even be used with the bind policy
1876 * because we might easily break the expectation to stay on the
1877 * requested node and not break the policy.
19770b32 1878 */
6d840958 1879 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1da177e4 1880 }
6d840958 1881
c6018b4b
AK
1882 if ((policy->mode == MPOL_BIND ||
1883 policy->mode == MPOL_PREFERRED_MANY) &&
1884 policy->home_node != NUMA_NO_NODE)
1885 return policy->home_node;
1886
04ec6264 1887 return nd;
1da177e4
LT
1888}
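/*
 * Illustrative caller sketch of the coupling noted above; it mirrors what
 * alloc_pages() does later in this file: policy_nodemask() supplies the
 * (possibly NULL) filter mask and policy_node() picks the preferred node
 * id that goes with it.
 *
 *	nodemask_t *nmask = policy_nodemask(gfp, pol);
 *	int preferred_nid = policy_node(gfp, pol, numa_node_id());
 *	struct page *page = __alloc_pages(gfp, order, preferred_nid, nmask);
 */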
1889
1890/* Do dynamic interleaving for a process */
1891static unsigned interleave_nodes(struct mempolicy *policy)
1892{
45816682 1893 unsigned next;
1da177e4
LT
1894 struct task_struct *me = current;
1895
269fbe72 1896 next = next_node_in(me->il_prev, policy->nodes);
f5b087b5 1897 if (next < MAX_NUMNODES)
45816682
VB
1898 me->il_prev = next;
1899 return next;
1da177e4
LT
1900}
1901
dc85da15
CL
1902/*
1903 * Depending on the memory policy provide a node from which to allocate the
1904 * next slab entry.
1905 */
2a389610 1906unsigned int mempolicy_slab_node(void)
dc85da15 1907{
e7b691b0 1908 struct mempolicy *policy;
2a389610 1909 int node = numa_mem_id();
e7b691b0 1910
38b031dd 1911 if (!in_task())
2a389610 1912 return node;
e7b691b0
AK
1913
1914 policy = current->mempolicy;
7858d7bc 1915 if (!policy)
2a389610 1916 return node;
bea904d5
LS
1917
1918 switch (policy->mode) {
1919 case MPOL_PREFERRED:
269fbe72 1920 return first_node(policy->nodes);
765c4507 1921
dc85da15
CL
1922 case MPOL_INTERLEAVE:
1923 return interleave_nodes(policy);
1924
b27abacc
DH
1925 case MPOL_BIND:
1926 case MPOL_PREFERRED_MANY:
1927 {
c33d6c06
MG
1928 struct zoneref *z;
1929
dc85da15
CL
1930 /*
1931 * Follow bind policy behavior and start allocation at the
1932 * first node.
1933 */
19770b32 1934 struct zonelist *zonelist;
19770b32 1935 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
c9634cf0 1936 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
c33d6c06 1937 z = first_zones_zonelist(zonelist, highest_zoneidx,
269fbe72 1938 &policy->nodes);
c1093b74 1939 return z->zone ? zone_to_nid(z->zone) : node;
dd1a239f 1940 }
7858d7bc
FT
1941 case MPOL_LOCAL:
1942 return node;
dc85da15 1943
dc85da15 1944 default:
bea904d5 1945 BUG();
dc85da15
CL
1946 }
1947}
1948
fee83b3a
AM
1949/*
1950 * Do static interleaving for a VMA with known offset @n. Returns the n'th
269fbe72 1951 * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
fee83b3a
AM
1952 * number of present nodes.
1953 */
98c70baa 1954static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1da177e4 1955{
276aeee1 1956 nodemask_t nodemask = pol->nodes;
1957 unsigned int target, nnodes;
fee83b3a
AM
1958 int i;
1959 int nid;
276aeee1 1960 /*
1961 * The barrier will stabilize the nodemask in a register or on
1962 * the stack so that it will stop changing under the code.
1963 *
1964 * Between first_node() and next_node(), pol->nodes could be changed
1965 * by other threads. So we put pol->nodes in a local stack.
1966 */
1967 barrier();
1da177e4 1968
276aeee1 1969 nnodes = nodes_weight(nodemask);
f5b087b5
DR
1970 if (!nnodes)
1971 return numa_node_id();
fee83b3a 1972 target = (unsigned int)n % nnodes;
276aeee1 1973 nid = first_node(nodemask);
fee83b3a 1974 for (i = 0; i < target; i++)
276aeee1 1975 nid = next_node(nid, nodemask);
1da177e4
LT
1976 return nid;
1977}
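/*
 * Worked example (assumed values): with pol->nodes = {0,2,5} and n = 4,
 * nnodes = 3 and target = 4 % 3 = 1, so the walk starts at node 0 and
 * steps once, returning nid = 2.
 */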
1978
5da7ca86
CL
1979/* Determine a node number for interleave */
1980static inline unsigned interleave_nid(struct mempolicy *pol,
1981 struct vm_area_struct *vma, unsigned long addr, int shift)
1982{
1983 if (vma) {
1984 unsigned long off;
1985
3b98b087
NA
1986 /*
1987 * for small pages, there is no difference between
1988 * shift and PAGE_SHIFT, so the bit-shift is safe.
1989 * for huge pages, since vm_pgoff is in units of small
1990 * pages, we need to shift off the always 0 bits to get
1991 * a useful offset.
1992 */
1993 BUG_ON(shift < PAGE_SHIFT);
1994 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
5da7ca86 1995 off += (addr - vma->vm_start) >> shift;
98c70baa 1996 return offset_il_node(pol, off);
5da7ca86
CL
1997 } else
1998 return interleave_nodes(pol);
1999}
2000
00ac59ad 2001#ifdef CONFIG_HUGETLBFS
480eccf9 2002/*
04ec6264 2003 * huge_node(@vma, @addr, @gfp_flags, @mpol)
b46e14ac
FF
2004 * @vma: virtual memory area whose policy is sought
2005 * @addr: address in @vma for shared policy lookup and interleave policy
2006 * @gfp_flags: for requested zone
2007 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
b27abacc 2008 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
480eccf9 2009 *
04ec6264 2010 * Returns a nid suitable for a huge page allocation and a pointer
52cd3b07 2011 * to the struct mempolicy for conditional unref after allocation.
b27abacc
DH
2012 * If the effective policy is 'bind' or 'prefer-many', returns a pointer
2013 * to the mempolicy's @nodemask for filtering the zonelist.
c0ff7453 2014 *
d26914d1 2015 * Must be protected by read_mems_allowed_begin()
480eccf9 2016 */
04ec6264
VB
2017int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
2018 struct mempolicy **mpol, nodemask_t **nodemask)
5da7ca86 2019{
04ec6264 2020 int nid;
b27abacc 2021 int mode;
5da7ca86 2022
dd6eecb9 2023 *mpol = get_vma_policy(vma, addr);
b27abacc
DH
2024 *nodemask = NULL;
2025 mode = (*mpol)->mode;
5da7ca86 2026
b27abacc 2027 if (unlikely(mode == MPOL_INTERLEAVE)) {
04ec6264
VB
2028 nid = interleave_nid(*mpol, vma, addr,
2029 huge_page_shift(hstate_vma(vma)));
52cd3b07 2030 } else {
04ec6264 2031 nid = policy_node(gfp_flags, *mpol, numa_node_id());
b27abacc 2032 if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
269fbe72 2033 *nodemask = &(*mpol)->nodes;
480eccf9 2034 }
04ec6264 2035 return nid;
5da7ca86 2036}
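/*
 * Illustrative caller sketch (the gfp flags and the allocation step are
 * assumptions for the example); the hugetlb fault path uses this shape
 * under read_mems_allowed_begin():
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	int nid;
 *
 *	nid = huge_node(vma, addr, GFP_HIGHUSER_MOVABLE, &mpol, &nodemask);
 *	... allocate the huge page from nid, filtered by nodemask ...
 *	mpol_cond_put(mpol);
 */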
06808b08
LS
2037
2038/*
2039 * init_nodemask_of_mempolicy
2040 *
2041 * If the current task's mempolicy is "default" [NULL], return 'false'
2042 * to indicate default policy. Otherwise, extract the policy nodemask
2043 * for 'bind' or 'interleave' policy into the argument nodemask, or
2044 * initialize the argument nodemask to contain the single node for
2045 * 'preferred' or 'local' policy and return 'true' to indicate presence
2046 * of non-default mempolicy.
2047 *
2048 * We don't bother with reference counting the mempolicy [mpol_get/put]
2049 * because the current task is examining its own mempolicy and a task's
2050 * mempolicy is only ever changed by the task itself.
2051 *
2052 * N.B., it is the caller's responsibility to free a returned nodemask.
2053 */
2054bool init_nodemask_of_mempolicy(nodemask_t *mask)
2055{
2056 struct mempolicy *mempolicy;
06808b08
LS
2057
2058 if (!(mask && current->mempolicy))
2059 return false;
2060
c0ff7453 2061 task_lock(current);
06808b08
LS
2062 mempolicy = current->mempolicy;
2063 switch (mempolicy->mode) {
2064 case MPOL_PREFERRED:
b27abacc 2065 case MPOL_PREFERRED_MANY:
06808b08 2066 case MPOL_BIND:
06808b08 2067 case MPOL_INTERLEAVE:
269fbe72 2068 *mask = mempolicy->nodes;
7858d7bc
FT
2069 break;
2070
2071 case MPOL_LOCAL:
269fbe72 2072 init_nodemask_of_node(mask, numa_node_id());
06808b08
LS
2073 break;
2074
2075 default:
2076 BUG();
2077 }
c0ff7453 2078 task_unlock(current);
06808b08
LS
2079
2080 return true;
2081}
00ac59ad 2082#endif
5da7ca86 2083
6f48d0eb 2084/*
b26e517a 2085 * mempolicy_in_oom_domain
6f48d0eb 2086 *
b26e517a
FT
2087 * If tsk's mempolicy is "bind", check for intersection between mask and
2088 * the policy nodemask. Otherwise, return true for all other policies
2089 * including "interleave", as a tsk with "interleave" policy may have
2090 * memory allocated from all nodes in the system.
6f48d0eb
DR
2091 *
2092 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
2093 */
b26e517a 2094bool mempolicy_in_oom_domain(struct task_struct *tsk,
6f48d0eb
DR
2095 const nodemask_t *mask)
2096{
2097 struct mempolicy *mempolicy;
2098 bool ret = true;
2099
2100 if (!mask)
2101 return ret;
b26e517a 2102
6f48d0eb
DR
2103 task_lock(tsk);
2104 mempolicy = tsk->mempolicy;
b26e517a 2105 if (mempolicy && mempolicy->mode == MPOL_BIND)
269fbe72 2106 ret = nodes_intersects(mempolicy->nodes, *mask);
6f48d0eb 2107 task_unlock(tsk);
b26e517a 2108
6f48d0eb
DR
2109 return ret;
2110}
2111
1da177e4
LT
2112/* Allocate a page under the interleave policy.
2113 It has its own path because it needs to do special accounting. */
662f3a0b
AK
2114static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2115 unsigned nid)
1da177e4 2116{
1da177e4
LT
2117 struct page *page;
2118
84172f4b 2119 page = __alloc_pages(gfp, order, nid, NULL);
4518085e
KW
2120 /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
2121 if (!static_branch_likely(&vm_numa_stat_key))
2122 return page;
de55c8b2
AR
2123 if (page && page_to_nid(page) == nid) {
2124 preempt_disable();
f19298b9 2125 __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
de55c8b2
AR
2126 preempt_enable();
2127 }
1da177e4
LT
2128 return page;
2129}
2130
4c54d949
FT
2131static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
2132 int nid, struct mempolicy *pol)
2133{
2134 struct page *page;
2135 gfp_t preferred_gfp;
2136
2137 /*
2138 * This is a two pass approach. The first pass will only try the
2139 * preferred nodes but skip the direct reclaim and allow the
2140 * allocation to fail, while the second pass will try all the
2141 * nodes in system.
2142 */
2143 preferred_gfp = gfp | __GFP_NOWARN;
2144 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2145 page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
2146 if (!page)
c0455116 2147 page = __alloc_pages(gfp, order, nid, NULL);
4c54d949
FT
2148
2149 return page;
2150}
2151
1da177e4 2152/**
adf88aa8 2153 * vma_alloc_folio - Allocate a folio for a VMA.
eb350739 2154 * @gfp: GFP flags.
adf88aa8 2155 * @order: Order of the folio.
eb350739
MWO
2156 * @vma: Pointer to VMA or NULL if not available.
2157 * @addr: Virtual address of the allocation. Must be inside @vma.
eb350739 2158 * @hugepage: For hugepages try only the preferred node if possible.
1da177e4 2159 *
adf88aa8 2160 * Allocate a folio for a specific address in @vma, using the appropriate
eb350739
MWO
2161 * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
2162 * of the mm_struct of the VMA to prevent it from going away. Should be
adf88aa8 2163 * used for all allocations for folios that will be mapped into user space.
1da177e4 2164 *
adf88aa8 2165 * Return: The folio on success or NULL if allocation fails.
1da177e4 2166 */
adf88aa8 2167struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
be1a13eb 2168 unsigned long addr, bool hugepage)
1da177e4 2169{
cc9a6c87 2170 struct mempolicy *pol;
be1a13eb 2171 int node = numa_node_id();
adf88aa8 2172 struct folio *folio;
04ec6264 2173 int preferred_nid;
be97a41b 2174 nodemask_t *nmask;
cc9a6c87 2175
dd6eecb9 2176 pol = get_vma_policy(vma, addr);
1da177e4 2177
0867a57c 2178 if (pol->mode == MPOL_INTERLEAVE) {
adf88aa8 2179 struct page *page;
0867a57c
VB
2180 unsigned nid;
2181
2182 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2183 mpol_cond_put(pol);
adf88aa8 2184 gfp |= __GFP_COMP;
0867a57c 2185 page = alloc_page_interleave(gfp, order, nid);
adf88aa8
MWO
2186 if (page && order > 1)
2187 prep_transhuge_page(page);
2188 folio = (struct folio *)page;
0867a57c 2189 goto out;
19deb769
DR
2190 }
2191
4c54d949 2192 if (pol->mode == MPOL_PREFERRED_MANY) {
adf88aa8
MWO
2193 struct page *page;
2194
c0455116 2195 node = policy_node(gfp, pol, node);
adf88aa8 2196 gfp |= __GFP_COMP;
4c54d949
FT
2197 page = alloc_pages_preferred_many(gfp, order, node, pol);
2198 mpol_cond_put(pol);
adf88aa8
MWO
2199 if (page && order > 1)
2200 prep_transhuge_page(page);
2201 folio = (struct folio *)page;
4c54d949
FT
2202 goto out;
2203 }
2204
19deb769
DR
2205 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2206 int hpage_node = node;
2207
2208 /*
2209 * For hugepage allocation and non-interleave policy which
2210 * allows the current node (or other explicitly preferred
2211 * node) we only try to allocate from the current/preferred
2212 * node and don't fall back to other nodes, as the cost of
2213 * remote accesses would likely offset THP benefits.
2214 *
b27abacc 2215 * If the policy is interleave or does not allow the current
19deb769
DR
2216 * node in its nodemask, we allocate the standard way.
2217 */
7858d7bc 2218 if (pol->mode == MPOL_PREFERRED)
269fbe72 2219 hpage_node = first_node(pol->nodes);
19deb769
DR
2220
2221 nmask = policy_nodemask(gfp, pol);
2222 if (!nmask || node_isset(hpage_node, *nmask)) {
2223 mpol_cond_put(pol);
cc638f32
VB
2224 /*
2225 * First, try to allocate THP only on local node, but
2226 * don't reclaim unnecessarily, just compact.
2227 */
adf88aa8
MWO
2228 folio = __folio_alloc_node(gfp | __GFP_THISNODE |
2229 __GFP_NORETRY, order, hpage_node);
76e654cc
DR
2230
2231 /*
2232 * If hugepage allocations are configured to always
2233 * synchronous compact or the vma has been madvised
2234 * to prefer hugepage backing, retry allowing remote
cc638f32 2235 * memory with both reclaim and compact as well.
76e654cc 2236 */
adf88aa8
MWO
2237 if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
2238 folio = __folio_alloc(gfp, order, hpage_node,
2239 nmask);
76e654cc 2240
19deb769
DR
2241 goto out;
2242 }
356ff8a9
DR
2243 }
2244
be97a41b 2245 nmask = policy_nodemask(gfp, pol);
04ec6264 2246 preferred_nid = policy_node(gfp, pol, node);
adf88aa8 2247 folio = __folio_alloc(gfp, order, preferred_nid, nmask);
d51e9894 2248 mpol_cond_put(pol);
be97a41b 2249out:
f584b680
MWO
2250 return folio;
2251}
adf88aa8 2252EXPORT_SYMBOL(vma_alloc_folio);
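/*
 * Illustrative caller sketch (not part of this file): an order-0
 * anonymous-fault style allocation. The caller is assumed to hold the
 * mmap_lock for @vma's mm, as the comment above requires.
 *
 *	struct folio *folio;
 *
 *	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
 *	if (!folio)
 *		return VM_FAULT_OOM;
 */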
f584b680 2253
1da177e4 2254/**
6421ec76
MWO
2255 * alloc_pages - Allocate pages.
2256 * @gfp: GFP flags.
2257 * @order: Power of two of number of pages to allocate.
1da177e4 2258 *
6421ec76
MWO
2259 * Allocate 1 << @order contiguous pages. The physical address of the
2260 * first page is naturally aligned (eg an order-3 allocation will be aligned
2261 * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current
2262 * process is honoured when in process context.
1da177e4 2263 *
6421ec76
MWO
2264 * Context: Can be called from any context, providing the appropriate GFP
2265 * flags are used.
2266 * Return: The page on success or NULL if allocation fails.
1da177e4 2267 */
d7f946d0 2268struct page *alloc_pages(gfp_t gfp, unsigned order)
1da177e4 2269{
8d90274b 2270 struct mempolicy *pol = &default_policy;
c0ff7453 2271 struct page *page;
1da177e4 2272
8d90274b
ON
2273 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2274 pol = get_task_policy(current);
52cd3b07
LS
2275
2276 /*
2277 * No reference counting needed for current->mempolicy
2278 * nor system default_policy
2279 */
45c4745a 2280 if (pol->mode == MPOL_INTERLEAVE)
c0ff7453 2281 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
4c54d949
FT
2282 else if (pol->mode == MPOL_PREFERRED_MANY)
2283 page = alloc_pages_preferred_many(gfp, order,
c0455116 2284 policy_node(gfp, pol, numa_node_id()), pol);
c0ff7453 2285 else
84172f4b 2286 page = __alloc_pages(gfp, order,
04ec6264 2287 policy_node(gfp, pol, numa_node_id()),
5c4b4be3 2288 policy_nodemask(gfp, pol));
cc9a6c87 2289
c0ff7453 2290 return page;
1da177e4 2291}
d7f946d0 2292EXPORT_SYMBOL(alloc_pages);
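/*
 * Illustrative sketch (not part of this file; the function name is made
 * up): a typical kernel user of alloc_pages() grabbing four contiguous
 * pages and releasing them again.
 */
static int example_use_alloc_pages(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* 1 << 2 pages */

	if (!page)
		return -ENOMEM;
	/* ... touch the memory via page_address(page) ... */
	__free_pages(page, 2);
	return 0;
}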
1da177e4 2293
cc09cb13
MWO
2294struct folio *folio_alloc(gfp_t gfp, unsigned order)
2295{
2296 struct page *page = alloc_pages(gfp | __GFP_COMP, order);
2297
2298 if (page && order > 1)
2299 prep_transhuge_page(page);
2300 return (struct folio *)page;
2301}
2302EXPORT_SYMBOL(folio_alloc);
2303
c00b6b96
CW
2304static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
2305 struct mempolicy *pol, unsigned long nr_pages,
2306 struct page **page_array)
2307{
2308 int nodes;
2309 unsigned long nr_pages_per_node;
2310 int delta;
2311 int i;
2312 unsigned long nr_allocated;
2313 unsigned long total_allocated = 0;
2314
2315 nodes = nodes_weight(pol->nodes);
2316 nr_pages_per_node = nr_pages / nodes;
2317 delta = nr_pages - nodes * nr_pages_per_node;
2318
2319 for (i = 0; i < nodes; i++) {
2320 if (delta) {
2321 nr_allocated = __alloc_pages_bulk(gfp,
2322 interleave_nodes(pol), NULL,
2323 nr_pages_per_node + 1, NULL,
2324 page_array);
2325 delta--;
2326 } else {
2327 nr_allocated = __alloc_pages_bulk(gfp,
2328 interleave_nodes(pol), NULL,
2329 nr_pages_per_node, NULL, page_array);
2330 }
2331
2332 page_array += nr_allocated;
2333 total_allocated += nr_allocated;
2334 }
2335
2336 return total_allocated;
2337}
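/*
 * Worked example (assumed values): nr_pages = 10 over a 3-node interleave
 * mask gives nr_pages_per_node = 3 and delta = 1, so the loop above asks
 * the three nodes for 4, 3 and 3 pages respectively.
 */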
2338
2339static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
2340 struct mempolicy *pol, unsigned long nr_pages,
2341 struct page **page_array)
2342{
2343 gfp_t preferred_gfp;
2344 unsigned long nr_allocated = 0;
2345
2346 preferred_gfp = gfp | __GFP_NOWARN;
2347 preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2348
2349 nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
2350 nr_pages, NULL, page_array);
2351
2352 if (nr_allocated < nr_pages)
2353 nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
2354 nr_pages - nr_allocated, NULL,
2355 page_array + nr_allocated);
2356 return nr_allocated;
2357}
2358
2359/* Bulk page allocation and the mempolicy should be considered at the
2360 * same time in some situations, such as vmalloc.
2361 *
2362 * This can accelerate memory allocation, especially for the
2363 * interleave policy.
2364 */
2365unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
2366 unsigned long nr_pages, struct page **page_array)
2367{
2368 struct mempolicy *pol = &default_policy;
2369
2370 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2371 pol = get_task_policy(current);
2372
2373 if (pol->mode == MPOL_INTERLEAVE)
2374 return alloc_pages_bulk_array_interleave(gfp, pol,
2375 nr_pages, page_array);
2376
2377 if (pol->mode == MPOL_PREFERRED_MANY)
2378 return alloc_pages_bulk_array_preferred_many(gfp,
2379 numa_node_id(), pol, nr_pages, page_array);
2380
2381 return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
2382 policy_nodemask(gfp, pol), nr_pages, NULL,
2383 page_array);
2384}
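/*
 * Illustrative caller sketch (the surrounding shape is an assumption);
 * vmalloc's page-array population is the intended user mentioned above:
 *
 *	struct page **pages = kvcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
 *	unsigned long got;
 *
 *	if (!pages)
 *		return -ENOMEM;
 *	got = alloc_pages_bulk_array_mempolicy(GFP_KERNEL, nr_pages, pages);
 *	... 'got' may be short of nr_pages; the caller must cope with that ...
 */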
2385
ef0855d3
ON
2386int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2387{
2388 struct mempolicy *pol = mpol_dup(vma_policy(src));
2389
2390 if (IS_ERR(pol))
2391 return PTR_ERR(pol);
2392 dst->vm_policy = pol;
2393 return 0;
2394}
2395
4225399a 2396/*
846a16bf 2397 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
4225399a
PJ
2398 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2399 * with the mems_allowed returned by cpuset_mems_allowed(). This
2400 * keeps mempolicies cpuset relative after its cpuset moves. See
2401 * further kernel/cpuset.c update_nodemask().
708c1bbc
MX
2402 *
2403 * current's mempolicy may be rebound by another task (the task that changes
2404 * the cpuset's mems), so we needn't do rebind work for the current task.
4225399a 2405 */
4225399a 2406
846a16bf
LS
2407/* Slow path of a mempolicy duplicate */
2408struct mempolicy *__mpol_dup(struct mempolicy *old)
1da177e4
LT
2409{
2410 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2411
2412 if (!new)
2413 return ERR_PTR(-ENOMEM);
708c1bbc
MX
2414
2415 /* task's mempolicy is protected by alloc_lock */
2416 if (old == current->mempolicy) {
2417 task_lock(current);
2418 *new = *old;
2419 task_unlock(current);
2420 } else
2421 *new = *old;
2422
4225399a
PJ
2423 if (current_cpuset_is_being_rebound()) {
2424 nodemask_t mems = cpuset_mems_allowed(current);
213980c0 2425 mpol_rebind_policy(new, &mems);
4225399a 2426 }
1da177e4 2427 atomic_set(&new->refcnt, 1);
1da177e4
LT
2428 return new;
2429}
2430
2431/* Slow path of a mempolicy comparison */
fcfb4dcc 2432bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1da177e4
LT
2433{
2434 if (!a || !b)
fcfb4dcc 2435 return false;
45c4745a 2436 if (a->mode != b->mode)
fcfb4dcc 2437 return false;
19800502 2438 if (a->flags != b->flags)
fcfb4dcc 2439 return false;
c6018b4b
AK
2440 if (a->home_node != b->home_node)
2441 return false;
19800502
BL
2442 if (mpol_store_user_nodemask(a))
2443 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
fcfb4dcc 2444 return false;
19800502 2445
45c4745a 2446 switch (a->mode) {
19770b32 2447 case MPOL_BIND:
1da177e4 2448 case MPOL_INTERLEAVE:
1da177e4 2449 case MPOL_PREFERRED:
b27abacc 2450 case MPOL_PREFERRED_MANY:
269fbe72 2451 return !!nodes_equal(a->nodes, b->nodes);
7858d7bc
FT
2452 case MPOL_LOCAL:
2453 return true;
1da177e4
LT
2454 default:
2455 BUG();
fcfb4dcc 2456 return false;
1da177e4
LT
2457 }
2458}
2459
1da177e4
LT
2460/*
2461 * Shared memory backing store policy support.
2462 *
2463 * Remember policies even when nobody has shared memory mapped.
2464 * The policies are kept in Red-Black tree linked from the inode.
4a8c7bb5 2465 * They are protected by the sp->lock rwlock, which should be held
1da177e4
LT
2466 * for any accesses to the tree.
2467 */
2468
4a8c7bb5
NZ
2469/*
2470 * lookup first element intersecting start-end. Caller holds sp->lock for
2471 * reading or for writing
2472 */
1da177e4
LT
2473static struct sp_node *
2474sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2475{
2476 struct rb_node *n = sp->root.rb_node;
2477
2478 while (n) {
2479 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2480
2481 if (start >= p->end)
2482 n = n->rb_right;
2483 else if (end <= p->start)
2484 n = n->rb_left;
2485 else
2486 break;
2487 }
2488 if (!n)
2489 return NULL;
2490 for (;;) {
2491 struct sp_node *w = NULL;
2492 struct rb_node *prev = rb_prev(n);
2493 if (!prev)
2494 break;
2495 w = rb_entry(prev, struct sp_node, nd);
2496 if (w->end <= start)
2497 break;
2498 n = prev;
2499 }
2500 return rb_entry(n, struct sp_node, nd);
2501}
2502
4a8c7bb5
NZ
2503/*
2504 * Insert a new shared policy into the list. Caller holds sp->lock for
2505 * writing.
2506 */
1da177e4
LT
2507static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2508{
2509 struct rb_node **p = &sp->root.rb_node;
2510 struct rb_node *parent = NULL;
2511 struct sp_node *nd;
2512
2513 while (*p) {
2514 parent = *p;
2515 nd = rb_entry(parent, struct sp_node, nd);
2516 if (new->start < nd->start)
2517 p = &(*p)->rb_left;
2518 else if (new->end > nd->end)
2519 p = &(*p)->rb_right;
2520 else
2521 BUG();
2522 }
2523 rb_link_node(&new->nd, parent, p);
2524 rb_insert_color(&new->nd, &sp->root);
140d5a49 2525 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
45c4745a 2526 new->policy ? new->policy->mode : 0);
1da177e4
LT
2527}
2528
2529/* Find shared policy intersecting idx */
2530struct mempolicy *
2531mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2532{
2533 struct mempolicy *pol = NULL;
2534 struct sp_node *sn;
2535
2536 if (!sp->root.rb_node)
2537 return NULL;
4a8c7bb5 2538 read_lock(&sp->lock);
1da177e4
LT
2539 sn = sp_lookup(sp, idx, idx+1);
2540 if (sn) {
2541 mpol_get(sn->policy);
2542 pol = sn->policy;
2543 }
4a8c7bb5 2544 read_unlock(&sp->lock);
1da177e4
LT
2545 return pol;
2546}
2547
63f74ca2
KM
2548static void sp_free(struct sp_node *n)
2549{
2550 mpol_put(n->policy);
2551 kmem_cache_free(sn_cache, n);
2552}
2553
771fb4d8
LS
2554/**
2555 * mpol_misplaced - check whether current page node is valid in policy
2556 *
b46e14ac
FF
2557 * @page: page to be checked
2558 * @vma: vm area where page mapped
2559 * @addr: virtual address where page mapped
771fb4d8
LS
2560 *
2561 * Lookup current policy node id for vma,addr and "compare to" page's
5f076944 2562 * node id. Policy determination "mimics" alloc_page_vma().
771fb4d8 2563 * Called from fault path where we know the vma and faulting address.
5f076944 2564 *
062db293
BW
2565 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
2566 * policy, or a suitable node ID to allocate a replacement page from.
771fb4d8
LS
2567 */
2568int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2569{
2570 struct mempolicy *pol;
c33d6c06 2571 struct zoneref *z;
771fb4d8
LS
2572 int curnid = page_to_nid(page);
2573 unsigned long pgoff;
90572890
PZ
2574 int thiscpu = raw_smp_processor_id();
2575 int thisnid = cpu_to_node(thiscpu);
98fa15f3 2576 int polnid = NUMA_NO_NODE;
062db293 2577 int ret = NUMA_NO_NODE;
771fb4d8 2578
dd6eecb9 2579 pol = get_vma_policy(vma, addr);
771fb4d8
LS
2580 if (!(pol->flags & MPOL_F_MOF))
2581 goto out;
2582
2583 switch (pol->mode) {
2584 case MPOL_INTERLEAVE:
771fb4d8
LS
2585 pgoff = vma->vm_pgoff;
2586 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
98c70baa 2587 polnid = offset_il_node(pol, pgoff);
771fb4d8
LS
2588 break;
2589
2590 case MPOL_PREFERRED:
b27abacc
DH
2591 if (node_isset(curnid, pol->nodes))
2592 goto out;
269fbe72 2593 polnid = first_node(pol->nodes);
7858d7bc
FT
2594 break;
2595
2596 case MPOL_LOCAL:
2597 polnid = numa_node_id();
771fb4d8
LS
2598 break;
2599
2600 case MPOL_BIND:
bda420b9
HY
2601 /* Optimize placement among multiple nodes via NUMA balancing */
2602 if (pol->flags & MPOL_F_MORON) {
269fbe72 2603 if (node_isset(thisnid, pol->nodes))
bda420b9
HY
2604 break;
2605 goto out;
2606 }
b27abacc 2607 fallthrough;
c33d6c06 2608
b27abacc 2609 case MPOL_PREFERRED_MANY:
771fb4d8 2610 /*
771fb4d8
LS
2611 * use current page if in policy nodemask,
2612 * else select nearest allowed node, if any.
2613 * If no allowed nodes, use current [!misplaced].
2614 */
269fbe72 2615 if (node_isset(curnid, pol->nodes))
771fb4d8 2616 goto out;
c33d6c06 2617 z = first_zones_zonelist(
771fb4d8
LS
2618 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2619 gfp_zone(GFP_HIGHUSER),
269fbe72 2620 &pol->nodes);
c1093b74 2621 polnid = zone_to_nid(z->zone);
771fb4d8
LS
2622 break;
2623
2624 default:
2625 BUG();
2626 }
5606e387
MG
2627
2628 /* Migrate the page towards the node whose CPU is referencing it */
e42c8ff2 2629 if (pol->flags & MPOL_F_MORON) {
90572890 2630 polnid = thisnid;
5606e387 2631
10f39042 2632 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
de1c9ce6 2633 goto out;
e42c8ff2
MG
2634 }
2635
771fb4d8
LS
2636 if (curnid != polnid)
2637 ret = polnid;
2638out:
2639 mpol_cond_put(pol);
2640
2641 return ret;
2642}
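/*
 * Illustrative caller sketch: the NUMA-hinting fault path compares the
 * faulting page's node against the policy and migrates on mismatch
 * (the exact migration helper varies by kernel version):
 *
 *	int target_nid = mpol_misplaced(page, vma, addr);
 *
 *	if (target_nid == NUMA_NO_NODE)
 *		return;		... page already sits on an allowed node ...
 *	... else try to migrate the page towards target_nid ...
 */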
2643
c11600e4
DR
2644/*
2645 * Drop the (possibly final) reference to task->mempolicy. It needs to be
2646 * dropped after task->mempolicy is set to NULL so that any allocation done as
2647 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
2648 * policy.
2649 */
2650void mpol_put_task_policy(struct task_struct *task)
2651{
2652 struct mempolicy *pol;
2653
2654 task_lock(task);
2655 pol = task->mempolicy;
2656 task->mempolicy = NULL;
2657 task_unlock(task);
2658 mpol_put(pol);
2659}
2660
1da177e4
LT
2661static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2662{
140d5a49 2663 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1da177e4 2664 rb_erase(&n->nd, &sp->root);
63f74ca2 2665 sp_free(n);
1da177e4
LT
2666}
2667
42288fe3
MG
2668static void sp_node_init(struct sp_node *node, unsigned long start,
2669 unsigned long end, struct mempolicy *pol)
2670{
2671 node->start = start;
2672 node->end = end;
2673 node->policy = pol;
2674}
2675
dbcb0f19
AB
2676static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2677 struct mempolicy *pol)
1da177e4 2678{
869833f2
KM
2679 struct sp_node *n;
2680 struct mempolicy *newpol;
1da177e4 2681
869833f2 2682 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1da177e4
LT
2683 if (!n)
2684 return NULL;
869833f2
KM
2685
2686 newpol = mpol_dup(pol);
2687 if (IS_ERR(newpol)) {
2688 kmem_cache_free(sn_cache, n);
2689 return NULL;
2690 }
2691 newpol->flags |= MPOL_F_SHARED;
42288fe3 2692 sp_node_init(n, start, end, newpol);
869833f2 2693
1da177e4
LT
2694 return n;
2695}
2696
2697/* Replace a policy range. */
2698static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2699 unsigned long end, struct sp_node *new)
2700{
b22d127a 2701 struct sp_node *n;
42288fe3
MG
2702 struct sp_node *n_new = NULL;
2703 struct mempolicy *mpol_new = NULL;
b22d127a 2704 int ret = 0;
1da177e4 2705
42288fe3 2706restart:
4a8c7bb5 2707 write_lock(&sp->lock);
1da177e4
LT
2708 n = sp_lookup(sp, start, end);
2709 /* Take care of old policies in the same range. */
2710 while (n && n->start < end) {
2711 struct rb_node *next = rb_next(&n->nd);
2712 if (n->start >= start) {
2713 if (n->end <= end)
2714 sp_delete(sp, n);
2715 else
2716 n->start = end;
2717 } else {
2718 /* Old policy spanning whole new range. */
2719 if (n->end > end) {
42288fe3
MG
2720 if (!n_new)
2721 goto alloc_new;
2722
2723 *mpol_new = *n->policy;
2724 atomic_set(&mpol_new->refcnt, 1);
7880639c 2725 sp_node_init(n_new, end, n->end, mpol_new);
1da177e4 2726 n->end = start;
5ca39575 2727 sp_insert(sp, n_new);
42288fe3
MG
2728 n_new = NULL;
2729 mpol_new = NULL;
1da177e4
LT
2730 break;
2731 } else
2732 n->end = start;
2733 }
2734 if (!next)
2735 break;
2736 n = rb_entry(next, struct sp_node, nd);
2737 }
2738 if (new)
2739 sp_insert(sp, new);
4a8c7bb5 2740 write_unlock(&sp->lock);
42288fe3
MG
2741 ret = 0;
2742
2743err_out:
2744 if (mpol_new)
2745 mpol_put(mpol_new);
2746 if (n_new)
2747 kmem_cache_free(sn_cache, n_new);
2748
b22d127a 2749 return ret;
42288fe3
MG
2750
2751alloc_new:
4a8c7bb5 2752 write_unlock(&sp->lock);
42288fe3
MG
2753 ret = -ENOMEM;
2754 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2755 if (!n_new)
2756 goto err_out;
2757 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2758 if (!mpol_new)
2759 goto err_out;
4ad09955 2760 atomic_set(&mpol_new->refcnt, 1);
42288fe3 2761 goto restart;
1da177e4
LT
2762}
2763
71fe804b
LS
2764/**
2765 * mpol_shared_policy_init - initialize shared policy for inode
2766 * @sp: pointer to inode shared policy
2767 * @mpol: struct mempolicy to install
2768 *
2769 * Install non-NULL @mpol in inode's shared policy rb-tree.
2770 * On entry, the current task has a reference on a non-NULL @mpol.
2771 * This must be released on exit.
4bfc4495 2772 * This is called at get_inode() calls and we can use GFP_KERNEL.
71fe804b
LS
2773 */
2774void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2775{
58568d2a
MX
2776 int ret;
2777
71fe804b 2778 sp->root = RB_ROOT; /* empty tree == default mempolicy */
4a8c7bb5 2779 rwlock_init(&sp->lock);
71fe804b
LS
2780
2781 if (mpol) {
2782 struct vm_area_struct pvma;
2783 struct mempolicy *new;
4bfc4495 2784 NODEMASK_SCRATCH(scratch);
71fe804b 2785
4bfc4495 2786 if (!scratch)
5c0c1654 2787 goto put_mpol;
71fe804b
LS
2788 /* contextualize the tmpfs mount point mempolicy */
2789 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
15d77835 2790 if (IS_ERR(new))
0cae3457 2791 goto free_scratch; /* no valid nodemask intersection */
58568d2a
MX
2792
2793 task_lock(current);
4bfc4495 2794 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
58568d2a 2795 task_unlock(current);
15d77835 2796 if (ret)
5c0c1654 2797 goto put_new;
71fe804b
LS
2798
2799 /* Create pseudo-vma that contains just the policy */
2c4541e2 2800 vma_init(&pvma, NULL);
71fe804b
LS
2801 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2802 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
15d77835 2803
5c0c1654 2804put_new:
71fe804b 2805 mpol_put(new); /* drop initial ref */
0cae3457 2806free_scratch:
4bfc4495 2807 NODEMASK_SCRATCH_FREE(scratch);
5c0c1654
LS
2808put_mpol:
2809 mpol_put(mpol); /* drop our incoming ref on sb mpol */
7339ff83
RH
2810 }
2811}
2812
1da177e4
LT
2813int mpol_set_shared_policy(struct shared_policy *info,
2814 struct vm_area_struct *vma, struct mempolicy *npol)
2815{
2816 int err;
2817 struct sp_node *new = NULL;
2818 unsigned long sz = vma_pages(vma);
2819
028fec41 2820 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1da177e4 2821 vma->vm_pgoff,
45c4745a 2822 sz, npol ? npol->mode : -1,
028fec41 2823 npol ? npol->flags : -1,
269fbe72 2824 npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
1da177e4
LT
2825
2826 if (npol) {
2827 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2828 if (!new)
2829 return -ENOMEM;
2830 }
2831 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2832 if (err && new)
63f74ca2 2833 sp_free(new);
1da177e4
LT
2834 return err;
2835}
2836
2837/* Free a backing policy store on inode delete. */
2838void mpol_free_shared_policy(struct shared_policy *p)
2839{
2840 struct sp_node *n;
2841 struct rb_node *next;
2842
2843 if (!p->root.rb_node)
2844 return;
4a8c7bb5 2845 write_lock(&p->lock);
1da177e4
LT
2846 next = rb_first(&p->root);
2847 while (next) {
2848 n = rb_entry(next, struct sp_node, nd);
2849 next = rb_next(&n->nd);
63f74ca2 2850 sp_delete(p, n);
1da177e4 2851 }
4a8c7bb5 2852 write_unlock(&p->lock);
1da177e4
LT
2853}
2854
1a687c2e 2855#ifdef CONFIG_NUMA_BALANCING
c297663c 2856static int __initdata numabalancing_override;
1a687c2e
MG
2857
2858static void __init check_numabalancing_enable(void)
2859{
2860 bool numabalancing_default = false;
2861
2862 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2863 numabalancing_default = true;
2864
c297663c
MG
2865 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2866 if (numabalancing_override)
2867 set_numabalancing_state(numabalancing_override == 1);
2868
b0dc2b9b 2869 if (num_online_nodes() > 1 && !numabalancing_override) {
756a025f 2870 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
c297663c 2871 numabalancing_default ? "Enabling" : "Disabling");
1a687c2e
MG
2872 set_numabalancing_state(numabalancing_default);
2873 }
2874}
2875
2876static int __init setup_numabalancing(char *str)
2877{
2878 int ret = 0;
2879 if (!str)
2880 goto out;
1a687c2e
MG
2881
2882 if (!strcmp(str, "enable")) {
c297663c 2883 numabalancing_override = 1;
1a687c2e
MG
2884 ret = 1;
2885 } else if (!strcmp(str, "disable")) {
c297663c 2886 numabalancing_override = -1;
1a687c2e
MG
2887 ret = 1;
2888 }
2889out:
2890 if (!ret)
4a404bea 2891 pr_warn("Unable to parse numa_balancing=\n");
1a687c2e
MG
2892
2893 return ret;
2894}
2895__setup("numa_balancing=", setup_numabalancing);
2896#else
2897static inline void __init check_numabalancing_enable(void)
2898{
2899}
2900#endif /* CONFIG_NUMA_BALANCING */
2901
1da177e4
LT
2902/* assumes fs == KERNEL_DS */
2903void __init numa_policy_init(void)
2904{
b71636e2
PM
2905 nodemask_t interleave_nodes;
2906 unsigned long largest = 0;
2907 int nid, prefer = 0;
2908
1da177e4
LT
2909 policy_cache = kmem_cache_create("numa_policy",
2910 sizeof(struct mempolicy),
20c2df83 2911 0, SLAB_PANIC, NULL);
1da177e4
LT
2912
2913 sn_cache = kmem_cache_create("shared_policy_node",
2914 sizeof(struct sp_node),
20c2df83 2915 0, SLAB_PANIC, NULL);
1da177e4 2916
5606e387
MG
2917 for_each_node(nid) {
2918 preferred_node_policy[nid] = (struct mempolicy) {
2919 .refcnt = ATOMIC_INIT(1),
2920 .mode = MPOL_PREFERRED,
2921 .flags = MPOL_F_MOF | MPOL_F_MORON,
269fbe72 2922 .nodes = nodemask_of_node(nid),
5606e387
MG
2923 };
2924 }
2925
b71636e2
PM
2926 /*
2927 * Set interleaving policy for system init. Interleaving is only
2928 * enabled across suitably sized nodes (default is >= 16MB), or
2929 * fall back to the largest node if they're all smaller.
2930 */
2931 nodes_clear(interleave_nodes);
01f13bd6 2932 for_each_node_state(nid, N_MEMORY) {
b71636e2
PM
2933 unsigned long total_pages = node_present_pages(nid);
2934
2935 /* Preserve the largest node */
2936 if (largest < total_pages) {
2937 largest = total_pages;
2938 prefer = nid;
2939 }
2940
2941 /* Interleave this node? */
2942 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2943 node_set(nid, interleave_nodes);
2944 }
2945
2946 /* All too small, use the largest */
2947 if (unlikely(nodes_empty(interleave_nodes)))
2948 node_set(prefer, interleave_nodes);
1da177e4 2949
028fec41 2950 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
b1de0d13 2951 pr_err("%s: interleaving failed\n", __func__);
1a687c2e
MG
2952
2953 check_numabalancing_enable();
1da177e4
LT
2954}
2955
8bccd85f 2956/* Reset policy of current process to default */
1da177e4
LT
2957void numa_default_policy(void)
2958{
028fec41 2959 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1da177e4 2960}
68860ec1 2961
095f1fc4
LS
2962/*
2963 * Parse and format mempolicy from/to strings
2964 */
2965
345ace9c
LS
2966static const char * const policy_modes[] =
2967{
2968 [MPOL_DEFAULT] = "default",
2969 [MPOL_PREFERRED] = "prefer",
2970 [MPOL_BIND] = "bind",
2971 [MPOL_INTERLEAVE] = "interleave",
d3a71033 2972 [MPOL_LOCAL] = "local",
b27abacc 2973 [MPOL_PREFERRED_MANY] = "prefer (many)",
345ace9c 2974};
1a75a6c8 2975
095f1fc4
LS
2976
2977#ifdef CONFIG_TMPFS
2978/**
f2a07f40 2979 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
095f1fc4 2980 * @str: string containing mempolicy to parse
71fe804b 2981 * @mpol: pointer to struct mempolicy pointer, returned on success.
095f1fc4
LS
2982 *
2983 * Format of input:
2984 * <mode>[=<flags>][:<nodelist>]
2985 *
dad5b023 2986 * Return: %0 on success, else %1
095f1fc4 2987 */
a7a88b23 2988int mpol_parse_str(char *str, struct mempolicy **mpol)
095f1fc4 2989{
71fe804b 2990 struct mempolicy *new = NULL;
f2a07f40 2991 unsigned short mode_flags;
71fe804b 2992 nodemask_t nodes;
095f1fc4
LS
2993 char *nodelist = strchr(str, ':');
2994 char *flags = strchr(str, '=');
dedf2c73 2995 int err = 1, mode;
095f1fc4 2996
c7a91bc7
DC
2997 if (flags)
2998 *flags++ = '\0'; /* terminate mode string */
2999
095f1fc4
LS
3000 if (nodelist) {
3001 /* NUL-terminate mode or flags string */
3002 *nodelist++ = '\0';
71fe804b 3003 if (nodelist_parse(nodelist, nodes))
095f1fc4 3004 goto out;
01f13bd6 3005 if (!nodes_subset(nodes, node_states[N_MEMORY]))
095f1fc4 3006 goto out;
71fe804b
LS
3007 } else
3008 nodes_clear(nodes);
3009
dedf2c73 3010 mode = match_string(policy_modes, MPOL_MAX, str);
3011 if (mode < 0)
095f1fc4
LS
3012 goto out;
3013
71fe804b 3014 switch (mode) {
095f1fc4 3015 case MPOL_PREFERRED:
71fe804b 3016 /*
aa9f7d51
RD
3017 * Insist on a nodelist of one node only, although later
3018 * we use first_node(nodes) to grab a single node, so here
3019 * nodelist (or nodes) cannot be empty.
71fe804b 3020 */
095f1fc4
LS
3021 if (nodelist) {
3022 char *rest = nodelist;
3023 while (isdigit(*rest))
3024 rest++;
926f2ae0
KM
3025 if (*rest)
3026 goto out;
aa9f7d51
RD
3027 if (nodes_empty(nodes))
3028 goto out;
095f1fc4
LS
3029 }
3030 break;
095f1fc4
LS
3031 case MPOL_INTERLEAVE:
3032 /*
3033 * Default to online nodes with memory if no nodelist
3034 */
3035 if (!nodelist)
01f13bd6 3036 nodes = node_states[N_MEMORY];
3f226aa1 3037 break;
71fe804b 3038 case MPOL_LOCAL:
3f226aa1 3039 /*
71fe804b 3040 * Don't allow a nodelist; mpol_new() checks flags
3f226aa1 3041 */
71fe804b 3042 if (nodelist)
3f226aa1 3043 goto out;
3f226aa1 3044 break;
413b43de
RT
3045 case MPOL_DEFAULT:
3046 /*
3047 * Insist on an empty nodelist
3048 */
3049 if (!nodelist)
3050 err = 0;
3051 goto out;
b27abacc 3052 case MPOL_PREFERRED_MANY:
d69b2e63
KM
3053 case MPOL_BIND:
3054 /*
3055 * Insist on a nodelist
3056 */
3057 if (!nodelist)
3058 goto out;
095f1fc4
LS
3059 }
3060
71fe804b 3061 mode_flags = 0;
095f1fc4
LS
3062 if (flags) {
3063 /*
3064 * Currently, we only support two mutually exclusive
3065 * mode flags.
3066 */
3067 if (!strcmp(flags, "static"))
71fe804b 3068 mode_flags |= MPOL_F_STATIC_NODES;
095f1fc4 3069 else if (!strcmp(flags, "relative"))
71fe804b 3070 mode_flags |= MPOL_F_RELATIVE_NODES;
095f1fc4 3071 else
926f2ae0 3072 goto out;
095f1fc4 3073 }
71fe804b
LS
3074
3075 new = mpol_new(mode, mode_flags, &nodes);
3076 if (IS_ERR(new))
926f2ae0
KM
3077 goto out;
3078
f2a07f40
HD
3079 /*
3080 * Save nodes for mpol_to_str() to show the tmpfs mount options
3081 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
3082 */
269fbe72
BW
3083 if (mode != MPOL_PREFERRED) {
3084 new->nodes = nodes;
3085 } else if (nodelist) {
3086 nodes_clear(new->nodes);
3087 node_set(first_node(nodes), new->nodes);
3088 } else {
7858d7bc 3089 new->mode = MPOL_LOCAL;
269fbe72 3090 }
f2a07f40
HD
3091
3092 /*
3093 * Save nodes for contextualization: this will be used to "clone"
3094 * the mempolicy in a specific context [cpuset] at a later time.
3095 */
3096 new->w.user_nodemask = nodes;
3097
926f2ae0 3098 err = 0;
71fe804b 3099
095f1fc4
LS
3100out:
3101 /* Restore string for error message */
3102 if (nodelist)
3103 *--nodelist = ':';
3104 if (flags)
3105 *--flags = '=';
71fe804b
LS
3106 if (!err)
3107 *mpol = new;
095f1fc4
LS
3108 return err;
3109}
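/*
 * Illustrative tmpfs "mpol=" strings accepted by mpol_parse_str() (the
 * node numbers are assumptions):
 *
 *	"interleave:0-3"	MPOL_INTERLEAVE across nodes 0-3
 *	"bind=static:0,2"	MPOL_BIND with MPOL_F_STATIC_NODES on nodes 0 and 2
 *	"prefer:1"		MPOL_PREFERRED with node 1
 *	"local"			MPOL_LOCAL, no nodelist allowed
 */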
3110#endif /* CONFIG_TMPFS */
3111
71fe804b
LS
3112/**
3113 * mpol_to_str - format a mempolicy structure for printing
3114 * @buffer: to contain formatted mempolicy string
3115 * @maxlen: length of @buffer
3116 * @pol: pointer to mempolicy to be formatted
71fe804b 3117 *
948927ee
DR
3118 * Convert @pol into a string. If @buffer is too short, truncate the string.
3119 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
3120 * longest flag, "relative", and to display at least a few node ids.
1a75a6c8 3121 */
948927ee 3122void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1a75a6c8
CL
3123{
3124 char *p = buffer;
948927ee
DR
3125 nodemask_t nodes = NODE_MASK_NONE;
3126 unsigned short mode = MPOL_DEFAULT;
3127 unsigned short flags = 0;
2291990a 3128
8790c71a 3129 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
bea904d5 3130 mode = pol->mode;
948927ee
DR
3131 flags = pol->flags;
3132 }
bea904d5 3133
1a75a6c8
CL
3134 switch (mode) {
3135 case MPOL_DEFAULT:
7858d7bc 3136 case MPOL_LOCAL:
1a75a6c8 3137 break;
1a75a6c8 3138 case MPOL_PREFERRED:
b27abacc 3139 case MPOL_PREFERRED_MANY:
1a75a6c8 3140 case MPOL_BIND:
1a75a6c8 3141 case MPOL_INTERLEAVE:
269fbe72 3142 nodes = pol->nodes;
1a75a6c8 3143 break;
1a75a6c8 3144 default:
948927ee
DR
3145 WARN_ON_ONCE(1);
3146 snprintf(p, maxlen, "unknown");
3147 return;
1a75a6c8
CL
3148 }
3149
b7a9f420 3150 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
1a75a6c8 3151
fc36b8d3 3152 if (flags & MPOL_MODE_FLAGS) {
948927ee 3153 p += snprintf(p, buffer + maxlen - p, "=");
f5b087b5 3154
2291990a
LS
3155 /*
3156 * Currently, the only defined flags are mutually exclusive
3157 */
f5b087b5 3158 if (flags & MPOL_F_STATIC_NODES)
2291990a
LS
3159 p += snprintf(p, buffer + maxlen - p, "static");
3160 else if (flags & MPOL_F_RELATIVE_NODES)
3161 p += snprintf(p, buffer + maxlen - p, "relative");
f5b087b5
DR
3162 }
3163
9e763e0f
TH
3164 if (!nodes_empty(nodes))
3165 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3166 nodemask_pr_args(&nodes));
1a75a6c8 3167}
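/*
 * Illustrative output (assumed policy): an MPOL_INTERLEAVE policy over
 * nodes 0-3 with MPOL_F_STATIC_NODES set is formatted as
 * "interleave=static:0-3"; a NULL or default policy prints "default".
 */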