hugetlb: optimize update_and_free_pages_bulk to avoid lock cycles
[linux-block.git] / mm / hugetlb.c
CommitLineData
457c8996 1// SPDX-License-Identifier: GPL-2.0-only
1da177e4
LT
2/*
3 * Generic hugetlb support.
6d49e352 4 * (C) Nadia Yvette Chambers, April 2004
1da177e4 5 */
1da177e4
LT
6#include <linux/list.h>
7#include <linux/init.h>
1da177e4 8#include <linux/mm.h>
e1759c21 9#include <linux/seq_file.h>
1da177e4
LT
10#include <linux/sysctl.h>
11#include <linux/highmem.h>
cddb8a5c 12#include <linux/mmu_notifier.h>
1da177e4 13#include <linux/nodemask.h>
63551ae0 14#include <linux/pagemap.h>
5da7ca86 15#include <linux/mempolicy.h>
3b32123d 16#include <linux/compiler.h>
aea47ff3 17#include <linux/cpuset.h>
3935baa9 18#include <linux/mutex.h>
97ad1087 19#include <linux/memblock.h>
a3437870 20#include <linux/sysfs.h>
5a0e3ad6 21#include <linux/slab.h>
bbe88753 22#include <linux/sched/mm.h>
63489f8e 23#include <linux/mmdebug.h>
174cd4b1 24#include <linux/sched/signal.h>
0fe6e20b 25#include <linux/rmap.h>
c6247f72 26#include <linux/string_helpers.h>
fd6a03ed
NH
27#include <linux/swap.h>
28#include <linux/swapops.h>
8382d914 29#include <linux/jhash.h>
98fa15f3 30#include <linux/numa.h>
c77c0a8a 31#include <linux/llist.h>
cf11e85f 32#include <linux/cma.h>
8cc5fcbb 33#include <linux/migrate.h>
f9317f77 34#include <linux/nospec.h>
662ce1dc 35#include <linux/delayacct.h>
b958d4d0 36#include <linux/memory.h>
af19487f 37#include <linux/mm_inline.h>
d6606683 38
63551ae0 39#include <asm/page.h>
ca15ca40 40#include <asm/pgalloc.h>
24669e58 41#include <asm/tlb.h>
63551ae0 42
24669e58 43#include <linux/io.h>
63551ae0 44#include <linux/hugetlb.h>
9dd540e2 45#include <linux/hugetlb_cgroup.h>
9a305230 46#include <linux/node.h>
ab5ac90a 47#include <linux/page_owner.h>
7835e98b 48#include "internal.h"
f41f2ed4 49#include "hugetlb_vmemmap.h"
1da177e4 50
c3f38a38 51int hugetlb_max_hstate __read_mostly;
e5ff2159
AK
52unsigned int default_hstate_idx;
53struct hstate hstates[HUGE_MAX_HSTATE];
cf11e85f 54
dbda8fea 55#ifdef CONFIG_CMA
cf11e85f 56static struct cma *hugetlb_cma[MAX_NUMNODES];
38e719ab 57static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
2f6c57d6 58static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
a01f4390 59{
2f6c57d6 60 return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page,
a01f4390
MK
61 1 << order);
62}
63#else
2f6c57d6 64static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
a01f4390
MK
65{
66 return false;
67}
dbda8fea
BS
68#endif
69static unsigned long hugetlb_cma_size __initdata;
cf11e85f 70
53ba51d2
JT
71__initdata LIST_HEAD(huge_boot_pages);
72
e5ff2159
AK
73/* for command line parsing */
74static struct hstate * __initdata parsed_hstate;
75static unsigned long __initdata default_hstate_max_huge_pages;
9fee021d 76static bool __initdata parsed_valid_hugepagesz = true;
282f4214 77static bool __initdata parsed_default_hugepagesz;
b5389086 78static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
e5ff2159 79
3935baa9 80/*
31caf665
NH
81 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
82 * free_huge_pages, and surplus_huge_pages.
3935baa9 83 */
c3f38a38 84DEFINE_SPINLOCK(hugetlb_lock);
0bd0f9fb 85
8382d914
DB
86/*
87 * Serializes faults on the same logical page. This is used to
88 * prevent spurious OOMs when the hugepage pool is fully utilized.
89 */
90static int num_fault_mutexes;
c672c7f2 91struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
8382d914 92
7ca02d0a
MK
93/* Forward declaration */
94static int hugetlb_acct_memory(struct hstate *h, long delta);
8d9bfb26
MK
95static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
96static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
ecfbd733 97static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
b30c14cd
JH
98static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
99 unsigned long start, unsigned long end);
bf491692 100static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
7ca02d0a 101
1d88433b 102static inline bool subpool_is_free(struct hugepage_subpool *spool)
90481622 103{
1d88433b
ML
104 if (spool->count)
105 return false;
106 if (spool->max_hpages != -1)
107 return spool->used_hpages == 0;
108 if (spool->min_hpages != -1)
109 return spool->rsv_hpages == spool->min_hpages;
110
111 return true;
112}
90481622 113
db71ef79
MK
114static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
115 unsigned long irq_flags)
1d88433b 116{
db71ef79 117 spin_unlock_irqrestore(&spool->lock, irq_flags);
90481622
DG
118
119 /* If no pages are used, and no other handles to the subpool
7c8de358 120 * remain, give up any reservations based on minimum size and
7ca02d0a 121 * free the subpool */
1d88433b 122 if (subpool_is_free(spool)) {
7ca02d0a
MK
123 if (spool->min_hpages != -1)
124 hugetlb_acct_memory(spool->hstate,
125 -spool->min_hpages);
90481622 126 kfree(spool);
7ca02d0a 127 }
90481622
DG
128}
129
7ca02d0a
MK
130struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
131 long min_hpages)
90481622
DG
132{
133 struct hugepage_subpool *spool;
134
c6a91820 135 spool = kzalloc(sizeof(*spool), GFP_KERNEL);
90481622
DG
136 if (!spool)
137 return NULL;
138
139 spin_lock_init(&spool->lock);
140 spool->count = 1;
7ca02d0a
MK
141 spool->max_hpages = max_hpages;
142 spool->hstate = h;
143 spool->min_hpages = min_hpages;
144
145 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
146 kfree(spool);
147 return NULL;
148 }
149 spool->rsv_hpages = min_hpages;
90481622
DG
150
151 return spool;
152}
153
154void hugepage_put_subpool(struct hugepage_subpool *spool)
155{
db71ef79
MK
156 unsigned long flags;
157
158 spin_lock_irqsave(&spool->lock, flags);
90481622
DG
159 BUG_ON(!spool->count);
160 spool->count--;
db71ef79 161 unlock_or_release_subpool(spool, flags);
90481622
DG
162}
163
1c5ecae3
MK
164/*
165 * Subpool accounting for allocating and reserving pages.
166 * Return -ENOMEM if there are not enough resources to satisfy the
9e7ee400 167 * request. Otherwise, return the number of pages by which the
1c5ecae3
MK
168 * global pools must be adjusted (upward). The returned value may
169 * only be different than the passed value (delta) in the case where
7c8de358 170 * a subpool minimum size must be maintained.
1c5ecae3
MK
171 */
172static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
90481622
DG
173 long delta)
174{
1c5ecae3 175 long ret = delta;
90481622
DG
176
177 if (!spool)
1c5ecae3 178 return ret;
90481622 179
db71ef79 180 spin_lock_irq(&spool->lock);
1c5ecae3
MK
181
182 if (spool->max_hpages != -1) { /* maximum size accounting */
183 if ((spool->used_hpages + delta) <= spool->max_hpages)
184 spool->used_hpages += delta;
185 else {
186 ret = -ENOMEM;
187 goto unlock_ret;
188 }
90481622 189 }
90481622 190
09a95e29
MK
191 /* minimum size accounting */
192 if (spool->min_hpages != -1 && spool->rsv_hpages) {
1c5ecae3
MK
193 if (delta > spool->rsv_hpages) {
194 /*
195 * Asking for more reserves than those already taken on
196 * behalf of subpool. Return difference.
197 */
198 ret = delta - spool->rsv_hpages;
199 spool->rsv_hpages = 0;
200 } else {
201 ret = 0; /* reserves already accounted for */
202 spool->rsv_hpages -= delta;
203 }
204 }
205
206unlock_ret:
db71ef79 207 spin_unlock_irq(&spool->lock);
90481622
DG
208 return ret;
209}
210
1c5ecae3
MK
211/*
212 * Subpool accounting for freeing and unreserving pages.
213 * Return the number of global page reservations that must be dropped.
214 * The return value may only be different than the passed value (delta)
215 * in the case where a subpool minimum size must be maintained.
216 */
217static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
90481622
DG
218 long delta)
219{
1c5ecae3 220 long ret = delta;
db71ef79 221 unsigned long flags;
1c5ecae3 222
90481622 223 if (!spool)
1c5ecae3 224 return delta;
90481622 225
db71ef79 226 spin_lock_irqsave(&spool->lock, flags);
1c5ecae3
MK
227
228 if (spool->max_hpages != -1) /* maximum size accounting */
229 spool->used_hpages -= delta;
230
09a95e29
MK
231 /* minimum size accounting */
232 if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
1c5ecae3
MK
233 if (spool->rsv_hpages + delta <= spool->min_hpages)
234 ret = 0;
235 else
236 ret = spool->rsv_hpages + delta - spool->min_hpages;
237
238 spool->rsv_hpages += delta;
239 if (spool->rsv_hpages > spool->min_hpages)
240 spool->rsv_hpages = spool->min_hpages;
241 }
242
243 /*
244 * If hugetlbfs_put_super couldn't free spool due to an outstanding
245 * quota reference, free it now.
246 */
db71ef79 247 unlock_or_release_subpool(spool, flags);
1c5ecae3
MK
248
249 return ret;
90481622
DG
250}
251
252static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
253{
254 return HUGETLBFS_SB(inode->i_sb)->spool;
255}
256
257static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
258{
496ad9aa 259 return subpool_inode(file_inode(vma->vm_file));
90481622
DG
260}
261
e700898f
MK
262/*
263 * hugetlb vma_lock helper routines
264 */
e700898f
MK
265void hugetlb_vma_lock_read(struct vm_area_struct *vma)
266{
267 if (__vma_shareable_lock(vma)) {
268 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
269
270 down_read(&vma_lock->rw_sema);
bf491692
RR
271 } else if (__vma_private_lock(vma)) {
272 struct resv_map *resv_map = vma_resv_map(vma);
273
274 down_read(&resv_map->rw_sema);
e700898f
MK
275 }
276}
277
278void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
279{
280 if (__vma_shareable_lock(vma)) {
281 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
282
283 up_read(&vma_lock->rw_sema);
bf491692
RR
284 } else if (__vma_private_lock(vma)) {
285 struct resv_map *resv_map = vma_resv_map(vma);
286
287 up_read(&resv_map->rw_sema);
e700898f
MK
288 }
289}
290
291void hugetlb_vma_lock_write(struct vm_area_struct *vma)
292{
293 if (__vma_shareable_lock(vma)) {
294 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
295
296 down_write(&vma_lock->rw_sema);
bf491692
RR
297 } else if (__vma_private_lock(vma)) {
298 struct resv_map *resv_map = vma_resv_map(vma);
299
300 down_write(&resv_map->rw_sema);
e700898f
MK
301 }
302}
303
304void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
305{
306 if (__vma_shareable_lock(vma)) {
307 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
308
309 up_write(&vma_lock->rw_sema);
bf491692
RR
310 } else if (__vma_private_lock(vma)) {
311 struct resv_map *resv_map = vma_resv_map(vma);
312
313 up_write(&resv_map->rw_sema);
e700898f
MK
314 }
315}
316
317int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
318{
e700898f 319
bf491692
RR
320 if (__vma_shareable_lock(vma)) {
321 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
e700898f 322
bf491692
RR
323 return down_write_trylock(&vma_lock->rw_sema);
324 } else if (__vma_private_lock(vma)) {
325 struct resv_map *resv_map = vma_resv_map(vma);
326
327 return down_write_trylock(&resv_map->rw_sema);
328 }
329
330 return 1;
e700898f
MK
331}
332
333void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
334{
335 if (__vma_shareable_lock(vma)) {
336 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
337
338 lockdep_assert_held(&vma_lock->rw_sema);
bf491692
RR
339 } else if (__vma_private_lock(vma)) {
340 struct resv_map *resv_map = vma_resv_map(vma);
341
342 lockdep_assert_held(&resv_map->rw_sema);
e700898f
MK
343 }
344}
345
346void hugetlb_vma_lock_release(struct kref *kref)
347{
348 struct hugetlb_vma_lock *vma_lock = container_of(kref,
349 struct hugetlb_vma_lock, refs);
350
351 kfree(vma_lock);
352}
353
354static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
355{
356 struct vm_area_struct *vma = vma_lock->vma;
357
358 /*
359 * vma_lock structure may or not be released as a result of put,
360 * it certainly will no longer be attached to vma so clear pointer.
361 * Semaphore synchronizes access to vma_lock->vma field.
362 */
363 vma_lock->vma = NULL;
364 vma->vm_private_data = NULL;
365 up_write(&vma_lock->rw_sema);
366 kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
367}
368
369static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
370{
371 if (__vma_shareable_lock(vma)) {
372 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
373
374 __hugetlb_vma_unlock_write_put(vma_lock);
bf491692
RR
375 } else if (__vma_private_lock(vma)) {
376 struct resv_map *resv_map = vma_resv_map(vma);
377
378 /* no free for anon vmas, but still need to unlock */
379 up_write(&resv_map->rw_sema);
e700898f
MK
380 }
381}
382
383static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
384{
385 /*
386 * Only present in sharable vmas.
387 */
388 if (!vma || !__vma_shareable_lock(vma))
389 return;
390
391 if (vma->vm_private_data) {
392 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
393
394 down_write(&vma_lock->rw_sema);
395 __hugetlb_vma_unlock_write_put(vma_lock);
396 }
397}
398
399static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
400{
401 struct hugetlb_vma_lock *vma_lock;
402
403 /* Only establish in (flags) sharable vmas */
404 if (!vma || !(vma->vm_flags & VM_MAYSHARE))
405 return;
406
407 /* Should never get here with non-NULL vm_private_data */
408 if (vma->vm_private_data)
409 return;
410
411 vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
412 if (!vma_lock) {
413 /*
414 * If we can not allocate structure, then vma can not
415 * participate in pmd sharing. This is only a possible
416 * performance enhancement and memory saving issue.
417 * However, the lock is also used to synchronize page
418 * faults with truncation. If the lock is not present,
419 * unlikely races could leave pages in a file past i_size
420 * until the file is removed. Warn in the unlikely case of
421 * allocation failure.
422 */
423 pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
424 return;
425 }
426
427 kref_init(&vma_lock->refs);
428 init_rwsem(&vma_lock->rw_sema);
429 vma_lock->vma = vma;
430 vma->vm_private_data = vma_lock;
431}
432
0db9d74e
MA
433/* Helper that removes a struct file_region from the resv_map cache and returns
434 * it for use.
435 */
436static struct file_region *
437get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
438{
3259914f 439 struct file_region *nrg;
0db9d74e
MA
440
441 VM_BUG_ON(resv->region_cache_count <= 0);
442
443 resv->region_cache_count--;
444 nrg = list_first_entry(&resv->region_cache, struct file_region, link);
0db9d74e
MA
445 list_del(&nrg->link);
446
447 nrg->from = from;
448 nrg->to = to;
449
450 return nrg;
451}
452
075a61d0
MA
453static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
454 struct file_region *rg)
455{
456#ifdef CONFIG_CGROUP_HUGETLB
457 nrg->reservation_counter = rg->reservation_counter;
458 nrg->css = rg->css;
459 if (rg->css)
460 css_get(rg->css);
461#endif
462}
463
464/* Helper that records hugetlb_cgroup uncharge info. */
465static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
466 struct hstate *h,
467 struct resv_map *resv,
468 struct file_region *nrg)
469{
470#ifdef CONFIG_CGROUP_HUGETLB
471 if (h_cg) {
472 nrg->reservation_counter =
473 &h_cg->rsvd_hugepage[hstate_index(h)];
474 nrg->css = &h_cg->css;
d85aecf2
ML
475 /*
476 * The caller will hold exactly one h_cg->css reference for the
477 * whole contiguous reservation region. But this area might be
478 * scattered when there are already some file_regions reside in
479 * it. As a result, many file_regions may share only one css
480 * reference. In order to ensure that one file_region must hold
481 * exactly one h_cg->css reference, we should do css_get for
482 * each file_region and leave the reference held by caller
483 * untouched.
484 */
485 css_get(&h_cg->css);
075a61d0
MA
486 if (!resv->pages_per_hpage)
487 resv->pages_per_hpage = pages_per_huge_page(h);
488 /* pages_per_hpage should be the same for all entries in
489 * a resv_map.
490 */
491 VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
492 } else {
493 nrg->reservation_counter = NULL;
494 nrg->css = NULL;
495 }
496#endif
497}
498
d85aecf2
ML
499static void put_uncharge_info(struct file_region *rg)
500{
501#ifdef CONFIG_CGROUP_HUGETLB
502 if (rg->css)
503 css_put(rg->css);
504#endif
505}
506
a9b3f867
MA
507static bool has_same_uncharge_info(struct file_region *rg,
508 struct file_region *org)
509{
510#ifdef CONFIG_CGROUP_HUGETLB
0739eb43 511 return rg->reservation_counter == org->reservation_counter &&
a9b3f867
MA
512 rg->css == org->css;
513
514#else
515 return true;
516#endif
517}
518
519static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
520{
3259914f 521 struct file_region *nrg, *prg;
a9b3f867
MA
522
523 prg = list_prev_entry(rg, link);
524 if (&prg->link != &resv->regions && prg->to == rg->from &&
525 has_same_uncharge_info(prg, rg)) {
526 prg->to = rg->to;
527
528 list_del(&rg->link);
d85aecf2 529 put_uncharge_info(rg);
a9b3f867
MA
530 kfree(rg);
531
7db5e7b6 532 rg = prg;
a9b3f867
MA
533 }
534
535 nrg = list_next_entry(rg, link);
536 if (&nrg->link != &resv->regions && nrg->from == rg->to &&
537 has_same_uncharge_info(nrg, rg)) {
538 nrg->from = rg->from;
539
540 list_del(&rg->link);
d85aecf2 541 put_uncharge_info(rg);
a9b3f867 542 kfree(rg);
a9b3f867
MA
543 }
544}
545
2103cf9c 546static inline long
84448c8e 547hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
2103cf9c
PX
548 long to, struct hstate *h, struct hugetlb_cgroup *cg,
549 long *regions_needed)
550{
551 struct file_region *nrg;
552
553 if (!regions_needed) {
554 nrg = get_file_region_entry_from_cache(map, from, to);
555 record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
84448c8e 556 list_add(&nrg->link, rg);
2103cf9c
PX
557 coalesce_file_region(map, nrg);
558 } else
559 *regions_needed += 1;
560
561 return to - from;
562}
563
972a3da3
WY
564/*
565 * Must be called with resv->lock held.
566 *
567 * Calling this with regions_needed != NULL will count the number of pages
568 * to be added but will not modify the linked list. And regions_needed will
569 * indicate the number of file_regions needed in the cache to carry out to add
570 * the regions for this range.
d75c6af9
MA
571 */
572static long add_reservation_in_range(struct resv_map *resv, long f, long t,
075a61d0 573 struct hugetlb_cgroup *h_cg,
972a3da3 574 struct hstate *h, long *regions_needed)
d75c6af9 575{
0db9d74e 576 long add = 0;
d75c6af9 577 struct list_head *head = &resv->regions;
0db9d74e 578 long last_accounted_offset = f;
84448c8e
JK
579 struct file_region *iter, *trg = NULL;
580 struct list_head *rg = NULL;
d75c6af9 581
0db9d74e
MA
582 if (regions_needed)
583 *regions_needed = 0;
d75c6af9 584
0db9d74e 585 /* In this loop, we essentially handle an entry for the range
84448c8e 586 * [last_accounted_offset, iter->from), at every iteration, with some
0db9d74e
MA
587 * bounds checking.
588 */
84448c8e 589 list_for_each_entry_safe(iter, trg, head, link) {
0db9d74e 590 /* Skip irrelevant regions that start before our range. */
84448c8e 591 if (iter->from < f) {
0db9d74e
MA
592 /* If this region ends after the last accounted offset,
593 * then we need to update last_accounted_offset.
594 */
84448c8e
JK
595 if (iter->to > last_accounted_offset)
596 last_accounted_offset = iter->to;
0db9d74e
MA
597 continue;
598 }
d75c6af9 599
0db9d74e
MA
600 /* When we find a region that starts beyond our range, we've
601 * finished.
602 */
84448c8e
JK
603 if (iter->from >= t) {
604 rg = iter->link.prev;
d75c6af9 605 break;
84448c8e 606 }
d75c6af9 607
84448c8e 608 /* Add an entry for last_accounted_offset -> iter->from, and
0db9d74e
MA
609 * update last_accounted_offset.
610 */
84448c8e
JK
611 if (iter->from > last_accounted_offset)
612 add += hugetlb_resv_map_add(resv, iter->link.prev,
2103cf9c 613 last_accounted_offset,
84448c8e 614 iter->from, h, h_cg,
2103cf9c 615 regions_needed);
0db9d74e 616
84448c8e 617 last_accounted_offset = iter->to;
0db9d74e
MA
618 }
619
620 /* Handle the case where our range extends beyond
621 * last_accounted_offset.
622 */
84448c8e
JK
623 if (!rg)
624 rg = head->prev;
2103cf9c
PX
625 if (last_accounted_offset < t)
626 add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
627 t, h, h_cg, regions_needed);
0db9d74e 628
0db9d74e
MA
629 return add;
630}
631
632/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
633 */
634static int allocate_file_region_entries(struct resv_map *resv,
635 int regions_needed)
636 __must_hold(&resv->lock)
637{
34665341 638 LIST_HEAD(allocated_regions);
0db9d74e
MA
639 int to_allocate = 0, i = 0;
640 struct file_region *trg = NULL, *rg = NULL;
641
642 VM_BUG_ON(regions_needed < 0);
643
0db9d74e
MA
644 /*
645 * Check for sufficient descriptors in the cache to accommodate
646 * the number of in progress add operations plus regions_needed.
647 *
648 * This is a while loop because when we drop the lock, some other call
649 * to region_add or region_del may have consumed some region_entries,
650 * so we keep looping here until we finally have enough entries for
651 * (adds_in_progress + regions_needed).
652 */
653 while (resv->region_cache_count <
654 (resv->adds_in_progress + regions_needed)) {
655 to_allocate = resv->adds_in_progress + regions_needed -
656 resv->region_cache_count;
657
658 /* At this point, we should have enough entries in the cache
f0953a1b 659 * for all the existing adds_in_progress. We should only be
0db9d74e 660 * needing to allocate for regions_needed.
d75c6af9 661 */
0db9d74e
MA
662 VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
663
664 spin_unlock(&resv->lock);
665 for (i = 0; i < to_allocate; i++) {
666 trg = kmalloc(sizeof(*trg), GFP_KERNEL);
667 if (!trg)
668 goto out_of_memory;
669 list_add(&trg->link, &allocated_regions);
d75c6af9 670 }
d75c6af9 671
0db9d74e
MA
672 spin_lock(&resv->lock);
673
d3ec7b6e
WY
674 list_splice(&allocated_regions, &resv->region_cache);
675 resv->region_cache_count += to_allocate;
d75c6af9
MA
676 }
677
0db9d74e 678 return 0;
d75c6af9 679
0db9d74e
MA
680out_of_memory:
681 list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
682 list_del(&rg->link);
683 kfree(rg);
684 }
685 return -ENOMEM;
d75c6af9
MA
686}
687
1dd308a7
MK
688/*
689 * Add the huge page range represented by [f, t) to the reserve
0db9d74e
MA
690 * map. Regions will be taken from the cache to fill in this range.
691 * Sufficient regions should exist in the cache due to the previous
692 * call to region_chg with the same range, but in some cases the cache will not
693 * have sufficient entries due to races with other code doing region_add or
694 * region_del. The extra needed entries will be allocated.
cf3ad20b 695 *
0db9d74e
MA
696 * regions_needed is the out value provided by a previous call to region_chg.
697 *
698 * Return the number of new huge pages added to the map. This number is greater
699 * than or equal to zero. If file_region entries needed to be allocated for
7c8de358 700 * this operation and we were not able to allocate, it returns -ENOMEM.
0db9d74e
MA
701 * region_add of regions of length 1 never allocate file_regions and cannot
702 * fail; region_chg will always allocate at least 1 entry and a region_add for
703 * 1 page will only require at most 1 entry.
1dd308a7 704 */
0db9d74e 705static long region_add(struct resv_map *resv, long f, long t,
075a61d0
MA
706 long in_regions_needed, struct hstate *h,
707 struct hugetlb_cgroup *h_cg)
96822904 708{
0db9d74e 709 long add = 0, actual_regions_needed = 0;
96822904 710
7b24d861 711 spin_lock(&resv->lock);
0db9d74e
MA
712retry:
713
714 /* Count how many regions are actually needed to execute this add. */
972a3da3
WY
715 add_reservation_in_range(resv, f, t, NULL, NULL,
716 &actual_regions_needed);
96822904 717
5e911373 718 /*
0db9d74e
MA
719 * Check for sufficient descriptors in the cache to accommodate
720 * this add operation. Note that actual_regions_needed may be greater
721 * than in_regions_needed, as the resv_map may have been modified since
722 * the region_chg call. In this case, we need to make sure that we
723 * allocate extra entries, such that we have enough for all the
724 * existing adds_in_progress, plus the excess needed for this
725 * operation.
5e911373 726 */
0db9d74e
MA
727 if (actual_regions_needed > in_regions_needed &&
728 resv->region_cache_count <
729 resv->adds_in_progress +
730 (actual_regions_needed - in_regions_needed)) {
731 /* region_add operation of range 1 should never need to
732 * allocate file_region entries.
733 */
734 VM_BUG_ON(t - f <= 1);
5e911373 735
0db9d74e
MA
736 if (allocate_file_region_entries(
737 resv, actual_regions_needed - in_regions_needed)) {
738 return -ENOMEM;
739 }
5e911373 740
0db9d74e 741 goto retry;
5e911373
MK
742 }
743
972a3da3 744 add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
0db9d74e
MA
745
746 resv->adds_in_progress -= in_regions_needed;
cf3ad20b 747
7b24d861 748 spin_unlock(&resv->lock);
cf3ad20b 749 return add;
96822904
AW
750}
751
1dd308a7
MK
752/*
753 * Examine the existing reserve map and determine how many
754 * huge pages in the specified range [f, t) are NOT currently
755 * represented. This routine is called before a subsequent
756 * call to region_add that will actually modify the reserve
757 * map to add the specified range [f, t). region_chg does
758 * not change the number of huge pages represented by the
0db9d74e
MA
759 * map. A number of new file_region structures is added to the cache as a
760 * placeholder, for the subsequent region_add call to use. At least 1
761 * file_region structure is added.
762 *
763 * out_regions_needed is the number of regions added to the
764 * resv->adds_in_progress. This value needs to be provided to a follow up call
765 * to region_add or region_abort for proper accounting.
5e911373
MK
766 *
767 * Returns the number of huge pages that need to be added to the existing
768 * reservation map for the range [f, t). This number is greater or equal to
769 * zero. -ENOMEM is returned if a new file_region structure or cache entry
770 * is needed and can not be allocated.
1dd308a7 771 */
0db9d74e
MA
772static long region_chg(struct resv_map *resv, long f, long t,
773 long *out_regions_needed)
96822904 774{
96822904
AW
775 long chg = 0;
776
7b24d861 777 spin_lock(&resv->lock);
5e911373 778
972a3da3 779 /* Count how many hugepages in this range are NOT represented. */
075a61d0 780 chg = add_reservation_in_range(resv, f, t, NULL, NULL,
972a3da3 781 out_regions_needed);
5e911373 782
0db9d74e
MA
783 if (*out_regions_needed == 0)
784 *out_regions_needed = 1;
5e911373 785
0db9d74e
MA
786 if (allocate_file_region_entries(resv, *out_regions_needed))
787 return -ENOMEM;
5e911373 788
0db9d74e 789 resv->adds_in_progress += *out_regions_needed;
7b24d861 790
7b24d861 791 spin_unlock(&resv->lock);
96822904
AW
792 return chg;
793}
794
5e911373
MK
795/*
796 * Abort the in progress add operation. The adds_in_progress field
797 * of the resv_map keeps track of the operations in progress between
798 * calls to region_chg and region_add. Operations are sometimes
799 * aborted after the call to region_chg. In such cases, region_abort
0db9d74e
MA
800 * is called to decrement the adds_in_progress counter. regions_needed
801 * is the value returned by the region_chg call, it is used to decrement
802 * the adds_in_progress counter.
5e911373
MK
803 *
804 * NOTE: The range arguments [f, t) are not needed or used in this
805 * routine. They are kept to make reading the calling code easier as
806 * arguments will match the associated region_chg call.
807 */
0db9d74e
MA
808static void region_abort(struct resv_map *resv, long f, long t,
809 long regions_needed)
5e911373
MK
810{
811 spin_lock(&resv->lock);
812 VM_BUG_ON(!resv->region_cache_count);
0db9d74e 813 resv->adds_in_progress -= regions_needed;
5e911373
MK
814 spin_unlock(&resv->lock);
815}
816
1dd308a7 817/*
feba16e2
MK
818 * Delete the specified range [f, t) from the reserve map. If the
819 * t parameter is LONG_MAX, this indicates that ALL regions after f
820 * should be deleted. Locate the regions which intersect [f, t)
821 * and either trim, delete or split the existing regions.
822 *
823 * Returns the number of huge pages deleted from the reserve map.
824 * In the normal case, the return value is zero or more. In the
825 * case where a region must be split, a new region descriptor must
826 * be allocated. If the allocation fails, -ENOMEM will be returned.
827 * NOTE: If the parameter t == LONG_MAX, then we will never split
828 * a region and possibly return -ENOMEM. Callers specifying
829 * t == LONG_MAX do not need to check for -ENOMEM error.
1dd308a7 830 */
feba16e2 831static long region_del(struct resv_map *resv, long f, long t)
96822904 832{
1406ec9b 833 struct list_head *head = &resv->regions;
96822904 834 struct file_region *rg, *trg;
feba16e2
MK
835 struct file_region *nrg = NULL;
836 long del = 0;
96822904 837
feba16e2 838retry:
7b24d861 839 spin_lock(&resv->lock);
feba16e2 840 list_for_each_entry_safe(rg, trg, head, link) {
dbe409e4
MK
841 /*
842 * Skip regions before the range to be deleted. file_region
843 * ranges are normally of the form [from, to). However, there
844 * may be a "placeholder" entry in the map which is of the form
845 * (from, to) with from == to. Check for placeholder entries
846 * at the beginning of the range to be deleted.
847 */
848 if (rg->to <= f && (rg->to != rg->from || rg->to != f))
feba16e2 849 continue;
dbe409e4 850
feba16e2 851 if (rg->from >= t)
96822904 852 break;
96822904 853
feba16e2
MK
854 if (f > rg->from && t < rg->to) { /* Must split region */
855 /*
856 * Check for an entry in the cache before dropping
857 * lock and attempting allocation.
858 */
859 if (!nrg &&
860 resv->region_cache_count > resv->adds_in_progress) {
861 nrg = list_first_entry(&resv->region_cache,
862 struct file_region,
863 link);
864 list_del(&nrg->link);
865 resv->region_cache_count--;
866 }
96822904 867
feba16e2
MK
868 if (!nrg) {
869 spin_unlock(&resv->lock);
870 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
871 if (!nrg)
872 return -ENOMEM;
873 goto retry;
874 }
875
876 del += t - f;
79aa925b 877 hugetlb_cgroup_uncharge_file_region(
d85aecf2 878 resv, rg, t - f, false);
feba16e2
MK
879
880 /* New entry for end of split region */
881 nrg->from = t;
882 nrg->to = rg->to;
075a61d0
MA
883
884 copy_hugetlb_cgroup_uncharge_info(nrg, rg);
885
feba16e2
MK
886 INIT_LIST_HEAD(&nrg->link);
887
888 /* Original entry is trimmed */
889 rg->to = f;
890
891 list_add(&nrg->link, &rg->link);
892 nrg = NULL;
96822904 893 break;
feba16e2
MK
894 }
895
896 if (f <= rg->from && t >= rg->to) { /* Remove entire region */
897 del += rg->to - rg->from;
075a61d0 898 hugetlb_cgroup_uncharge_file_region(resv, rg,
d85aecf2 899 rg->to - rg->from, true);
feba16e2
MK
900 list_del(&rg->link);
901 kfree(rg);
902 continue;
903 }
904
905 if (f <= rg->from) { /* Trim beginning of region */
075a61d0 906 hugetlb_cgroup_uncharge_file_region(resv, rg,
d85aecf2 907 t - rg->from, false);
075a61d0 908
79aa925b
MK
909 del += t - rg->from;
910 rg->from = t;
911 } else { /* Trim end of region */
075a61d0 912 hugetlb_cgroup_uncharge_file_region(resv, rg,
d85aecf2 913 rg->to - f, false);
79aa925b
MK
914
915 del += rg->to - f;
916 rg->to = f;
feba16e2 917 }
96822904 918 }
7b24d861 919
7b24d861 920 spin_unlock(&resv->lock);
feba16e2
MK
921 kfree(nrg);
922 return del;
96822904
AW
923}
924
b5cec28d
MK
925/*
926 * A rare out of memory error was encountered which prevented removal of
927 * the reserve map region for a page. The huge page itself was free'ed
928 * and removed from the page cache. This routine will adjust the subpool
929 * usage count, and the global reserve count if needed. By incrementing
930 * these counts, the reserve map entry which could not be deleted will
931 * appear as a "reserved" entry instead of simply dangling with incorrect
932 * counts.
933 */
72e2936c 934void hugetlb_fix_reserve_counts(struct inode *inode)
b5cec28d
MK
935{
936 struct hugepage_subpool *spool = subpool_inode(inode);
937 long rsv_adjust;
da56388c 938 bool reserved = false;
b5cec28d
MK
939
940 rsv_adjust = hugepage_subpool_get_pages(spool, 1);
da56388c 941 if (rsv_adjust > 0) {
b5cec28d
MK
942 struct hstate *h = hstate_inode(inode);
943
da56388c
ML
944 if (!hugetlb_acct_memory(h, 1))
945 reserved = true;
946 } else if (!rsv_adjust) {
947 reserved = true;
b5cec28d 948 }
da56388c
ML
949
950 if (!reserved)
951 pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
b5cec28d
MK
952}
953
1dd308a7
MK
954/*
955 * Count and return the number of huge pages in the reserve map
956 * that intersect with the range [f, t).
957 */
1406ec9b 958static long region_count(struct resv_map *resv, long f, long t)
84afd99b 959{
1406ec9b 960 struct list_head *head = &resv->regions;
84afd99b
AW
961 struct file_region *rg;
962 long chg = 0;
963
7b24d861 964 spin_lock(&resv->lock);
84afd99b
AW
965 /* Locate each segment we overlap with, and count that overlap. */
966 list_for_each_entry(rg, head, link) {
f2135a4a
WSH
967 long seg_from;
968 long seg_to;
84afd99b
AW
969
970 if (rg->to <= f)
971 continue;
972 if (rg->from >= t)
973 break;
974
975 seg_from = max(rg->from, f);
976 seg_to = min(rg->to, t);
977
978 chg += seg_to - seg_from;
979 }
7b24d861 980 spin_unlock(&resv->lock);
84afd99b
AW
981
982 return chg;
983}
984
e7c4b0bf
AW
985/*
986 * Convert the address within this vma to the page offset within
a08c7193 987 * the mapping, huge page units here.
e7c4b0bf 988 */
a5516438
AK
989static pgoff_t vma_hugecache_offset(struct hstate *h,
990 struct vm_area_struct *vma, unsigned long address)
e7c4b0bf 991{
a5516438
AK
992 return ((address - vma->vm_start) >> huge_page_shift(h)) +
993 (vma->vm_pgoff >> huge_page_order(h));
e7c4b0bf
AW
994}
995
8cfd014e
MWO
996/**
997 * vma_kernel_pagesize - Page size granularity for this VMA.
998 * @vma: The user mapping.
999 *
1000 * Folios in this VMA will be aligned to, and at least the size of the
1001 * number of bytes returned by this function.
1002 *
1003 * Return: The default size of the folios allocated when backing a VMA.
08fba699
MG
1004 */
1005unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
1006{
05ea8860
DW
1007 if (vma->vm_ops && vma->vm_ops->pagesize)
1008 return vma->vm_ops->pagesize(vma);
1009 return PAGE_SIZE;
08fba699 1010}
f340ca0f 1011EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
08fba699 1012
3340289d
MG
1013/*
1014 * Return the page size being used by the MMU to back a VMA. In the majority
1015 * of cases, the page size used by the kernel matches the MMU size. On
09135cc5
DW
1016 * architectures where it differs, an architecture-specific 'strong'
1017 * version of this symbol is required.
3340289d 1018 */
09135cc5 1019__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
3340289d
MG
1020{
1021 return vma_kernel_pagesize(vma);
1022}
3340289d 1023
84afd99b
AW
1024/*
1025 * Flags for MAP_PRIVATE reservations. These are stored in the bottom
1026 * bits of the reservation map pointer, which are always clear due to
1027 * alignment.
1028 */
1029#define HPAGE_RESV_OWNER (1UL << 0)
1030#define HPAGE_RESV_UNMAPPED (1UL << 1)
04f2cbe3 1031#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
84afd99b 1032
a1e78772
MG
1033/*
1034 * These helpers are used to track how many pages are reserved for
1035 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
1036 * is guaranteed to have their future faults succeed.
1037 *
8d9bfb26 1038 * With the exception of hugetlb_dup_vma_private() which is called at fork(),
a1e78772
MG
1039 * the reserve counters are updated with the hugetlb_lock held. It is safe
1040 * to reset the VMA at fork() time as it is not in use yet and there is no
1041 * chance of the global counters getting corrupted as a result of the values.
84afd99b
AW
1042 *
1043 * The private mapping reservation is represented in a subtly different
1044 * manner to a shared mapping. A shared mapping has a region map associated
1045 * with the underlying file, this region map represents the backing file
1046 * pages which have ever had a reservation assigned which this persists even
1047 * after the page is instantiated. A private mapping has a region map
1048 * associated with the original mmap which is attached to all VMAs which
1049 * reference it, this region map represents those offsets which have consumed
1050 * reservation ie. where pages have been instantiated.
a1e78772 1051 */
e7c4b0bf
AW
1052static unsigned long get_vma_private_data(struct vm_area_struct *vma)
1053{
1054 return (unsigned long)vma->vm_private_data;
1055}
1056
1057static void set_vma_private_data(struct vm_area_struct *vma,
1058 unsigned long value)
1059{
1060 vma->vm_private_data = (void *)value;
1061}
1062
e9fe92ae
MA
1063static void
1064resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
1065 struct hugetlb_cgroup *h_cg,
1066 struct hstate *h)
1067{
1068#ifdef CONFIG_CGROUP_HUGETLB
1069 if (!h_cg || !h) {
1070 resv_map->reservation_counter = NULL;
1071 resv_map->pages_per_hpage = 0;
1072 resv_map->css = NULL;
1073 } else {
1074 resv_map->reservation_counter =
1075 &h_cg->rsvd_hugepage[hstate_index(h)];
1076 resv_map->pages_per_hpage = pages_per_huge_page(h);
1077 resv_map->css = &h_cg->css;
1078 }
1079#endif
1080}
1081
9119a41e 1082struct resv_map *resv_map_alloc(void)
84afd99b
AW
1083{
1084 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
5e911373
MK
1085 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
1086
1087 if (!resv_map || !rg) {
1088 kfree(resv_map);
1089 kfree(rg);
84afd99b 1090 return NULL;
5e911373 1091 }
84afd99b
AW
1092
1093 kref_init(&resv_map->refs);
7b24d861 1094 spin_lock_init(&resv_map->lock);
84afd99b 1095 INIT_LIST_HEAD(&resv_map->regions);
bf491692 1096 init_rwsem(&resv_map->rw_sema);
84afd99b 1097
5e911373 1098 resv_map->adds_in_progress = 0;
e9fe92ae
MA
1099 /*
1100 * Initialize these to 0. On shared mappings, 0's here indicate these
1101 * fields don't do cgroup accounting. On private mappings, these will be
1102 * re-initialized to the proper values, to indicate that hugetlb cgroup
1103 * reservations are to be un-charged from here.
1104 */
1105 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
5e911373
MK
1106
1107 INIT_LIST_HEAD(&resv_map->region_cache);
1108 list_add(&rg->link, &resv_map->region_cache);
1109 resv_map->region_cache_count = 1;
1110
84afd99b
AW
1111 return resv_map;
1112}
1113
9119a41e 1114void resv_map_release(struct kref *ref)
84afd99b
AW
1115{
1116 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
5e911373
MK
1117 struct list_head *head = &resv_map->region_cache;
1118 struct file_region *rg, *trg;
84afd99b
AW
1119
1120 /* Clear out any active regions before we release the map. */
feba16e2 1121 region_del(resv_map, 0, LONG_MAX);
5e911373
MK
1122
1123 /* ... and any entries left in the cache */
1124 list_for_each_entry_safe(rg, trg, head, link) {
1125 list_del(&rg->link);
1126 kfree(rg);
1127 }
1128
1129 VM_BUG_ON(resv_map->adds_in_progress);
1130
84afd99b
AW
1131 kfree(resv_map);
1132}
1133
4e35f483
JK
1134static inline struct resv_map *inode_resv_map(struct inode *inode)
1135{
f27a5136
MK
1136 /*
1137 * At inode evict time, i_mapping may not point to the original
1138 * address space within the inode. This original address space
1139 * contains the pointer to the resv_map. So, always use the
1140 * address space embedded within the inode.
1141 * The VERY common case is inode->mapping == &inode->i_data but,
1142 * this may not be true for device special inodes.
1143 */
1144 return (struct resv_map *)(&inode->i_data)->private_data;
4e35f483
JK
1145}
1146
84afd99b 1147static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
a1e78772 1148{
81d1b09c 1149 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
4e35f483
JK
1150 if (vma->vm_flags & VM_MAYSHARE) {
1151 struct address_space *mapping = vma->vm_file->f_mapping;
1152 struct inode *inode = mapping->host;
1153
1154 return inode_resv_map(inode);
1155
1156 } else {
84afd99b
AW
1157 return (struct resv_map *)(get_vma_private_data(vma) &
1158 ~HPAGE_RESV_MASK);
4e35f483 1159 }
a1e78772
MG
1160}
1161
84afd99b 1162static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
a1e78772 1163{
81d1b09c
SL
1164 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1165 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
a1e78772 1166
92fe9dcb 1167 set_vma_private_data(vma, (unsigned long)map);
04f2cbe3
MG
1168}
1169
1170static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
1171{
81d1b09c
SL
1172 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1173 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
e7c4b0bf
AW
1174
1175 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
04f2cbe3
MG
1176}
1177
1178static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
1179{
81d1b09c 1180 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
e7c4b0bf
AW
1181
1182 return (get_vma_private_data(vma) & flag) != 0;
a1e78772
MG
1183}
1184
8d9bfb26 1185void hugetlb_dup_vma_private(struct vm_area_struct *vma)
a1e78772 1186{
81d1b09c 1187 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
8d9bfb26
MK
1188 /*
1189 * Clear vm_private_data
612b8a31
MK
1190 * - For shared mappings this is a per-vma semaphore that may be
1191 * allocated in a subsequent call to hugetlb_vm_op_open.
1192 * Before clearing, make sure pointer is not associated with vma
1193 * as this will leak the structure. This is the case when called
1194 * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already
1195 * been called to allocate a new structure.
8d9bfb26
MK
1196 * - For MAP_PRIVATE mappings, this is the reserve map which does
1197 * not apply to children. Faults generated by the children are
1198 * not guaranteed to succeed, even if read-only.
8d9bfb26 1199 */
612b8a31
MK
1200 if (vma->vm_flags & VM_MAYSHARE) {
1201 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
1202
1203 if (vma_lock && vma_lock->vma != vma)
1204 vma->vm_private_data = NULL;
1205 } else
1206 vma->vm_private_data = NULL;
a1e78772
MG
1207}
1208
550a7d60
MA
1209/*
1210 * Reset and decrement one ref on hugepage private reservation.
8651a137 1211 * Called with mm->mmap_lock writer semaphore held.
550a7d60
MA
1212 * This function should be only used by move_vma() and operate on
1213 * same sized vma. It should never come here with last ref on the
1214 * reservation.
1215 */
1216void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
1217{
1218 /*
1219 * Clear the old hugetlb private page reservation.
1220 * It has already been transferred to new_vma.
1221 *
1222 * During a mremap() operation of a hugetlb vma we call move_vma()
1223 * which copies vma into new_vma and unmaps vma. After the copy
1224 * operation both new_vma and vma share a reference to the resv_map
1225 * struct, and at that point vma is about to be unmapped. We don't
1226 * want to return the reservation to the pool at unmap of vma because
1227 * the reservation still lives on in new_vma, so simply decrement the
1228 * ref here and remove the resv_map reference from this vma.
1229 */
1230 struct resv_map *reservations = vma_resv_map(vma);
1231
afe041c2
BQM
1232 if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1233 resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
550a7d60 1234 kref_put(&reservations->refs, resv_map_release);
afe041c2 1235 }
550a7d60 1236
8d9bfb26 1237 hugetlb_dup_vma_private(vma);
550a7d60
MA
1238}
1239
a1e78772 1240/* Returns true if the VMA has associated reserve pages */
559ec2f8 1241static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
a1e78772 1242{
af0ed73e
JK
1243 if (vma->vm_flags & VM_NORESERVE) {
1244 /*
1245 * This address is already reserved by other process(chg == 0),
1246 * so, we should decrement reserved count. Without decrementing,
1247 * reserve count remains after releasing inode, because this
1248 * allocated page will go into page cache and is regarded as
1249 * coming from reserved pool in releasing step. Currently, we
1250 * don't have any other solution to deal with this situation
1251 * properly, so add work-around here.
1252 */
1253 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
559ec2f8 1254 return true;
af0ed73e 1255 else
559ec2f8 1256 return false;
af0ed73e 1257 }
a63884e9
JK
1258
1259 /* Shared mappings always use reserves */
1fb1b0e9
MK
1260 if (vma->vm_flags & VM_MAYSHARE) {
1261 /*
1262 * We know VM_NORESERVE is not set. Therefore, there SHOULD
1263 * be a region map for all pages. The only situation where
1264 * there is no region map is if a hole was punched via
7c8de358 1265 * fallocate. In this case, there really are no reserves to
1fb1b0e9
MK
1266 * use. This situation is indicated if chg != 0.
1267 */
1268 if (chg)
1269 return false;
1270 else
1271 return true;
1272 }
a63884e9
JK
1273
1274 /*
1275 * Only the process that called mmap() has reserves for
1276 * private mappings.
1277 */
67961f9d
MK
1278 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1279 /*
1280 * Like the shared case above, a hole punch or truncate
1281 * could have been performed on the private mapping.
1282 * Examine the value of chg to determine if reserves
1283 * actually exist or were previously consumed.
1284 * Very Subtle - The value of chg comes from a previous
1285 * call to vma_needs_reserves(). The reserve map for
1286 * private mappings has different (opposite) semantics
1287 * than that of shared mappings. vma_needs_reserves()
1288 * has already taken this difference in semantics into
1289 * account. Therefore, the meaning of chg is the same
1290 * as in the shared case above. Code could easily be
1291 * combined, but keeping it separate draws attention to
1292 * subtle differences.
1293 */
1294 if (chg)
1295 return false;
1296 else
1297 return true;
1298 }
a63884e9 1299
559ec2f8 1300 return false;
a1e78772
MG
1301}
1302
240d67a8 1303static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
1da177e4 1304{
240d67a8 1305 int nid = folio_nid(folio);
9487ca60
MK
1306
1307 lockdep_assert_held(&hugetlb_lock);
240d67a8 1308 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
b65a4eda 1309
240d67a8 1310 list_move(&folio->lru, &h->hugepage_freelists[nid]);
a5516438
AK
1311 h->free_huge_pages++;
1312 h->free_huge_pages_node[nid]++;
240d67a8 1313 folio_set_hugetlb_freed(folio);
1da177e4
LT
1314}
1315
a36f1e90
SK
1316static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
1317 int nid)
bf50bab2 1318{
a36f1e90 1319 struct folio *folio;
1a08ae36 1320 bool pin = !!(current->flags & PF_MEMALLOC_PIN);
bbe88753 1321
9487ca60 1322 lockdep_assert_held(&hugetlb_lock);
a36f1e90
SK
1323 list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) {
1324 if (pin && !folio_is_longterm_pinnable(folio))
bbe88753 1325 continue;
bf50bab2 1326
a36f1e90 1327 if (folio_test_hwpoison(folio))
6664bfc8
WY
1328 continue;
1329
a36f1e90
SK
1330 list_move(&folio->lru, &h->hugepage_activelist);
1331 folio_ref_unfreeze(folio, 1);
1332 folio_clear_hugetlb_freed(folio);
6664bfc8
WY
1333 h->free_huge_pages--;
1334 h->free_huge_pages_node[nid]--;
a36f1e90 1335 return folio;
bbe88753
JK
1336 }
1337
6664bfc8 1338 return NULL;
bf50bab2
NH
1339}
1340
a36f1e90
SK
1341static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask,
1342 int nid, nodemask_t *nmask)
94310cbc 1343{
3e59fcb0
MH
1344 unsigned int cpuset_mems_cookie;
1345 struct zonelist *zonelist;
1346 struct zone *zone;
1347 struct zoneref *z;
98fa15f3 1348 int node = NUMA_NO_NODE;
94310cbc 1349
3e59fcb0
MH
1350 zonelist = node_zonelist(nid, gfp_mask);
1351
1352retry_cpuset:
1353 cpuset_mems_cookie = read_mems_allowed_begin();
1354 for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
a36f1e90 1355 struct folio *folio;
3e59fcb0
MH
1356
1357 if (!cpuset_zone_allowed(zone, gfp_mask))
1358 continue;
1359 /*
1360 * no need to ask again on the same node. Pool is node rather than
1361 * zone aware
1362 */
1363 if (zone_to_nid(zone) == node)
1364 continue;
1365 node = zone_to_nid(zone);
94310cbc 1366
a36f1e90
SK
1367 folio = dequeue_hugetlb_folio_node_exact(h, node);
1368 if (folio)
1369 return folio;
94310cbc 1370 }
3e59fcb0
MH
1371 if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
1372 goto retry_cpuset;
1373
94310cbc
AK
1374 return NULL;
1375}
1376
8346d69d
XH
1377static unsigned long available_huge_pages(struct hstate *h)
1378{
1379 return h->free_huge_pages - h->resv_huge_pages;
1380}
1381
ff7d853b 1382static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
a5516438 1383 struct vm_area_struct *vma,
af0ed73e
JK
1384 unsigned long address, int avoid_reserve,
1385 long chg)
1da177e4 1386{
a36f1e90 1387 struct folio *folio = NULL;
480eccf9 1388 struct mempolicy *mpol;
04ec6264 1389 gfp_t gfp_mask;
3e59fcb0 1390 nodemask_t *nodemask;
04ec6264 1391 int nid;
1da177e4 1392
a1e78772
MG
1393 /*
1394 * A child process with MAP_PRIVATE mappings created by their parent
1395 * have no page reserves. This check ensures that reservations are
1396 * not "stolen". The child may still get SIGKILLed
1397 */
8346d69d 1398 if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
c0ff7453 1399 goto err;
a1e78772 1400
04f2cbe3 1401 /* If reserves cannot be used, ensure enough pages are in the pool */
8346d69d 1402 if (avoid_reserve && !available_huge_pages(h))
6eab04a8 1403 goto err;
04f2cbe3 1404
04ec6264
VB
1405 gfp_mask = htlb_alloc_mask(h);
1406 nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
cfcaa66f
BW
1407
1408 if (mpol_is_preferred_many(mpol)) {
a36f1e90
SK
1409 folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
1410 nid, nodemask);
cfcaa66f
BW
1411
1412 /* Fallback to all nodes if page==NULL */
1413 nodemask = NULL;
1414 }
1415
a36f1e90
SK
1416 if (!folio)
1417 folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
1418 nid, nodemask);
cfcaa66f 1419
a36f1e90
SK
1420 if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) {
1421 folio_set_hugetlb_restore_reserve(folio);
3e59fcb0 1422 h->resv_huge_pages--;
1da177e4 1423 }
cc9a6c87 1424
52cd3b07 1425 mpol_cond_put(mpol);
ff7d853b 1426 return folio;
cc9a6c87
MG
1427
1428err:
cc9a6c87 1429 return NULL;
1da177e4
LT
1430}
1431
1cac6f2c
LC
1432/*
1433 * common helper functions for hstate_next_node_to_{alloc|free}.
1434 * We may have allocated or freed a huge page based on a different
1435 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
1436 * be outside of *nodes_allowed. Ensure that we use an allowed
1437 * node for alloc or free.
1438 */
1439static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
1440{
0edaf86c 1441 nid = next_node_in(nid, *nodes_allowed);
1cac6f2c
LC
1442 VM_BUG_ON(nid >= MAX_NUMNODES);
1443
1444 return nid;
1445}
1446
1447static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
1448{
1449 if (!node_isset(nid, *nodes_allowed))
1450 nid = next_node_allowed(nid, nodes_allowed);
1451 return nid;
1452}
1453
1454/*
1455 * returns the previously saved node ["this node"] from which to
1456 * allocate a persistent huge page for the pool and advance the
1457 * next node from which to allocate, handling wrap at end of node
1458 * mask.
1459 */
1460static int hstate_next_node_to_alloc(struct hstate *h,
1461 nodemask_t *nodes_allowed)
1462{
1463 int nid;
1464
1465 VM_BUG_ON(!nodes_allowed);
1466
1467 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
1468 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
1469
1470 return nid;
1471}
1472
1473/*
d5b43e96 1474 * helper for remove_pool_hugetlb_folio() - return the previously saved
1cac6f2c
LC
1475 * node ["this node"] from which to free a huge page. Advance the
1476 * next node id whether or not we find a free huge page to free so
1477 * that the next attempt to free addresses the next node.
1478 */
1479static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
1480{
1481 int nid;
1482
1483 VM_BUG_ON(!nodes_allowed);
1484
1485 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
1486 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
1487
1488 return nid;
1489}
1490
1491#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
1492 for (nr_nodes = nodes_weight(*mask); \
1493 nr_nodes > 0 && \
1494 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
1495 nr_nodes--)
1496
1497#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
1498 for (nr_nodes = nodes_weight(*mask); \
1499 nr_nodes > 0 && \
1500 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
1501 nr_nodes--)
1502
8531fc6f 1503/* used to demote non-gigantic_huge pages as well */
911565b8 1504static void __destroy_compound_gigantic_folio(struct folio *folio,
34d9e35b 1505 unsigned int order, bool demote)
944d9fec
LC
1506{
1507 int i;
1508 int nr_pages = 1 << order;
14455eab 1509 struct page *p;
944d9fec 1510
46f27228 1511 atomic_set(&folio->_entire_mapcount, 0);
eec20426 1512 atomic_set(&folio->_nr_pages_mapped, 0);
94688e8e 1513 atomic_set(&folio->_pincount, 0);
47e29d32 1514
14455eab 1515 for (i = 1; i < nr_pages; i++) {
911565b8 1516 p = folio_page(folio, i);
6c141973 1517 p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE;
a01f4390 1518 p->mapping = NULL;
1d798ca3 1519 clear_compound_head(p);
34d9e35b
MK
1520 if (!demote)
1521 set_page_refcounted(p);
944d9fec
LC
1522 }
1523
911565b8 1524 __folio_clear_head(folio);
944d9fec
LC
1525}
1526
911565b8 1527static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio,
8531fc6f
MK
1528 unsigned int order)
1529{
911565b8 1530 __destroy_compound_gigantic_folio(folio, order, true);
8531fc6f
MK
1531}
1532
1533#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
911565b8 1534static void destroy_compound_gigantic_folio(struct folio *folio,
34d9e35b
MK
1535 unsigned int order)
1536{
911565b8 1537 __destroy_compound_gigantic_folio(folio, order, false);
34d9e35b
MK
1538}
1539
7f325a8d 1540static void free_gigantic_folio(struct folio *folio, unsigned int order)
944d9fec 1541{
cf11e85f
RG
1542 /*
1543 * If the page isn't allocated using the cma allocator,
1544 * cma_release() returns false.
1545 */
dbda8fea 1546#ifdef CONFIG_CMA
7f325a8d
SK
1547 int nid = folio_nid(folio);
1548
1549 if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order))
cf11e85f 1550 return;
dbda8fea 1551#endif
cf11e85f 1552
7f325a8d 1553 free_contig_range(folio_pfn(folio), 1 << order);
944d9fec
LC
1554}
1555
4eb0716e 1556#ifdef CONFIG_CONTIG_ALLOC
19fc1a7e 1557static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
d9cc948f 1558 int nid, nodemask_t *nodemask)
944d9fec 1559{
19fc1a7e 1560 struct page *page;
04adbc3f 1561 unsigned long nr_pages = pages_per_huge_page(h);
953f064a
LX
1562 if (nid == NUMA_NO_NODE)
1563 nid = numa_mem_id();
944d9fec 1564
dbda8fea
BS
1565#ifdef CONFIG_CMA
1566 {
cf11e85f
RG
1567 int node;
1568
953f064a
LX
1569 if (hugetlb_cma[nid]) {
1570 page = cma_alloc(hugetlb_cma[nid], nr_pages,
1571 huge_page_order(h), true);
cf11e85f 1572 if (page)
19fc1a7e 1573 return page_folio(page);
cf11e85f 1574 }
953f064a
LX
1575
1576 if (!(gfp_mask & __GFP_THISNODE)) {
1577 for_each_node_mask(node, *nodemask) {
1578 if (node == nid || !hugetlb_cma[node])
1579 continue;
1580
1581 page = cma_alloc(hugetlb_cma[node], nr_pages,
1582 huge_page_order(h), true);
1583 if (page)
19fc1a7e 1584 return page_folio(page);
953f064a
LX
1585 }
1586 }
cf11e85f 1587 }
dbda8fea 1588#endif
cf11e85f 1589
19fc1a7e
SK
1590 page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
1591 return page ? page_folio(page) : NULL;
944d9fec
LC
1592}
1593
4eb0716e 1594#else /* !CONFIG_CONTIG_ALLOC */
19fc1a7e 1595static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
4eb0716e
AG
1596 int nid, nodemask_t *nodemask)
1597{
1598 return NULL;
1599}
1600#endif /* CONFIG_CONTIG_ALLOC */
944d9fec 1601
e1073d1e 1602#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
19fc1a7e 1603static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
4eb0716e
AG
1604 int nid, nodemask_t *nodemask)
1605{
1606 return NULL;
1607}
7f325a8d
SK
1608static inline void free_gigantic_folio(struct folio *folio,
1609 unsigned int order) { }
911565b8 1610static inline void destroy_compound_gigantic_folio(struct folio *folio,
d00181b9 1611 unsigned int order) { }
944d9fec
LC
1612#endif
1613
32c87719
MK
1614static inline void __clear_hugetlb_destructor(struct hstate *h,
1615 struct folio *folio)
1616{
1617 lockdep_assert_held(&hugetlb_lock);
1618
9c5ccf2d 1619 folio_clear_hugetlb(folio);
32c87719
MK
1620}
1621
6eb4e88a 1622/*
32c87719
MK
1623 * Remove hugetlb folio from lists.
1624 * If vmemmap exists for the folio, update dtor so that the folio appears
1625 * as just a compound page. Otherwise, wait until after allocating vmemmap
1626 * to update dtor.
34d9e35b 1627 *
cfd5082b 1628 * A reference is held on the folio, except in the case of demote.
6eb4e88a
MK
1629 *
1630 * Must be called with hugetlb lock held.
1631 */
cfd5082b 1632static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
34d9e35b
MK
1633 bool adjust_surplus,
1634 bool demote)
6eb4e88a 1635{
cfd5082b 1636 int nid = folio_nid(folio);
6eb4e88a 1637
f074732d
SK
1638 VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio);
1639 VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);
6eb4e88a 1640
9487ca60 1641 lockdep_assert_held(&hugetlb_lock);
6eb4e88a
MK
1642 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1643 return;
1644
cfd5082b 1645 list_del(&folio->lru);
6eb4e88a 1646
cfd5082b 1647 if (folio_test_hugetlb_freed(folio)) {
6eb4e88a
MK
1648 h->free_huge_pages--;
1649 h->free_huge_pages_node[nid]--;
1650 }
1651 if (adjust_surplus) {
1652 h->surplus_huge_pages--;
1653 h->surplus_huge_pages_node[nid]--;
1654 }
1655
e32d20c0 1656 /*
32c87719
MK
1657 * We can only clear the hugetlb destructor after allocating vmemmap
1658 * pages. Otherwise, someone (memory error handling) may try to write
1659 * to tail struct pages.
1660 */
1661 if (!folio_test_hugetlb_vmemmap_optimized(folio))
1662 __clear_hugetlb_destructor(h, folio);
1663
1664 /*
1665 * In the case of demote we do not ref count the page as it will soon
1666 * be turned into a page of smaller size.
e32d20c0 1667 */
34d9e35b 1668 if (!demote)
cfd5082b 1669 folio_ref_unfreeze(folio, 1);
6eb4e88a
MK
1670
1671 h->nr_huge_pages--;
1672 h->nr_huge_pages_node[nid]--;
1673}
1674
cfd5082b 1675static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
34d9e35b
MK
1676 bool adjust_surplus)
1677{
cfd5082b 1678 __remove_hugetlb_folio(h, folio, adjust_surplus, false);
34d9e35b
MK
1679}
1680
cfd5082b 1681static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio,
8531fc6f
MK
1682 bool adjust_surplus)
1683{
cfd5082b 1684 __remove_hugetlb_folio(h, folio, adjust_surplus, true);
8531fc6f
MK
1685}
1686
2f6c57d6 1687static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
ad2fa371
MS
1688 bool adjust_surplus)
1689{
1690 int zeroed;
2f6c57d6 1691 int nid = folio_nid(folio);
ad2fa371 1692
2f6c57d6 1693 VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);
ad2fa371
MS
1694
1695 lockdep_assert_held(&hugetlb_lock);
1696
2f6c57d6 1697 INIT_LIST_HEAD(&folio->lru);
ad2fa371
MS
1698 h->nr_huge_pages++;
1699 h->nr_huge_pages_node[nid]++;
1700
1701 if (adjust_surplus) {
1702 h->surplus_huge_pages++;
1703 h->surplus_huge_pages_node[nid]++;
1704 }
1705
9c5ccf2d 1706 folio_set_hugetlb(folio);
2f6c57d6 1707 folio_change_private(folio, NULL);
a9e1eab2 1708 /*
2f6c57d6
SK
1709 * We have to set hugetlb_vmemmap_optimized again as above
1710 * folio_change_private(folio, NULL) cleared it.
a9e1eab2 1711 */
2f6c57d6 1712 folio_set_hugetlb_vmemmap_optimized(folio);
ad2fa371
MS
1713
1714 /*
2f6c57d6 1715 * This folio is about to be managed by the hugetlb allocator and
b65a4eda
MK
1716 * should have no users. Drop our reference, and check for others
1717 * just in case.
ad2fa371 1718 */
2f6c57d6
SK
1719 zeroed = folio_put_testzero(folio);
1720 if (unlikely(!zeroed))
b65a4eda 1721 /*
454a00c4
MWO
1722 * It is VERY unlikely soneone else has taken a ref
1723 * on the folio. In this case, we simply return as
1724 * free_huge_folio() will be called when this other ref
1725 * is dropped.
b65a4eda
MK
1726 */
1727 return;
1728
2f6c57d6 1729 arch_clear_hugepage_flags(&folio->page);
240d67a8 1730 enqueue_hugetlb_folio(h, folio);
ad2fa371
MS
1731}
1732
6f6956cf
SK
1733static void __update_and_free_hugetlb_folio(struct hstate *h,
1734 struct folio *folio)
6af2acb6 1735{
32c87719 1736 bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
a5516438 1737
4eb0716e 1738 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
944d9fec 1739 return;
18229df5 1740
161df60e
NH
1741 /*
1742 * If we don't know which subpages are hwpoisoned, we can't free
1743 * the hugepage, so it's leaked intentionally.
1744 */
7f325a8d 1745 if (folio_test_hugetlb_raw_hwp_unreliable(folio))
161df60e
NH
1746 return;
1747
d8f5f7e4
MK
1748 /*
1749 * If folio is not vmemmap optimized (!clear_dtor), then the folio
1750 * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore
1751 * can only be passed hugetlb pages and will BUG otherwise.
1752 */
1753 if (clear_dtor && hugetlb_vmemmap_restore(h, &folio->page)) {
ad2fa371
MS
1754 spin_lock_irq(&hugetlb_lock);
1755 /*
1756 * If we cannot allocate vmemmap pages, just refuse to free the
1757 * page and put the page back on the hugetlb free list and treat
1758 * as a surplus page.
1759 */
7f325a8d 1760 add_hugetlb_folio(h, folio, true);
ad2fa371
MS
1761 spin_unlock_irq(&hugetlb_lock);
1762 return;
1763 }
1764
161df60e
NH
1765 /*
1766 * Move PageHWPoison flag from head page to the raw error pages,
1767 * which makes any healthy subpages reusable.
1768 */
911565b8 1769 if (unlikely(folio_test_hwpoison(folio)))
2ff6cece 1770 folio_clear_hugetlb_hwpoison(folio);
161df60e 1771
32c87719
MK
1772 /*
1773 * If vmemmap pages were allocated above, then we need to clear the
1774 * hugetlb destructor under the hugetlb lock.
1775 */
1776 if (clear_dtor) {
1777 spin_lock_irq(&hugetlb_lock);
1778 __clear_hugetlb_destructor(h, folio);
1779 spin_unlock_irq(&hugetlb_lock);
1780 }
1781
a01f4390
MK
1782 /*
1783 * Non-gigantic pages demoted from CMA allocated gigantic pages
7f325a8d 1784 * need to be given back to CMA in free_gigantic_folio.
a01f4390
MK
1785 */
1786 if (hstate_is_gigantic(h) ||
2f6c57d6 1787 hugetlb_cma_folio(folio, huge_page_order(h))) {
911565b8 1788 destroy_compound_gigantic_folio(folio, huge_page_order(h));
7f325a8d 1789 free_gigantic_folio(folio, huge_page_order(h));
944d9fec 1790 } else {
6f6956cf 1791 __free_pages(&folio->page, huge_page_order(h));
944d9fec 1792 }
6af2acb6
AL
1793}
1794
b65d4adb 1795/*
d6ef19e2 1796 * As update_and_free_hugetlb_folio() can be called under any context, so we cannot
b65d4adb
MS
1797 * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
1798 * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
1799 * the vmemmap pages.
1800 *
1801 * free_hpage_workfn() locklessly retrieves the linked list of pages to be
1802 * freed and frees them one-by-one. As the page->mapping pointer is going
1803 * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
1804 * structure of a lockless linked list of huge pages to be freed.
1805 */
1806static LLIST_HEAD(hpage_freelist);
1807
1808static void free_hpage_workfn(struct work_struct *work)
1809{
1810 struct llist_node *node;
1811
1812 node = llist_del_all(&hpage_freelist);
1813
1814 while (node) {
3ec145f9 1815 struct folio *folio;
b65d4adb
MS
1816 struct hstate *h;
1817
3ec145f9
MWO
1818 folio = container_of((struct address_space **)node,
1819 struct folio, mapping);
b65d4adb 1820 node = node->next;
3ec145f9 1821 folio->mapping = NULL;
b65d4adb 1822 /*
affd26b1
SK
1823 * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
1824 * folio_hstate() is going to trigger because a previous call to
9c5ccf2d
MWO
1825 * remove_hugetlb_folio() will clear the hugetlb bit, so do
1826 * not use folio_hstate() directly.
b65d4adb 1827 */
3ec145f9 1828 h = size_to_hstate(folio_size(folio));
b65d4adb 1829
3ec145f9 1830 __update_and_free_hugetlb_folio(h, folio);
b65d4adb
MS
1831
1832 cond_resched();
1833 }
1834}
1835static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
1836
1837static inline void flush_free_hpage_work(struct hstate *h)
1838{
6213834c 1839 if (hugetlb_vmemmap_optimizable(h))
b65d4adb
MS
1840 flush_work(&free_hpage_work);
1841}
1842
d6ef19e2 1843static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
b65d4adb
MS
1844 bool atomic)
1845{
d6ef19e2 1846 if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
6f6956cf 1847 __update_and_free_hugetlb_folio(h, folio);
b65d4adb
MS
1848 return;
1849 }
1850
1851 /*
1852 * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
1853 *
1854 * Only call schedule_work() if hpage_freelist is previously
1855 * empty. Otherwise, schedule_work() had been called but the workfn
1856 * hasn't retrieved the list yet.
1857 */
d6ef19e2 1858 if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
b65d4adb
MS
1859 schedule_work(&free_hpage_work);
1860}
1861
10c6ec49
MK
1862static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
1863{
04bbfd84 1864 struct folio *folio, *t_folio;
d2cf88c2 1865 bool clear_dtor = false;
10c6ec49 1866
d2cf88c2
MK
1867 /*
1868 * First allocate required vmemmmap (if necessary) for all folios on
1869 * list. If vmemmap can not be allocated, we can not free folio to
1870 * lower level allocator, so add back as hugetlb surplus page.
1871 * add_hugetlb_folio() removes the page from THIS list.
1872 * Use clear_dtor to note if vmemmap was successfully allocated for
1873 * ANY page on the list.
1874 */
1875 list_for_each_entry_safe(folio, t_folio, list, lru) {
1876 if (folio_test_hugetlb_vmemmap_optimized(folio)) {
1877 if (hugetlb_vmemmap_restore(h, &folio->page)) {
1878 spin_lock_irq(&hugetlb_lock);
1879 add_hugetlb_folio(h, folio, true);
1880 spin_unlock_irq(&hugetlb_lock);
1881 } else
1882 clear_dtor = true;
1883 }
1884 }
1885
1886 /*
1887 * If vmemmmap allocation was performed on any folio above, take lock
1888 * to clear destructor of all folios on list. This avoids the need to
1889 * lock/unlock for each individual folio.
1890 * The assumption is vmemmap allocation was performed on all or none
1891 * of the folios on the list. This is true expect in VERY rare cases.
1892 */
1893 if (clear_dtor) {
1894 spin_lock_irq(&hugetlb_lock);
1895 list_for_each_entry(folio, list, lru)
1896 __clear_hugetlb_destructor(h, folio);
1897 spin_unlock_irq(&hugetlb_lock);
1898 }
1899
1900 /*
1901 * Free folios back to low level allocators. vmemmap and destructors
1902 * were taken care of above, so update_and_free_hugetlb_folio will
1903 * not need to take hugetlb lock.
1904 */
04bbfd84 1905 list_for_each_entry_safe(folio, t_folio, list, lru) {
d6ef19e2 1906 update_and_free_hugetlb_folio(h, folio, false);
10c6ec49
MK
1907 cond_resched();
1908 }
1909}
1910
e5ff2159
AK
1911struct hstate *size_to_hstate(unsigned long size)
1912{
1913 struct hstate *h;
1914
1915 for_each_hstate(h) {
1916 if (huge_page_size(h) == size)
1917 return h;
1918 }
1919 return NULL;
1920}
1921
454a00c4 1922void free_huge_folio(struct folio *folio)
27a85ef1 1923{
a5516438
AK
1924 /*
1925 * Can't pass hstate in here because it is called from the
1926 * compound page destructor.
1927 */
0356c4b9
SK
1928 struct hstate *h = folio_hstate(folio);
1929 int nid = folio_nid(folio);
1930 struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
07443a85 1931 bool restore_reserve;
db71ef79 1932 unsigned long flags;
27a85ef1 1933
0356c4b9
SK
1934 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
1935 VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);
8ace22bc 1936
0356c4b9
SK
1937 hugetlb_set_folio_subpool(folio, NULL);
1938 if (folio_test_anon(folio))
1939 __ClearPageAnonExclusive(&folio->page);
1940 folio->mapping = NULL;
1941 restore_reserve = folio_test_hugetlb_restore_reserve(folio);
1942 folio_clear_hugetlb_restore_reserve(folio);
27a85ef1 1943
1c5ecae3 1944 /*
d6995da3 1945 * If HPageRestoreReserve was set on page, page allocation consumed a
0919e1b6
MK
1946 * reservation. If the page was associated with a subpool, there
1947 * would have been a page reserved in the subpool before allocation
1948 * via hugepage_subpool_get_pages(). Since we are 'restoring' the
6c26d310 1949 * reservation, do not call hugepage_subpool_put_pages() as this will
0919e1b6 1950 * remove the reserved page from the subpool.
1c5ecae3 1951 */
0919e1b6
MK
1952 if (!restore_reserve) {
1953 /*
1954 * A return code of zero implies that the subpool will be
1955 * under its minimum size if the reservation is not restored
1956 * after page is free. Therefore, force restore_reserve
1957 * operation.
1958 */
1959 if (hugepage_subpool_put_pages(spool, 1) == 0)
1960 restore_reserve = true;
1961 }
1c5ecae3 1962
db71ef79 1963 spin_lock_irqsave(&hugetlb_lock, flags);
0356c4b9 1964 folio_clear_hugetlb_migratable(folio);
d4ab0316
SK
1965 hugetlb_cgroup_uncharge_folio(hstate_index(h),
1966 pages_per_huge_page(h), folio);
1967 hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
1968 pages_per_huge_page(h), folio);
8cba9576 1969 mem_cgroup_uncharge(folio);
07443a85
JK
1970 if (restore_reserve)
1971 h->resv_huge_pages++;
1972
0356c4b9 1973 if (folio_test_hugetlb_temporary(folio)) {
cfd5082b 1974 remove_hugetlb_folio(h, folio, false);
db71ef79 1975 spin_unlock_irqrestore(&hugetlb_lock, flags);
d6ef19e2 1976 update_and_free_hugetlb_folio(h, folio, true);
ab5ac90a 1977 } else if (h->surplus_huge_pages_node[nid]) {
0edaecfa 1978 /* remove the page from active list */
cfd5082b 1979 remove_hugetlb_folio(h, folio, true);
db71ef79 1980 spin_unlock_irqrestore(&hugetlb_lock, flags);
d6ef19e2 1981 update_and_free_hugetlb_folio(h, folio, true);
7893d1d5 1982 } else {
454a00c4 1983 arch_clear_hugepage_flags(&folio->page);
240d67a8 1984 enqueue_hugetlb_folio(h, folio);
db71ef79 1985 spin_unlock_irqrestore(&hugetlb_lock, flags);
c77c0a8a 1986 }
c77c0a8a
WL
1987}
1988
d3d99fcc
OS
1989/*
1990 * Must be called with the hugetlb lock held
1991 */
1992static void __prep_account_new_huge_page(struct hstate *h, int nid)
1993{
1994 lockdep_assert_held(&hugetlb_lock);
1995 h->nr_huge_pages++;
1996 h->nr_huge_pages_node[nid]++;
1997}
1998
de656ed3 1999static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
b7ba30c6 2000{
d8f5f7e4 2001 folio_set_hugetlb(folio);
de656ed3
SK
2002 hugetlb_vmemmap_optimize(h, &folio->page);
2003 INIT_LIST_HEAD(&folio->lru);
de656ed3
SK
2004 hugetlb_set_folio_subpool(folio, NULL);
2005 set_hugetlb_cgroup(folio, NULL);
2006 set_hugetlb_cgroup_rsvd(folio, NULL);
d3d99fcc
OS
2007}
2008
d1c60955 2009static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
d3d99fcc 2010{
de656ed3 2011 __prep_new_hugetlb_folio(h, folio);
db71ef79 2012 spin_lock_irq(&hugetlb_lock);
d3d99fcc 2013 __prep_account_new_huge_page(h, nid);
db71ef79 2014 spin_unlock_irq(&hugetlb_lock);
b7ba30c6
AK
2015}
2016
d1c60955
SK
2017static bool __prep_compound_gigantic_folio(struct folio *folio,
2018 unsigned int order, bool demote)
20a0307c 2019{
7118fc29 2020 int i, j;
20a0307c 2021 int nr_pages = 1 << order;
14455eab 2022 struct page *p;
20a0307c 2023
d1c60955 2024 __folio_clear_reserved(folio);
2b21624f 2025 for (i = 0; i < nr_pages; i++) {
d1c60955 2026 p = folio_page(folio, i);
14455eab 2027
ef5a22be
AA
2028 /*
2029 * For gigantic hugepages allocated through bootmem at
2030 * boot, it's safer to be consistent with the not-gigantic
2031 * hugepages and clear the PG_reserved bit from all tail pages
7c8de358 2032 * too. Otherwise drivers using get_user_pages() to access tail
ef5a22be
AA
2033 * pages may get the reference counting wrong if they see
2034 * PG_reserved set on a tail page (despite the head page not
2035 * having PG_reserved set). Enforcing this consistency between
2036 * head and tail pages allows drivers to optimize away a check
2037 * on the head page when they need know if put_page() is needed
2038 * after get_user_pages().
2039 */
7fb0728a
MK
2040 if (i != 0) /* head page cleared above */
2041 __ClearPageReserved(p);
7118fc29
MK
2042 /*
2043 * Subtle and very unlikely
2044 *
2045 * Gigantic 'page allocators' such as memblock or cma will
2046 * return a set of pages with each page ref counted. We need
2047 * to turn this set of pages into a compound page with tail
2048 * page ref counts set to zero. Code such as speculative page
2049 * cache adding could take a ref on a 'to be' tail page.
2050 * We need to respect any increased ref count, and only set
2051 * the ref count to zero if count is currently 1. If count
416d85ed
MK
2052 * is not 1, we return an error. An error return indicates
2053 * the set of pages can not be converted to a gigantic page.
2054 * The caller who allocated the pages should then discard the
2055 * pages using the appropriate free interface.
34d9e35b
MK
2056 *
2057 * In the case of demote, the ref count will be zero.
7118fc29 2058 */
34d9e35b
MK
2059 if (!demote) {
2060 if (!page_ref_freeze(p, 1)) {
2061 pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
2062 goto out_error;
2063 }
2064 } else {
2065 VM_BUG_ON_PAGE(page_count(p), p);
7118fc29 2066 }
2b21624f 2067 if (i != 0)
d1c60955 2068 set_compound_head(p, &folio->page);
20a0307c 2069 }
e3b7bf97
TS
2070 __folio_set_head(folio);
2071 /* we rely on prep_new_hugetlb_folio to set the destructor */
2072 folio_set_order(folio, order);
46f27228 2073 atomic_set(&folio->_entire_mapcount, -1);
eec20426 2074 atomic_set(&folio->_nr_pages_mapped, 0);
94688e8e 2075 atomic_set(&folio->_pincount, 0);
7118fc29
MK
2076 return true;
2077
2078out_error:
2b21624f
MK
2079 /* undo page modifications made above */
2080 for (j = 0; j < i; j++) {
d1c60955 2081 p = folio_page(folio, j);
2b21624f
MK
2082 if (j != 0)
2083 clear_compound_head(p);
7118fc29
MK
2084 set_page_refcounted(p);
2085 }
2086 /* need to clear PG_reserved on remaining tail pages */
14455eab 2087 for (; j < nr_pages; j++) {
d1c60955 2088 p = folio_page(folio, j);
7118fc29 2089 __ClearPageReserved(p);
14455eab 2090 }
7118fc29 2091 return false;
20a0307c
WF
2092}
2093
d1c60955
SK
2094static bool prep_compound_gigantic_folio(struct folio *folio,
2095 unsigned int order)
34d9e35b 2096{
d1c60955 2097 return __prep_compound_gigantic_folio(folio, order, false);
34d9e35b
MK
2098}
2099
d1c60955 2100static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
8531fc6f
MK
2101 unsigned int order)
2102{
d1c60955 2103 return __prep_compound_gigantic_folio(folio, order, true);
8531fc6f
MK
2104}
2105
7795912c
AM
2106/*
2107 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
2108 * transparent huge pages. See the PageTransHuge() documentation for more
2109 * details.
2110 */
20a0307c
WF
2111int PageHuge(struct page *page)
2112{
2d678c64
MWO
2113 struct folio *folio;
2114
20a0307c
WF
2115 if (!PageCompound(page))
2116 return 0;
2d678c64 2117 folio = page_folio(page);
9c5ccf2d 2118 return folio_test_hugetlb(folio);
20a0307c 2119}
43131e14
NH
2120EXPORT_SYMBOL_GPL(PageHuge);
2121
c0d0381a
MK
2122/*
2123 * Find and lock address space (mapping) in write mode.
2124 *
336bf30e
MK
2125 * Upon entry, the page is locked which means that page_mapping() is
2126 * stable. Due to locking order, we can only trylock_write. If we can
2127 * not get the lock, simply return NULL to caller.
c0d0381a
MK
2128 */
2129struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
2130{
336bf30e 2131 struct address_space *mapping = page_mapping(hpage);
c0d0381a 2132
c0d0381a
MK
2133 if (!mapping)
2134 return mapping;
2135
c0d0381a
MK
2136 if (i_mmap_trylock_write(mapping))
2137 return mapping;
2138
336bf30e 2139 return NULL;
c0d0381a
MK
2140}
2141
19fc1a7e 2142static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
f60858f9
MK
2143 gfp_t gfp_mask, int nid, nodemask_t *nmask,
2144 nodemask_t *node_alloc_noretry)
1da177e4 2145{
af0fb9df 2146 int order = huge_page_order(h);
1da177e4 2147 struct page *page;
f60858f9 2148 bool alloc_try_hard = true;
2b21624f 2149 bool retry = true;
f96efd58 2150
f60858f9
MK
2151 /*
2152 * By default we always try hard to allocate the page with
2153 * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
2154 * a loop (to adjust global huge page counts) and previous allocation
2155 * failed, do not continue to try hard on the same node. Use the
2156 * node_alloc_noretry bitmap to manage this state information.
2157 */
2158 if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
2159 alloc_try_hard = false;
2160 gfp_mask |= __GFP_COMP|__GFP_NOWARN;
2161 if (alloc_try_hard)
2162 gfp_mask |= __GFP_RETRY_MAYFAIL;
af0fb9df
MH
2163 if (nid == NUMA_NO_NODE)
2164 nid = numa_mem_id();
2b21624f 2165retry:
84172f4b 2166 page = __alloc_pages(gfp_mask, order, nid, nmask);
2b21624f
MK
2167
2168 /* Freeze head page */
2169 if (page && !page_ref_freeze(page, 1)) {
2170 __free_pages(page, order);
2171 if (retry) { /* retry once */
2172 retry = false;
2173 goto retry;
2174 }
2175 /* WOW! twice in a row. */
2176 pr_warn("HugeTLB head page unexpected inflated ref count\n");
2177 page = NULL;
2178 }
2179
f60858f9
MK
2180 /*
2181 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
2182 * indicates an overall state change. Clear bit so that we resume
2183 * normal 'try hard' allocations.
2184 */
2185 if (node_alloc_noretry && page && !alloc_try_hard)
2186 node_clear(nid, *node_alloc_noretry);
2187
2188 /*
2189 * If we tried hard to get a page but failed, set bit so that
2190 * subsequent attempts will not try as hard until there is an
2191 * overall state change.
2192 */
2193 if (node_alloc_noretry && !page && alloc_try_hard)
2194 node_set(nid, *node_alloc_noretry);
2195
19fc1a7e
SK
2196 if (!page) {
2197 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
2198 return NULL;
2199 }
2200
2201 __count_vm_event(HTLB_BUDDY_PGALLOC);
2202 return page_folio(page);
63b4613c
NA
2203}
2204
0c397dae
MH
2205/*
2206 * Common helper to allocate a fresh hugetlb page. All specific allocators
2207 * should use this function to get new hugetlb pages
2b21624f
MK
2208 *
2209 * Note that returned page is 'frozen': ref count of head page and all tail
2210 * pages is zero.
0c397dae 2211 */
19fc1a7e 2212static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
f60858f9
MK
2213 gfp_t gfp_mask, int nid, nodemask_t *nmask,
2214 nodemask_t *node_alloc_noretry)
0c397dae 2215{
7f325a8d 2216 struct folio *folio;
7118fc29 2217 bool retry = false;
0c397dae 2218
7118fc29 2219retry:
0c397dae 2220 if (hstate_is_gigantic(h))
19fc1a7e 2221 folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
0c397dae 2222 else
19fc1a7e 2223 folio = alloc_buddy_hugetlb_folio(h, gfp_mask,
f60858f9 2224 nid, nmask, node_alloc_noretry);
19fc1a7e 2225 if (!folio)
0c397dae 2226 return NULL;
7118fc29 2227 if (hstate_is_gigantic(h)) {
d1c60955 2228 if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
7118fc29
MK
2229 /*
2230 * Rare failure to convert pages to compound page.
2231 * Free pages and try again - ONCE!
2232 */
7f325a8d 2233 free_gigantic_folio(folio, huge_page_order(h));
7118fc29
MK
2234 if (!retry) {
2235 retry = true;
2236 goto retry;
2237 }
7118fc29
MK
2238 return NULL;
2239 }
2240 }
d1c60955 2241 prep_new_hugetlb_folio(h, folio, folio_nid(folio));
0c397dae 2242
19fc1a7e 2243 return folio;
0c397dae
MH
2244}
2245
af0fb9df
MH
2246/*
2247 * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
2248 * manner.
2249 */
f60858f9
MK
2250static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
2251 nodemask_t *node_alloc_noretry)
b2261026 2252{
19fc1a7e 2253 struct folio *folio;
b2261026 2254 int nr_nodes, node;
af0fb9df 2255 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
b2261026
JK
2256
2257 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
19fc1a7e
SK
2258 folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node,
2259 nodes_allowed, node_alloc_noretry);
2260 if (folio) {
454a00c4 2261 free_huge_folio(folio); /* free it into the hugepage allocator */
19fc1a7e
SK
2262 return 1;
2263 }
b2261026
JK
2264 }
2265
19fc1a7e 2266 return 0;
b2261026
JK
2267}
2268
e8c5c824 2269/*
10c6ec49
MK
2270 * Remove huge page from pool from next node to free. Attempt to keep
2271 * persistent huge pages more or less balanced over allowed nodes.
2272 * This routine only 'removes' the hugetlb page. The caller must make
2273 * an additional call to free the page to low level allocators.
e8c5c824
LS
2274 * Called with hugetlb_lock locked.
2275 */
d5b43e96
MWO
2276static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
2277 nodemask_t *nodes_allowed, bool acct_surplus)
e8c5c824 2278{
b2261026 2279 int nr_nodes, node;
04bbfd84 2280 struct folio *folio = NULL;
e8c5c824 2281
9487ca60 2282 lockdep_assert_held(&hugetlb_lock);
b2261026 2283 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
685f3457
LS
2284 /*
2285 * If we're returning unused surplus pages, only examine
2286 * nodes with surplus pages.
2287 */
b2261026
JK
2288 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
2289 !list_empty(&h->hugepage_freelists[node])) {
04bbfd84
MWO
2290 folio = list_entry(h->hugepage_freelists[node].next,
2291 struct folio, lru);
cfd5082b 2292 remove_hugetlb_folio(h, folio, acct_surplus);
9a76db09 2293 break;
e8c5c824 2294 }
b2261026 2295 }
e8c5c824 2296
d5b43e96 2297 return folio;
e8c5c824
LS
2298}
2299
c8721bbb
NH
2300/*
2301 * Dissolve a given free hugepage into free buddy pages. This function does
faf53def
NH
2302 * nothing for in-use hugepages and non-hugepages.
2303 * This function returns values like below:
2304 *
ad2fa371
MS
2305 * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
2306 * when the system is under memory pressure and the feature of
2307 * freeing unused vmemmap pages associated with each hugetlb page
2308 * is enabled.
2309 * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
2310 * (allocated or reserved.)
2311 * 0: successfully dissolved free hugepages or the page is not a
2312 * hugepage (considered as already dissolved)
c8721bbb 2313 */
c3114a84 2314int dissolve_free_huge_page(struct page *page)
c8721bbb 2315{
6bc9b564 2316 int rc = -EBUSY;
1a7cdab5 2317 struct folio *folio = page_folio(page);
082d5b6b 2318
7ffddd49 2319retry:
faf53def 2320 /* Not to disrupt normal path by vainly holding hugetlb_lock */
1a7cdab5 2321 if (!folio_test_hugetlb(folio))
faf53def
NH
2322 return 0;
2323
db71ef79 2324 spin_lock_irq(&hugetlb_lock);
1a7cdab5 2325 if (!folio_test_hugetlb(folio)) {
faf53def
NH
2326 rc = 0;
2327 goto out;
2328 }
2329
1a7cdab5
SK
2330 if (!folio_ref_count(folio)) {
2331 struct hstate *h = folio_hstate(folio);
8346d69d 2332 if (!available_huge_pages(h))
082d5b6b 2333 goto out;
7ffddd49
MS
2334
2335 /*
2336 * We should make sure that the page is already on the free list
2337 * when it is dissolved.
2338 */
1a7cdab5 2339 if (unlikely(!folio_test_hugetlb_freed(folio))) {
db71ef79 2340 spin_unlock_irq(&hugetlb_lock);
7ffddd49
MS
2341 cond_resched();
2342
2343 /*
2344 * Theoretically, we should return -EBUSY when we
2345 * encounter this race. In fact, we have a chance
2346 * to successfully dissolve the page if we do a
2347 * retry. Because the race window is quite small.
2348 * If we seize this opportunity, it is an optimization
2349 * for increasing the success rate of dissolving page.
2350 */
2351 goto retry;
2352 }
2353
cfd5082b 2354 remove_hugetlb_folio(h, folio, false);
c1470b33 2355 h->max_huge_pages--;
db71ef79 2356 spin_unlock_irq(&hugetlb_lock);
ad2fa371
MS
2357
2358 /*
d6ef19e2
SK
2359 * Normally update_and_free_hugtlb_folio will allocate required vmemmmap
2360 * before freeing the page. update_and_free_hugtlb_folio will fail to
ad2fa371
MS
2361 * free the page if it can not allocate required vmemmap. We
2362 * need to adjust max_huge_pages if the page is not freed.
2363 * Attempt to allocate vmemmmap here so that we can take
2364 * appropriate action on failure.
30a89adf
MK
2365 *
2366 * The folio_test_hugetlb check here is because
2367 * remove_hugetlb_folio will clear hugetlb folio flag for
2368 * non-vmemmap optimized hugetlb folios.
ad2fa371 2369 */
30a89adf
MK
2370 if (folio_test_hugetlb(folio)) {
2371 rc = hugetlb_vmemmap_restore(h, &folio->page);
2372 if (rc) {
2373 spin_lock_irq(&hugetlb_lock);
2374 add_hugetlb_folio(h, folio, false);
2375 h->max_huge_pages++;
2376 goto out;
2377 }
2378 } else
2379 rc = 0;
ad2fa371 2380
30a89adf 2381 update_and_free_hugetlb_folio(h, folio, false);
ad2fa371 2382 return rc;
c8721bbb 2383 }
082d5b6b 2384out:
db71ef79 2385 spin_unlock_irq(&hugetlb_lock);
082d5b6b 2386 return rc;
c8721bbb
NH
2387}
2388
2389/*
2390 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
2391 * make specified memory blocks removable from the system.
2247bb33
GS
2392 * Note that this will dissolve a free gigantic hugepage completely, if any
2393 * part of it lies within the given range.
082d5b6b
GS
2394 * Also note that if dissolve_free_huge_page() returns with an error, all
2395 * free hugepages that were dissolved before that error are lost.
c8721bbb 2396 */
082d5b6b 2397int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
c8721bbb 2398{
c8721bbb 2399 unsigned long pfn;
eb03aa00 2400 struct page *page;
082d5b6b 2401 int rc = 0;
dc2628f3
MS
2402 unsigned int order;
2403 struct hstate *h;
c8721bbb 2404
d0177639 2405 if (!hugepages_supported())
082d5b6b 2406 return rc;
d0177639 2407
dc2628f3
MS
2408 order = huge_page_order(&default_hstate);
2409 for_each_hstate(h)
2410 order = min(order, huge_page_order(h));
2411
2412 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
eb03aa00 2413 page = pfn_to_page(pfn);
faf53def
NH
2414 rc = dissolve_free_huge_page(page);
2415 if (rc)
2416 break;
eb03aa00 2417 }
082d5b6b
GS
2418
2419 return rc;
c8721bbb
NH
2420}
2421
ab5ac90a
MH
2422/*
2423 * Allocates a fresh surplus page from the page allocator.
2424 */
3a740e8b
SK
2425static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
2426 gfp_t gfp_mask, int nid, nodemask_t *nmask)
7893d1d5 2427{
19fc1a7e 2428 struct folio *folio = NULL;
7893d1d5 2429
bae7f4ae 2430 if (hstate_is_gigantic(h))
aa888a74
AK
2431 return NULL;
2432
db71ef79 2433 spin_lock_irq(&hugetlb_lock);
9980d744
MH
2434 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
2435 goto out_unlock;
db71ef79 2436 spin_unlock_irq(&hugetlb_lock);
d1c3fb1f 2437
19fc1a7e
SK
2438 folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
2439 if (!folio)
0c397dae 2440 return NULL;
d1c3fb1f 2441
db71ef79 2442 spin_lock_irq(&hugetlb_lock);
9980d744
MH
2443 /*
2444 * We could have raced with the pool size change.
2445 * Double check that and simply deallocate the new page
2446 * if we would end up overcommiting the surpluses. Abuse
454a00c4 2447 * temporary page to workaround the nasty free_huge_folio
9980d744
MH
2448 * codeflow
2449 */
2450 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
19fc1a7e 2451 folio_set_hugetlb_temporary(folio);
db71ef79 2452 spin_unlock_irq(&hugetlb_lock);
454a00c4 2453 free_huge_folio(folio);
2bf753e6 2454 return NULL;
7893d1d5 2455 }
9980d744 2456
b65a4eda 2457 h->surplus_huge_pages++;
19fc1a7e 2458 h->surplus_huge_pages_node[folio_nid(folio)]++;
b65a4eda 2459
9980d744 2460out_unlock:
db71ef79 2461 spin_unlock_irq(&hugetlb_lock);
7893d1d5 2462
3a740e8b 2463 return folio;
7893d1d5
AL
2464}
2465
e37d3e83 2466static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask,
9a4e9f3b 2467 int nid, nodemask_t *nmask)
ab5ac90a 2468{
19fc1a7e 2469 struct folio *folio;
ab5ac90a
MH
2470
2471 if (hstate_is_gigantic(h))
2472 return NULL;
2473
19fc1a7e
SK
2474 folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
2475 if (!folio)
ab5ac90a
MH
2476 return NULL;
2477
2b21624f 2478 /* fresh huge pages are frozen */
19fc1a7e 2479 folio_ref_unfreeze(folio, 1);
ab5ac90a
MH
2480 /*
2481 * We do not account these pages as surplus because they are only
2482 * temporary and will be released properly on the last reference
2483 */
19fc1a7e 2484 folio_set_hugetlb_temporary(folio);
ab5ac90a 2485
e37d3e83 2486 return folio;
ab5ac90a
MH
2487}
2488
099730d6
DH
2489/*
2490 * Use the VMA's mpolicy to allocate a huge page from the buddy.
2491 */
e0ec90ee 2492static
ff7d853b 2493struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
099730d6
DH
2494 struct vm_area_struct *vma, unsigned long addr)
2495{
3a740e8b 2496 struct folio *folio = NULL;
aaf14e40
MH
2497 struct mempolicy *mpol;
2498 gfp_t gfp_mask = htlb_alloc_mask(h);
2499 int nid;
2500 nodemask_t *nodemask;
2501
2502 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
cfcaa66f
BW
2503 if (mpol_is_preferred_many(mpol)) {
2504 gfp_t gfp = gfp_mask | __GFP_NOWARN;
2505
2506 gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
3a740e8b 2507 folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask);
aaf14e40 2508
cfcaa66f
BW
2509 /* Fallback to all nodes if page==NULL */
2510 nodemask = NULL;
2511 }
2512
3a740e8b
SK
2513 if (!folio)
2514 folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask);
cfcaa66f 2515 mpol_cond_put(mpol);
ff7d853b 2516 return folio;
099730d6
DH
2517}
2518
e37d3e83
SK
2519/* folio migration callback function */
2520struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
d92bbc27 2521 nodemask_t *nmask, gfp_t gfp_mask)
4db9b2ef 2522{
db71ef79 2523 spin_lock_irq(&hugetlb_lock);
8346d69d 2524 if (available_huge_pages(h)) {
a36f1e90 2525 struct folio *folio;
3e59fcb0 2526
a36f1e90
SK
2527 folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
2528 preferred_nid, nmask);
2529 if (folio) {
db71ef79 2530 spin_unlock_irq(&hugetlb_lock);
e37d3e83 2531 return folio;
4db9b2ef
MH
2532 }
2533 }
db71ef79 2534 spin_unlock_irq(&hugetlb_lock);
4db9b2ef 2535
e37d3e83 2536 return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
4db9b2ef
MH
2537}
2538
ebd63723 2539/* mempolicy aware migration callback */
d0ce0e47 2540struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
389c8178 2541 unsigned long address)
ebd63723
MH
2542{
2543 struct mempolicy *mpol;
2544 nodemask_t *nodemask;
e37d3e83 2545 struct folio *folio;
ebd63723
MH
2546 gfp_t gfp_mask;
2547 int node;
2548
ebd63723
MH
2549 gfp_mask = htlb_alloc_mask(h);
2550 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
e37d3e83 2551 folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
ebd63723
MH
2552 mpol_cond_put(mpol);
2553
d0ce0e47 2554 return folio;
ebd63723
MH
2555}
2556
e4e574b7 2557/*
25985edc 2558 * Increase the hugetlb pool such that it can accommodate a reservation
e4e574b7
AL
2559 * of size 'delta'.
2560 */
0a4f3d1b 2561static int gather_surplus_pages(struct hstate *h, long delta)
1b2a1e7b 2562 __must_hold(&hugetlb_lock)
e4e574b7 2563{
34665341 2564 LIST_HEAD(surplus_list);
454a00c4 2565 struct folio *folio, *tmp;
0a4f3d1b
LX
2566 int ret;
2567 long i;
2568 long needed, allocated;
28073b02 2569 bool alloc_ok = true;
e4e574b7 2570
9487ca60 2571 lockdep_assert_held(&hugetlb_lock);
a5516438 2572 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
ac09b3a1 2573 if (needed <= 0) {
a5516438 2574 h->resv_huge_pages += delta;
e4e574b7 2575 return 0;
ac09b3a1 2576 }
e4e574b7
AL
2577
2578 allocated = 0;
e4e574b7
AL
2579
2580 ret = -ENOMEM;
2581retry:
db71ef79 2582 spin_unlock_irq(&hugetlb_lock);
e4e574b7 2583 for (i = 0; i < needed; i++) {
3a740e8b 2584 folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
2b21624f 2585 NUMA_NO_NODE, NULL);
3a740e8b 2586 if (!folio) {
28073b02
HD
2587 alloc_ok = false;
2588 break;
2589 }
3a740e8b 2590 list_add(&folio->lru, &surplus_list);
69ed779a 2591 cond_resched();
e4e574b7 2592 }
28073b02 2593 allocated += i;
e4e574b7
AL
2594
2595 /*
2596 * After retaking hugetlb_lock, we need to recalculate 'needed'
2597 * because either resv_huge_pages or free_huge_pages may have changed.
2598 */
db71ef79 2599 spin_lock_irq(&hugetlb_lock);
a5516438
AK
2600 needed = (h->resv_huge_pages + delta) -
2601 (h->free_huge_pages + allocated);
28073b02
HD
2602 if (needed > 0) {
2603 if (alloc_ok)
2604 goto retry;
2605 /*
2606 * We were not able to allocate enough pages to
2607 * satisfy the entire reservation so we free what
2608 * we've allocated so far.
2609 */
2610 goto free;
2611 }
e4e574b7
AL
2612 /*
2613 * The surplus_list now contains _at_least_ the number of extra pages
25985edc 2614 * needed to accommodate the reservation. Add the appropriate number
e4e574b7 2615 * of pages to the hugetlb pool and free the extras back to the buddy
ac09b3a1
AL
2616 * allocator. Commit the entire reservation here to prevent another
2617 * process from stealing the pages as they are added to the pool but
2618 * before they are reserved.
e4e574b7
AL
2619 */
2620 needed += allocated;
a5516438 2621 h->resv_huge_pages += delta;
e4e574b7 2622 ret = 0;
a9869b83 2623
19fc3f0a 2624 /* Free the needed pages to the hugetlb pool */
454a00c4 2625 list_for_each_entry_safe(folio, tmp, &surplus_list, lru) {
19fc3f0a
AL
2626 if ((--needed) < 0)
2627 break;
b65a4eda 2628 /* Add the page to the hugetlb allocator */
454a00c4 2629 enqueue_hugetlb_folio(h, folio);
19fc3f0a 2630 }
28073b02 2631free:
db71ef79 2632 spin_unlock_irq(&hugetlb_lock);
19fc3f0a 2633
b65a4eda
MK
2634 /*
2635 * Free unnecessary surplus pages to the buddy allocator.
454a00c4 2636 * Pages have no ref count, call free_huge_folio directly.
b65a4eda 2637 */
454a00c4
MWO
2638 list_for_each_entry_safe(folio, tmp, &surplus_list, lru)
2639 free_huge_folio(folio);
db71ef79 2640 spin_lock_irq(&hugetlb_lock);
e4e574b7
AL
2641
2642 return ret;
2643}
2644
2645/*
e5bbc8a6
MK
2646 * This routine has two main purposes:
2647 * 1) Decrement the reservation count (resv_huge_pages) by the value passed
2648 * in unused_resv_pages. This corresponds to the prior adjustments made
2649 * to the associated reservation map.
2650 * 2) Free any unused surplus pages that may have been allocated to satisfy
2651 * the reservation. As many as unused_resv_pages may be freed.
e4e574b7 2652 */
a5516438
AK
2653static void return_unused_surplus_pages(struct hstate *h,
2654 unsigned long unused_resv_pages)
e4e574b7 2655{
e4e574b7 2656 unsigned long nr_pages;
10c6ec49
MK
2657 LIST_HEAD(page_list);
2658
9487ca60 2659 lockdep_assert_held(&hugetlb_lock);
10c6ec49
MK
2660 /* Uncommit the reservation */
2661 h->resv_huge_pages -= unused_resv_pages;
e4e574b7 2662
c0531714 2663 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
e5bbc8a6 2664 goto out;
aa888a74 2665
e5bbc8a6
MK
2666 /*
2667 * Part (or even all) of the reservation could have been backed
2668 * by pre-allocated pages. Only free surplus pages.
2669 */
a5516438 2670 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
e4e574b7 2671
685f3457
LS
2672 /*
2673 * We want to release as many surplus pages as possible, spread
9b5e5d0f
LS
2674 * evenly across all nodes with memory. Iterate across these nodes
2675 * until we can no longer free unreserved surplus pages. This occurs
2676 * when the nodes with surplus pages have no free pages.
d5b43e96 2677 * remove_pool_hugetlb_folio() will balance the freed pages across the
9b5e5d0f 2678 * on-line nodes with memory and will handle the hstate accounting.
685f3457
LS
2679 */
2680 while (nr_pages--) {
d5b43e96
MWO
2681 struct folio *folio;
2682
2683 folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1);
2684 if (!folio)
e5bbc8a6 2685 goto out;
10c6ec49 2686
d5b43e96 2687 list_add(&folio->lru, &page_list);
e4e574b7 2688 }
e5bbc8a6
MK
2689
2690out:
db71ef79 2691 spin_unlock_irq(&hugetlb_lock);
10c6ec49 2692 update_and_free_pages_bulk(h, &page_list);
db71ef79 2693 spin_lock_irq(&hugetlb_lock);
e4e574b7
AL
2694}
2695
5e911373 2696
c37f9fb1 2697/*
feba16e2 2698 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
5e911373 2699 * are used by the huge page allocation routines to manage reservations.
cf3ad20b
MK
2700 *
2701 * vma_needs_reservation is called to determine if the huge page at addr
2702 * within the vma has an associated reservation. If a reservation is
2703 * needed, the value 1 is returned. The caller is then responsible for
2704 * managing the global reservation and subpool usage counts. After
2705 * the huge page has been allocated, vma_commit_reservation is called
feba16e2
MK
2706 * to add the page to the reservation map. If the page allocation fails,
2707 * the reservation must be ended instead of committed. vma_end_reservation
2708 * is called in such cases.
cf3ad20b
MK
2709 *
2710 * In the normal case, vma_commit_reservation returns the same value
2711 * as the preceding vma_needs_reservation call. The only time this
2712 * is not the case is if a reserve map was changed between calls. It
2713 * is the responsibility of the caller to notice the difference and
2714 * take appropriate action.
96b96a96
MK
2715 *
2716 * vma_add_reservation is used in error paths where a reservation must
2717 * be restored when a newly allocated huge page must be freed. It is
2718 * to be called after calling vma_needs_reservation to determine if a
2719 * reservation exists.
846be085
MK
2720 *
2721 * vma_del_reservation is used in error paths where an entry in the reserve
2722 * map was created during huge page allocation and must be removed. It is to
2723 * be called after calling vma_needs_reservation to determine if a reservation
2724 * exists.
c37f9fb1 2725 */
5e911373
MK
2726enum vma_resv_mode {
2727 VMA_NEEDS_RESV,
2728 VMA_COMMIT_RESV,
feba16e2 2729 VMA_END_RESV,
96b96a96 2730 VMA_ADD_RESV,
846be085 2731 VMA_DEL_RESV,
5e911373 2732};
cf3ad20b
MK
2733static long __vma_reservation_common(struct hstate *h,
2734 struct vm_area_struct *vma, unsigned long addr,
5e911373 2735 enum vma_resv_mode mode)
c37f9fb1 2736{
4e35f483
JK
2737 struct resv_map *resv;
2738 pgoff_t idx;
cf3ad20b 2739 long ret;
0db9d74e 2740 long dummy_out_regions_needed;
c37f9fb1 2741
4e35f483
JK
2742 resv = vma_resv_map(vma);
2743 if (!resv)
84afd99b 2744 return 1;
c37f9fb1 2745
4e35f483 2746 idx = vma_hugecache_offset(h, vma, addr);
5e911373
MK
2747 switch (mode) {
2748 case VMA_NEEDS_RESV:
0db9d74e
MA
2749 ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
2750 /* We assume that vma_reservation_* routines always operate on
2751 * 1 page, and that adding to resv map a 1 page entry can only
2752 * ever require 1 region.
2753 */
2754 VM_BUG_ON(dummy_out_regions_needed != 1);
5e911373
MK
2755 break;
2756 case VMA_COMMIT_RESV:
075a61d0 2757 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
0db9d74e
MA
2758 /* region_add calls of range 1 should never fail. */
2759 VM_BUG_ON(ret < 0);
5e911373 2760 break;
feba16e2 2761 case VMA_END_RESV:
0db9d74e 2762 region_abort(resv, idx, idx + 1, 1);
5e911373
MK
2763 ret = 0;
2764 break;
96b96a96 2765 case VMA_ADD_RESV:
0db9d74e 2766 if (vma->vm_flags & VM_MAYSHARE) {
075a61d0 2767 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
0db9d74e
MA
2768 /* region_add calls of range 1 should never fail. */
2769 VM_BUG_ON(ret < 0);
2770 } else {
2771 region_abort(resv, idx, idx + 1, 1);
96b96a96
MK
2772 ret = region_del(resv, idx, idx + 1);
2773 }
2774 break;
846be085
MK
2775 case VMA_DEL_RESV:
2776 if (vma->vm_flags & VM_MAYSHARE) {
2777 region_abort(resv, idx, idx + 1, 1);
2778 ret = region_del(resv, idx, idx + 1);
2779 } else {
2780 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2781 /* region_add calls of range 1 should never fail. */
2782 VM_BUG_ON(ret < 0);
2783 }
2784 break;
5e911373
MK
2785 default:
2786 BUG();
2787 }
84afd99b 2788
846be085 2789 if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
cf3ad20b 2790 return ret;
bf3d12b9
ML
2791 /*
2792 * We know private mapping must have HPAGE_RESV_OWNER set.
2793 *
2794 * In most cases, reserves always exist for private mappings.
2795 * However, a file associated with mapping could have been
2796 * hole punched or truncated after reserves were consumed.
2797 * As subsequent fault on such a range will not use reserves.
2798 * Subtle - The reserve map for private mappings has the
2799 * opposite meaning than that of shared mappings. If NO
2800 * entry is in the reserve map, it means a reservation exists.
2801 * If an entry exists in the reserve map, it means the
2802 * reservation has already been consumed. As a result, the
2803 * return value of this routine is the opposite of the
2804 * value returned from reserve map manipulation routines above.
2805 */
2806 if (ret > 0)
2807 return 0;
2808 if (ret == 0)
2809 return 1;
2810 return ret;
c37f9fb1 2811}
cf3ad20b
MK
2812
2813static long vma_needs_reservation(struct hstate *h,
a5516438 2814 struct vm_area_struct *vma, unsigned long addr)
c37f9fb1 2815{
5e911373 2816 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
cf3ad20b 2817}
84afd99b 2818
cf3ad20b
MK
2819static long vma_commit_reservation(struct hstate *h,
2820 struct vm_area_struct *vma, unsigned long addr)
2821{
5e911373
MK
2822 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
2823}
2824
feba16e2 2825static void vma_end_reservation(struct hstate *h,
5e911373
MK
2826 struct vm_area_struct *vma, unsigned long addr)
2827{
feba16e2 2828 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
c37f9fb1
AW
2829}
2830
96b96a96
MK
2831static long vma_add_reservation(struct hstate *h,
2832 struct vm_area_struct *vma, unsigned long addr)
2833{
2834 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
2835}
2836
846be085
MK
2837static long vma_del_reservation(struct hstate *h,
2838 struct vm_area_struct *vma, unsigned long addr)
2839{
2840 return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
2841}
2842
96b96a96 2843/*
846be085 2844 * This routine is called to restore reservation information on error paths.
d0ce0e47
SK
2845 * It should ONLY be called for folios allocated via alloc_hugetlb_folio(),
2846 * and the hugetlb mutex should remain held when calling this routine.
846be085
MK
2847 *
2848 * It handles two specific cases:
d2d7bb44
SK
2849 * 1) A reservation was in place and the folio consumed the reservation.
2850 * hugetlb_restore_reserve is set in the folio.
2851 * 2) No reservation was in place for the page, so hugetlb_restore_reserve is
d0ce0e47 2852 * not set. However, alloc_hugetlb_folio always updates the reserve map.
846be085 2853 *
454a00c4
MWO
2854 * In case 1, free_huge_folio later in the error path will increment the
2855 * global reserve count. But, free_huge_folio does not have enough context
846be085
MK
2856 * to adjust the reservation map. This case deals primarily with private
2857 * mappings. Adjust the reserve map here to be consistent with global
454a00c4 2858 * reserve count adjustments to be made by free_huge_folio. Make sure the
846be085
MK
2859 * reserve map indicates there is a reservation present.
2860 *
d0ce0e47 2861 * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio.
96b96a96 2862 */
846be085 2863void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
d2d7bb44 2864 unsigned long address, struct folio *folio)
96b96a96 2865{
846be085 2866 long rc = vma_needs_reservation(h, vma, address);
96b96a96 2867
0ffdc38e 2868 if (folio_test_hugetlb_restore_reserve(folio)) {
846be085 2869 if (unlikely(rc < 0))
96b96a96
MK
2870 /*
2871 * Rare out of memory condition in reserve map
0ffdc38e
SK
2872 * manipulation. Clear hugetlb_restore_reserve so
2873 * that global reserve count will not be incremented
454a00c4 2874 * by free_huge_folio. This will make it appear
0ffdc38e 2875 * as though the reservation for this folio was
96b96a96 2876 * consumed. This may prevent the task from
0ffdc38e 2877 * faulting in the folio at a later time. This
96b96a96
MK
2878 * is better than inconsistent global huge page
2879 * accounting of reserve counts.
2880 */
0ffdc38e 2881 folio_clear_hugetlb_restore_reserve(folio);
846be085
MK
2882 else if (rc)
2883 (void)vma_add_reservation(h, vma, address);
2884 else
2885 vma_end_reservation(h, vma, address);
2886 } else {
2887 if (!rc) {
2888 /*
2889 * This indicates there is an entry in the reserve map
d0ce0e47
SK
2890 * not added by alloc_hugetlb_folio. We know it was added
2891 * before the alloc_hugetlb_folio call, otherwise
0ffdc38e 2892 * hugetlb_restore_reserve would be set on the folio.
846be085
MK
2893 * Remove the entry so that a subsequent allocation
2894 * does not consume a reservation.
2895 */
2896 rc = vma_del_reservation(h, vma, address);
2897 if (rc < 0)
96b96a96 2898 /*
846be085
MK
2899 * VERY rare out of memory condition. Since
2900 * we can not delete the entry, set
0ffdc38e
SK
2901 * hugetlb_restore_reserve so that the reserve
2902 * count will be incremented when the folio
846be085
MK
2903 * is freed. This reserve will be consumed
2904 * on a subsequent allocation.
96b96a96 2905 */
0ffdc38e 2906 folio_set_hugetlb_restore_reserve(folio);
846be085
MK
2907 } else if (rc < 0) {
2908 /*
2909 * Rare out of memory condition from
2910 * vma_needs_reservation call. Memory allocation is
2911 * only attempted if a new entry is needed. Therefore,
2912 * this implies there is not an entry in the
2913 * reserve map.
2914 *
2915 * For shared mappings, no entry in the map indicates
2916 * no reservation. We are done.
2917 */
2918 if (!(vma->vm_flags & VM_MAYSHARE))
2919 /*
2920 * For private mappings, no entry indicates
2921 * a reservation is present. Since we can
0ffdc38e
SK
2922 * not add an entry, set hugetlb_restore_reserve
2923 * on the folio so reserve count will be
846be085
MK
2924 * incremented when freed. This reserve will
2925 * be consumed on a subsequent allocation.
2926 */
0ffdc38e 2927 folio_set_hugetlb_restore_reserve(folio);
96b96a96 2928 } else
846be085
MK
2929 /*
2930 * No reservation present, do nothing
2931 */
2932 vma_end_reservation(h, vma, address);
96b96a96
MK
2933 }
2934}
2935
369fa227 2936/*
19fc1a7e
SK
2937 * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
2938 * the old one
369fa227 2939 * @h: struct hstate old page belongs to
19fc1a7e 2940 * @old_folio: Old folio to dissolve
ae37c7ff 2941 * @list: List to isolate the page in case we need to
369fa227
OS
2942 * Returns 0 on success, otherwise negated error.
2943 */
19fc1a7e
SK
2944static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
2945 struct folio *old_folio, struct list_head *list)
369fa227
OS
2946{
2947 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
de656ed3 2948 int nid = folio_nid(old_folio);
de656ed3 2949 struct folio *new_folio;
369fa227
OS
2950 int ret = 0;
2951
2952 /*
19fc1a7e
SK
2953 * Before dissolving the folio, we need to allocate a new one for the
2954 * pool to remain stable. Here, we allocate the folio and 'prep' it
f41f2ed4
MS
2955 * by doing everything but actually updating counters and adding to
2956 * the pool. This simplifies and let us do most of the processing
2957 * under the lock.
369fa227 2958 */
19fc1a7e
SK
2959 new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL);
2960 if (!new_folio)
369fa227 2961 return -ENOMEM;
de656ed3 2962 __prep_new_hugetlb_folio(h, new_folio);
369fa227
OS
2963
2964retry:
2965 spin_lock_irq(&hugetlb_lock);
de656ed3 2966 if (!folio_test_hugetlb(old_folio)) {
369fa227 2967 /*
19fc1a7e 2968 * Freed from under us. Drop new_folio too.
369fa227
OS
2969 */
2970 goto free_new;
de656ed3 2971 } else if (folio_ref_count(old_folio)) {
9747b9e9
BW
2972 bool isolated;
2973
369fa227 2974 /*
19fc1a7e 2975 * Someone has grabbed the folio, try to isolate it here.
ae37c7ff 2976 * Fail with -EBUSY if not possible.
369fa227 2977 */
ae37c7ff 2978 spin_unlock_irq(&hugetlb_lock);
9747b9e9
BW
2979 isolated = isolate_hugetlb(old_folio, list);
2980 ret = isolated ? 0 : -EBUSY;
ae37c7ff 2981 spin_lock_irq(&hugetlb_lock);
369fa227 2982 goto free_new;
de656ed3 2983 } else if (!folio_test_hugetlb_freed(old_folio)) {
369fa227 2984 /*
19fc1a7e 2985 * Folio's refcount is 0 but it has not been enqueued in the
369fa227
OS
2986 * freelist yet. Race window is small, so we can succeed here if
2987 * we retry.
2988 */
2989 spin_unlock_irq(&hugetlb_lock);
2990 cond_resched();
2991 goto retry;
2992 } else {
2993 /*
19fc1a7e 2994 * Ok, old_folio is still a genuine free hugepage. Remove it from
369fa227
OS
2995 * the freelist and decrease the counters. These will be
2996 * incremented again when calling __prep_account_new_huge_page()
240d67a8
SK
2997 * and enqueue_hugetlb_folio() for new_folio. The counters will
2998 * remain stable since this happens under the lock.
369fa227 2999 */
cfd5082b 3000 remove_hugetlb_folio(h, old_folio, false);
369fa227
OS
3001
3002 /*
19fc1a7e 3003 * Ref count on new_folio is already zero as it was dropped
b65a4eda 3004 * earlier. It can be directly added to the pool free list.
369fa227 3005 */
369fa227 3006 __prep_account_new_huge_page(h, nid);
240d67a8 3007 enqueue_hugetlb_folio(h, new_folio);
369fa227
OS
3008
3009 /*
19fc1a7e 3010 * Folio has been replaced, we can safely free the old one.
369fa227
OS
3011 */
3012 spin_unlock_irq(&hugetlb_lock);
d6ef19e2 3013 update_and_free_hugetlb_folio(h, old_folio, false);
369fa227
OS
3014 }
3015
3016 return ret;
3017
3018free_new:
3019 spin_unlock_irq(&hugetlb_lock);
19fc1a7e 3020 /* Folio has a zero ref count, but needs a ref to be freed */
de656ed3 3021 folio_ref_unfreeze(new_folio, 1);
d6ef19e2 3022 update_and_free_hugetlb_folio(h, new_folio, false);
369fa227
OS
3023
3024 return ret;
3025}
3026
ae37c7ff 3027int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
369fa227
OS
3028{
3029 struct hstate *h;
d5e33bd8 3030 struct folio *folio = page_folio(page);
ae37c7ff 3031 int ret = -EBUSY;
369fa227
OS
3032
3033 /*
3034 * The page might have been dissolved from under our feet, so make sure
3035 * to carefully check the state under the lock.
3036 * Return success when racing as if we dissolved the page ourselves.
3037 */
3038 spin_lock_irq(&hugetlb_lock);
d5e33bd8
SK
3039 if (folio_test_hugetlb(folio)) {
3040 h = folio_hstate(folio);
369fa227
OS
3041 } else {
3042 spin_unlock_irq(&hugetlb_lock);
3043 return 0;
3044 }
3045 spin_unlock_irq(&hugetlb_lock);
3046
3047 /*
3048 * Fence off gigantic pages as there is a cyclic dependency between
3049 * alloc_contig_range and them. Return -ENOMEM as this has the effect
3050 * of bailing out right away without further retrying.
3051 */
3052 if (hstate_is_gigantic(h))
3053 return -ENOMEM;
3054
9747b9e9 3055 if (folio_ref_count(folio) && isolate_hugetlb(folio, list))
ae37c7ff 3056 ret = 0;
d5e33bd8 3057 else if (!folio_ref_count(folio))
19fc1a7e 3058 ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
ae37c7ff
OS
3059
3060 return ret;
369fa227
OS
3061}
3062
d0ce0e47 3063struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
04f2cbe3 3064 unsigned long addr, int avoid_reserve)
1da177e4 3065{
90481622 3066 struct hugepage_subpool *spool = subpool_vma(vma);
a5516438 3067 struct hstate *h = hstate_vma(vma);
d4ab0316 3068 struct folio *folio;
8cba9576 3069 long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
d85f69b0 3070 long gbl_chg;
8cba9576 3071 int memcg_charge_ret, ret, idx;
d0ce0e47 3072 struct hugetlb_cgroup *h_cg = NULL;
8cba9576 3073 struct mem_cgroup *memcg;
08cf9faf 3074 bool deferred_reserve;
8cba9576
NP
3075 gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
3076
3077 memcg = get_mem_cgroup_from_current();
3078 memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
3079 if (memcg_charge_ret == -ENOMEM) {
3080 mem_cgroup_put(memcg);
3081 return ERR_PTR(-ENOMEM);
3082 }
a1e78772 3083
6d76dcf4 3084 idx = hstate_index(h);
a1e78772 3085 /*
d85f69b0
MK
3086 * Examine the region/reserve map to determine if the process
3087 * has a reservation for the page to be allocated. A return
3088 * code of zero indicates a reservation exists (no change).
a1e78772 3089 */
d85f69b0 3090 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
8cba9576
NP
3091 if (map_chg < 0) {
3092 if (!memcg_charge_ret)
3093 mem_cgroup_cancel_charge(memcg, nr_pages);
3094 mem_cgroup_put(memcg);
76dcee75 3095 return ERR_PTR(-ENOMEM);
8cba9576 3096 }
d85f69b0
MK
3097
3098 /*
3099 * Processes that did not create the mapping will have no
3100 * reserves as indicated by the region/reserve map. Check
3101 * that the allocation will not exceed the subpool limit.
3102 * Allocations for MAP_NORESERVE mappings also need to be
3103 * checked against any subpool limit.
3104 */
3105 if (map_chg || avoid_reserve) {
3106 gbl_chg = hugepage_subpool_get_pages(spool, 1);
8cba9576
NP
3107 if (gbl_chg < 0)
3108 goto out_end_reservation;
1da177e4 3109
d85f69b0
MK
3110 /*
3111 * Even though there was no reservation in the region/reserve
3112 * map, there could be reservations associated with the
3113 * subpool that can be used. This would be indicated if the
3114 * return value of hugepage_subpool_get_pages() is zero.
3115 * However, if avoid_reserve is specified we still avoid even
3116 * the subpool reservations.
3117 */
3118 if (avoid_reserve)
3119 gbl_chg = 1;
3120 }
3121
08cf9faf
MA
3122 /* If this allocation is not consuming a reservation, charge it now.
3123 */
6501fe5f 3124 deferred_reserve = map_chg || avoid_reserve;
08cf9faf
MA
3125 if (deferred_reserve) {
3126 ret = hugetlb_cgroup_charge_cgroup_rsvd(
3127 idx, pages_per_huge_page(h), &h_cg);
3128 if (ret)
3129 goto out_subpool_put;
3130 }
3131
6d76dcf4 3132 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
8f34af6f 3133 if (ret)
08cf9faf 3134 goto out_uncharge_cgroup_reservation;
8f34af6f 3135
db71ef79 3136 spin_lock_irq(&hugetlb_lock);
d85f69b0
MK
3137 /*
3138 * glb_chg is passed to indicate whether or not a page must be taken
3139 * from the global free pool (global change). gbl_chg == 0 indicates
3140 * a reservation exists for the allocation.
3141 */
ff7d853b
SK
3142 folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg);
3143 if (!folio) {
db71ef79 3144 spin_unlock_irq(&hugetlb_lock);
ff7d853b
SK
3145 folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
3146 if (!folio)
8f34af6f 3147 goto out_uncharge_cgroup;
12df140f 3148 spin_lock_irq(&hugetlb_lock);
a88c7695 3149 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
ff7d853b 3150 folio_set_hugetlb_restore_reserve(folio);
a88c7695
NH
3151 h->resv_huge_pages--;
3152 }
ff7d853b
SK
3153 list_add(&folio->lru, &h->hugepage_activelist);
3154 folio_ref_unfreeze(folio, 1);
81a6fcae 3155 /* Fall through */
68842c9b 3156 }
ff7d853b
SK
3157
3158 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
08cf9faf
MA
3159 /* If allocation is not consuming a reservation, also store the
3160 * hugetlb_cgroup pointer on the page.
3161 */
3162 if (deferred_reserve) {
3163 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
ff7d853b 3164 h_cg, folio);
08cf9faf
MA
3165 }
3166
db71ef79 3167 spin_unlock_irq(&hugetlb_lock);
348ea204 3168
ff7d853b 3169 hugetlb_set_folio_subpool(folio, spool);
90d8b7e6 3170
d85f69b0
MK
3171 map_commit = vma_commit_reservation(h, vma, addr);
3172 if (unlikely(map_chg > map_commit)) {
33039678
MK
3173 /*
3174 * The page was added to the reservation map between
3175 * vma_needs_reservation and vma_commit_reservation.
3176 * This indicates a race with hugetlb_reserve_pages.
3177 * Adjust for the subpool count incremented above AND
3178 * in hugetlb_reserve_pages for the same page. Also,
3179 * the reservation count added in hugetlb_reserve_pages
3180 * no longer applies.
3181 */
3182 long rsv_adjust;
3183
3184 rsv_adjust = hugepage_subpool_put_pages(spool, 1);
3185 hugetlb_acct_memory(h, -rsv_adjust);
79aa925b 3186 if (deferred_reserve)
d4ab0316
SK
3187 hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
3188 pages_per_huge_page(h), folio);
33039678 3189 }
8cba9576
NP
3190
3191 if (!memcg_charge_ret)
3192 mem_cgroup_commit_charge(folio, memcg);
3193 mem_cgroup_put(memcg);
3194
d0ce0e47 3195 return folio;
8f34af6f
JZ
3196
3197out_uncharge_cgroup:
3198 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
08cf9faf
MA
3199out_uncharge_cgroup_reservation:
3200 if (deferred_reserve)
3201 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
3202 h_cg);
8f34af6f 3203out_subpool_put:
d85f69b0 3204 if (map_chg || avoid_reserve)
8f34af6f 3205 hugepage_subpool_put_pages(spool, 1);
8cba9576 3206out_end_reservation:
feba16e2 3207 vma_end_reservation(h, vma, addr);
8cba9576
NP
3208 if (!memcg_charge_ret)
3209 mem_cgroup_cancel_charge(memcg, nr_pages);
3210 mem_cgroup_put(memcg);
8f34af6f 3211 return ERR_PTR(-ENOSPC);
b45b5bd6
DG
3212}
3213
b5389086 3214int alloc_bootmem_huge_page(struct hstate *h, int nid)
e24a1307 3215 __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
b5389086 3216int __alloc_bootmem_huge_page(struct hstate *h, int nid)
aa888a74 3217{
b5389086 3218 struct huge_bootmem_page *m = NULL; /* initialize for clang */
b2261026 3219 int nr_nodes, node;
aa888a74 3220
b5389086
ZY
3221 /* do node specific alloc */
3222 if (nid != NUMA_NO_NODE) {
3223 m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
3224 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
3225 if (!m)
3226 return 0;
3227 goto found;
3228 }
3229 /* allocate from next node when distributing huge pages */
b2261026 3230 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
b5389086 3231 m = memblock_alloc_try_nid_raw(
8b89a116 3232 huge_page_size(h), huge_page_size(h),
97ad1087 3233 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
b5389086
ZY
3234 /*
3235 * Use the beginning of the huge page to store the
3236 * huge_bootmem_page struct (until gather_bootmem
3237 * puts them into the mem_map).
3238 */
3239 if (!m)
3240 return 0;
3241 goto found;
aa888a74 3242 }
aa888a74
AK
3243
3244found:
fde1c4ec
UA
3245
3246 /*
3247 * Only initialize the head struct page in memmap_init_reserved_pages,
3248 * rest of the struct pages will be initialized by the HugeTLB
3249 * subsystem itself.
3250 * The head struct page is used to get folio information by the HugeTLB
3251 * subsystem like zone id and node id.
3252 */
3253 memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
3254 huge_page_size(h) - PAGE_SIZE);
aa888a74 3255 /* Put them into a private list first because mem_map is not up yet */
330d6e48 3256 INIT_LIST_HEAD(&m->list);
aa888a74
AK
3257 list_add(&m->list, &huge_boot_pages);
3258 m->hstate = h;
3259 return 1;
3260}
3261
fde1c4ec
UA
3262/* Initialize [start_page:end_page_number] tail struct pages of a hugepage */
3263static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
3264 unsigned long start_page_number,
3265 unsigned long end_page_number)
3266{
3267 enum zone_type zone = zone_idx(folio_zone(folio));
3268 int nid = folio_nid(folio);
3269 unsigned long head_pfn = folio_pfn(folio);
3270 unsigned long pfn, end_pfn = head_pfn + end_page_number;
3271 int ret;
3272
3273 for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
3274 struct page *page = pfn_to_page(pfn);
3275
3276 __init_single_page(page, pfn, zone, nid);
3277 prep_compound_tail((struct page *)folio, pfn - head_pfn);
3278 ret = page_ref_freeze(page, 1);
3279 VM_BUG_ON(!ret);
3280 }
3281}
3282
3283static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
3284 struct hstate *h,
3285 unsigned long nr_pages)
3286{
3287 int ret;
3288
3289 /* Prepare folio head */
3290 __folio_clear_reserved(folio);
3291 __folio_set_head(folio);
a48bf7b4 3292 ret = folio_ref_freeze(folio, 1);
fde1c4ec
UA
3293 VM_BUG_ON(!ret);
3294 /* Initialize the necessary tail struct pages */
3295 hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages);
3296 prep_compound_head((struct page *)folio, huge_page_order(h));
3297}
3298
48b8d744
MK
3299/*
3300 * Put bootmem huge pages into the standard lists after mem_map is up.
3301 * Note: This only applies to gigantic (order > MAX_ORDER) pages.
3302 */
aa888a74
AK
3303static void __init gather_bootmem_prealloc(void)
3304{
3305 struct huge_bootmem_page *m;
3306
3307 list_for_each_entry(m, &huge_boot_pages, list) {
40d18ebf 3308 struct page *page = virt_to_page(m);
fde1c4ec 3309 struct folio *folio = (void *)page;
aa888a74 3310 struct hstate *h = m->hstate;
ee8f248d 3311
48b8d744 3312 VM_BUG_ON(!hstate_is_gigantic(h));
d1c60955 3313 WARN_ON(folio_ref_count(folio) != 1);
fde1c4ec
UA
3314
3315 hugetlb_folio_init_vmemmap(folio, h,
3316 HUGETLB_VMEMMAP_RESERVE_PAGES);
3317 prep_new_hugetlb_folio(h, folio, folio_nid(folio));
3318 /* If HVO fails, initialize all tail struct pages */
3319 if (!HPageVmemmapOptimized(&folio->page))
3320 hugetlb_folio_init_tail_vmemmap(folio,
3321 HUGETLB_VMEMMAP_RESERVE_PAGES,
3322 pages_per_huge_page(h));
3323 free_huge_folio(folio); /* add to the hugepage allocator */
af0fb9df 3324
b0320c7b 3325 /*
48b8d744
MK
3326 * We need to restore the 'stolen' pages to totalram_pages
3327 * in order to fix confusing memory reports from free(1) and
3328 * other side-effects, like CommitLimit going negative.
b0320c7b 3329 */
48b8d744 3330 adjust_managed_page_count(page, pages_per_huge_page(h));
520495fe 3331 cond_resched();
aa888a74
AK
3332 }
3333}
fde1c4ec 3334
b5389086
ZY
3335static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
3336{
3337 unsigned long i;
3338 char buf[32];
3339
3340 for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
3341 if (hstate_is_gigantic(h)) {
3342 if (!alloc_bootmem_huge_page(h, nid))
3343 break;
3344 } else {
19fc1a7e 3345 struct folio *folio;
b5389086
ZY
3346 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
3347
19fc1a7e 3348 folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
b5389086 3349 &node_states[N_MEMORY], NULL);
19fc1a7e 3350 if (!folio)
b5389086 3351 break;
454a00c4 3352 free_huge_folio(folio); /* free it into the hugepage allocator */
b5389086
ZY
3353 }
3354 cond_resched();
3355 }
3356 if (i == h->max_huge_pages_node[nid])
3357 return;
3358
3359 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3360 pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n",
3361 h->max_huge_pages_node[nid], buf, nid, i);
3362 h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
3363 h->max_huge_pages_node[nid] = i;
3364}
aa888a74 3365
8faa8b07 3366static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1da177e4
LT
3367{
3368 unsigned long i;
f60858f9 3369 nodemask_t *node_alloc_noretry;
b5389086
ZY
3370 bool node_specific_alloc = false;
3371
3372 /* skip gigantic hugepages allocation if hugetlb_cma enabled */
3373 if (hstate_is_gigantic(h) && hugetlb_cma_size) {
3374 pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
3375 return;
3376 }
3377
3378 /* do node specific alloc */
0a7a0f6f 3379 for_each_online_node(i) {
b5389086
ZY
3380 if (h->max_huge_pages_node[i] > 0) {
3381 hugetlb_hstate_alloc_pages_onenode(h, i);
3382 node_specific_alloc = true;
3383 }
3384 }
f60858f9 3385
b5389086
ZY
3386 if (node_specific_alloc)
3387 return;
3388
3389 /* below will do all node balanced alloc */
f60858f9
MK
3390 if (!hstate_is_gigantic(h)) {
3391 /*
3392 * Bit mask controlling how hard we retry per-node allocations.
3393 * Ignore errors as lower level routines can deal with
3394 * node_alloc_noretry == NULL. If this kmalloc fails at boot
3395 * time, we are likely in bigger trouble.
3396 */
3397 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
3398 GFP_KERNEL);
3399 } else {
3400 /* allocations done at boot time */
3401 node_alloc_noretry = NULL;
3402 }
3403
3404 /* bit mask controlling how hard we retry per-node allocations */
3405 if (node_alloc_noretry)
3406 nodes_clear(*node_alloc_noretry);
a5516438 3407
e5ff2159 3408 for (i = 0; i < h->max_huge_pages; ++i) {
bae7f4ae 3409 if (hstate_is_gigantic(h)) {
b5389086 3410 if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
aa888a74 3411 break;
0c397dae 3412 } else if (!alloc_pool_huge_page(h,
f60858f9
MK
3413 &node_states[N_MEMORY],
3414 node_alloc_noretry))
1da177e4 3415 break;
69ed779a 3416 cond_resched();
1da177e4 3417 }
d715cf80
LH
3418 if (i < h->max_huge_pages) {
3419 char buf[32];
3420
c6247f72 3421 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
d715cf80
LH
3422 pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
3423 h->max_huge_pages, buf, i);
3424 h->max_huge_pages = i;
3425 }
f60858f9 3426 kfree(node_alloc_noretry);
e5ff2159
AK
3427}
3428
3429static void __init hugetlb_init_hstates(void)
3430{
79dfc695 3431 struct hstate *h, *h2;
e5ff2159
AK
3432
3433 for_each_hstate(h) {
8faa8b07 3434 /* oversize hugepages were init'ed in early boot */
bae7f4ae 3435 if (!hstate_is_gigantic(h))
8faa8b07 3436 hugetlb_hstate_alloc_pages(h);
79dfc695
MK
3437
3438 /*
3439 * Set demote order for each hstate. Note that
3440 * h->demote_order is initially 0.
3441 * - We can not demote gigantic pages if runtime freeing
3442 * is not supported, so skip this.
a01f4390
MK
3443 * - If CMA allocation is possible, we can not demote
3444 * HUGETLB_PAGE_ORDER or smaller size pages.
79dfc695
MK
3445 */
3446 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3447 continue;
a01f4390
MK
3448 if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
3449 continue;
79dfc695
MK
3450 for_each_hstate(h2) {
3451 if (h2 == h)
3452 continue;
3453 if (h2->order < h->order &&
3454 h2->order > h->demote_order)
3455 h->demote_order = h2->order;
3456 }
e5ff2159
AK
3457 }
3458}
3459
3460static void __init report_hugepages(void)
3461{
3462 struct hstate *h;
3463
3464 for_each_hstate(h) {
4abd32db 3465 char buf[32];
c6247f72
MW
3466
3467 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
6213834c 3468 pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
c6247f72 3469 buf, h->free_huge_pages);
6213834c
MS
3470 pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
3471 hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
e5ff2159
AK
3472 }
3473}
3474
1da177e4 3475#ifdef CONFIG_HIGHMEM
6ae11b27
LS
3476static void try_to_free_low(struct hstate *h, unsigned long count,
3477 nodemask_t *nodes_allowed)
1da177e4 3478{
4415cc8d 3479 int i;
1121828a 3480 LIST_HEAD(page_list);
4415cc8d 3481
9487ca60 3482 lockdep_assert_held(&hugetlb_lock);
bae7f4ae 3483 if (hstate_is_gigantic(h))
aa888a74
AK
3484 return;
3485
1121828a
MK
3486 /*
3487 * Collect pages to be freed on a list, and free after dropping lock
3488 */
6ae11b27 3489 for_each_node_mask(i, *nodes_allowed) {
04bbfd84 3490 struct folio *folio, *next;
a5516438 3491 struct list_head *freel = &h->hugepage_freelists[i];
04bbfd84 3492 list_for_each_entry_safe(folio, next, freel, lru) {
a5516438 3493 if (count >= h->nr_huge_pages)
1121828a 3494 goto out;
04bbfd84 3495 if (folio_test_highmem(folio))
1da177e4 3496 continue;
04bbfd84
MWO
3497 remove_hugetlb_folio(h, folio, false);
3498 list_add(&folio->lru, &page_list);
1da177e4
LT
3499 }
3500 }
1121828a
MK
3501
3502out:
db71ef79 3503 spin_unlock_irq(&hugetlb_lock);
10c6ec49 3504 update_and_free_pages_bulk(h, &page_list);
db71ef79 3505 spin_lock_irq(&hugetlb_lock);
1da177e4
LT
3506}
3507#else
6ae11b27
LS
3508static inline void try_to_free_low(struct hstate *h, unsigned long count,
3509 nodemask_t *nodes_allowed)
1da177e4
LT
3510{
3511}
3512#endif
3513
20a0307c
WF
3514/*
3515 * Increment or decrement surplus_huge_pages. Keep node-specific counters
3516 * balanced by operating on them in a round-robin fashion.
3517 * Returns 1 if an adjustment was made.
3518 */
6ae11b27
LS
3519static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
3520 int delta)
20a0307c 3521{
b2261026 3522 int nr_nodes, node;
20a0307c 3523
9487ca60 3524 lockdep_assert_held(&hugetlb_lock);
20a0307c 3525 VM_BUG_ON(delta != -1 && delta != 1);
20a0307c 3526
b2261026
JK
3527 if (delta < 0) {
3528 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
3529 if (h->surplus_huge_pages_node[node])
3530 goto found;
e8c5c824 3531 }
b2261026
JK
3532 } else {
3533 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3534 if (h->surplus_huge_pages_node[node] <
3535 h->nr_huge_pages_node[node])
3536 goto found;
e8c5c824 3537 }
b2261026
JK
3538 }
3539 return 0;
20a0307c 3540
b2261026
JK
3541found:
3542 h->surplus_huge_pages += delta;
3543 h->surplus_huge_pages_node[node] += delta;
3544 return 1;
20a0307c
WF
3545}
3546
a5516438 3547#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
fd875dca 3548static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
4eb0716e 3549 nodemask_t *nodes_allowed)
1da177e4 3550{
7893d1d5 3551 unsigned long min_count, ret;
10c6ec49 3552 LIST_HEAD(page_list);
f60858f9
MK
3553 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
3554
3555 /*
3556 * Bit mask controlling how hard we retry per-node allocations.
3557 * If we can not allocate the bit mask, do not attempt to allocate
3558 * the requested huge pages.
3559 */
3560 if (node_alloc_noretry)
3561 nodes_clear(*node_alloc_noretry);
3562 else
3563 return -ENOMEM;
1da177e4 3564
29383967
MK
3565 /*
3566 * resize_lock mutex prevents concurrent adjustments to number of
3567 * pages in hstate via the proc/sysfs interfaces.
3568 */
3569 mutex_lock(&h->resize_lock);
b65d4adb 3570 flush_free_hpage_work(h);
db71ef79 3571 spin_lock_irq(&hugetlb_lock);
4eb0716e 3572
fd875dca
MK
3573 /*
3574 * Check for a node specific request.
3575 * Changing node specific huge page count may require a corresponding
3576 * change to the global count. In any case, the passed node mask
3577 * (nodes_allowed) will restrict alloc/free to the specified node.
3578 */
3579 if (nid != NUMA_NO_NODE) {
3580 unsigned long old_count = count;
3581
b72b3c9c
XH
3582 count += persistent_huge_pages(h) -
3583 (h->nr_huge_pages_node[nid] -
3584 h->surplus_huge_pages_node[nid]);
fd875dca
MK
3585 /*
3586 * User may have specified a large count value which caused the
3587 * above calculation to overflow. In this case, they wanted
3588 * to allocate as many huge pages as possible. Set count to
3589 * largest possible value to align with their intention.
3590 */
3591 if (count < old_count)
3592 count = ULONG_MAX;
3593 }
3594
4eb0716e
AG
3595 /*
3596 * Gigantic pages runtime allocation depend on the capability for large
3597 * page range allocation.
3598 * If the system does not provide this feature, return an error when
3599 * the user tries to allocate gigantic pages but let the user free the
3600 * boottime allocated gigantic pages.
3601 */
3602 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
3603 if (count > persistent_huge_pages(h)) {
db71ef79 3604 spin_unlock_irq(&hugetlb_lock);
29383967 3605 mutex_unlock(&h->resize_lock);
f60858f9 3606 NODEMASK_FREE(node_alloc_noretry);
4eb0716e
AG
3607 return -EINVAL;
3608 }
3609 /* Fall through to decrease pool */
3610 }
aa888a74 3611
7893d1d5
AL
3612 /*
3613 * Increase the pool size
3614 * First take pages out of surplus state. Then make up the
3615 * remaining difference by allocating fresh huge pages.
d1c3fb1f 3616 *
3a740e8b 3617 * We might race with alloc_surplus_hugetlb_folio() here and be unable
d1c3fb1f
NA
3618 * to convert a surplus huge page to a normal huge page. That is
3619 * not critical, though, it just means the overall size of the
3620 * pool might be one hugepage larger than it needs to be, but
3621 * within all the constraints specified by the sysctls.
7893d1d5 3622 */
a5516438 3623 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
6ae11b27 3624 if (!adjust_pool_surplus(h, nodes_allowed, -1))
7893d1d5
AL
3625 break;
3626 }
3627
a5516438 3628 while (count > persistent_huge_pages(h)) {
7893d1d5
AL
3629 /*
3630 * If this allocation races such that we no longer need the
454a00c4 3631 * page, free_huge_folio will handle it by freeing the page
7893d1d5
AL
3632 * and reducing the surplus.
3633 */
db71ef79 3634 spin_unlock_irq(&hugetlb_lock);
649920c6
JH
3635
3636 /* yield cpu to avoid soft lockup */
3637 cond_resched();
3638
f60858f9
MK
3639 ret = alloc_pool_huge_page(h, nodes_allowed,
3640 node_alloc_noretry);
db71ef79 3641 spin_lock_irq(&hugetlb_lock);
7893d1d5
AL
3642 if (!ret)
3643 goto out;
3644
536240f2
MG
3645 /* Bail for signals. Probably ctrl-c from user */
3646 if (signal_pending(current))
3647 goto out;
7893d1d5 3648 }
7893d1d5
AL
3649
3650 /*
3651 * Decrease the pool size
3652 * First return free pages to the buddy allocator (being careful
3653 * to keep enough around to satisfy reservations). Then place
3654 * pages into surplus state as needed so the pool will shrink
3655 * to the desired size as pages become free.
d1c3fb1f
NA
3656 *
3657 * By placing pages into the surplus state independent of the
3658 * overcommit value, we are allowing the surplus pool size to
3659 * exceed overcommit. There are few sane options here. Since
3a740e8b 3660 * alloc_surplus_hugetlb_folio() is checking the global counter,
d1c3fb1f
NA
3661 * though, we'll note that we're not allowed to exceed surplus
3662 * and won't grow the pool anywhere else. Not until one of the
3663 * sysctls are changed, or the surplus pages go out of use.
7893d1d5 3664 */
a5516438 3665 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
6b0c880d 3666 min_count = max(count, min_count);
6ae11b27 3667 try_to_free_low(h, min_count, nodes_allowed);
10c6ec49
MK
3668
3669 /*
3670 * Collect pages to be removed on list without dropping lock
3671 */
a5516438 3672 while (min_count < persistent_huge_pages(h)) {
d5b43e96
MWO
3673 struct folio *folio;
3674
3675 folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0);
3676 if (!folio)
1da177e4 3677 break;
10c6ec49 3678
d5b43e96 3679 list_add(&folio->lru, &page_list);
1da177e4 3680 }
10c6ec49 3681 /* free the pages after dropping lock */
db71ef79 3682 spin_unlock_irq(&hugetlb_lock);
10c6ec49 3683 update_and_free_pages_bulk(h, &page_list);
b65d4adb 3684 flush_free_hpage_work(h);
db71ef79 3685 spin_lock_irq(&hugetlb_lock);
10c6ec49 3686
a5516438 3687 while (count < persistent_huge_pages(h)) {
6ae11b27 3688 if (!adjust_pool_surplus(h, nodes_allowed, 1))
7893d1d5
AL
3689 break;
3690 }
3691out:
4eb0716e 3692 h->max_huge_pages = persistent_huge_pages(h);
db71ef79 3693 spin_unlock_irq(&hugetlb_lock);
29383967 3694 mutex_unlock(&h->resize_lock);
4eb0716e 3695
f60858f9
MK
3696 NODEMASK_FREE(node_alloc_noretry);
3697
4eb0716e 3698 return 0;
1da177e4
LT
3699}
3700
bdd7be07 3701static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
8531fc6f 3702{
bdd7be07 3703 int i, nid = folio_nid(folio);
8531fc6f 3704 struct hstate *target_hstate;
31731452 3705 struct page *subpage;
bdd7be07 3706 struct folio *inner_folio;
8531fc6f
MK
3707 int rc = 0;
3708
3709 target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
3710
cfd5082b 3711 remove_hugetlb_folio_for_demote(h, folio, false);
8531fc6f
MK
3712 spin_unlock_irq(&hugetlb_lock);
3713
d8f5f7e4
MK
3714 /*
3715 * If vmemmap already existed for folio, the remove routine above would
3716 * have cleared the hugetlb folio flag. Hence the folio is technically
3717 * no longer a hugetlb folio. hugetlb_vmemmap_restore can only be
3718 * passed hugetlb folios and will BUG otherwise.
3719 */
3720 if (folio_test_hugetlb(folio)) {
3721 rc = hugetlb_vmemmap_restore(h, &folio->page);
3722 if (rc) {
3723 /* Allocation of vmemmmap failed, we can not demote folio */
3724 spin_lock_irq(&hugetlb_lock);
3725 folio_ref_unfreeze(folio, 1);
3726 add_hugetlb_folio(h, folio, false);
3727 return rc;
3728 }
8531fc6f
MK
3729 }
3730
3731 /*
911565b8 3732 * Use destroy_compound_hugetlb_folio_for_demote for all huge page
bdd7be07 3733 * sizes as it will not ref count folios.
8531fc6f 3734 */
911565b8 3735 destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h));
8531fc6f
MK
3736
3737 /*
3738 * Taking target hstate mutex synchronizes with set_max_huge_pages.
3739 * Without the mutex, pages added to target hstate could be marked
3740 * as surplus.
3741 *
3742 * Note that we already hold h->resize_lock. To prevent deadlock,
3743 * use the convention of always taking larger size hstate mutex first.
3744 */
3745 mutex_lock(&target_hstate->resize_lock);
3746 for (i = 0; i < pages_per_huge_page(h);
3747 i += pages_per_huge_page(target_hstate)) {
bdd7be07
SK
3748 subpage = folio_page(folio, i);
3749 inner_folio = page_folio(subpage);
8531fc6f 3750 if (hstate_is_gigantic(target_hstate))
bdd7be07 3751 prep_compound_gigantic_folio_for_demote(inner_folio,
8531fc6f
MK
3752 target_hstate->order);
3753 else
31731452 3754 prep_compound_page(subpage, target_hstate->order);
bdd7be07
SK
3755 folio_change_private(inner_folio, NULL);
3756 prep_new_hugetlb_folio(target_hstate, inner_folio, nid);
454a00c4 3757 free_huge_folio(inner_folio);
8531fc6f
MK
3758 }
3759 mutex_unlock(&target_hstate->resize_lock);
3760
3761 spin_lock_irq(&hugetlb_lock);
3762
3763 /*
3764 * Not absolutely necessary, but for consistency update max_huge_pages
3765 * based on pool changes for the demoted page.
3766 */
3767 h->max_huge_pages--;
a43a83c7
ML
3768 target_hstate->max_huge_pages +=
3769 pages_per_huge_page(h) / pages_per_huge_page(target_hstate);
8531fc6f
MK
3770
3771 return rc;
3772}
3773
79dfc695
MK
3774static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
3775 __must_hold(&hugetlb_lock)
3776{
8531fc6f 3777 int nr_nodes, node;
bdd7be07 3778 struct folio *folio;
79dfc695
MK
3779
3780 lockdep_assert_held(&hugetlb_lock);
3781
3782 /* We should never get here if no demote order */
3783 if (!h->demote_order) {
3784 pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
3785 return -EINVAL; /* internal error */
3786 }
3787
8531fc6f 3788 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
bdd7be07
SK
3789 list_for_each_entry(folio, &h->hugepage_freelists[node], lru) {
3790 if (folio_test_hwpoison(folio))
5a317412 3791 continue;
bdd7be07 3792 return demote_free_hugetlb_folio(h, folio);
8531fc6f
MK
3793 }
3794 }
3795
5a317412
MK
3796 /*
3797 * Only way to get here is if all pages on free lists are poisoned.
3798 * Return -EBUSY so that caller will not retry.
3799 */
3800 return -EBUSY;
79dfc695
MK
3801}
3802
a3437870
NA
3803#define HSTATE_ATTR_RO(_name) \
3804 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
3805
79dfc695
MK
3806#define HSTATE_ATTR_WO(_name) \
3807 static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
3808
a3437870 3809#define HSTATE_ATTR(_name) \
98bc26ac 3810 static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
a3437870
NA
3811
3812static struct kobject *hugepages_kobj;
3813static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3814
9a305230
LS
3815static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
3816
3817static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
a3437870
NA
3818{
3819 int i;
9a305230 3820
a3437870 3821 for (i = 0; i < HUGE_MAX_HSTATE; i++)
9a305230
LS
3822 if (hstate_kobjs[i] == kobj) {
3823 if (nidp)
3824 *nidp = NUMA_NO_NODE;
a3437870 3825 return &hstates[i];
9a305230
LS
3826 }
3827
3828 return kobj_to_node_hstate(kobj, nidp);
a3437870
NA
3829}
3830
06808b08 3831static ssize_t nr_hugepages_show_common(struct kobject *kobj,
a3437870
NA
3832 struct kobj_attribute *attr, char *buf)
3833{
9a305230
LS
3834 struct hstate *h;
3835 unsigned long nr_huge_pages;
3836 int nid;
3837
3838 h = kobj_to_hstate(kobj, &nid);
3839 if (nid == NUMA_NO_NODE)
3840 nr_huge_pages = h->nr_huge_pages;
3841 else
3842 nr_huge_pages = h->nr_huge_pages_node[nid];
3843
ae7a927d 3844 return sysfs_emit(buf, "%lu\n", nr_huge_pages);
a3437870 3845}
adbe8726 3846
238d3c13
DR
3847static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
3848 struct hstate *h, int nid,
3849 unsigned long count, size_t len)
a3437870
NA
3850{
3851 int err;
2d0adf7e 3852 nodemask_t nodes_allowed, *n_mask;
a3437870 3853
2d0adf7e
OS
3854 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3855 return -EINVAL;
adbe8726 3856
9a305230
LS
3857 if (nid == NUMA_NO_NODE) {
3858 /*
3859 * global hstate attribute
3860 */
3861 if (!(obey_mempolicy &&
2d0adf7e
OS
3862 init_nodemask_of_mempolicy(&nodes_allowed)))
3863 n_mask = &node_states[N_MEMORY];
3864 else
3865 n_mask = &nodes_allowed;
3866 } else {
9a305230 3867 /*
fd875dca
MK
3868 * Node specific request. count adjustment happens in
3869 * set_max_huge_pages() after acquiring hugetlb_lock.
9a305230 3870 */
2d0adf7e
OS
3871 init_nodemask_of_node(&nodes_allowed, nid);
3872 n_mask = &nodes_allowed;
fd875dca 3873 }
9a305230 3874
2d0adf7e 3875 err = set_max_huge_pages(h, count, nid, n_mask);
06808b08 3876
4eb0716e 3877 return err ? err : len;
06808b08
LS
3878}
3879
238d3c13
DR
3880static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
3881 struct kobject *kobj, const char *buf,
3882 size_t len)
3883{
3884 struct hstate *h;
3885 unsigned long count;
3886 int nid;
3887 int err;
3888
3889 err = kstrtoul(buf, 10, &count);
3890 if (err)
3891 return err;
3892
3893 h = kobj_to_hstate(kobj, &nid);
3894 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
3895}
3896
06808b08
LS
3897static ssize_t nr_hugepages_show(struct kobject *kobj,
3898 struct kobj_attribute *attr, char *buf)
3899{
3900 return nr_hugepages_show_common(kobj, attr, buf);
3901}
3902
3903static ssize_t nr_hugepages_store(struct kobject *kobj,
3904 struct kobj_attribute *attr, const char *buf, size_t len)
3905{
238d3c13 3906 return nr_hugepages_store_common(false, kobj, buf, len);
a3437870
NA
3907}
3908HSTATE_ATTR(nr_hugepages);
3909
06808b08
LS
3910#ifdef CONFIG_NUMA
3911
3912/*
3913 * hstate attribute for optionally mempolicy-based constraint on persistent
3914 * huge page alloc/free.
3915 */
3916static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
ae7a927d
JP
3917 struct kobj_attribute *attr,
3918 char *buf)
06808b08
LS
3919{
3920 return nr_hugepages_show_common(kobj, attr, buf);
3921}
3922
3923static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
3924 struct kobj_attribute *attr, const char *buf, size_t len)
3925{
238d3c13 3926 return nr_hugepages_store_common(true, kobj, buf, len);
06808b08
LS
3927}
3928HSTATE_ATTR(nr_hugepages_mempolicy);
3929#endif
3930
3931
a3437870
NA
3932static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
3933 struct kobj_attribute *attr, char *buf)
3934{
9a305230 3935 struct hstate *h = kobj_to_hstate(kobj, NULL);
ae7a927d 3936 return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
a3437870 3937}
adbe8726 3938
a3437870
NA
3939static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
3940 struct kobj_attribute *attr, const char *buf, size_t count)
3941{
3942 int err;
3943 unsigned long input;
9a305230 3944 struct hstate *h = kobj_to_hstate(kobj, NULL);
a3437870 3945
bae7f4ae 3946 if (hstate_is_gigantic(h))
adbe8726
EM
3947 return -EINVAL;
3948
3dbb95f7 3949 err = kstrtoul(buf, 10, &input);
a3437870 3950 if (err)
73ae31e5 3951 return err;
a3437870 3952
db71ef79 3953 spin_lock_irq(&hugetlb_lock);
a3437870 3954 h->nr_overcommit_huge_pages = input;
db71ef79 3955 spin_unlock_irq(&hugetlb_lock);
a3437870
NA
3956
3957 return count;
3958}
3959HSTATE_ATTR(nr_overcommit_hugepages);
3960
3961static ssize_t free_hugepages_show(struct kobject *kobj,
3962 struct kobj_attribute *attr, char *buf)
3963{
9a305230
LS
3964 struct hstate *h;
3965 unsigned long free_huge_pages;
3966 int nid;
3967
3968 h = kobj_to_hstate(kobj, &nid);
3969 if (nid == NUMA_NO_NODE)
3970 free_huge_pages = h->free_huge_pages;
3971 else
3972 free_huge_pages = h->free_huge_pages_node[nid];
3973
ae7a927d 3974 return sysfs_emit(buf, "%lu\n", free_huge_pages);
a3437870
NA
3975}
3976HSTATE_ATTR_RO(free_hugepages);
3977
3978static ssize_t resv_hugepages_show(struct kobject *kobj,
3979 struct kobj_attribute *attr, char *buf)
3980{
9a305230 3981 struct hstate *h = kobj_to_hstate(kobj, NULL);
ae7a927d 3982 return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
a3437870
NA
3983}
3984HSTATE_ATTR_RO(resv_hugepages);
3985
3986static ssize_t surplus_hugepages_show(struct kobject *kobj,
3987 struct kobj_attribute *attr, char *buf)
3988{
9a305230
LS
3989 struct hstate *h;
3990 unsigned long surplus_huge_pages;
3991 int nid;
3992
3993 h = kobj_to_hstate(kobj, &nid);
3994 if (nid == NUMA_NO_NODE)
3995 surplus_huge_pages = h->surplus_huge_pages;
3996 else
3997 surplus_huge_pages = h->surplus_huge_pages_node[nid];
3998
ae7a927d 3999 return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
a3437870
NA
4000}
4001HSTATE_ATTR_RO(surplus_hugepages);
4002
79dfc695
MK
4003static ssize_t demote_store(struct kobject *kobj,
4004 struct kobj_attribute *attr, const char *buf, size_t len)
4005{
4006 unsigned long nr_demote;
4007 unsigned long nr_available;
4008 nodemask_t nodes_allowed, *n_mask;
4009 struct hstate *h;
8eeda55f 4010 int err;
79dfc695
MK
4011 int nid;
4012
4013 err = kstrtoul(buf, 10, &nr_demote);
4014 if (err)
4015 return err;
4016 h = kobj_to_hstate(kobj, &nid);
4017
4018 if (nid != NUMA_NO_NODE) {
4019 init_nodemask_of_node(&nodes_allowed, nid);
4020 n_mask = &nodes_allowed;
4021 } else {
4022 n_mask = &node_states[N_MEMORY];
4023 }
4024
4025 /* Synchronize with other sysfs operations modifying huge pages */
4026 mutex_lock(&h->resize_lock);
4027 spin_lock_irq(&hugetlb_lock);
4028
4029 while (nr_demote) {
4030 /*
4031 * Check for available pages to demote each time thorough the
4032 * loop as demote_pool_huge_page will drop hugetlb_lock.
79dfc695
MK
4033 */
4034 if (nid != NUMA_NO_NODE)
4035 nr_available = h->free_huge_pages_node[nid];
4036 else
4037 nr_available = h->free_huge_pages;
4038 nr_available -= h->resv_huge_pages;
4039 if (!nr_available)
4040 break;
4041
4042 err = demote_pool_huge_page(h, n_mask);
4043 if (err)
4044 break;
4045
4046 nr_demote--;
4047 }
4048
4049 spin_unlock_irq(&hugetlb_lock);
4050 mutex_unlock(&h->resize_lock);
4051
4052 if (err)
4053 return err;
4054 return len;
4055}
4056HSTATE_ATTR_WO(demote);
4057
4058static ssize_t demote_size_show(struct kobject *kobj,
4059 struct kobj_attribute *attr, char *buf)
4060{
12658abf 4061 struct hstate *h = kobj_to_hstate(kobj, NULL);
79dfc695
MK
4062 unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
4063
4064 return sysfs_emit(buf, "%lukB\n", demote_size);
4065}
4066
4067static ssize_t demote_size_store(struct kobject *kobj,
4068 struct kobj_attribute *attr,
4069 const char *buf, size_t count)
4070{
4071 struct hstate *h, *demote_hstate;
4072 unsigned long demote_size;
4073 unsigned int demote_order;
79dfc695
MK
4074
4075 demote_size = (unsigned long)memparse(buf, NULL);
4076
4077 demote_hstate = size_to_hstate(demote_size);
4078 if (!demote_hstate)
4079 return -EINVAL;
4080 demote_order = demote_hstate->order;
a01f4390
MK
4081 if (demote_order < HUGETLB_PAGE_ORDER)
4082 return -EINVAL;
79dfc695
MK
4083
4084 /* demote order must be smaller than hstate order */
12658abf 4085 h = kobj_to_hstate(kobj, NULL);
79dfc695
MK
4086 if (demote_order >= h->order)
4087 return -EINVAL;
4088
4089 /* resize_lock synchronizes access to demote size and writes */
4090 mutex_lock(&h->resize_lock);
4091 h->demote_order = demote_order;
4092 mutex_unlock(&h->resize_lock);
4093
4094 return count;
4095}
4096HSTATE_ATTR(demote_size);
4097
a3437870
NA
4098static struct attribute *hstate_attrs[] = {
4099 &nr_hugepages_attr.attr,
4100 &nr_overcommit_hugepages_attr.attr,
4101 &free_hugepages_attr.attr,
4102 &resv_hugepages_attr.attr,
4103 &surplus_hugepages_attr.attr,
06808b08
LS
4104#ifdef CONFIG_NUMA
4105 &nr_hugepages_mempolicy_attr.attr,
4106#endif
a3437870
NA
4107 NULL,
4108};
4109
67e5ed96 4110static const struct attribute_group hstate_attr_group = {
a3437870
NA
4111 .attrs = hstate_attrs,
4112};
4113
79dfc695
MK
4114static struct attribute *hstate_demote_attrs[] = {
4115 &demote_size_attr.attr,
4116 &demote_attr.attr,
4117 NULL,
4118};
4119
4120static const struct attribute_group hstate_demote_attr_group = {
4121 .attrs = hstate_demote_attrs,
4122};
4123
094e9539
JM
4124static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
4125 struct kobject **hstate_kobjs,
67e5ed96 4126 const struct attribute_group *hstate_attr_group)
a3437870
NA
4127{
4128 int retval;
972dc4de 4129 int hi = hstate_index(h);
a3437870 4130
9a305230
LS
4131 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
4132 if (!hstate_kobjs[hi])
a3437870
NA
4133 return -ENOMEM;
4134
9a305230 4135 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
cc2205a6 4136 if (retval) {
9a305230 4137 kobject_put(hstate_kobjs[hi]);
cc2205a6 4138 hstate_kobjs[hi] = NULL;
3a6bdda0 4139 return retval;
cc2205a6 4140 }
a3437870 4141
79dfc695 4142 if (h->demote_order) {
01088a60
ML
4143 retval = sysfs_create_group(hstate_kobjs[hi],
4144 &hstate_demote_attr_group);
4145 if (retval) {
79dfc695 4146 pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
01088a60
ML
4147 sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
4148 kobject_put(hstate_kobjs[hi]);
4149 hstate_kobjs[hi] = NULL;
4150 return retval;
4151 }
79dfc695
MK
4152 }
4153
01088a60 4154 return 0;
a3437870
NA
4155}
4156
9a305230 4157#ifdef CONFIG_NUMA
a4a00b45 4158static bool hugetlb_sysfs_initialized __ro_after_init;
9a305230
LS
4159
4160/*
4161 * node_hstate/s - associate per node hstate attributes, via their kobjects,
10fbcf4c
KS
4162 * with node devices in node_devices[] using a parallel array. The array
4163 * index of a node device or _hstate == node id.
4164 * This is here to avoid any static dependency of the node device driver, in
9a305230
LS
4165 * the base kernel, on the hugetlb module.
4166 */
4167struct node_hstate {
4168 struct kobject *hugepages_kobj;
4169 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
4170};
b4e289a6 4171static struct node_hstate node_hstates[MAX_NUMNODES];
9a305230
LS
4172
4173/*
10fbcf4c 4174 * A subset of global hstate attributes for node devices
9a305230
LS
4175 */
4176static struct attribute *per_node_hstate_attrs[] = {
4177 &nr_hugepages_attr.attr,
4178 &free_hugepages_attr.attr,
4179 &surplus_hugepages_attr.attr,
4180 NULL,
4181};
4182
67e5ed96 4183static const struct attribute_group per_node_hstate_attr_group = {
9a305230
LS
4184 .attrs = per_node_hstate_attrs,
4185};
4186
4187/*
10fbcf4c 4188 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
9a305230
LS
4189 * Returns node id via non-NULL nidp.
4190 */
4191static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
4192{
4193 int nid;
4194
4195 for (nid = 0; nid < nr_node_ids; nid++) {
4196 struct node_hstate *nhs = &node_hstates[nid];
4197 int i;
4198 for (i = 0; i < HUGE_MAX_HSTATE; i++)
4199 if (nhs->hstate_kobjs[i] == kobj) {
4200 if (nidp)
4201 *nidp = nid;
4202 return &hstates[i];
4203 }
4204 }
4205
4206 BUG();
4207 return NULL;
4208}
4209
4210/*
10fbcf4c 4211 * Unregister hstate attributes from a single node device.
9a305230
LS
4212 * No-op if no hstate attributes attached.
4213 */
a4a00b45 4214void hugetlb_unregister_node(struct node *node)
9a305230
LS
4215{
4216 struct hstate *h;
10fbcf4c 4217 struct node_hstate *nhs = &node_hstates[node->dev.id];
9a305230
LS
4218
4219 if (!nhs->hugepages_kobj)
9b5e5d0f 4220 return; /* no hstate attributes */
9a305230 4221
972dc4de
AK
4222 for_each_hstate(h) {
4223 int idx = hstate_index(h);
01088a60
ML
4224 struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
4225
4226 if (!hstate_kobj)
4227 continue;
4228 if (h->demote_order)
4229 sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
4230 sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
4231 kobject_put(hstate_kobj);
4232 nhs->hstate_kobjs[idx] = NULL;
972dc4de 4233 }
9a305230
LS
4234
4235 kobject_put(nhs->hugepages_kobj);
4236 nhs->hugepages_kobj = NULL;
4237}
4238
9a305230
LS
4239
4240/*
10fbcf4c 4241 * Register hstate attributes for a single node device.
9a305230
LS
4242 * No-op if attributes already registered.
4243 */
a4a00b45 4244void hugetlb_register_node(struct node *node)
9a305230
LS
4245{
4246 struct hstate *h;
10fbcf4c 4247 struct node_hstate *nhs = &node_hstates[node->dev.id];
9a305230
LS
4248 int err;
4249
a4a00b45
MS
4250 if (!hugetlb_sysfs_initialized)
4251 return;
4252
9a305230
LS
4253 if (nhs->hugepages_kobj)
4254 return; /* already allocated */
4255
4256 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
10fbcf4c 4257 &node->dev.kobj);
9a305230
LS
4258 if (!nhs->hugepages_kobj)
4259 return;
4260
4261 for_each_hstate(h) {
4262 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
4263 nhs->hstate_kobjs,
4264 &per_node_hstate_attr_group);
4265 if (err) {
282f4214 4266 pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
ffb22af5 4267 h->name, node->dev.id);
9a305230
LS
4268 hugetlb_unregister_node(node);
4269 break;
4270 }
4271 }
4272}
4273
4274/*
9b5e5d0f 4275 * hugetlb init time: register hstate attributes for all registered node
10fbcf4c
KS
4276 * devices of nodes that have memory. All on-line nodes should have
4277 * registered their associated device by this time.
9a305230 4278 */
7d9ca000 4279static void __init hugetlb_register_all_nodes(void)
9a305230
LS
4280{
4281 int nid;
4282
a4a00b45 4283 for_each_online_node(nid)
b958d4d0 4284 hugetlb_register_node(node_devices[nid]);
9a305230
LS
4285}
4286#else /* !CONFIG_NUMA */
4287
4288static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
4289{
4290 BUG();
4291 if (nidp)
4292 *nidp = -1;
4293 return NULL;
4294}
4295
9a305230
LS
4296static void hugetlb_register_all_nodes(void) { }
4297
4298#endif
4299
263b8998
ML
4300#ifdef CONFIG_CMA
4301static void __init hugetlb_cma_check(void);
4302#else
4303static inline __init void hugetlb_cma_check(void)
4304{
4305}
4306#endif
4307
a4a00b45
MS
4308static void __init hugetlb_sysfs_init(void)
4309{
4310 struct hstate *h;
4311 int err;
4312
4313 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
4314 if (!hugepages_kobj)
4315 return;
4316
4317 for_each_hstate(h) {
4318 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
4319 hstate_kobjs, &hstate_attr_group);
4320 if (err)
4321 pr_err("HugeTLB: Unable to add hstate %s", h->name);
4322 }
4323
4324#ifdef CONFIG_NUMA
4325 hugetlb_sysfs_initialized = true;
4326#endif
4327 hugetlb_register_all_nodes();
4328}
4329
962de548
KW
4330#ifdef CONFIG_SYSCTL
4331static void hugetlb_sysctl_init(void);
4332#else
4333static inline void hugetlb_sysctl_init(void) { }
4334#endif
4335
a3437870
NA
4336static int __init hugetlb_init(void)
4337{
8382d914
DB
4338 int i;
4339
d6995da3
MK
4340 BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
4341 __NR_HPAGEFLAGS);
4342
c2833a5b
MK
4343 if (!hugepages_supported()) {
4344 if (hugetlb_max_hstate || default_hstate_max_huge_pages)
4345 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
0ef89d25 4346 return 0;
c2833a5b 4347 }
a3437870 4348
282f4214
MK
4349 /*
4350 * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
4351 * architectures depend on setup being done here.
4352 */
4353 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
4354 if (!parsed_default_hugepagesz) {
4355 /*
4356 * If we did not parse a default huge page size, set
4357 * default_hstate_idx to HPAGE_SIZE hstate. And, if the
4358 * number of huge pages for this default size was implicitly
4359 * specified, set that here as well.
4360 * Note that the implicit setting will overwrite an explicit
4361 * setting. A warning will be printed in this case.
4362 */
4363 default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
4364 if (default_hstate_max_huge_pages) {
4365 if (default_hstate.max_huge_pages) {
4366 char buf[32];
4367
4368 string_get_size(huge_page_size(&default_hstate),
4369 1, STRING_UNITS_2, buf, 32);
4370 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
4371 default_hstate.max_huge_pages, buf);
4372 pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
4373 default_hstate_max_huge_pages);
4374 }
4375 default_hstate.max_huge_pages =
4376 default_hstate_max_huge_pages;
b5389086 4377
0a7a0f6f 4378 for_each_online_node(i)
b5389086
ZY
4379 default_hstate.max_huge_pages_node[i] =
4380 default_hugepages_in_node[i];
d715cf80 4381 }
f8b74815 4382 }
a3437870 4383
cf11e85f 4384 hugetlb_cma_check();
a3437870 4385 hugetlb_init_hstates();
aa888a74 4386 gather_bootmem_prealloc();
a3437870
NA
4387 report_hugepages();
4388
4389 hugetlb_sysfs_init();
7179e7bf 4390 hugetlb_cgroup_file_init();
962de548 4391 hugetlb_sysctl_init();
9a305230 4392
8382d914
DB
4393#ifdef CONFIG_SMP
4394 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
4395#else
4396 num_fault_mutexes = 1;
4397#endif
c672c7f2 4398 hugetlb_fault_mutex_table =
6da2ec56
KC
4399 kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
4400 GFP_KERNEL);
c672c7f2 4401 BUG_ON(!hugetlb_fault_mutex_table);
8382d914
DB
4402
4403 for (i = 0; i < num_fault_mutexes; i++)
c672c7f2 4404 mutex_init(&hugetlb_fault_mutex_table[i]);
a3437870
NA
4405 return 0;
4406}
3e89e1c5 4407subsys_initcall(hugetlb_init);
a3437870 4408
ae94da89
MK
4409/* Overwritten by architectures with more huge page sizes */
4410bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
9fee021d 4411{
ae94da89 4412 return size == HPAGE_SIZE;
9fee021d
VT
4413}
4414
d00181b9 4415void __init hugetlb_add_hstate(unsigned int order)
a3437870
NA
4416{
4417 struct hstate *h;
8faa8b07
AK
4418 unsigned long i;
4419
a3437870 4420 if (size_to_hstate(PAGE_SIZE << order)) {
a3437870
NA
4421 return;
4422 }
47d38344 4423 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
59838b25 4424 BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));
47d38344 4425 h = &hstates[hugetlb_max_hstate++];
29383967 4426 mutex_init(&h->resize_lock);
a3437870 4427 h->order = order;
aca78307 4428 h->mask = ~(huge_page_size(h) - 1);
8faa8b07
AK
4429 for (i = 0; i < MAX_NUMNODES; ++i)
4430 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
0edaecfa 4431 INIT_LIST_HEAD(&h->hugepage_activelist);
54f18d35
AM
4432 h->next_nid_to_alloc = first_memory_node;
4433 h->next_nid_to_free = first_memory_node;
a3437870 4434 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
c2c3a60a 4435 huge_page_size(h)/SZ_1K);
8faa8b07 4436
a3437870
NA
4437 parsed_hstate = h;
4438}
4439
b5389086
ZY
4440bool __init __weak hugetlb_node_alloc_supported(void)
4441{
4442 return true;
4443}
f87442f4
PL
4444
4445static void __init hugepages_clear_pages_in_node(void)
4446{
4447 if (!hugetlb_max_hstate) {
4448 default_hstate_max_huge_pages = 0;
4449 memset(default_hugepages_in_node, 0,
10395680 4450 sizeof(default_hugepages_in_node));
f87442f4
PL
4451 } else {
4452 parsed_hstate->max_huge_pages = 0;
4453 memset(parsed_hstate->max_huge_pages_node, 0,
10395680 4454 sizeof(parsed_hstate->max_huge_pages_node));
f87442f4
PL
4455 }
4456}
4457
282f4214
MK
4458/*
4459 * hugepages command line processing
4460 * hugepages normally follows a valid hugepagsz or default_hugepagsz
4461 * specification. If not, ignore the hugepages value. hugepages can also
4462 * be the first huge page command line option in which case it implicitly
4463 * specifies the number of huge pages for the default size.
4464 */
4465static int __init hugepages_setup(char *s)
a3437870
NA
4466{
4467 unsigned long *mhp;
8faa8b07 4468 static unsigned long *last_mhp;
b5389086
ZY
4469 int node = NUMA_NO_NODE;
4470 int count;
4471 unsigned long tmp;
4472 char *p = s;
a3437870 4473
9fee021d 4474 if (!parsed_valid_hugepagesz) {
282f4214 4475 pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
9fee021d 4476 parsed_valid_hugepagesz = true;
f81f6e4b 4477 return 1;
9fee021d 4478 }
282f4214 4479
a3437870 4480 /*
282f4214
MK
4481 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
4482 * yet, so this hugepages= parameter goes to the "default hstate".
4483 * Otherwise, it goes with the previously parsed hugepagesz or
4484 * default_hugepagesz.
a3437870 4485 */
9fee021d 4486 else if (!hugetlb_max_hstate)
a3437870
NA
4487 mhp = &default_hstate_max_huge_pages;
4488 else
4489 mhp = &parsed_hstate->max_huge_pages;
4490
8faa8b07 4491 if (mhp == last_mhp) {
282f4214 4492 pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
f81f6e4b 4493 return 1;
8faa8b07
AK
4494 }
4495
b5389086
ZY
4496 while (*p) {
4497 count = 0;
4498 if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4499 goto invalid;
4500 /* Parameter is node format */
4501 if (p[count] == ':') {
4502 if (!hugetlb_node_alloc_supported()) {
4503 pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
f81f6e4b 4504 return 1;
b5389086 4505 }
0a7a0f6f 4506 if (tmp >= MAX_NUMNODES || !node_online(tmp))
e79ce983 4507 goto invalid;
0a7a0f6f 4508 node = array_index_nospec(tmp, MAX_NUMNODES);
b5389086 4509 p += count + 1;
b5389086
ZY
4510 /* Parse hugepages */
4511 if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4512 goto invalid;
4513 if (!hugetlb_max_hstate)
4514 default_hugepages_in_node[node] = tmp;
4515 else
4516 parsed_hstate->max_huge_pages_node[node] = tmp;
4517 *mhp += tmp;
4518 /* Go to parse next node*/
4519 if (p[count] == ',')
4520 p += count + 1;
4521 else
4522 break;
4523 } else {
4524 if (p != s)
4525 goto invalid;
4526 *mhp = tmp;
4527 break;
4528 }
4529 }
a3437870 4530
8faa8b07
AK
4531 /*
4532 * Global state is always initialized later in hugetlb_init.
04adbc3f 4533 * But we need to allocate gigantic hstates here early to still
8faa8b07
AK
4534 * use the bootmem allocator.
4535 */
04adbc3f 4536 if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
8faa8b07
AK
4537 hugetlb_hstate_alloc_pages(parsed_hstate);
4538
4539 last_mhp = mhp;
4540
a3437870 4541 return 1;
b5389086
ZY
4542
4543invalid:
4544 pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
f87442f4 4545 hugepages_clear_pages_in_node();
f81f6e4b 4546 return 1;
a3437870 4547}
282f4214 4548__setup("hugepages=", hugepages_setup);
e11bfbfc 4549
282f4214
MK
4550/*
4551 * hugepagesz command line processing
4552 * A specific huge page size can only be specified once with hugepagesz.
4553 * hugepagesz is followed by hugepages on the command line. The global
4554 * variable 'parsed_valid_hugepagesz' is used to determine if prior
4555 * hugepagesz argument was valid.
4556 */
359f2544 4557static int __init hugepagesz_setup(char *s)
e11bfbfc 4558{
359f2544 4559 unsigned long size;
282f4214
MK
4560 struct hstate *h;
4561
4562 parsed_valid_hugepagesz = false;
359f2544
MK
4563 size = (unsigned long)memparse(s, NULL);
4564
4565 if (!arch_hugetlb_valid_size(size)) {
282f4214 4566 pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
f81f6e4b 4567 return 1;
359f2544
MK
4568 }
4569
282f4214
MK
4570 h = size_to_hstate(size);
4571 if (h) {
4572 /*
4573 * hstate for this size already exists. This is normally
4574 * an error, but is allowed if the existing hstate is the
4575 * default hstate. More specifically, it is only allowed if
4576 * the number of huge pages for the default hstate was not
4577 * previously specified.
4578 */
4579 if (!parsed_default_hugepagesz || h != &default_hstate ||
4580 default_hstate.max_huge_pages) {
4581 pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
f81f6e4b 4582 return 1;
282f4214
MK
4583 }
4584
4585 /*
4586 * No need to call hugetlb_add_hstate() as hstate already
4587 * exists. But, do set parsed_hstate so that a following
4588 * hugepages= parameter will be applied to this hstate.
4589 */
4590 parsed_hstate = h;
4591 parsed_valid_hugepagesz = true;
4592 return 1;
38237830
MK
4593 }
4594
359f2544 4595 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
282f4214 4596 parsed_valid_hugepagesz = true;
e11bfbfc
NP
4597 return 1;
4598}
359f2544
MK
4599__setup("hugepagesz=", hugepagesz_setup);
4600
282f4214
MK
4601/*
4602 * default_hugepagesz command line input
4603 * Only one instance of default_hugepagesz allowed on command line.
4604 */
ae94da89 4605static int __init default_hugepagesz_setup(char *s)
e11bfbfc 4606{
ae94da89 4607 unsigned long size;
b5389086 4608 int i;
ae94da89 4609
282f4214 4610 parsed_valid_hugepagesz = false;
282f4214
MK
4611 if (parsed_default_hugepagesz) {
4612 pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
f81f6e4b 4613 return 1;
282f4214
MK
4614 }
4615
ae94da89
MK
4616 size = (unsigned long)memparse(s, NULL);
4617
4618 if (!arch_hugetlb_valid_size(size)) {
282f4214 4619 pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
f81f6e4b 4620 return 1;
ae94da89
MK
4621 }
4622
282f4214
MK
4623 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4624 parsed_valid_hugepagesz = true;
4625 parsed_default_hugepagesz = true;
4626 default_hstate_idx = hstate_index(size_to_hstate(size));
4627
4628 /*
4629 * The number of default huge pages (for this size) could have been
4630 * specified as the first hugetlb parameter: hugepages=X. If so,
4631 * then default_hstate_max_huge_pages is set. If the default huge
23baf831 4632 * page size is gigantic (> MAX_ORDER), then the pages must be
282f4214
MK
4633 * allocated here from bootmem allocator.
4634 */
4635 if (default_hstate_max_huge_pages) {
4636 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
0a7a0f6f 4637 for_each_online_node(i)
b5389086
ZY
4638 default_hstate.max_huge_pages_node[i] =
4639 default_hugepages_in_node[i];
282f4214
MK
4640 if (hstate_is_gigantic(&default_hstate))
4641 hugetlb_hstate_alloc_pages(&default_hstate);
4642 default_hstate_max_huge_pages = 0;
4643 }
4644
e11bfbfc
NP
4645 return 1;
4646}
ae94da89 4647__setup("default_hugepagesz=", default_hugepagesz_setup);
a3437870 4648
d2226ebd
FT
4649static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
4650{
4651#ifdef CONFIG_NUMA
4652 struct mempolicy *mpol = get_task_policy(current);
4653
4654 /*
4655 * Only enforce MPOL_BIND policy which overlaps with cpuset policy
4656 * (from policy_nodemask) specifically for hugetlb case
4657 */
4658 if (mpol->mode == MPOL_BIND &&
4659 (apply_policy_zone(mpol, gfp_zone(gfp)) &&
4660 cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
4661 return &mpol->nodes;
4662#endif
4663 return NULL;
4664}
4665
8ca39e68 4666static unsigned int allowed_mems_nr(struct hstate *h)
8a213460
NA
4667{
4668 int node;
4669 unsigned int nr = 0;
d2226ebd 4670 nodemask_t *mbind_nodemask;
8ca39e68
MS
4671 unsigned int *array = h->free_huge_pages_node;
4672 gfp_t gfp_mask = htlb_alloc_mask(h);
4673
d2226ebd 4674 mbind_nodemask = policy_mbind_nodemask(gfp_mask);
8ca39e68 4675 for_each_node_mask(node, cpuset_current_mems_allowed) {
d2226ebd 4676 if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
8ca39e68
MS
4677 nr += array[node];
4678 }
8a213460
NA
4679
4680 return nr;
4681}
4682
4683#ifdef CONFIG_SYSCTL
17743798
MS
4684static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
4685 void *buffer, size_t *length,
4686 loff_t *ppos, unsigned long *out)
4687{
4688 struct ctl_table dup_table;
4689
4690 /*
4691 * In order to avoid races with __do_proc_doulongvec_minmax(), we
4692 * can duplicate the @table and alter the duplicate of it.
4693 */
4694 dup_table = *table;
4695 dup_table.data = out;
4696
4697 return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
4698}
4699
06808b08
LS
4700static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
4701 struct ctl_table *table, int write,
32927393 4702 void *buffer, size_t *length, loff_t *ppos)
1da177e4 4703{
e5ff2159 4704 struct hstate *h = &default_hstate;
238d3c13 4705 unsigned long tmp = h->max_huge_pages;
08d4a246 4706 int ret;
e5ff2159 4707
457c1b27 4708 if (!hugepages_supported())
86613628 4709 return -EOPNOTSUPP;
457c1b27 4710
17743798
MS
4711 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4712 &tmp);
08d4a246
MH
4713 if (ret)
4714 goto out;
e5ff2159 4715
238d3c13
DR
4716 if (write)
4717 ret = __nr_hugepages_store_common(obey_mempolicy, h,
4718 NUMA_NO_NODE, tmp, *length);
08d4a246
MH
4719out:
4720 return ret;
1da177e4 4721}
396faf03 4722
962de548 4723static int hugetlb_sysctl_handler(struct ctl_table *table, int write,
32927393 4724 void *buffer, size_t *length, loff_t *ppos)
06808b08
LS
4725{
4726
4727 return hugetlb_sysctl_handler_common(false, table, write,
4728 buffer, length, ppos);
4729}
4730
4731#ifdef CONFIG_NUMA
962de548 4732static int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
32927393 4733 void *buffer, size_t *length, loff_t *ppos)
06808b08
LS
4734{
4735 return hugetlb_sysctl_handler_common(true, table, write,
4736 buffer, length, ppos);
4737}
4738#endif /* CONFIG_NUMA */
4739
962de548 4740static int hugetlb_overcommit_handler(struct ctl_table *table, int write,
32927393 4741 void *buffer, size_t *length, loff_t *ppos)
a3d0c6aa 4742{
a5516438 4743 struct hstate *h = &default_hstate;
e5ff2159 4744 unsigned long tmp;
08d4a246 4745 int ret;
e5ff2159 4746
457c1b27 4747 if (!hugepages_supported())
86613628 4748 return -EOPNOTSUPP;
457c1b27 4749
c033a93c 4750 tmp = h->nr_overcommit_huge_pages;
e5ff2159 4751
bae7f4ae 4752 if (write && hstate_is_gigantic(h))
adbe8726
EM
4753 return -EINVAL;
4754
17743798
MS
4755 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4756 &tmp);
08d4a246
MH
4757 if (ret)
4758 goto out;
e5ff2159
AK
4759
4760 if (write) {
db71ef79 4761 spin_lock_irq(&hugetlb_lock);
e5ff2159 4762 h->nr_overcommit_huge_pages = tmp;
db71ef79 4763 spin_unlock_irq(&hugetlb_lock);
e5ff2159 4764 }
08d4a246
MH
4765out:
4766 return ret;
a3d0c6aa
NA
4767}
4768
962de548
KW
4769static struct ctl_table hugetlb_table[] = {
4770 {
4771 .procname = "nr_hugepages",
4772 .data = NULL,
4773 .maxlen = sizeof(unsigned long),
4774 .mode = 0644,
4775 .proc_handler = hugetlb_sysctl_handler,
4776 },
4777#ifdef CONFIG_NUMA
4778 {
4779 .procname = "nr_hugepages_mempolicy",
4780 .data = NULL,
4781 .maxlen = sizeof(unsigned long),
4782 .mode = 0644,
4783 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
4784 },
4785#endif
4786 {
4787 .procname = "hugetlb_shm_group",
4788 .data = &sysctl_hugetlb_shm_group,
4789 .maxlen = sizeof(gid_t),
4790 .mode = 0644,
4791 .proc_handler = proc_dointvec,
4792 },
4793 {
4794 .procname = "nr_overcommit_hugepages",
4795 .data = NULL,
4796 .maxlen = sizeof(unsigned long),
4797 .mode = 0644,
4798 .proc_handler = hugetlb_overcommit_handler,
4799 },
4800 { }
4801};
4802
4803static void hugetlb_sysctl_init(void)
4804{
4805 register_sysctl_init("vm", hugetlb_table);
4806}
1da177e4
LT
4807#endif /* CONFIG_SYSCTL */
4808
e1759c21 4809void hugetlb_report_meminfo(struct seq_file *m)
1da177e4 4810{
fcb2b0c5
RG
4811 struct hstate *h;
4812 unsigned long total = 0;
4813
457c1b27
NA
4814 if (!hugepages_supported())
4815 return;
fcb2b0c5
RG
4816
4817 for_each_hstate(h) {
4818 unsigned long count = h->nr_huge_pages;
4819
aca78307 4820 total += huge_page_size(h) * count;
fcb2b0c5
RG
4821
4822 if (h == &default_hstate)
4823 seq_printf(m,
4824 "HugePages_Total: %5lu\n"
4825 "HugePages_Free: %5lu\n"
4826 "HugePages_Rsvd: %5lu\n"
4827 "HugePages_Surp: %5lu\n"
4828 "Hugepagesize: %8lu kB\n",
4829 count,
4830 h->free_huge_pages,
4831 h->resv_huge_pages,
4832 h->surplus_huge_pages,
aca78307 4833 huge_page_size(h) / SZ_1K);
fcb2b0c5
RG
4834 }
4835
aca78307 4836 seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K);
1da177e4
LT
4837}
4838
7981593b 4839int hugetlb_report_node_meminfo(char *buf, int len, int nid)
1da177e4 4840{
a5516438 4841 struct hstate *h = &default_hstate;
7981593b 4842
457c1b27
NA
4843 if (!hugepages_supported())
4844 return 0;
7981593b
JP
4845
4846 return sysfs_emit_at(buf, len,
4847 "Node %d HugePages_Total: %5u\n"
4848 "Node %d HugePages_Free: %5u\n"
4849 "Node %d HugePages_Surp: %5u\n",
4850 nid, h->nr_huge_pages_node[nid],
4851 nid, h->free_huge_pages_node[nid],
4852 nid, h->surplus_huge_pages_node[nid]);
1da177e4
LT
4853}
4854
dcadcf1c 4855void hugetlb_show_meminfo_node(int nid)
949f7ec5
DR
4856{
4857 struct hstate *h;
949f7ec5 4858
457c1b27
NA
4859 if (!hugepages_supported())
4860 return;
4861
dcadcf1c
GL
4862 for_each_hstate(h)
4863 printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
4864 nid,
4865 h->nr_huge_pages_node[nid],
4866 h->free_huge_pages_node[nid],
4867 h->surplus_huge_pages_node[nid],
4868 huge_page_size(h) / SZ_1K);
949f7ec5
DR
4869}
4870
5d317b2b
NH
4871void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
4872{
4873 seq_printf(m, "HugetlbPages:\t%8lu kB\n",
6c1aa2d3 4874 K(atomic_long_read(&mm->hugetlb_usage)));
5d317b2b
NH
4875}
4876
1da177e4
LT
4877/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
4878unsigned long hugetlb_total_pages(void)
4879{
d0028588
WL
4880 struct hstate *h;
4881 unsigned long nr_total_pages = 0;
4882
4883 for_each_hstate(h)
4884 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
4885 return nr_total_pages;
1da177e4 4886}
1da177e4 4887
a5516438 4888static int hugetlb_acct_memory(struct hstate *h, long delta)
fc1b8a73
MG
4889{
4890 int ret = -ENOMEM;
4891
0aa7f354
ML
4892 if (!delta)
4893 return 0;
4894
db71ef79 4895 spin_lock_irq(&hugetlb_lock);
fc1b8a73
MG
4896 /*
4897 * When cpuset is configured, it breaks the strict hugetlb page
4898 * reservation as the accounting is done on a global variable. Such
4899 * reservation is completely rubbish in the presence of cpuset because
4900 * the reservation is not checked against page availability for the
4901 * current cpuset. Application can still potentially OOM'ed by kernel
4902 * with lack of free htlb page in cpuset that the task is in.
4903 * Attempt to enforce strict accounting with cpuset is almost
4904 * impossible (or too ugly) because cpuset is too fluid that
4905 * task or memory node can be dynamically moved between cpusets.
4906 *
4907 * The change of semantics for shared hugetlb mapping with cpuset is
4908 * undesirable. However, in order to preserve some of the semantics,
4909 * we fall back to check against current free page availability as
4910 * a best attempt and hopefully to minimize the impact of changing
4911 * semantics that cpuset has.
8ca39e68
MS
4912 *
4913 * Apart from cpuset, we also have memory policy mechanism that
4914 * also determines from which node the kernel will allocate memory
4915 * in a NUMA system. So similar to cpuset, we also should consider
4916 * the memory policy of the current task. Similar to the description
4917 * above.
fc1b8a73
MG
4918 */
4919 if (delta > 0) {
a5516438 4920 if (gather_surplus_pages(h, delta) < 0)
fc1b8a73
MG
4921 goto out;
4922
8ca39e68 4923 if (delta > allowed_mems_nr(h)) {
a5516438 4924 return_unused_surplus_pages(h, delta);
fc1b8a73
MG
4925 goto out;
4926 }
4927 }
4928
4929 ret = 0;
4930 if (delta < 0)
a5516438 4931 return_unused_surplus_pages(h, (unsigned long) -delta);
fc1b8a73
MG
4932
4933out:
db71ef79 4934 spin_unlock_irq(&hugetlb_lock);
fc1b8a73
MG
4935 return ret;
4936}
4937
84afd99b
AW
4938static void hugetlb_vm_op_open(struct vm_area_struct *vma)
4939{
f522c3ac 4940 struct resv_map *resv = vma_resv_map(vma);
84afd99b
AW
4941
4942 /*
612b8a31 4943 * HPAGE_RESV_OWNER indicates a private mapping.
84afd99b
AW
4944 * This new VMA should share its siblings reservation map if present.
4945 * The VMA will only ever have a valid reservation map pointer where
4946 * it is being copied for another still existing VMA. As that VMA
25985edc 4947 * has a reference to the reservation map it cannot disappear until
84afd99b
AW
4948 * after this open call completes. It is therefore safe to take a
4949 * new reference here without additional locking.
4950 */
09a26e83
MK
4951 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
4952 resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
f522c3ac 4953 kref_get(&resv->refs);
09a26e83 4954 }
8d9bfb26 4955
131a79b4
MK
4956 /*
4957 * vma_lock structure for sharable mappings is vma specific.
612b8a31
MK
4958 * Clear old pointer (if copied via vm_area_dup) and allocate
4959 * new structure. Before clearing, make sure vma_lock is not
4960 * for this vma.
131a79b4
MK
4961 */
4962 if (vma->vm_flags & VM_MAYSHARE) {
612b8a31
MK
4963 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
4964
4965 if (vma_lock) {
4966 if (vma_lock->vma != vma) {
4967 vma->vm_private_data = NULL;
4968 hugetlb_vma_lock_alloc(vma);
4969 } else
4970 pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
4971 } else
4972 hugetlb_vma_lock_alloc(vma);
131a79b4 4973 }
84afd99b
AW
4974}
4975
a1e78772
MG
4976static void hugetlb_vm_op_close(struct vm_area_struct *vma)
4977{
a5516438 4978 struct hstate *h = hstate_vma(vma);
8d9bfb26 4979 struct resv_map *resv;
90481622 4980 struct hugepage_subpool *spool = subpool_vma(vma);
4e35f483 4981 unsigned long reserve, start, end;
1c5ecae3 4982 long gbl_reserve;
84afd99b 4983
8d9bfb26
MK
4984 hugetlb_vma_lock_free(vma);
4985
4986 resv = vma_resv_map(vma);
4e35f483
JK
4987 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
4988 return;
84afd99b 4989
4e35f483
JK
4990 start = vma_hugecache_offset(h, vma, vma->vm_start);
4991 end = vma_hugecache_offset(h, vma, vma->vm_end);
84afd99b 4992
4e35f483 4993 reserve = (end - start) - region_count(resv, start, end);
e9fe92ae 4994 hugetlb_cgroup_uncharge_counter(resv, start, end);
4e35f483 4995 if (reserve) {
1c5ecae3
MK
4996 /*
4997 * Decrement reserve counts. The global reserve count may be
4998 * adjusted if the subpool has a minimum size.
4999 */
5000 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
5001 hugetlb_acct_memory(h, -gbl_reserve);
84afd99b 5002 }
e9fe92ae
MA
5003
5004 kref_put(&resv->refs, resv_map_release);
a1e78772
MG
5005}
5006
31383c68
DW
5007static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
5008{
5009 if (addr & ~(huge_page_mask(hstate_vma(vma))))
5010 return -EINVAL;
b30c14cd
JH
5011
5012 /*
5013 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
5014 * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
5015 * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
5016 */
5017 if (addr & ~PUD_MASK) {
5018 /*
5019 * hugetlb_vm_op_split is called right before we attempt to
5020 * split the VMA. We will need to unshare PMDs in the old and
5021 * new VMAs, so let's unshare before we split.
5022 */
5023 unsigned long floor = addr & PUD_MASK;
5024 unsigned long ceil = floor + PUD_SIZE;
5025
5026 if (floor >= vma->vm_start && ceil <= vma->vm_end)
5027 hugetlb_unshare_pmds(vma, floor, ceil);
5028 }
5029
31383c68
DW
5030 return 0;
5031}
5032
05ea8860
DW
5033static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
5034{
aca78307 5035 return huge_page_size(hstate_vma(vma));
05ea8860
DW
5036}
5037
1da177e4
LT
5038/*
5039 * We cannot handle pagefaults against hugetlb pages at all. They cause
5040 * handle_mm_fault() to try to instantiate regular-sized pages in the
6c26d310 5041 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get
1da177e4
LT
5042 * this far.
5043 */
b3ec9f33 5044static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
1da177e4
LT
5045{
5046 BUG();
d0217ac0 5047 return 0;
1da177e4
LT
5048}
5049
eec3636a
JC
5050/*
5051 * When a new function is introduced to vm_operations_struct and added
5052 * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops.
5053 * This is because under System V memory model, mappings created via
5054 * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
5055 * their original vm_ops are overwritten with shm_vm_ops.
5056 */
f0f37e2f 5057const struct vm_operations_struct hugetlb_vm_ops = {
d0217ac0 5058 .fault = hugetlb_vm_op_fault,
84afd99b 5059 .open = hugetlb_vm_op_open,
a1e78772 5060 .close = hugetlb_vm_op_close,
dd3b614f 5061 .may_split = hugetlb_vm_op_split,
05ea8860 5062 .pagesize = hugetlb_vm_op_pagesize,
1da177e4
LT
5063};
5064
1e8f889b
DG
5065static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
5066 int writable)
63551ae0
DG
5067{
5068 pte_t entry;
79c1c594 5069 unsigned int shift = huge_page_shift(hstate_vma(vma));
63551ae0 5070
1e8f889b 5071 if (writable) {
106c992a
GS
5072 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
5073 vma->vm_page_prot)));
63551ae0 5074 } else {
106c992a
GS
5075 entry = huge_pte_wrprotect(mk_huge_pte(page,
5076 vma->vm_page_prot));
63551ae0
DG
5077 }
5078 entry = pte_mkyoung(entry);
79c1c594 5079 entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
63551ae0
DG
5080
5081 return entry;
5082}
5083
1e8f889b
DG
5084static void set_huge_ptep_writable(struct vm_area_struct *vma,
5085 unsigned long address, pte_t *ptep)
5086{
5087 pte_t entry;
5088
106c992a 5089 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
32f84528 5090 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
4b3073e1 5091 update_mmu_cache(vma, address, ptep);
1e8f889b
DG
5092}
5093
d5ed7444 5094bool is_hugetlb_entry_migration(pte_t pte)
4a705fef
NH
5095{
5096 swp_entry_t swp;
5097
5098 if (huge_pte_none(pte) || pte_present(pte))
d5ed7444 5099 return false;
4a705fef 5100 swp = pte_to_swp_entry(pte);
d79d176a 5101 if (is_migration_entry(swp))
d5ed7444 5102 return true;
4a705fef 5103 else
d5ed7444 5104 return false;
4a705fef
NH
5105}
5106
52526ca7 5107bool is_hugetlb_entry_hwpoisoned(pte_t pte)
4a705fef
NH
5108{
5109 swp_entry_t swp;
5110
5111 if (huge_pte_none(pte) || pte_present(pte))
3e5c3600 5112 return false;
4a705fef 5113 swp = pte_to_swp_entry(pte);
d79d176a 5114 if (is_hwpoison_entry(swp))
3e5c3600 5115 return true;
4a705fef 5116 else
3e5c3600 5117 return false;
4a705fef 5118}
1e8f889b 5119
4eae4efa 5120static void
ea4c353d 5121hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
935d4f0c 5122 struct folio *new_folio, pte_t old, unsigned long sz)
4eae4efa 5123{
5a2f8d22
PX
5124 pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
5125
ea4c353d 5126 __folio_mark_uptodate(new_folio);
d0ce0e47 5127 hugepage_add_new_anon_rmap(new_folio, vma, addr);
5a2f8d22
PX
5128 if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old))
5129 newpte = huge_pte_mkuffd_wp(newpte);
935d4f0c 5130 set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz);
4eae4efa 5131 hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
ea4c353d 5132 folio_set_hugetlb_migratable(new_folio);
4eae4efa
PX
5133}
5134
63551ae0 5135int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
bc70fbf2
PX
5136 struct vm_area_struct *dst_vma,
5137 struct vm_area_struct *src_vma)
63551ae0 5138{
3aa4ed80 5139 pte_t *src_pte, *dst_pte, entry;
ad27ce20 5140 struct folio *pte_folio;
1c59827d 5141 unsigned long addr;
bc70fbf2
PX
5142 bool cow = is_cow_mapping(src_vma->vm_flags);
5143 struct hstate *h = hstate_vma(src_vma);
a5516438 5144 unsigned long sz = huge_page_size(h);
4eae4efa 5145 unsigned long npages = pages_per_huge_page(h);
ac46d4f3 5146 struct mmu_notifier_range range;
e95a9851 5147 unsigned long last_addr_mask;
e8569dd2 5148 int ret = 0;
1e8f889b 5149
ac46d4f3 5150 if (cow) {
7d4a8be0 5151 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src,
bc70fbf2
PX
5152 src_vma->vm_start,
5153 src_vma->vm_end);
ac46d4f3 5154 mmu_notifier_invalidate_range_start(&range);
e727bfd5 5155 vma_assert_write_locked(src_vma);
623a1ddf 5156 raw_write_seqcount_begin(&src->write_protect_seq);
40549ba8
MK
5157 } else {
5158 /*
5159 * For shared mappings the vma lock must be held before
9c67a207 5160 * calling hugetlb_walk() in the src vma. Otherwise, the
40549ba8
MK
5161 * returned ptep could go away if part of a shared pmd and
5162 * another thread calls huge_pmd_unshare.
5163 */
5164 hugetlb_vma_lock_read(src_vma);
ac46d4f3 5165 }
e8569dd2 5166
e95a9851 5167 last_addr_mask = hugetlb_mask_last_page(h);
bc70fbf2 5168 for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
cb900f41 5169 spinlock_t *src_ptl, *dst_ptl;
9c67a207 5170 src_pte = hugetlb_walk(src_vma, addr, sz);
e95a9851
MK
5171 if (!src_pte) {
5172 addr |= last_addr_mask;
c74df32c 5173 continue;
e95a9851 5174 }
bc70fbf2 5175 dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
e8569dd2
AS
5176 if (!dst_pte) {
5177 ret = -ENOMEM;
5178 break;
5179 }
c5c99429 5180
5e41540c
MK
5181 /*
5182 * If the pagetables are shared don't copy or take references.
5e41540c 5183 *
3aa4ed80 5184 * dst_pte == src_pte is the common case of src/dest sharing.
5e41540c 5185 * However, src could have 'unshared' and dst shares with
3aa4ed80
ML
5186 * another vma. So page_count of ptep page is checked instead
5187 * to reliably determine whether pte is shared.
5e41540c 5188 */
3aa4ed80 5189 if (page_count(virt_to_page(dst_pte)) > 1) {
e95a9851 5190 addr |= last_addr_mask;
c5c99429 5191 continue;
e95a9851 5192 }
c5c99429 5193
cb900f41
KS
5194 dst_ptl = huge_pte_lock(h, dst, dst_pte);
5195 src_ptl = huge_pte_lockptr(h, src, src_pte);
5196 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4a705fef 5197 entry = huge_ptep_get(src_pte);
4eae4efa 5198again:
3aa4ed80 5199 if (huge_pte_none(entry)) {
5e41540c 5200 /*
3aa4ed80 5201 * Skip if src entry none.
5e41540c 5202 */
4a705fef 5203 ;
c2cb0dcc 5204 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
5a2f8d22 5205 if (!userfaultfd_wp(dst_vma))
c2cb0dcc 5206 entry = huge_pte_clear_uffd_wp(entry);
935d4f0c 5207 set_huge_pte_at(dst, addr, dst_pte, entry, sz);
c2cb0dcc 5208 } else if (unlikely(is_hugetlb_entry_migration(entry))) {
4a705fef 5209 swp_entry_t swp_entry = pte_to_swp_entry(entry);
5a2f8d22 5210 bool uffd_wp = pte_swp_uffd_wp(entry);
4a705fef 5211
6c287605 5212 if (!is_readable_migration_entry(swp_entry) && cow) {
4a705fef
NH
5213 /*
5214 * COW mappings require pages in both
5215 * parent and child to be set to read.
5216 */
4dd845b5
AP
5217 swp_entry = make_readable_migration_entry(
5218 swp_offset(swp_entry));
4a705fef 5219 entry = swp_entry_to_pte(swp_entry);
bc70fbf2 5220 if (userfaultfd_wp(src_vma) && uffd_wp)
5a2f8d22 5221 entry = pte_swp_mkuffd_wp(entry);
935d4f0c 5222 set_huge_pte_at(src, addr, src_pte, entry, sz);
4a705fef 5223 }
5a2f8d22 5224 if (!userfaultfd_wp(dst_vma))
bc70fbf2 5225 entry = huge_pte_clear_uffd_wp(entry);
935d4f0c 5226 set_huge_pte_at(dst, addr, dst_pte, entry, sz);
bc70fbf2 5227 } else if (unlikely(is_pte_marker(entry))) {
af19487f
AR
5228 pte_marker marker = copy_pte_marker(
5229 pte_to_swp_entry(entry), dst_vma);
5230
5231 if (marker)
5232 set_huge_pte_at(dst, addr, dst_pte,
935d4f0c 5233 make_pte_marker(marker), sz);
4a705fef 5234 } else {
4eae4efa 5235 entry = huge_ptep_get(src_pte);
ad27ce20
Z
5236 pte_folio = page_folio(pte_page(entry));
5237 folio_get(pte_folio);
4eae4efa
PX
5238
5239 /*
fb3d824d
DH
5240 * Failing to duplicate the anon rmap is a rare case
5241 * where we see pinned hugetlb pages while they're
5242 * prone to COW. We need to do the COW earlier during
5243 * fork.
4eae4efa
PX
5244 *
5245 * When pre-allocating the page or copying data, we
5246 * need to be without the pgtable locks since we could
5247 * sleep during the process.
5248 */
ad27ce20
Z
5249 if (!folio_test_anon(pte_folio)) {
5250 page_dup_file_rmap(&pte_folio->page, true);
5251 } else if (page_try_dup_anon_rmap(&pte_folio->page,
5252 true, src_vma)) {
4eae4efa 5253 pte_t src_pte_old = entry;
d0ce0e47 5254 struct folio *new_folio;
4eae4efa
PX
5255
5256 spin_unlock(src_ptl);
5257 spin_unlock(dst_ptl);
5258 /* Do not use reserve as it's private owned */
d0ce0e47
SK
5259 new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
5260 if (IS_ERR(new_folio)) {
ad27ce20 5261 folio_put(pte_folio);
d0ce0e47 5262 ret = PTR_ERR(new_folio);
4eae4efa
PX
5263 break;
5264 }
1cb9dc4b 5265 ret = copy_user_large_folio(new_folio,
ad27ce20
Z
5266 pte_folio,
5267 addr, dst_vma);
5268 folio_put(pte_folio);
1cb9dc4b
LS
5269 if (ret) {
5270 folio_put(new_folio);
5271 break;
5272 }
4eae4efa 5273
d0ce0e47 5274 /* Install the new hugetlb folio if src pte stable */
4eae4efa
PX
5275 dst_ptl = huge_pte_lock(h, dst, dst_pte);
5276 src_ptl = huge_pte_lockptr(h, src, src_pte);
5277 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
5278 entry = huge_ptep_get(src_pte);
5279 if (!pte_same(src_pte_old, entry)) {
bc70fbf2 5280 restore_reserve_on_error(h, dst_vma, addr,
d2d7bb44 5281 new_folio);
d0ce0e47 5282 folio_put(new_folio);
3aa4ed80 5283 /* huge_ptep of dst_pte won't change as in child */
4eae4efa
PX
5284 goto again;
5285 }
5a2f8d22 5286 hugetlb_install_folio(dst_vma, dst_pte, addr,
935d4f0c 5287 new_folio, src_pte_old, sz);
4eae4efa
PX
5288 spin_unlock(src_ptl);
5289 spin_unlock(dst_ptl);
5290 continue;
5291 }
5292
34ee645e 5293 if (cow) {
0f10851e
JG
5294 /*
5295 * No need to notify as we are downgrading page
5296 * table protection not changing it to point
5297 * to a new page.
5298 *
ee65728e 5299 * See Documentation/mm/mmu_notifier.rst
0f10851e 5300 */
7f2e9525 5301 huge_ptep_set_wrprotect(src, addr, src_pte);
84894e1c 5302 entry = huge_pte_wrprotect(entry);
34ee645e 5303 }
4eae4efa 5304
5a2f8d22
PX
5305 if (!userfaultfd_wp(dst_vma))
5306 entry = huge_pte_clear_uffd_wp(entry);
5307
935d4f0c 5308 set_huge_pte_at(dst, addr, dst_pte, entry, sz);
4eae4efa 5309 hugetlb_count_add(npages, dst);
1c59827d 5310 }
cb900f41
KS
5311 spin_unlock(src_ptl);
5312 spin_unlock(dst_ptl);
63551ae0 5313 }
63551ae0 5314
623a1ddf
DH
5315 if (cow) {
5316 raw_write_seqcount_end(&src->write_protect_seq);
ac46d4f3 5317 mmu_notifier_invalidate_range_end(&range);
40549ba8
MK
5318 } else {
5319 hugetlb_vma_unlock_read(src_vma);
623a1ddf 5320 }
e8569dd2
AS
5321
5322 return ret;
63551ae0
DG
5323}
5324
550a7d60 5325static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
935d4f0c
RR
5326 unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
5327 unsigned long sz)
550a7d60
MA
5328{
5329 struct hstate *h = hstate_vma(vma);
5330 struct mm_struct *mm = vma->vm_mm;
550a7d60 5331 spinlock_t *src_ptl, *dst_ptl;
db110a99 5332 pte_t pte;
550a7d60 5333
550a7d60
MA
5334 dst_ptl = huge_pte_lock(h, mm, dst_pte);
5335 src_ptl = huge_pte_lockptr(h, mm, src_pte);
5336
5337 /*
5338 * We don't have to worry about the ordering of src and dst ptlocks
8651a137 5339 * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock.
550a7d60
MA
5340 */
5341 if (src_ptl != dst_ptl)
5342 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
5343
5344 pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
935d4f0c 5345 set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
550a7d60
MA
5346
5347 if (src_ptl != dst_ptl)
5348 spin_unlock(src_ptl);
5349 spin_unlock(dst_ptl);
5350}
5351
5352int move_hugetlb_page_tables(struct vm_area_struct *vma,
5353 struct vm_area_struct *new_vma,
5354 unsigned long old_addr, unsigned long new_addr,
5355 unsigned long len)
5356{
5357 struct hstate *h = hstate_vma(vma);
5358 struct address_space *mapping = vma->vm_file->f_mapping;
5359 unsigned long sz = huge_page_size(h);
5360 struct mm_struct *mm = vma->vm_mm;
5361 unsigned long old_end = old_addr + len;
e95a9851 5362 unsigned long last_addr_mask;
550a7d60
MA
5363 pte_t *src_pte, *dst_pte;
5364 struct mmu_notifier_range range;
3d0b95cd 5365 bool shared_pmd = false;
550a7d60 5366
7d4a8be0 5367 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
550a7d60
MA
5368 old_end);
5369 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
3d0b95cd
BW
5370 /*
5371 * In case of shared PMDs, we should cover the maximum possible
5372 * range.
5373 */
5374 flush_cache_range(vma, range.start, range.end);
5375
550a7d60 5376 mmu_notifier_invalidate_range_start(&range);
e95a9851 5377 last_addr_mask = hugetlb_mask_last_page(h);
550a7d60 5378 /* Prevent race with file truncation */
40549ba8 5379 hugetlb_vma_lock_write(vma);
550a7d60
MA
5380 i_mmap_lock_write(mapping);
5381 for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
9c67a207 5382 src_pte = hugetlb_walk(vma, old_addr, sz);
e95a9851
MK
5383 if (!src_pte) {
5384 old_addr |= last_addr_mask;
5385 new_addr |= last_addr_mask;
550a7d60 5386 continue;
e95a9851 5387 }
550a7d60
MA
5388 if (huge_pte_none(huge_ptep_get(src_pte)))
5389 continue;
5390
4ddb4d91 5391 if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
3d0b95cd 5392 shared_pmd = true;
4ddb4d91
MK
5393 old_addr |= last_addr_mask;
5394 new_addr |= last_addr_mask;
550a7d60 5395 continue;
3d0b95cd 5396 }
550a7d60
MA
5397
5398 dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
5399 if (!dst_pte)
5400 break;
5401
935d4f0c 5402 move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
550a7d60 5403 }
3d0b95cd
BW
5404
5405 if (shared_pmd)
f720b471 5406 flush_hugetlb_tlb_range(vma, range.start, range.end);
3d0b95cd 5407 else
f720b471 5408 flush_hugetlb_tlb_range(vma, old_end - len, old_end);
550a7d60 5409 mmu_notifier_invalidate_range_end(&range);
13e4ad2c 5410 i_mmap_unlock_write(mapping);
40549ba8 5411 hugetlb_vma_unlock_write(vma);
550a7d60
MA
5412
5413 return len + old_addr - old_end;
5414}
5415
2820b0f0
RR
5416void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
5417 unsigned long start, unsigned long end,
5418 struct page *ref_page, zap_flags_t zap_flags)
63551ae0
DG
5419{
5420 struct mm_struct *mm = vma->vm_mm;
5421 unsigned long address;
c7546f8f 5422 pte_t *ptep;
63551ae0 5423 pte_t pte;
cb900f41 5424 spinlock_t *ptl;
63551ae0 5425 struct page *page;
a5516438
AK
5426 struct hstate *h = hstate_vma(vma);
5427 unsigned long sz = huge_page_size(h);
e95a9851 5428 unsigned long last_addr_mask;
a4a118f2 5429 bool force_flush = false;
a5516438 5430
63551ae0 5431 WARN_ON(!is_vm_hugetlb_page(vma));
a5516438
AK
5432 BUG_ON(start & ~huge_page_mask(h));
5433 BUG_ON(end & ~huge_page_mask(h));
63551ae0 5434
07e32661
AK
5435 /*
5436 * This is a hugetlb vma, all the pte entries should point
5437 * to huge page.
5438 */
ed6a7935 5439 tlb_change_page_size(tlb, sz);
24669e58 5440 tlb_start_vma(tlb, vma);
dff11abe 5441
e95a9851 5442 last_addr_mask = hugetlb_mask_last_page(h);
569f48b8 5443 address = start;
569f48b8 5444 for (; address < end; address += sz) {
9c67a207 5445 ptep = hugetlb_walk(vma, address, sz);
e95a9851
MK
5446 if (!ptep) {
5447 address |= last_addr_mask;
c7546f8f 5448 continue;
e95a9851 5449 }
c7546f8f 5450
cb900f41 5451 ptl = huge_pte_lock(h, mm, ptep);
4ddb4d91 5452 if (huge_pmd_unshare(mm, vma, address, ptep)) {
31d49da5 5453 spin_unlock(ptl);
a4a118f2
NA
5454 tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
5455 force_flush = true;
4ddb4d91 5456 address |= last_addr_mask;
31d49da5
AK
5457 continue;
5458 }
39dde65c 5459
6629326b 5460 pte = huge_ptep_get(ptep);
31d49da5
AK
5461 if (huge_pte_none(pte)) {
5462 spin_unlock(ptl);
5463 continue;
5464 }
6629326b
HD
5465
5466 /*
9fbc1f63
NH
5467 * Migrating hugepage or HWPoisoned hugepage is already
5468 * unmapped and its refcount is dropped, so just clear pte here.
6629326b 5469 */
9fbc1f63 5470 if (unlikely(!pte_present(pte))) {
05e90bd0
PX
5471 /*
5472 * If the pte was wr-protected by uffd-wp in any of the
5473 * swap forms, meanwhile the caller does not want to
5474 * drop the uffd-wp bit in this zap, then replace the
5475 * pte with a marker.
5476 */
5477 if (pte_swp_uffd_wp_any(pte) &&
5478 !(zap_flags & ZAP_FLAG_DROP_MARKER))
5479 set_huge_pte_at(mm, address, ptep,
935d4f0c
RR
5480 make_pte_marker(PTE_MARKER_UFFD_WP),
5481 sz);
05e90bd0
PX
5482 else
5483 huge_pte_clear(mm, address, ptep, sz);
31d49da5
AK
5484 spin_unlock(ptl);
5485 continue;
8c4894c6 5486 }
6629326b
HD
5487
5488 page = pte_page(pte);
04f2cbe3
MG
5489 /*
5490 * If a reference page is supplied, it is because a specific
5491 * page is being unmapped, not a range. Ensure the page we
5492 * are about to unmap is the actual page of interest.
5493 */
5494 if (ref_page) {
31d49da5
AK
5495 if (page != ref_page) {
5496 spin_unlock(ptl);
5497 continue;
5498 }
04f2cbe3
MG
5499 /*
5500 * Mark the VMA as having unmapped its page so that
5501 * future faults in this VMA will fail rather than
5502 * looking like data was lost
5503 */
5504 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
5505 }
5506
c7546f8f 5507 pte = huge_ptep_get_and_clear(mm, address, ptep);
b528e4b6 5508 tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
106c992a 5509 if (huge_pte_dirty(pte))
6649a386 5510 set_page_dirty(page);
05e90bd0
PX
5511 /* Leave a uffd-wp pte marker if needed */
5512 if (huge_pte_uffd_wp(pte) &&
5513 !(zap_flags & ZAP_FLAG_DROP_MARKER))
5514 set_huge_pte_at(mm, address, ptep,
935d4f0c
RR
5515 make_pte_marker(PTE_MARKER_UFFD_WP),
5516 sz);
5d317b2b 5517 hugetlb_count_sub(pages_per_huge_page(h), mm);
cea86fe2 5518 page_remove_rmap(page, vma, true);
31d49da5 5519
cb900f41 5520 spin_unlock(ptl);
e77b0852 5521 tlb_remove_page_size(tlb, page, huge_page_size(h));
31d49da5
AK
5522 /*
5523 * Bail out after unmapping reference page if supplied
5524 */
5525 if (ref_page)
5526 break;
fe1668ae 5527 }
24669e58 5528 tlb_end_vma(tlb, vma);
a4a118f2
NA
5529
5530 /*
5531 * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
5532 * could defer the flush until now, since by holding i_mmap_rwsem we
5533 * guaranteed that the last refernece would not be dropped. But we must
5534 * do the flushing before we return, as otherwise i_mmap_rwsem will be
5535 * dropped and the last reference to the shared PMDs page might be
5536 * dropped as well.
5537 *
5538 * In theory we could defer the freeing of the PMD pages as well, but
5539 * huge_pmd_unshare() relies on the exact page_count for the PMD page to
5540 * detect sharing, so we cannot defer the release of the page either.
5541 * Instead, do flush now.
5542 */
5543 if (force_flush)
5544 tlb_flush_mmu_tlbonly(tlb);
1da177e4 5545}
63551ae0 5546
2820b0f0
RR
5547void __hugetlb_zap_begin(struct vm_area_struct *vma,
5548 unsigned long *start, unsigned long *end)
d833352a 5549{
2820b0f0
RR
5550 if (!vma->vm_file) /* hugetlbfs_file_mmap error */
5551 return;
5552
5553 adjust_range_if_pmd_sharing_possible(vma, start, end);
131a79b4 5554 hugetlb_vma_lock_write(vma);
2820b0f0
RR
5555 if (vma->vm_file)
5556 i_mmap_lock_write(vma->vm_file->f_mapping);
5557}
131a79b4 5558
2820b0f0
RR
5559void __hugetlb_zap_end(struct vm_area_struct *vma,
5560 struct zap_details *details)
5561{
5562 zap_flags_t zap_flags = details ? details->zap_flags : 0;
131a79b4 5563
2820b0f0
RR
5564 if (!vma->vm_file) /* hugetlbfs_file_mmap error */
5565 return;
d833352a 5566
04ada095
MK
5567 if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
5568 /*
5569 * Unlock and free the vma lock before releasing i_mmap_rwsem.
5570 * When the vma_lock is freed, this makes the vma ineligible
5571 * for pmd sharing. And, i_mmap_rwsem is required to set up
5572 * pmd sharing. This is important as page tables for this
5573 * unmapped range will be asynchrously deleted. If the page
5574 * tables are shared, there will be issues when accessed by
5575 * someone else.
5576 */
5577 __hugetlb_vma_unlock_write_free(vma);
04ada095 5578 } else {
04ada095
MK
5579 hugetlb_vma_unlock_write(vma);
5580 }
2820b0f0
RR
5581
5582 if (vma->vm_file)
5583 i_mmap_unlock_write(vma->vm_file->f_mapping);
d833352a
MG
5584}
5585
502717f4 5586void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
05e90bd0
PX
5587 unsigned long end, struct page *ref_page,
5588 zap_flags_t zap_flags)
502717f4 5589{
369258ce 5590 struct mmu_notifier_range range;
24669e58 5591 struct mmu_gather tlb;
dff11abe 5592
7d4a8be0 5593 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
369258ce
MK
5594 start, end);
5595 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
5596 mmu_notifier_invalidate_range_start(&range);
a72afd87 5597 tlb_gather_mmu(&tlb, vma->vm_mm);
369258ce 5598
05e90bd0 5599 __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
369258ce
MK
5600
5601 mmu_notifier_invalidate_range_end(&range);
ae8eba8b 5602 tlb_finish_mmu(&tlb);
502717f4
CK
5603}
5604
04f2cbe3
MG
5605/*
5606 * This is called when the original mapper is failing to COW a MAP_PRIVATE
578b7725 5607 * mapping it owns the reserve page for. The intention is to unmap the page
04f2cbe3
MG
5608 * from other VMAs and let the children be SIGKILLed if they are faulting the
5609 * same region.
5610 */
2f4612af
DB
5611static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
5612 struct page *page, unsigned long address)
04f2cbe3 5613{
7526674d 5614 struct hstate *h = hstate_vma(vma);
04f2cbe3
MG
5615 struct vm_area_struct *iter_vma;
5616 struct address_space *mapping;
04f2cbe3
MG
5617 pgoff_t pgoff;
5618
5619 /*
5620 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
5621 * from page cache lookup which is in HPAGE_SIZE units.
5622 */
7526674d 5623 address = address & huge_page_mask(h);
36e4f20a
MH
5624 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
5625 vma->vm_pgoff;
93c76a3d 5626 mapping = vma->vm_file->f_mapping;
04f2cbe3 5627
4eb2b1dc
MG
5628 /*
5629 * Take the mapping lock for the duration of the table walk. As
5630 * this mapping should be shared between all the VMAs,
5631 * __unmap_hugepage_range() is called as the lock is already held
5632 */
83cde9e8 5633 i_mmap_lock_write(mapping);
6b2dbba8 5634 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
04f2cbe3
MG
5635 /* Do not unmap the current VMA */
5636 if (iter_vma == vma)
5637 continue;
5638
2f84a899
MG
5639 /*
5640 * Shared VMAs have their own reserves and do not affect
5641 * MAP_PRIVATE accounting but it is possible that a shared
5642 * VMA is using the same page so check and skip such VMAs.
5643 */
5644 if (iter_vma->vm_flags & VM_MAYSHARE)
5645 continue;
5646
04f2cbe3
MG
5647 /*
5648 * Unmap the page from other VMAs without their own reserves.
5649 * They get marked to be SIGKILLed if they fault in these
5650 * areas. This is because a future no-page fault on this VMA
5651 * could insert a zeroed page instead of the data existing
5652 * from the time of fork. This would look like data corruption
5653 */
5654 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
24669e58 5655 unmap_hugepage_range(iter_vma, address,
05e90bd0 5656 address + huge_page_size(h), page, 0);
04f2cbe3 5657 }
83cde9e8 5658 i_mmap_unlock_write(mapping);
04f2cbe3
MG
5659}
5660
0fe6e20b 5661/*
c89357e2 5662 * hugetlb_wp() should be called with page lock of the original hugepage held.
aa6d2e8c 5663 * Called with hugetlb_fault_mutex_table held and pte_page locked so we
ef009b25
MH
5664 * cannot race with other handlers or page migration.
5665 * Keep the pte_same checks anyway to make transition from the mutex easier.
0fe6e20b 5666 */
c89357e2
DH
5667static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
5668 unsigned long address, pte_t *ptep, unsigned int flags,
371607a3 5669 struct folio *pagecache_folio, spinlock_t *ptl)
1e8f889b 5670{
c89357e2 5671 const bool unshare = flags & FAULT_FLAG_UNSHARE;
60d5b473 5672 pte_t pte = huge_ptep_get(ptep);
a5516438 5673 struct hstate *h = hstate_vma(vma);
959a78b6 5674 struct folio *old_folio;
d0ce0e47 5675 struct folio *new_folio;
2b740303
SJ
5676 int outside_reserve = 0;
5677 vm_fault_t ret = 0;
974e6d66 5678 unsigned long haddr = address & huge_page_mask(h);
ac46d4f3 5679 struct mmu_notifier_range range;
1e8f889b 5680
60d5b473
PX
5681 /*
5682 * Never handle CoW for uffd-wp protected pages. It should be only
5683 * handled when the uffd-wp protection is removed.
5684 *
5685 * Note that only the CoW optimization path (in hugetlb_no_page())
5686 * can trigger this, because hugetlb_fault() will always resolve
5687 * uffd-wp bit first.
5688 */
5689 if (!unshare && huge_pte_uffd_wp(pte))
5690 return 0;
5691
1d8d1464
DH
5692 /*
5693 * hugetlb does not support FOLL_FORCE-style write faults that keep the
5694 * PTE mapped R/O such as maybe_mkwrite() would do.
5695 */
5696 if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
5697 return VM_FAULT_SIGSEGV;
5698
5699 /* Let's take out MAP_SHARED mappings first. */
5700 if (vma->vm_flags & VM_MAYSHARE) {
1d8d1464
DH
5701 set_huge_ptep_writable(vma, haddr, ptep);
5702 return 0;
5703 }
5704
959a78b6 5705 old_folio = page_folio(pte_page(pte));
1e8f889b 5706
662ce1dc
YY
5707 delayacct_wpcopy_start();
5708
04f2cbe3 5709retry_avoidcopy:
c89357e2
DH
5710 /*
5711 * If no-one else is actually using this page, we're the exclusive
5712 * owner and can reuse this page.
5713 */
959a78b6 5714 if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
5ca43289 5715 if (!PageAnonExclusive(&old_folio->page)) {
06968625 5716 folio_move_anon_rmap(old_folio, vma);
5ca43289
DH
5717 SetPageAnonExclusive(&old_folio->page);
5718 }
c89357e2
DH
5719 if (likely(!unshare))
5720 set_huge_ptep_writable(vma, haddr, ptep);
662ce1dc
YY
5721
5722 delayacct_wpcopy_end();
83c54070 5723 return 0;
1e8f889b 5724 }
959a78b6
Z
5725 VM_BUG_ON_PAGE(folio_test_anon(old_folio) &&
5726 PageAnonExclusive(&old_folio->page), &old_folio->page);
1e8f889b 5727
04f2cbe3
MG
5728 /*
5729 * If the process that created a MAP_PRIVATE mapping is about to
5730 * perform a COW due to a shared page count, attempt to satisfy
5731 * the allocation without using the existing reserves. The pagecache
5732 * page is used to determine if the reserve at this address was
5733 * consumed or not. If reserves were used, a partial faulted mapping
5734 * at the time of fork() could consume its reserves on COW instead
5735 * of the full address range.
5736 */
5944d011 5737 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
959a78b6 5738 old_folio != pagecache_folio)
04f2cbe3
MG
5739 outside_reserve = 1;
5740
959a78b6 5741 folio_get(old_folio);
b76c8cfb 5742
ad4404a2
DB
5743 /*
5744 * Drop page table lock as buddy allocator may be called. It will
5745 * be acquired again before returning to the caller, as expected.
5746 */
cb900f41 5747 spin_unlock(ptl);
d0ce0e47 5748 new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve);
1e8f889b 5749
d0ce0e47 5750 if (IS_ERR(new_folio)) {
04f2cbe3
MG
5751 /*
5752 * If a process owning a MAP_PRIVATE mapping fails to COW,
5753 * it is due to references held by a child and an insufficient
5754 * huge page pool. To guarantee the original mappers
5755 * reliability, unmap the page from child processes. The child
5756 * may get SIGKILLed if it later faults.
5757 */
5758 if (outside_reserve) {
40549ba8
MK
5759 struct address_space *mapping = vma->vm_file->f_mapping;
5760 pgoff_t idx;
5761 u32 hash;
5762
959a78b6 5763 folio_put(old_folio);
40549ba8
MK
5764 /*
5765 * Drop hugetlb_fault_mutex and vma_lock before
5766 * unmapping. unmapping needs to hold vma_lock
5767 * in write mode. Dropping vma_lock in read mode
5768 * here is OK as COW mappings do not interact with
5769 * PMD sharing.
5770 *
5771 * Reacquire both after unmap operation.
5772 */
5773 idx = vma_hugecache_offset(h, vma, haddr);
5774 hash = hugetlb_fault_mutex_hash(mapping, idx);
5775 hugetlb_vma_unlock_read(vma);
5776 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5777
959a78b6 5778 unmap_ref_private(mm, vma, &old_folio->page, haddr);
40549ba8
MK
5779
5780 mutex_lock(&hugetlb_fault_mutex_table[hash]);
5781 hugetlb_vma_lock_read(vma);
2f4612af 5782 spin_lock(ptl);
9c67a207 5783 ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
2f4612af
DB
5784 if (likely(ptep &&
5785 pte_same(huge_ptep_get(ptep), pte)))
5786 goto retry_avoidcopy;
5787 /*
5788 * race occurs while re-acquiring page table
5789 * lock, and our job is done.
5790 */
662ce1dc 5791 delayacct_wpcopy_end();
2f4612af 5792 return 0;
04f2cbe3
MG
5793 }
5794
d0ce0e47 5795 ret = vmf_error(PTR_ERR(new_folio));
ad4404a2 5796 goto out_release_old;
1e8f889b
DG
5797 }
5798
0fe6e20b
NH
5799 /*
5800 * When the original hugepage is shared one, it does not have
5801 * anon_vma prepared.
5802 */
44e2aa93 5803 if (unlikely(anon_vma_prepare(vma))) {
ad4404a2
DB
5804 ret = VM_FAULT_OOM;
5805 goto out_release_all;
44e2aa93 5806 }
0fe6e20b 5807
959a78b6 5808 if (copy_user_large_folio(new_folio, old_folio, address, vma)) {
1cb9dc4b
LS
5809 ret = VM_FAULT_HWPOISON_LARGE;
5810 goto out_release_all;
5811 }
d0ce0e47 5812 __folio_mark_uptodate(new_folio);
1e8f889b 5813
7d4a8be0 5814 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
6f4f13e8 5815 haddr + huge_page_size(h));
ac46d4f3 5816 mmu_notifier_invalidate_range_start(&range);
ad4404a2 5817
b76c8cfb 5818 /*
cb900f41 5819 * Retake the page table lock to check for racing updates
b76c8cfb
LW
5820 * before the page tables are altered
5821 */
cb900f41 5822 spin_lock(ptl);
9c67a207 5823 ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
a9af0c5d 5824 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
0f230bc2
PX
5825 pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
5826
c89357e2 5827 /* Break COW or unshare */
5b7a1d40 5828 huge_ptep_clear_flush(vma, haddr, ptep);
959a78b6 5829 page_remove_rmap(&old_folio->page, vma, true);
d0ce0e47 5830 hugepage_add_new_anon_rmap(new_folio, vma, haddr);
0f230bc2
PX
5831 if (huge_pte_uffd_wp(pte))
5832 newpte = huge_pte_mkuffd_wp(newpte);
935d4f0c 5833 set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h));
d0ce0e47 5834 folio_set_hugetlb_migratable(new_folio);
1e8f889b 5835 /* Make the old page be freed below */
959a78b6 5836 new_folio = old_folio;
1e8f889b 5837 }
cb900f41 5838 spin_unlock(ptl);
ac46d4f3 5839 mmu_notifier_invalidate_range_end(&range);
ad4404a2 5840out_release_all:
c89357e2
DH
5841 /*
5842 * No restore in case of successful pagetable update (Break COW or
5843 * unshare)
5844 */
959a78b6 5845 if (new_folio != old_folio)
d2d7bb44 5846 restore_reserve_on_error(h, vma, haddr, new_folio);
d0ce0e47 5847 folio_put(new_folio);
ad4404a2 5848out_release_old:
959a78b6 5849 folio_put(old_folio);
8312034f 5850
ad4404a2 5851 spin_lock(ptl); /* Caller expects lock to be held */
662ce1dc
YY
5852
5853 delayacct_wpcopy_end();
ad4404a2 5854 return ret;
1e8f889b
DG
5855}
5856
3ae77f43
HD
5857/*
5858 * Return whether there is a pagecache page to back given address within VMA.
3ae77f43
HD
5859 */
5860static bool hugetlbfs_pagecache_present(struct hstate *h,
2a15efc9
HD
5861 struct vm_area_struct *vma, unsigned long address)
5862{
91a2fb95 5863 struct address_space *mapping = vma->vm_file->f_mapping;
a08c7193 5864 pgoff_t idx = linear_page_index(vma, address);
fd4aed8d 5865 struct folio *folio;
2a15efc9 5866
fd4aed8d
MK
5867 folio = filemap_get_folio(mapping, idx);
5868 if (IS_ERR(folio))
5869 return false;
5870 folio_put(folio);
5871 return true;
2a15efc9
HD
5872}
5873
9b91c0e2 5874int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
ab76ad54
MK
5875 pgoff_t idx)
5876{
5877 struct inode *inode = mapping->host;
5878 struct hstate *h = hstate_inode(inode);
d9ef44de 5879 int err;
ab76ad54 5880
a08c7193 5881 idx <<= huge_page_order(h);
d9ef44de
MWO
5882 __folio_set_locked(folio);
5883 err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
5884
5885 if (unlikely(err)) {
5886 __folio_clear_locked(folio);
ab76ad54 5887 return err;
d9ef44de 5888 }
9b91c0e2 5889 folio_clear_hugetlb_restore_reserve(folio);
ab76ad54 5890
22146c3c 5891 /*
d9ef44de 5892 * mark folio dirty so that it will not be removed from cache/file
22146c3c
MK
5893 * by non-hugetlbfs specific code paths.
5894 */
d9ef44de 5895 folio_mark_dirty(folio);
22146c3c 5896
ab76ad54
MK
5897 spin_lock(&inode->i_lock);
5898 inode->i_blocks += blocks_per_huge_page(h);
5899 spin_unlock(&inode->i_lock);
5900 return 0;
5901}
5902
7677f7fd
AR
5903static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
5904 struct address_space *mapping,
5905 pgoff_t idx,
5906 unsigned int flags,
5907 unsigned long haddr,
824ddc60 5908 unsigned long addr,
7677f7fd
AR
5909 unsigned long reason)
5910{
7677f7fd
AR
5911 u32 hash;
5912 struct vm_fault vmf = {
5913 .vma = vma,
5914 .address = haddr,
824ddc60 5915 .real_address = addr,
7677f7fd
AR
5916 .flags = flags,
5917
5918 /*
5919 * Hard to debug if it ends up being
5920 * used by a callee that assumes
5921 * something about the other
5922 * uninitialized fields... same as in
5923 * memory.c
5924 */
5925 };
5926
5927 /*
958f32ce
LS
5928 * vma_lock and hugetlb_fault_mutex must be dropped before handling
5929 * userfault. Also mmap_lock could be dropped due to handling
5930 * userfault, any vma operation should be careful from here.
7677f7fd 5931 */
40549ba8 5932 hugetlb_vma_unlock_read(vma);
7677f7fd
AR
5933 hash = hugetlb_fault_mutex_hash(mapping, idx);
5934 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
958f32ce 5935 return handle_userfault(&vmf, reason);
7677f7fd
AR
5936}
5937
2ea7ff1e
PX
5938/*
5939 * Recheck pte with pgtable lock. Returns true if pte didn't change, or
5940 * false if pte changed or is changing.
5941 */
5942static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
5943 pte_t *ptep, pte_t old_pte)
5944{
5945 spinlock_t *ptl;
5946 bool same;
5947
5948 ptl = huge_pte_lock(h, mm, ptep);
5949 same = pte_same(huge_ptep_get(ptep), old_pte);
5950 spin_unlock(ptl);
5951
5952 return same;
5953}
5954
2b740303
SJ
5955static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
5956 struct vm_area_struct *vma,
5957 struct address_space *mapping, pgoff_t idx,
c64e912c
PX
5958 unsigned long address, pte_t *ptep,
5959 pte_t old_pte, unsigned int flags)
ac9b9c66 5960{
a5516438 5961 struct hstate *h = hstate_vma(vma);
2b740303 5962 vm_fault_t ret = VM_FAULT_SIGBUS;
409eb8c2 5963 int anon_rmap = 0;
4c887265 5964 unsigned long size;
d0ce0e47 5965 struct folio *folio;
1e8f889b 5966 pte_t new_pte;
cb900f41 5967 spinlock_t *ptl;
285b8dca 5968 unsigned long haddr = address & huge_page_mask(h);
d0ce0e47 5969 bool new_folio, new_pagecache_folio = false;
958f32ce 5970 u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
4c887265 5971
04f2cbe3
MG
5972 /*
5973 * Currently, we are forced to kill the process in the event the
5974 * original mapper has unmapped pages from the child due to a failed
c89357e2
DH
5975 * COW/unsharing. Warn that such a situation has occurred as it may not
5976 * be obvious.
04f2cbe3
MG
5977 */
5978 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
910154d5 5979 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
ffb22af5 5980 current->pid);
958f32ce 5981 goto out;
04f2cbe3
MG
5982 }
5983
4c887265 5984 /*
188a3972
MK
5985 * Use page lock to guard against racing truncation
5986 * before we get page_table_lock.
4c887265 5987 */
d0ce0e47 5988 new_folio = false;
a08c7193 5989 folio = filemap_lock_hugetlb_folio(h, mapping, idx);
66dabbb6 5990 if (IS_ERR(folio)) {
188a3972
MK
5991 size = i_size_read(mapping->host) >> huge_page_shift(h);
5992 if (idx >= size)
5993 goto out;
7677f7fd 5994 /* Check for page in userfault range */
2ea7ff1e
PX
5995 if (userfaultfd_missing(vma)) {
5996 /*
5997 * Since hugetlb_no_page() was examining pte
5998 * without pgtable lock, we need to re-test under
5999 * lock because the pte may not be stable and could
6000 * have changed from under us. Try to detect
6001 * either changed or during-changing ptes and retry
6002 * properly when needed.
6003 *
6004 * Note that userfaultfd is actually fine with
6005 * false positives (e.g. caused by pte changed),
6006 * but not wrong logical events (e.g. caused by
6007 * reading a pte during changing). The latter can
6008 * confuse the userspace, so the strictness is very
6009 * much preferred. E.g., MISSING event should
6010 * never happen on the page after UFFDIO_COPY has
6011 * correctly installed the page and returned.
6012 */
6013 if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
6014 ret = 0;
6015 goto out;
6016 }
6017
6018 return hugetlb_handle_userfault(vma, mapping, idx, flags,
6019 haddr, address,
6020 VM_UFFD_MISSING);
6021 }
1a1aad8a 6022
d0ce0e47
SK
6023 folio = alloc_hugetlb_folio(vma, haddr, 0);
6024 if (IS_ERR(folio)) {
4643d67e
MK
6025 /*
6026 * Returning error will result in faulting task being
6027 * sent SIGBUS. The hugetlb fault mutex prevents two
6028 * tasks from racing to fault in the same page which
6029 * could result in false unable to allocate errors.
6030 * Page migration does not take the fault mutex, but
6031 * does a clear then write of pte's under page table
6032 * lock. Page fault code could race with migration,
6033 * notice the clear pte and try to allocate a page
6034 * here. Before returning error, get ptl and make
6035 * sure there really is no pte entry.
6036 */
f9bf6c03 6037 if (hugetlb_pte_stable(h, mm, ptep, old_pte))
d0ce0e47 6038 ret = vmf_error(PTR_ERR(folio));
f9bf6c03
PX
6039 else
6040 ret = 0;
6bda666a
CL
6041 goto out;
6042 }
d0ce0e47
SK
6043 clear_huge_page(&folio->page, address, pages_per_huge_page(h));
6044 __folio_mark_uptodate(folio);
6045 new_folio = true;
ac9b9c66 6046
f83a275d 6047 if (vma->vm_flags & VM_MAYSHARE) {
9b91c0e2 6048 int err = hugetlb_add_to_page_cache(folio, mapping, idx);
6bda666a 6049 if (err) {
3a5497a2
ML
6050 /*
6051 * err can't be -EEXIST which implies someone
6052 * else consumed the reservation since hugetlb
6053 * fault mutex is held when add a hugetlb page
6054 * to the page cache. So it's safe to call
6055 * restore_reserve_on_error() here.
6056 */
d2d7bb44 6057 restore_reserve_on_error(h, vma, haddr, folio);
d0ce0e47 6058 folio_put(folio);
6bda666a
CL
6059 goto out;
6060 }
d0ce0e47 6061 new_pagecache_folio = true;
23be7468 6062 } else {
d0ce0e47 6063 folio_lock(folio);
0fe6e20b
NH
6064 if (unlikely(anon_vma_prepare(vma))) {
6065 ret = VM_FAULT_OOM;
6066 goto backout_unlocked;
6067 }
409eb8c2 6068 anon_rmap = 1;
23be7468 6069 }
0fe6e20b 6070 } else {
998b4382
NH
6071 /*
6072 * If memory error occurs between mmap() and fault, some process
6073 * don't have hwpoisoned swap entry for errored virtual address.
6074 * So we need to block hugepage fault by PG_hwpoison bit check.
6075 */
d0ce0e47 6076 if (unlikely(folio_test_hwpoison(folio))) {
0eb98f15 6077 ret = VM_FAULT_HWPOISON_LARGE |
972dc4de 6078 VM_FAULT_SET_HINDEX(hstate_index(h));
998b4382
NH
6079 goto backout_unlocked;
6080 }
7677f7fd
AR
6081
6082 /* Check for page in userfault range. */
6083 if (userfaultfd_minor(vma)) {
d0ce0e47
SK
6084 folio_unlock(folio);
6085 folio_put(folio);
2ea7ff1e
PX
6086 /* See comment in userfaultfd_missing() block above */
6087 if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
6088 ret = 0;
6089 goto out;
6090 }
6091 return hugetlb_handle_userfault(vma, mapping, idx, flags,
6092 haddr, address,
6093 VM_UFFD_MINOR);
7677f7fd 6094 }
6bda666a 6095 }
1e8f889b 6096
57303d80
AW
6097 /*
6098 * If we are going to COW a private mapping later, we examine the
6099 * pending reservations for this page now. This will ensure that
6100 * any allocations necessary to record that reservation occur outside
6101 * the spinlock.
6102 */
5e911373 6103 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
285b8dca 6104 if (vma_needs_reservation(h, vma, haddr) < 0) {
2b26736c
AW
6105 ret = VM_FAULT_OOM;
6106 goto backout_unlocked;
6107 }
5e911373 6108 /* Just decrements count, does not deallocate */
285b8dca 6109 vma_end_reservation(h, vma, haddr);
5e911373 6110 }
57303d80 6111
8bea8052 6112 ptl = huge_pte_lock(h, mm, ptep);
83c54070 6113 ret = 0;
c64e912c
PX
6114 /* If pte changed from under us, retry */
6115 if (!pte_same(huge_ptep_get(ptep), old_pte))
4c887265
AL
6116 goto backout;
6117
4781593d 6118 if (anon_rmap)
d0ce0e47 6119 hugepage_add_new_anon_rmap(folio, vma, haddr);
4781593d 6120 else
d0ce0e47
SK
6121 page_dup_file_rmap(&folio->page, true);
6122 new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
1e8f889b 6123 && (vma->vm_flags & VM_SHARED)));
c64e912c
PX
6124 /*
6125 * If this pte was previously wr-protected, keep it wr-protected even
6126 * if populated.
6127 */
6128 if (unlikely(pte_marker_uffd_wp(old_pte)))
f1eb1bac 6129 new_pte = huge_pte_mkuffd_wp(new_pte);
935d4f0c 6130 set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h));
1e8f889b 6131
5d317b2b 6132 hugetlb_count_add(pages_per_huge_page(h), mm);
788c7df4 6133 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
1e8f889b 6134 /* Optimization, do the COW without a second fault */
371607a3 6135 ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl);
1e8f889b
DG
6136 }
6137
cb900f41 6138 spin_unlock(ptl);
cb6acd01
MK
6139
6140 /*
d0ce0e47
SK
6141 * Only set hugetlb_migratable in newly allocated pages. Existing pages
6142 * found in the pagecache may not have hugetlb_migratable if they have
8f251a3d 6143 * been isolated for migration.
cb6acd01 6144 */
d0ce0e47
SK
6145 if (new_folio)
6146 folio_set_hugetlb_migratable(folio);
cb6acd01 6147
d0ce0e47 6148 folio_unlock(folio);
4c887265 6149out:
958f32ce
LS
6150 hugetlb_vma_unlock_read(vma);
6151 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ac9b9c66 6152 return ret;
4c887265
AL
6153
6154backout:
cb900f41 6155 spin_unlock(ptl);
2b26736c 6156backout_unlocked:
d0ce0e47 6157 if (new_folio && !new_pagecache_folio)
d2d7bb44 6158 restore_reserve_on_error(h, vma, haddr, folio);
fa27759a 6159
d0ce0e47
SK
6160 folio_unlock(folio);
6161 folio_put(folio);
4c887265 6162 goto out;
ac9b9c66
HD
6163}
6164
8382d914 6165#ifdef CONFIG_SMP
188b04a7 6166u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
8382d914
DB
6167{
6168 unsigned long key[2];
6169 u32 hash;
6170
1b426bac
MK
6171 key[0] = (unsigned long) mapping;
6172 key[1] = idx;
8382d914 6173
55254636 6174 hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
8382d914
DB
6175
6176 return hash & (num_fault_mutexes - 1);
6177}
6178#else
6179/*
6c26d310 6180 * For uniprocessor systems we always use a single mutex, so just
8382d914
DB
6181 * return 0 and avoid the hashing overhead.
6182 */
188b04a7 6183u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
8382d914
DB
6184{
6185 return 0;
6186}
6187#endif
6188
2b740303 6189vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
788c7df4 6190 unsigned long address, unsigned int flags)
86e5216f 6191{
8382d914 6192 pte_t *ptep, entry;
cb900f41 6193 spinlock_t *ptl;
2b740303 6194 vm_fault_t ret;
8382d914
DB
6195 u32 hash;
6196 pgoff_t idx;
061e62e8 6197 struct folio *folio = NULL;
371607a3 6198 struct folio *pagecache_folio = NULL;
a5516438 6199 struct hstate *h = hstate_vma(vma);
8382d914 6200 struct address_space *mapping;
0f792cf9 6201 int need_wait_lock = 0;
285b8dca 6202 unsigned long haddr = address & huge_page_mask(h);
86e5216f 6203
4ec31152
MWO
6204 /* TODO: Handle faults under the VMA lock */
6205 if (flags & FAULT_FLAG_VMA_LOCK) {
6206 vma_end_read(vma);
6207 return VM_FAULT_RETRY;
6208 }
6209
3935baa9
DG
6210 /*
6211 * Serialize hugepage allocation and instantiation, so that we don't
6212 * get spurious allocation failures if two CPUs race to instantiate
6213 * the same page in the page cache.
6214 */
40549ba8
MK
6215 mapping = vma->vm_file->f_mapping;
6216 idx = vma_hugecache_offset(h, vma, haddr);
188b04a7 6217 hash = hugetlb_fault_mutex_hash(mapping, idx);
c672c7f2 6218 mutex_lock(&hugetlb_fault_mutex_table[hash]);
8382d914 6219
40549ba8
MK
6220 /*
6221 * Acquire vma lock before calling huge_pte_alloc and hold
6222 * until finished with ptep. This prevents huge_pmd_unshare from
6223 * being called elsewhere and making the ptep no longer valid.
40549ba8
MK
6224 */
6225 hugetlb_vma_lock_read(vma);
6226 ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
6227 if (!ptep) {
6228 hugetlb_vma_unlock_read(vma);
6229 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6230 return VM_FAULT_OOM;
6231 }
6232
7f2e9525 6233 entry = huge_ptep_get(ptep);
af19487f
AR
6234 if (huge_pte_none_mostly(entry)) {
6235 if (is_pte_marker(entry)) {
6236 pte_marker marker =
6237 pte_marker_get(pte_to_swp_entry(entry));
6238
6239 if (marker & PTE_MARKER_POISONED) {
6240 ret = VM_FAULT_HWPOISON_LARGE;
6241 goto out_mutex;
6242 }
6243 }
6244
958f32ce 6245 /*
af19487f
AR
6246 * Other PTE markers should be handled the same way as none PTE.
6247 *
958f32ce
LS
6248 * hugetlb_no_page will drop vma lock and hugetlb fault
6249 * mutex internally, which make us return immediately.
6250 */
6251 return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
c64e912c 6252 entry, flags);
af19487f 6253 }
86e5216f 6254
83c54070 6255 ret = 0;
1e8f889b 6256
0f792cf9
NH
6257 /*
6258 * entry could be a migration/hwpoison entry at this point, so this
6259 * check prevents the kernel from going below assuming that we have
7c8de358
EP
6260 * an active hugepage in pagecache. This goto expects the 2nd page
6261 * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
6262 * properly handle it.
0f792cf9 6263 */
fcd48540
PX
6264 if (!pte_present(entry)) {
6265 if (unlikely(is_hugetlb_entry_migration(entry))) {
6266 /*
6267 * Release the hugetlb fault lock now, but retain
6268 * the vma lock, because it is needed to guard the
6269 * huge_pte_lockptr() later in
6270 * migration_entry_wait_huge(). The vma lock will
6271 * be released there.
6272 */
6273 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6274 migration_entry_wait_huge(vma, ptep);
6275 return 0;
6276 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
6277 ret = VM_FAULT_HWPOISON_LARGE |
6278 VM_FAULT_SET_HINDEX(hstate_index(h));
0f792cf9 6279 goto out_mutex;
fcd48540 6280 }
0f792cf9 6281
57303d80 6282 /*
c89357e2
DH
6283 * If we are going to COW/unshare the mapping later, we examine the
6284 * pending reservations for this page now. This will ensure that any
57303d80 6285 * allocations necessary to record that reservation occur outside the
1d8d1464
DH
6286 * spinlock. Also lookup the pagecache page now as it is used to
6287 * determine if a reservation has been consumed.
57303d80 6288 */
c89357e2 6289 if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
1d8d1464 6290 !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
285b8dca 6291 if (vma_needs_reservation(h, vma, haddr) < 0) {
2b26736c 6292 ret = VM_FAULT_OOM;
b4d1d99f 6293 goto out_mutex;
2b26736c 6294 }
5e911373 6295 /* Just decrements count, does not deallocate */
285b8dca 6296 vma_end_reservation(h, vma, haddr);
57303d80 6297
a08c7193 6298 pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx);
66dabbb6
CH
6299 if (IS_ERR(pagecache_folio))
6300 pagecache_folio = NULL;
57303d80
AW
6301 }
6302
0f792cf9
NH
6303 ptl = huge_pte_lock(h, mm, ptep);
6304
c89357e2 6305 /* Check for a racing update before calling hugetlb_wp() */
0f792cf9
NH
6306 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
6307 goto out_ptl;
6308
166f3ecc
PX
6309 /* Handle userfault-wp first, before trying to lock more pages */
6310 if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
6311 (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
d61ea1cb
PX
6312 if (!userfaultfd_wp_async(vma)) {
6313 struct vm_fault vmf = {
6314 .vma = vma,
6315 .address = haddr,
6316 .real_address = address,
6317 .flags = flags,
6318 };
166f3ecc 6319
d61ea1cb
PX
6320 spin_unlock(ptl);
6321 if (pagecache_folio) {
6322 folio_unlock(pagecache_folio);
6323 folio_put(pagecache_folio);
6324 }
6325 hugetlb_vma_unlock_read(vma);
6326 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
6327 return handle_userfault(&vmf, VM_UFFD_WP);
166f3ecc 6328 }
d61ea1cb
PX
6329
6330 entry = huge_pte_clear_uffd_wp(entry);
52526ca7
MUA
6331 set_huge_pte_at(mm, haddr, ptep, entry,
6332 huge_page_size(hstate_vma(vma)));
d61ea1cb 6333 /* Fallthrough to CoW */
166f3ecc
PX
6334 }
6335
56c9cfb1 6336 /*
c89357e2 6337 * hugetlb_wp() requires page locks of pte_page(entry) and
371607a3 6338 * pagecache_folio, so here we need take the former one
061e62e8 6339 * when folio != pagecache_folio or !pagecache_folio.
56c9cfb1 6340 */
061e62e8
Z
6341 folio = page_folio(pte_page(entry));
6342 if (folio != pagecache_folio)
6343 if (!folio_trylock(folio)) {
0f792cf9
NH
6344 need_wait_lock = 1;
6345 goto out_ptl;
6346 }
b4d1d99f 6347
061e62e8 6348 folio_get(folio);
b4d1d99f 6349
c89357e2 6350 if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
106c992a 6351 if (!huge_pte_write(entry)) {
c89357e2 6352 ret = hugetlb_wp(mm, vma, address, ptep, flags,
371607a3 6353 pagecache_folio, ptl);
0f792cf9 6354 goto out_put_page;
c89357e2
DH
6355 } else if (likely(flags & FAULT_FLAG_WRITE)) {
6356 entry = huge_pte_mkdirty(entry);
b4d1d99f 6357 }
b4d1d99f
DG
6358 }
6359 entry = pte_mkyoung(entry);
285b8dca 6360 if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
788c7df4 6361 flags & FAULT_FLAG_WRITE))
285b8dca 6362 update_mmu_cache(vma, haddr, ptep);
0f792cf9 6363out_put_page:
061e62e8
Z
6364 if (folio != pagecache_folio)
6365 folio_unlock(folio);
6366 folio_put(folio);
cb900f41
KS
6367out_ptl:
6368 spin_unlock(ptl);
57303d80 6369
371607a3
SK
6370 if (pagecache_folio) {
6371 folio_unlock(pagecache_folio);
6372 folio_put(pagecache_folio);
57303d80 6373 }
b4d1d99f 6374out_mutex:
40549ba8 6375 hugetlb_vma_unlock_read(vma);
c672c7f2 6376 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
0f792cf9
NH
6377 /*
6378 * Generally it's safe to hold refcount during waiting page lock. But
6379 * here we just wait to defer the next page fault to avoid busy loop and
6380 * the page is not used after unlocked before returning from the current
6381 * page fault. So we are safe from accessing freed page, even if we wait
6382 * here without taking refcount.
6383 */
6384 if (need_wait_lock)
061e62e8 6385 folio_wait_locked(folio);
1e8f889b 6386 return ret;
86e5216f
AL
6387}
6388
714c1891 6389#ifdef CONFIG_USERFAULTFD
8fb5debc 6390/*
a734991c
AR
6391 * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
6392 * with modifications for hugetlb pages.
8fb5debc 6393 */
61c50040 6394int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
a734991c
AR
6395 struct vm_area_struct *dst_vma,
6396 unsigned long dst_addr,
6397 unsigned long src_addr,
d9712937 6398 uffd_flags_t flags,
0169fd51 6399 struct folio **foliop)
8fb5debc 6400{
61c50040 6401 struct mm_struct *dst_mm = dst_vma->vm_mm;
d9712937
AR
6402 bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE);
6403 bool wp_enabled = (flags & MFILL_ATOMIC_WP);
8cc5fcbb
MA
6404 struct hstate *h = hstate_vma(dst_vma);
6405 struct address_space *mapping = dst_vma->vm_file->f_mapping;
6406 pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
1e392147 6407 unsigned long size;
1c9e8def 6408 int vm_shared = dst_vma->vm_flags & VM_SHARED;
8fb5debc
MK
6409 pte_t _dst_pte;
6410 spinlock_t *ptl;
8cc5fcbb 6411 int ret = -ENOMEM;
d0ce0e47 6412 struct folio *folio;
f6191471 6413 int writable;
d0ce0e47 6414 bool folio_in_pagecache = false;
8fb5debc 6415
8a13897f
AR
6416 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
6417 ptl = huge_pte_lock(h, dst_mm, dst_pte);
6418
6419 /* Don't overwrite any existing PTEs (even markers) */
6420 if (!huge_pte_none(huge_ptep_get(dst_pte))) {
6421 spin_unlock(ptl);
6422 return -EEXIST;
6423 }
6424
6425 _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
935d4f0c
RR
6426 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte,
6427 huge_page_size(h));
8a13897f
AR
6428
6429 /* No need to invalidate - it was non-present before */
6430 update_mmu_cache(dst_vma, dst_addr, dst_pte);
6431
6432 spin_unlock(ptl);
6433 return 0;
6434 }
6435
f6191471
AR
6436 if (is_continue) {
6437 ret = -EFAULT;
a08c7193 6438 folio = filemap_lock_hugetlb_folio(h, mapping, idx);
66dabbb6 6439 if (IS_ERR(folio))
f6191471 6440 goto out;
d0ce0e47 6441 folio_in_pagecache = true;
0169fd51
Z
6442 } else if (!*foliop) {
6443 /* If a folio already exists, then it's UFFDIO_COPY for
d84cf06e
MA
6444 * a non-missing case. Return -EEXIST.
6445 */
6446 if (vm_shared &&
6447 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
6448 ret = -EEXIST;
6449 goto out;
6450 }
6451
d0ce0e47
SK
6452 folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
6453 if (IS_ERR(folio)) {
d84cf06e 6454 ret = -ENOMEM;
8fb5debc 6455 goto out;
d84cf06e 6456 }
8fb5debc 6457
e87340ca
Z
6458 ret = copy_folio_from_user(folio, (const void __user *) src_addr,
6459 false);
8fb5debc 6460
c1e8d7c6 6461 /* fallback to copy_from_user outside mmap_lock */
8fb5debc 6462 if (unlikely(ret)) {
9e368259 6463 ret = -ENOENT;
d0ce0e47 6464 /* Free the allocated folio which may have
8cc5fcbb
MA
6465 * consumed a reservation.
6466 */
d2d7bb44 6467 restore_reserve_on_error(h, dst_vma, dst_addr, folio);
d0ce0e47 6468 folio_put(folio);
8cc5fcbb 6469
d0ce0e47 6470 /* Allocate a temporary folio to hold the copied
8cc5fcbb
MA
6471 * contents.
6472 */
d0ce0e47
SK
6473 folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr);
6474 if (!folio) {
8cc5fcbb
MA
6475 ret = -ENOMEM;
6476 goto out;
6477 }
0169fd51
Z
6478 *foliop = folio;
6479 /* Set the outparam foliop and return to the caller to
8cc5fcbb 6480 * copy the contents outside the lock. Don't free the
0169fd51 6481 * folio.
8cc5fcbb 6482 */
8fb5debc
MK
6483 goto out;
6484 }
6485 } else {
8cc5fcbb
MA
6486 if (vm_shared &&
6487 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
0169fd51 6488 folio_put(*foliop);
8cc5fcbb 6489 ret = -EEXIST;
0169fd51 6490 *foliop = NULL;
8cc5fcbb
MA
6491 goto out;
6492 }
6493
d0ce0e47
SK
6494 folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
6495 if (IS_ERR(folio)) {
0169fd51 6496 folio_put(*foliop);
8cc5fcbb 6497 ret = -ENOMEM;
0169fd51 6498 *foliop = NULL;
8cc5fcbb
MA
6499 goto out;
6500 }
1cb9dc4b 6501 ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
0169fd51
Z
6502 folio_put(*foliop);
6503 *foliop = NULL;
1cb9dc4b
LS
6504 if (ret) {
6505 folio_put(folio);
8cc5fcbb
MA
6506 goto out;
6507 }
8fb5debc
MK
6508 }
6509
6510 /*
d0ce0e47 6511 * The memory barrier inside __folio_mark_uptodate makes sure that
8fb5debc
MK
6512 * preceding stores to the page contents become visible before
6513 * the set_pte_at() write.
6514 */
d0ce0e47 6515 __folio_mark_uptodate(folio);
8fb5debc 6516
f6191471
AR
6517 /* Add shared, newly allocated pages to the page cache. */
6518 if (vm_shared && !is_continue) {
1e392147
AA
6519 size = i_size_read(mapping->host) >> huge_page_shift(h);
6520 ret = -EFAULT;
6521 if (idx >= size)
6522 goto out_release_nounlock;
1c9e8def 6523
1e392147
AA
6524 /*
6525 * Serialization between remove_inode_hugepages() and
7e1813d4 6526 * hugetlb_add_to_page_cache() below happens through the
1e392147
AA
6527 * hugetlb_fault_mutex_table that here must be hold by
6528 * the caller.
6529 */
9b91c0e2 6530 ret = hugetlb_add_to_page_cache(folio, mapping, idx);
1c9e8def
MK
6531 if (ret)
6532 goto out_release_nounlock;
d0ce0e47 6533 folio_in_pagecache = true;
1c9e8def
MK
6534 }
6535
bcc66543 6536 ptl = huge_pte_lock(h, dst_mm, dst_pte);
8fb5debc 6537
8625147c 6538 ret = -EIO;
d0ce0e47 6539 if (folio_test_hwpoison(folio))
8625147c
JH
6540 goto out_release_unlock;
6541
6041c691
PX
6542 /*
6543 * We allow to overwrite a pte marker: consider when both MISSING|WP
6544 * registered, we firstly wr-protect a none pte which has no page cache
6545 * page backing it, then access the page.
6546 */
fa27759a 6547 ret = -EEXIST;
6041c691 6548 if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
8fb5debc
MK
6549 goto out_release_unlock;
6550
d0ce0e47
SK
6551 if (folio_in_pagecache)
6552 page_dup_file_rmap(&folio->page, true);
4781593d 6553 else
d0ce0e47 6554 hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr);
8fb5debc 6555
6041c691
PX
6556 /*
6557 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
6558 * with wp flag set, don't set pte write bit.
6559 */
d9712937 6560 if (wp_enabled || (is_continue && !vm_shared))
f6191471
AR
6561 writable = 0;
6562 else
6563 writable = dst_vma->vm_flags & VM_WRITE;
6564
d0ce0e47 6565 _dst_pte = make_huge_pte(dst_vma, &folio->page, writable);
6041c691
PX
6566 /*
6567 * Always mark UFFDIO_COPY page dirty; note that this may not be
6568 * extremely important for hugetlbfs for now since swapping is not
6569 * supported, but we should still be clear in that this page cannot be
6570 * thrown away at will, even if write bit not set.
6571 */
6572 _dst_pte = huge_pte_mkdirty(_dst_pte);
8fb5debc
MK
6573 _dst_pte = pte_mkyoung(_dst_pte);
6574
d9712937 6575 if (wp_enabled)
6041c691
PX
6576 _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
6577
935d4f0c 6578 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, huge_page_size(h));
8fb5debc 6579
8fb5debc
MK
6580 hugetlb_count_add(pages_per_huge_page(h), dst_mm);
6581
6582 /* No need to invalidate - it was non-present before */
6583 update_mmu_cache(dst_vma, dst_addr, dst_pte);
6584
6585 spin_unlock(ptl);
f6191471 6586 if (!is_continue)
d0ce0e47 6587 folio_set_hugetlb_migratable(folio);
f6191471 6588 if (vm_shared || is_continue)
d0ce0e47 6589 folio_unlock(folio);
8fb5debc
MK
6590 ret = 0;
6591out:
6592 return ret;
6593out_release_unlock:
6594 spin_unlock(ptl);
f6191471 6595 if (vm_shared || is_continue)
d0ce0e47 6596 folio_unlock(folio);
5af10dfd 6597out_release_nounlock:
d0ce0e47 6598 if (!folio_in_pagecache)
d2d7bb44 6599 restore_reserve_on_error(h, dst_vma, dst_addr, folio);
d0ce0e47 6600 folio_put(folio);
8fb5debc
MK
6601 goto out;
6602}
714c1891 6603#endif /* CONFIG_USERFAULTFD */
8fb5debc 6604
57a196a5 6605struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
5502ea44
PX
6606 unsigned long address, unsigned int flags,
6607 unsigned int *page_mask)
57a196a5
MK
6608{
6609 struct hstate *h = hstate_vma(vma);
6610 struct mm_struct *mm = vma->vm_mm;
6611 unsigned long haddr = address & huge_page_mask(h);
6612 struct page *page = NULL;
6613 spinlock_t *ptl;
6614 pte_t *pte, entry;
458568c9 6615 int ret;
57a196a5 6616
7d049f3a 6617 hugetlb_vma_lock_read(vma);
9c67a207 6618 pte = hugetlb_walk(vma, haddr, huge_page_size(h));
57a196a5 6619 if (!pte)
7d049f3a 6620 goto out_unlock;
57a196a5
MK
6621
6622 ptl = huge_pte_lock(h, mm, pte);
6623 entry = huge_ptep_get(pte);
6624 if (pte_present(entry)) {
458568c9
PX
6625 page = pte_page(entry);
6626
6627 if (!huge_pte_write(entry)) {
6628 if (flags & FOLL_WRITE) {
6629 page = NULL;
6630 goto out;
6631 }
6632
6633 if (gup_must_unshare(vma, flags, page)) {
6634 /* Tell the caller to do unsharing */
6635 page = ERR_PTR(-EMLINK);
6636 goto out;
6637 }
6638 }
6639
426056ef 6640 page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT));
458568c9 6641
57a196a5
MK
6642 /*
6643 * Note that page may be a sub-page, and with vmemmap
6644 * optimizations the page struct may be read only.
6645 * try_grab_page() will increase the ref count on the
6646 * head page, so this will be OK.
6647 *
e2ca6ba6
LT
6648 * try_grab_page() should always be able to get the page here,
6649 * because we hold the ptl lock and have verified pte_present().
57a196a5 6650 */
458568c9
PX
6651 ret = try_grab_page(page, flags);
6652
6653 if (WARN_ON_ONCE(ret)) {
6654 page = ERR_PTR(ret);
57a196a5
MK
6655 goto out;
6656 }
5502ea44
PX
6657
6658 *page_mask = (1U << huge_page_order(h)) - 1;
57a196a5
MK
6659 }
6660out:
6661 spin_unlock(ptl);
7d049f3a
PX
6662out_unlock:
6663 hugetlb_vma_unlock_read(vma);
dd767aaa
PX
6664
6665 /*
6666 * Fixup retval for dump requests: if pagecache doesn't exist,
6667 * don't try to allocate a new page but just skip it.
6668 */
6669 if (!page && (flags & FOLL_DUMP) &&
6670 !hugetlbfs_pagecache_present(h, vma, address))
6671 page = ERR_PTR(-EFAULT);
6672
57a196a5
MK
6673 return page;
6674}
6675
a79390f5 6676long hugetlb_change_protection(struct vm_area_struct *vma,
5a90d5a1
PX
6677 unsigned long address, unsigned long end,
6678 pgprot_t newprot, unsigned long cp_flags)
8f860591
ZY
6679{
6680 struct mm_struct *mm = vma->vm_mm;
6681 unsigned long start = address;
6682 pte_t *ptep;
6683 pte_t pte;
a5516438 6684 struct hstate *h = hstate_vma(vma);
a79390f5 6685 long pages = 0, psize = huge_page_size(h);
dff11abe 6686 bool shared_pmd = false;
ac46d4f3 6687 struct mmu_notifier_range range;
e95a9851 6688 unsigned long last_addr_mask;
5a90d5a1
PX
6689 bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
6690 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
dff11abe
MK
6691
6692 /*
6693 * In the case of shared PMDs, the area to flush could be beyond
ac46d4f3 6694 * start/end. Set range.start/range.end to cover the maximum possible
dff11abe
MK
6695 * range if PMD sharing is possible.
6696 */
7269f999 6697 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
7d4a8be0 6698 0, mm, start, end);
ac46d4f3 6699 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
8f860591
ZY
6700
6701 BUG_ON(address >= end);
ac46d4f3 6702 flush_cache_range(vma, range.start, range.end);
8f860591 6703
ac46d4f3 6704 mmu_notifier_invalidate_range_start(&range);
40549ba8 6705 hugetlb_vma_lock_write(vma);
83cde9e8 6706 i_mmap_lock_write(vma->vm_file->f_mapping);
40549ba8 6707 last_addr_mask = hugetlb_mask_last_page(h);
60dfaad6 6708 for (; address < end; address += psize) {
cb900f41 6709 spinlock_t *ptl;
9c67a207 6710 ptep = hugetlb_walk(vma, address, psize);
e95a9851 6711 if (!ptep) {
fed15f13
PX
6712 if (!uffd_wp) {
6713 address |= last_addr_mask;
6714 continue;
6715 }
6716 /*
6717 * Userfaultfd wr-protect requires pgtable
6718 * pre-allocations to install pte markers.
6719 */
6720 ptep = huge_pte_alloc(mm, vma, address, psize);
d1751118
PX
6721 if (!ptep) {
6722 pages = -ENOMEM;
fed15f13 6723 break;
d1751118 6724 }
e95a9851 6725 }
cb900f41 6726 ptl = huge_pte_lock(h, mm, ptep);
4ddb4d91 6727 if (huge_pmd_unshare(mm, vma, address, ptep)) {
60dfaad6
PX
6728 /*
6729 * When uffd-wp is enabled on the vma, unshare
6730 * shouldn't happen at all. Warn about it if it
6731 * happened due to some reason.
6732 */
6733 WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
7da4d641 6734 pages++;
cb900f41 6735 spin_unlock(ptl);
dff11abe 6736 shared_pmd = true;
4ddb4d91 6737 address |= last_addr_mask;
39dde65c 6738 continue;
7da4d641 6739 }
a8bda28d
NH
6740 pte = huge_ptep_get(ptep);
6741 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
0e678153
DH
6742 /* Nothing to do. */
6743 } else if (unlikely(is_hugetlb_entry_migration(pte))) {
a8bda28d 6744 swp_entry_t entry = pte_to_swp_entry(pte);
6c287605 6745 struct page *page = pfn_swap_entry_to_page(entry);
44f86392 6746 pte_t newpte = pte;
a8bda28d 6747
44f86392 6748 if (is_writable_migration_entry(entry)) {
6c287605
DH
6749 if (PageAnon(page))
6750 entry = make_readable_exclusive_migration_entry(
6751 swp_offset(entry));
6752 else
6753 entry = make_readable_migration_entry(
6754 swp_offset(entry));
a8bda28d 6755 newpte = swp_entry_to_pte(entry);
a8bda28d
NH
6756 pages++;
6757 }
44f86392
DH
6758
6759 if (uffd_wp)
6760 newpte = pte_swp_mkuffd_wp(newpte);
6761 else if (uffd_wp_resolve)
6762 newpte = pte_swp_clear_uffd_wp(newpte);
6763 if (!pte_same(pte, newpte))
935d4f0c 6764 set_huge_pte_at(mm, address, ptep, newpte, psize);
0e678153
DH
6765 } else if (unlikely(is_pte_marker(pte))) {
6766 /* No other markers apply for now. */
6767 WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
60dfaad6 6768 if (uffd_wp_resolve)
0e678153 6769 /* Safe to modify directly (non-present->none). */
60dfaad6 6770 huge_pte_clear(mm, address, ptep, psize);
0e678153 6771 } else if (!huge_pte_none(pte)) {
023bdd00 6772 pte_t old_pte;
79c1c594 6773 unsigned int shift = huge_page_shift(hstate_vma(vma));
023bdd00
AK
6774
6775 old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
16785bd7 6776 pte = huge_pte_modify(old_pte, newprot);
79c1c594 6777 pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
5a90d5a1 6778 if (uffd_wp)
f1eb1bac 6779 pte = huge_pte_mkuffd_wp(pte);
5a90d5a1
PX
6780 else if (uffd_wp_resolve)
6781 pte = huge_pte_clear_uffd_wp(pte);
023bdd00 6782 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
7da4d641 6783 pages++;
60dfaad6
PX
6784 } else {
6785 /* None pte */
6786 if (unlikely(uffd_wp))
6787 /* Safe to modify directly (none->non-present). */
6788 set_huge_pte_at(mm, address, ptep,
935d4f0c
RR
6789 make_pte_marker(PTE_MARKER_UFFD_WP),
6790 psize);
8f860591 6791 }
cb900f41 6792 spin_unlock(ptl);
8f860591 6793 }
d833352a 6794 /*
c8c06efa 6795 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
d833352a 6796 * may have cleared our pud entry and done put_page on the page table:
c8c06efa 6797 * once we release i_mmap_rwsem, another task can do the final put_page
dff11abe
MK
6798 * and that page table be reused and filled with junk. If we actually
6799 * did unshare a page of pmds, flush the range corresponding to the pud.
d833352a 6800 */
dff11abe 6801 if (shared_pmd)
ac46d4f3 6802 flush_hugetlb_tlb_range(vma, range.start, range.end);
dff11abe
MK
6803 else
6804 flush_hugetlb_tlb_range(vma, start, end);
0f10851e 6805 /*
1af5a810
AP
6806 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
6807 * downgrading page table protection not changing it to point to a new
6808 * page.
0f10851e 6809 *
ee65728e 6810 * See Documentation/mm/mmu_notifier.rst
0f10851e 6811 */
83cde9e8 6812 i_mmap_unlock_write(vma->vm_file->f_mapping);
40549ba8 6813 hugetlb_vma_unlock_write(vma);
ac46d4f3 6814 mmu_notifier_invalidate_range_end(&range);
7da4d641 6815
d1751118 6816 return pages > 0 ? (pages << h->order) : pages;
8f860591
ZY
6817}
6818
33b8f84a
MK
6819/* Return true if reservation was successful, false otherwise. */
6820bool hugetlb_reserve_pages(struct inode *inode,
a1e78772 6821 long from, long to,
5a6fe125 6822 struct vm_area_struct *vma,
ca16d140 6823 vm_flags_t vm_flags)
e4e574b7 6824{
c5094ec7 6825 long chg = -1, add = -1;
a5516438 6826 struct hstate *h = hstate_inode(inode);
90481622 6827 struct hugepage_subpool *spool = subpool_inode(inode);
9119a41e 6828 struct resv_map *resv_map;
075a61d0 6829 struct hugetlb_cgroup *h_cg = NULL;
0db9d74e 6830 long gbl_reserve, regions_needed = 0;
e4e574b7 6831
63489f8e
MK
6832 /* This should never happen */
6833 if (from > to) {
6834 VM_WARN(1, "%s called with a negative range\n", __func__);
33b8f84a 6835 return false;
63489f8e
MK
6836 }
6837
8d9bfb26 6838 /*
e700898f
MK
6839 * vma specific semaphore used for pmd sharing and fault/truncation
6840 * synchronization
8d9bfb26
MK
6841 */
6842 hugetlb_vma_lock_alloc(vma);
6843
17c9d12e
MG
6844 /*
6845 * Only apply hugepage reservation if asked. At fault time, an
6846 * attempt will be made for VM_NORESERVE to allocate a page
90481622 6847 * without using reserves
17c9d12e 6848 */
ca16d140 6849 if (vm_flags & VM_NORESERVE)
33b8f84a 6850 return true;
17c9d12e 6851
a1e78772
MG
6852 /*
6853 * Shared mappings base their reservation on the number of pages that
6854 * are already allocated on behalf of the file. Private mappings need
6855 * to reserve the full area even if read-only as mprotect() may be
6856 * called to make the mapping read-write. Assume !vma is a shm mapping
6857 */
9119a41e 6858 if (!vma || vma->vm_flags & VM_MAYSHARE) {
f27a5136
MK
6859 /*
6860 * resv_map can not be NULL as hugetlb_reserve_pages is only
6861 * called for inodes for which resv_maps were created (see
6862 * hugetlbfs_get_inode).
6863 */
4e35f483 6864 resv_map = inode_resv_map(inode);
9119a41e 6865
0db9d74e 6866 chg = region_chg(resv_map, from, to, &regions_needed);
9119a41e 6867 } else {
e9fe92ae 6868 /* Private mapping. */
9119a41e 6869 resv_map = resv_map_alloc();
17c9d12e 6870 if (!resv_map)
8d9bfb26 6871 goto out_err;
17c9d12e 6872
a1e78772 6873 chg = to - from;
84afd99b 6874
17c9d12e
MG
6875 set_vma_resv_map(vma, resv_map);
6876 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
6877 }
6878
33b8f84a 6879 if (chg < 0)
c50ac050 6880 goto out_err;
8a630112 6881
33b8f84a
MK
6882 if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
6883 chg * pages_per_huge_page(h), &h_cg) < 0)
075a61d0 6884 goto out_err;
075a61d0
MA
6885
6886 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
6887 /* For private mappings, the hugetlb_cgroup uncharge info hangs
6888 * of the resv_map.
6889 */
6890 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
6891 }
6892
1c5ecae3
MK
6893 /*
6894 * There must be enough pages in the subpool for the mapping. If
6895 * the subpool has a minimum size, there may be some global
6896 * reservations already in place (gbl_reserve).
6897 */
6898 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
33b8f84a 6899 if (gbl_reserve < 0)
075a61d0 6900 goto out_uncharge_cgroup;
5a6fe125
MG
6901
6902 /*
17c9d12e 6903 * Check enough hugepages are available for the reservation.
90481622 6904 * Hand the pages back to the subpool if there are not
5a6fe125 6905 */
33b8f84a 6906 if (hugetlb_acct_memory(h, gbl_reserve) < 0)
075a61d0 6907 goto out_put_pages;
17c9d12e
MG
6908
6909 /*
6910 * Account for the reservations made. Shared mappings record regions
6911 * that have reservations as they are shared by multiple VMAs.
6912 * When the last VMA disappears, the region map says how much
6913 * the reservation was and the page cache tells how much of
6914 * the reservation was consumed. Private mappings are per-VMA and
6915 * only the consumed reservations are tracked. When the VMA
6916 * disappears, the original reservation is the VMA size and the
6917 * consumed reservations are stored in the map. Hence, nothing
6918 * else has to be done for private mappings here
6919 */
33039678 6920 if (!vma || vma->vm_flags & VM_MAYSHARE) {
075a61d0 6921 add = region_add(resv_map, from, to, regions_needed, h, h_cg);
0db9d74e
MA
6922
6923 if (unlikely(add < 0)) {
6924 hugetlb_acct_memory(h, -gbl_reserve);
075a61d0 6925 goto out_put_pages;
0db9d74e 6926 } else if (unlikely(chg > add)) {
33039678
MK
6927 /*
6928 * pages in this range were added to the reserve
6929 * map between region_chg and region_add. This
d0ce0e47 6930 * indicates a race with alloc_hugetlb_folio. Adjust
33039678
MK
6931 * the subpool and reserve counts modified above
6932 * based on the difference.
6933 */
6934 long rsv_adjust;
6935
d85aecf2
ML
6936 /*
6937 * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
6938 * reference to h_cg->css. See comment below for detail.
6939 */
075a61d0
MA
6940 hugetlb_cgroup_uncharge_cgroup_rsvd(
6941 hstate_index(h),
6942 (chg - add) * pages_per_huge_page(h), h_cg);
6943
33039678
MK
6944 rsv_adjust = hugepage_subpool_put_pages(spool,
6945 chg - add);
6946 hugetlb_acct_memory(h, -rsv_adjust);
d85aecf2
ML
6947 } else if (h_cg) {
6948 /*
6949 * The file_regions will hold their own reference to
6950 * h_cg->css. So we should release the reference held
6951 * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
6952 * done.
6953 */
6954 hugetlb_cgroup_put_rsvd_cgroup(h_cg);
33039678
MK
6955 }
6956 }
33b8f84a
MK
6957 return true;
6958
075a61d0
MA
6959out_put_pages:
6960 /* put back original number of pages, chg */
6961 (void)hugepage_subpool_put_pages(spool, chg);
6962out_uncharge_cgroup:
6963 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
6964 chg * pages_per_huge_page(h), h_cg);
c50ac050 6965out_err:
8d9bfb26 6966 hugetlb_vma_lock_free(vma);
5e911373 6967 if (!vma || vma->vm_flags & VM_MAYSHARE)
0db9d74e
MA
6968 /* Only call region_abort if the region_chg succeeded but the
6969 * region_add failed or didn't run.
6970 */
6971 if (chg >= 0 && add < 0)
6972 region_abort(resv_map, from, to, regions_needed);
92fe9dcb 6973 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
f031dd27 6974 kref_put(&resv_map->refs, resv_map_release);
92fe9dcb
RR
6975 set_vma_resv_map(vma, NULL);
6976 }
33b8f84a 6977 return false;
a43a8c39
CK
6978}
6979
b5cec28d
MK
6980long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
6981 long freed)
a43a8c39 6982{
a5516438 6983 struct hstate *h = hstate_inode(inode);
4e35f483 6984 struct resv_map *resv_map = inode_resv_map(inode);
9119a41e 6985 long chg = 0;
90481622 6986 struct hugepage_subpool *spool = subpool_inode(inode);
1c5ecae3 6987 long gbl_reserve;
45c682a6 6988
f27a5136
MK
6989 /*
6990 * Since this routine can be called in the evict inode path for all
6991 * hugetlbfs inodes, resv_map could be NULL.
6992 */
b5cec28d
MK
6993 if (resv_map) {
6994 chg = region_del(resv_map, start, end);
6995 /*
6996 * region_del() can fail in the rare case where a region
6997 * must be split and another region descriptor can not be
6998 * allocated. If end == LONG_MAX, it will not fail.
6999 */
7000 if (chg < 0)
7001 return chg;
7002 }
7003
45c682a6 7004 spin_lock(&inode->i_lock);
e4c6f8be 7005 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
45c682a6
KC
7006 spin_unlock(&inode->i_lock);
7007
1c5ecae3
MK
7008 /*
7009 * If the subpool has a minimum size, the number of global
7010 * reservations to be released may be adjusted.
dddf31a4
ML
7011 *
7012 * Note that !resv_map implies freed == 0. So (chg - freed)
7013 * won't go negative.
1c5ecae3
MK
7014 */
7015 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
7016 hugetlb_acct_memory(h, -gbl_reserve);
b5cec28d
MK
7017
7018 return 0;
a43a8c39 7019}
93f70f90 7020
3212b535
SC
7021#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
7022static unsigned long page_table_shareable(struct vm_area_struct *svma,
7023 struct vm_area_struct *vma,
7024 unsigned long addr, pgoff_t idx)
7025{
7026 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
7027 svma->vm_start;
7028 unsigned long sbase = saddr & PUD_MASK;
7029 unsigned long s_end = sbase + PUD_SIZE;
7030
7031 /* Allow segments to share if only one is marked locked */
e430a95a
SB
7032 unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
7033 unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
3212b535
SC
7034
7035 /*
7036 * match the virtual addresses, permission and the alignment of the
7037 * page table page.
131a79b4
MK
7038 *
7039 * Also, vma_lock (vm_private_data) is required for sharing.
3212b535
SC
7040 */
7041 if (pmd_index(addr) != pmd_index(saddr) ||
7042 vm_flags != svm_flags ||
131a79b4
MK
7043 !range_in_vma(svma, sbase, s_end) ||
7044 !svma->vm_private_data)
3212b535
SC
7045 return 0;
7046
7047 return saddr;
7048}
7049
bbff39cc 7050bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
3212b535 7051{
bbff39cc
MK
7052 unsigned long start = addr & PUD_MASK;
7053 unsigned long end = start + PUD_SIZE;
7054
8d9bfb26
MK
7055#ifdef CONFIG_USERFAULTFD
7056 if (uffd_disable_huge_pmd_share(vma))
7057 return false;
7058#endif
3212b535
SC
7059 /*
7060 * check on proper vm_flags and page table alignment
7061 */
8d9bfb26
MK
7062 if (!(vma->vm_flags & VM_MAYSHARE))
7063 return false;
bbff39cc 7064 if (!vma->vm_private_data) /* vma lock required for sharing */
8d9bfb26
MK
7065 return false;
7066 if (!range_in_vma(vma, start, end))
7067 return false;
7068 return true;
7069}
7070
017b1660
MK
7071/*
7072 * Determine if start,end range within vma could be mapped by shared pmd.
7073 * If yes, adjust start and end to cover range associated with possible
7074 * shared pmd mappings.
7075 */
7076void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
7077 unsigned long *start, unsigned long *end)
7078{
a1ba9da8
LX
7079 unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
7080 v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
017b1660 7081
a1ba9da8 7082 /*
f0953a1b
IM
7083 * vma needs to span at least one aligned PUD size, and the range
7084 * must be at least partially within in.
a1ba9da8
LX
7085 */
7086 if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
7087 (*end <= v_start) || (*start >= v_end))
017b1660
MK
7088 return;
7089
75802ca6 7090 /* Extend the range to be PUD aligned for a worst case scenario */
a1ba9da8
LX
7091 if (*start > v_start)
7092 *start = ALIGN_DOWN(*start, PUD_SIZE);
017b1660 7093
a1ba9da8
LX
7094 if (*end < v_end)
7095 *end = ALIGN(*end, PUD_SIZE);
017b1660
MK
7096}
7097
3212b535
SC
7098/*
7099 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
7100 * and returns the corresponding pte. While this is not necessary for the
7101 * !shared pmd case because we can allocate the pmd later as well, it makes the
3a47c54f
MK
7102 * code much cleaner. pmd allocation is essential for the shared case because
7103 * pud has to be populated inside the same i_mmap_rwsem section - otherwise
7104 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
7105 * bad pmd for sharing.
3212b535 7106 */
aec44e0f
PX
7107pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
7108 unsigned long addr, pud_t *pud)
3212b535 7109{
3212b535
SC
7110 struct address_space *mapping = vma->vm_file->f_mapping;
7111 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
7112 vma->vm_pgoff;
7113 struct vm_area_struct *svma;
7114 unsigned long saddr;
7115 pte_t *spte = NULL;
7116 pte_t *pte;
7117
3a47c54f 7118 i_mmap_lock_read(mapping);
3212b535
SC
7119 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
7120 if (svma == vma)
7121 continue;
7122
7123 saddr = page_table_shareable(svma, vma, addr, idx);
7124 if (saddr) {
9c67a207
PX
7125 spte = hugetlb_walk(svma, saddr,
7126 vma_mmu_pagesize(svma));
3212b535
SC
7127 if (spte) {
7128 get_page(virt_to_page(spte));
7129 break;
7130 }
7131 }
7132 }
7133
7134 if (!spte)
7135 goto out;
7136
349d1670 7137 spin_lock(&mm->page_table_lock);
dc6c9a35 7138 if (pud_none(*pud)) {
3212b535
SC
7139 pud_populate(mm, pud,
7140 (pmd_t *)((unsigned long)spte & PAGE_MASK));
c17b1f42 7141 mm_inc_nr_pmds(mm);
dc6c9a35 7142 } else {
3212b535 7143 put_page(virt_to_page(spte));
dc6c9a35 7144 }
349d1670 7145 spin_unlock(&mm->page_table_lock);
3212b535
SC
7146out:
7147 pte = (pte_t *)pmd_alloc(mm, pud, addr);
3a47c54f 7148 i_mmap_unlock_read(mapping);
3212b535
SC
7149 return pte;
7150}
7151
7152/*
7153 * unmap huge page backed by shared pte.
7154 *
7155 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
7156 * indicated by page_count > 1, unmap is achieved by clearing pud and
7157 * decrementing the ref count. If count == 1, the pte page is not shared.
7158 *
3a47c54f 7159 * Called with page table lock held.
3212b535
SC
7160 *
7161 * returns: 1 successfully unmapped a shared pte page
7162 * 0 the underlying pte page is not shared, or it is the last user
7163 */
34ae204f 7164int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
4ddb4d91 7165 unsigned long addr, pte_t *ptep)
3212b535 7166{
4ddb4d91
MK
7167 pgd_t *pgd = pgd_offset(mm, addr);
7168 p4d_t *p4d = p4d_offset(pgd, addr);
7169 pud_t *pud = pud_offset(p4d, addr);
3212b535 7170
34ae204f 7171 i_mmap_assert_write_locked(vma->vm_file->f_mapping);
40549ba8 7172 hugetlb_vma_assert_locked(vma);
3212b535
SC
7173 BUG_ON(page_count(virt_to_page(ptep)) == 0);
7174 if (page_count(virt_to_page(ptep)) == 1)
7175 return 0;
7176
7177 pud_clear(pud);
7178 put_page(virt_to_page(ptep));
dc6c9a35 7179 mm_dec_nr_pmds(mm);
3212b535
SC
7180 return 1;
7181}
c1991e07 7182
9e5fc74c 7183#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
8d9bfb26 7184
aec44e0f
PX
7185pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
7186 unsigned long addr, pud_t *pud)
9e5fc74c
SC
7187{
7188 return NULL;
7189}
e81f2d22 7190
34ae204f 7191int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
4ddb4d91 7192 unsigned long addr, pte_t *ptep)
e81f2d22
ZZ
7193{
7194 return 0;
7195}
017b1660
MK
7196
7197void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
7198 unsigned long *start, unsigned long *end)
7199{
7200}
c1991e07
PX
7201
7202bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
7203{
7204 return false;
7205}
3212b535
SC
7206#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
7207
9e5fc74c 7208#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
aec44e0f 7209pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
9e5fc74c
SC
7210 unsigned long addr, unsigned long sz)
7211{
7212 pgd_t *pgd;
c2febafc 7213 p4d_t *p4d;
9e5fc74c
SC
7214 pud_t *pud;
7215 pte_t *pte = NULL;
7216
7217 pgd = pgd_offset(mm, addr);
f4f0a3d8
KS
7218 p4d = p4d_alloc(mm, pgd, addr);
7219 if (!p4d)
7220 return NULL;
c2febafc 7221 pud = pud_alloc(mm, p4d, addr);
9e5fc74c
SC
7222 if (pud) {
7223 if (sz == PUD_SIZE) {
7224 pte = (pte_t *)pud;
7225 } else {
7226 BUG_ON(sz != PMD_SIZE);
c1991e07 7227 if (want_pmd_share(vma, addr) && pud_none(*pud))
aec44e0f 7228 pte = huge_pmd_share(mm, vma, addr, pud);
9e5fc74c
SC
7229 else
7230 pte = (pte_t *)pmd_alloc(mm, pud, addr);
7231 }
7232 }
191fcdb6
JH
7233
7234 if (pte) {
7235 pte_t pteval = ptep_get_lockless(pte);
7236
7237 BUG_ON(pte_present(pteval) && !pte_huge(pteval));
7238 }
9e5fc74c
SC
7239
7240 return pte;
7241}
7242
9b19df29
PA
7243/*
7244 * huge_pte_offset() - Walk the page table to resolve the hugepage
7245 * entry at address @addr
7246 *
8ac0b81a
LX
7247 * Return: Pointer to page table entry (PUD or PMD) for
7248 * address @addr, or NULL if a !p*d_present() entry is encountered and the
9b19df29
PA
7249 * size @sz doesn't match the hugepage size at this level of the page
7250 * table.
7251 */
7868a208
PA
7252pte_t *huge_pte_offset(struct mm_struct *mm,
7253 unsigned long addr, unsigned long sz)
9e5fc74c
SC
7254{
7255 pgd_t *pgd;
c2febafc 7256 p4d_t *p4d;
8ac0b81a
LX
7257 pud_t *pud;
7258 pmd_t *pmd;
9e5fc74c
SC
7259
7260 pgd = pgd_offset(mm, addr);
c2febafc
KS
7261 if (!pgd_present(*pgd))
7262 return NULL;
7263 p4d = p4d_offset(pgd, addr);
7264 if (!p4d_present(*p4d))
7265 return NULL;
9b19df29 7266
c2febafc 7267 pud = pud_offset(p4d, addr);
8ac0b81a
LX
7268 if (sz == PUD_SIZE)
7269 /* must be pud huge, non-present or none */
c2febafc 7270 return (pte_t *)pud;
8ac0b81a 7271 if (!pud_present(*pud))
9b19df29 7272 return NULL;
8ac0b81a 7273 /* must have a valid entry and size to go further */
9b19df29 7274
8ac0b81a
LX
7275 pmd = pmd_offset(pud, addr);
7276 /* must be pmd huge, non-present or none */
7277 return (pte_t *)pmd;
9e5fc74c
SC
7278}
7279
e95a9851
MK
7280/*
7281 * Return a mask that can be used to update an address to the last huge
7282 * page in a page table page mapping size. Used to skip non-present
7283 * page table entries when linearly scanning address ranges. Architectures
7284 * with unique huge page to page table relationships can define their own
7285 * version of this routine.
7286 */
7287unsigned long hugetlb_mask_last_page(struct hstate *h)
7288{
7289 unsigned long hp_size = huge_page_size(h);
7290
7291 if (hp_size == PUD_SIZE)
7292 return P4D_SIZE - PUD_SIZE;
7293 else if (hp_size == PMD_SIZE)
7294 return PUD_SIZE - PMD_SIZE;
7295 else
7296 return 0UL;
7297}
7298
7299#else
7300
7301/* See description above. Architectures can provide their own version. */
7302__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
7303{
4ddb4d91
MK
7304#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
7305 if (huge_page_size(h) == PMD_SIZE)
7306 return PUD_SIZE - PMD_SIZE;
7307#endif
e95a9851
MK
7308 return 0UL;
7309}
7310
61f77eda
NH
7311#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
7312
7313/*
7314 * These functions are overwritable if your architecture needs its own
7315 * behavior.
7316 */
9747b9e9 7317bool isolate_hugetlb(struct folio *folio, struct list_head *list)
31caf665 7318{
9747b9e9 7319 bool ret = true;
bcc54222 7320
db71ef79 7321 spin_lock_irq(&hugetlb_lock);
6aa3a920
SK
7322 if (!folio_test_hugetlb(folio) ||
7323 !folio_test_hugetlb_migratable(folio) ||
7324 !folio_try_get(folio)) {
9747b9e9 7325 ret = false;
bcc54222
NH
7326 goto unlock;
7327 }
6aa3a920
SK
7328 folio_clear_hugetlb_migratable(folio);
7329 list_move_tail(&folio->lru, list);
bcc54222 7330unlock:
db71ef79 7331 spin_unlock_irq(&hugetlb_lock);
bcc54222 7332 return ret;
31caf665
NH
7333}
7334
04bac040 7335int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
25182f05
NH
7336{
7337 int ret = 0;
7338
7339 *hugetlb = false;
7340 spin_lock_irq(&hugetlb_lock);
04bac040 7341 if (folio_test_hugetlb(folio)) {
25182f05 7342 *hugetlb = true;
04bac040 7343 if (folio_test_hugetlb_freed(folio))
b283d983 7344 ret = 0;
04bac040
SK
7345 else if (folio_test_hugetlb_migratable(folio) || unpoison)
7346 ret = folio_try_get(folio);
0ed950d1
NH
7347 else
7348 ret = -EBUSY;
25182f05
NH
7349 }
7350 spin_unlock_irq(&hugetlb_lock);
7351 return ret;
7352}
7353
e591ef7d
NH
7354int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
7355 bool *migratable_cleared)
405ce051
NH
7356{
7357 int ret;
7358
7359 spin_lock_irq(&hugetlb_lock);
e591ef7d 7360 ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
405ce051
NH
7361 spin_unlock_irq(&hugetlb_lock);
7362 return ret;
7363}
7364
ea8e72f4 7365void folio_putback_active_hugetlb(struct folio *folio)
31caf665 7366{
db71ef79 7367 spin_lock_irq(&hugetlb_lock);
ea8e72f4
SK
7368 folio_set_hugetlb_migratable(folio);
7369 list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist);
db71ef79 7370 spin_unlock_irq(&hugetlb_lock);
ea8e72f4 7371 folio_put(folio);
31caf665 7372}
ab5ac90a 7373
345c62d1 7374void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
ab5ac90a 7375{
345c62d1 7376 struct hstate *h = folio_hstate(old_folio);
ab5ac90a 7377
345c62d1
SK
7378 hugetlb_cgroup_migrate(old_folio, new_folio);
7379 set_page_owner_migrate_reason(&new_folio->page, reason);
ab5ac90a
MH
7380
7381 /*
345c62d1 7382 * transfer temporary state of the new hugetlb folio. This is
ab5ac90a
MH
7383 * reverse to other transitions because the newpage is going to
7384 * be final while the old one will be freed so it takes over
7385 * the temporary status.
7386 *
7387 * Also note that we have to transfer the per-node surplus state
7388 * here as well otherwise the global surplus count will not match
7389 * the per-node's.
7390 */
345c62d1
SK
7391 if (folio_test_hugetlb_temporary(new_folio)) {
7392 int old_nid = folio_nid(old_folio);
7393 int new_nid = folio_nid(new_folio);
7394
345c62d1
SK
7395 folio_set_hugetlb_temporary(old_folio);
7396 folio_clear_hugetlb_temporary(new_folio);
ab5ac90a 7397
ab5ac90a 7398
5af1ab1d
ML
7399 /*
7400 * There is no need to transfer the per-node surplus state
7401 * when we do not cross the node.
7402 */
7403 if (new_nid == old_nid)
7404 return;
db71ef79 7405 spin_lock_irq(&hugetlb_lock);
ab5ac90a
MH
7406 if (h->surplus_huge_pages_node[old_nid]) {
7407 h->surplus_huge_pages_node[old_nid]--;
7408 h->surplus_huge_pages_node[new_nid]++;
7409 }
db71ef79 7410 spin_unlock_irq(&hugetlb_lock);
ab5ac90a
MH
7411 }
7412}
cf11e85f 7413
b30c14cd
JH
7414static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
7415 unsigned long start,
7416 unsigned long end)
6dfeaff9
PX
7417{
7418 struct hstate *h = hstate_vma(vma);
7419 unsigned long sz = huge_page_size(h);
7420 struct mm_struct *mm = vma->vm_mm;
7421 struct mmu_notifier_range range;
b30c14cd 7422 unsigned long address;
6dfeaff9
PX
7423 spinlock_t *ptl;
7424 pte_t *ptep;
7425
7426 if (!(vma->vm_flags & VM_MAYSHARE))
7427 return;
7428
6dfeaff9
PX
7429 if (start >= end)
7430 return;
7431
9c8bbfac 7432 flush_cache_range(vma, start, end);
6dfeaff9
PX
7433 /*
7434 * No need to call adjust_range_if_pmd_sharing_possible(), because
7435 * we have already done the PUD_SIZE alignment.
7436 */
7d4a8be0 7437 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
6dfeaff9
PX
7438 start, end);
7439 mmu_notifier_invalidate_range_start(&range);
40549ba8 7440 hugetlb_vma_lock_write(vma);
6dfeaff9
PX
7441 i_mmap_lock_write(vma->vm_file->f_mapping);
7442 for (address = start; address < end; address += PUD_SIZE) {
9c67a207 7443 ptep = hugetlb_walk(vma, address, sz);
6dfeaff9
PX
7444 if (!ptep)
7445 continue;
7446 ptl = huge_pte_lock(h, mm, ptep);
4ddb4d91 7447 huge_pmd_unshare(mm, vma, address, ptep);
6dfeaff9
PX
7448 spin_unlock(ptl);
7449 }
7450 flush_hugetlb_tlb_range(vma, start, end);
7451 i_mmap_unlock_write(vma->vm_file->f_mapping);
40549ba8 7452 hugetlb_vma_unlock_write(vma);
6dfeaff9 7453 /*
1af5a810 7454 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
ee65728e 7455 * Documentation/mm/mmu_notifier.rst.
6dfeaff9
PX
7456 */
7457 mmu_notifier_invalidate_range_end(&range);
7458}
7459
b30c14cd
JH
7460/*
7461 * This function will unconditionally remove all the shared pmd pgtable entries
7462 * within the specific vma for a hugetlbfs memory range.
7463 */
7464void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
7465{
7466 hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
7467 ALIGN_DOWN(vma->vm_end, PUD_SIZE));
7468}
7469
cf11e85f 7470#ifdef CONFIG_CMA
cf11e85f
RG
7471static bool cma_reserve_called __initdata;
7472
7473static int __init cmdline_parse_hugetlb_cma(char *p)
7474{
38e719ab
BW
7475 int nid, count = 0;
7476 unsigned long tmp;
7477 char *s = p;
7478
7479 while (*s) {
7480 if (sscanf(s, "%lu%n", &tmp, &count) != 1)
7481 break;
7482
7483 if (s[count] == ':') {
f9317f77 7484 if (tmp >= MAX_NUMNODES)
38e719ab 7485 break;
f9317f77 7486 nid = array_index_nospec(tmp, MAX_NUMNODES);
38e719ab
BW
7487
7488 s += count + 1;
7489 tmp = memparse(s, &s);
7490 hugetlb_cma_size_in_node[nid] = tmp;
7491 hugetlb_cma_size += tmp;
7492
7493 /*
7494 * Skip the separator if have one, otherwise
7495 * break the parsing.
7496 */
7497 if (*s == ',')
7498 s++;
7499 else
7500 break;
7501 } else {
7502 hugetlb_cma_size = memparse(p, &p);
7503 break;
7504 }
7505 }
7506
cf11e85f
RG
7507 return 0;
7508}
7509
7510early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
7511
7512void __init hugetlb_cma_reserve(int order)
7513{
7514 unsigned long size, reserved, per_node;
38e719ab 7515 bool node_specific_cma_alloc = false;
cf11e85f
RG
7516 int nid;
7517
7518 cma_reserve_called = true;
7519
38e719ab
BW
7520 if (!hugetlb_cma_size)
7521 return;
7522
7523 for (nid = 0; nid < MAX_NUMNODES; nid++) {
7524 if (hugetlb_cma_size_in_node[nid] == 0)
7525 continue;
7526
30a51400 7527 if (!node_online(nid)) {
38e719ab
BW
7528 pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
7529 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7530 hugetlb_cma_size_in_node[nid] = 0;
7531 continue;
7532 }
7533
7534 if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
7535 pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
7536 nid, (PAGE_SIZE << order) / SZ_1M);
7537 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7538 hugetlb_cma_size_in_node[nid] = 0;
7539 } else {
7540 node_specific_cma_alloc = true;
7541 }
7542 }
7543
7544 /* Validate the CMA size again in case some invalid nodes specified. */
cf11e85f
RG
7545 if (!hugetlb_cma_size)
7546 return;
7547
7548 if (hugetlb_cma_size < (PAGE_SIZE << order)) {
7549 pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
7550 (PAGE_SIZE << order) / SZ_1M);
a01f4390 7551 hugetlb_cma_size = 0;
cf11e85f
RG
7552 return;
7553 }
7554
38e719ab
BW
7555 if (!node_specific_cma_alloc) {
7556 /*
7557 * If 3 GB area is requested on a machine with 4 numa nodes,
7558 * let's allocate 1 GB on first three nodes and ignore the last one.
7559 */
7560 per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
7561 pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
7562 hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
7563 }
cf11e85f
RG
7564
7565 reserved = 0;
30a51400 7566 for_each_online_node(nid) {
cf11e85f 7567 int res;
2281f797 7568 char name[CMA_MAX_NAME];
cf11e85f 7569
38e719ab
BW
7570 if (node_specific_cma_alloc) {
7571 if (hugetlb_cma_size_in_node[nid] == 0)
7572 continue;
7573
7574 size = hugetlb_cma_size_in_node[nid];
7575 } else {
7576 size = min(per_node, hugetlb_cma_size - reserved);
7577 }
7578
cf11e85f
RG
7579 size = round_up(size, PAGE_SIZE << order);
7580
2281f797 7581 snprintf(name, sizeof(name), "hugetlb%d", nid);
a01f4390
MK
7582 /*
7583 * Note that 'order per bit' is based on smallest size that
7584 * may be returned to CMA allocator in the case of
7585 * huge page demotion.
7586 */
7587 res = cma_declare_contiguous_nid(0, size, 0,
7588 PAGE_SIZE << HUGETLB_PAGE_ORDER,
29d0f41d 7589 0, false, name,
cf11e85f
RG
7590 &hugetlb_cma[nid], nid);
7591 if (res) {
7592 pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
7593 res, nid);
7594 continue;
7595 }
7596
7597 reserved += size;
7598 pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
7599 size / SZ_1M, nid);
7600
7601 if (reserved >= hugetlb_cma_size)
7602 break;
7603 }
a01f4390
MK
7604
7605 if (!reserved)
7606 /*
7607 * hugetlb_cma_size is used to determine if allocations from
7608 * cma are possible. Set to zero if no cma regions are set up.
7609 */
7610 hugetlb_cma_size = 0;
cf11e85f
RG
7611}
7612
263b8998 7613static void __init hugetlb_cma_check(void)
cf11e85f
RG
7614{
7615 if (!hugetlb_cma_size || cma_reserve_called)
7616 return;
7617
7618 pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
7619}
7620
7621#endif /* CONFIG_CMA */