Commit | Line | Data |
---|---|---|
f41f2ed4 MS |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
dff03381 | 3 | * HugeTLB Vmemmap Optimization (HVO) |
f41f2ed4 | 4 | * |
dff03381 | 5 | * Copyright (c) 2020, ByteDance. All rights reserved. |
f41f2ed4 MS |
6 | * |
7 | * Author: Muchun Song <songmuchun@bytedance.com> | |
8 | * | |
ee65728e | 9 | * See Documentation/mm/vmemmap_dedup.rst |
f41f2ed4 | 10 | */ |
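The header above defers to Documentation/mm/vmemmap_dedup.rst for the full scheme. As a quick sense of scale, the stand-alone sketch below reproduces the savings arithmetic; the 4 KiB base page, the 64-byte struct page, and the single reused vmemmap page per HugeTLB page are assumptions (typical of x86-64), not values defined in this file.

```c
/*
 * Stand-alone sketch (not kernel code) of the HVO savings arithmetic.
 * Assumes 4 KiB base pages, a 64-byte struct page, and one reused
 * vmemmap page per HugeTLB page.
 */
#include <stdio.h>

#define PAGE_SIZE        4096UL
#define STRUCT_PAGE_SIZE 64UL

static void show_savings(const char *name, unsigned long hugepage_size)
{
	unsigned long nr_struct_pages = hugepage_size / PAGE_SIZE;
	unsigned long vmemmap_pages = nr_struct_pages * STRUCT_PAGE_SIZE / PAGE_SIZE;
	unsigned long freed = vmemmap_pages - 1;	/* one page is kept and reused */

	printf("%s: %lu vmemmap pages, %lu freed (%lu KiB saved per hugepage)\n",
	       name, vmemmap_pages, freed, freed * PAGE_SIZE / 1024);
}

int main(void)
{
	show_savings("2 MiB hugepage", 2UL << 20);	/* 8 vmemmap pages, 7 freed */
	show_savings("1 GiB hugepage", 1UL << 30);	/* 4096 vmemmap pages, 4095 freed */
	return 0;
}
```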
e9fdff87 MS |
11 | #define pr_fmt(fmt) "HugeTLB: " fmt |
12 | ||
998a2997 | 13 | #include <linux/pgtable.h> |
db5e8d84 | 14 | #include <linux/moduleparam.h> |
998a2997 | 15 | #include <linux/bootmem_info.h> |
d8f5f7e4 | 16 | #include <linux/mmdebug.h> |
fb93ed63 | 17 | #include <linux/pagewalk.h> |
998a2997 MS |
18 | #include <asm/pgalloc.h> |
19 | #include <asm/tlbflush.h> | |
f41f2ed4 MS |
20 | #include "hugetlb_vmemmap.h" |
21 | ||
998a2997 MS |
22 | /** |
23 | * struct vmemmap_remap_walk - walk vmemmap page table | |
24 | * | |
25 | * @remap_pte: called for each lowest-level entry (PTE). | |
26 | * @nr_walked: the number of PTEs walked.
27 | * @reuse_page: the page which is reused for the tail vmemmap pages. | |
28 | * @reuse_addr: the virtual address of the @reuse_page page. | |
29 | * @vmemmap_pages: the list head of the vmemmap pages that can be freed,
30 | * or from which pages are taken to restore the mapping.
f4b7e3ef JM |
31 | * @flags: used to modify behavior in vmemmap page table walking |
32 | * operations. | |
998a2997 MS |
33 | */ |
34 | struct vmemmap_remap_walk { | |
35 | void (*remap_pte)(pte_t *pte, unsigned long addr, | |
36 | struct vmemmap_remap_walk *walk); | |
37 | unsigned long nr_walked; | |
38 | struct page *reuse_page; | |
39 | unsigned long reuse_addr; | |
40 | struct list_head *vmemmap_pages; | |
f4b7e3ef JM |
41 | |
42 | /* Skip the TLB flush when we split the PMD */ | |
43 | #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) | |
f13b83fd JM |
44 | /* Skip the TLB flush when we remap the PTE */ |
45 | #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) | |
c2a967f6 YZ |
46 | /* synchronize_rcu() to avoid writes from page_ref_add_unless() */ |
47 | #define VMEMMAP_SYNCHRONIZE_RCU BIT(2) | |
f4b7e3ef | 48 | unsigned long flags; |
998a2997 MS |
49 | }; |
50 | ||
fb93ed63 MS |
51 | static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, |
52 | struct vmemmap_remap_walk *walk) | |
998a2997 MS |
53 | { |
54 | pmd_t __pmd; | |
55 | int i; | |
56 | unsigned long addr = start; | |
3ce2c24c MS |
57 | pte_t *pgtable; |
58 | ||
3ce2c24c | 59 | pgtable = pte_alloc_one_kernel(&init_mm); |
998a2997 MS |
60 | if (!pgtable) |
61 | return -ENOMEM; | |
62 | ||
63 | pmd_populate_kernel(&init_mm, &__pmd, pgtable); | |
64 | ||
e38f055d | 65 | for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { |
998a2997 MS |
66 | pte_t entry, *pte; |
67 | pgprot_t pgprot = PAGE_KERNEL; | |
68 | ||
3ce2c24c | 69 | entry = mk_pte(head + i, pgprot); |
998a2997 MS |
70 | pte = pte_offset_kernel(&__pmd, addr); |
71 | set_pte_at(&init_mm, addr, pte, entry); | |
72 | } | |
73 | ||
74 | spin_lock(&init_mm.page_table_lock); | |
75 | if (likely(pmd_leaf(*pmd))) { | |
76 | /* | |
77 | * Higher order allocations from buddy allocator must be able to | |
78 | * be treated as independent small pages (as they can be freed
79 | * individually). | |
80 | */ | |
3ce2c24c MS |
81 | if (!PageReserved(head)) |
82 | split_page(head, get_order(PMD_SIZE)); | |
998a2997 MS |
83 | |
84 | /* Make pte visible before pmd. See comment in pmd_install(). */ | |
85 | smp_wmb(); | |
86 | pmd_populate_kernel(&init_mm, pmd, pgtable); | |
fb93ed63 | 87 | if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)) |
f4b7e3ef | 88 | flush_tlb_kernel_range(start, start + PMD_SIZE); |
998a2997 MS |
89 | } else { |
90 | pte_free_kernel(&init_mm, pgtable); | |
91 | } | |
92 | spin_unlock(&init_mm.page_table_lock); | |
93 | ||
94 | return 0; | |
95 | } | |
96 | ||
fb93ed63 MS |
97 | static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr, |
98 | unsigned long next, struct mm_walk *walk) | |
998a2997 | 99 | { |
be035a2a | 100 | int ret = 0; |
fb93ed63 MS |
101 | struct page *head; |
102 | struct vmemmap_remap_walk *vmemmap_walk = walk->private; | |
f4b7e3ef | 103 | |
fb93ed63 MS |
104 | /* Only splitting, not remapping the vmemmap pages. */ |
105 | if (!vmemmap_walk->remap_pte) | |
106 | walk->action = ACTION_CONTINUE; | |
f4b7e3ef | 107 | |
fb93ed63 MS |
108 | spin_lock(&init_mm.page_table_lock); |
109 | head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; | |
be035a2a MS |
110 | /* |
111 | * Due to HugeTLB alignment requirements and the vmemmap
112 | * pages being at the start of the hotplugged memory
113 | * region in the memory_hotplug.memmap_on_memory case, it is
114 | * sufficient to check whether the page backing the first
115 | * vmemmap page is self-hosted.
116 | * | |
117 | * [ hotplugged memory ] | |
118 | * [ section ][...][ section ] | |
119 | * [ vmemmap ][ usable memory ] | |
120 | * ^ | ^ | | |
121 | * +--+ | | | |
122 | * +------------------------+ | |
123 | */ | |
47e61d88 | 124 | if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) { |
be035a2a MS |
125 | struct page *page = head ? head + pte_index(addr) : |
126 | pte_page(ptep_get(pte_offset_kernel(pmd, addr))); | |
127 | ||
128 | if (PageVmemmapSelfHosted(page)) | |
129 | ret = -ENOTSUPP; | |
130 | } | |
fb93ed63 | 131 | spin_unlock(&init_mm.page_table_lock); |
be035a2a MS |
132 | if (!head || ret) |
133 | return ret; | |
998a2997 | 134 | |
fb93ed63 | 135 | return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk); |
998a2997 MS |
136 | } |
137 | ||
fb93ed63 MS |
138 | static int vmemmap_pte_entry(pte_t *pte, unsigned long addr, |
139 | unsigned long next, struct mm_walk *walk) | |
998a2997 | 140 | { |
fb93ed63 | 141 | struct vmemmap_remap_walk *vmemmap_walk = walk->private; |
998a2997 | 142 | |
fb93ed63 MS |
143 | /* |
144 | * The reuse_page is found 'first' in page table walking before | |
145 | * starting remapping. | |
146 | */ | |
147 | if (!vmemmap_walk->reuse_page) | |
148 | vmemmap_walk->reuse_page = pte_page(ptep_get(pte)); | |
149 | else | |
150 | vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); | |
151 | vmemmap_walk->nr_walked++; | |
998a2997 MS |
152 | |
153 | return 0; | |
154 | } | |
155 | ||
fb93ed63 MS |
156 | static const struct mm_walk_ops vmemmap_remap_ops = { |
157 | .pmd_entry = vmemmap_pmd_entry, | |
158 | .pte_entry = vmemmap_pte_entry, | |
159 | }; | |
998a2997 MS |
160 | |
161 | static int vmemmap_remap_range(unsigned long start, unsigned long end, | |
162 | struct vmemmap_remap_walk *walk) | |
163 | { | |
fb93ed63 | 164 | int ret; |
998a2997 | 165 | |
fb93ed63 | 166 | VM_BUG_ON(!PAGE_ALIGNED(start | end)); |
998a2997 | 167 | |
49b960de | 168 | mmap_read_lock(&init_mm); |
fb93ed63 MS |
169 | ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops, |
170 | NULL, walk); | |
49b960de | 171 | mmap_read_unlock(&init_mm); |
fb93ed63 MS |
172 | if (ret) |
173 | return ret; | |
998a2997 | 174 | |
f13b83fd | 175 | if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) |
f4b7e3ef | 176 | flush_tlb_kernel_range(start, end); |
998a2997 MS |
177 | |
178 | return 0; | |
179 | } | |
180 | ||
181 | /* | |
182 | * Free a vmemmap page. A vmemmap page can be allocated from the memblock | |
183 | * allocator or the buddy allocator. If the PG_reserved flag is set, it
184 | * was allocated from the memblock allocator and must be freed via
185 | * free_bootmem_page(). Otherwise, use __free_page().
186 | */ | |
187 | static inline void free_vmemmap_page(struct page *page) | |
188 | { | |
15995a35 | 189 | if (PageReserved(page)) { |
9d857311 | 190 | memmap_boot_pages_add(-1); |
998a2997 | 191 | free_bootmem_page(page); |
15995a35 | 192 | } else { |
9d857311 | 193 | memmap_pages_add(-1); |
998a2997 | 194 | __free_page(page); |
15995a35 | 195 | } |
998a2997 MS |
196 | } |
197 | ||
198 | /* Free a list of the vmemmap pages */ | |
199 | static void free_vmemmap_page_list(struct list_head *list) | |
200 | { | |
201 | struct page *page, *next; | |
202 | ||
1cc53a04 | 203 | list_for_each_entry_safe(page, next, list, lru) |
998a2997 | 204 | free_vmemmap_page(page); |
998a2997 MS |
205 | } |
206 | ||
207 | static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, | |
208 | struct vmemmap_remap_walk *walk) | |
209 | { | |
210 | /* | |
211 | * Remap the tail pages as read-only to catch illegal write operations
212 | * to the tail pages. | |
213 | */ | |
214 | pgprot_t pgprot = PAGE_KERNEL_RO; | |
c33c7948 | 215 | struct page *page = pte_page(ptep_get(pte)); |
11aad263 JM |
216 | pte_t entry; |
217 | ||
218 | /* Remapping the head page requires r/w */ | |
219 | if (unlikely(addr == walk->reuse_addr)) { | |
220 | pgprot = PAGE_KERNEL; | |
221 | list_del(&walk->reuse_page->lru); | |
222 | ||
223 | /* | |
224 | * Makes sure that preceding stores to the page contents from | |
225 | * vmemmap_remap_free() become visible before the set_pte_at() | |
226 | * write. | |
227 | */ | |
228 | smp_wmb(); | |
229 | } | |
998a2997 | 230 | |
11aad263 | 231 | entry = mk_pte(walk->reuse_page, pgprot); |
91f386bf | 232 | list_add(&page->lru, walk->vmemmap_pages); |
998a2997 MS |
233 | set_pte_at(&init_mm, addr, pte, entry); |
234 | } | |
235 | ||
236 | /* | |
237 | * How many struct page structs need to be reset. When we reuse the head
238 | * struct page, the special metadata (e.g. page->flags or page->mapping)
239 | * cannot be copied to the tail struct page structs. The invalid values
8666925c | 240 | * will be caught by free_tail_page_prepare(). To avoid the "corrupted
998a2997 MS |
241 | * mapping in tail page" message, we need to reset at least 3 struct
242 | * page structs: one head struct page and the first two tail struct
243 | * pages.
244 | */ | |
245 | #define NR_RESET_STRUCT_PAGE 3 | |
246 | ||
247 | static inline void reset_struct_pages(struct page *start) | |
248 | { | |
998a2997 MS |
249 | struct page *from = start + NR_RESET_STRUCT_PAGE; |
250 | ||
33febb51 MS |
251 | BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); |
252 | memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); | |
998a2997 MS |
253 | } |
254 | ||
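reset_struct_pages() above rebuilds the first NR_RESET_STRUCT_PAGE struct pages by copying clean templates from start + NR_RESET_STRUCT_PAGE, and the BUILD_BUG_ON guarantees that both the destination and the source still lie within the single reused vmemmap page. A stand-alone sketch of that bound, assuming the common 4 KiB page and 64-byte struct page sizes (assumptions, not values from this file):

```c
/* Stand-alone sketch (not kernel code) of the reset_struct_pages() bound. */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE            4096UL	/* assumed base page size */
#define STRUCT_PAGE_SIZE     64UL	/* assumed sizeof(struct page) */
#define NR_RESET_STRUCT_PAGE 3UL

int main(void)
{
	unsigned long per_vmemmap_page = PAGE_SIZE / STRUCT_PAGE_SIZE;	/* 64 entries */

	/* Mirrors the BUILD_BUG_ON: copying entries [3, 6) over [0, 3) stays in one page. */
	assert(NR_RESET_STRUCT_PAGE * 2 <= per_vmemmap_page);
	printf("copy struct pages [%lu, %lu) over [0, %lu); one vmemmap page holds %lu entries\n",
	       NR_RESET_STRUCT_PAGE, 2 * NR_RESET_STRUCT_PAGE,
	       NR_RESET_STRUCT_PAGE, per_vmemmap_page);
	return 0;
}
```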
255 | static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, | |
256 | struct vmemmap_remap_walk *walk) | |
257 | { | |
258 | pgprot_t pgprot = PAGE_KERNEL; | |
259 | struct page *page; | |
260 | void *to; | |
261 | ||
c33c7948 | 262 | BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); |
998a2997 MS |
263 | |
264 | page = list_first_entry(walk->vmemmap_pages, struct page, lru); | |
265 | list_del(&page->lru); | |
266 | to = page_to_virt(page); | |
267 | copy_page(to, (void *)walk->reuse_addr); | |
268 | reset_struct_pages(to); | |
269 | ||
939de63d ML |
270 | /* |
271 | * Makes sure that preceding stores to the page contents become visible | |
272 | * before the set_pte_at() write. | |
273 | */ | |
274 | smp_wmb(); | |
998a2997 MS |
275 | set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); |
276 | } | |
277 | ||
f4b7e3ef JM |
278 | /** |
279 | * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
280 | * range [@start, @end) into PTEs
281 | * @start: start address of the vmemmap virtual address range that we want | |
282 | * to remap. | |
283 | * @end: end address of the vmemmap virtual address range that we want to | |
284 | * remap. | |
285 | * @reuse: reuse address. | |
286 | * | |
287 | * Return: %0 on success, negative error code otherwise. | |
288 | */ | |
289 | static int vmemmap_remap_split(unsigned long start, unsigned long end, | |
ebc20dca | 290 | unsigned long reuse) |
f4b7e3ef | 291 | { |
f4b7e3ef JM |
292 | struct vmemmap_remap_walk walk = { |
293 | .remap_pte = NULL, | |
294 | .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, | |
295 | }; | |
296 | ||
297 | /* See the comment in vmemmap_remap_free(). */
298 | BUG_ON(start - reuse != PAGE_SIZE); | |
299 | ||
49b960de | 300 | return vmemmap_remap_range(reuse, end, &walk); |
f4b7e3ef JM |
301 | } |
302 | ||
998a2997 MS |
303 | /** |
304 | * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) | |
305 | * to the page which @reuse is mapped to, then free the vmemmap
306 | * pages which the range was mapped to.
307 | * @start: start address of the vmemmap virtual address range that we want | |
308 | * to remap. | |
309 | * @end: end address of the vmemmap virtual address range that we want to | |
310 | * remap. | |
311 | * @reuse: reuse address. | |
91f386bf MK |
312 | * @vmemmap_pages: list to deposit the vmemmap pages to be freed. It is the
313 | * caller's responsibility to free these pages.
f13b83fd | 314 | * @flags: modifications to vmemmap_remap_walk flags |
998a2997 MS |
315 | * |
316 | * Return: %0 on success, negative error code otherwise. | |
317 | */ | |
318 | static int vmemmap_remap_free(unsigned long start, unsigned long end, | |
91f386bf | 319 | unsigned long reuse, |
f13b83fd JM |
320 | struct list_head *vmemmap_pages, |
321 | unsigned long flags) | |
998a2997 MS |
322 | { |
323 | int ret; | |
998a2997 MS |
324 | struct vmemmap_remap_walk walk = { |
325 | .remap_pte = vmemmap_remap_pte, | |
326 | .reuse_addr = reuse, | |
91f386bf | 327 | .vmemmap_pages = vmemmap_pages, |
f13b83fd | 328 | .flags = flags, |
998a2997 | 329 | }; |
a9e34ea1 | 330 | int nid = page_to_nid((struct page *)reuse); |
6a898c27 | 331 | gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; |
11aad263 JM |
332 | |
333 | /* | |
334 | * Allocate a new head vmemmap page to avoid breaking a contiguous | |
335 | * block of struct page memory when freeing it back to the page
336 | * allocator in free_vmemmap_page_list(). This allows the likely
337 | * contiguous struct page backing memory to remain contiguous,
338 | * allowing more hugepage allocations. Fall back to the currently
339 | * mapped head page should the allocation fail.
340 | */ | |
341 | walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0); | |
342 | if (walk.reuse_page) { | |
343 | copy_page(page_to_virt(walk.reuse_page), | |
344 | (void *)walk.reuse_addr); | |
91f386bf | 345 | list_add(&walk.reuse_page->lru, vmemmap_pages); |
9d857311 | 346 | memmap_pages_add(1); |
11aad263 | 347 | } |
998a2997 MS |
348 | |
349 | /* | |
350 | * In order to make the remapping routine most efficient for huge pages,
351 | * the vmemmap page table walk follows these rules (see
352 | * vmemmap_pte_entry() for more details):
353 | *
354 | * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
355 | * should be contiguous.
356 | * - The @reuse address is part of the range [@reuse, @end) that we are
357 | * walking, which is passed to vmemmap_remap_range().
358 | * - The @reuse address is the first in the complete range.
359 | * | |
360 | * So we need to make sure that @start and @reuse meet the above rules. | |
361 | */ | |
362 | BUG_ON(start - reuse != PAGE_SIZE); | |
363 | ||
998a2997 MS |
364 | ret = vmemmap_remap_range(reuse, end, &walk); |
365 | if (ret && walk.nr_walked) { | |
366 | end = reuse + walk.nr_walked * PAGE_SIZE; | |
367 | /* | |
368 | * vmemmap_pages contains pages from the previous | |
369 | * vmemmap_remap_range call which failed. These | |
370 | * are pages which were removed from the vmemmap. | |
371 | * They will be restored in the following call. | |
372 | */ | |
373 | walk = (struct vmemmap_remap_walk) { | |
374 | .remap_pte = vmemmap_restore_pte, | |
375 | .reuse_addr = reuse, | |
91f386bf | 376 | .vmemmap_pages = vmemmap_pages, |
f4b7e3ef | 377 | .flags = 0, |
998a2997 MS |
378 | }; |
379 | ||
380 | vmemmap_remap_range(reuse, end, &walk); | |
381 | } | |
998a2997 | 382 | |
998a2997 MS |
383 | return ret; |
384 | } | |
385 | ||
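vmemmap_remap_split(), vmemmap_remap_free() and vmemmap_remap_alloc() all assert the same layout: the reuse page sits exactly one page below @start and the walk covers [@reuse, @end). The stand-alone sketch below works through that invariant for a hypothetical 2 MiB HugeTLB page; the vmemmap base address and the 8-page vmemmap size are illustrative assumptions, not values taken from this file.

```c
/* Stand-alone sketch (not kernel code) of the reuse/start/end layout rule. */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	/* Hypothetical vmemmap of one 2 MiB hugepage: 8 pages of struct page. */
	unsigned long vmemmap_start = 0xffffea0000000000UL;	/* illustrative address */
	unsigned long vmemmap_size  = 8 * PAGE_SIZE;

	unsigned long reuse = vmemmap_start;			/* head vmemmap page, kept */
	unsigned long start = reuse + PAGE_SIZE;		/* first page to remap away */
	unsigned long end   = vmemmap_start + vmemmap_size;

	/* The rule all three entry points BUG_ON() about. */
	assert(start - reuse == PAGE_SIZE);

	/* The walk is driven over [reuse, end); everything above reuse gets freed. */
	printf("walk %lu pages, remap and free %lu of them\n",
	       (end - reuse) / PAGE_SIZE, (end - start) / PAGE_SIZE);
	return 0;
}
```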
386 | static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, | |
eb83f652 | 387 | struct list_head *list) |
998a2997 | 388 | { |
2eaa6c2a | 389 | gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL; |
998a2997 MS |
390 | unsigned long nr_pages = (end - start) >> PAGE_SHIFT; |
391 | int nid = page_to_nid((struct page *)start); | |
392 | struct page *page, *next; | |
15995a35 | 393 | int i; |
998a2997 | 394 | |
15995a35 | 395 | for (i = 0; i < nr_pages; i++) { |
998a2997 | 396 | page = alloc_pages_node(nid, gfp_mask, 0); |
ace0741a | 397 | if (!page) |
998a2997 | 398 | goto out; |
91f386bf | 399 | list_add(&page->lru, list); |
998a2997 | 400 | } |
9d857311 | 401 | memmap_pages_add(nr_pages); |
15995a35 | 402 | |
998a2997 MS |
403 | return 0; |
404 | out: | |
405 | list_for_each_entry_safe(page, next, list, lru) | |
dcc1be11 | 406 | __free_page(page); |
998a2997 MS |
407 | return -ENOMEM; |
408 | } | |
409 | ||
410 | /** | |
411 | * vmemmap_remap_alloc - remap each page of the vmemmap virtual address range
412 | * [@start, @end) to a freshly allocated page taken from
413 | * the vmemmap_pages list.
414 | * @start: start address of the vmemmap virtual address range that we want | |
415 | * to remap. | |
416 | * @end: end address of the vmemmap virtual address range that we want to | |
417 | * remap. | |
418 | * @reuse: reuse address. | |
c24f188b | 419 | * @flags: modifications to vmemmap_remap_walk flags |
998a2997 MS |
420 | * |
421 | * Return: %0 on success, negative error code otherwise. | |
422 | */ | |
423 | static int vmemmap_remap_alloc(unsigned long start, unsigned long end, | |
c24f188b | 424 | unsigned long reuse, unsigned long flags) |
998a2997 MS |
425 | { |
426 | LIST_HEAD(vmemmap_pages); | |
427 | struct vmemmap_remap_walk walk = { | |
428 | .remap_pte = vmemmap_restore_pte, | |
429 | .reuse_addr = reuse, | |
430 | .vmemmap_pages = &vmemmap_pages, | |
c24f188b | 431 | .flags = flags, |
998a2997 MS |
432 | }; |
433 | ||
434 | /* See the comment in vmemmap_remap_free(). */
435 | BUG_ON(start - reuse != PAGE_SIZE); | |
436 | ||
eb83f652 | 437 | if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) |
998a2997 MS |
438 | return -ENOMEM; |
439 | ||
49b960de | 440 | return vmemmap_remap_range(reuse, end, &walk); |
998a2997 MS |
441 | } |
442 | ||
cf5472e5 | 443 | DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); |
f10f1442 | 444 | EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); |
e9fdff87 | 445 | |
30152245 MS |
446 | static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); |
447 | core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); | |
f41f2ed4 | 448 | |
ebc20dca MS |
449 | static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, |
450 | struct folio *folio, unsigned long flags) | |
ad2fa371 MS |
451 | { |
452 | int ret; | |
ebc20dca | 453 | unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; |
6213834c | 454 | unsigned long vmemmap_reuse; |
ad2fa371 | 455 | |
ebc20dca | 456 | VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); |
bd225530 YZ |
457 | VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); |
458 | ||
c5ad3233 | 459 | if (!folio_test_hugetlb_vmemmap_optimized(folio)) |
ad2fa371 MS |
460 | return 0; |
461 | ||
c2a967f6 YZ |
462 | if (flags & VMEMMAP_SYNCHRONIZE_RCU) |
463 | synchronize_rcu(); | |
464 | ||
6213834c MS |
465 | vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); |
466 | vmemmap_reuse = vmemmap_start; | |
467 | vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; | |
5981611d | 468 | |
ad2fa371 | 469 | /* |
6213834c | 470 | * The pages which the vmemmap virtual address range [@vmemmap_start, |
ad2fa371 MS |
471 | * @vmemmap_end) are mapped to are freed to the buddy allocator, and |
472 | * the range is mapped to the page which @vmemmap_reuse is mapped to. | |
473 | * When a HugeTLB page is freed to the buddy allocator, previously | |
474 | * discarded vmemmap pages must be allocated and remapped.
475 | */ | |
c24f188b | 476 | ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); |
78f39084 | 477 | if (!ret) { |
c5ad3233 | 478 | folio_clear_hugetlb_vmemmap_optimized(folio); |
78f39084 MS |
479 | static_branch_dec(&hugetlb_optimize_vmemmap_key); |
480 | } | |
ad2fa371 MS |
481 | |
482 | return ret; | |
483 | } | |
484 | ||
c24f188b | 485 | /** |
c5ad3233 UA |
486 | * hugetlb_vmemmap_restore_folio - restore previously optimized (by |
487 | * hugetlb_vmemmap_optimize_folio()) vmemmap pages which | |
c24f188b MK |
488 | * will be reallocated and remapped. |
489 | * @h: struct hstate. | |
c5ad3233 | 490 | * @folio: the folio whose vmemmap pages will be restored. |
c24f188b | 491 | * |
c5ad3233 | 492 | * Return: %0 if @folio's vmemmap pages have been reallocated and remapped, |
c24f188b MK |
493 | * negative error code otherwise. |
494 | */ | |
c5ad3233 | 495 | int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) |
c24f188b | 496 | { |
c2a967f6 | 497 | return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU); |
c24f188b MK |
498 | } |
499 | ||
cfb8c750 MK |
500 | /** |
501 | * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. | |
502 | * @h: hstate. | |
503 | * @folio_list: list of folios. | |
504 | * @non_hvo_folios: Output list of folios for which vmemmap exists. | |
505 | * | |
506 | * Return: number of folios for which vmemmap was restored, or an error code | |
507 | * if an error was encountered restoring vmemmap for a folio. | |
508 | * Folios that have vmemmap are moved to the non_hvo_folios | |
509 | * list. Processing of entries stops when the first error is | |
510 | * encountered. The folio that experienced the error and all | |
511 | * non-processed folios will remain on folio_list. | |
512 | */ | |
513 | long hugetlb_vmemmap_restore_folios(const struct hstate *h, | |
514 | struct list_head *folio_list, | |
515 | struct list_head *non_hvo_folios) | |
516 | { | |
517 | struct folio *folio, *t_folio; | |
518 | long restored = 0; | |
519 | long ret = 0; | |
c2a967f6 | 520 | unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; |
bd225530 | 521 | |
cfb8c750 MK |
522 | list_for_each_entry_safe(folio, t_folio, folio_list, lru) { |
523 | if (folio_test_hugetlb_vmemmap_optimized(folio)) { | |
c2a967f6 YZ |
524 | ret = __hugetlb_vmemmap_restore_folio(h, folio, flags); |
525 | /* only need to synchronize_rcu() once for each batch */ | |
526 | flags &= ~VMEMMAP_SYNCHRONIZE_RCU; | |
527 | ||
cfb8c750 MK |
528 | if (ret) |
529 | break; | |
530 | restored++; | |
531 | } | |
532 | ||
533 | /* Add non-optimized folios to output list */ | |
534 | list_move(&folio->lru, non_hvo_folios); | |
535 | } | |
536 | ||
c24f188b MK |
537 | if (restored) |
538 | flush_tlb_all(); | |
cfb8c750 MK |
539 | if (!ret) |
540 | ret = restored; | |
541 | return ret; | |
542 | } | |
543 | ||
6213834c | 544 | /* Return true iff the HugeTLB folio's vmemmap should and can be optimized. */
ebc20dca | 545 | static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio) |
66361095 | 546 | { |
ebc20dca | 547 | if (folio_test_hugetlb_vmemmap_optimized(folio)) |
79359d6d MK |
548 | return false; |
549 | ||
cf5472e5 | 550 | if (!READ_ONCE(vmemmap_optimize_enabled)) |
6213834c MS |
551 | return false; |
552 | ||
553 | if (!hugetlb_vmemmap_optimizable(h)) | |
554 | return false; | |
66361095 | 555 | |
6213834c | 556 | return true; |
66361095 MS |
557 | } |
558 | ||
c5ad3233 | 559 | static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, |
ebc20dca MS |
560 | struct folio *folio, |
561 | struct list_head *vmemmap_pages, | |
562 | unsigned long flags) | |
f41f2ed4 | 563 | { |
91f386bf | 564 | int ret = 0; |
ebc20dca | 565 | unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; |
6213834c | 566 | unsigned long vmemmap_reuse; |
f41f2ed4 | 567 | |
ebc20dca | 568 | VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); |
bd225530 YZ |
569 | VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); |
570 | ||
ebc20dca | 571 | if (!vmemmap_should_optimize_folio(h, folio)) |
91f386bf | 572 | return ret; |
f41f2ed4 | 573 | |
78f39084 | 574 | static_branch_inc(&hugetlb_optimize_vmemmap_key); |
c2a967f6 YZ |
575 | |
576 | if (flags & VMEMMAP_SYNCHRONIZE_RCU) | |
577 | synchronize_rcu(); | |
f13b83fd JM |
578 | /* |
579 | * Very Subtle | |
580 | * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed | |
581 | * immediately after remapping. As a result, subsequent accesses | |
582 | * and modifications to struct pages associated with the hugetlb | |
583 | * page could be to the OLD struct pages. Set the vmemmap optimized | |
584 | * flag here so that it is copied to the new head page. This keeps | |
585 | * the old and new struct pages in sync. | |
586 | * If there is an error during optimization, we will immediately FLUSH | |
587 | * the TLB and clear the flag below. | |
588 | */ | |
c5ad3233 | 589 | folio_set_hugetlb_vmemmap_optimized(folio); |
78f39084 | 590 | |
6213834c MS |
591 | vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); |
592 | vmemmap_reuse = vmemmap_start; | |
593 | vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; | |
f41f2ed4 MS |
594 | |
595 | /* | |
6213834c | 596 | * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) |
91f386bf MK |
597 | * to the page which @vmemmap_reuse is mapped to. Add pages previously |
598 | * mapping the range to vmemmap_pages list so that they can be freed by | |
599 | * the caller. | |
f41f2ed4 | 600 | */ |
f13b83fd | 601 | ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, |
ebc20dca | 602 | vmemmap_pages, flags); |
f13b83fd | 603 | if (ret) { |
78f39084 | 604 | static_branch_dec(&hugetlb_optimize_vmemmap_key); |
c5ad3233 | 605 | folio_clear_hugetlb_vmemmap_optimized(folio); |
f13b83fd | 606 | } |
91f386bf MK |
607 | |
608 | return ret; | |
609 | } | |
610 | ||
611 | /** | |
c5ad3233 | 612 | * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages. |
91f386bf | 613 | * @h: struct hstate. |
c5ad3233 | 614 | * @folio: the folio whose vmemmap pages will be optimized. |
91f386bf | 615 | * |
c5ad3233 | 616 | * This function only tries to optimize @folio's vmemmap pages and does not |
91f386bf | 617 | * guarantee that the optimization will succeed after it returns. The caller |
c5ad3233 UA |
618 | * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's |
619 | * vmemmap pages have been optimized. | |
91f386bf | 620 | */ |
c5ad3233 | 621 | void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) |
91f386bf MK |
622 | { |
623 | LIST_HEAD(vmemmap_pages); | |
624 | ||
c2a967f6 | 625 | __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU); |
91f386bf | 626 | free_vmemmap_page_list(&vmemmap_pages); |
f41f2ed4 | 627 | } |
77490587 | 628 | |
ebc20dca | 629 | static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio) |
f4b7e3ef | 630 | { |
ebc20dca | 631 | unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; |
f4b7e3ef JM |
632 | unsigned long vmemmap_reuse; |
633 | ||
ebc20dca | 634 | if (!vmemmap_should_optimize_folio(h, folio)) |
f4b7e3ef JM |
635 | return 0; |
636 | ||
637 | vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); | |
638 | vmemmap_reuse = vmemmap_start; | |
639 | vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; | |
640 | ||
641 | /* | |
642 | * Split PMDs on the vmemmap virtual address range [@vmemmap_start, | |
643 | * @vmemmap_end] | |
644 | */ | |
645 | return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); | |
646 | } | |
647 | ||
79359d6d MK |
648 | void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) |
649 | { | |
650 | struct folio *folio; | |
91f386bf | 651 | LIST_HEAD(vmemmap_pages); |
c2a967f6 | 652 | unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; |
91f386bf | 653 | |
f4b7e3ef | 654 | list_for_each_entry(folio, folio_list, lru) { |
ebc20dca | 655 | int ret = hugetlb_vmemmap_split_folio(h, folio); |
f4b7e3ef JM |
656 | |
657 | /* | |
658 | * Splitting the PMD requires allocating a page, thus let's fail
659 | * early once we encounter the first OOM. No point in retrying | |
660 | * as it can be dynamically done on remap with the memory | |
661 | * we get back from the vmemmap deduplication. | |
662 | */ | |
663 | if (ret == -ENOMEM) | |
664 | break; | |
665 | } | |
666 | ||
667 | flush_tlb_all(); | |
668 | ||
91f386bf | 669 | list_for_each_entry(folio, folio_list, lru) { |
ebc20dca MS |
670 | int ret; |
671 | ||
c2a967f6 YZ |
672 | ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); |
673 | /* only need to synchronize_rcu() once for each batch */ | |
674 | flags &= ~VMEMMAP_SYNCHRONIZE_RCU; | |
91f386bf MK |
675 | |
676 | /* | |
677 | * Pages to be freed may have been accumulated. If we | |
678 | * encounter an ENOMEM, free what we have and try again. | |
f13b83fd JM |
679 | * This can occur when splitting fails halfway and the head
680 | * page allocation also fails. In this
c5ad3233 | 681 | * case __hugetlb_vmemmap_optimize_folio() would free memory |
f13b83fd | 682 | * allowing more vmemmap remaps to occur. |
91f386bf MK |
683 | */ |
684 | if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { | |
f13b83fd | 685 | flush_tlb_all(); |
91f386bf MK |
686 | free_vmemmap_page_list(&vmemmap_pages); |
687 | INIT_LIST_HEAD(&vmemmap_pages); | |
c2a967f6 | 688 | __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); |
91f386bf MK |
689 | } |
690 | } | |
79359d6d | 691 | |
f13b83fd | 692 | flush_tlb_all(); |
91f386bf | 693 | free_vmemmap_page_list(&vmemmap_pages); |
79359d6d MK |
694 | } |
695 | ||
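Both batch helpers (hugetlb_vmemmap_restore_folios() above and hugetlb_vmemmap_optimize_folios() here) pay for synchronize_rcu() only once per batch by clearing VMEMMAP_SYNCHRONIZE_RCU after the first folio, and defer TLB flushing to the batched flush_tlb_all() calls via VMEMMAP_REMAP_NO_TLB_FLUSH. Below is a stand-alone sketch of that flag handling; the flag values mirror the BIT() definitions at the top of the file and the three-folio loop is purely illustrative.

```c
/* Stand-alone sketch (not kernel code) of the per-batch flag handling. */
#include <stdio.h>

#define VMEMMAP_REMAP_NO_TLB_FLUSH (1UL << 1)	/* mirrors BIT(1) */
#define VMEMMAP_SYNCHRONIZE_RCU    (1UL << 2)	/* mirrors BIT(2) */

int main(void)
{
	unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

	for (int i = 0; i < 3; i++) {
		printf("folio %d: synchronize RCU? %s, per-folio TLB flush? %s\n", i,
		       (flags & VMEMMAP_SYNCHRONIZE_RCU) ? "yes" : "no",
		       (flags & VMEMMAP_REMAP_NO_TLB_FLUSH) ? "no" : "yes");
		/* Only the first folio of a batch pays for synchronize_rcu(). */
		flags &= ~VMEMMAP_SYNCHRONIZE_RCU;
	}
	return 0;
}
```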
78f39084 MS |
696 | static struct ctl_table hugetlb_vmemmap_sysctls[] = { |
697 | { | |
698 | .procname = "hugetlb_optimize_vmemmap", | |
cf5472e5 | 699 | .data = &vmemmap_optimize_enabled, |
f1aa2eb5 | 700 | .maxlen = sizeof(vmemmap_optimize_enabled), |
78f39084 | 701 | .mode = 0644, |
cf5472e5 | 702 | .proc_handler = proc_dobool, |
78f39084 | 703 | }, |
78f39084 MS |
704 | }; |
705 | ||
6213834c | 706 | static int __init hugetlb_vmemmap_init(void) |
78f39084 | 707 | { |
12318566 MS |
708 | const struct hstate *h; |
709 | ||
6213834c | 710 | /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ |
fde1c4ec | 711 | BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); |
6213834c | 712 | |
12318566 MS |
713 | for_each_hstate(h) { |
714 | if (hugetlb_vmemmap_optimizable(h)) { | |
715 | register_sysctl_init("vm", hugetlb_vmemmap_sysctls); | |
716 | break; | |
6213834c MS |
717 | } |
718 | } | |
78f39084 MS |
719 | return 0; |
720 | } | |
6213834c | 721 | late_initcall(hugetlb_vmemmap_init); |