// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
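
/*
 * A worked example of the saving (illustrative numbers, assuming x86-64 with
 * 4 KiB base pages and a 64-byte struct page): a 2 MiB HugeTLB page spans 512
 * base pages, so its struct pages occupy 512 * 64 = 32 KiB of vmemmap, i.e.
 * 8 vmemmap pages. Since all tail struct pages hold essentially the same
 * data, 7 of those 8 vmemmap pages can be remapped read-only to the first
 * one and freed, reclaiming 7 pages per 2 MiB HugeTLB page (4095 of 4096
 * for a 1 GiB HugeTLB page).
 */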
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of walked PTEs.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or are mapped from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations.
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*reuse_page;
	unsigned long		reuse_addr;
	struct list_head	*vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
	unsigned long		flags;
};

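/*
 * Usage note on @flags: the batch paths below,
 * hugetlb_vmemmap_optimize_folios() and hugetlb_vmemmap_restore_folios(),
 * set the *_NO_TLB_FLUSH bits so that a single flush_tlb_all() can stand in
 * for one TLB flush per folio.
 */
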
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from buddy allocator must be able to
		 * be treated as independent small pages (as they can be freed
		 * individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

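/*
 * Sketch of what vmemmap_split_pmd() above achieves (x86-64 numbers, as an
 * example): one PMD leaf covering 2 MiB of vmemmap is replaced by a PTE
 * table with PTRS_PER_PTE (512) entries mapping exactly the same physical
 * pages, so that individual 4 KiB vmemmap pages can subsequently be
 * remapped and freed.
 */
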
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/* Only splitting, not remapping the vmemmap pages. */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap pages being
	 * at the start of the hotplugged memory region in the
	 * memory_hotplug.memmap_on_memory case, checking whether the vmemmap
	 * page associated with the first vmemmap page is self-hosted is
	 * sufficient.
	 *
	 * [                  hotplugged memory                  ]
	 * [        section        ][...][        section        ]
	 * [ vmemmap ][              usable memory               ]
	 *   ^  |            ^                                   |
	 *   +--+            |                                   |
	 *                   +-----------------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * The reuse_page is found 'first' in the page table walk, before
	 * remapping starts.
	 */
	if (!vmemmap_walk->reuse_page)
		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
	else
		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
	vmemmap_walk->nr_walked++;

	return 0;
}

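/*
 * Note on ordering: the remap/restore walks below always start at @reuse,
 * one page before @start, so the very first PTE visited maps the reuse page
 * and every subsequent PTE is handed to ->remap_pte(). This is what the
 * BUG_ON(start - reuse != PAGE_SIZE) checks in the callers enforce.
 */
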
static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	int ret;

	VM_BUG_ON(!PAGE_ALIGNED(start | end));

	mmap_read_lock(&init_mm);
	ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
				    NULL, walk);
	mmap_read_unlock(&init_mm);
	if (ret)
		return ret;

	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator; free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page))
		free_bootmem_page(page);
	else
		__free_page(page);
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

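/*
 * After vmemmap_remap_pte() has run over a range, every tail struct page
 * reads through the single reuse page; a stray write to any tail struct
 * page faults immediately because of the PAGE_KERNEL_RO mapping, rather
 * than silently corrupting the shared data.
 */
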
/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values
 * will be checked in free_tail_page_prepare(). In order to avoid the
 * "corrupted mapping in tail page" message, we need to reset at least 3
 * struct page structs (one head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	3

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

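/*
 * In other words, reset_struct_pages() uses struct pages 3..5 -- plain tail
 * pages carrying no special metadata -- as a template and copies them over
 * struct pages 0..2; the BUILD_BUG_ON() above guarantees that both triples
 * fit within the one vmemmap page being restored.
 */
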
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
 *                       backing PMDs of the directmap into PTEs
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	struct vmemmap_remap_walk walk = {
		.remap_pte	= NULL,
		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *                      to the page which @reuse is mapped to, then free the
 *                      vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is the
 *		caller's responsibility to free the pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to page allocator
	 * in free_vmemmap_page_list(). This will allow the likely contiguous
	 * struct page backing memory to be kept contiguous, allowing for
	 * more allocations of hugepages. Fall back to the currently
	 * mapped head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, vmemmap_pages);
	}

	/*
	 * In order to make the remapping routine most efficient for huge pages,
	 * the routine of vmemmap page table walking has the following rules
	 * (see vmemmap_pte_entry() for more details):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}

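/*
 * Address layout assumed by vmemmap_remap_free() and vmemmap_remap_alloc():
 *
 *	@reuse       @start                                    @end
 *	|<-- 1 pg -->|<---- pages to be remapped or restored ---->|
 *
 * i.e. @reuse is the page immediately preceding @start, which is why both
 * paths BUG_ON(start - reuse != PAGE_SIZE) and walk from @reuse to @end.
 */
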
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;

	while (nr_pages--) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add(&page->lru, list);
	}

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *                       to the pages which are taken from @vmemmap_pages
 *                       respectively.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse, unsigned long flags)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
		.flags		= flags,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);

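/*
 * Usage note: HVO defaults to CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
 * and can be toggled at boot with "hugetlb_free_vmemmap=on|off" (the
 * core_param() above), or at runtime through the
 * /proc/sys/vm/hugetlb_optimize_vmemmap sysctl registered at the bottom of
 * this file.
 */
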
static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
					   struct folio *folio, unsigned long flags)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	if (!folio_test_hugetlb_vmemmap_optimized(folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
	if (!ret) {
		folio_clear_hugetlb_vmemmap_optimized(folio);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return __hugetlb_vmemmap_restore_folio(h, folio, 0);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:			hstate.
 * @folio_list:		list of folios.
 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *		if an error was encountered restoring vmemmap for a folio.
 *		Folios that have vmemmap are moved to the non_hvo_folios
 *		list. Processing of entries stops when the first error is
 *		encountered. The folio that experienced the error and all
 *		non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
				    struct list_head *folio_list,
				    struct list_head *non_hvo_folios)
{
	struct folio *folio, *t_folio;
	long restored = 0;
	long ret = 0;

	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
			ret = __hugetlb_vmemmap_restore_folio(h, folio,
							      VMEMMAP_REMAP_NO_TLB_FLUSH);
			if (ret)
				break;
			restored++;
		}

		/* Add non-optimized folios to output list */
		list_move(&folio->lru, non_hvo_folios);
	}

	if (restored)
		flush_tlb_all();
	if (!ret)
		ret = restored;
	return ret;
}

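/*
 * Bulk-restore usage sketch (a hypothetical caller for illustration; the
 * real call sites live in mm/hugetlb.c):
 *
 *	LIST_HEAD(non_hvo_folios);
 *	long ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios);
 *
 * On success, ret is the number of folios whose vmemmap was restored and
 * every folio with intact vmemmap sits on non_hvo_folios. On error, the
 * failing folio and all unprocessed ones stay on folio_list, so the caller
 * can retry them or handle them one by one.
 */
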
/* Return true iff a HugeTLB folio's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
	if (folio_test_hugetlb_vmemmap_optimized(folio))
		return false;

	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	return true;
}

static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
					    struct folio *folio,
					    struct list_head *vmemmap_pages,
					    unsigned long flags)
{
	int ret = 0;
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
	if (!vmemmap_should_optimize_folio(h, folio))
		return ret;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);
	/*
	 * Very Subtle
	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
	 * immediately after remapping. As a result, subsequent accesses
	 * and modifications to struct pages associated with the hugetlb
	 * page could be to the OLD struct pages. Set the vmemmap optimized
	 * flag here so that it is copied to the new head page. This keeps
	 * the old and new struct pages in sync.
	 * If there is an error during optimization, we will immediately FLUSH
	 * the TLB and clear the flag below.
	 */
	folio_set_hugetlb_vmemmap_optimized(folio);

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to. Add pages previously
	 * mapping the range to vmemmap_pages list so that they can be freed by
	 * the caller.
	 */
	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
				 vmemmap_pages, flags);
	if (ret) {
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
		folio_clear_hugetlb_vmemmap_optimized(folio);
	}

	return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:		struct hstate.
 * @folio:	the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
	LIST_HEAD(vmemmap_pages);

	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
	free_vmemmap_page_list(&vmemmap_pages);
}

static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize_folio(h, folio))
		return 0;

	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse	= vmemmap_start;
	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end)
	 */
	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

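/*
 * The bulk optimizer below is structured as two passes to amortize TLB
 * flushes: first split every folio's vmemmap PMD (no per-folio flush, one
 * flush_tlb_all() afterwards), then remap every folio's PTEs with
 * VMEMMAP_REMAP_NO_TLB_FLUSH and flush once more at the end -- two global
 * flushes in the common case instead of two per folio.
 */
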
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
	struct folio *folio;
	LIST_HEAD(vmemmap_pages);

	list_for_each_entry(folio, folio_list, lru) {
		int ret = hugetlb_vmemmap_split_folio(h, folio);

		/*
		 * Splitting the PMD requires allocating a page, so let's fail
		 * early once we encounter the first OOM. There is no point in
		 * retrying, as it can be dynamically done on remap with the
		 * memory we get back from the vmemmap deduplication.
		 */
		if (ret == -ENOMEM)
			break;
	}

	flush_tlb_all();

	list_for_each_entry(folio, folio_list, lru) {
		int ret;

		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
						       VMEMMAP_REMAP_NO_TLB_FLUSH);

		/*
		 * Pages to be freed may have been accumulated. If we
		 * encounter an ENOMEM, free what we have and try again.
		 * This can occur in the case that both splitting fails
		 * halfway and head page allocation also failed. In this
		 * case __hugetlb_vmemmap_optimize_folio() would free memory
		 * allowing more vmemmap remaps to occur.
		 */
		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
			flush_tlb_all();
			free_vmemmap_page_list(&vmemmap_pages);
			INIT_LIST_HEAD(&vmemmap_pages);
			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
							 VMEMMAP_REMAP_NO_TLB_FLUSH);
		}
	}

	flush_tlb_all();
	free_vmemmap_page_list(&vmemmap_pages);
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
	{ }
};

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);