// SPDX-License-Identifier: GPL-2.0
/*
 * Optimize vmemmap pages associated with HugeTLB
 *
 * Copyright (c) 2020, Bytedance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/memory.h>
#include "hugetlb_vmemmap.h"

/*
 * There are a lot of struct page structures associated with each HugeTLB page.
 * For tail pages, the value of compound_head is the same, so we can reuse the
 * first page of the head page structures. We map the virtual addresses of all
 * the pages of tail page structures to the head page struct, and then free
 * these page frames. Therefore, we need to reserve one page as the vmemmap
 * area.
 */
#define RESERVE_VMEMMAP_NR		1U
#define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)

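/*
 * For example, assuming x86_64 with 4 KB base pages and a 64-byte
 * struct page: a 2 MB HugeTLB page has 512 struct pages occupying
 * 8 vmemmap pages. The first vmemmap page is the reserved one; the
 * remaining 7 are remapped to it and their page frames are freed.
 */
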
enum vmemmap_optimize_mode {
	VMEMMAP_OPTIMIZE_OFF,
	VMEMMAP_OPTIMIZE_ON,
};

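/*
 * The static key defaults to enabled only when
 * CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON is set, matching the
 * initial value of vmemmap_optimize_mode below.
 */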
DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
			hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static enum vmemmap_optimize_mode vmemmap_optimize_mode =
	IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);

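/*
 * Keep the enable count of the static key in sync with the requested
 * mode so that code testing hugetlb_optimize_vmemmap_key observes the
 * current setting.
 */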
static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to)
{
	if (vmemmap_optimize_mode == to)
		return;

	if (to == VMEMMAP_OPTIMIZE_OFF)
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	else
		static_branch_inc(&hugetlb_optimize_vmemmap_key);
	WRITE_ONCE(vmemmap_optimize_mode, to);
}

static int __init hugetlb_vmemmap_early_param(char *buf)
{
	bool enable;
	enum vmemmap_optimize_mode mode;

	if (kstrtobool(buf, &enable))
		return -EINVAL;

	mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF;
	vmemmap_optimize_mode_switch(mode);

	return 0;
}
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param);
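
/*
 * e.g. booting with "hugetlb_free_vmemmap=on" enables the optimization;
 * kstrtobool() also accepts "off", "1"/"0" and "y"/"n".
 */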

/*
 * Previously discarded vmemmap pages will be allocated and remapped
 * after this function returns zero.
 */
int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
{
	int ret;
	unsigned long vmemmap_addr = (unsigned long)head;
	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;

	if (!HPageVmemmapOptimized(head))
		return 0;

	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
	vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
	vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;

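	/*
	 * With the 2 MB example above, @vmemmap_addr now points at the
	 * second vmemmap page, @vmemmap_end just past the eighth (last)
	 * one, and @vmemmap_reuse back at the reserved first page.
	 */
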
	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_addr,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be allocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
				  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
	if (!ret) {
		ClearHPageVmemmapOptimized(head);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

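/*
 * Return the number of vmemmap pages associated with @head that can be
 * optimized away, or 0 when the optimization is disabled or when @head's
 * vmemmap pages are self-hosted (memory_hotplug.memmap_on_memory), which
 * is incompatible with this optimization.
 */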
static unsigned int vmemmap_optimizable_pages(struct hstate *h,
					      struct page *head)
{
	if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
		return 0;

	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
		pmd_t *pmdp, pmd;
		struct page *vmemmap_page;
		unsigned long vaddr = (unsigned long)head;

		/*
		 * Only the vmemmap page's vmemmap page can be self-hosted.
		 * Walk the page tables to find the backing page of the
		 * vmemmap page.
		 */
		pmdp = pmd_off_k(vaddr);
		/*
		 * The READ_ONCE() is used to stabilize *pmdp in a register or
		 * on the stack so that it will stop changing under the code.
		 * The only concurrent operation where it can be changed is
		 * split_vmemmap_huge_pmd() (*pmdp will be stable after this
		 * operation).
		 */
		pmd = READ_ONCE(*pmdp);
		if (pmd_leaf(pmd))
			vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
		else
			vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
		/*
		 * Due to HugeTLB alignment requirements and the vmemmap pages
		 * being at the start of the hotplugged memory region in the
		 * memory_hotplug.memmap_on_memory case, checking whether any
		 * vmemmap page's vmemmap page is marked as VmemmapSelfHosted
		 * is sufficient.
		 *
		 * [                  hotplugged memory                  ]
		 * [        section        ][...][        section        ]
		 * [ vmemmap ][              usable memory               ]
		 *   ^   |     |                                        |
		 *   +---+     |                                        |
		 *     ^       |                                        |
		 *     +-------+                                        |
		 *          ^                                           |
		 *          +-------------------------------------------+
		 */
		if (PageVmemmapSelfHosted(vmemmap_page))
			return 0;
	}

	return hugetlb_optimize_vmemmap_pages(h);
}

void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
{
	unsigned long vmemmap_addr = (unsigned long)head;
	unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;

	vmemmap_pages = vmemmap_optimizable_pages(h, head);
	if (!vmemmap_pages)
		return;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
	vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to, then free the pages
	 * which the range [@vmemmap_addr, @vmemmap_end) is mapped to.
	 */
	if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	else
		SetHPageVmemmapOptimized(head);
}
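
/*
 * With the example assumptions above (x86_64, 64-byte struct page),
 * this frees 7 of the 8 vmemmap pages backing a 2 MB HugeTLB page,
 * i.e. about 28 KB of memmap per huge page (and 4095 pages, roughly
 * 16 MB, per 1 GB huge page).
 */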

void __init hugetlb_vmemmap_init(struct hstate *h)
{
	unsigned int nr_pages = pages_per_huge_page(h);
	unsigned int vmemmap_pages;

	/*
	 * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
	 * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
	 * is enabled, so add a BUILD_BUG_ON to catch invalid usage of the
	 * tail struct page.
	 */
	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));

	if (!is_power_of_2(sizeof(struct page))) {
		pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
		static_branch_disable(&hugetlb_optimize_vmemmap_key);
		return;
	}

	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
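	/*
	 * e.g. with 4 KB base pages and a 64-byte struct page, a 2 MB
	 * HugeTLB page gives (512 * 64) >> 12 = 8 vmemmap pages, of which
	 * 8 - 1 = 7 can be optimized away.
	 */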
	/*
	 * The head page is not to be freed to the buddy allocator; the other
	 * tail pages will map to the head page, so they can be freed.
	 *
	 * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
	 * on some architectures (e.g. aarch64). See
	 * Documentation/arm64/hugetlbpage.rst for more details.
	 */
	if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
		h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;

	pr_info("can optimize %d vmemmap pages for %s\n",
		h->optimize_vmemmap_pages, h->name);
}

#ifdef CONFIG_PROC_SYSCTL
static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write,
					    void *buffer, size_t *length,
					    loff_t *ppos)
{
	int ret;
	enum vmemmap_optimize_mode mode;
	static DEFINE_MUTEX(sysctl_mutex);

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	mutex_lock(&sysctl_mutex);
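	/*
	 * proc_dointvec_minmax() reads and writes through table->data, so
	 * point it at a local copy; the shared mode (and the static key)
	 * is only updated via vmemmap_optimize_mode_switch() while the
	 * mutex is held.
	 */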
	mode = vmemmap_optimize_mode;
	table->data = &mode;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (write && !ret)
		vmemmap_optimize_mode_switch(mode);
	mutex_unlock(&sysctl_mutex);

	return ret;
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.maxlen		= sizeof(enum vmemmap_optimize_mode),
		.mode		= 0644,
		.proc_handler	= hugetlb_optimize_vmemmap_handler,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }
};
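
/*
 * e.g. "sysctl vm.hugetlb_optimize_vmemmap=1" (or writing to
 * /proc/sys/vm/hugetlb_optimize_vmemmap) toggles the optimization at
 * runtime for subsequently allocated HugeTLB pages.
 */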

static __init int hugetlb_vmemmap_sysctls_init(void)
{
	/*
	 * If "struct page" crosses page boundaries, the vmemmap pages cannot
	 * be optimized.
	 */
	if (is_power_of_2(sizeof(struct page)))
		register_sysctl_init("vm", hugetlb_vmemmap_sysctls);

	return 0;
}
late_initcall(hugetlb_vmemmap_sysctls_init);
#endif /* CONFIG_PROC_SYSCTL */