Commit | Line | Data |
---|---|---|
df3e764d LC |
1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* | |
3 | * Debugging module statistics. | |
4 | * | |
5 | * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> | |
6 | */ | |
7 | ||
8 | #include <linux/module.h> | |
635dc383 | 9 | #include <uapi/linux/module.h> |
df3e764d LC |
10 | #include <linux/string.h> |
11 | #include <linux/printk.h> | |
12 | #include <linux/slab.h> | |
13 | #include <linux/list.h> | |
14 | #include <linux/debugfs.h> | |
15 | #include <linux/rculist.h> | |
16 | #include <linux/math.h> | |
17 | ||
18 | #include "internal.h" | |
19 | ||
20 | /** | |
21 | * DOC: module debugging statistics overview | |
22 | * | |
23 | * Enabling CONFIG_MODULE_STATS enables module debugging statistics which | |
24 | * are useful to monitor and root cause memory pressure issues with module | |
25 | * loading. These statistics are useful to allow us to improve production | |
26 | * workloads. | |
27 | * | |
28 | * The current module debugging statistics supported help keep track of module | |
29 | * loading failures to enable improvements either for kernel module auto-loading | |
30 | * usage (request_module()) or interactions with userspace. Statistics are | |
31 | * provided to track all possible failures in the finit_module() path and memory | |
32 | * wasted in this process space. Each of the failure counters are associated | |
33 | * to a type of module loading failure which is known to incur a certain amount | |
34 | * of memory allocation loss. In the worst case loading a module will fail after | |
35 | * a 3 step memory allocation process: | |
36 | * | |
37 | * a) memory allocated with kernel_read_file_from_fd() | |
38 | * b) module decompression processes the file read from | |
39 | * kernel_read_file_from_fd(), and vmap() is used to map | |
40 | * the decompressed module to a new local buffer which represents | |
41 | * a copy of the decompressed module passed from userspace. The buffer | |
42 | * from kernel_read_file_from_fd() is freed right away. | |
43 | * c) layout_and_allocate() allocates space for the final resting | |
44 | * place where we would keep the module if it were to be processed | |
45 | * successfully. | |
46 | * | |
47 | * If a failure occurs after these three different allocations only one | |
48 | * counter will be incremented with the summation of the allocated bytes freed | |
49 | * incurred during this failure. Likewise, if module loading failed only after | |
50 | * step b) a separate counter is used and incremented for the bytes freed and | |
51 | * not used during both of those allocations. | |
52 | * | |
53 | * Virtual memory space can be limited, for example on x86 virtual memory size | |
54 | * defaults to 128 MiB. We should strive to limit and avoid wasting virtual | |
55 | * memory allocations when possible. These module debugging statistics help | |
56 | * to evaluate how much memory is being wasted on bootup due to module loading | |
57 | * failures. | |
58 | * | |
59 | * All counters are designed to be incremental. Atomic counters are used so to | |
60 | * remain simple and avoid delays and deadlocks. | |
61 | */ | |
62 | ||
63 | /** | |
64 | * DOC: dup_failed_modules - tracks duplicate failed modules | |
65 | * | |
66 | * Linked list of modules which failed to be loaded because an already existing | |
67 | * module with the same name was already being processed or already loaded. | |
68 | * The finit_module() system call incurs heavy virtual memory allocations. In | |
69 | * the worst case an finit_module() system call can end up allocating virtual | |
70 | * memory 3 times: | |
71 | * | |
72 | * 1) kernel_read_file_from_fd() call uses vmalloc() | |
73 | * 2) optional module decompression uses vmap() | |
74 | * 3) layout_and allocate() can use vzalloc() or an arch specific variation of | |
75 | * vmalloc to deal with ELF sections requiring special permissions | |
76 | * | |
77 | * In practice on a typical boot today most finit_module() calls fail due to | |
78 | * the module with the same name already being loaded or about to be processed. | |
79 | * All virtual memory allocated to these failed modules will be freed with | |
80 | * no functional use. | |
81 | * | |
82 | * To help with this the dup_failed_modules allows us to track modules which | |
83 | * failed to load due to the fact that a module was already loaded or being | |
84 | * processed. There are only two points at which we can fail such calls, | |
85 | * we list them below along with the number of virtual memory allocation | |
86 | * calls: | |
87 | * | |
88 | * a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before | |
064f4536 | 89 | * layout_and_allocate(). |
df3e764d LC |
90 | * - with module decompression: 2 virtual memory allocation calls |
91 | * - without module decompression: 1 virtual memory allocation calls | |
92 | * b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module() | |
93 | * - with module decompression 3 virtual memory allocation calls | |
94 | * - without module decompression 2 virtual memory allocation calls | |
95 | * | |
96 | * We should strive to get this list to be as small as possible. If this list | |
97 | * is not empty it is a reflection of possible work or optimizations possible | |
98 | * either in-kernel or in userspace. | |
99 | */ | |
100 | static LIST_HEAD(dup_failed_modules); | |
101 | ||
102 | /** | |
103 | * DOC: module statistics debugfs counters | |
104 | * | |
105 | * The total amount of wasted virtual memory allocation space during module | |
106 | * loading can be computed by adding the total from the summation: | |
107 | * | |
108 | * * @invalid_kread_bytes + | |
109 | * @invalid_decompress_bytes + | |
110 | * @invalid_becoming_bytes + | |
111 | * @invalid_mod_bytes | |
112 | * | |
113 | * The following debugfs counters are available to inspect module loading | |
114 | * failures: | |
115 | * | |
116 | * * total_mod_size: total bytes ever used by all modules we've dealt with on | |
117 | * this system | |
118 | * * total_text_size: total bytes of the .text and .init.text ELF section | |
119 | * sizes we've dealt with on this system | |
120 | * * invalid_kread_bytes: bytes allocated and then freed on failures which | |
121 | * happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd() | |
122 | * uses vmalloc(). These should typically not happen unless your system is | |
123 | * under memory pressure. | |
124 | * * invalid_decompress_bytes: number of bytes allocated and freed due to | |
125 | * memory allocations in the module decompression path that use vmap(). | |
126 | * These typically should not happen unless your system is under memory | |
127 | * pressure. | |
128 | * * invalid_becoming_bytes: total number of bytes allocated and freed used | |
fd06da77 | 129 | * to read the kernel module userspace wants us to read before we |
064f4536 LC |
130 | * promote it to be processed to be added to our @modules linked list. These |
131 | * failures can happen if we had a check in between a successful kernel_read_file_from_fd() | |
df3e764d LC |
132 | * call and right before we allocate the our private memory for the module |
133 | * which would be kept if the module is successfully loaded. The most common | |
134 | * reason for this failure is when userspace is racing to load a module | |
135 | * which it does not yet see loaded. The first module to succeed in | |
136 | * add_unformed_module() will add a module to our &modules list and | |
137 | * subsequent loads of modules with the same name will error out at the | |
064f4536 LC |
138 | * end of early_mod_check(). The check for module_patient_check_exists() |
139 | * at the end of early_mod_check() prevents duplicate allocations | |
df3e764d LC |
140 | * on layout_and_allocate() for modules already being processed. These |
141 | * duplicate failed modules are non-fatal, however they typically are | |
142 | * indicative of userspace not seeing a module in userspace loaded yet and | |
143 | * unnecessarily trying to load a module before the kernel even has a chance | |
144 | * to begin to process prior requests. Although duplicate failures can be | |
145 | * non-fatal, we should try to reduce vmalloc() pressure proactively, so | |
146 | * ideally after boot this will be close to as 0 as possible. If module | |
147 | * decompression was used we also add to this counter the cost of the | |
148 | * initial kernel_read_file_from_fd() of the compressed module. If module | |
149 | * decompression was not used the value represents the total allocated and | |
150 | * freed bytes in kernel_read_file_from_fd() calls for these type of | |
151 | * failures. These failures can occur because: | |
152 | * | |
153 | * * module_sig_check() - module signature checks | |
154 | * * elf_validity_cache_copy() - some ELF validation issue | |
155 | * * early_mod_check(): | |
156 | * | |
157 | * * blacklisting | |
158 | * * failed to rewrite section headers | |
159 | * * version magic | |
160 | * * live patch requirements didn't check out | |
161 | * * the module was detected as being already present | |
162 | * | |
163 | * * invalid_mod_bytes: these are the total number of bytes allocated and | |
164 | * freed due to failures after we did all the sanity checks of the module | |
165 | * which userspace passed to us and after our first check that the module | |
166 | * is unique. A module can still fail to load if we detect the module is | |
167 | * loaded after we allocate space for it with layout_and_allocate(), we do | |
168 | * this check right before processing the module as live and run its | |
169 | * initialization routines. Note that you have a failure of this type it | |
170 | * also means the respective kernel_read_file_from_fd() memory space was | |
171 | * also freed and not used, and so we increment this counter with twice | |
172 | * the size of the module. Additionally if you used module decompression | |
173 | * the size of the compressed module is also added to this counter. | |
174 | * | |
175 | * * modcount: how many modules we've loaded in our kernel life time | |
176 | * * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd() | |
177 | * * failed_decompress: how many failed module decompression attempts we've had. | |
178 | * These really should not happen unless your compression / decompression | |
179 | * might be broken. | |
180 | * * failed_becoming: how many modules failed after we kernel_read_file_from_fd() | |
181 | * it and before we allocate memory for it with layout_and_allocate(). This | |
182 | * counter is never incremented if you manage to validate the module and | |
183 | * call layout_and_allocate() for it. | |
184 | * * failed_load_modules: how many modules failed once we've allocated our | |
185 | * private space for our module using layout_and_allocate(). These failures | |
186 | * should hopefully mostly be dealt with already. Races in theory could | |
187 | * still exist here, but it would just mean the kernel had started processing | |
188 | * two threads concurrently up to early_mod_check() and one thread won. | |
189 | * These failures are good signs the kernel or userspace is doing something | |
190 | * seriously stupid or that could be improved. We should strive to fix these, | |
191 | * but it is perhaps not easy to fix them. A recent example are the modules | |
192 | * requests incurred for frequency modules, a separate module request was | |
193 | * being issued for each CPU on a system. | |
194 | */ | |
195 | ||
196 | atomic_long_t total_mod_size; | |
197 | atomic_long_t total_text_size; | |
198 | atomic_long_t invalid_kread_bytes; | |
199 | atomic_long_t invalid_decompress_bytes; | |
200 | static atomic_long_t invalid_becoming_bytes; | |
201 | static atomic_long_t invalid_mod_bytes; | |
202 | atomic_t modcount; | |
203 | atomic_t failed_kreads; | |
204 | atomic_t failed_decompress; | |
205 | static atomic_t failed_becoming; | |
206 | static atomic_t failed_load_modules; | |
207 | ||
208 | static const char *mod_fail_to_str(struct mod_fail_load *mod_fail) | |
209 | { | |
210 | if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) && | |
211 | test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask)) | |
212 | return "Becoming & Load"; | |
213 | if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask)) | |
214 | return "Becoming"; | |
215 | if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask)) | |
216 | return "Load"; | |
217 | return "Bug-on-stats"; | |
218 | } | |
219 | ||
220 | void mod_stat_bump_invalid(struct load_info *info, int flags) | |
221 | { | |
222 | atomic_long_add(info->len * 2, &invalid_mod_bytes); | |
223 | atomic_inc(&failed_load_modules); | |
224 | #if defined(CONFIG_MODULE_DECOMPRESS) | |
225 | if (flags & MODULE_INIT_COMPRESSED_FILE) | |
a81b1fc8 | 226 | atomic_long_add(info->compressed_len, &invalid_mod_bytes); |
df3e764d LC |
227 | #endif |
228 | } | |
229 | ||
230 | void mod_stat_bump_becoming(struct load_info *info, int flags) | |
231 | { | |
232 | atomic_inc(&failed_becoming); | |
233 | atomic_long_add(info->len, &invalid_becoming_bytes); | |
234 | #if defined(CONFIG_MODULE_DECOMPRESS) | |
235 | if (flags & MODULE_INIT_COMPRESSED_FILE) | |
236 | atomic_long_add(info->compressed_len, &invalid_becoming_bytes); | |
237 | #endif | |
238 | } | |
239 | ||
240 | int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason) | |
241 | { | |
242 | struct mod_fail_load *mod_fail; | |
243 | ||
244 | list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list, | |
245 | lockdep_is_held(&module_mutex)) { | |
246 | if (!strcmp(mod_fail->name, name)) { | |
247 | atomic_long_inc(&mod_fail->count); | |
248 | __set_bit(reason, &mod_fail->dup_fail_mask); | |
249 | goto out; | |
250 | } | |
251 | } | |
252 | ||
253 | mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL); | |
254 | if (!mod_fail) | |
255 | return -ENOMEM; | |
256 | memcpy(mod_fail->name, name, strlen(name)); | |
257 | __set_bit(reason, &mod_fail->dup_fail_mask); | |
258 | atomic_long_inc(&mod_fail->count); | |
259 | list_add_rcu(&mod_fail->list, &dup_failed_modules); | |
260 | out: | |
261 | return 0; | |
262 | } | |
263 | ||
264 | /* | |
265 | * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the | |
266 | * 112 module prints within 8k. | |
267 | * | |
268 | * 1024 + (64*112) = 8k | |
269 | */ | |
270 | #define MAX_PREAMBLE 1024 | |
271 | #define MAX_FAILED_MOD_PRINT 112 | |
272 | #define MAX_BYTES_PER_MOD 64 | |
273 | static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf, | |
274 | size_t count, loff_t *ppos) | |
275 | { | |
276 | struct mod_fail_load *mod_fail; | |
277 | unsigned int len, size, count_failed = 0; | |
278 | char *buf; | |
d36f6efb | 279 | int ret; |
df3e764d | 280 | u32 live_mod_count, fkreads, fdecompress, fbecoming, floads; |
719ccd80 AB |
281 | unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes, |
282 | idecompress_bytes, imod_bytes, total_virtual_lost; | |
df3e764d LC |
283 | |
284 | live_mod_count = atomic_read(&modcount); | |
285 | fkreads = atomic_read(&failed_kreads); | |
286 | fdecompress = atomic_read(&failed_decompress); | |
287 | fbecoming = atomic_read(&failed_becoming); | |
288 | floads = atomic_read(&failed_load_modules); | |
289 | ||
719ccd80 AB |
290 | total_size = atomic_long_read(&total_mod_size); |
291 | text_size = atomic_long_read(&total_text_size); | |
292 | ikread_bytes = atomic_long_read(&invalid_kread_bytes); | |
293 | idecompress_bytes = atomic_long_read(&invalid_decompress_bytes); | |
294 | ibecoming_bytes = atomic_long_read(&invalid_becoming_bytes); | |
295 | imod_bytes = atomic_long_read(&invalid_mod_bytes); | |
df3e764d LC |
296 | |
297 | total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes; | |
298 | ||
299 | size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming), | |
300 | (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD; | |
301 | buf = kzalloc(size, GFP_KERNEL); | |
302 | if (buf == NULL) | |
303 | return -ENOMEM; | |
304 | ||
305 | /* The beginning of our debug preamble */ | |
9f5cab17 | 306 | len = scnprintf(buf, size, "%25s\t%u\n", "Mods ever loaded", live_mod_count); |
df3e764d LC |
307 | |
308 | len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads); | |
309 | ||
310 | len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress", | |
311 | fdecompress); | |
312 | len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming); | |
313 | ||
314 | len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads); | |
315 | ||
719ccd80 AB |
316 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total module size", total_size); |
317 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total mod text size", text_size); | |
df3e764d | 318 | |
719ccd80 | 319 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kread bytes", ikread_bytes); |
df3e764d | 320 | |
719ccd80 | 321 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed decompress bytes", |
df3e764d LC |
322 | idecompress_bytes); |
323 | ||
719ccd80 | 324 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed becoming bytes", ibecoming_bytes); |
df3e764d | 325 | |
719ccd80 | 326 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kmod bytes", imod_bytes); |
df3e764d | 327 | |
719ccd80 | 328 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Virtual mem wasted bytes", total_virtual_lost); |
df3e764d LC |
329 | |
330 | if (live_mod_count && total_size) { | |
719ccd80 | 331 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod size", |
df3e764d LC |
332 | DIV_ROUND_UP(total_size, live_mod_count)); |
333 | } | |
334 | ||
335 | if (live_mod_count && text_size) { | |
719ccd80 | 336 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod text size", |
df3e764d LC |
337 | DIV_ROUND_UP(text_size, live_mod_count)); |
338 | } | |
339 | ||
340 | /* | |
341 | * We use WARN_ON_ONCE() for the counters to ensure we always have parity | |
342 | * for keeping tabs on a type of failure with one type of byte counter. | |
343 | * The counters for imod_bytes does not increase for fkreads failures | |
344 | * for example, and so on. | |
345 | */ | |
346 | ||
347 | WARN_ON_ONCE(ikread_bytes && !fkreads); | |
348 | if (fkreads && ikread_bytes) { | |
719ccd80 | 349 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail kread bytes", |
df3e764d LC |
350 | DIV_ROUND_UP(ikread_bytes, fkreads)); |
351 | } | |
352 | ||
353 | WARN_ON_ONCE(ibecoming_bytes && !fbecoming); | |
354 | if (fbecoming && ibecoming_bytes) { | |
719ccd80 | 355 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail becoming bytes", |
df3e764d LC |
356 | DIV_ROUND_UP(ibecoming_bytes, fbecoming)); |
357 | } | |
358 | ||
359 | WARN_ON_ONCE(idecompress_bytes && !fdecompress); | |
360 | if (fdecompress && idecompress_bytes) { | |
719ccd80 | 361 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail decomp bytes", |
df3e764d LC |
362 | DIV_ROUND_UP(idecompress_bytes, fdecompress)); |
363 | } | |
364 | ||
365 | WARN_ON_ONCE(imod_bytes && !floads); | |
366 | if (floads && imod_bytes) { | |
719ccd80 | 367 | len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average fail load bytes", |
df3e764d LC |
368 | DIV_ROUND_UP(imod_bytes, floads)); |
369 | } | |
370 | ||
371 | /* End of our debug preamble header. */ | |
372 | ||
373 | /* Catch when we've gone beyond our expected preamble */ | |
374 | WARN_ON_ONCE(len >= MAX_PREAMBLE); | |
375 | ||
376 | if (list_empty(&dup_failed_modules)) | |
377 | goto out; | |
378 | ||
379 | len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n"); | |
380 | len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n", | |
381 | "Module-name", "How-many-times", "Reason"); | |
382 | mutex_lock(&module_mutex); | |
383 | ||
384 | ||
385 | list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) { | |
386 | if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT)) | |
387 | goto out_unlock; | |
719ccd80 AB |
388 | len += scnprintf(buf + len, size - len, "%25s\t%15lu\t%25s\n", mod_fail->name, |
389 | atomic_long_read(&mod_fail->count), mod_fail_to_str(mod_fail)); | |
df3e764d LC |
390 | } |
391 | out_unlock: | |
392 | mutex_unlock(&module_mutex); | |
393 | out: | |
d36f6efb | 394 | ret = simple_read_from_buffer(user_buf, count, ppos, buf, len); |
df3e764d | 395 | kfree(buf); |
d36f6efb | 396 | return ret; |
df3e764d LC |
397 | } |
398 | #undef MAX_PREAMBLE | |
399 | #undef MAX_FAILED_MOD_PRINT | |
400 | #undef MAX_BYTES_PER_MOD | |
401 | ||
402 | static const struct file_operations fops_mod_stats = { | |
403 | .read = read_file_mod_stats, | |
404 | .open = simple_open, | |
405 | .owner = THIS_MODULE, | |
406 | .llseek = default_llseek, | |
407 | }; | |
408 | ||
409 | #define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter) | |
410 | #define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name) | |
411 | static int __init module_stats_init(void) | |
412 | { | |
413 | mod_debug_add_ulong(total_mod_size); | |
414 | mod_debug_add_ulong(total_text_size); | |
415 | mod_debug_add_ulong(invalid_kread_bytes); | |
416 | mod_debug_add_ulong(invalid_decompress_bytes); | |
417 | mod_debug_add_ulong(invalid_becoming_bytes); | |
418 | mod_debug_add_ulong(invalid_mod_bytes); | |
419 | ||
420 | mod_debug_add_atomic(modcount); | |
421 | mod_debug_add_atomic(failed_kreads); | |
422 | mod_debug_add_atomic(failed_decompress); | |
423 | mod_debug_add_atomic(failed_becoming); | |
424 | mod_debug_add_atomic(failed_load_modules); | |
425 | ||
426 | debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats); | |
427 | ||
428 | return 0; | |
429 | } | |
430 | #undef mod_debug_add_ulong | |
431 | #undef mod_debug_add_atomic | |
432 | module_init(module_stats_init); |