module: avoid allocation if module is already present and ready
[linux-2.6-block.git] / kernel / module / stats.c
CommitLineData
df3e764d
LC
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Debugging module statistics.
4 *
5 * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
6 */
7
8#include <linux/module.h>
9#include <linux/string.h>
10#include <linux/printk.h>
11#include <linux/slab.h>
12#include <linux/list.h>
13#include <linux/debugfs.h>
14#include <linux/rculist.h>
15#include <linux/math.h>
16
17#include "internal.h"
18
19/**
20 * DOC: module debugging statistics overview
21 *
22 * Enabling CONFIG_MODULE_STATS enables module debugging statistics which
23 * are useful to monitor and root cause memory pressure issues with module
24 * loading. These statistics are useful to allow us to improve production
25 * workloads.
26 *
27 * The current module debugging statistics supported help keep track of module
28 * loading failures to enable improvements either for kernel module auto-loading
29 * usage (request_module()) or interactions with userspace. Statistics are
30 * provided to track all possible failures in the finit_module() path and memory
31 * wasted in this process space. Each of the failure counters are associated
32 * to a type of module loading failure which is known to incur a certain amount
33 * of memory allocation loss. In the worst case loading a module will fail after
34 * a 3 step memory allocation process:
35 *
36 * a) memory allocated with kernel_read_file_from_fd()
37 * b) module decompression processes the file read from
38 * kernel_read_file_from_fd(), and vmap() is used to map
39 * the decompressed module to a new local buffer which represents
40 * a copy of the decompressed module passed from userspace. The buffer
41 * from kernel_read_file_from_fd() is freed right away.
42 * c) layout_and_allocate() allocates space for the final resting
43 * place where we would keep the module if it were to be processed
44 * successfully.
45 *
46 * If a failure occurs after these three different allocations only one
47 * counter will be incremented with the summation of the allocated bytes freed
48 * incurred during this failure. Likewise, if module loading failed only after
49 * step b) a separate counter is used and incremented for the bytes freed and
50 * not used during both of those allocations.
51 *
52 * Virtual memory space can be limited, for example on x86 virtual memory size
53 * defaults to 128 MiB. We should strive to limit and avoid wasting virtual
54 * memory allocations when possible. These module debugging statistics help
55 * to evaluate how much memory is being wasted on bootup due to module loading
56 * failures.
57 *
58 * All counters are designed to be incremental. Atomic counters are used so to
59 * remain simple and avoid delays and deadlocks.
60 */
61
62/**
63 * DOC: dup_failed_modules - tracks duplicate failed modules
64 *
65 * Linked list of modules which failed to be loaded because an already existing
66 * module with the same name was already being processed or already loaded.
67 * The finit_module() system call incurs heavy virtual memory allocations. In
68 * the worst case an finit_module() system call can end up allocating virtual
69 * memory 3 times:
70 *
71 * 1) kernel_read_file_from_fd() call uses vmalloc()
72 * 2) optional module decompression uses vmap()
73 * 3) layout_and allocate() can use vzalloc() or an arch specific variation of
74 * vmalloc to deal with ELF sections requiring special permissions
75 *
76 * In practice on a typical boot today most finit_module() calls fail due to
77 * the module with the same name already being loaded or about to be processed.
78 * All virtual memory allocated to these failed modules will be freed with
79 * no functional use.
80 *
81 * To help with this the dup_failed_modules allows us to track modules which
82 * failed to load due to the fact that a module was already loaded or being
83 * processed. There are only two points at which we can fail such calls,
84 * we list them below along with the number of virtual memory allocation
85 * calls:
86 *
87 * a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before
064f4536 88 * layout_and_allocate().
df3e764d
LC
89 * - with module decompression: 2 virtual memory allocation calls
90 * - without module decompression: 1 virtual memory allocation calls
91 * b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module()
92 * - with module decompression 3 virtual memory allocation calls
93 * - without module decompression 2 virtual memory allocation calls
94 *
95 * We should strive to get this list to be as small as possible. If this list
96 * is not empty it is a reflection of possible work or optimizations possible
97 * either in-kernel or in userspace.
98 */
99static LIST_HEAD(dup_failed_modules);
100
101/**
102 * DOC: module statistics debugfs counters
103 *
104 * The total amount of wasted virtual memory allocation space during module
105 * loading can be computed by adding the total from the summation:
106 *
107 * * @invalid_kread_bytes +
108 * @invalid_decompress_bytes +
109 * @invalid_becoming_bytes +
110 * @invalid_mod_bytes
111 *
112 * The following debugfs counters are available to inspect module loading
113 * failures:
114 *
115 * * total_mod_size: total bytes ever used by all modules we've dealt with on
116 * this system
117 * * total_text_size: total bytes of the .text and .init.text ELF section
118 * sizes we've dealt with on this system
119 * * invalid_kread_bytes: bytes allocated and then freed on failures which
120 * happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd()
121 * uses vmalloc(). These should typically not happen unless your system is
122 * under memory pressure.
123 * * invalid_decompress_bytes: number of bytes allocated and freed due to
124 * memory allocations in the module decompression path that use vmap().
125 * These typically should not happen unless your system is under memory
126 * pressure.
127 * * invalid_becoming_bytes: total number of bytes allocated and freed used
128 * used to read the kernel module userspace wants us to read before we
064f4536
LC
129 * promote it to be processed to be added to our @modules linked list. These
130 * failures can happen if we had a check in between a successful kernel_read_file_from_fd()
df3e764d
LC
131 * call and right before we allocate the our private memory for the module
132 * which would be kept if the module is successfully loaded. The most common
133 * reason for this failure is when userspace is racing to load a module
134 * which it does not yet see loaded. The first module to succeed in
135 * add_unformed_module() will add a module to our &modules list and
136 * subsequent loads of modules with the same name will error out at the
064f4536
LC
137 * end of early_mod_check(). The check for module_patient_check_exists()
138 * at the end of early_mod_check() prevents duplicate allocations
df3e764d
LC
139 * on layout_and_allocate() for modules already being processed. These
140 * duplicate failed modules are non-fatal, however they typically are
141 * indicative of userspace not seeing a module in userspace loaded yet and
142 * unnecessarily trying to load a module before the kernel even has a chance
143 * to begin to process prior requests. Although duplicate failures can be
144 * non-fatal, we should try to reduce vmalloc() pressure proactively, so
145 * ideally after boot this will be close to as 0 as possible. If module
146 * decompression was used we also add to this counter the cost of the
147 * initial kernel_read_file_from_fd() of the compressed module. If module
148 * decompression was not used the value represents the total allocated and
149 * freed bytes in kernel_read_file_from_fd() calls for these type of
150 * failures. These failures can occur because:
151 *
152 * * module_sig_check() - module signature checks
153 * * elf_validity_cache_copy() - some ELF validation issue
154 * * early_mod_check():
155 *
156 * * blacklisting
157 * * failed to rewrite section headers
158 * * version magic
159 * * live patch requirements didn't check out
160 * * the module was detected as being already present
161 *
162 * * invalid_mod_bytes: these are the total number of bytes allocated and
163 * freed due to failures after we did all the sanity checks of the module
164 * which userspace passed to us and after our first check that the module
165 * is unique. A module can still fail to load if we detect the module is
166 * loaded after we allocate space for it with layout_and_allocate(), we do
167 * this check right before processing the module as live and run its
168 * initialization routines. Note that you have a failure of this type it
169 * also means the respective kernel_read_file_from_fd() memory space was
170 * also freed and not used, and so we increment this counter with twice
171 * the size of the module. Additionally if you used module decompression
172 * the size of the compressed module is also added to this counter.
173 *
174 * * modcount: how many modules we've loaded in our kernel life time
175 * * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd()
176 * * failed_decompress: how many failed module decompression attempts we've had.
177 * These really should not happen unless your compression / decompression
178 * might be broken.
179 * * failed_becoming: how many modules failed after we kernel_read_file_from_fd()
180 * it and before we allocate memory for it with layout_and_allocate(). This
181 * counter is never incremented if you manage to validate the module and
182 * call layout_and_allocate() for it.
183 * * failed_load_modules: how many modules failed once we've allocated our
184 * private space for our module using layout_and_allocate(). These failures
185 * should hopefully mostly be dealt with already. Races in theory could
186 * still exist here, but it would just mean the kernel had started processing
187 * two threads concurrently up to early_mod_check() and one thread won.
188 * These failures are good signs the kernel or userspace is doing something
189 * seriously stupid or that could be improved. We should strive to fix these,
190 * but it is perhaps not easy to fix them. A recent example are the modules
191 * requests incurred for frequency modules, a separate module request was
192 * being issued for each CPU on a system.
193 */
194
195atomic_long_t total_mod_size;
196atomic_long_t total_text_size;
197atomic_long_t invalid_kread_bytes;
198atomic_long_t invalid_decompress_bytes;
199static atomic_long_t invalid_becoming_bytes;
200static atomic_long_t invalid_mod_bytes;
201atomic_t modcount;
202atomic_t failed_kreads;
203atomic_t failed_decompress;
204static atomic_t failed_becoming;
205static atomic_t failed_load_modules;
206
207static const char *mod_fail_to_str(struct mod_fail_load *mod_fail)
208{
209 if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) &&
210 test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
211 return "Becoming & Load";
212 if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask))
213 return "Becoming";
214 if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
215 return "Load";
216 return "Bug-on-stats";
217}
218
219void mod_stat_bump_invalid(struct load_info *info, int flags)
220{
221 atomic_long_add(info->len * 2, &invalid_mod_bytes);
222 atomic_inc(&failed_load_modules);
223#if defined(CONFIG_MODULE_DECOMPRESS)
224 if (flags & MODULE_INIT_COMPRESSED_FILE)
225 atomic_long_add(info->compressed_len, &invalid_mod_byte);
226#endif
227}
228
229void mod_stat_bump_becoming(struct load_info *info, int flags)
230{
231 atomic_inc(&failed_becoming);
232 atomic_long_add(info->len, &invalid_becoming_bytes);
233#if defined(CONFIG_MODULE_DECOMPRESS)
234 if (flags & MODULE_INIT_COMPRESSED_FILE)
235 atomic_long_add(info->compressed_len, &invalid_becoming_bytes);
236#endif
237}
238
239int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason)
240{
241 struct mod_fail_load *mod_fail;
242
243 list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list,
244 lockdep_is_held(&module_mutex)) {
245 if (!strcmp(mod_fail->name, name)) {
246 atomic_long_inc(&mod_fail->count);
247 __set_bit(reason, &mod_fail->dup_fail_mask);
248 goto out;
249 }
250 }
251
252 mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL);
253 if (!mod_fail)
254 return -ENOMEM;
255 memcpy(mod_fail->name, name, strlen(name));
256 __set_bit(reason, &mod_fail->dup_fail_mask);
257 atomic_long_inc(&mod_fail->count);
258 list_add_rcu(&mod_fail->list, &dup_failed_modules);
259out:
260 return 0;
261}
262
263/*
264 * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the
265 * 112 module prints within 8k.
266 *
267 * 1024 + (64*112) = 8k
268 */
269#define MAX_PREAMBLE 1024
270#define MAX_FAILED_MOD_PRINT 112
271#define MAX_BYTES_PER_MOD 64
272static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
273 size_t count, loff_t *ppos)
274{
275 struct mod_fail_load *mod_fail;
276 unsigned int len, size, count_failed = 0;
277 char *buf;
278 u32 live_mod_count, fkreads, fdecompress, fbecoming, floads;
279 u64 total_size, text_size, ikread_bytes, ibecoming_bytes, idecompress_bytes, imod_bytes,
280 total_virtual_lost;
281
282 live_mod_count = atomic_read(&modcount);
283 fkreads = atomic_read(&failed_kreads);
284 fdecompress = atomic_read(&failed_decompress);
285 fbecoming = atomic_read(&failed_becoming);
286 floads = atomic_read(&failed_load_modules);
287
288 total_size = atomic64_read(&total_mod_size);
289 text_size = atomic64_read(&total_text_size);
290 ikread_bytes = atomic64_read(&invalid_kread_bytes);
291 idecompress_bytes = atomic64_read(&invalid_decompress_bytes);
292 ibecoming_bytes = atomic64_read(&invalid_becoming_bytes);
293 imod_bytes = atomic64_read(&invalid_mod_bytes);
294
295 total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes;
296
297 size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming),
298 (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD;
299 buf = kzalloc(size, GFP_KERNEL);
300 if (buf == NULL)
301 return -ENOMEM;
302
303 /* The beginning of our debug preamble */
304 len = scnprintf(buf + 0, size - len, "%25s\t%u\n", "Mods ever loaded", live_mod_count);
305
306 len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads);
307
308 len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress",
309 fdecompress);
310 len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming);
311
312 len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads);
313
314 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Total module size", total_size);
315 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Total mod text size", text_size);
316
317 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed kread bytes", ikread_bytes);
318
319 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed decompress bytes",
320 idecompress_bytes);
321
322 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed becoming bytes", ibecoming_bytes);
323
324 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Failed kmod bytes", imod_bytes);
325
326 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Virtual mem wasted bytes", total_virtual_lost);
327
328 if (live_mod_count && total_size) {
329 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average mod size",
330 DIV_ROUND_UP(total_size, live_mod_count));
331 }
332
333 if (live_mod_count && text_size) {
334 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average mod text size",
335 DIV_ROUND_UP(text_size, live_mod_count));
336 }
337
338 /*
339 * We use WARN_ON_ONCE() for the counters to ensure we always have parity
340 * for keeping tabs on a type of failure with one type of byte counter.
341 * The counters for imod_bytes does not increase for fkreads failures
342 * for example, and so on.
343 */
344
345 WARN_ON_ONCE(ikread_bytes && !fkreads);
346 if (fkreads && ikread_bytes) {
347 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Avg fail kread bytes",
348 DIV_ROUND_UP(ikread_bytes, fkreads));
349 }
350
351 WARN_ON_ONCE(ibecoming_bytes && !fbecoming);
352 if (fbecoming && ibecoming_bytes) {
353 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Avg fail becoming bytes",
354 DIV_ROUND_UP(ibecoming_bytes, fbecoming));
355 }
356
357 WARN_ON_ONCE(idecompress_bytes && !fdecompress);
358 if (fdecompress && idecompress_bytes) {
359 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Avg fail decomp bytes",
360 DIV_ROUND_UP(idecompress_bytes, fdecompress));
361 }
362
363 WARN_ON_ONCE(imod_bytes && !floads);
364 if (floads && imod_bytes) {
365 len += scnprintf(buf + len, size - len, "%25s\t%llu\n", "Average fail load bytes",
366 DIV_ROUND_UP(imod_bytes, floads));
367 }
368
369 /* End of our debug preamble header. */
370
371 /* Catch when we've gone beyond our expected preamble */
372 WARN_ON_ONCE(len >= MAX_PREAMBLE);
373
374 if (list_empty(&dup_failed_modules))
375 goto out;
376
377 len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n");
378 len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n",
379 "Module-name", "How-many-times", "Reason");
380 mutex_lock(&module_mutex);
381
382
383 list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) {
384 if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT))
385 goto out_unlock;
386 len += scnprintf(buf + len, size - len, "%25s\t%15llu\t%25s\n", mod_fail->name,
387 atomic64_read(&mod_fail->count), mod_fail_to_str(mod_fail));
388 }
389out_unlock:
390 mutex_unlock(&module_mutex);
391out:
392 kfree(buf);
393 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
394}
395#undef MAX_PREAMBLE
396#undef MAX_FAILED_MOD_PRINT
397#undef MAX_BYTES_PER_MOD
398
399static const struct file_operations fops_mod_stats = {
400 .read = read_file_mod_stats,
401 .open = simple_open,
402 .owner = THIS_MODULE,
403 .llseek = default_llseek,
404};
405
406#define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter)
407#define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name)
408static int __init module_stats_init(void)
409{
410 mod_debug_add_ulong(total_mod_size);
411 mod_debug_add_ulong(total_text_size);
412 mod_debug_add_ulong(invalid_kread_bytes);
413 mod_debug_add_ulong(invalid_decompress_bytes);
414 mod_debug_add_ulong(invalid_becoming_bytes);
415 mod_debug_add_ulong(invalid_mod_bytes);
416
417 mod_debug_add_atomic(modcount);
418 mod_debug_add_atomic(failed_kreads);
419 mod_debug_add_atomic(failed_decompress);
420 mod_debug_add_atomic(failed_becoming);
421 mod_debug_add_atomic(failed_load_modules);
422
423 debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats);
424
425 return 0;
426}
427#undef mod_debug_add_ulong
428#undef mod_debug_add_atomic
429module_init(module_stats_init);