// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stack depot - a stack trace storage that avoids duplication.
 *
 * Internally, stack depot maintains a hash table of unique stacktraces. The
 * stack traces themselves are stored contiguously one after another in a set
 * of separate page allocations.
 *
 * Author: Alexander Potapenko <glider@google.com>
 * Copyright (C) 2016 Google, Inc.
 *
 * Based on the code by Dmitry Chernenkov.
 */

#define pr_fmt(fmt) "stackdepot: " fmt

#include <linux/debugfs.h>
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/kmsan.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/poison.h>
#include <linux/printk.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/memblock.h>
#include <linux/kasan-enabled.h>

#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)

#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
#define DEPOT_STACK_ALIGN 4
#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
			       STACK_DEPOT_EXTRA_BITS)
#define DEPOT_POOLS_CAP 8192
#define DEPOT_MAX_POOLS \
	(((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
	 (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)

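/*
 * Illustrative arithmetic, assuming 4 KB pages (PAGE_SHIFT == 12): each pool
 * spans 2^(12 + 2) = 16 KB, and with DEPOT_POOLS_CAP limiting DEPOT_MAX_POOLS
 * to 8192, the depot can hold at most 8192 * 16 KB = 128 MB of stack records.
 */
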
/* Compact structure that stores a reference to a stack. */
union handle_parts {
	depot_stack_handle_t handle;
	struct {
		u32 pool_index	: DEPOT_POOL_INDEX_BITS;
		u32 offset	: DEPOT_OFFSET_BITS;
		u32 extra	: STACK_DEPOT_EXTRA_BITS;
	};
};
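
/*
 * Worked example of the bit split above, assuming PAGE_SHIFT == 12 and
 * STACK_DEPOT_EXTRA_BITS == 5: DEPOT_OFFSET_BITS is 2 + 12 - 4 = 10, so a
 * 32-bit handle is carved into 17 bits of pool index, 10 bits of offset in
 * 16-byte (1 << DEPOT_STACK_ALIGN) units, and 5 extra bits left to users
 * such as KMSAN. Handle value 0 is treated as "no stack" throughout.
 */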

struct stack_record {
	struct list_head hash_list;	/* Links in the hash table */
	u32 hash;			/* Hash in hash table */
	u32 size;			/* Number of stored frames */
	union handle_parts handle;	/* Constant after initialization */
	refcount_t count;
	union {
		unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES];	/* Frames */
		struct {
			/*
			 * An important invariant of the implementation is to
			 * place a stack record onto the freelist only if its
			 * refcount is zero. Because stack records with a zero
			 * refcount are never considered valid, it is safe to
			 * union @entries and the freelist management state
			 * below. Conversely, as soon as an entry is off the
			 * freelist and its refcount becomes non-zero, the
			 * fields below must not be accessed until the entry
			 * is placed back on the freelist.
			 */
			struct list_head free_list;	/* Links in the freelist */
			unsigned long rcu_state;	/* RCU cookie */
		};
	};
};

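/*
 * Size sketch, assuming a 64-bit kernel and the default
 * CONFIG_STACKDEPOT_MAX_FRAMES of 64: @entries alone occupies
 * 64 * 8 = 512 bytes, so a full-size record is roughly half a kilobyte
 * and a 16 KB pool fits about 30 such records.
 */
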
static bool stack_depot_disabled;
static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
static bool __stack_depot_early_init_passed __initdata;

/* Use one hash table bucket per 16 KB of memory. */
#define STACK_HASH_TABLE_SCALE 14
/* Limit the number of buckets between 4K and 1M. */
#define STACK_BUCKET_NUMBER_ORDER_MIN 12
#define STACK_BUCKET_NUMBER_ORDER_MAX 20
/* Initial seed for jhash2. */
#define STACK_HASH_SEED 0x9747b28c

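/*
 * Rough sizing example for the scaling above: with one bucket per 16 KB
 * (2^14 bytes), a machine with 8 GB (2^33 bytes) of memory gets
 * 2^(33 - 14) = 2^19 buckets; the ORDER_MIN/MAX limits then clamp the
 * result to the [2^12, 2^20] range.
 */
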
/* Hash table of stored stack records. */
static struct list_head *stack_table;
/* Fixed order of the number of table buckets. Used when KASAN is enabled. */
static unsigned int stack_bucket_number_order;
/* Hash mask for indexing the table. */
static unsigned int stack_hash_mask;

/* Array of memory regions that store stack records. */
static void *stack_pools[DEPOT_MAX_POOLS];
/* Newly allocated pool that is not yet added to stack_pools. */
static void *new_pool;
/* Number of pools in stack_pools. */
static int pools_num;
/* Offset to the unused space in the currently used pool. */
static size_t pool_offset = DEPOT_POOL_SIZE;
/* Freelist of stack records within stack_pools. */
static LIST_HEAD(free_stacks);
/* The lock must be held when performing pool or freelist modifications. */
static DEFINE_RAW_SPINLOCK(pool_lock);

/* Statistics counters for debugfs. */
enum depot_counter_id {
	DEPOT_COUNTER_REFD_ALLOCS,
	DEPOT_COUNTER_REFD_FREES,
	DEPOT_COUNTER_REFD_INUSE,
	DEPOT_COUNTER_FREELIST_SIZE,
	DEPOT_COUNTER_PERSIST_COUNT,
	DEPOT_COUNTER_PERSIST_BYTES,
	DEPOT_COUNTER_COUNT,
};
static long counters[DEPOT_COUNTER_COUNT];
static const char *const counter_names[] = {
	[DEPOT_COUNTER_REFD_ALLOCS]	= "refcounted_allocations",
	[DEPOT_COUNTER_REFD_FREES]	= "refcounted_frees",
	[DEPOT_COUNTER_REFD_INUSE]	= "refcounted_in_use",
	[DEPOT_COUNTER_FREELIST_SIZE]	= "freelist_size",
	[DEPOT_COUNTER_PERSIST_COUNT]	= "persistent_count",
	[DEPOT_COUNTER_PERSIST_BYTES]	= "persistent_bytes",
};
static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);

static int __init disable_stack_depot(char *str)
{
	return kstrtobool(str, &stack_depot_disabled);
}
early_param("stack_depot_disable", disable_stack_depot);

void __init stack_depot_request_early_init(void)
{
	/* Too late to request early init now. */
	WARN_ON(__stack_depot_early_init_passed);

	__stack_depot_early_init_requested = true;
}

/* Initialize list_head's within the hash table. */
static void init_stack_table(unsigned long entries)
{
	unsigned long i;

	for (i = 0; i < entries; i++)
		INIT_LIST_HEAD(&stack_table[i]);
}

/* Allocates a hash table via memblock. Can only be used during early boot. */
int __init stack_depot_early_init(void)
{
	unsigned long entries = 0;

	/* This function must be called only once, from mm_init(). */
	if (WARN_ON(__stack_depot_early_init_passed))
		return 0;
	__stack_depot_early_init_passed = true;

	/*
	 * Print the disabled message even if early init has not been
	 * requested: stack_depot_init() will not print one.
	 */
	if (stack_depot_disabled) {
		pr_info("disabled\n");
		return 0;
	}

	/*
	 * If KASAN is enabled, use the maximum order: KASAN is frequently used
	 * in fuzzing scenarios, which leads to a large number of different
	 * stack traces being stored in stack depot.
	 */
	if (kasan_enabled() && !stack_bucket_number_order)
		stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;

	/*
	 * Check if early init has been requested after setting
	 * stack_bucket_number_order: stack_depot_init() uses its value.
	 */
	if (!__stack_depot_early_init_requested)
		return 0;

	/*
	 * If stack_bucket_number_order is not set, leave entries as 0 to rely
	 * on the automatic calculations performed by alloc_large_system_hash().
	 */
	if (stack_bucket_number_order)
		entries = 1UL << stack_bucket_number_order;
	pr_info("allocating hash table via alloc_large_system_hash\n");
	stack_table = alloc_large_system_hash("stackdepot",
						sizeof(struct list_head),
						entries,
						STACK_HASH_TABLE_SCALE,
						HASH_EARLY,
						NULL,
						&stack_hash_mask,
						1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
						1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
	if (!stack_table) {
		pr_err("hash table allocation failed, disabling\n");
		stack_depot_disabled = true;
		return -ENOMEM;
	}
	if (!entries) {
		/*
		 * Obtain the number of entries that was calculated by
		 * alloc_large_system_hash().
		 */
		entries = stack_hash_mask + 1;
	}
	init_stack_table(entries);

	return 0;
}

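/*
 * Usage sketch (illustrative, with a hypothetical caller): a subsystem that
 * wants the table allocated early via memblock requests it from its own
 * early setup code, before mm_init() runs stack_depot_early_init():
 *
 *	static int __init my_feature_param(char *str)	// hypothetical
 *	{
 *		stack_depot_request_early_init();
 *		return 0;
 *	}
 *	early_param("my_feature", my_feature_param);
 */
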
/* Allocates a hash table via kvcalloc. Can be used after boot. */
int stack_depot_init(void)
{
	static DEFINE_MUTEX(stack_depot_init_mutex);
	unsigned long entries;
	int ret = 0;

	mutex_lock(&stack_depot_init_mutex);

	if (stack_depot_disabled || stack_table)
		goto out_unlock;

	/*
	 * Similarly to stack_depot_early_init, use stack_bucket_number_order
	 * if assigned, and rely on automatic scaling otherwise.
	 */
	if (stack_bucket_number_order) {
		entries = 1UL << stack_bucket_number_order;
	} else {
		int scale = STACK_HASH_TABLE_SCALE;

		entries = nr_free_buffer_pages();
		entries = roundup_pow_of_two(entries);

		if (scale > PAGE_SHIFT)
			entries >>= (scale - PAGE_SHIFT);
		else
			entries <<= (PAGE_SHIFT - scale);
	}

	if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
	if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
		entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;

	pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
	stack_table = kvcalloc(entries, sizeof(struct list_head), GFP_KERNEL);
	if (!stack_table) {
		pr_err("hash table allocation failed, disabling\n");
		stack_depot_disabled = true;
		ret = -ENOMEM;
		goto out_unlock;
	}
	stack_hash_mask = entries - 1;
	init_stack_table(entries);

out_unlock:
	mutex_unlock(&stack_depot_init_mutex);

	return ret;
}
EXPORT_SYMBOL_GPL(stack_depot_init);

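/*
 * Usage sketch (illustrative): users that cannot rely on early init call
 * stack_depot_init() once before their first save; repeated calls are
 * harmless thanks to the mutex and the stack_table check above:
 *
 *	if (stack_depot_init())		// hypothetical call site
 *		return -ENOMEM;
 */
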
/*
 * Initializes a new stack pool, and updates the list of pools.
 */
static bool depot_init_pool(void **prealloc)
{
	lockdep_assert_held(&pool_lock);

	if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
		/* Bail out if we reached the pool limit. */
		WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */
		WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */
		WARN_ONCE(1, "Stack depot reached limit capacity");
		return false;
	}

	if (!new_pool && *prealloc) {
		/* We have preallocated memory, use it. */
		WRITE_ONCE(new_pool, *prealloc);
		*prealloc = NULL;
	}

	if (!new_pool)
		return false; /* new_pool and *prealloc are NULL */

	/* Save reference to the pool to be used by depot_fetch_stack(). */
	stack_pools[pools_num] = new_pool;

	/*
	 * Stack depot tries to keep an extra pool allocated even before it runs
	 * out of space in the currently used pool.
	 *
	 * To indicate that a new preallocation is needed, new_pool is reset to
	 * NULL; do not reset it to NULL if we have reached the maximum number
	 * of pools.
	 */
	if (pools_num < DEPOT_MAX_POOLS)
		WRITE_ONCE(new_pool, NULL);
	else
		WRITE_ONCE(new_pool, STACK_DEPOT_POISON);

	/* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
	WRITE_ONCE(pools_num, pools_num + 1);
	ASSERT_EXCLUSIVE_WRITER(pools_num);

	pool_offset = 0;

	return true;
}

/* Keeps the preallocated memory to be used for a new stack depot pool. */
static void depot_keep_new_pool(void **prealloc)
{
	lockdep_assert_held(&pool_lock);

	/*
	 * If a new pool is already saved or the maximum number of
	 * pools is reached, do not use the preallocated memory.
	 */
	if (new_pool)
		return;

	WRITE_ONCE(new_pool, *prealloc);
	*prealloc = NULL;
}

/*
 * Tries to initialize a new stack record from the current pool, a cached
 * pool, or the current pre-allocation.
 */
static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size)
{
	struct stack_record *stack;
	void *current_pool;
	u32 pool_index;

	lockdep_assert_held(&pool_lock);

	if (pool_offset + size > DEPOT_POOL_SIZE) {
		if (!depot_init_pool(prealloc))
			return NULL;
	}

	if (WARN_ON_ONCE(pools_num < 1))
		return NULL;
	pool_index = pools_num - 1;
	current_pool = stack_pools[pool_index];
	if (WARN_ON_ONCE(!current_pool))
		return NULL;

	stack = current_pool + pool_offset;

	/* Pre-initialize handle once. */
	stack->handle.pool_index = pool_index;
	stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN;
	stack->handle.extra = 0;
	INIT_LIST_HEAD(&stack->hash_list);

	pool_offset += size;

	return stack;
}

/* Tries to find the next free usable entry from the freelist. */
static struct stack_record *depot_pop_free(void)
{
	struct stack_record *stack;

	lockdep_assert_held(&pool_lock);

	if (list_empty(&free_stacks))
		return NULL;

	/*
	 * We maintain the invariant that the elements in front are least
	 * recently used, and are therefore more likely to be associated with an
	 * RCU grace period in the past. Consequently it is sufficient to only
	 * check the first entry.
	 */
	stack = list_first_entry(&free_stacks, struct stack_record, free_list);
	if (!poll_state_synchronize_rcu(stack->rcu_state))
		return NULL;

	list_del(&stack->free_list);
	counters[DEPOT_COUNTER_FREELIST_SIZE]--;

	return stack;
}

static inline size_t depot_stack_record_size(struct stack_record *s, unsigned int nr_entries)
{
	const size_t used = flex_array_size(s, entries, nr_entries);
	const size_t unused = sizeof(s->entries) - used;

	WARN_ON_ONCE(sizeof(s->entries) < used);

	return ALIGN(sizeof(struct stack_record) - unused, 1 << DEPOT_STACK_ALIGN);
}

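/*
 * Worked example of the size calculation above, assuming a 64-bit kernel
 * and CONFIG_STACKDEPOT_MAX_FRAMES == 64: for nr_entries == 4, used is
 * 4 * 8 = 32 bytes and unused is 512 - 32 = 480 bytes, so the record
 * shrinks from sizeof(struct stack_record) == 544 down to 64 bytes after
 * rounding up to the 16-byte DEPOT_STACK_ALIGN boundary.
 */
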
/* Allocates a new stack in a stack depot pool. */
static struct stack_record *
depot_alloc_stack(unsigned long *entries, unsigned int nr_entries, u32 hash, depot_flags_t flags, void **prealloc)
{
	struct stack_record *stack = NULL;
	size_t record_size;

	lockdep_assert_held(&pool_lock);

	/* This should already be checked by public API entry points. */
	if (WARN_ON_ONCE(!nr_entries))
		return NULL;

	/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
	if (nr_entries > CONFIG_STACKDEPOT_MAX_FRAMES)
		nr_entries = CONFIG_STACKDEPOT_MAX_FRAMES;

	if (flags & STACK_DEPOT_FLAG_GET) {
		/*
		 * Evictable entries have to allocate the max. size so they may
		 * safely be re-used by differently sized allocations.
		 */
		record_size = depot_stack_record_size(stack, CONFIG_STACKDEPOT_MAX_FRAMES);
		stack = depot_pop_free();
	} else {
		record_size = depot_stack_record_size(stack, nr_entries);
	}

	if (!stack) {
		stack = depot_pop_free_pool(prealloc, record_size);
		if (!stack)
			return NULL;
	}

	/* Save the stack trace. */
	stack->hash = hash;
	stack->size = nr_entries;
	/* stack->handle is already filled in by depot_pop_free_pool(). */
	memcpy(stack->entries, entries, flex_array_size(stack, entries, nr_entries));

	if (flags & STACK_DEPOT_FLAG_GET) {
		refcount_set(&stack->count, 1);
		counters[DEPOT_COUNTER_REFD_ALLOCS]++;
		counters[DEPOT_COUNTER_REFD_INUSE]++;
	} else {
		/* Warn on attempts to switch to refcounting this entry. */
		refcount_set(&stack->count, REFCOUNT_SATURATED);
		counters[DEPOT_COUNTER_PERSIST_COUNT]++;
		counters[DEPOT_COUNTER_PERSIST_BYTES] += record_size;
	}

	/*
	 * Let KMSAN know the stored stack record is initialized. This shall
	 * prevent false positive reports if instrumented code accesses it.
	 */
	kmsan_unpoison_memory(stack, record_size);

	return stack;
}

static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
{
	const int pools_num_cached = READ_ONCE(pools_num);
	union handle_parts parts = { .handle = handle };
	void *pool;
	size_t offset = parts.offset << DEPOT_STACK_ALIGN;
	struct stack_record *stack;

	lockdep_assert_not_held(&pool_lock);

	if (parts.pool_index >= pools_num_cached) {
		WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
		     parts.pool_index, pools_num_cached, handle);
		return NULL;
	}

	pool = stack_pools[parts.pool_index];
	if (WARN_ON(!pool))
		return NULL;

	stack = pool + offset;
	if (WARN_ON(!refcount_read(&stack->count)))
		return NULL;

	return stack;
}

/* Links stack into the freelist. */
static void depot_free_stack(struct stack_record *stack)
{
	unsigned long flags;

	lockdep_assert_not_held(&pool_lock);

	raw_spin_lock_irqsave(&pool_lock, flags);
	printk_deferred_enter();

	/*
	 * Remove the entry from the hash list. Concurrent list traversal may
	 * still observe the entry, but since the refcount is zero, this entry
	 * will no longer be considered as valid.
	 */
	list_del_rcu(&stack->hash_list);

	/*
	 * Due to being used from constrained contexts such as the allocators,
	 * NMI, or even RCU itself, stack depot cannot rely on primitives that
	 * would sleep (such as synchronize_rcu()) or recursively call into
	 * stack depot again (such as call_rcu()).
	 *
	 * Instead, get an RCU cookie, so that we can ensure this entry isn't
	 * moved onto another list until the next grace period, and concurrent
	 * RCU list traversal remains safe.
	 */
	stack->rcu_state = get_state_synchronize_rcu();

	/*
	 * Add the entry to the freelist tail, so that older entries are
	 * considered first - their RCU cookie is more likely to no longer be
	 * associated with the current grace period.
	 */
	list_add_tail(&stack->free_list, &free_stacks);

	counters[DEPOT_COUNTER_FREELIST_SIZE]++;
	counters[DEPOT_COUNTER_REFD_FREES]++;
	counters[DEPOT_COUNTER_REFD_INUSE]--;

	printk_deferred_exit();
	raw_spin_unlock_irqrestore(&pool_lock, flags);
}

/* Calculates the hash for a stack. */
static inline u32 hash_stack(unsigned long *entries, unsigned int size)
{
	return jhash2((u32 *)entries,
		      array_size(size, sizeof(*entries)) / sizeof(u32),
		      STACK_HASH_SEED);
}

/*
 * Non-instrumented version of memcmp().
 * Does not check the lexicographical order, only the equality.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
			unsigned int n)
{
	for ( ; n-- ; u1++, u2++) {
		if (*u1 != *u2)
			return 1;
	}
	return 0;
}

/* Finds a stack in a bucket of the hash table. */
static inline struct stack_record *find_stack(struct list_head *bucket,
					      unsigned long *entries, int size,
					      u32 hash, depot_flags_t flags)
{
	struct stack_record *stack, *ret = NULL;

	/*
	 * Stack depot may be used from instrumentation that instruments RCU or
	 * tracing itself; use variant that does not call into RCU and cannot be
	 * traced.
	 *
	 * Note: Such use cases must take care when using refcounting to evict
	 * unused entries, because the stack record free-then-reuse code paths
	 * do call into RCU.
	 */
	rcu_read_lock_sched_notrace();

	list_for_each_entry_rcu(stack, bucket, hash_list) {
		if (stack->hash != hash || stack->size != size)
			continue;

		/*
		 * This may race with depot_free_stack() accessing the freelist
		 * management state unioned with @entries. The refcount is zero
		 * in that case and the below refcount_inc_not_zero() will fail.
		 */
		if (data_race(stackdepot_memcmp(entries, stack->entries, size)))
			continue;

		/*
		 * Try to increment the refcount. If this succeeds, the stack
		 * record is valid and has not yet been freed.
		 *
		 * If STACK_DEPOT_FLAG_GET is not used, it is undefined behavior
		 * to then call stack_depot_put() later, and we can assume that
		 * a stack record is never placed back on the freelist.
		 */
		if ((flags & STACK_DEPOT_FLAG_GET) && !refcount_inc_not_zero(&stack->count))
			continue;

		ret = stack;
		break;
	}

	rcu_read_unlock_sched_notrace();

	return ret;
}

depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
					    unsigned int nr_entries,
					    gfp_t alloc_flags,
					    depot_flags_t depot_flags)
{
	struct list_head *bucket;
	struct stack_record *found = NULL;
	depot_stack_handle_t handle = 0;
	struct page *page = NULL;
	void *prealloc = NULL;
	bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
	unsigned long flags;
	u32 hash;

	if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK))
		return 0;

	/*
	 * If this stack trace is from an interrupt, including anything before
	 * interrupt entry usually leads to unbounded stack depot growth.
	 *
	 * Since use of filter_irq_stacks() is a requirement to ensure stack
	 * depot can efficiently deduplicate interrupt stacks, always call
	 * filter_irq_stacks() here to simplify all callers' use of stack depot.
	 */
	nr_entries = filter_irq_stacks(entries, nr_entries);

	if (unlikely(nr_entries == 0) || stack_depot_disabled)
		return 0;

	hash = hash_stack(entries, nr_entries);
	bucket = &stack_table[hash & stack_hash_mask];

	/* Fast path: look the stack trace up without locking. */
	found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
	if (found)
		goto exit;

	/*
	 * Allocate memory for a new pool if required now:
	 * we won't be able to do that under the lock.
	 */
	if (unlikely(can_alloc && !READ_ONCE(new_pool))) {
		/*
		 * Zero out zone modifiers, as we don't have specific zone
		 * requirements. Keep the flags related to allocation in atomic
		 * contexts and I/O.
		 */
		alloc_flags &= ~GFP_ZONEMASK;
		alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
		alloc_flags |= __GFP_NOWARN;
		page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
		if (page)
			prealloc = page_address(page);
	}

	raw_spin_lock_irqsave(&pool_lock, flags);
	printk_deferred_enter();

	/* Try to find again, to avoid concurrently inserting duplicates. */
	found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
	if (!found) {
		struct stack_record *new =
			depot_alloc_stack(entries, nr_entries, hash, depot_flags, &prealloc);

		if (new) {
			/*
			 * This releases the stack record into the bucket and
			 * makes it visible to readers in find_stack().
			 */
			list_add_rcu(&new->hash_list, bucket);
			found = new;
		}
	}

	if (prealloc) {
		/*
		 * Either stack depot already contains this stack trace, or
		 * depot_alloc_stack() did not consume the preallocated memory.
		 * Try to keep the preallocated memory for the future.
		 */
		depot_keep_new_pool(&prealloc);
	}

	printk_deferred_exit();
	raw_spin_unlock_irqrestore(&pool_lock, flags);
exit:
	if (prealloc) {
		/* Stack depot didn't use this memory, free it. */
		free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
	}
	if (found)
		handle = found->handle.handle;
	return handle;
}
EXPORT_SYMBOL_GPL(stack_depot_save_flags);

depot_stack_handle_t stack_depot_save(unsigned long *entries,
				      unsigned int nr_entries,
				      gfp_t alloc_flags)
{
	return stack_depot_save_flags(entries, nr_entries, alloc_flags,
				      STACK_DEPOT_FLAG_CAN_ALLOC);
}
EXPORT_SYMBOL_GPL(stack_depot_save);

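/*
 * Typical usage (a minimal sketch; error handling and context elided):
 *
 *	unsigned long entries[16];
 *	unsigned int nr;
 *	depot_stack_handle_t handle;
 *
 *	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
 *	handle = stack_depot_save(entries, nr, GFP_KERNEL);
 *	...
 *	stack_depot_print(handle);
 */
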
unsigned int stack_depot_fetch(depot_stack_handle_t handle,
			       unsigned long **entries)
{
	struct stack_record *stack;

	*entries = NULL;
	/*
	 * Let KMSAN know *entries is initialized. This shall prevent false
	 * positive reports if instrumented code accesses it.
	 */
	kmsan_unpoison_memory(entries, sizeof(*entries));

	if (!handle || stack_depot_disabled)
		return 0;

	stack = depot_fetch_stack(handle);
	/*
	 * Should never be NULL, otherwise this is a use-after-put (or just a
	 * corrupt handle).
	 */
	if (WARN(!stack, "corrupt handle or use after stack_depot_put()"))
		return 0;

	*entries = stack->entries;
	return stack->size;
}
EXPORT_SYMBOL_GPL(stack_depot_fetch);

void stack_depot_put(depot_stack_handle_t handle)
{
	struct stack_record *stack;

	if (!handle || stack_depot_disabled)
		return;

	stack = depot_fetch_stack(handle);
	/*
	 * Should always be able to find the stack record, otherwise this is an
	 * unbalanced put attempt (or corrupt handle).
	 */
	if (WARN(!stack, "corrupt handle or unbalanced stack_depot_put()"))
		return;

	if (refcount_dec_and_test(&stack->count))
		depot_free_stack(stack);
}
EXPORT_SYMBOL_GPL(stack_depot_put);

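/*
 * Refcounted usage sketch (illustrative): callers that want entries to be
 * evictable save with STACK_DEPOT_FLAG_GET and balance every successful
 * save with a stack_depot_put():
 *
 *	handle = stack_depot_save_flags(entries, nr, GFP_KERNEL,
 *					STACK_DEPOT_FLAG_CAN_ALLOC |
 *					STACK_DEPOT_FLAG_GET);
 *	...
 *	stack_depot_put(handle);	// may free the record for reuse
 */
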
void stack_depot_print(depot_stack_handle_t stack)
{
	unsigned long *entries;
	unsigned int nr_entries;

	nr_entries = stack_depot_fetch(stack, &entries);
	if (nr_entries > 0)
		stack_trace_print(entries, nr_entries, 0);
}
EXPORT_SYMBOL_GPL(stack_depot_print);

int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
			int spaces)
{
	unsigned long *entries;
	unsigned int nr_entries;

	nr_entries = stack_depot_fetch(handle, &entries);
	return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
						spaces) : 0;
}
EXPORT_SYMBOL_GPL(stack_depot_snprint);

depot_stack_handle_t __must_check stack_depot_set_extra_bits(
			depot_stack_handle_t handle, unsigned int extra_bits)
{
	union handle_parts parts = { .handle = handle };

	/* Don't set extra bits on empty handles. */
	if (!handle)
		return 0;

	parts.extra = extra_bits;
	return parts.handle;
}
EXPORT_SYMBOL(stack_depot_set_extra_bits);

unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
{
	union handle_parts parts = { .handle = handle };

	return parts.extra;
}
EXPORT_SYMBOL(stack_depot_get_extra_bits);

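/*
 * Round-trip sketch (illustrative): users such as KMSAN stash a few tag bits
 * in the otherwise unused bits of a handle and recover them later:
 *
 *	handle = stack_depot_set_extra_bits(handle, 0x3);
 *	...
 *	tag = stack_depot_get_extra_bits(handle);	// 0x3 again
 */
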
static int stats_show(struct seq_file *seq, void *v)
{
	/*
	 * data race ok: These are just statistics counters, and approximate
	 * statistics are ok for debugging.
	 */
	seq_printf(seq, "pools: %d\n", data_race(pools_num));
	for (int i = 0; i < DEPOT_COUNTER_COUNT; i++)
		seq_printf(seq, "%s: %ld\n", counter_names[i], data_race(counters[i]));

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(stats);

static int depot_debugfs_init(void)
{
	struct dentry *dir;

	if (stack_depot_disabled)
		return 0;

	dir = debugfs_create_dir("stackdepot", NULL);
	debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
	return 0;
}
late_initcall(depot_debugfs_init);
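
/*
 * With debugfs mounted at /sys/kernel/debug, the counters above can be
 * inspected from userspace, e.g.:
 *
 *	# cat /sys/kernel/debug/stackdepot/stats
 *	pools: 2
 *	refcounted_allocations: 0
 *	...
 *
 * (Sample values are illustrative only.)
 */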