Merge tag 'arm-smmu-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/will...
[linux-block.git] / lib / stackdepot.c
CommitLineData
1802d0be 1// SPDX-License-Identifier: GPL-2.0-only
cd11016e 2/*
b232b999 3 * Stack depot - a stack trace storage that avoids duplication.
cd11016e 4 *
b232b999
AK
5 * Internally, stack depot maintains a hash table of unique stacktraces. The
6 * stack traces themselves are stored contiguously one after another in a set
7 * of separate page allocations.
8 *
cd11016e
AP
9 * Author: Alexander Potapenko <glider@google.com>
10 * Copyright (C) 2016 Google, Inc.
11 *
b232b999 12 * Based on the code by Dmitry Chernenkov.
cd11016e
AP
13 */
14
4a6b5314
AK
15#define pr_fmt(fmt) "stackdepot: " fmt
16
c2a29254 17#include <linux/debugfs.h>
cd11016e
AP
18#include <linux/gfp.h>
19#include <linux/jhash.h>
20#include <linux/kernel.h>
8e00b2df 21#include <linux/kmsan.h>
4805180b 22#include <linux/list.h>
cd11016e 23#include <linux/mm.h>
2dba5eb1 24#include <linux/mutex.h>
cd11016e 25#include <linux/printk.h>
4434a56e
ME
26#include <linux/rculist.h>
27#include <linux/rcupdate.h>
410b764f 28#include <linux/refcount.h>
cd11016e 29#include <linux/slab.h>
a6cd9570 30#include <linux/spinlock.h>
cd11016e
AP
31#include <linux/stacktrace.h>
32#include <linux/stackdepot.h>
33#include <linux/string.h>
34#include <linux/types.h>
e1fdc403 35#include <linux/memblock.h>
f9987921 36#include <linux/kasan-enabled.h>
cd11016e 37
424cafee
AK
38#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)
39
424cafee
AK
40#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
41#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
42#define DEPOT_STACK_ALIGN 4
43#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
5f9ce55e
AK
44#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
45 STACK_DEPOT_EXTRA_BITS)
bd9d9624
AK
46#if IS_ENABLED(CONFIG_KMSAN) && CONFIG_STACKDEPOT_MAX_FRAMES >= 32
47/*
48 * KMSAN is frequently used in fuzzing scenarios and thus saves a lot of stack
49 * traces. As KMSAN does not support evicting stack traces from the stack
50 * depot, the stack depot capacity might be reached quickly with large stack
51 * records. Adjust the maximum number of stack depot pools for this case.
52 */
53#define DEPOT_POOLS_CAP (8192 * (CONFIG_STACKDEPOT_MAX_FRAMES / 16))
54#else
424cafee 55#define DEPOT_POOLS_CAP 8192
bd9d9624 56#endif
424cafee
AK
57#define DEPOT_MAX_POOLS \
58 (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
59 (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)
cd11016e 60
b232b999 61/* Compact structure that stores a reference to a stack. */
cd11016e
AP
62union handle_parts {
63 depot_stack_handle_t handle;
64 struct {
424cafee
AK
65 u32 pool_index : DEPOT_POOL_INDEX_BITS;
66 u32 offset : DEPOT_OFFSET_BITS;
424cafee 67 u32 extra : STACK_DEPOT_EXTRA_BITS;
cd11016e
AP
68 };
69};
70
71struct stack_record {
4434a56e 72 struct list_head hash_list; /* Links in the hash table */
b29d3188 73 u32 hash; /* Hash in hash table */
b232b999 74 u32 size; /* Number of stored frames */
4434a56e 75 union handle_parts handle; /* Constant after initialization */
410b764f 76 refcount_t count;
4434a56e
ME
77 union {
78 unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES]; /* Frames */
79 struct {
80 /*
81 * An important invariant of the implementation is to
82 * only place a stack record onto the freelist iff its
83 * refcount is zero. Because stack records with a zero
84 * refcount are never considered as valid, it is safe to
85 * union @entries and freelist management state below.
86 * Conversely, as soon as an entry is off the freelist
87 * and its refcount becomes non-zero, the below must not
88 * be accessed until being placed back on the freelist.
89 */
90 struct list_head free_list; /* Links in the freelist */
91 unsigned long rcu_state; /* RCU cookie */
92 };
93 };
cd11016e
AP
94};
95
fc60e0ca
AK
96#define DEPOT_STACK_RECORD_SIZE \
97 ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN)
98
735df3c3 99static bool stack_depot_disabled;
1c0310ad 100static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
a5f1783b
VB
101static bool __stack_depot_early_init_passed __initdata;
102
0d249ac0 103/* Use one hash table bucket per 16 KB of memory. */
4c2e9a67 104#define STACK_HASH_TABLE_SCALE 14
0d249ac0 105/* Limit the number of buckets between 4K and 1M. */
4c2e9a67
AK
106#define STACK_BUCKET_NUMBER_ORDER_MIN 12
107#define STACK_BUCKET_NUMBER_ORDER_MAX 20
0d249ac0 108/* Initial seed for jhash2. */
cd11016e
AP
109#define STACK_HASH_SEED 0x9747b28c
110
4805180b
AK
111/* Hash table of stored stack records. */
112static struct list_head *stack_table;
0d249ac0 113/* Fixed order of the number of table buckets. Used when KASAN is enabled. */
4c2e9a67 114static unsigned int stack_bucket_number_order;
0d249ac0 115/* Hash mask for indexing the table. */
f9987921
VB
116static unsigned int stack_hash_mask;
117
4805180b 118/* Array of memory regions that store stack records. */
424cafee 119static void *stack_pools[DEPOT_MAX_POOLS];
a5d21f71
AK
120/* Newly allocated pool that is not yet added to stack_pools. */
121static void *new_pool;
b29d3188
AK
122/* Number of pools in stack_pools. */
123static int pools_num;
4805180b
AK
124/* Freelist of stack records within stack_pools. */
125static LIST_HEAD(free_stacks);
d11a5621
AK
126/*
127 * Stack depot tries to keep an extra pool allocated even before it runs out
b6a353d3
AK
128 * of space in the currently used pool. This flag marks whether this extra pool
129 * needs to be allocated. It is false when either an extra pool is already
130 * allocated or the limit on the number of pools is reached.
d11a5621 131 */
a6cd9570 132static bool new_pool_required = true;
4434a56e
ME
133/* The lock must be held when performing pool or freelist modifications. */
134static DEFINE_RAW_SPINLOCK(pool_lock);
e1fdc403 135
c2a29254
ME
136/* Statistics counters for debugfs. */
137enum depot_counter_id {
138 DEPOT_COUNTER_ALLOCS,
139 DEPOT_COUNTER_FREES,
140 DEPOT_COUNTER_INUSE,
141 DEPOT_COUNTER_FREELIST_SIZE,
142 DEPOT_COUNTER_COUNT,
143};
144static long counters[DEPOT_COUNTER_COUNT];
145static const char *const counter_names[] = {
146 [DEPOT_COUNTER_ALLOCS] = "allocations",
147 [DEPOT_COUNTER_FREES] = "frees",
148 [DEPOT_COUNTER_INUSE] = "in_use",
149 [DEPOT_COUNTER_FREELIST_SIZE] = "freelist_size",
150};
151static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);
152
735df3c3 153static int __init disable_stack_depot(char *str)
e1fdc403 154{
4d07a037 155 return kstrtobool(str, &stack_depot_disabled);
e1fdc403 156}
735df3c3 157early_param("stack_depot_disable", disable_stack_depot);
e1fdc403 158
1c0310ad 159void __init stack_depot_request_early_init(void)
a5f1783b 160{
1c0310ad 161 /* Too late to request early init now. */
a5f1783b
VB
162 WARN_ON(__stack_depot_early_init_passed);
163
1c0310ad 164 __stack_depot_early_init_requested = true;
a5f1783b
VB
165}
166
4805180b
AK
167/* Initialize list_head's within the hash table. */
168static void init_stack_table(unsigned long entries)
169{
170 unsigned long i;
171
172 for (i = 0; i < entries; i++)
173 INIT_LIST_HEAD(&stack_table[i]);
174}
175
df225c87 176/* Allocates a hash table via memblock. Can only be used during early boot. */
a5f1783b
VB
177int __init stack_depot_early_init(void)
178{
f9987921 179 unsigned long entries = 0;
a5f1783b 180
df225c87 181 /* This function must be called only once, from mm_init(). */
a5f1783b
VB
182 if (WARN_ON(__stack_depot_early_init_passed))
183 return 0;
a5f1783b
VB
184 __stack_depot_early_init_passed = true;
185
4d07a037
AK
186 /*
187 * Print disabled message even if early init has not been requested:
188 * stack_depot_init() will not print one.
189 */
190 if (stack_depot_disabled) {
191 pr_info("disabled\n");
192 return 0;
193 }
194
df225c87
AK
195 /*
196 * If KASAN is enabled, use the maximum order: KASAN is frequently used
197 * in fuzzing scenarios, which leads to a large number of different
198 * stack traces being stored in stack depot.
199 */
4c2e9a67
AK
200 if (kasan_enabled() && !stack_bucket_number_order)
201 stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;
f9987921 202
4d07a037
AK
203 /*
204 * Check if early init has been requested after setting
205 * stack_bucket_number_order: stack_depot_init() uses its value.
206 */
207 if (!__stack_depot_early_init_requested)
a5f1783b
VB
208 return 0;
209
df225c87 210 /*
4c2e9a67 211 * If stack_bucket_number_order is not set, leave entries as 0 to rely
4805180b 212 * on the automatic calculations performed by alloc_large_system_hash().
df225c87 213 */
4c2e9a67
AK
214 if (stack_bucket_number_order)
215 entries = 1UL << stack_bucket_number_order;
df225c87 216 pr_info("allocating hash table via alloc_large_system_hash\n");
f9987921 217 stack_table = alloc_large_system_hash("stackdepot",
4805180b 218 sizeof(struct list_head),
f9987921 219 entries,
4c2e9a67 220 STACK_HASH_TABLE_SCALE,
4805180b 221 HASH_EARLY,
f9987921
VB
222 NULL,
223 &stack_hash_mask,
4c2e9a67
AK
224 1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
225 1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
a5f1783b 226 if (!stack_table) {
4a6b5314 227 pr_err("hash table allocation failed, disabling\n");
735df3c3 228 stack_depot_disabled = true;
a5f1783b
VB
229 return -ENOMEM;
230 }
4805180b
AK
231 if (!entries) {
232 /*
233 * Obtain the number of entries that was calculated by
234 * alloc_large_system_hash().
235 */
236 entries = stack_hash_mask + 1;
237 }
238 init_stack_table(entries);
a5f1783b
VB
239
240 return 0;
241}
242
df225c87 243/* Allocates a hash table via kvcalloc. Can be used after boot. */
a5f1783b 244int stack_depot_init(void)
e1fdc403 245{
2dba5eb1 246 static DEFINE_MUTEX(stack_depot_init_mutex);
c60324fb 247 unsigned long entries;
a5f1783b 248 int ret = 0;
2dba5eb1
VB
249
250 mutex_lock(&stack_depot_init_mutex);
f9987921 251
c60324fb
AK
252 if (stack_depot_disabled || stack_table)
253 goto out_unlock;
f9987921 254
c60324fb 255 /*
4c2e9a67 256 * Similarly to stack_depot_early_init, use stack_bucket_number_order
c60324fb
AK
257 * if assigned, and rely on automatic scaling otherwise.
258 */
4c2e9a67
AK
259 if (stack_bucket_number_order) {
260 entries = 1UL << stack_bucket_number_order;
c60324fb 261 } else {
4c2e9a67 262 int scale = STACK_HASH_TABLE_SCALE;
c60324fb
AK
263
264 entries = nr_free_buffer_pages();
265 entries = roundup_pow_of_two(entries);
266
267 if (scale > PAGE_SHIFT)
268 entries >>= (scale - PAGE_SHIFT);
269 else
270 entries <<= (PAGE_SHIFT - scale);
e1fdc403 271 }
c60324fb 272
4c2e9a67
AK
273 if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
274 entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
275 if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
276 entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;
c60324fb
AK
277
278 pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
4805180b 279 stack_table = kvcalloc(entries, sizeof(struct list_head), GFP_KERNEL);
c60324fb
AK
280 if (!stack_table) {
281 pr_err("hash table allocation failed, disabling\n");
282 stack_depot_disabled = true;
283 ret = -ENOMEM;
284 goto out_unlock;
285 }
286 stack_hash_mask = entries - 1;
4805180b 287 init_stack_table(entries);
c60324fb
AK
288
289out_unlock:
2dba5eb1 290 mutex_unlock(&stack_depot_init_mutex);
c60324fb 291
a5f1783b 292 return ret;
e1fdc403 293}
2dba5eb1 294EXPORT_SYMBOL_GPL(stack_depot_init);
cd11016e 295
4434a56e
ME
296/*
297 * Initializes new stack depot @pool, release all its entries to the freelist,
298 * and update the list of pools.
299 */
b29d3188
AK
300static void depot_init_pool(void *pool)
301{
4805180b 302 int offset;
b29d3188 303
4434a56e 304 lockdep_assert_held(&pool_lock);
4805180b
AK
305
306 /* Initialize handles and link stack records into the freelist. */
307 for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
308 offset += DEPOT_STACK_RECORD_SIZE) {
b29d3188
AK
309 struct stack_record *stack = pool + offset;
310
311 stack->handle.pool_index = pools_num;
312 stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
313 stack->handle.extra = 0;
314
4434a56e
ME
315 /*
316 * Stack traces of size 0 are never saved, and we can simply use
317 * the size field as an indicator if this is a new unused stack
318 * record in the freelist.
319 */
320 stack->size = 0;
321
322 INIT_LIST_HEAD(&stack->hash_list);
323 /*
324 * Add to the freelist front to prioritize never-used entries:
325 * required in case there are entries in the freelist, but their
326 * RCU cookie still belongs to the current RCU grace period
327 * (there can still be concurrent readers).
328 */
329 list_add(&stack->free_list, &free_stacks);
c2a29254 330 counters[DEPOT_COUNTER_FREELIST_SIZE]++;
b29d3188
AK
331 }
332
b29d3188
AK
333 /* Save reference to the pool to be used by depot_fetch_stack(). */
334 stack_pools[pools_num] = pool;
4434a56e
ME
335
336 /* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
337 WRITE_ONCE(pools_num, pools_num + 1);
338 ASSERT_EXCLUSIVE_WRITER(pools_num);
b29d3188
AK
339}
340
b6a353d3
AK
341/* Keeps the preallocated memory to be used for a new stack depot pool. */
342static void depot_keep_new_pool(void **prealloc)
15ef6a98 343{
4434a56e 344 lockdep_assert_held(&pool_lock);
a6cd9570 345
15ef6a98 346 /*
b6a353d3 347 * If a new pool is already saved or the maximum number of
d11a5621 348 * pools is reached, do not use the preallocated memory.
15ef6a98 349 */
b6a353d3 350 if (!new_pool_required)
514d5c55 351 return;
cd0fc64e 352
94b7d328 353 /*
b6a353d3 354 * Use the preallocated memory for the new pool
94b7d328
AK
355 * as long as we do not exceed the maximum number of pools.
356 */
b29d3188 357 if (pools_num < DEPOT_MAX_POOLS) {
a5d21f71 358 new_pool = *prealloc;
15ef6a98 359 *prealloc = NULL;
15ef6a98 360 }
94b7d328
AK
361
362 /*
b6a353d3 363 * At this point, either a new pool is kept or the maximum
94b7d328
AK
364 * number of pools is reached. In either case, take note that
365 * keeping another pool is not required.
94b7d328 366 */
4434a56e 367 WRITE_ONCE(new_pool_required, false);
15ef6a98
AK
368}
369
4434a56e
ME
370/*
371 * Try to initialize a new stack depot pool from either a previous or the
372 * current pre-allocation, and release all its entries to the freelist.
373 */
374static bool depot_try_init_pool(void **prealloc)
15ef6a98 375{
4434a56e 376 lockdep_assert_held(&pool_lock);
b29d3188
AK
377
378 /* Check if we have a new pool saved and use it. */
379 if (new_pool) {
380 depot_init_pool(new_pool);
a5d21f71 381 new_pool = NULL;
b29d3188
AK
382
383 /* Take note that we might need a new new_pool. */
384 if (pools_num < DEPOT_MAX_POOLS)
4434a56e 385 WRITE_ONCE(new_pool_required, true);
b29d3188 386
4434a56e 387 return true;
b29d3188
AK
388 }
389
390 /* Bail out if we reached the pool limit. */
391 if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
392 WARN_ONCE(1, "Stack depot reached limit capacity");
393 return false;
15ef6a98 394 }
cd0fc64e 395
b29d3188
AK
396 /* Check if we have preallocated memory and use it. */
397 if (*prealloc) {
398 depot_init_pool(*prealloc);
94b7d328
AK
399 *prealloc = NULL;
400 return true;
401 }
402
b29d3188 403 return false;
4434a56e
ME
404}
405
406/* Try to find next free usable entry. */
407static struct stack_record *depot_pop_free(void)
408{
409 struct stack_record *stack;
b29d3188 410
4434a56e
ME
411 lockdep_assert_held(&pool_lock);
412
413 if (list_empty(&free_stacks))
414 return NULL;
415
416 /*
417 * We maintain the invariant that the elements in front are least
418 * recently used, and are therefore more likely to be associated with an
419 * RCU grace period in the past. Consequently it is sufficient to only
420 * check the first entry.
421 */
422 stack = list_first_entry(&free_stacks, struct stack_record, free_list);
423 if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state))
424 return NULL;
425
426 list_del(&stack->free_list);
427 counters[DEPOT_COUNTER_FREELIST_SIZE]--;
428
429 return stack;
94b7d328
AK
430}
431
432/* Allocates a new stack in a stack depot pool. */
433static struct stack_record *
434depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
435{
436 struct stack_record *stack;
94b7d328 437
4434a56e 438 lockdep_assert_held(&pool_lock);
a6cd9570 439
4434a56e
ME
440 /* This should already be checked by public API entry points. */
441 if (WARN_ON_ONCE(!size))
94b7d328 442 return NULL;
cd0fc64e 443
b29d3188 444 /* Check if we have a stack record to save the stack trace. */
4434a56e
ME
445 stack = depot_pop_free();
446 if (!stack) {
447 /* No usable entries on the freelist - try to refill the freelist. */
448 if (!depot_try_init_pool(prealloc))
449 return NULL;
450 stack = depot_pop_free();
451 if (WARN_ON(!stack))
452 return NULL;
453 }
b29d3188 454
fc60e0ca
AK
455 /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
456 if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
457 size = CONFIG_STACKDEPOT_MAX_FRAMES;
458
cd0fc64e 459 /* Save the stack trace. */
15ef6a98
AK
460 stack->hash = hash;
461 stack->size = size;
b29d3188 462 /* stack->handle is already filled in by depot_init_pool(). */
410b764f 463 refcount_set(&stack->count, 1);
15ef6a98 464 memcpy(stack->entries, entries, flex_array_size(stack, entries, size));
83130ab2 465
8e00b2df
AP
466 /*
467 * Let KMSAN know the stored stack record is initialized. This shall
468 * prevent false positive reports if instrumented code accesses it.
469 */
b29d3188 470 kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);
15ef6a98 471
c2a29254
ME
472 counters[DEPOT_COUNTER_ALLOCS]++;
473 counters[DEPOT_COUNTER_INUSE]++;
15ef6a98
AK
474 return stack;
475}
476
83130ab2
AK
477static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
478{
4434a56e 479 const int pools_num_cached = READ_ONCE(pools_num);
83130ab2 480 union handle_parts parts = { .handle = handle };
83130ab2
AK
481 void *pool;
482 size_t offset = parts.offset << DEPOT_STACK_ALIGN;
483 struct stack_record *stack;
484
4434a56e 485 lockdep_assert_not_held(&pool_lock);
a6cd9570 486
4434a56e 487 if (parts.pool_index > pools_num_cached) {
83130ab2 488 WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
4434a56e 489 parts.pool_index, pools_num_cached, handle);
83130ab2
AK
490 return NULL;
491 }
492
493 pool = stack_pools[parts.pool_index];
4434a56e 494 if (WARN_ON(!pool))
83130ab2
AK
495 return NULL;
496
497 stack = pool + offset;
4434a56e
ME
498 if (WARN_ON(!refcount_read(&stack->count)))
499 return NULL;
500
83130ab2
AK
501 return stack;
502}
503
108be8de
AK
504/* Links stack into the freelist. */
505static void depot_free_stack(struct stack_record *stack)
506{
4434a56e
ME
507 unsigned long flags;
508
509 lockdep_assert_not_held(&pool_lock);
510
511 raw_spin_lock_irqsave(&pool_lock, flags);
512 printk_deferred_enter();
108be8de 513
4434a56e
ME
514 /*
515 * Remove the entry from the hash list. Concurrent list traversal may
516 * still observe the entry, but since the refcount is zero, this entry
517 * will no longer be considered as valid.
518 */
519 list_del_rcu(&stack->hash_list);
520
521 /*
522 * Due to being used from constrained contexts such as the allocators,
523 * NMI, or even RCU itself, stack depot cannot rely on primitives that
524 * would sleep (such as synchronize_rcu()) or recursively call into
525 * stack depot again (such as call_rcu()).
526 *
527 * Instead, get an RCU cookie, so that we can ensure this entry isn't
528 * moved onto another list until the next grace period, and concurrent
529 * RCU list traversal remains safe.
530 */
531 stack->rcu_state = get_state_synchronize_rcu();
532
533 /*
534 * Add the entry to the freelist tail, so that older entries are
535 * considered first - their RCU cookie is more likely to no longer be
536 * associated with the current grace period.
537 */
538 list_add_tail(&stack->free_list, &free_stacks);
c2a29254
ME
539
540 counters[DEPOT_COUNTER_FREELIST_SIZE]++;
541 counters[DEPOT_COUNTER_FREES]++;
542 counters[DEPOT_COUNTER_INUSE]--;
4434a56e
ME
543
544 printk_deferred_exit();
545 raw_spin_unlock_irqrestore(&pool_lock, flags);
108be8de
AK
546}
547
b232b999 548/* Calculates the hash for a stack. */
cd11016e
AP
549static inline u32 hash_stack(unsigned long *entries, unsigned int size)
550{
551 return jhash2((u32 *)entries,
180644f8
GS
552 array_size(size, sizeof(*entries)) / sizeof(u32),
553 STACK_HASH_SEED);
cd11016e
AP
554}
555
b232b999
AK
/*
 * Non-instrumented replacement for memcmp() that only tests for equality:
 * returns 0 when the first @n words of @u1 and @u2 are identical and 1
 * otherwise. No lexicographical ordering is reported.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
		      unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (u1[i] != u2[i])
			return 1;
	}
	return 0;
}
570
b232b999 571/* Finds a stack in a bucket of the hash table. */
4805180b 572static inline struct stack_record *find_stack(struct list_head *bucket,
4434a56e
ME
573 unsigned long *entries, int size,
574 u32 hash, depot_flags_t flags)
cd11016e 575{
4434a56e
ME
576 struct stack_record *stack, *ret = NULL;
577
578 /*
579 * Stack depot may be used from instrumentation that instruments RCU or
580 * tracing itself; use variant that does not call into RCU and cannot be
581 * traced.
582 *
583 * Note: Such use cases must take care when using refcounting to evict
584 * unused entries, because the stack record free-then-reuse code paths
585 * do call into RCU.
586 */
587 rcu_read_lock_sched_notrace();
588
589 list_for_each_entry_rcu(stack, bucket, hash_list) {
590 if (stack->hash != hash || stack->size != size)
591 continue;
cd11016e 592
4434a56e
ME
593 /*
594 * This may race with depot_free_stack() accessing the freelist
595 * management state unioned with @entries. The refcount is zero
596 * in that case and the below refcount_inc_not_zero() will fail.
597 */
598 if (data_race(stackdepot_memcmp(entries, stack->entries, size)))
599 continue;
a6cd9570 600
4434a56e
ME
601 /*
602 * Try to increment refcount. If this succeeds, the stack record
603 * is valid and has not yet been freed.
604 *
605 * If STACK_DEPOT_FLAG_GET is not used, it is undefined behavior
606 * to then call stack_depot_put() later, and we can assume that
607 * a stack record is never placed back on the freelist.
608 */
609 if ((flags & STACK_DEPOT_FLAG_GET) && !refcount_inc_not_zero(&stack->count))
610 continue;
611
612 ret = stack;
613 break;
cd11016e 614 }
4434a56e
ME
615
616 rcu_read_unlock_sched_notrace();
617
618 return ret;
cd11016e
AP
619}
620
022012dc
AK
621depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
622 unsigned int nr_entries,
623 gfp_t alloc_flags,
624 depot_flags_t depot_flags)
cd11016e 625{
4805180b
AK
626 struct list_head *bucket;
627 struct stack_record *found = NULL;
603c000c 628 depot_stack_handle_t handle = 0;
cd11016e
AP
629 struct page *page = NULL;
630 void *prealloc = NULL;
022012dc 631 bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
c0cfc337
TG
632 unsigned long flags;
633 u32 hash;
cd11016e 634
022012dc
AK
635 if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK))
636 return 0;
637
e9400660
ME
638 /*
639 * If this stack trace is from an interrupt, including anything before
b232b999 640 * interrupt entry usually leads to unbounded stack depot growth.
e9400660 641 *
b232b999
AK
642 * Since use of filter_irq_stacks() is a requirement to ensure stack
643 * depot can efficiently deduplicate interrupt stacks, always
644 * filter_irq_stacks() to simplify all callers' use of stack depot.
e9400660
ME
645 */
646 nr_entries = filter_irq_stacks(entries, nr_entries);
647
735df3c3 648 if (unlikely(nr_entries == 0) || stack_depot_disabled)
603c000c 649 return 0;
cd11016e 650
c0cfc337 651 hash = hash_stack(entries, nr_entries);
f9987921 652 bucket = &stack_table[hash & stack_hash_mask];
cd11016e 653
4434a56e
ME
654 /* Fast path: look the stack trace up without locking. */
655 found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
656 if (found)
cd11016e
AP
657 goto exit;
658
659 /*
a6cd9570
AK
660 * Allocate memory for a new pool if required now:
661 * we won't be able to do that under the lock.
cd11016e 662 */
4434a56e 663 if (unlikely(can_alloc && READ_ONCE(new_pool_required))) {
cd11016e
AP
664 /*
665 * Zero out zone modifiers, as we don't have specific zone
666 * requirements. Keep the flags related to allocation in atomic
667 * contexts and I/O.
668 */
669 alloc_flags &= ~GFP_ZONEMASK;
670 alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
87cc271d 671 alloc_flags |= __GFP_NOWARN;
424cafee 672 page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
cd11016e
AP
673 if (page)
674 prealloc = page_address(page);
675 }
676
4434a56e 677 raw_spin_lock_irqsave(&pool_lock, flags);
a914d8d6 678 printk_deferred_enter();
cd11016e 679
4434a56e
ME
680 /* Try to find again, to avoid concurrently inserting duplicates. */
681 found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
cd11016e 682 if (!found) {
b232b999
AK
683 struct stack_record *new =
684 depot_alloc_stack(entries, nr_entries, hash, &prealloc);
7f2b8818 685
cd11016e 686 if (new) {
4434a56e
ME
687 /*
688 * This releases the stack record into the bucket and
689 * makes it visible to readers in find_stack().
690 */
691 list_add_rcu(&new->hash_list, bucket);
cd11016e
AP
692 found = new;
693 }
4434a56e
ME
694 }
695
696 if (prealloc) {
cd11016e 697 /*
4434a56e
ME
698 * Either stack depot already contains this stack trace, or
699 * depot_alloc_stack() did not consume the preallocated memory.
700 * Try to keep the preallocated memory for future.
cd11016e 701 */
4434a56e 702 depot_keep_new_pool(&prealloc);
cd11016e
AP
703 }
704
a914d8d6 705 printk_deferred_exit();
4434a56e 706 raw_spin_unlock_irqrestore(&pool_lock, flags);
cd11016e
AP
707exit:
708 if (prealloc) {
b232b999 709 /* Stack depot didn't use this memory, free it. */
424cafee 710 free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
cd11016e
AP
711 }
712 if (found)
603c000c
AK
713 handle = found->handle.handle;
714 return handle;
cd11016e 715}
022012dc 716EXPORT_SYMBOL_GPL(stack_depot_save_flags);
11ac25c6 717
11ac25c6
ME
718depot_stack_handle_t stack_depot_save(unsigned long *entries,
719 unsigned int nr_entries,
720 gfp_t alloc_flags)
721{
022012dc
AK
722 return stack_depot_save_flags(entries, nr_entries, alloc_flags,
723 STACK_DEPOT_FLAG_CAN_ALLOC);
11ac25c6 724}
c0cfc337 725EXPORT_SYMBOL_GPL(stack_depot_save);
15ef6a98 726
15ef6a98
AK
727unsigned int stack_depot_fetch(depot_stack_handle_t handle,
728 unsigned long **entries)
729{
15ef6a98
AK
730 struct stack_record *stack;
731
732 *entries = NULL;
8e00b2df
AP
733 /*
734 * Let KMSAN know *entries is initialized. This shall prevent false
735 * positive reports if instrumented code accesses it.
736 */
737 kmsan_unpoison_memory(entries, sizeof(*entries));
738
0c5d44a8 739 if (!handle || stack_depot_disabled)
15ef6a98
AK
740 return 0;
741
83130ab2 742 stack = depot_fetch_stack(handle);
4434a56e
ME
743 /*
744 * Should never be NULL, otherwise this is a use-after-put (or just a
745 * corrupt handle).
746 */
747 if (WARN(!stack, "corrupt handle or use after stack_depot_put()"))
748 return 0;
a6cd9570 749
15ef6a98
AK
750 *entries = stack->entries;
751 return stack->size;
752}
753EXPORT_SYMBOL_GPL(stack_depot_fetch);
754
108be8de
AK
755void stack_depot_put(depot_stack_handle_t handle)
756{
757 struct stack_record *stack;
108be8de
AK
758
759 if (!handle || stack_depot_disabled)
760 return;
761
108be8de 762 stack = depot_fetch_stack(handle);
4434a56e
ME
763 /*
764 * Should always be able to find the stack record, otherwise this is an
765 * unbalanced put attempt (or corrupt handle).
766 */
767 if (WARN(!stack, "corrupt handle or unbalanced stack_depot_put()"))
768 return;
108be8de 769
4434a56e 770 if (refcount_dec_and_test(&stack->count))
108be8de 771 depot_free_stack(stack);
108be8de
AK
772}
773EXPORT_SYMBOL_GPL(stack_depot_put);
774
15ef6a98
AK
775void stack_depot_print(depot_stack_handle_t stack)
776{
777 unsigned long *entries;
778 unsigned int nr_entries;
779
780 nr_entries = stack_depot_fetch(stack, &entries);
781 if (nr_entries > 0)
782 stack_trace_print(entries, nr_entries, 0);
783}
784EXPORT_SYMBOL_GPL(stack_depot_print);
785
15ef6a98
AK
786int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
787 int spaces)
788{
789 unsigned long *entries;
790 unsigned int nr_entries;
791
792 nr_entries = stack_depot_fetch(handle, &entries);
793 return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
794 spaces) : 0;
795}
796EXPORT_SYMBOL_GPL(stack_depot_snprint);
797
36aa1e67
AK
798depot_stack_handle_t __must_check stack_depot_set_extra_bits(
799 depot_stack_handle_t handle, unsigned int extra_bits)
800{
801 union handle_parts parts = { .handle = handle };
802
803 /* Don't set extra bits on empty handles. */
804 if (!handle)
805 return 0;
806
807 parts.extra = extra_bits;
808 return parts.handle;
809}
810EXPORT_SYMBOL(stack_depot_set_extra_bits);
811
15ef6a98
AK
812unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
813{
814 union handle_parts parts = { .handle = handle };
815
816 return parts.extra;
817}
818EXPORT_SYMBOL(stack_depot_get_extra_bits);
c2a29254
ME
819
820static int stats_show(struct seq_file *seq, void *v)
821{
822 /*
823 * data race ok: These are just statistics counters, and approximate
824 * statistics are ok for debugging.
825 */
826 seq_printf(seq, "pools: %d\n", data_race(pools_num));
827 for (int i = 0; i < DEPOT_COUNTER_COUNT; i++)
828 seq_printf(seq, "%s: %ld\n", counter_names[i], data_race(counters[i]));
829
830 return 0;
831}
832DEFINE_SHOW_ATTRIBUTE(stats);
833
834static int depot_debugfs_init(void)
835{
836 struct dentry *dir;
837
838 if (stack_depot_disabled)
839 return 0;
840
841 dir = debugfs_create_dir("stackdepot", NULL);
842 debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
843 return 0;
844}
845late_initcall(depot_debugfs_init);