// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stack depot - a stack trace storage that avoids duplication.
 *
 * Internally, stack depot maintains a hash table of unique stacktraces. The
 * stack traces themselves are stored contiguously one after another in a set
 * of separate page allocations.
 *
 * Author: Alexander Potapenko <glider@google.com>
 * Copyright (C) 2016 Google, Inc.
 *
 * Based on the code by Dmitry Chernenkov.
 */

#define pr_fmt(fmt) "stackdepot: " fmt

#include <linux/debugfs.h>
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/kmsan.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/printk.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/memblock.h>
#include <linux/kasan-enabled.h>

#define DEPOT_HANDLE_BITS (sizeof(depot_stack_handle_t) * 8)

#define DEPOT_POOL_ORDER 2 /* Pool size order, 4 pages */
#define DEPOT_POOL_SIZE (1LL << (PAGE_SHIFT + DEPOT_POOL_ORDER))
#define DEPOT_STACK_ALIGN 4
#define DEPOT_OFFSET_BITS (DEPOT_POOL_ORDER + PAGE_SHIFT - DEPOT_STACK_ALIGN)
#define DEPOT_POOL_INDEX_BITS (DEPOT_HANDLE_BITS - DEPOT_OFFSET_BITS - \
                               STACK_DEPOT_EXTRA_BITS)
#if IS_ENABLED(CONFIG_KMSAN) && CONFIG_STACKDEPOT_MAX_FRAMES >= 32
/*
 * KMSAN is frequently used in fuzzing scenarios and thus saves a lot of stack
 * traces. As KMSAN does not support evicting stack traces from the stack
 * depot, the stack depot capacity might be reached quickly with large stack
 * records. Adjust the maximum number of stack depot pools for this case.
 */
#define DEPOT_POOLS_CAP (8192 * (CONFIG_STACKDEPOT_MAX_FRAMES / 16))
#else
#define DEPOT_POOLS_CAP 8192
#endif
#define DEPOT_MAX_POOLS \
        (((1LL << (DEPOT_POOL_INDEX_BITS)) < DEPOT_POOLS_CAP) ? \
         (1LL << (DEPOT_POOL_INDEX_BITS)) : DEPOT_POOLS_CAP)
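
/*
 * Worked example of the handle layout (illustrative; assumes 4 KB pages,
 * a 32-bit depot_stack_handle_t, and STACK_DEPOT_EXTRA_BITS == 5):
 *
 *      DEPOT_OFFSET_BITS     = 2 + 12 - 4  = 10
 *      DEPOT_POOL_INDEX_BITS = 32 - 10 - 5 = 17
 *
 * 2^17 addressable pools exceed DEPOT_POOLS_CAP, so DEPOT_MAX_POOLS
 * evaluates to 8192 here, bounding the depot at 8192 * 16 KB = 128 MB of
 * stack record storage.
 */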

/* Compact structure that stores a reference to a stack. */
union handle_parts {
        depot_stack_handle_t handle;
        struct {
                u32 pool_index  : DEPOT_POOL_INDEX_BITS;
                u32 offset      : DEPOT_OFFSET_BITS;
                u32 extra       : STACK_DEPOT_EXTRA_BITS;
        };
};

struct stack_record {
        struct list_head hash_list;     /* Links in the hash table */
        u32 hash;                       /* Hash in hash table */
        u32 size;                       /* Number of stored frames */
        union handle_parts handle;      /* Constant after initialization */
        refcount_t count;
        union {
                unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES];    /* Frames */
                struct {
                        /*
                         * An important invariant of the implementation is to
                         * only place a stack record onto the freelist iff its
                         * refcount is zero. Because stack records with a zero
                         * refcount are never considered as valid, it is safe to
                         * union @entries and freelist management state below.
                         * Conversely, as soon as an entry is off the freelist
                         * and its refcount becomes non-zero, the below must not
                         * be accessed until being placed back on the freelist.
                         */
                        struct list_head free_list;     /* Links in the freelist */
                        unsigned long rcu_state;        /* RCU cookie */
                };
        };
};

#define DEPOT_STACK_RECORD_SIZE \
        ALIGN(sizeof(struct stack_record), 1 << DEPOT_STACK_ALIGN)
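
/*
 * Capacity sketch (illustrative; assumes a 64-bit kernel with the default
 * CONFIG_STACKDEPOT_MAX_FRAMES of 64): entries[] alone occupies
 * 64 * 8 = 512 bytes and the bookkeeping fields roughly another 32, so one
 * record rounds to 544 bytes and a single 16 KB pool holds about 30 records.
 */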

static bool stack_depot_disabled;
static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
static bool __stack_depot_early_init_passed __initdata;

/* Use one hash table bucket per 16 KB of memory. */
#define STACK_HASH_TABLE_SCALE 14
/* Limit the number of buckets between 4K and 1M. */
#define STACK_BUCKET_NUMBER_ORDER_MIN 12
#define STACK_BUCKET_NUMBER_ORDER_MAX 20
/* Initial seed for jhash2. */
#define STACK_HASH_SEED 0x9747b28c

/* Hash table of stored stack records. */
static struct list_head *stack_table;
/* Fixed order of the number of table buckets. Used when KASAN is enabled. */
static unsigned int stack_bucket_number_order;
/* Hash mask for indexing the table. */
static unsigned int stack_hash_mask;

/* Array of memory regions that store stack records. */
static void *stack_pools[DEPOT_MAX_POOLS];
/* Newly allocated pool that is not yet added to stack_pools. */
static void *new_pool;
/* Number of pools in stack_pools. */
static int pools_num;
/* Freelist of stack records within stack_pools. */
static LIST_HEAD(free_stacks);
/*
 * Stack depot tries to keep an extra pool allocated even before it runs out
 * of space in the currently used pool. This flag marks whether this extra
 * pool needs to be allocated. It is false when either a new pool has already
 * been kept or the limit on the number of pools is reached.
 */
static bool new_pool_required = true;
/* The lock must be held when performing pool or freelist modifications. */
static DEFINE_RAW_SPINLOCK(pool_lock);

/* Statistics counters for debugfs. */
enum depot_counter_id {
        DEPOT_COUNTER_ALLOCS,
        DEPOT_COUNTER_FREES,
        DEPOT_COUNTER_INUSE,
        DEPOT_COUNTER_FREELIST_SIZE,
        DEPOT_COUNTER_COUNT,
};
static long counters[DEPOT_COUNTER_COUNT];
static const char *const counter_names[] = {
        [DEPOT_COUNTER_ALLOCS]          = "allocations",
        [DEPOT_COUNTER_FREES]           = "frees",
        [DEPOT_COUNTER_INUSE]           = "in_use",
        [DEPOT_COUNTER_FREELIST_SIZE]   = "freelist_size",
};
static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);

static int __init disable_stack_depot(char *str)
{
        return kstrtobool(str, &stack_depot_disabled);
}
early_param("stack_depot_disable", disable_stack_depot);

void __init stack_depot_request_early_init(void)
{
        /* Too late to request early init now. */
        WARN_ON(__stack_depot_early_init_passed);

        __stack_depot_early_init_requested = true;
}

/* Initialize list_head's within the hash table. */
static void init_stack_table(unsigned long entries)
{
        unsigned long i;

        for (i = 0; i < entries; i++)
                INIT_LIST_HEAD(&stack_table[i]);
}

/* Allocates a hash table via memblock. Can only be used during early boot. */
int __init stack_depot_early_init(void)
{
        unsigned long entries = 0;

        /* This function must be called only once, from mm_init(). */
        if (WARN_ON(__stack_depot_early_init_passed))
                return 0;
        __stack_depot_early_init_passed = true;

        /*
         * Print disabled message even if early init has not been requested:
         * stack_depot_init() will not print one.
         */
        if (stack_depot_disabled) {
                pr_info("disabled\n");
                return 0;
        }

        /*
         * If KASAN is enabled, use the maximum order: KASAN is frequently used
         * in fuzzing scenarios, which leads to a large number of different
         * stack traces being stored in stack depot.
         */
        if (kasan_enabled() && !stack_bucket_number_order)
                stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;

        /*
         * Check if early init has been requested after setting
         * stack_bucket_number_order: stack_depot_init() uses its value.
         */
        if (!__stack_depot_early_init_requested)
                return 0;

        /*
         * If stack_bucket_number_order is not set, leave entries as 0 to rely
         * on the automatic calculations performed by alloc_large_system_hash().
         */
        if (stack_bucket_number_order)
                entries = 1UL << stack_bucket_number_order;
        pr_info("allocating hash table via alloc_large_system_hash\n");
        stack_table = alloc_large_system_hash("stackdepot",
                                                sizeof(struct list_head),
                                                entries,
                                                STACK_HASH_TABLE_SCALE,
                                                HASH_EARLY,
                                                NULL,
                                                &stack_hash_mask,
                                                1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
                                                1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
        if (!stack_table) {
                pr_err("hash table allocation failed, disabling\n");
                stack_depot_disabled = true;
                return -ENOMEM;
        }
        if (!entries) {
                /*
                 * Obtain the number of entries that was calculated by
                 * alloc_large_system_hash().
                 */
                entries = stack_hash_mask + 1;
        }
        init_stack_table(entries);

        return 0;
}

/* Allocates a hash table via kvcalloc. Can be used after boot. */
int stack_depot_init(void)
{
        static DEFINE_MUTEX(stack_depot_init_mutex);
        unsigned long entries;
        int ret = 0;

        mutex_lock(&stack_depot_init_mutex);

        if (stack_depot_disabled || stack_table)
                goto out_unlock;

        /*
         * Similarly to stack_depot_early_init, use stack_bucket_number_order
         * if assigned, and rely on automatic scaling otherwise.
         */
        if (stack_bucket_number_order) {
                entries = 1UL << stack_bucket_number_order;
        } else {
                int scale = STACK_HASH_TABLE_SCALE;

                entries = nr_free_buffer_pages();
                entries = roundup_pow_of_two(entries);

                if (scale > PAGE_SHIFT)
                        entries >>= (scale - PAGE_SHIFT);
                else
                        entries <<= (PAGE_SHIFT - scale);
        }

        if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
                entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
        if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
                entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;

        pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
        stack_table = kvcalloc(entries, sizeof(struct list_head), GFP_KERNEL);
        if (!stack_table) {
                pr_err("hash table allocation failed, disabling\n");
                stack_depot_disabled = true;
                ret = -ENOMEM;
                goto out_unlock;
        }
        stack_hash_mask = entries - 1;
        init_stack_table(entries);

out_unlock:
        mutex_unlock(&stack_depot_init_mutex);

        return ret;
}
EXPORT_SYMBOL_GPL(stack_depot_init);
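
/*
 * Sizing sketch (illustrative): STACK_HASH_TABLE_SCALE == 14 gives one
 * bucket per 16 KB of memory. On a machine with about 8 GB of buffer pages,
 * the computation above yields 2^33 / 2^14 = 2^19 buckets, which lies within
 * the [2^STACK_BUCKET_NUMBER_ORDER_MIN, 2^STACK_BUCKET_NUMBER_ORDER_MAX]
 * clamp.
 */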

/*
 * Initializes new stack depot @pool, releases all its entries to the
 * freelist, and updates the list of pools.
 */
static void depot_init_pool(void *pool)
{
        int offset;

        lockdep_assert_held(&pool_lock);

        /* Initialize handles and link stack records into the freelist. */
        for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
             offset += DEPOT_STACK_RECORD_SIZE) {
                struct stack_record *stack = pool + offset;

                stack->handle.pool_index = pools_num;
                stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
                stack->handle.extra = 0;

                /*
                 * Stack traces of size 0 are never saved, and we can simply use
                 * the size field as an indicator if this is a new unused stack
                 * record in the freelist.
                 */
                stack->size = 0;

                INIT_LIST_HEAD(&stack->hash_list);
                /*
                 * Add to the freelist front to prioritize never-used entries:
                 * required in case there are entries in the freelist, but their
                 * RCU cookie still belongs to the current RCU grace period
                 * (there can still be concurrent readers).
                 */
                list_add(&stack->free_list, &free_stacks);
                counters[DEPOT_COUNTER_FREELIST_SIZE]++;
        }

        /* Save reference to the pool to be used by depot_fetch_stack(). */
        stack_pools[pools_num] = pool;

        /* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
        WRITE_ONCE(pools_num, pools_num + 1);
        ASSERT_EXCLUSIVE_WRITER(pools_num);
}

/* Keeps the preallocated memory to be used for a new stack depot pool. */
static void depot_keep_new_pool(void **prealloc)
{
        lockdep_assert_held(&pool_lock);

        /*
         * If a new pool is already saved or the maximum number of
         * pools is reached, do not use the preallocated memory.
         */
        if (!new_pool_required)
                return;

        /*
         * Use the preallocated memory for the new pool
         * as long as we do not exceed the maximum number of pools.
         */
        if (pools_num < DEPOT_MAX_POOLS) {
                new_pool = *prealloc;
                *prealloc = NULL;
        }

        /*
         * At this point, either a new pool is kept or the maximum
         * number of pools is reached. In either case, take note that
         * keeping another pool is not required.
         */
        WRITE_ONCE(new_pool_required, false);
}

/*
 * Try to initialize a new stack depot pool from either a previous or the
 * current pre-allocation, and release all its entries to the freelist.
 */
static bool depot_try_init_pool(void **prealloc)
{
        lockdep_assert_held(&pool_lock);

        /* Check if we have a new pool saved and use it. */
        if (new_pool) {
                depot_init_pool(new_pool);
                new_pool = NULL;

                /* Take note that we might need a new new_pool. */
                if (pools_num < DEPOT_MAX_POOLS)
                        WRITE_ONCE(new_pool_required, true);

                return true;
        }

        /* Bail out if we reached the pool limit. */
        if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
                WARN_ONCE(1, "Stack depot reached limit capacity");
                return false;
        }

        /* Check if we have preallocated memory and use it. */
        if (*prealloc) {
                depot_init_pool(*prealloc);
                *prealloc = NULL;
                return true;
        }

        return false;
}

/* Try to find next free usable entry. */
static struct stack_record *depot_pop_free(void)
{
        struct stack_record *stack;

        lockdep_assert_held(&pool_lock);

        if (list_empty(&free_stacks))
                return NULL;

        /*
         * We maintain the invariant that the elements in front are least
         * recently used, and are therefore more likely to be associated with an
         * RCU grace period in the past. Consequently it is sufficient to only
         * check the first entry.
         */
        stack = list_first_entry(&free_stacks, struct stack_record, free_list);
        if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state))
                return NULL;

        list_del(&stack->free_list);
        counters[DEPOT_COUNTER_FREELIST_SIZE]--;

        return stack;
}

/* Allocates a new stack in a stack depot pool. */
static struct stack_record *
depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
{
        struct stack_record *stack;

        lockdep_assert_held(&pool_lock);

        /* This should already be checked by public API entry points. */
        if (WARN_ON_ONCE(!size))
                return NULL;

        /* Check if we have a stack record to save the stack trace. */
        stack = depot_pop_free();
        if (!stack) {
                /* No usable entries on the freelist - try to refill the freelist. */
                if (!depot_try_init_pool(prealloc))
                        return NULL;
                stack = depot_pop_free();
                if (WARN_ON(!stack))
                        return NULL;
        }

        /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
        if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
                size = CONFIG_STACKDEPOT_MAX_FRAMES;

        /* Save the stack trace. */
        stack->hash = hash;
        stack->size = size;
        /* stack->handle is already filled in by depot_init_pool(). */
        refcount_set(&stack->count, 1);
        memcpy(stack->entries, entries, flex_array_size(stack, entries, size));

        /*
         * Let KMSAN know the stored stack record is initialized. This shall
         * prevent false positive reports if instrumented code accesses it.
         */
        kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);

        counters[DEPOT_COUNTER_ALLOCS]++;
        counters[DEPOT_COUNTER_INUSE]++;
        return stack;
}

static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
{
        const int pools_num_cached = READ_ONCE(pools_num);
        union handle_parts parts = { .handle = handle };
        void *pool;
        size_t offset = parts.offset << DEPOT_STACK_ALIGN;
        struct stack_record *stack;

        lockdep_assert_not_held(&pool_lock);

        if (parts.pool_index >= pools_num_cached) {
                WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
                     parts.pool_index, pools_num_cached, handle);
                return NULL;
        }

        pool = stack_pools[parts.pool_index];
        if (WARN_ON(!pool))
                return NULL;

        stack = pool + offset;
        if (WARN_ON(!refcount_read(&stack->count)))
                return NULL;

        return stack;
}

/* Links stack into the freelist. */
static void depot_free_stack(struct stack_record *stack)
{
        unsigned long flags;

        lockdep_assert_not_held(&pool_lock);

        raw_spin_lock_irqsave(&pool_lock, flags);
        printk_deferred_enter();

        /*
         * Remove the entry from the hash list. Concurrent list traversal may
         * still observe the entry, but since the refcount is zero, this entry
         * will no longer be considered as valid.
         */
        list_del_rcu(&stack->hash_list);

        /*
         * Due to being used from constrained contexts such as the allocators,
         * NMI, or even RCU itself, stack depot cannot rely on primitives that
         * would sleep (such as synchronize_rcu()) or recursively call into
         * stack depot again (such as call_rcu()).
         *
         * Instead, get an RCU cookie, so that we can ensure this entry isn't
         * moved onto another list until the next grace period, and concurrent
         * RCU list traversal remains safe.
         */
        stack->rcu_state = get_state_synchronize_rcu();

        /*
         * Add the entry to the freelist tail, so that older entries are
         * considered first - their RCU cookie is more likely to no longer be
         * associated with the current grace period.
         */
        list_add_tail(&stack->free_list, &free_stacks);

        counters[DEPOT_COUNTER_FREELIST_SIZE]++;
        counters[DEPOT_COUNTER_FREES]++;
        counters[DEPOT_COUNTER_INUSE]--;

        printk_deferred_exit();
        raw_spin_unlock_irqrestore(&pool_lock, flags);
}
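
/*
 * The polled-RCU pattern above, in isolation (illustrative sketch, not a
 * verbatim excerpt):
 *
 *      // At free time: take a grace-period cookie; never sleeps.
 *      stack->rcu_state = get_state_synchronize_rcu();
 *      list_add_tail(&stack->free_list, &free_stacks);
 *
 *      // At reuse time: proceed only once that grace period has elapsed.
 *      if (!poll_state_synchronize_rcu(stack->rcu_state))
 *              return NULL;    // Possible concurrent readers; retry later.
 *
 * Unlike synchronize_rcu(), neither primitive sleeps, which keeps the scheme
 * usable from allocator, NMI, and RCU-internal contexts.
 */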

/* Calculates the hash for a stack. */
static inline u32 hash_stack(unsigned long *entries, unsigned int size)
{
        return jhash2((u32 *)entries,
                      array_size(size, sizeof(*entries)) / sizeof(u32),
                      STACK_HASH_SEED);
}

/*
 * Non-instrumented version of memcmp().
 * Does not check lexicographical order, only equality.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
                        unsigned int n)
{
        for ( ; n-- ; u1++, u2++) {
                if (*u1 != *u2)
                        return 1;
        }
        return 0;
}

/* Finds a stack in a bucket of the hash table. */
static inline struct stack_record *find_stack(struct list_head *bucket,
                                              unsigned long *entries, int size,
                                              u32 hash, depot_flags_t flags)
{
        struct stack_record *stack, *ret = NULL;

        /*
         * Stack depot may be used from instrumentation that instruments RCU or
         * tracing itself; use variant that does not call into RCU and cannot be
         * traced.
         *
         * Note: Such use cases must take care when using refcounting to evict
         * unused entries, because the stack record free-then-reuse code paths
         * do call into RCU.
         */
        rcu_read_lock_sched_notrace();

        list_for_each_entry_rcu(stack, bucket, hash_list) {
                if (stack->hash != hash || stack->size != size)
                        continue;

                /*
                 * This may race with depot_free_stack() accessing the freelist
                 * management state unioned with @entries. The refcount is zero
                 * in that case and the below refcount_inc_not_zero() will fail.
                 */
                if (data_race(stackdepot_memcmp(entries, stack->entries, size)))
                        continue;

                /*
                 * Try to increment refcount. If this succeeds, the stack record
                 * is valid and has not yet been freed.
                 *
                 * If STACK_DEPOT_FLAG_GET is not used, it is undefined behavior
                 * to then call stack_depot_put() later, and we can assume that
                 * a stack record is never placed back on the freelist.
                 */
                if ((flags & STACK_DEPOT_FLAG_GET) && !refcount_inc_not_zero(&stack->count))
                        continue;

                ret = stack;
                break;
        }

        rcu_read_unlock_sched_notrace();

        return ret;
}

depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
                                            unsigned int nr_entries,
                                            gfp_t alloc_flags,
                                            depot_flags_t depot_flags)
{
        struct list_head *bucket;
        struct stack_record *found = NULL;
        depot_stack_handle_t handle = 0;
        struct page *page = NULL;
        void *prealloc = NULL;
        bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
        unsigned long flags;
        u32 hash;

        if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK))
                return 0;

        /*
         * If this stack trace is from an interrupt, including anything before
         * interrupt entry usually leads to unbounded stack depot growth.
         *
         * Since use of filter_irq_stacks() is a requirement to ensure stack
         * depot can efficiently deduplicate interrupt stacks, always
         * filter_irq_stacks() to simplify all callers' use of stack depot.
         */
        nr_entries = filter_irq_stacks(entries, nr_entries);

        if (unlikely(nr_entries == 0) || stack_depot_disabled)
                return 0;

        hash = hash_stack(entries, nr_entries);
        bucket = &stack_table[hash & stack_hash_mask];

        /* Fast path: look the stack trace up without locking. */
        found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
        if (found)
                goto exit;

        /*
         * Allocate memory for a new pool if required now:
         * we won't be able to do that under the lock.
         */
        if (unlikely(can_alloc && READ_ONCE(new_pool_required))) {
                /*
                 * Zero out zone modifiers, as we don't have specific zone
                 * requirements. Keep the flags related to allocation in atomic
                 * contexts and I/O.
                 */
                alloc_flags &= ~GFP_ZONEMASK;
                alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
                alloc_flags |= __GFP_NOWARN;
                page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
                if (page)
                        prealloc = page_address(page);
        }

        raw_spin_lock_irqsave(&pool_lock, flags);
        printk_deferred_enter();

        /* Try to find again, to avoid concurrently inserting duplicates. */
        found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
        if (!found) {
                struct stack_record *new =
                        depot_alloc_stack(entries, nr_entries, hash, &prealloc);

                if (new) {
                        /*
                         * This releases the stack record into the bucket and
                         * makes it visible to readers in find_stack().
                         */
                        list_add_rcu(&new->hash_list, bucket);
                        found = new;
                }
        }

        if (prealloc) {
                /*
                 * Either stack depot already contains this stack trace, or
                 * depot_alloc_stack() did not consume the preallocated memory.
                 * Try to keep the preallocated memory for future.
                 */
                depot_keep_new_pool(&prealloc);
        }

        printk_deferred_exit();
        raw_spin_unlock_irqrestore(&pool_lock, flags);
exit:
        if (prealloc) {
                /* Stack depot didn't use this memory, free it. */
                free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
        }
        if (found)
                handle = found->handle.handle;
        return handle;
}
EXPORT_SYMBOL_GPL(stack_depot_save_flags);
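
/*
 * Usage sketch (illustrative): a typical caller captures a trace with
 * stack_trace_save() and deduplicates it here. Pairing STACK_DEPOT_FLAG_GET
 * with a later stack_depot_put() makes the record evictable.
 *
 *      unsigned long entries[64];
 *      unsigned int nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
 *      depot_stack_handle_t handle =
 *              stack_depot_save_flags(entries, nr_entries, GFP_KERNEL,
 *                                     STACK_DEPOT_FLAG_CAN_ALLOC |
 *                                     STACK_DEPOT_FLAG_GET);
 *      ...
 *      stack_depot_put(handle);        // Drops the reference FLAG_GET took.
 */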

depot_stack_handle_t stack_depot_save(unsigned long *entries,
                                      unsigned int nr_entries,
                                      gfp_t alloc_flags)
{
        return stack_depot_save_flags(entries, nr_entries, alloc_flags,
                                      STACK_DEPOT_FLAG_CAN_ALLOC);
}
EXPORT_SYMBOL_GPL(stack_depot_save);

unsigned int stack_depot_fetch(depot_stack_handle_t handle,
                               unsigned long **entries)
{
        struct stack_record *stack;

        *entries = NULL;
        /*
         * Let KMSAN know *entries is initialized. This shall prevent false
         * positive reports if instrumented code accesses it.
         */
        kmsan_unpoison_memory(entries, sizeof(*entries));

        if (!handle || stack_depot_disabled)
                return 0;

        stack = depot_fetch_stack(handle);
        /*
         * Should never be NULL, otherwise this is a use-after-put (or just a
         * corrupt handle).
         */
        if (WARN(!stack, "corrupt handle or use after stack_depot_put()"))
                return 0;

        *entries = stack->entries;
        return stack->size;
}
EXPORT_SYMBOL_GPL(stack_depot_fetch);

void stack_depot_put(depot_stack_handle_t handle)
{
        struct stack_record *stack;

        if (!handle || stack_depot_disabled)
                return;

        stack = depot_fetch_stack(handle);
        /*
         * Should always be able to find the stack record, otherwise this is an
         * unbalanced put attempt (or corrupt handle).
         */
        if (WARN(!stack, "corrupt handle or unbalanced stack_depot_put()"))
                return;

        if (refcount_dec_and_test(&stack->count))
                depot_free_stack(stack);
}
EXPORT_SYMBOL_GPL(stack_depot_put);

void stack_depot_print(depot_stack_handle_t stack)
{
        unsigned long *entries;
        unsigned int nr_entries;

        nr_entries = stack_depot_fetch(stack, &entries);
        if (nr_entries > 0)
                stack_trace_print(entries, nr_entries, 0);
}
EXPORT_SYMBOL_GPL(stack_depot_print);

int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
                        int spaces)
{
        unsigned long *entries;
        unsigned int nr_entries;

        nr_entries = stack_depot_fetch(handle, &entries);
        return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
                                                spaces) : 0;
}
EXPORT_SYMBOL_GPL(stack_depot_snprint);

depot_stack_handle_t __must_check stack_depot_set_extra_bits(
                        depot_stack_handle_t handle, unsigned int extra_bits)
{
        union handle_parts parts = { .handle = handle };

        /* Don't set extra bits on empty handles. */
        if (!handle)
                return 0;

        parts.extra = extra_bits;
        return parts.handle;
}
EXPORT_SYMBOL(stack_depot_set_extra_bits);

unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
{
        union handle_parts parts = { .handle = handle };

        return parts.extra;
}
EXPORT_SYMBOL(stack_depot_get_extra_bits);
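
/*
 * Round-trip sketch (illustrative): users such as KMSAN stash a small tag in
 * the otherwise unused extra bits of a handle; the tag travels with the
 * handle without affecting pool_index or offset.
 *
 *      handle = stack_depot_set_extra_bits(handle, 3);
 *      ...
 *      WARN_ON(stack_depot_get_extra_bits(handle) != 3);
 */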

static int stats_show(struct seq_file *seq, void *v)
{
        /*
         * data race ok: These are just statistics counters, and approximate
         * statistics are ok for debugging.
         */
        seq_printf(seq, "pools: %d\n", data_race(pools_num));
        for (int i = 0; i < DEPOT_COUNTER_COUNT; i++)
                seq_printf(seq, "%s: %ld\n", counter_names[i], data_race(counters[i]));

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(stats);

static int depot_debugfs_init(void)
{
        struct dentry *dir;

        if (stack_depot_disabled)
                return 0;

        dir = debugfs_create_dir("stackdepot", NULL);
        debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
        return 0;
}
late_initcall(depot_debugfs_init);