// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stack depot - a stack trace storage that avoids duplication.
 *
 * Internally, stack depot maintains a hash table of unique stacktraces. The
 * stack traces themselves are stored contiguously one after another in a set
 * of separate page allocations.
 *
 * Author: Alexander Potapenko <glider@google.com>
 * Copyright (C) 2016 Google, Inc.
 *
 * Based on the code by Dmitry Chernenkov.
 */

#define pr_fmt(fmt) "stackdepot: " fmt

#include <linux/debugfs.h>
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/kmsan.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/poison.h>
#include <linux/printk.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/memblock.h>
#include <linux/kasan-enabled.h>

#define DEPOT_POOLS_CAP 8192
/* The pool_index is offset by 1 so the first record does not have a 0 handle. */
#define DEPOT_MAX_POOLS \
        (((1LL << (DEPOT_POOL_INDEX_BITS)) - 1 < DEPOT_POOLS_CAP) ? \
         (1LL << (DEPOT_POOL_INDEX_BITS)) - 1 : DEPOT_POOLS_CAP)

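/*
 * For illustration: if DEPOT_POOL_INDEX_BITS were 16, (1LL << 16) - 1 = 65535
 * would exceed DEPOT_POOLS_CAP, so DEPOT_MAX_POOLS would clamp to 8192; the
 * actual bit width is defined in <linux/stackdepot.h>.
 */
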
static bool stack_depot_disabled;
static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
static bool __stack_depot_early_init_passed __initdata;

/* Use one hash table bucket per 16 KB of memory. */
#define STACK_HASH_TABLE_SCALE 14
/* Limit the number of buckets between 4K and 1M. */
#define STACK_BUCKET_NUMBER_ORDER_MIN 12
#define STACK_BUCKET_NUMBER_ORDER_MAX 20
/* Initial seed for jhash2. */
#define STACK_HASH_SEED 0x9747b28c

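/*
 * For illustration: with 4 GB of low memory, 2^32 >> STACK_HASH_TABLE_SCALE
 * gives 2^18 = 256K buckets, which already falls inside the [2^12, 2^20]
 * range enforced by the two orders above.
 */
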
/* Hash table of stored stack records. */
static struct list_head *stack_table;
/* Fixed order of the number of table buckets. Used when KASAN is enabled. */
static unsigned int stack_bucket_number_order;
/* Hash mask for indexing the table. */
static unsigned int stack_hash_mask;

/* Array of memory regions that store stack records. */
static void *stack_pools[DEPOT_MAX_POOLS];
/* Newly allocated pool that is not yet added to stack_pools. */
static void *new_pool;
/* Number of pools in stack_pools. */
static int pools_num;
/* Offset to the unused space in the currently used pool. */
static size_t pool_offset = DEPOT_POOL_SIZE;
/* Freelist of stack records within stack_pools. */
static LIST_HEAD(free_stacks);
/* The lock must be held when performing pool or freelist modifications. */
static DEFINE_RAW_SPINLOCK(pool_lock);

/* Statistics counters for debugfs. */
enum depot_counter_id {
        DEPOT_COUNTER_REFD_ALLOCS,
        DEPOT_COUNTER_REFD_FREES,
        DEPOT_COUNTER_REFD_INUSE,
        DEPOT_COUNTER_FREELIST_SIZE,
        DEPOT_COUNTER_PERSIST_COUNT,
        DEPOT_COUNTER_PERSIST_BYTES,
        DEPOT_COUNTER_COUNT,
};
static long counters[DEPOT_COUNTER_COUNT];
static const char *const counter_names[] = {
        [DEPOT_COUNTER_REFD_ALLOCS]   = "refcounted_allocations",
        [DEPOT_COUNTER_REFD_FREES]    = "refcounted_frees",
        [DEPOT_COUNTER_REFD_INUSE]    = "refcounted_in_use",
        [DEPOT_COUNTER_FREELIST_SIZE] = "freelist_size",
        [DEPOT_COUNTER_PERSIST_COUNT] = "persistent_count",
        [DEPOT_COUNTER_PERSIST_BYTES] = "persistent_bytes",
};
static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);

static int __init disable_stack_depot(char *str)
{
        return kstrtobool(str, &stack_depot_disabled);
}
early_param("stack_depot_disable", disable_stack_depot);

void __init stack_depot_request_early_init(void)
{
        /* Too late to request early init now. */
        WARN_ON(__stack_depot_early_init_passed);

        __stack_depot_early_init_requested = true;
}

/* Initialize list_head's within the hash table. */
static void init_stack_table(unsigned long entries)
{
        unsigned long i;

        for (i = 0; i < entries; i++)
                INIT_LIST_HEAD(&stack_table[i]);
}

/* Allocates a hash table via memblock. Can only be used during early boot. */
int __init stack_depot_early_init(void)
{
        unsigned long entries = 0;

        /* This function must be called only once, from mm_init(). */
        if (WARN_ON(__stack_depot_early_init_passed))
                return 0;
        __stack_depot_early_init_passed = true;

        /*
         * Print disabled message even if early init has not been requested:
         * stack_depot_init() will not print one.
         */
        if (stack_depot_disabled) {
                pr_info("disabled\n");
                return 0;
        }

        /*
         * If KASAN is enabled, use the maximum order: KASAN is frequently used
         * in fuzzing scenarios, which leads to a large number of different
         * stack traces being stored in stack depot.
         */
        if (kasan_enabled() && !stack_bucket_number_order)
                stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;

        /*
         * Check if early init has been requested after setting
         * stack_bucket_number_order: stack_depot_init() uses its value.
         */
        if (!__stack_depot_early_init_requested)
                return 0;

        /*
         * If stack_bucket_number_order is not set, leave entries as 0 to rely
         * on the automatic calculations performed by alloc_large_system_hash().
         */
        if (stack_bucket_number_order)
                entries = 1UL << stack_bucket_number_order;
        pr_info("allocating hash table via alloc_large_system_hash\n");
        stack_table = alloc_large_system_hash("stackdepot",
                                                sizeof(struct list_head),
                                                entries,
                                                STACK_HASH_TABLE_SCALE,
                                                HASH_EARLY,
                                                NULL,
                                                &stack_hash_mask,
                                                1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
                                                1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
        if (!stack_table) {
                pr_err("hash table allocation failed, disabling\n");
                stack_depot_disabled = true;
                return -ENOMEM;
        }
        if (!entries) {
                /*
                 * Obtain the number of entries that was calculated by
                 * alloc_large_system_hash().
                 */
                entries = stack_hash_mask + 1;
        }
        init_stack_table(entries);

        return 0;
}

/* Allocates a hash table via kvcalloc. Can be used after boot. */
int stack_depot_init(void)
{
        static DEFINE_MUTEX(stack_depot_init_mutex);
        unsigned long entries;
        int ret = 0;

        mutex_lock(&stack_depot_init_mutex);

        if (stack_depot_disabled || stack_table)
                goto out_unlock;

        /*
         * Similarly to stack_depot_early_init, use stack_bucket_number_order
         * if assigned, and rely on automatic scaling otherwise.
         */
        if (stack_bucket_number_order) {
                entries = 1UL << stack_bucket_number_order;
        } else {
                int scale = STACK_HASH_TABLE_SCALE;

                entries = nr_free_buffer_pages();
                entries = roundup_pow_of_two(entries);

                if (scale > PAGE_SHIFT)
                        entries >>= (scale - PAGE_SHIFT);
                else
                        entries <<= (PAGE_SHIFT - scale);
        }

        if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
                entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
        if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
                entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;

        pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
        stack_table = kvcalloc(entries, sizeof(struct list_head), GFP_KERNEL);
        if (!stack_table) {
                pr_err("hash table allocation failed, disabling\n");
                stack_depot_disabled = true;
                ret = -ENOMEM;
                goto out_unlock;
        }
        stack_hash_mask = entries - 1;
        init_stack_table(entries);

out_unlock:
        mutex_unlock(&stack_depot_init_mutex);

        return ret;
}
EXPORT_SYMBOL_GPL(stack_depot_init);
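
/*
 * For illustration, a sketch (not compiled here) of how a hypothetical late
 * user of stack depot would make sure the hash table exists before its
 * first save; example_user_init() is not part of this file:
 */
#if 0
static int example_user_init(void)
{
        int ret;

        /* Idempotent; fails only if the hash table cannot be allocated. */
        ret = stack_depot_init();
        if (ret)
                return ret;

        /* From here on, stack_depot_save() may be called. */
        return 0;
}
#endif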

/*
 * Initializes a new stack pool, and updates the list of pools.
 */
static bool depot_init_pool(void **prealloc)
{
        lockdep_assert_held(&pool_lock);

        if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
                /* Bail out if we reached the pool limit. */
                WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */
                WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */
                WARN_ONCE(1, "Stack depot reached limit capacity");
                return false;
        }

        if (!new_pool && *prealloc) {
                /* We have preallocated memory, use it. */
                WRITE_ONCE(new_pool, *prealloc);
                *prealloc = NULL;
        }

        if (!new_pool)
                return false; /* new_pool and *prealloc are NULL */

        /* Save reference to the pool to be used by depot_fetch_stack(). */
        stack_pools[pools_num] = new_pool;

        /*
         * Stack depot tries to keep an extra pool allocated even before it runs
         * out of space in the currently used pool.
         *
         * To indicate that a new preallocation is needed, new_pool is reset to
         * NULL; do not reset it to NULL if we have reached the maximum number
         * of pools.
         */
        if (pools_num < DEPOT_MAX_POOLS)
                WRITE_ONCE(new_pool, NULL);
        else
                WRITE_ONCE(new_pool, STACK_DEPOT_POISON);

        /* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
        WRITE_ONCE(pools_num, pools_num + 1);
        ASSERT_EXCLUSIVE_WRITER(pools_num);

        pool_offset = 0;

        return true;
}

/* Keeps the preallocated memory to be used for a new stack depot pool. */
static void depot_keep_new_pool(void **prealloc)
{
        lockdep_assert_held(&pool_lock);

        /*
         * If a new pool is already saved or the maximum number of
         * pools is reached, do not use the preallocated memory.
         */
        if (new_pool)
                return;

        WRITE_ONCE(new_pool, *prealloc);
        *prealloc = NULL;
}

/*
 * Try to initialize a new stack record from the current pool, a cached pool, or
 * the current pre-allocation.
 */
static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size)
{
        struct stack_record *stack;
        void *current_pool;
        u32 pool_index;

        lockdep_assert_held(&pool_lock);

        if (pool_offset + size > DEPOT_POOL_SIZE) {
                if (!depot_init_pool(prealloc))
                        return NULL;
        }

        if (WARN_ON_ONCE(pools_num < 1))
                return NULL;
        pool_index = pools_num - 1;
        current_pool = stack_pools[pool_index];
        if (WARN_ON_ONCE(!current_pool))
                return NULL;

        stack = current_pool + pool_offset;

        /* Pre-initialize handle once. */
        stack->handle.pool_index = pool_index + 1;
        stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN;
        stack->handle.extra = 0;
        INIT_LIST_HEAD(&stack->hash_list);

        pool_offset += size;

        return stack;
}

/* Try to find next free usable entry from the freelist. */
static struct stack_record *depot_pop_free(void)
{
        struct stack_record *stack;

        lockdep_assert_held(&pool_lock);

        if (list_empty(&free_stacks))
                return NULL;

        /*
         * We maintain the invariant that the elements in front are least
         * recently used, and are therefore more likely to be associated with an
         * RCU grace period in the past. Consequently it is sufficient to only
         * check the first entry.
         */
        stack = list_first_entry(&free_stacks, struct stack_record, free_list);
        if (!poll_state_synchronize_rcu(stack->rcu_state))
                return NULL;

        list_del(&stack->free_list);
        counters[DEPOT_COUNTER_FREELIST_SIZE]--;

        return stack;
}

static inline size_t depot_stack_record_size(struct stack_record *s, unsigned int nr_entries)
{
        const size_t used = flex_array_size(s, entries, nr_entries);
        const size_t unused = sizeof(s->entries) - used;

        WARN_ON_ONCE(sizeof(s->entries) < used);

        return ALIGN(sizeof(struct stack_record) - unused, 1 << DEPOT_STACK_ALIGN);
}

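/*
 * For illustration, on a 64-bit machine with CONFIG_STACKDEPOT_MAX_FRAMES at
 * its default of 64: a trace of 8 frames uses 8 * 8 = 64 bytes of s->entries,
 * leaving (64 - 8) * 8 = 448 unused bytes, so the record shrinks by 448 bytes
 * before being rounded up to the 1 << DEPOT_STACK_ALIGN boundary.
 */
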
/* Allocates a new stack in a stack depot pool. */
static struct stack_record *
depot_alloc_stack(unsigned long *entries, unsigned int nr_entries, u32 hash, depot_flags_t flags, void **prealloc)
{
        struct stack_record *stack = NULL;
        size_t record_size;

        lockdep_assert_held(&pool_lock);

        /* This should already be checked by public API entry points. */
        if (WARN_ON_ONCE(!nr_entries))
                return NULL;

        /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
        if (nr_entries > CONFIG_STACKDEPOT_MAX_FRAMES)
                nr_entries = CONFIG_STACKDEPOT_MAX_FRAMES;

        if (flags & STACK_DEPOT_FLAG_GET) {
                /*
                 * Evictable entries have to allocate the max. size so they may
                 * safely be re-used by differently sized allocations.
                 */
                record_size = depot_stack_record_size(stack, CONFIG_STACKDEPOT_MAX_FRAMES);
                stack = depot_pop_free();
        } else {
                record_size = depot_stack_record_size(stack, nr_entries);
        }

        if (!stack) {
                stack = depot_pop_free_pool(prealloc, record_size);
                if (!stack)
                        return NULL;
        }

        /* Save the stack trace. */
        stack->hash = hash;
        stack->size = nr_entries;
        /* stack->handle is already filled in by depot_pop_free_pool(). */
        memcpy(stack->entries, entries, flex_array_size(stack, entries, nr_entries));

        if (flags & STACK_DEPOT_FLAG_GET) {
                refcount_set(&stack->count, 1);
                counters[DEPOT_COUNTER_REFD_ALLOCS]++;
                counters[DEPOT_COUNTER_REFD_INUSE]++;
        } else {
                /* Warn on attempts to switch to refcounting this entry. */
                refcount_set(&stack->count, REFCOUNT_SATURATED);
                counters[DEPOT_COUNTER_PERSIST_COUNT]++;
                counters[DEPOT_COUNTER_PERSIST_BYTES] += record_size;
        }

        /*
         * Let KMSAN know the stored stack record is initialized. This shall
         * prevent false positive reports if instrumented code accesses it.
         */
        kmsan_unpoison_memory(stack, record_size);

        return stack;
}

static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
{
        const int pools_num_cached = READ_ONCE(pools_num);
        union handle_parts parts = { .handle = handle };
        void *pool;
        u32 pool_index = parts.pool_index - 1;
        size_t offset = parts.offset << DEPOT_STACK_ALIGN;
        struct stack_record *stack;

        lockdep_assert_not_held(&pool_lock);

        if (pool_index >= pools_num_cached) {
                WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
                     pool_index, pools_num_cached, handle);
                return NULL;
        }

        pool = stack_pools[pool_index];
        if (WARN_ON(!pool))
                return NULL;

        stack = pool + offset;
        if (WARN_ON(!refcount_read(&stack->count)))
                return NULL;

        return stack;
}

/* Links stack into the freelist. */
static void depot_free_stack(struct stack_record *stack)
{
        unsigned long flags;

        lockdep_assert_not_held(&pool_lock);

        raw_spin_lock_irqsave(&pool_lock, flags);
        printk_deferred_enter();

        /*
         * Remove the entry from the hash list. Concurrent list traversal may
         * still observe the entry, but since the refcount is zero, this entry
         * will no longer be considered as valid.
         */
        list_del_rcu(&stack->hash_list);

        /*
         * Due to being used from constrained contexts such as the allocators,
         * NMI, or even RCU itself, stack depot cannot rely on primitives that
         * would sleep (such as synchronize_rcu()) or recursively call into
         * stack depot again (such as call_rcu()).
         *
         * Instead, get an RCU cookie, so that we can ensure this entry isn't
         * moved onto another list until the next grace period, and concurrent
         * RCU list traversal remains safe.
         */
        stack->rcu_state = get_state_synchronize_rcu();

        /*
         * Add the entry to the freelist tail, so that older entries are
         * considered first - their RCU cookie is more likely to no longer be
         * associated with the current grace period.
         */
        list_add_tail(&stack->free_list, &free_stacks);

        counters[DEPOT_COUNTER_FREELIST_SIZE]++;
        counters[DEPOT_COUNTER_REFD_FREES]++;
        counters[DEPOT_COUNTER_REFD_INUSE]--;

        printk_deferred_exit();
        raw_spin_unlock_irqrestore(&pool_lock, flags);
}

/* Calculates the hash for a stack. */
static inline u32 hash_stack(unsigned long *entries, unsigned int size)
{
        return jhash2((u32 *)entries,
                      array_size(size, sizeof(*entries)) / sizeof(u32),
                      STACK_HASH_SEED);
}

/*
 * Non-instrumented version of memcmp().
 * Does not check the lexicographical order, only the equality.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
                        unsigned int n)
{
        for ( ; n-- ; u1++, u2++) {
                if (*u1 != *u2)
                        return 1;
        }
        return 0;
}

/* Finds a stack in a bucket of the hash table. */
static inline struct stack_record *find_stack(struct list_head *bucket,
                                              unsigned long *entries, int size,
                                              u32 hash, depot_flags_t flags)
{
        struct stack_record *stack, *ret = NULL;

        /*
         * Stack depot may be used from instrumentation that instruments RCU or
         * tracing itself; use variant that does not call into RCU and cannot be
         * traced.
         *
         * Note: Such use cases must take care when using refcounting to evict
         * unused entries, because the stack record free-then-reuse code paths
         * do call into RCU.
         */
        rcu_read_lock_sched_notrace();

        list_for_each_entry_rcu(stack, bucket, hash_list) {
                if (stack->hash != hash || stack->size != size)
                        continue;

                /*
                 * This may race with depot_free_stack() accessing the freelist
                 * management state unioned with @entries. The refcount is zero
                 * in that case and the below refcount_inc_not_zero() will fail.
                 */
                if (data_race(stackdepot_memcmp(entries, stack->entries, size)))
                        continue;

                /*
                 * Try to increment refcount. If this succeeds, the stack record
                 * is valid and has not yet been freed.
                 *
                 * If STACK_DEPOT_FLAG_GET is not used, it is undefined behavior
                 * to then call stack_depot_put() later, and we can assume that
                 * a stack record is never placed back on the freelist.
                 */
                if ((flags & STACK_DEPOT_FLAG_GET) && !refcount_inc_not_zero(&stack->count))
                        continue;

                ret = stack;
                break;
        }

        rcu_read_unlock_sched_notrace();

        return ret;
}

depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
                                            unsigned int nr_entries,
                                            gfp_t alloc_flags,
                                            depot_flags_t depot_flags)
{
        struct list_head *bucket;
        struct stack_record *found = NULL;
        depot_stack_handle_t handle = 0;
        struct page *page = NULL;
        void *prealloc = NULL;
        bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
        unsigned long flags;
        u32 hash;

        if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK))
                return 0;

        /*
         * If this stack trace is from an interrupt, including anything before
         * interrupt entry usually leads to unbounded stack depot growth.
         *
         * Since use of filter_irq_stacks() is a requirement to ensure stack
         * depot can efficiently deduplicate interrupt stacks, always
         * filter_irq_stacks() to simplify all callers' use of stack depot.
         */
        nr_entries = filter_irq_stacks(entries, nr_entries);

        if (unlikely(nr_entries == 0) || stack_depot_disabled)
                return 0;

        hash = hash_stack(entries, nr_entries);
        bucket = &stack_table[hash & stack_hash_mask];

        /* Fast path: look the stack trace up without locking. */
        found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
        if (found)
                goto exit;

        /*
         * Allocate memory for a new pool if required now:
         * we won't be able to do that under the lock.
         */
        if (unlikely(can_alloc && !READ_ONCE(new_pool))) {
                /*
                 * Zero out zone modifiers, as we don't have specific zone
                 * requirements. Keep the flags related to allocation in atomic
                 * contexts and I/O.
                 */
                alloc_flags &= ~GFP_ZONEMASK;
                alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
                alloc_flags |= __GFP_NOWARN;
                page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
                if (page)
                        prealloc = page_address(page);
        }

        raw_spin_lock_irqsave(&pool_lock, flags);
        printk_deferred_enter();

        /* Try to find again, to avoid concurrently inserting duplicates. */
        found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
        if (!found) {
                struct stack_record *new =
                        depot_alloc_stack(entries, nr_entries, hash, depot_flags, &prealloc);

                if (new) {
                        /*
                         * This releases the stack record into the bucket and
                         * makes it visible to readers in find_stack().
                         */
                        list_add_rcu(&new->hash_list, bucket);
                        found = new;
                }
        }

        if (prealloc) {
                /*
                 * Either stack depot already contains this stack trace, or
                 * depot_alloc_stack() did not consume the preallocated memory.
                 * Try to keep the preallocated memory for future use.
                 */
                depot_keep_new_pool(&prealloc);
        }

        printk_deferred_exit();
        raw_spin_unlock_irqrestore(&pool_lock, flags);
exit:
        if (prealloc) {
                /* Stack depot didn't use this memory, free it. */
                free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
        }
        if (found)
                handle = found->handle.handle;
        return handle;
}
EXPORT_SYMBOL_GPL(stack_depot_save_flags);

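/*
 * For illustration, a sketch (not compiled here) of the evictable usage
 * pattern: STACK_DEPOT_FLAG_GET takes a reference on the record, which the
 * caller later drops with stack_depot_put(). example_save_evictable() is
 * hypothetical:
 */
#if 0
static depot_stack_handle_t example_save_evictable(void)
{
        unsigned long entries[16];
        unsigned int nr_entries;

        nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
        return stack_depot_save_flags(entries, nr_entries, GFP_NOWAIT,
                                      STACK_DEPOT_FLAG_CAN_ALLOC |
                                      STACK_DEPOT_FLAG_GET);
}
#endif
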
depot_stack_handle_t stack_depot_save(unsigned long *entries,
                                      unsigned int nr_entries,
                                      gfp_t alloc_flags)
{
        return stack_depot_save_flags(entries, nr_entries, alloc_flags,
                                      STACK_DEPOT_FLAG_CAN_ALLOC);
}
EXPORT_SYMBOL_GPL(stack_depot_save);

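/*
 * For illustration, a sketch (not compiled here) of the basic round trip;
 * saving an identical trace twice returns the same handle, since find_stack()
 * deduplicates against the hash table. example_round_trip() is hypothetical:
 */
#if 0
static void example_round_trip(void)
{
        unsigned long entries[16], *fetched;
        unsigned int nr_entries, nr_fetched;
        depot_stack_handle_t handle;

        nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
        handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);
        if (!handle)
                return;

        nr_fetched = stack_depot_fetch(handle, &fetched);
        stack_trace_print(fetched, nr_fetched, 0);
}
#endif
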
struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle)
{
        if (!handle)
                return NULL;

        return depot_fetch_stack(handle);
}

unsigned int stack_depot_fetch(depot_stack_handle_t handle,
                               unsigned long **entries)
{
        struct stack_record *stack;

        *entries = NULL;
        /*
         * Let KMSAN know *entries is initialized. This shall prevent false
         * positive reports if instrumented code accesses it.
         */
        kmsan_unpoison_memory(entries, sizeof(*entries));

        if (!handle || stack_depot_disabled)
                return 0;

        stack = depot_fetch_stack(handle);
        /*
         * Should never be NULL, otherwise this is a use-after-put (or just a
         * corrupt handle).
         */
        if (WARN(!stack, "corrupt handle or use after stack_depot_put()"))
                return 0;

        *entries = stack->entries;
        return stack->size;
}
EXPORT_SYMBOL_GPL(stack_depot_fetch);

void stack_depot_put(depot_stack_handle_t handle)
{
        struct stack_record *stack;

        if (!handle || stack_depot_disabled)
                return;

        stack = depot_fetch_stack(handle);
        /*
         * Should always be able to find the stack record, otherwise this is an
         * unbalanced put attempt (or corrupt handle).
         */
        if (WARN(!stack, "corrupt handle or unbalanced stack_depot_put()"))
                return;

        if (refcount_dec_and_test(&stack->count))
                depot_free_stack(stack);
}
EXPORT_SYMBOL_GPL(stack_depot_put);

void stack_depot_print(depot_stack_handle_t stack)
{
        unsigned long *entries;
        unsigned int nr_entries;

        nr_entries = stack_depot_fetch(stack, &entries);
        if (nr_entries > 0)
                stack_trace_print(entries, nr_entries, 0);
}
EXPORT_SYMBOL_GPL(stack_depot_print);

int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
                        int spaces)
{
        unsigned long *entries;
        unsigned int nr_entries;

        nr_entries = stack_depot_fetch(handle, &entries);
        return nr_entries ? stack_trace_snprint(buf, size, entries, nr_entries,
                                                spaces) : 0;
}
EXPORT_SYMBOL_GPL(stack_depot_snprint);

depot_stack_handle_t __must_check stack_depot_set_extra_bits(
                        depot_stack_handle_t handle, unsigned int extra_bits)
{
        union handle_parts parts = { .handle = handle };

        /* Don't set extra bits on empty handles. */
        if (!handle)
                return 0;

        parts.extra = extra_bits;
        return parts.handle;
}
EXPORT_SYMBOL(stack_depot_set_extra_bits);

unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
{
        union handle_parts parts = { .handle = handle };

        return parts.extra;
}
EXPORT_SYMBOL(stack_depot_get_extra_bits);
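
/*
 * For illustration, a sketch (not compiled here) of how a user such as KMSAN
 * can fold a small tag into a handle's spare bits and recover it later;
 * example_tag_handle() is hypothetical:
 */
#if 0
static depot_stack_handle_t example_tag_handle(depot_stack_handle_t handle)
{
        /* Store a 2-bit tag alongside the handle... */
        depot_stack_handle_t tagged = stack_depot_set_extra_bits(handle, 0x3);

        /* ...stack_depot_get_extra_bits(tagged) now returns 0x3. */
        return tagged;
}
#endif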

static int stats_show(struct seq_file *seq, void *v)
{
        /*
         * data race ok: These are just statistics counters, and approximate
         * statistics are ok for debugging.
         */
        seq_printf(seq, "pools: %d\n", data_race(pools_num));
        for (int i = 0; i < DEPOT_COUNTER_COUNT; i++)
                seq_printf(seq, "%s: %ld\n", counter_names[i], data_race(counters[i]));

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(stats);

static int depot_debugfs_init(void)
{
        struct dentry *dir;

        if (stack_depot_disabled)
                return 0;

        dir = debugfs_create_dir("stackdepot", NULL);
        debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
        return 0;
}
late_initcall(depot_debugfs_init);
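
/*
 * For illustration: with debugfs mounted at /sys/kernel/debug, the counters
 * above can be inspected with
 *
 *	# cat /sys/kernel/debug/stackdepot/stats
 *
 * which prints the pool count followed by one "name: value" line per entry
 * of counter_names[].
 */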