bcachefs: Simplify hash table checks
fs/bcachefs/btree_cache.c [linux-block.git]
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "error.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>

void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots[0].b)
		reserve += 8;

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			reserve += min_t(unsigned, 1,
					 c->btree_roots[i].b->c.level) * 8;

	c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
	return max_t(int, 0, bc->used - bc->reserve);
}

static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	EBUG_ON(btree_node_write_in_flight(b));

	kvpfree(b->data, btree_bytes(c));
	b->data = NULL;
	kvfree(b->aux_data);
	b->aux_data = NULL;
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	__btree_node_data_free(c, b);
	bc->used--;
	list_move(&b->list, &bc->freed);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};
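
/*
 * Note (added for clarity): cached btree nodes are indexed in the rhashtable
 * by b->hash_val, a 64-bit value derived from the node's key via
 * btree_ptr_hash_val(), rather than by the full bkey. The compare function
 * above therefore only checks a single u64, and zeroing b->hash_val is enough
 * to make future lookups for that node fail.
 */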

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvpmalloc(btree_bytes(c), gfp);
	if (!b->data)
		return -ENOMEM;

	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
	if (!b->aux_data) {
		kvpfree(b->data, btree_bytes(c));
		b->data = NULL;
		return -ENOMEM;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	six_lock_init(&b->c.lock);
	lockdep_set_novalidate_class(&b->c.lock);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(btree_bytes(c));
	return b;
}

static struct btree *btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b = __btree_node_mem_alloc(c);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	six_lock_wakeup_all(&b->c.lock);
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level	= level;
	b->c.btree_id	= id;

	if (level)
		six_lock_pcpu_alloc(&b->c.lock);
	else
		six_lock_pcpu_free_rcu(&b->c.lock);

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
				     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);

	if (!six_trylock_intent(&b->c.lock))
		return -ENOMEM;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	if (btree_node_noevict(b))
		goto out_unlock;

	if (!btree_node_may_write(b))
		goto out_unlock;

	if (btree_node_dirty(b) &&
	    test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
		goto out_unlock;

	if (btree_node_dirty(b) ||
	    btree_node_write_in_flight(b) ||
	    btree_node_read_in_flight(b)) {
		if (!flush)
			goto out_unlock;

		wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
			       TASK_UNINTERRUPTIBLE);

		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent);
		else
			__bch2_btree_node_write(c, b);

		/* wait for any in flight btree write */
		btree_node_wait_on_io(b);
	}
out:
	if (b->hash_val && !ret)
		trace_btree_node_reap(c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -ENOMEM;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}
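
/*
 * Note (added for clarity): on success the reclaim helpers above return 0 with
 * both the intent and write locks held on the node; the caller (the shrinker,
 * the cannibalize path, etc.) frees or reuses the node and then drops the
 * locks itself. On failure, any locks taken are dropped and -ENOMEM is
 * returned.
 */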

static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free;
	unsigned long touched = 0;
	unsigned long freed = 0;
	unsigned i, flags;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_FS)
		mutex_lock(&bc->lock);
	else if (!mutex_trylock(&bc->lock))
		return -1;

	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	nr /= btree_pages(c);
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (freed >= nr)
			break;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (freed >= nr) {
			/* Save position */
			if (&t->list != &bc->live)
				list_move_tail(&bc->live, &t->list);
			break;
		}

		if (!btree_node_accessed(b) &&
		    !btree_node_reclaim(c, b)) {
			/* can't call bch2_btree_node_hash_remove under lock */
			freed++;
			if (&t->list != &bc->live)
				list_move_tail(&bc->live, &t->list);

			btree_node_data_free(c, b);
			mutex_unlock(&bc->lock);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed >= nr)
				goto out;

			if (sc->gfp_mask & __GFP_FS)
				mutex_lock(&bc->lock);
			else if (!mutex_trylock(&bc->lock))
				goto out;
			goto restart;
		} else
			clear_btree_node_accessed(b);
	}

	mutex_unlock(&bc->lock);
out:
	memalloc_nofs_restore(flags);
	return (unsigned long) freed * btree_pages(c);
}
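
/*
 * Note (added for clarity): the shrinker callbacks (bch2_btree_cache_scan
 * above, bch2_btree_cache_count below) work in units of pages: nr_to_scan is
 * converted to a number of btree nodes via btree_pages(c), and the counts
 * reported back to the shrinker core are scaled up the same way.
 */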

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(bc) * btree_pages(c);
}

void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	if (bc->shrink.list.next)
		unregister_shrinker(&bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

#ifdef CONFIG_BCACHEFS_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvpfree(c->verify_ondisk, btree_bytes(c));
#endif

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			list_add(&c->btree_roots[i].b->list, &bc->live);

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		if (btree_node_dirty(b))
			bch2_btree_complete_write(c, b, btree_current_write(b));
		clear_btree_node_dirty(c, b);

		btree_node_data_free(c, b);
	}

	BUG_ON(atomic_read(&c->btree_cache.dirty));

	while (!list_empty(&bc->freed)) {
		b = list_first_entry(&bc->freed, struct btree, list);
		list_del(&b->list);
		six_lock_pcpu_free(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	unsigned i;
	int ret = 0;

	pr_verbose_init(c->opts, "");

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto out;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!btree_node_mem_alloc(c)) {
			ret = -ENOMEM;
			goto out;
		}

	list_splice_init(&bc->live, &bc->freeable);

#ifdef CONFIG_BCACHEFS_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
	if (!c->verify_ondisk) {
		ret = -ENOMEM;
		goto out;
	}

	c->verify_data = btree_node_mem_alloc(c);
	if (!c->verify_data) {
		ret = -ENOMEM;
		goto out;
	}

	list_del_init(&c->verify_data->list);
#endif

	bc->shrink.count_objects	= bch2_btree_cache_count;
	bc->shrink.scan_objects		= bch2_btree_cache_scan;
	bc->shrink.seeks		= 4;
	bc->shrink.batch		= btree_pages(c) * 2;
	ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
out:
	pr_verbose_init(c->opts, "ret %i", ret);
	return ret;
}
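
/*
 * Note (added for clarity): bch2_fs_btree_cache_init() allocates bc->reserve
 * nodes up front, which end up on the freeable list, so a reserve of nodes
 * exists from the start; btree_cache_can_free() keeps the shrinker from ever
 * dipping into that reserve. The shrinker itself is registered last.
 */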

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which
 * bch2_btree_cache_cannibalize_lock() will take. This means every time we
 * unlock the root of the btree, we need to release this lock if we have it
 * held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_btree_node_cannibalize_unlock(c);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_btree_node_cannibalize_lock_fail(c);
		return -ENOMEM;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_btree_node_cannibalize_lock_fail(c);
	return -EAGAIN;

success:
	trace_btree_node_cannibalize_lock(c);
	return 0;
}
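
/*
 * Illustrative sketch (not taken verbatim from any particular caller): code
 * that may need to cannibalize typically brackets its allocation attempts like
 *
 *	bch2_btree_cache_cannibalize_lock(c, &cl);
 *	b = bch2_btree_node_mem_alloc(c);
 *	...
 *	bch2_btree_cache_cannibalize_unlock(c);
 *
 * so that if bch2_btree_node_mem_alloc() can't get memory any other way it is
 * allowed to evict (cannibalize) another cached node instead.
 * bch2_btree_node_get_noiter() below uses the same pattern with a NULL
 * closure.
 */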

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &bc->freeable, list)
		if (!btree_node_reclaim(c, b))
			goto got_node;

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &bc->freed, list)
		if (!btree_node_reclaim(c, b))
			goto got_node;

	b = NULL;
got_node:
	if (b)
		list_del_init(&b->list);
	mutex_unlock(&bc->lock);

	if (!b) {
		b = __btree_node_mem_alloc(c);
		if (!b)
			goto err;

		BUG_ON(!six_trylock_intent(&b->c.lock));
		BUG_ON(!six_trylock_write(&b->c.lock));
	}

	if (!b->data) {
		if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
			goto err;

		mutex_lock(&bc->lock);
		bc->used++;
		mutex_unlock(&bc->lock);
	}

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	if (b) {
		list_add(&b->list, &bc->freed);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
	}

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b = btree_node_cannibalize(c);
		list_del_init(&b->list);
		mutex_unlock(&bc->lock);

		bch2_btree_node_hash_remove(bc, b);

		trace_btree_node_cannibalize(c);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-ENOMEM);
}
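
/*
 * Note (added for clarity): bch2_btree_node_mem_alloc() tries, in order, to
 * reclaim a node from the freeable list, to reuse a struct btree from the
 * freed list (reallocating its data buffers), to allocate a fresh node, and
 * finally - if this thread holds the cannibalize lock - to evict another
 * cached node. The node is returned with the intent and write locks held.
 */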

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
				struct btree_iter *iter,
				const struct bkey_i *k,
				enum btree_id btree_id,
				unsigned level,
				enum six_lock_type lock_type,
				bool sync)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (iter && !bch2_btree_node_relock(iter, level + 1))
		return ERR_PTR(-EINTR);

	b = bch2_btree_node_mem_alloc(c);
	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	/*
	 * Unlock before doing IO:
	 *
	 * XXX: ideally should be dropping all btree node locks here
	 */
	if (iter && btree_node_read_locked(iter, level + 1))
		btree_node_unlock(iter, level + 1);

	bch2_btree_node_read(c, b, sync);

	six_unlock_write(&b->c.lock);

	if (!sync) {
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	if (lock_type == SIX_LOCK_read)
		six_lock_downgrade(&b->c.lock);

	return b;
}

static int lock_node_check_fn(struct six_lock *lock, void *p)
{
	struct btree *b = container_of(lock, struct btree, c.lock);
	const struct bkey_i *k = p;

	return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will be returned locked with the lock type given by
 * @lock_type.
 */
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);
	if (b)
		goto lock_node;
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
					 level, lock_type, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
lock_node:
		/*
		 * There's a potential deadlock with splits and insertions into
		 * interior nodes we have to avoid:
		 *
		 * The other thread might be holding an intent lock on the node
		 * we want, and they want to update its parent node so they're
		 * going to upgrade their intent lock on the parent node to a
		 * write lock.
		 *
		 * But if we're holding a read lock on the parent, and we're
		 * trying to get the intent lock they're holding, we deadlock.
		 *
		 * So to avoid this we drop the read locks on parent nodes when
		 * we're starting to take intent locks - and handle the race.
		 *
		 * The race is that they might be about to free the node we
		 * want, and dropping our read lock on the parent node lets them
		 * update the parent marking the node we want as freed, and then
		 * free it:
		 *
		 * To guard against this, btree nodes are evicted from the cache
		 * when they're freed - and b->hash_val is zeroed out, which we
		 * check for after we lock the node.
		 *
		 * Then, bch2_btree_node_relock() on the parent will fail - because
		 * the parent was modified, when the pointer to the node we want
		 * was removed - and we'll bail out:
		 */
		if (btree_node_read_locked(iter, level + 1))
			btree_node_unlock(iter, level + 1);

		if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
				     lock_node_check_fn, (void *) k, trace_ip)) {
			if (b->hash_val != btree_ptr_hash_val(k))
				goto retry;
			return ERR_PTR(-EINTR);
		}

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(iter, level + 1))
				goto retry;

			trace_trans_restart_btree_node_reused(iter->trans->ip);
			return ERR_PTR(-EINTR);
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != iter->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
	EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		bpos_cmp(b->data->min_key,
			 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));

	return b;
}
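
/*
 * Note (added for clarity): bch2_btree_node_get_noiter() below is the same
 * lookup-or-read path for callers that don't have a btree iterator (and hence
 * hold no parent locks); it only takes read locks, and @nofill lets it skip
 * reading a node that isn't already cached.
 */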

struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);
	if (b)
		goto lock_node;
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(c, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(c, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
		if (ret)
			goto retry;

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-EIO);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
	EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		bpos_cmp(b->data->min_key,
			 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
out:
	bch2_btree_cache_cannibalize_unlock(c);
	return b;
}

void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
			      const struct bkey_i *k,
			      enum btree_id btree_id, unsigned level)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(iter && !btree_node_locked(iter, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_cache_find(bc, k);
	if (b)
		return;

	bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
			     struct btree *b)
{
	const struct bkey_format *f = &b->format;
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	pr_buf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	pr_buf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	pr_buf(out, ":\n"
	       "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));

	pr_buf(out, "\n"
	       "    format: u64s %u fields %u %u %u %u %u\n"
	       "    unpack fn len: %u\n"
	       "    bytes used %zu/%zu (%zu%% full)\n"
	       "    sib u64s: %u, %u (merge threshold %u)\n"
	       "    nr packed keys %u\n"
	       "    nr unpacked keys %u\n"
	       "    floats %zu\n"
	       "    failed unpacked %zu\n",
	       f->key_u64s,
	       f->bits_per_field[0],
	       f->bits_per_field[1],
	       f->bits_per_field[2],
	       f->bits_per_field[3],
	       f->bits_per_field[4],
	       b->unpack_fn_len,
	       b->nr.live_u64s * sizeof(u64),
	       btree_bytes(c) - sizeof(struct btree_node),
	       b->nr.live_u64s * 100 / btree_max_u64s(c),
	       b->sib_u64s[0],
	       b->sib_u64s[1],
	       c->btree_foreground_merge_threshold,
	       b->nr.packed_keys,
	       b->nr.unpacked_keys,
	       stats.floats,
	       stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
{
	pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
	pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
	pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}