// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "error.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>

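/*
 * The btree node reserve is sized so that allocating memory for a new node
 * (e.g. for a split, or a new root) can always succeed - see the comment in
 * bch2_btree_cache_scan():
 */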
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots[0].b)
		reserve += 8;

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			reserve += min_t(unsigned, 1,
					 c->btree_roots[i].b->c.level) * 8;

	c->btree_cache.reserve = reserve;
}

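/* Nodes in the cache beyond the reserve, i.e. how many the shrinker may free: */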
static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
	return max_t(int, 0, bc->used - bc->reserve);
}

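/*
 * Free a node's data buffers, but not the struct btree itself - freed nodes
 * are kept on bc->freed so the allocation can be reused:
 */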
static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	EBUG_ON(btree_node_write_in_flight(b));

	kvpfree(b->data, btree_bytes(c));
	b->data = NULL;
	kvfree(b->aux_data);
	b->aux_data = NULL;
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	__btree_node_data_free(c, b);
	bc->used--;
	list_move(&b->list, &bc->freed);
}

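/*
 * Cached nodes are hashed by a 64 bit value derived from their key
 * (b->hash_val), not by the full key:
 */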
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};

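/* Allocate the data and aux_data buffers for an already allocated struct btree: */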
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvpmalloc(btree_bytes(c), gfp);
	if (!b->data)
		return -ENOMEM;

	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
	if (!b->aux_data) {
		kvpfree(b->data, btree_bytes(c));
		b->data = NULL;
		return -ENOMEM;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	six_lock_init(&b->c.lock);
	lockdep_set_novalidate_class(&b->c.lock);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(btree_bytes(c));
	return b;
}

static struct btree *btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b = __btree_node_mem_alloc(c);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	six_lock_wakeup_all(&b->c.lock);
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level	= level;
	b->c.btree_id	= id;

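	/*
	 * Interior nodes get per-cpu reader counts on their locks (likely
	 * because they're read locked far more often than leaves):
	 */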
	if (level)
		six_lock_pcpu_alloc(&b->c.lock);
	else
		six_lock_pcpu_free_rcu(&b->c.lock);

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);

	if (!six_trylock_intent(&b->c.lock))
		return -ENOMEM;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	if (btree_node_noevict(b))
		goto out_unlock;

	if (!btree_node_may_write(b))
		goto out_unlock;

	if (btree_node_dirty(b) &&
	    test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
		goto out_unlock;

	if (btree_node_dirty(b) ||
	    btree_node_write_in_flight(b) ||
	    btree_node_read_in_flight(b)) {
		if (!flush)
			goto out_unlock;

		wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
			       TASK_UNINTERRUPTIBLE);

		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent);
		else
			__bch2_btree_node_write(c, b);

		/* wait for any in flight btree write */
		btree_node_wait_on_io(b);
	}
out:
	if (b->hash_val && !ret)
		trace_btree_node_reap(c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -ENOMEM;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}

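/*
 * Shrinker callback: the shrinker core works in units of pages, so we convert
 * to and from btree nodes with btree_pages(c):
 */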
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free;
	unsigned long touched = 0;
	unsigned long freed = 0;
	unsigned i, flags;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_FS)
		mutex_lock(&bc->lock);
	else if (!mutex_trylock(&bc->lock))
		return -1;

	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	nr /= btree_pages(c);
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (freed >= nr)
			break;

		if (!btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (freed >= nr) {
			/* Save position */
			if (&t->list != &bc->live)
				list_move_tail(&bc->live, &t->list);
			break;
		}

		if (!btree_node_accessed(b) &&
		    !btree_node_reclaim(c, b)) {
			/* can't call bch2_btree_node_hash_remove under lock */
			freed++;
			if (&t->list != &bc->live)
				list_move_tail(&bc->live, &t->list);

			btree_node_data_free(c, b);
			mutex_unlock(&bc->lock);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed >= nr)
				goto out;

			if (sc->gfp_mask & __GFP_FS)
				mutex_lock(&bc->lock);
			else if (!mutex_trylock(&bc->lock))
				goto out;
			goto restart;
		} else
			clear_btree_node_accessed(b);
	}

	mutex_unlock(&bc->lock);
out:
	memalloc_nofs_restore(flags);
	return (unsigned long) freed * btree_pages(c);
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;

	if (bch2_btree_shrinker_disabled)
		return 0;

	return btree_cache_can_free(bc) * btree_pages(c);
}

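/* Shutdown path: free every cached node and tear down the hash table: */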
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	if (bc->shrink.list.next)
		unregister_shrinker(&bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

#ifdef CONFIG_BCACHEFS_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvpfree(c->verify_ondisk, btree_bytes(c));
#endif

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			list_add(&c->btree_roots[i].b->list, &bc->live);

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		if (btree_node_dirty(b))
			bch2_btree_complete_write(c, b, btree_current_write(b));
		clear_btree_node_dirty(c, b);

		btree_node_data_free(c, b);
	}

	BUG_ON(atomic_read(&c->btree_cache.dirty));

	while (!list_empty(&bc->freed)) {
		b = list_first_entry(&bc->freed, struct btree, list);
		list_del(&b->list);
		six_lock_pcpu_free(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}

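/* Allocate the hash table and prefill the cache with bc->reserve nodes: */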
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	unsigned i;
	int ret = 0;

	pr_verbose_init(c->opts, "");

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto out;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!btree_node_mem_alloc(c)) {
			ret = -ENOMEM;
			goto out;
		}

	list_splice_init(&bc->live, &bc->freeable);

#ifdef CONFIG_BCACHEFS_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
	if (!c->verify_ondisk) {
		ret = -ENOMEM;
		goto out;
	}

	c->verify_data = btree_node_mem_alloc(c);
	if (!c->verify_data) {
		ret = -ENOMEM;
		goto out;
	}

	list_del_init(&c->verify_data->list);
#endif

	bc->shrink.count_objects	= bch2_btree_cache_count;
	bc->shrink.scan_objects		= bch2_btree_cache_scan;
	bc->shrink.seeks		= 4;
	bc->shrink.batch		= btree_pages(c) * 2;
	ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
out:
	pr_verbose_init(c->opts, "ret %i", ret);
	return ret;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which
 * bch2_btree_cache_cannibalize_lock() will take. This means every time we
 * unlock the root of the btree, we need to release this lock if we have it
 * held.
 */
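/*
 * Usage sketch (not taken verbatim from any one caller): take the cannibalize
 * lock around the allocation, and drop it as soon as the allocation is done:
 *
 *	ret = bch2_btree_cache_cannibalize_lock(c, &cl);
 *	b = bch2_btree_node_mem_alloc(c);
 *	bch2_btree_cache_cannibalize_unlock(c);
 */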
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_btree_node_cannibalize_unlock(c);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_btree_node_cannibalize_lock_fail(c);
		return -ENOMEM;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_btree_node_cannibalize_lock_fail(c);
	return -EAGAIN;

success:
	trace_btree_node_cannibalize_lock(c);
	return 0;
}

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

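/*
 * Allocate a new btree node, trying in order: a node off the freeable list, a
 * freed node that needs new data buffers, a fresh allocation, and finally - if
 * we hold the cannibalize lock - evicting another cached node:
 */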
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &bc->freeable, list)
		if (!btree_node_reclaim(c, b))
			goto got_node;

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &bc->freed, list)
		if (!btree_node_reclaim(c, b))
			goto got_node;

	b = NULL;
got_node:
	if (b)
		list_del_init(&b->list);
	mutex_unlock(&bc->lock);

	if (!b) {
		b = __btree_node_mem_alloc(c);
		if (!b)
			goto err;

		BUG_ON(!six_trylock_intent(&b->c.lock));
		BUG_ON(!six_trylock_write(&b->c.lock));
	}

	if (!b->data) {
		if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
			goto err;

		mutex_lock(&bc->lock);
		bc->used++;
		mutex_unlock(&bc->lock);
	}

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	if (b) {
		list_add(&b->list, &bc->freed);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
	}

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b = btree_node_cannibalize(c);
		list_del_init(&b->list);
		mutex_unlock(&bc->lock);

		bch2_btree_node_hash_remove(bc, b);

		trace_btree_node_cannibalize(c);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-ENOMEM);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
				struct btree_iter *iter,
				const struct bkey_i *k,
				enum btree_id btree_id,
				unsigned level,
				enum six_lock_type lock_type,
				bool sync)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (iter && !bch2_btree_node_relock(iter, level + 1))
		return ERR_PTR(-EINTR);

	b = bch2_btree_node_mem_alloc(c);
	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	/*
	 * Unlock before doing IO:
	 *
	 * XXX: ideally should be dropping all btree node locks here
	 */
	if (iter && btree_node_read_locked(iter, level + 1))
		btree_node_unlock(iter, level + 1);

	bch2_btree_node_read(c, b, sync);

	six_unlock_write(&b->c.lock);

	if (!sync) {
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	if (lock_type == SIX_LOCK_read)
		six_lock_downgrade(&b->c.lock);

	return b;
}

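/*
 * Passed to six_lock_read()/btree_node_lock(): after blocking on the lock,
 * verify the node still has the hash_val we looked it up by (freed nodes have
 * hash_val zeroed):
 */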
static int lock_node_check_fn(struct six_lock *lock, void *p)
{
	struct btree *b = container_of(lock, struct btree, c.lock);
	const struct bkey_i *k = p;

	return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or an intent lock held, depending on
 * @lock_type.
 */
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);
	if (b)
		goto lock_node;
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
					 level, lock_type, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
lock_node:
		/*
		 * There's a potential deadlock with splits and insertions into
		 * interior nodes we have to avoid:
		 *
		 * The other thread might be holding an intent lock on the node
		 * we want, and they want to update its parent node so they're
		 * going to upgrade their intent lock on the parent node to a
		 * write lock.
		 *
		 * But if we're holding a read lock on the parent, and we're
		 * trying to get the intent lock they're holding, we deadlock.
		 *
		 * So to avoid this we drop the read locks on parent nodes when
		 * we're starting to take intent locks - and handle the race.
		 *
		 * The race is that they might be about to free the node we
		 * want, and dropping our read lock on the parent node lets them
		 * update the parent marking the node we want as freed, and then
		 * free it:
		 *
		 * To guard against this, btree nodes are evicted from the cache
		 * when they're freed - and b->hash_val is zeroed out, which we
		 * check for after we lock the node.
		 *
		 * Then, bch2_btree_node_relock() on the parent will fail - because
		 * the parent was modified, when the pointer to the node we want
		 * was removed - and we'll bail out:
		 */
		if (btree_node_read_locked(iter, level + 1))
			btree_node_unlock(iter, level + 1);

		if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
				     lock_node_check_fn, (void *) k, trace_ip)) {
			if (b->hash_val != btree_ptr_hash_val(k))
				goto retry;
			return ERR_PTR(-EINTR);
		}

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(iter, level + 1))
				goto retry;

			trace_trans_restart_btree_node_reused(iter->trans->ip);
			return ERR_PTR(-EINTR);
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != iter->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
	EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		bpos_cmp(b->data->min_key,
			 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));

	return b;
}

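/*
 * Like bch2_btree_node_get(), but for callers that don't have a btree
 * iterator (and thus no parent lock to rely on):
 */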
struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);
	if (b)
		goto lock_node;
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(c, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(c, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
		if (ret)
			goto retry;

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-EIO);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
	EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
		bpos_cmp(b->data->min_key,
			 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
out:
	bch2_btree_cache_cannibalize_unlock(c);
	return b;
}

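/* Kick off an asynchronous read for a node we expect to need soon: */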
void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
			      const struct bkey_i *k,
			      enum btree_id btree_id, unsigned level)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(iter && !btree_node_locked(iter, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_cache_find(bc, k);
	if (b)
		return;

	bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
			     struct btree *b)
{
	const struct bkey_format *f = &b->format;
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	pr_buf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	pr_buf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	pr_buf(out, ":\n"
	       "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));

	pr_buf(out, "\n"
	       "    format: u64s %u fields %u %u %u %u %u\n"
	       "    unpack fn len: %u\n"
	       "    bytes used %zu/%zu (%zu%% full)\n"
	       "    sib u64s: %u, %u (merge threshold %u)\n"
	       "    nr packed keys %u\n"
	       "    nr unpacked keys %u\n"
	       "    floats %zu\n"
	       "    failed unpacked %zu\n",
	       f->key_u64s,
	       f->bits_per_field[0],
	       f->bits_per_field[1],
	       f->bits_per_field[2],
	       f->bits_per_field[3],
	       f->bits_per_field[4],
	       b->unpack_fn_len,
	       b->nr.live_u64s * sizeof(u64),
	       btree_bytes(c) - sizeof(struct btree_node),
	       b->nr.live_u64s * 100 / btree_max_u64s(c),
	       b->sib_u64s[0],
	       b->sib_u64s[1],
	       c->btree_foreground_merge_threshold,
	       b->nr.packed_keys,
	       b->nr.unpacked_keys,
	       stats.floats,
	       stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
{
	pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
	pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
	pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}