Commit | Line | Data |
---|---|---|
1c6fdbd8 KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include "bcachefs.h" | |
91dcad18 | 4 | #include "bbpos.h" |
07a1006a | 5 | #include "bkey_buf.h" |
1c6fdbd8 KO |
6 | #include "btree_cache.h" |
7 | #include "btree_io.h" | |
8 | #include "btree_iter.h" | |
9 | #include "btree_locking.h" | |
10 | #include "debug.h" | |
549d173c | 11 | #include "errcode.h" |
a0b73c1c | 12 | #include "error.h" |
0117591e | 13 | #include "journal.h" |
1c6fdbd8 KO |
14 | #include "trace.h" |
15 | ||
16 | #include <linux/prefetch.h> | |
e0dfc08b | 17 | #include <linux/sched/mm.h> |
1c6fdbd8 | 18 | |
/*
 * Human-readable names for the BTREE_NODE_* flag bits, generated from the
 * BTREE_FLAGS() x-macro; NULL-terminated for iteration.
 */
const char * const bch2_btree_node_flags[] = {
#define x(f)	#f,
	BTREE_FLAGS()
#undef x
	NULL
};
25 | ||
/*
 * Recompute the number of btree nodes the cache must always keep in reserve.
 *
 * The reserve guarantees that interior updates (splits, merges) can always
 * obtain node memory; base of 16, plus 8 if the first btree root isn't set up
 * yet, plus 8 for every live btree whose root is an interior node
 * (min_t(unsigned, 1, level) is 0 for leaf roots, 1 otherwise).
 */
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots_known[0].b)
		reserve += 8;

	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			reserve += min_t(unsigned, 1, r->b->c.level) * 8;
	}

	c->btree_cache.reserve = reserve;
}
42 | ||
43 | static inline unsigned btree_cache_can_free(struct btree_cache *bc) | |
44 | { | |
45 | return max_t(int, 0, bc->used - bc->reserve); | |
46 | } | |
47 | ||
30985537 KO |
48 | static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) |
49 | { | |
50 | if (b->c.lock.readers) | |
51 | list_move(&b->list, &bc->freed_pcpu); | |
52 | else | |
53 | list_move(&b->list, &bc->freed_nonpcpu); | |
54 | } | |
55 | ||
/*
 * Release a node's data buffers (but not struct btree itself) and move it to
 * the freed list. Caller must guarantee no write is in flight.
 */
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	EBUG_ON(btree_node_write_in_flight(b));

	clear_btree_node_just_written(b);

	kvfree(b->data);
	b->data = NULL;
#ifdef __KERNEL__
	kvfree(b->aux_data);
#else
	/* userspace build: aux_data came from mmap(), not kvmalloc() */
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	/* node no longer counts against the cache's memory budget */
	bc->used--;

	btree_node_to_freedlist(bc, b);
}
77 | ||
78 | static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, | |
79 | const void *obj) | |
80 | { | |
81 | const struct btree *b = obj; | |
82 | const u64 *v = arg->key; | |
83 | ||
237e8048 | 84 | return b->hash_val == *v ? 0 : 1; |
1c6fdbd8 KO |
85 | } |
86 | ||
/* rhashtable config: key is the precomputed u64 hash_val embedded in struct btree */
static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};
93 | ||
/*
 * Allocate the data and aux_data buffers for a node that has neither.
 *
 * Returns 0 or -BCH_ERR_ENOMEM_btree_node_mem_alloc; on failure b->data is
 * freed again so the node is left with no buffers (both NULL).
 */
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	/* userspace build: anonymous mmap; normalize MAP_FAILED to NULL */
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (!b->aux_data) {
		kvfree(b->data);
		b->data = NULL;
		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
	}

	return 0;
}
118 | ||
c36ff038 | 119 | static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) |
e38821f3 | 120 | { |
a1019576 KO |
121 | struct btree *b; |
122 | ||
123 | b = kzalloc(sizeof(struct btree), gfp); | |
1c6fdbd8 KO |
124 | if (!b) |
125 | return NULL; | |
126 | ||
26609b61 | 127 | bkey_btree_ptr_init(&b->key); |
1c6fdbd8 KO |
128 | INIT_LIST_HEAD(&b->list); |
129 | INIT_LIST_HEAD(&b->write_blocked); | |
ec4edd7b | 130 | b->byte_order = ilog2(c->opts.btree_node_size); |
4580baec KO |
131 | return b; |
132 | } | |
133 | ||
/*
 * Allocate a fully-backed btree node (struct + data buffers), initialize its
 * lock, and put it on the freeable list. Used to prefill the cache reserve.
 * Returns NULL on allocation failure.
 */
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = __btree_node_mem_alloc(c, GFP_KERNEL);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bch2_btree_lock_init(&b->c, 0);

	/* counts against the cache budget as soon as it has data buffers */
	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}
154 | ||
155 | /* Btree in memory cache - hash table */ | |
156 | ||
/*
 * Unhash a node; the node must currently be hashed (BUG otherwise).
 * Clearing hash_val also invalidates the fast-path identity check in lookups.
 */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	BUG_ON(ret);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;
}
166 | ||
/*
 * Hash a node by its key's pointer hash. The node must not already be hashed
 * (hash_val == 0). Returns nonzero if an entry with the same key already
 * exists (a racing fill).
 */
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}
175 | ||
/*
 * Set a node's level/btree id, hash it, and on success append it to the LRU
 * live list (tail = most recently used end, per the scan's eviction order).
 * Returns the __bch2_btree_node_hash_insert() result.
 */
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level	= level;
	b->c.btree_id	= id;

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add_tail(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}
192 | ||
/* Look up a cached node by the hash of its key; NULL if not cached. */
__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
					     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}
201 | ||
/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 *
 * Try to make a node reclaimable: returns 0 with the node's intent+write
 * locks held (caller unlocks), or -BCH_ERR_ENOMEM_btree_node_reclaim.
 * With @flush set, dirty nodes are written out and in-flight IO is waited on;
 * otherwise any of those conditions fails the reclaim immediately.
 * Pinned nodes (per-btree mask + [start,end) bbpos range) are never reclaimed.
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);

	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);

	/* interior and leaf nodes have separate pin masks */
	u64 mask = b->c.level
		? bc->pinned_nodes_interior_mask
		: bc->pinned_nodes_leaf_mask;

	if ((mask & BIT_ULL(b->c.btree_id)) &&
	    bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
	    bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

wait_on_io:
	/* unlocked peek at the flags; rechecked under lock below */
	if (b->flags & ((1U << BTREE_NODE_dirty)|
			(1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			return -BCH_ERR_ENOMEM_btree_node_reclaim;

		/* XXX: waiting on IO with btree cache lock held */
		bch2_btree_node_wait_on_read(b);
		bch2_btree_node_wait_on_write(b);
	}

	if (!six_trylock_intent(&b->c.lock))
		return -BCH_ERR_ENOMEM_btree_node_reclaim;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	/* recheck under lock */
	if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
			(1U << BTREE_NODE_write_in_flight))) {
		if (!flush)
			goto out_unlock;
		/* drop locks before sleeping on IO, then retry from the top */
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}

	if (btree_node_noevict(b) ||
	    btree_node_write_blocked(b) ||
	    btree_node_will_make_reachable(b))
		goto out_unlock;

	if (btree_node_dirty(b)) {
		if (!flush)
			goto out_unlock;
		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent,
					      BTREE_WRITE_cache_reclaim);
		else
			__bch2_btree_node_write(c, b,
						BTREE_WRITE_cache_reclaim);

		/* write issued; go back and wait for it to complete */
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		goto wait_on_io;
	}
out:
	/* only trace reaps of hashed (real, not pre-freed) nodes */
	if (b->hash_val && !ret)
		trace_and_count(c, btree_cache_reap, c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
	goto out;
}
288 | ||
/* Non-flushing reclaim: fails rather than writing out or waiting on IO. */
static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}
293 | ||
/* Flushing reclaim: writes out dirty nodes and waits on IO if needed. */
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}
298 | ||
/*
 * Shrinker scan callback: free up to sc->nr_to_scan nodes' worth of memory,
 * capped at what the reserve allows. Scans the freeable list first (cheap),
 * then the live LRU; when the cache is mostly dirty, opportunistically kicks
 * off writeback so future scans can make progress. Returns number freed, or
 * SHRINK_STOP when the shrinker is disabled.
 */
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free = 0;
	unsigned long freed = 0;
	unsigned long touched = 0;
	unsigned i, flags;
	unsigned long ret = SHRINK_STOP;
	/* trigger writeback when >= 3/4 of the cache is dirty */
	bool trigger_writes = atomic_read(&bc->dirty) + nr >=
		bc->used * 3 / 4;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	mutex_lock(&bc->lock);
	/* reclaim may recurse into the allocator; forbid fs reclaim here */
	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		/*
		 * Leave a few nodes on the freeable list, so that a btree split
		 * won't have to hit the system allocator:
		 */
		if (++i <= 3)
			continue;

		touched++;

		if (touched >= nr)
			goto out;

		if (!btree_node_reclaim(c, b)) {
			/* reclaim succeeded with both locks held; free + drop */
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (btree_node_accessed(b)) {
			/* second-chance LRU: clear the bit, spare the node */
			clear_btree_node_accessed(b);
		} else if (!btree_node_reclaim(c, b)) {
			freed++;
			btree_node_data_free(c, b);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed == nr)
				goto out_rotate;
		} else if (trigger_writes &&
			   btree_node_dirty(b) &&
			   !btree_node_will_make_reachable(b) &&
			   !btree_node_write_blocked(b) &&
			   six_trylock_read(&b->c.lock)) {
			/*
			 * Rotate the list head to just past b so the restart
			 * below resumes the scan from b's successor:
			 */
			list_move(&bc->live, &b->list);
			mutex_unlock(&bc->lock);
			__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
			six_unlock_read(&b->c.lock);
			if (touched >= nr)
				goto out_nounlock;
			mutex_lock(&bc->lock);
			goto restart;
		}

		if (touched >= nr)
			break;
	}
out_rotate:
	/* rotate head past the last node we looked at, to spread out scanning */
	if (&t->list != &bc->live)
		list_move_tail(&bc->live, &t->list);
out:
	mutex_unlock(&bc->lock);
out_nounlock:
	ret = freed;
	memalloc_nofs_restore(flags);
	trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
	return ret;
}
396 | ||
397 | static unsigned long bch2_btree_cache_count(struct shrinker *shrink, | |
398 | struct shrink_control *sc) | |
399 | { | |
ecae0bd5 | 400 | struct bch_fs *c = shrink->private_data; |
1c6fdbd8 KO |
401 | struct btree_cache *bc = &c->btree_cache; |
402 | ||
29364f34 | 403 | if (bch2_btree_shrinker_disabled) |
1c6fdbd8 KO |
404 | return 0; |
405 | ||
7c7e071d | 406 | return btree_cache_can_free(bc); |
1c6fdbd8 KO |
407 | } |
408 | ||
/*
 * Tear down the btree node cache at filesystem shutdown: unregister the
 * shrinker, pull roots/debug nodes onto the live list, free all node data,
 * then free the struct btrees themselves and the hash table.
 */
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	unsigned i, flags;

	shrinker_free(bc->shrink);

	/* vfree() can allocate memory: */
	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/* debug verify node lives outside the cache lists; fold it back in */
	if (c->verify_data)
		list_move(&c->verify_data->list, &bc->live);

	kvfree(c->verify_ondisk);

	/* roots aren't on the live list either; add them so they get freed */
	for (i = 0; i < btree_id_nr_alive(c); i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);

		if (r->b)
			list_add(&r->b->list, &bc->live);
	}

	list_splice(&bc->freeable, &bc->live);

	while (!list_empty(&bc->live)) {
		b = list_first_entry(&bc->live, struct btree, list);

		/* all IO must have completed before shutdown reaches here */
		BUG_ON(btree_node_read_in_flight(b) ||
		       btree_node_write_in_flight(b));

		btree_node_data_free(c, b);
	}

	/* dirty nodes at exit are only tolerable after a journal error */
	BUG_ON(!bch2_journal_error(&c->journal) &&
	       atomic_read(&c->btree_cache.dirty));

	list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

	while (!list_empty(&bc->freed_nonpcpu)) {
		b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
		list_del(&b->list);
		six_lock_exit(&b->c.lock);
		kfree(b);
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);
}
462 | ||
/*
 * Set up the btree node cache: hash table, prefilled reserve of nodes
 * (parked on the freeable list), and the memory shrinker.
 *
 * Returns 0 or -BCH_ERR_ENOMEM_fs_btree_cache_init; partially-allocated
 * state is cleaned up later by bch2_fs_btree_cache_exit().
 */
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct shrinker *shrink;
	unsigned i;
	int ret = 0;

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto err;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	/* prefill the reserve; nodes land on bc->live, moved below */
	for (i = 0; i < bc->reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c))
			goto err;

	list_splice_init(&bc->live, &bc->freeable);

	mutex_init(&c->verify_lock);

	shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
	if (!shrink)
		goto err;
	bc->shrink = shrink;
	shrink->count_objects	= bch2_btree_cache_count;
	shrink->scan_objects	= bch2_btree_cache_scan;
	shrink->seeks		= 4;
	shrink->private_data	= c;
	shrinker_register(shrink);

	return 0;
err:
	return -BCH_ERR_ENOMEM_fs_btree_cache_init;
}
500 | ||
/* Early init: just locks and list heads, safe before any allocation. */
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed_pcpu);
	INIT_LIST_HEAD(&bc->freed_nonpcpu);
}
509 | ||
/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;

	/* only release if we're actually the holder (no-op otherwise) */
	if (bc->alloc_lock == current) {
		trace_and_count(c, btree_cache_cannibalize_unlock, trans);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}
527 | ||
/*
 * Acquire the (open-coded, recursive for the same task) cannibalize lock.
 *
 * With @cl NULL this is try-lock semantics: returns
 * -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock if another task holds it.
 * With @cl set, the closure is parked on the waitlist and
 * -BCH_ERR_btree_cache_cannibalize_lock_blocked is returned; the caller
 * retries when woken. Returns 0 on success.
 */
int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
		return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
	return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
	trace_and_count(c, btree_cache_cannibalize_lock, trans);
	return 0;
}
560 | ||
/*
 * Steal a node from the live list for reuse: first try cheap (non-flushing)
 * reclaim from the cold end, then flushing reclaim; loops (busy-waits) until
 * something is reclaimable. Caller must hold the cannibalize lock.
 * Returns the node with intent+write locks held.
 */
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}
583 | ||
/*
 * Get a btree node with data buffers, intent+write locked and reset for use.
 *
 * Allocation strategy, in order: recycle a struct btree from the matching
 * freed list; allocate a new one (GFP_NOWAIT first, then unlock the trans
 * and retry GFP_KERNEL); steal data buffers from a freeable node; allocate
 * fresh buffers; finally, if this task holds the cannibalize lock, evict
 * another cached node and take its memory.
 *
 * Returns the node or ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc).
 */
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	/* pcpu-reader-locked nodes are recycled separately from plain ones */
	struct list_head *freed = pcpu_read_locks
		? &bc->freed_pcpu
		: &bc->freed_nonpcpu;
	struct btree *b, *b2;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, freed, list)
		if (!btree_node_reclaim(c, b)) {
			/* reclaim returned with intent+write locks held */
			list_del_init(&b->list);
			goto got_node;
		}

	b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
	if (!b) {
		/* drop trans locks before a blocking GFP_KERNEL allocation */
		mutex_unlock(&bc->lock);
		bch2_trans_unlock(trans);
		b = __btree_node_mem_alloc(c, GFP_KERNEL);
		if (!b)
			goto err;
		mutex_lock(&bc->lock);
	}

	bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);

	/* freshly-initialized lock: trylocks cannot fail */
	BUG_ON(!six_trylock_intent(&b->c.lock));
	BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b2, &bc->freeable, list)
		if (!btree_node_reclaim(c, b2)) {
			/* take b2's buffers, park b2's struct on a freed list */
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
			goto got_mem;
		}

	mutex_unlock(&bc->lock);

	if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
		bch2_trans_unlock(trans);
		if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
			goto err;
	}

	mutex_lock(&bc->lock);
	bc->used++;
got_mem:
	mutex_unlock(&bc->lock);

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	/* reset per-use state; node is returned intent+write locked */
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b2 = btree_node_cannibalize(c);
		clear_btree_node_just_written(b2);
		bch2_btree_node_hash_remove(bc, b2);

		if (b) {
			/* we have a struct but no buffers: take b2's buffers */
			swap(b->data, b2->data);
			swap(b->aux_data, b2->aux_data);
			btree_node_to_freedlist(bc, b2);
			six_unlock_write(&b2->c.lock);
			six_unlock_intent(&b2->c.lock);
		} else {
			/* no struct either: use the cannibalized node whole */
			b = b2;
			list_del_init(&b->list);
		}

		mutex_unlock(&bc->lock);

		trace_and_count(c, btree_cache_cannibalize, trans);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
}
699 | ||
/* Slowpath, don't want it inlined into btree_iter_traverse() */
/*
 * Read a btree node into the cache.
 *
 * Validates @k and @level, allocates a cache node, hashes it, and starts the
 * read. Returns: the node locked with @lock_type (sync case); NULL if a
 * racing fill won the hash insert, or on async reads; or an ERR_PTR —
 * including transaction-restart errors when relocking fails (path != NULL).
 */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
				struct btree_path *path,
				const struct bkey_i *k,
				enum btree_id btree_id,
				unsigned level,
				enum six_lock_type lock_type,
				bool sync)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u32 seq;

	if (unlikely(level >= BTREE_MAX_DEPTH)) {
		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
						 level, BTREE_MAX_DEPTH);
		return ERR_PTR(ret);
	}

	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));

		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
		printbuf_exit(&buf);
		return ERR_PTR(ret);
	}

	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
		trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
	}

	/* interior nodes (level != 0) get pcpu reader locks */
	b = bch2_btree_node_mem_alloc(trans, level != 0);

	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
		if (!path)
			return b;

		trans->memory_allocation_failure = true;
		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
	}

	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	set_btree_node_read_in_flight(b);

	/* capture the lock sequence so we can relock after the read */
	six_unlock_write(&b->c.lock);
	seq = six_lock_seq(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* Unlock before doing IO: */
	if (path && sync)
		bch2_trans_unlock_noassert(trans);

	bch2_btree_node_read(trans, b, sync);

	if (!sync)
		return NULL;

	if (path) {
		int ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			BUG_ON(!trans->restarted);
			return ERR_PTR(ret);
		}
	}

	/* seq changed => someone else locked the node while we did IO */
	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
		BUG_ON(!path);

		trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
	}

	return b;
}
810 | ||
/*
 * Report a mismatch between a node's in-memory key and its on-disk header as
 * a topology error. Suppressed during early recovery passes (up through
 * check_allocations), when such mismatches are still being repaired.
 */
static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	struct printbuf buf = PRINTBUF;

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
		return;

	prt_printf(&buf,
	       "btree node header doesn't match ptr\n"
	       "btree %s level %u\n"
	       "ptr: ",
	       bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_printf(&buf, "\nheader: btree %s level %llu\n"
	       "min ",
	       bch2_btree_id_str(BTREE_NODE_ID(b->data)),
	       BTREE_NODE_LEVEL(b->data));
	bch2_bpos_to_text(&buf, b->data->min_key);

	prt_printf(&buf, "\nmax ");
	bch2_bpos_to_text(&buf, b->data->max_key);

	bch2_fs_topology_error(c, "%s", buf.buf);

	printbuf_exit(&buf);
}
838 | ||
/*
 * Verify a node's on-disk header (btree id, level, min/max keys) against its
 * in-memory key; min_key is only checkable for btree_ptr_v2 keys, which
 * carry it in the pointer.
 */
static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    !bpos_eq(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     !bpos_eq(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}
849 | ||
001783e2 KO |
/*
 * Slow path for bch2_btree_node_get(): look the node up in the btree cache
 * (reading it in from disk via bch2_btree_node_fill() if not present), take
 * the requested six lock, and re-validate that the locked node still matches
 * @k before returning it.
 *
 * Returns the locked node, or an ERR_PTR() (transaction restart, relock
 * failure, or read error).
 */
static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
					   const struct bkey_i *k, unsigned level,
					   enum six_lock_type lock_type,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	bool need_relock = false;	/* set whenever we may have dropped trans locks */
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(trans, path, k, path->btree_id,
					 level, lock_type, true);
		need_relock = true;

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
		/* Can't hold a read lock on the parent while blocking on this lock: */
		if (btree_node_read_locked(path, level + 1))
			btree_node_unlock(trans, path, level + 1);

		ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			return ERR_PTR(ret);

		BUG_ON(ret);

		/*
		 * Re-check identity after locking - the node may have been
		 * freed and reused while we waited for the lock:
		 */
		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(trans, path, level + 1))
				goto retry;

			trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
			return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
		}

		/* avoid atomic set bit if it's not needed: */
		if (!btree_node_accessed(b))
			set_btree_node_accessed(b);
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		u32 seq = six_lock_seq(&b->c.lock);

		/* Drop all locks before waiting on IO: */
		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(trans);
		need_relock = true;

		bch2_btree_node_wait_on_read(b);

		/*
		 * should_be_locked is not set on this path yet, so we need to
		 * relock it specifically:
		 */
		if (!six_relock_type(&b->c.lock, lock_type, seq))
			goto retry;
	}

	/* Retake the transaction/path locks we dropped above: */
	if (unlikely(need_relock)) {
		ret = bch2_trans_relock(trans) ?:
			bch2_btree_path_relock_intent(trans, path);
		if (ret) {
			six_unlock_type(&b->c.lock, lock_type);
			return ERR_PTR(ret);
		}
	}

	/* Warm the caches for the node's lookup (aux) data: */
	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_error);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}
954 | ||
/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * @trans:	btree transaction object
 * @path:	btree_path being traversed
 * @k:		pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
 * @level:	level of btree node being looked up (0 == leaf node)
 * @lock_type:	SIX_LOCK_read or SIX_LOCK_intent
 * @trace_ip:	ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
 *
 * The btree node will have either a read or an intent lock held, according to
 * @lock_type.
 *
 * Returns: btree node or ERR_PTR()
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	/* Fast path: the node pointer may be cached in the key itself: */
	b = btree_node_mem_ptr(k);

	/*
	 * Check b->hash_val _before_ calling btree_node_lock() - this might not
	 * be the node we want anymore, and trying to lock the wrong node could
	 * cause an unneccessary transaction restart:
	 */
	if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
		     !b ||
		     b->hash_val != btree_ptr_hash_val(k)))
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

	/* Can't hold a read lock on the parent while blocking on this lock: */
	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(trans, path, level + 1);

	ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ERR_PTR(ret);

	BUG_ON(ret);

	/*
	 * Re-check identity after locking - the node may have been freed and
	 * reused while we waited for the lock:
	 */
	if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
		     b->c.level != level ||
		     race_fault())) {
		six_unlock_type(&b->c.lock, lock_type);
		if (bch2_btree_node_relock(trans, path, level + 1))
			return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

		trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
	}

	/* Read still in flight: take the slow path, which waits for it: */
	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
	}

	/* Warm the caches for the node's lookup (aux) data: */
	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-BCH_ERR_btree_node_read_error);
	}

	EBUG_ON(b->c.btree_id != path->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}
1045 | ||
ca7d8fca | 1046 | struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, |
e62d65f2 KO |
1047 | const struct bkey_i *k, |
1048 | enum btree_id btree_id, | |
a0b73c1c KO |
1049 | unsigned level, |
1050 | bool nofill) | |
e62d65f2 | 1051 | { |
ca7d8fca | 1052 | struct bch_fs *c = trans->c; |
e62d65f2 KO |
1053 | struct btree_cache *bc = &c->btree_cache; |
1054 | struct btree *b; | |
1055 | struct bset_tree *t; | |
bd2bb273 | 1056 | int ret; |
e62d65f2 KO |
1057 | |
1058 | EBUG_ON(level >= BTREE_MAX_DEPTH); | |
1059 | ||
a32b9573 KO |
1060 | if (c->opts.btree_node_mem_ptr_optimization) { |
1061 | b = btree_node_mem_ptr(k); | |
1062 | if (b) | |
1063 | goto lock_node; | |
1064 | } | |
e62d65f2 KO |
1065 | retry: |
1066 | b = btree_cache_find(bc, k); | |
1067 | if (unlikely(!b)) { | |
a0b73c1c | 1068 | if (nofill) |
18a7b972 | 1069 | goto out; |
a0b73c1c | 1070 | |
1306f87d | 1071 | b = bch2_btree_node_fill(trans, NULL, k, btree_id, |
e62d65f2 KO |
1072 | level, SIX_LOCK_read, true); |
1073 | ||
1074 | /* We raced and found the btree node in the cache */ | |
1075 | if (!b) | |
1076 | goto retry; | |
1077 | ||
18a7b972 | 1078 | if (IS_ERR(b) && |
a564c9fa | 1079 | !bch2_btree_cache_cannibalize_lock(trans, NULL)) |
18a7b972 KO |
1080 | goto retry; |
1081 | ||
e62d65f2 | 1082 | if (IS_ERR(b)) |
18a7b972 | 1083 | goto out; |
e62d65f2 KO |
1084 | } else { |
1085 | lock_node: | |
94c69faf | 1086 | ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); |
0d7009d7 KO |
1087 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) |
1088 | return ERR_PTR(ret); | |
1089 | ||
1090 | BUG_ON(ret); | |
e62d65f2 KO |
1091 | |
1092 | if (unlikely(b->hash_val != btree_ptr_hash_val(k) || | |
1093 | b->c.btree_id != btree_id || | |
1094 | b->c.level != level)) { | |
1095 | six_unlock_read(&b->c.lock); | |
1096 | goto retry; | |
1097 | } | |
1098 | } | |
1099 | ||
1100 | /* XXX: waiting on IO with btree locks held: */ | |
19d54324 | 1101 | __bch2_btree_node_wait_on_read(b); |
e62d65f2 KO |
1102 | |
1103 | prefetch(b->aux_data); | |
1104 | ||
1105 | for_each_bset(b, t) { | |
1106 | void *p = (u64 *) b->aux_data + t->aux_data_offset; | |
1107 | ||
1108 | prefetch(p + L1_CACHE_BYTES * 0); | |
1109 | prefetch(p + L1_CACHE_BYTES * 1); | |
1110 | prefetch(p + L1_CACHE_BYTES * 2); | |
1111 | } | |
1112 | ||
1113 | /* avoid atomic set bit if it's not needed: */ | |
1114 | if (!btree_node_accessed(b)) | |
1115 | set_btree_node_accessed(b); | |
1116 | ||
1117 | if (unlikely(btree_node_read_error(b))) { | |
1118 | six_unlock_read(&b->c.lock); | |
52946d82 | 1119 | b = ERR_PTR(-BCH_ERR_btree_node_read_error); |
18a7b972 | 1120 | goto out; |
e62d65f2 KO |
1121 | } |
1122 | ||
a0b73c1c KO |
1123 | EBUG_ON(b->c.btree_id != btree_id); |
1124 | EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); | |
537c32f5 | 1125 | btree_check_header(c, b); |
18a7b972 | 1126 | out: |
a564c9fa | 1127 | bch2_btree_cache_cannibalize_unlock(trans); |
e62d65f2 KO |
1128 | return b; |
1129 | } | |
1130 | ||
1306f87d | 1131 | int bch2_btree_node_prefetch(struct btree_trans *trans, |
67e0dd8f | 1132 | struct btree_path *path, |
8b3e9bd6 KO |
1133 | const struct bkey_i *k, |
1134 | enum btree_id btree_id, unsigned level) | |
1c6fdbd8 | 1135 | { |
1306f87d | 1136 | struct bch_fs *c = trans->c; |
1c6fdbd8 KO |
1137 | struct btree_cache *bc = &c->btree_cache; |
1138 | struct btree *b; | |
1139 | ||
5f43b013 | 1140 | BUG_ON(path && !btree_node_locked(path, level + 1)); |
1c6fdbd8 KO |
1141 | BUG_ON(level >= BTREE_MAX_DEPTH); |
1142 | ||
1c6fdbd8 | 1143 | b = btree_cache_find(bc, k); |
1c6fdbd8 | 1144 | if (b) |
8b3e9bd6 | 1145 | return 0; |
1c6fdbd8 | 1146 | |
1306f87d | 1147 | b = bch2_btree_node_fill(trans, path, k, btree_id, |
78cf784e | 1148 | level, SIX_LOCK_read, false); |
8b3e9bd6 | 1149 | return PTR_ERR_OR_ZERO(b); |
1c6fdbd8 KO |
1150 | } |
1151 | ||
ca7d8fca | 1152 | void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) |
ceda1b9a | 1153 | { |
ca7d8fca | 1154 | struct bch_fs *c = trans->c; |
ceda1b9a KO |
1155 | struct btree_cache *bc = &c->btree_cache; |
1156 | struct btree *b; | |
1157 | ||
1158 | b = btree_cache_find(bc, k); | |
1159 | if (!b) | |
1160 | return; | |
aa6e130e KO |
1161 | |
1162 | BUG_ON(b == btree_node_root(trans->c, b)); | |
19d54324 KO |
1163 | wait_on_io: |
1164 | /* not allowed to wait on io with btree locks held: */ | |
1165 | ||
1166 | /* XXX we're called from btree_gc which will be holding other btree | |
1167 | * nodes locked | |
3e3e02e6 | 1168 | */ |
19d54324 KO |
1169 | __bch2_btree_node_wait_on_read(b); |
1170 | __bch2_btree_node_wait_on_write(b); | |
ceda1b9a | 1171 | |
ca7d8fca KO |
1172 | btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); |
1173 | btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); | |
7b4c4ccf KO |
1174 | if (unlikely(b->hash_val != btree_ptr_hash_val(k))) |
1175 | goto out; | |
ceda1b9a | 1176 | |
19d54324 | 1177 | if (btree_node_dirty(b)) { |
46fee692 | 1178 | __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); |
19d54324 KO |
1179 | six_unlock_write(&b->c.lock); |
1180 | six_unlock_intent(&b->c.lock); | |
1181 | goto wait_on_io; | |
1182 | } | |
ceda1b9a KO |
1183 | |
1184 | BUG_ON(btree_node_dirty(b)); | |
1185 | ||
1186 | mutex_lock(&bc->lock); | |
1187 | btree_node_data_free(c, b); | |
1188 | bch2_btree_node_hash_remove(bc, b); | |
1189 | mutex_unlock(&bc->lock); | |
7b4c4ccf | 1190 | out: |
ceda1b9a KO |
1191 | six_unlock_write(&b->c.lock); |
1192 | six_unlock_intent(&b->c.lock); | |
1193 | } | |
1194 | ||
88dfe193 KO |
1195 | const char *bch2_btree_id_str(enum btree_id btree) |
1196 | { | |
1197 | return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)"; | |
1198 | } | |
1199 | ||
/*
 * Print a btree node's position: btree name, level relative to the root's
 * level, then the node's key.
 */
void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	prt_printf(out, "%s level %u/%u\n ",
		   bch2_btree_id_str(b->c.btree_id),
		   b->c.level,
		   bch2_btree_id_root(c, b->c.btree_id)->level);
	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}
1208 | ||
/*
 * Debug dump of a btree node: key range, pointers, bkey format, and in-memory
 * bset statistics (space usage, key counts, lookup-table float stats).
 */
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	prt_printf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	prt_printf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	prt_printf(out, ":\n"
		   " ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
	prt_newline(out);

	prt_printf(out,
		   " format: ");
	bch2_bkey_format_to_text(out, &b->format);

	prt_printf(out,
		   " unpack fn len: %u\n"
		   " bytes used %zu/%zu (%zu%% full)\n"
		   " sib u64s: %u, %u (merge threshold %u)\n"
		   " nr packed keys %u\n"
		   " nr unpacked keys %u\n"
		   " floats %zu\n"
		   " failed unpacked %zu\n",
		   b->unpack_fn_len,
		   b->nr.live_u64s * sizeof(u64),
		   btree_buf_bytes(b) - sizeof(struct btree_node),
		   b->nr.live_u64s * 100 / btree_max_u64s(c),
		   b->sib_u64s[0],
		   b->sib_u64s[1],
		   c->btree_foreground_merge_threshold,
		   b->nr.packed_keys,
		   b->nr.unpacked_keys,
		   stats.floats,
		   stats.failed);
}
d8ebed7d | 1250 | |
a345b0f3 | 1251 | void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) |
d8ebed7d | 1252 | { |
401ec4db KO |
1253 | prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); |
1254 | prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); | |
1255 | prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); | |
d8ebed7d | 1256 | } |