Commit | Line | Data |
---|---|---|
1c6fdbd8 KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Code for working with individual keys, and sorted sets of keys within a | |
4 | * btree node | |
5 | * | |
6 | * Copyright 2012 Google, Inc. | |
7 | */ | |
8 | ||
9 | #include "bcachefs.h" | |
10 | #include "btree_cache.h" | |
11 | #include "bset.h" | |
12 | #include "eytzinger.h" | |
13 | #include "trace.h" | |
14 | #include "util.h" | |
15 | ||
16 | #include <asm/unaligned.h> | |
17 | #include <linux/console.h> | |
18 | #include <linux/random.h> | |
19 | #include <linux/prefetch.h> | |
20 | ||
21 | struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) | |
22 | { | |
23 | struct bset_tree *t; | |
24 | ||
25 | for_each_bset(b, t) | |
26 | if (k >= btree_bkey_first(b, t) && | |
27 | k < btree_bkey_last(b, t)) | |
28 | return t; | |
29 | ||
30 | BUG(); | |
31 | } | |
32 | ||
33 | /* | |
34 | * There are never duplicate live keys in the btree - but including keys that | |
35 | * have been flagged as deleted (and will be cleaned up later) we _will_ see | |
36 | * duplicates. | |
37 | * | |
38 | * Thus the sort order is: usual key comparison first, but for keys that compare | |
39 | * equal the deleted key(s) come first, and the (at most one) live version comes | |
40 | * last. | |
41 | * | |
42 | * The main reason for this is insertion: to handle overwrites, we first iterate | |
43 | * over keys that compare equal to our insert key, and then insert immediately | |
44 | * prior to the first key greater than the key we're inserting - our insert | |
45 | * position will be after all keys that compare equal to our insert key, which | |
46 | * by the time we actually do the insert will all be deleted. | |
47 | */ | |
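/*
 * Editorial sketch, not part of the original file: a minimal standalone
 * comparator illustrating the ordering described above for keys that
 * compare equal - deleted key(s) first, the (at most one) live key last.
 * The struct and names are hypothetical stand-ins for struct bkey.
 */
struct example_key {
	unsigned	pos;		/* stands in for struct bpos */
	bool		deleted;	/* whiteout flag */
};

static inline int example_key_cmp(const struct example_key *l,
				  const struct example_key *r)
{
	if (l->pos != r->pos)
		return l->pos < r->pos ? -1 : 1;

	/* equal positions: deleted sorts before live */
	return (int) r->deleted - (int) l->deleted;
}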
48 | ||
49 | void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set) | |
50 | { | |
51 | struct bkey_packed *_k, *_n; | |
52 | struct bkey k, n; | |
53 | char buf[120]; | |
54 | ||
55 | if (!i->u64s) | |
56 | return; | |
57 | ||
58 | for (_k = i->start, k = bkey_unpack_key(b, _k); | |
59 | _k < vstruct_last(i); | |
60 | _k = _n, k = n) { | |
61 | _n = bkey_next(_k); | |
62 | ||
63 | bch2_bkey_to_text(buf, sizeof(buf), &k); | |
64 | printk(KERN_ERR "block %u key %zi/%u: %s\n", set, | |
65 | _k->_data - i->_data, i->u64s, buf); | |
66 | ||
67 | if (_n == vstruct_last(i)) | |
68 | continue; | |
69 | ||
70 | n = bkey_unpack_key(b, _n); | |
71 | ||
72 | if (bkey_cmp(bkey_start_pos(&n), k.p) < 0) { | |
73 | printk(KERN_ERR "Key skipped backwards\n"); | |
74 | continue; | |
75 | } | |
76 | ||
77 | /* | |
78 | * Weird-looking check for duplicate non-extent keys: extents are | |
79 | * deleted iff they have 0 size, so a key that has zero size but | |
80 | * isn't deleted can't be an extent: | |
81 | */ | |
82 | if (((!k.size && !bkey_deleted(&k)) || | |
83 | (!n.size && !bkey_deleted(&n))) && | |
84 | !bkey_deleted(&k) && | |
85 | !bkey_cmp(n.p, k.p)) | |
86 | printk(KERN_ERR "Duplicate keys\n"); | |
87 | } | |
88 | } | |
89 | ||
90 | void bch2_dump_btree_node(struct btree *b) | |
91 | { | |
92 | struct bset_tree *t; | |
93 | ||
94 | console_lock(); | |
95 | for_each_bset(b, t) | |
96 | bch2_dump_bset(b, bset(b, t), t - b->set); | |
97 | console_unlock(); | |
98 | } | |
99 | ||
100 | void bch2_dump_btree_node_iter(struct btree *b, | |
101 | struct btree_node_iter *iter) | |
102 | { | |
103 | struct btree_node_iter_set *set; | |
104 | ||
105 | printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets); | |
106 | ||
107 | btree_node_iter_for_each(iter, set) { | |
108 | struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); | |
109 | struct bset_tree *t = bch2_bkey_to_bset(b, k); | |
110 | struct bkey uk = bkey_unpack_key(b, k); | |
111 | char buf[100]; | |
112 | ||
113 | bch2_bkey_to_text(buf, sizeof(buf), &uk); | |
114 | printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set, | |
115 | k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf); | |
116 | } | |
117 | } | |
118 | ||
119 | #ifdef CONFIG_BCACHEFS_DEBUG | |
120 | ||
121 | static bool keys_out_of_order(struct btree *b, | |
122 | const struct bkey_packed *prev, | |
123 | const struct bkey_packed *next, | |
124 | bool is_extents) | |
125 | { | |
126 | struct bkey nextu = bkey_unpack_key(b, next); | |
127 | ||
128 | return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 || | |
129 | ((is_extents | |
130 | ? !bkey_deleted(next) | |
131 | : !bkey_deleted(prev)) && | |
132 | !bkey_cmp_packed(b, prev, next)); | |
133 | } | |
134 | ||
135 | void __bch2_verify_btree_nr_keys(struct btree *b) | |
136 | { | |
137 | struct bset_tree *t; | |
138 | struct bkey_packed *k; | |
139 | struct btree_nr_keys nr = { 0 }; | |
140 | ||
141 | for_each_bset(b, t) | |
142 | for (k = btree_bkey_first(b, t); | |
143 | k != btree_bkey_last(b, t); | |
144 | k = bkey_next(k)) | |
145 | if (!bkey_whiteout(k)) | |
146 | btree_keys_account_key_add(&nr, t - b->set, k); | |
147 | ||
148 | BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); | |
149 | } | |
150 | ||
151 | static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, | |
152 | struct btree *b, | |
153 | struct bkey_packed *k) | |
154 | { | |
155 | const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b); | |
156 | ||
157 | bkey_unpack_key(b, k); | |
158 | ||
159 | if (n && | |
160 | keys_out_of_order(b, k, n, iter->is_extents)) { | |
161 | struct bkey ku = bkey_unpack_key(b, k); | |
162 | struct bkey nu = bkey_unpack_key(b, n); | |
163 | char buf1[80], buf2[80]; | |
164 | ||
165 | bch2_dump_btree_node(b); | |
166 | bch2_bkey_to_text(buf1, sizeof(buf1), &ku); | |
167 | bch2_bkey_to_text(buf2, sizeof(buf2), &nu); | |
168 | panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2); | |
169 | } | |
170 | } | |
171 | ||
172 | void bch2_btree_node_iter_verify(struct btree_node_iter *iter, | |
173 | struct btree *b) | |
174 | { | |
175 | struct btree_node_iter_set *set, *prev = NULL; | |
176 | struct bset_tree *t; | |
177 | struct bkey_packed *k, *first; | |
178 | ||
179 | if (bch2_btree_node_iter_end(iter)) | |
180 | return; | |
181 | ||
182 | btree_node_iter_for_each(iter, set) { | |
183 | k = __btree_node_offset_to_key(b, set->k); | |
184 | t = bch2_bkey_to_bset(b, k); | |
185 | ||
186 | BUG_ON(__btree_node_offset_to_key(b, set->end) != | |
187 | btree_bkey_last(b, t)); | |
188 | ||
189 | BUG_ON(prev && | |
190 | btree_node_iter_cmp(iter, b, *prev, *set) > 0); | |
191 | ||
192 | prev = set; | |
193 | } | |
194 | ||
195 | first = __btree_node_offset_to_key(b, iter->data[0].k); | |
196 | ||
197 | for_each_bset(b, t) | |
198 | if (bch2_btree_node_iter_bset_pos(iter, b, t) == | |
199 | btree_bkey_last(b, t) && | |
200 | (k = bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)))) | |
201 | BUG_ON(__btree_node_iter_cmp(iter->is_extents, b, | |
202 | k, first) > 0); | |
203 | } | |
204 | ||
205 | void bch2_verify_key_order(struct btree *b, | |
206 | struct btree_node_iter *iter, | |
207 | struct bkey_packed *where) | |
208 | { | |
209 | struct bset_tree *t = bch2_bkey_to_bset(b, where); | |
210 | struct bkey_packed *k, *prev; | |
211 | struct bkey uk, uw = bkey_unpack_key(b, where); | |
212 | ||
213 | k = bch2_bkey_prev_all(b, t, where); | |
214 | if (k && | |
215 | keys_out_of_order(b, k, where, iter->is_extents)) { | |
216 | char buf1[100], buf2[100]; | |
217 | ||
218 | bch2_dump_btree_node(b); | |
219 | uk = bkey_unpack_key(b, k); | |
220 | bch2_bkey_to_text(buf1, sizeof(buf1), &uk); | |
221 | bch2_bkey_to_text(buf2, sizeof(buf2), &uw); | |
222 | panic("out of order with prev:\n%s\n%s\n", | |
223 | buf1, buf2); | |
224 | } | |
225 | ||
226 | k = bkey_next(where); | |
227 | BUG_ON(k != btree_bkey_last(b, t) && | |
228 | keys_out_of_order(b, where, k, iter->is_extents)); | |
229 | ||
230 | for_each_bset(b, t) { | |
231 | if (where >= btree_bkey_first(b, t) && | |
232 | where < btree_bkey_last(b, t)) | |
233 | continue; | |
234 | ||
235 | k = bch2_btree_node_iter_bset_pos(iter, b, t); | |
236 | ||
237 | if (k == btree_bkey_last(b, t)) | |
238 | k = bch2_bkey_prev_all(b, t, k); | |
239 | ||
240 | while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 && | |
241 | (prev = bch2_bkey_prev_all(b, t, k))) | |
242 | k = prev; | |
243 | ||
244 | for (; | |
245 | k != btree_bkey_last(b, t); | |
246 | k = bkey_next(k)) { | |
247 | uk = bkey_unpack_key(b, k); | |
248 | ||
249 | if (iter->is_extents) { | |
250 | BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 || | |
251 | bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0)); | |
252 | } else { | |
253 | BUG_ON(!bkey_cmp(uw.p, uk.p) && | |
254 | !bkey_deleted(&uk)); | |
255 | } | |
256 | ||
257 | if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0) | |
258 | break; | |
259 | } | |
260 | } | |
261 | } | |
262 | ||
263 | #else | |
264 | ||
265 | static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, | |
266 | struct btree *b, | |
267 | struct bkey_packed *k) {} | |
268 | ||
269 | #endif | |
270 | ||
271 | /* Auxiliary search trees */ | |
272 | ||
273 | #define BFLOAT_FAILED_UNPACKED (U8_MAX - 0) | |
274 | #define BFLOAT_FAILED_PREV (U8_MAX - 1) | |
275 | #define BFLOAT_FAILED_OVERFLOW (U8_MAX - 2) | |
276 | #define BFLOAT_FAILED (U8_MAX - 2) | |
277 | ||
278 | #define KEY_WORDS BITS_TO_LONGS(1 << BKEY_EXPONENT_BITS) | |
279 | ||
280 | struct bkey_float { | |
281 | u8 exponent; | |
282 | u8 key_offset; | |
283 | union { | |
284 | u32 mantissa32; | |
285 | struct { | |
286 | u16 mantissa16; | |
287 | u16 _pad; | |
288 | }; | |
289 | }; | |
290 | } __packed; | |
291 | ||
292 | #define BFLOAT_32BIT_NR 32U | |
293 | ||
294 | static unsigned bkey_float_byte_offset(unsigned idx) | |
295 | { | |
296 | int d = (idx - BFLOAT_32BIT_NR) << 1; | |
297 | ||
298 | d &= ~(d >> 31); | |
299 | ||
300 | return idx * 6 - d; | |
301 | } | |
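/*
 * Worked example (editorial note, not in the original): bkey_floats with
 * 32 bit mantissas occupy 6 bytes each (exponent, key_offset, mantissa32);
 * entries past BFLOAT_32BIT_NR only need 4, since their _pad is never read
 * and may overlap the next entry. Hence:
 *
 *	bkey_float_byte_offset(31) = 31 * 6	= 186
 *	bkey_float_byte_offset(32) = 32 * 6 - 0	= 192
 *	bkey_float_byte_offset(33) = 33 * 6 - 2	= 196
 *
 * i.e. a 6 byte stride below BFLOAT_32BIT_NR and a 4 byte stride above it.
 */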
302 | ||
303 | struct ro_aux_tree { | |
304 | struct bkey_float _d[0]; | |
305 | }; | |
306 | ||
307 | struct rw_aux_tree { | |
308 | u16 offset; | |
309 | struct bpos k; | |
310 | }; | |
311 | ||
312 | /* | |
313 | * BSET_CACHELINE was originally intended to match the hardware cacheline size - | |
314 | * it used to be 64, but I realized the lookup code would touch slightly less | |
315 | * memory if it was 128. | |
316 | * | |
317 | * It defines the number of bytes (in struct bset) per struct bkey_float in | |
318 | * the auxiliary search tree - when we're done searching the bkey_float tree we | |
319 | * have this many bytes left that we do a linear search over. | |
320 | * | |
321 | * Since (after level 5) every level of the bset_tree is on a new cacheline, | |
322 | * we're touching one fewer cacheline in the bset tree in exchange for one more | |
323 | * cacheline in the linear search - but the linear search might stop before it | |
324 | * gets to the second cacheline. | |
325 | */ | |
326 | ||
327 | #define BSET_CACHELINE 128 | |
328 | ||
329 | /* Space required for the btree node keys */ | |
330 | static inline size_t btree_keys_bytes(struct btree *b) | |
331 | { | |
332 | return PAGE_SIZE << b->page_order; | |
333 | } | |
334 | ||
335 | static inline size_t btree_keys_cachelines(struct btree *b) | |
336 | { | |
337 | return btree_keys_bytes(b) / BSET_CACHELINE; | |
338 | } | |
339 | ||
340 | static inline size_t btree_aux_data_bytes(struct btree *b) | |
341 | { | |
342 | return btree_keys_cachelines(b) * 8; | |
343 | } | |
344 | ||
345 | static inline size_t btree_aux_data_u64s(struct btree *b) | |
346 | { | |
347 | return btree_aux_data_bytes(b) / sizeof(u64); | |
348 | } | |
349 | ||
350 | static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) | |
351 | { | |
352 | BUG_ON(t->aux_data_offset == U16_MAX); | |
353 | ||
354 | switch (bset_aux_tree_type(t)) { | |
355 | case BSET_NO_AUX_TREE: | |
356 | return t->aux_data_offset; | |
357 | case BSET_RO_AUX_TREE: | |
358 | return t->aux_data_offset + | |
359 | DIV_ROUND_UP(bkey_float_byte_offset(t->size) + | |
360 | sizeof(u8) * t->size, 8); | |
361 | case BSET_RW_AUX_TREE: | |
362 | return t->aux_data_offset + | |
363 | DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); | |
364 | default: | |
365 | BUG(); | |
366 | } | |
367 | } | |
368 | ||
369 | static unsigned bset_aux_tree_buf_start(const struct btree *b, | |
370 | const struct bset_tree *t) | |
371 | { | |
372 | return t == b->set | |
373 | ? DIV_ROUND_UP(b->unpack_fn_len, 8) | |
374 | : bset_aux_tree_buf_end(t - 1); | |
375 | } | |
376 | ||
377 | static void *__aux_tree_base(const struct btree *b, | |
378 | const struct bset_tree *t) | |
379 | { | |
380 | return b->aux_data + t->aux_data_offset * 8; | |
381 | } | |
382 | ||
383 | static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, | |
384 | const struct bset_tree *t) | |
385 | { | |
386 | EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); | |
387 | ||
388 | return __aux_tree_base(b, t); | |
389 | } | |
390 | ||
391 | static u8 *ro_aux_tree_prev(const struct btree *b, | |
392 | const struct bset_tree *t) | |
393 | { | |
394 | EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); | |
395 | ||
396 | return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); | |
397 | } | |
398 | ||
399 | static struct bkey_float *bkey_float_get(struct ro_aux_tree *b, | |
400 | unsigned idx) | |
401 | { | |
402 | return (void *) b + bkey_float_byte_offset(idx); | |
403 | } | |
404 | ||
405 | static struct bkey_float *bkey_float(const struct btree *b, | |
406 | const struct bset_tree *t, | |
407 | unsigned idx) | |
408 | { | |
409 | return bkey_float_get(ro_aux_tree_base(b, t), idx); | |
410 | } | |
411 | ||
412 | static void bset_aux_tree_verify(struct btree *b) | |
413 | { | |
414 | #ifdef CONFIG_BCACHEFS_DEBUG | |
415 | struct bset_tree *t; | |
416 | ||
417 | for_each_bset(b, t) { | |
418 | if (t->aux_data_offset == U16_MAX) | |
419 | continue; | |
420 | ||
421 | BUG_ON(t != b->set && | |
422 | t[-1].aux_data_offset == U16_MAX); | |
423 | ||
424 | BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); | |
425 | BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); | |
426 | BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); | |
427 | } | |
428 | #endif | |
429 | } | |
430 | ||
431 | /* Memory allocation */ | |
432 | ||
433 | void bch2_btree_keys_free(struct btree *b) | |
434 | { | |
435 | kvfree(b->aux_data); | |
436 | b->aux_data = NULL; | |
437 | } | |
438 | ||
439 | int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) | |
440 | { | |
441 | b->page_order = page_order; | |
442 | b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); | |
443 | if (!b->aux_data) | |
444 | return -ENOMEM; | |
445 | ||
446 | return 0; | |
447 | } | |
448 | ||
449 | void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) | |
450 | { | |
451 | unsigned i; | |
452 | ||
453 | b->nsets = 0; | |
454 | memset(&b->nr, 0, sizeof(b->nr)); | |
455 | #ifdef CONFIG_BCACHEFS_DEBUG | |
456 | b->expensive_debug_checks = expensive_debug_checks; | |
457 | #endif | |
458 | for (i = 0; i < MAX_BSETS; i++) | |
459 | b->set[i].data_offset = U16_MAX; | |
460 | ||
461 | bch2_bset_set_no_aux_tree(b, b->set); | |
462 | } | |
463 | ||
464 | /* Binary tree stuff for auxiliary search trees */ | |
465 | ||
466 | /* | |
467 | * Cacheline/offset <-> bkey pointer arithmetic: | |
468 | * | |
469 | * t->tree is a binary search tree in an array; each node corresponds to a key | |
470 | * in one cacheline in t->set (BSET_CACHELINE bytes). | |
471 | * | |
472 | * This means we don't have to store the full index of the key that a node in | |
473 | * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and | |
474 | * then bkey_float->key_offset gives us the offset within that cacheline, in | |
475 | * units of 8 bytes. | |
476 | * | |
477 | * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to | |
478 | * make this work. | |
479 | * | |
480 | * To construct the bfloat for an arbitrary key we need to know what the key | |
481 | * immediately preceding it is: we have to check if the two keys differ in the | |
482 | * bits we're going to store in bkey_float->mantissa. The prev array stores the | |
483 | * size of the previous key so we can walk backwards to it from tree node j's key. | |
484 | */ | |
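/*
 * Editorial example (hypothetical 7 node tree): eytzinger order stores the
 * inorder sequence 4 2 6 1 3 5 7 at array indices 1..7, so a root-to-leaf
 * search descends through indices 1, then 2-3, then 4-7 - each level is a
 * contiguous range, which is what makes the array prefetching in
 * bset_search_tree() effective.
 */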
485 | ||
486 | static inline void *bset_cacheline(const struct btree *b, | |
487 | const struct bset_tree *t, | |
488 | unsigned cacheline) | |
489 | { | |
490 | return (void *) round_down((unsigned long) btree_bkey_first(b, t), | |
491 | L1_CACHE_BYTES) + | |
492 | cacheline * BSET_CACHELINE; | |
493 | } | |
494 | ||
495 | static struct bkey_packed *cacheline_to_bkey(const struct btree *b, | |
496 | const struct bset_tree *t, | |
497 | unsigned cacheline, | |
498 | unsigned offset) | |
499 | { | |
500 | return bset_cacheline(b, t, cacheline) + offset * 8; | |
501 | } | |
502 | ||
503 | static unsigned bkey_to_cacheline(const struct btree *b, | |
504 | const struct bset_tree *t, | |
505 | const struct bkey_packed *k) | |
506 | { | |
507 | return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; | |
508 | } | |
509 | ||
510 | static ssize_t __bkey_to_cacheline_offset(const struct btree *b, | |
511 | const struct bset_tree *t, | |
512 | unsigned cacheline, | |
513 | const struct bkey_packed *k) | |
514 | { | |
515 | return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); | |
516 | } | |
517 | ||
518 | static unsigned bkey_to_cacheline_offset(const struct btree *b, | |
519 | const struct bset_tree *t, | |
520 | unsigned cacheline, | |
521 | const struct bkey_packed *k) | |
522 | { | |
523 | size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); | |
524 | ||
525 | EBUG_ON(m > U8_MAX); | |
526 | return m; | |
527 | } | |
528 | ||
529 | static inline struct bkey_packed *tree_to_bkey(const struct btree *b, | |
530 | const struct bset_tree *t, | |
531 | unsigned j) | |
532 | { | |
533 | return cacheline_to_bkey(b, t, | |
534 | __eytzinger1_to_inorder(j, t->size, t->extra), | |
535 | bkey_float(b, t, j)->key_offset); | |
536 | } | |
537 | ||
538 | static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, | |
539 | const struct bset_tree *t, | |
540 | unsigned j) | |
541 | { | |
542 | unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; | |
543 | ||
544 | return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); | |
545 | } | |
546 | ||
547 | static struct rw_aux_tree *rw_aux_tree(const struct btree *b, | |
548 | const struct bset_tree *t) | |
549 | { | |
550 | EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); | |
551 | ||
552 | return __aux_tree_base(b, t); | |
553 | } | |
554 | ||
555 | /* | |
556 | * For the write set - the one we're currently inserting keys into - we don't | |
557 | * maintain a full search tree, we just keep a simple lookup table - the rw aux tree. | |
558 | */ | |
559 | static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, | |
560 | struct bset_tree *t, | |
561 | unsigned j) | |
562 | { | |
563 | return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); | |
564 | } | |
565 | ||
566 | static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, | |
567 | unsigned j, struct bkey_packed *k) | |
568 | { | |
569 | EBUG_ON(k >= btree_bkey_last(b, t)); | |
570 | ||
571 | rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { | |
572 | .offset = __btree_node_key_to_offset(b, k), | |
573 | .k = bkey_unpack_pos(b, k), | |
574 | }; | |
575 | } | |
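/*
 * Editorial sketch (hypothetical values): the rw aux tree is just a sorted
 * array of (bset offset, unpacked search key) pairs, roughly one per
 * cacheline of keys, e.g.
 *
 *	{ .offset =  0, .k = POS(1, 0)    }
 *	{ .offset = 18, .k = POS(1, 4096) }
 *	{ .offset = 35, .k = POS(2, 512)  }
 *
 * Lookups binary search on .k; insertions and deletions just shift the
 * offsets of the entries after the affected position.
 */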
576 | ||
577 | static void bch2_bset_verify_rw_aux_tree(struct btree *b, | |
578 | struct bset_tree *t) | |
579 | { | |
580 | struct bkey_packed *k = btree_bkey_first(b, t); | |
581 | unsigned j = 0; | |
582 | ||
583 | if (!btree_keys_expensive_checks(b)) | |
584 | return; | |
585 | ||
586 | BUG_ON(bset_has_ro_aux_tree(t)); | |
587 | ||
588 | if (!bset_has_rw_aux_tree(t)) | |
589 | return; | |
590 | ||
591 | BUG_ON(t->size < 1); | |
592 | BUG_ON(rw_aux_to_bkey(b, t, j) != k); | |
593 | ||
594 | goto start; | |
595 | while (1) { | |
596 | if (rw_aux_to_bkey(b, t, j) == k) { | |
597 | BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, | |
598 | bkey_unpack_pos(b, k))); | |
599 | start: | |
600 | if (++j == t->size) | |
601 | break; | |
602 | ||
603 | BUG_ON(rw_aux_tree(b, t)[j].offset <= | |
604 | rw_aux_tree(b, t)[j - 1].offset); | |
605 | } | |
606 | ||
607 | k = bkey_next(k); | |
608 | BUG_ON(k >= btree_bkey_last(b, t)); | |
609 | } | |
610 | } | |
611 | ||
612 | /* returns idx of first entry >= offset: */ | |
613 | static unsigned rw_aux_tree_bsearch(struct btree *b, | |
614 | struct bset_tree *t, | |
615 | unsigned offset) | |
616 | { | |
617 | unsigned l = 0, r = t->size; | |
618 | ||
619 | EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); | |
620 | ||
621 | while (l < r) { | |
622 | unsigned m = (l + r) >> 1; | |
623 | ||
624 | if (rw_aux_tree(b, t)[m].offset < offset) | |
625 | l = m + 1; | |
626 | else | |
627 | r = m; | |
628 | } | |
629 | ||
630 | EBUG_ON(l < t->size && | |
631 | rw_aux_tree(b, t)[l].offset < offset); | |
632 | EBUG_ON(l && | |
633 | rw_aux_tree(b, t)[l - 1].offset >= offset); | |
634 | ||
635 | EBUG_ON(l > r); | |
636 | EBUG_ON(l > t->size); | |
637 | ||
638 | return l; | |
639 | } | |
640 | ||
641 | static inline unsigned bfloat_mantissa(const struct bkey_float *f, | |
642 | unsigned idx) | |
643 | { | |
644 | return idx < BFLOAT_32BIT_NR ? f->mantissa32 : f->mantissa16; | |
645 | } | |
646 | ||
647 | static inline void bfloat_mantissa_set(struct bkey_float *f, | |
648 | unsigned idx, unsigned mantissa) | |
649 | { | |
650 | if (idx < BFLOAT_32BIT_NR) | |
651 | f->mantissa32 = mantissa; | |
652 | else | |
653 | f->mantissa16 = mantissa; | |
654 | } | |
655 | ||
656 | static inline unsigned bkey_mantissa(const struct bkey_packed *k, | |
657 | const struct bkey_float *f, | |
658 | unsigned idx) | |
659 | { | |
660 | u64 v; | |
661 | ||
662 | EBUG_ON(!bkey_packed(k)); | |
663 | ||
664 | v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); | |
665 | ||
666 | /* | |
667 | * In little endian, we're shifting off low bits (and then the bits we | |
668 | * want are at the low end), in big endian we're shifting off high bits | |
669 | * (and then the bits we want are at the high end, so we shift them | |
670 | * back down): | |
671 | */ | |
672 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |
673 | v >>= f->exponent & 7; | |
674 | #else | |
675 | v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16); | |
676 | #endif | |
677 | return idx < BFLOAT_32BIT_NR ? (u32) v : (u16) v; | |
678 | } | |
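/*
 * Worked example (editorial, little endian): with f->exponent = 13 the
 * load above reads the unaligned u64 starting at byte 13 >> 3 = 1 of the
 * key, and the shift discards the low 13 & 7 = 5 bits, leaving the
 * mantissa bits at the bottom of v.
 */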
679 | ||
680 | static void make_bfloat(struct btree *b, struct bset_tree *t, | |
681 | unsigned j, | |
682 | struct bkey_packed *min_key, | |
683 | struct bkey_packed *max_key) | |
684 | { | |
685 | struct bkey_float *f = bkey_float(b, t, j); | |
686 | struct bkey_packed *m = tree_to_bkey(b, t, j); | |
687 | struct bkey_packed *p = tree_to_prev_bkey(b, t, j); | |
688 | struct bkey_packed *l, *r; | |
689 | unsigned bits = j < BFLOAT_32BIT_NR ? 32 : 16; | |
690 | unsigned mantissa; | |
691 | int shift, exponent, high_bit; | |
692 | ||
693 | EBUG_ON(bkey_next(p) != m); | |
694 | ||
695 | if (is_power_of_2(j)) { | |
696 | l = min_key; | |
697 | ||
698 | if (!l->u64s) { | |
699 | if (!bkey_pack_pos(l, b->data->min_key, b)) { | |
700 | struct bkey_i tmp; | |
701 | ||
702 | bkey_init(&tmp.k); | |
703 | tmp.k.p = b->data->min_key; | |
704 | bkey_copy(l, &tmp); | |
705 | } | |
706 | } | |
707 | } else { | |
708 | l = tree_to_prev_bkey(b, t, j >> ffs(j)); | |
709 | ||
710 | EBUG_ON(m < l); | |
711 | } | |
712 | ||
713 | if (is_power_of_2(j + 1)) { | |
714 | r = max_key; | |
715 | ||
716 | if (!r->u64s) { | |
717 | if (!bkey_pack_pos(r, t->max_key, b)) { | |
718 | struct bkey_i tmp; | |
719 | ||
720 | bkey_init(&tmp.k); | |
721 | tmp.k.p = t->max_key; | |
722 | bkey_copy(r, &tmp); | |
723 | } | |
724 | } | |
725 | } else { | |
726 | r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); | |
727 | ||
728 | EBUG_ON(m > r); | |
729 | } | |
730 | ||
731 | /* | |
732 | * for failed bfloats, the lookup code falls back to comparing against | |
733 | * the original key. | |
734 | */ | |
735 | ||
736 | if (!bkey_packed(l) || !bkey_packed(r) || | |
737 | !bkey_packed(p) || !bkey_packed(m) || | |
738 | !b->nr_key_bits) { | |
739 | f->exponent = BFLOAT_FAILED_UNPACKED; | |
740 | return; | |
741 | } | |
742 | ||
743 | /* | |
744 | * The greatest differing bit of l and r is the first bit we must | |
745 | * include in the bfloat mantissa we're creating in order to do | |
746 | * comparisons - that bit always becomes the high bit of | |
747 | * bfloat->mantissa, and thus the exponent we're calculating here is | |
748 | * the position of what will become the low bit in bfloat->mantissa: | |
749 | * | |
750 | * Note that this may be negative - we may be running off the low end | |
751 | * of the key: we handle this later: | |
752 | */ | |
753 | high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), | |
754 | min_t(unsigned, bits, b->nr_key_bits) - 1); | |
755 | exponent = high_bit - (bits - 1); | |
756 | ||
757 | /* | |
758 | * Then we calculate the actual shift value, from the start of the key | |
759 | * (k->_data), to get the key bits starting at exponent: | |
760 | */ | |
761 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |
762 | shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; | |
763 | ||
764 | EBUG_ON(shift + bits > b->format.key_u64s * 64); | |
765 | #else | |
766 | shift = high_bit_offset + | |
767 | b->nr_key_bits - | |
768 | exponent - | |
769 | bits; | |
770 | ||
771 | EBUG_ON(shift < KEY_PACKED_BITS_START); | |
772 | #endif | |
773 | EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); | |
774 | ||
775 | f->exponent = shift; | |
776 | mantissa = bkey_mantissa(m, f, j); | |
777 | ||
778 | /* | |
779 | * If we've got garbage bits, set them to all 1s - it's legal for the | |
780 | * bfloat to compare larger than the original key, but not smaller: | |
781 | */ | |
782 | if (exponent < 0) | |
783 | mantissa |= ~(~0U << -exponent); | |
784 | ||
785 | bfloat_mantissa_set(f, j, mantissa); | |
786 | ||
787 | /* | |
788 | * The bfloat must be able to tell its key apart from the previous key - | |
789 | * if its key and the previous key don't differ in the required bits, | |
790 | * flag as failed - unless the keys are actually equal, in which case | |
791 | * we aren't required to return a specific one: | |
792 | */ | |
793 | if (exponent > 0 && | |
794 | bfloat_mantissa(f, j) == bkey_mantissa(p, f, j) && | |
795 | bkey_cmp_packed(b, p, m)) { | |
796 | f->exponent = BFLOAT_FAILED_PREV; | |
797 | return; | |
798 | } | |
799 | ||
800 | /* | |
801 | * f->mantissa must compare >= the original key - for transitivity with | |
802 | * the comparison in bset_search_tree. If we're dropping set bits, | |
803 | * increment it: | |
804 | */ | |
805 | if (exponent > (int) bch2_bkey_ffs(b, m)) { | |
806 | if (j < BFLOAT_32BIT_NR | |
807 | ? f->mantissa32 == U32_MAX | |
808 | : f->mantissa16 == U16_MAX) | |
809 | f->exponent = BFLOAT_FAILED_OVERFLOW; | |
810 | ||
811 | if (j < BFLOAT_32BIT_NR) | |
812 | f->mantissa32++; | |
813 | else | |
814 | f->mantissa16++; | |
815 | } | |
816 | } | |
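/*
 * Worked example (editorial, hypothetical numbers): suppose l and r first
 * differ at bit 40 and we're building a 16 bit mantissa. Then high_bit =
 * 40, exponent = 40 - 15 = 25, and the bfloat stores key bits [25, 40].
 * Lookups compare those 16 bits of the search key against f->mantissa16
 * instead of doing a full key comparison; cases the mantissa can't decide
 * were already flagged BFLOAT_FAILED_* at construction time and take the
 * slowpath.
 */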
817 | ||
818 | /* bytes remaining - only valid for last bset: */ | |
819 | static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) | |
820 | { | |
821 | bset_aux_tree_verify(b); | |
822 | ||
823 | return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); | |
824 | } | |
825 | ||
826 | static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) | |
827 | { | |
828 | unsigned bytes = __bset_tree_capacity(b, t); | |
829 | ||
830 | if (bytes < 7 * BFLOAT_32BIT_NR) | |
831 | return bytes / 7; | |
832 | ||
833 | bytes -= 7 * BFLOAT_32BIT_NR; | |
834 | ||
835 | return BFLOAT_32BIT_NR + bytes / 5; | |
836 | } | |
837 | ||
838 | static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) | |
839 | { | |
840 | return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); | |
841 | } | |
842 | ||
843 | static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) | |
844 | { | |
845 | struct bkey_packed *k; | |
846 | ||
847 | t->size = 1; | |
848 | t->extra = BSET_RW_AUX_TREE_VAL; | |
849 | rw_aux_tree(b, t)[0].offset = | |
850 | __btree_node_key_to_offset(b, btree_bkey_first(b, t)); | |
851 | ||
852 | for (k = btree_bkey_first(b, t); | |
853 | k != btree_bkey_last(b, t); | |
854 | k = bkey_next(k)) { | |
855 | if (t->size == bset_rw_tree_capacity(b, t)) | |
856 | break; | |
857 | ||
858 | if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > | |
859 | L1_CACHE_BYTES) | |
860 | rw_aux_tree_set(b, t, t->size++, k); | |
861 | } | |
862 | } | |
863 | ||
864 | static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) | |
865 | { | |
866 | struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); | |
867 | struct bkey_packed min_key, max_key; | |
868 | unsigned j, cacheline = 1; | |
869 | ||
870 | /* signal to make_bfloat() that they're uninitialized: */ | |
871 | min_key.u64s = max_key.u64s = 0; | |
872 | ||
873 | t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), | |
874 | bset_ro_tree_capacity(b, t)); | |
875 | retry: | |
876 | if (t->size < 2) { | |
877 | t->size = 0; | |
878 | t->extra = BSET_NO_AUX_TREE_VAL; | |
879 | return; | |
880 | } | |
881 | ||
882 | t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; | |
883 | ||
884 | /* First we figure out where the first key in each cacheline is */ | |
885 | eytzinger1_for_each(j, t->size) { | |
886 | while (bkey_to_cacheline(b, t, k) < cacheline) | |
887 | prev = k, k = bkey_next(k); | |
888 | ||
889 | if (k >= btree_bkey_last(b, t)) { | |
890 | /* XXX: this path sucks */ | |
891 | t->size--; | |
892 | goto retry; | |
893 | } | |
894 | ||
895 | ro_aux_tree_prev(b, t)[j] = prev->u64s; | |
896 | bkey_float(b, t, j)->key_offset = | |
897 | bkey_to_cacheline_offset(b, t, cacheline++, k); | |
898 | ||
899 | EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); | |
900 | EBUG_ON(tree_to_bkey(b, t, j) != k); | |
901 | } | |
902 | ||
903 | while (bkey_next(k) != btree_bkey_last(b, t)) | |
904 | k = bkey_next(k); | |
905 | ||
906 | t->max_key = bkey_unpack_pos(b, k); | |
907 | ||
908 | /* Then we build the tree */ | |
909 | eytzinger1_for_each(j, t->size) | |
910 | make_bfloat(b, t, j, &min_key, &max_key); | |
911 | } | |
912 | ||
913 | static void bset_alloc_tree(struct btree *b, struct bset_tree *t) | |
914 | { | |
915 | struct bset_tree *i; | |
916 | ||
917 | for (i = b->set; i != t; i++) | |
918 | BUG_ON(bset_has_rw_aux_tree(i)); | |
919 | ||
920 | bch2_bset_set_no_aux_tree(b, t); | |
921 | ||
922 | /* round up to next cacheline: */ | |
923 | t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), | |
924 | SMP_CACHE_BYTES / sizeof(u64)); | |
925 | ||
926 | bset_aux_tree_verify(b); | |
927 | } | |
928 | ||
929 | void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, | |
930 | bool writeable) | |
931 | { | |
932 | if (writeable | |
933 | ? bset_has_rw_aux_tree(t) | |
934 | : bset_has_ro_aux_tree(t)) | |
935 | return; | |
936 | ||
937 | bset_alloc_tree(b, t); | |
938 | ||
939 | if (!__bset_tree_capacity(b, t)) | |
940 | return; | |
941 | ||
942 | if (writeable) | |
943 | __build_rw_aux_tree(b, t); | |
944 | else | |
945 | __build_ro_aux_tree(b, t); | |
946 | ||
947 | bset_aux_tree_verify(b); | |
948 | } | |
949 | ||
950 | void bch2_bset_init_first(struct btree *b, struct bset *i) | |
951 | { | |
952 | struct bset_tree *t; | |
953 | ||
954 | BUG_ON(b->nsets); | |
955 | ||
956 | memset(i, 0, sizeof(*i)); | |
957 | get_random_bytes(&i->seq, sizeof(i->seq)); | |
958 | SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); | |
959 | ||
960 | t = &b->set[b->nsets++]; | |
961 | set_btree_bset(b, t, i); | |
962 | } | |
963 | ||
964 | void bch2_bset_init_next(struct bch_fs *c, struct btree *b, | |
965 | struct btree_node_entry *bne) | |
966 | { | |
967 | struct bset *i = &bne->keys; | |
968 | struct bset_tree *t; | |
969 | ||
970 | BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); | |
971 | BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); | |
972 | BUG_ON(b->nsets >= MAX_BSETS); | |
973 | ||
974 | memset(i, 0, sizeof(*i)); | |
975 | i->seq = btree_bset_first(b)->seq; | |
976 | SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); | |
977 | ||
978 | t = &b->set[b->nsets++]; | |
979 | set_btree_bset(b, t, i); | |
980 | } | |
981 | ||
982 | /* | |
983 | * find _some_ key in the same bset as @k that precedes @k - not necessarily the | |
984 | * immediate predecessor: | |
985 | */ | |
986 | static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, | |
987 | struct bkey_packed *k) | |
988 | { | |
989 | struct bkey_packed *p; | |
990 | unsigned offset; | |
991 | int j; | |
992 | ||
993 | EBUG_ON(k < btree_bkey_first(b, t) || | |
994 | k > btree_bkey_last(b, t)); | |
995 | ||
996 | if (k == btree_bkey_first(b, t)) | |
997 | return NULL; | |
998 | ||
999 | switch (bset_aux_tree_type(t)) { | |
1000 | case BSET_NO_AUX_TREE: | |
1001 | p = btree_bkey_first(b, t); | |
1002 | break; | |
1003 | case BSET_RO_AUX_TREE: | |
1004 | j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); | |
1005 | ||
1006 | do { | |
1007 | p = j ? tree_to_bkey(b, t, | |
1008 | __inorder_to_eytzinger1(j--, | |
1009 | t->size, t->extra)) | |
1010 | : btree_bkey_first(b, t); | |
1011 | } while (p >= k); | |
1012 | break; | |
1013 | case BSET_RW_AUX_TREE: | |
1014 | offset = __btree_node_key_to_offset(b, k); | |
1015 | j = rw_aux_tree_bsearch(b, t, offset); | |
1016 | p = j ? rw_aux_to_bkey(b, t, j - 1) | |
1017 | : btree_bkey_first(b, t); | |
1018 | break; | |
1019 | } | |
1020 | ||
1021 | return p; | |
1022 | } | |
1023 | ||
1024 | struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, | |
1025 | struct bset_tree *t, | |
1026 | struct bkey_packed *k, | |
1027 | unsigned min_key_type) | |
1028 | { | |
1029 | struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; | |
1030 | ||
1031 | while ((p = __bkey_prev(b, t, k)) && !ret) { | |
1032 | for (i = p; i != k; i = bkey_next(i)) | |
1033 | if (i->type >= min_key_type) | |
1034 | ret = i; | |
1035 | ||
1036 | k = p; | |
1037 | } | |
1038 | ||
1039 | if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { | |
1040 | BUG_ON(ret >= orig_k); | |
1041 | ||
1042 | for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t); | |
1043 | i != orig_k; | |
1044 | i = bkey_next(i)) | |
1045 | BUG_ON(i->type >= min_key_type); | |
1046 | } | |
1047 | ||
1048 | return ret; | |
1049 | } | |
1050 | ||
1051 | /* Insert */ | |
1052 | ||
1053 | static void rw_aux_tree_fix_invalidated_key(struct btree *b, | |
1054 | struct bset_tree *t, | |
1055 | struct bkey_packed *k) | |
1056 | { | |
1057 | unsigned offset = __btree_node_key_to_offset(b, k); | |
1058 | unsigned j = rw_aux_tree_bsearch(b, t, offset); | |
1059 | ||
1060 | if (j < t->size && | |
1061 | rw_aux_tree(b, t)[j].offset == offset) | |
1062 | rw_aux_tree_set(b, t, j, k); | |
1063 | ||
1064 | bch2_bset_verify_rw_aux_tree(b, t); | |
1065 | } | |
1066 | ||
1067 | static void ro_aux_tree_fix_invalidated_key(struct btree *b, | |
1068 | struct bset_tree *t, | |
1069 | struct bkey_packed *k) | |
1070 | { | |
1071 | struct bkey_packed min_key, max_key; | |
1072 | unsigned inorder, j; | |
1073 | ||
1074 | EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); | |
1075 | ||
1076 | /* signal to make_bfloat() that they're uninitialized: */ | |
1077 | min_key.u64s = max_key.u64s = 0; | |
1078 | ||
1079 | if (bkey_next(k) == btree_bkey_last(b, t)) { | |
1080 | t->max_key = bkey_unpack_pos(b, k); | |
1081 | ||
1082 | for (j = 1; j < t->size; j = j * 2 + 1) | |
1083 | make_bfloat(b, t, j, &min_key, &max_key); | |
1084 | } | |
1085 | ||
1086 | inorder = bkey_to_cacheline(b, t, k); | |
1087 | ||
1088 | if (inorder && | |
1089 | inorder < t->size) { | |
1090 | j = __inorder_to_eytzinger1(inorder, t->size, t->extra); | |
1091 | ||
1092 | if (k == tree_to_bkey(b, t, j)) { | |
1093 | /* Fix the node this key corresponds to */ | |
1094 | make_bfloat(b, t, j, &min_key, &max_key); | |
1095 | ||
1096 | /* Children for which this key is the right boundary */ | |
1097 | for (j = eytzinger1_left_child(j); | |
1098 | j < t->size; | |
1099 | j = eytzinger1_right_child(j)) | |
1100 | make_bfloat(b, t, j, &min_key, &max_key); | |
1101 | } | |
1102 | } | |
1103 | ||
1104 | if (inorder + 1 < t->size) { | |
1105 | j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); | |
1106 | ||
1107 | if (k == tree_to_prev_bkey(b, t, j)) { | |
1108 | make_bfloat(b, t, j, &min_key, &max_key); | |
1109 | ||
1110 | /* Children for which this key is the left boundary */ | |
1111 | for (j = eytzinger1_right_child(j); | |
1112 | j < t->size; | |
1113 | j = eytzinger1_left_child(j)) | |
1114 | make_bfloat(b, t, j, &min_key, &max_key); | |
1115 | } | |
1116 | } | |
1117 | } | |
1118 | ||
1119 | /** | |
1120 | * bch2_bset_fix_invalidated_key() - given an existing key @k that has been | |
1121 | * modified, fix any auxiliary search tree by remaking all the nodes in the | |
1122 | * auxiliary search tree that @k corresponds to | |
1123 | */ | |
1124 | void bch2_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t, | |
1125 | struct bkey_packed *k) | |
1126 | { | |
1127 | switch (bset_aux_tree_type(t)) { | |
1128 | case BSET_NO_AUX_TREE: | |
1129 | break; | |
1130 | case BSET_RO_AUX_TREE: | |
1131 | ro_aux_tree_fix_invalidated_key(b, t, k); | |
1132 | break; | |
1133 | case BSET_RW_AUX_TREE: | |
1134 | rw_aux_tree_fix_invalidated_key(b, t, k); | |
1135 | break; | |
1136 | } | |
1137 | } | |
1138 | ||
1139 | static void bch2_bset_fix_lookup_table(struct btree *b, | |
1140 | struct bset_tree *t, | |
1141 | struct bkey_packed *_where, | |
1142 | unsigned clobber_u64s, | |
1143 | unsigned new_u64s) | |
1144 | { | |
1145 | int shift = new_u64s - clobber_u64s; | |
1146 | unsigned l, j, where = __btree_node_key_to_offset(b, _where); | |
1147 | ||
1148 | EBUG_ON(bset_has_ro_aux_tree(t)); | |
1149 | ||
1150 | if (!bset_has_rw_aux_tree(t)) | |
1151 | return; | |
1152 | ||
1153 | l = rw_aux_tree_bsearch(b, t, where); | |
1154 | ||
1155 | /* l is the first entry >= @where */ | |
1156 | ||
1157 | EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where); | |
1158 | EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where); | |
1159 | ||
1160 | if (!l) /* never delete first entry */ | |
1161 | l++; | |
1162 | else if (l < t->size && | |
1163 | where < t->end_offset && | |
1164 | rw_aux_tree(b, t)[l].offset == where) | |
1165 | rw_aux_tree_set(b, t, l++, _where); | |
1166 | ||
1167 | /* l now > where */ | |
1168 | ||
1169 | for (j = l; | |
1170 | j < t->size && | |
1171 | rw_aux_tree(b, t)[j].offset < where + clobber_u64s; | |
1172 | j++) | |
1173 | ; | |
1174 | ||
1175 | if (j < t->size && | |
1176 | rw_aux_tree(b, t)[j].offset + shift == | |
1177 | rw_aux_tree(b, t)[l - 1].offset) | |
1178 | j++; | |
1179 | ||
1180 | memmove(&rw_aux_tree(b, t)[l], | |
1181 | &rw_aux_tree(b, t)[j], | |
1182 | (void *) &rw_aux_tree(b, t)[t->size] - | |
1183 | (void *) &rw_aux_tree(b, t)[j]); | |
1184 | t->size -= j - l; | |
1185 | ||
1186 | for (j = l; j < t->size; j++) | |
1187 | rw_aux_tree(b, t)[j].offset += shift; | |
1188 | ||
1189 | EBUG_ON(l < t->size && | |
1190 | rw_aux_tree(b, t)[l].offset == | |
1191 | rw_aux_tree(b, t)[l - 1].offset); | |
1192 | ||
1193 | if (t->size < bset_rw_tree_capacity(b, t) && | |
1194 | (l < t->size | |
1195 | ? rw_aux_tree(b, t)[l].offset | |
1196 | : t->end_offset) - | |
1197 | rw_aux_tree(b, t)[l - 1].offset > | |
1198 | L1_CACHE_BYTES / sizeof(u64)) { | |
1199 | struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); | |
1200 | struct bkey_packed *end = l < t->size | |
1201 | ? rw_aux_to_bkey(b, t, l) | |
1202 | : btree_bkey_last(b, t); | |
1203 | struct bkey_packed *k = start; | |
1204 | ||
1205 | while (1) { | |
1206 | k = bkey_next(k); | |
1207 | if (k == end) | |
1208 | break; | |
1209 | ||
1210 | if ((void *) k - (void *) start >= L1_CACHE_BYTES) { | |
1211 | memmove(&rw_aux_tree(b, t)[l + 1], | |
1212 | &rw_aux_tree(b, t)[l], | |
1213 | (void *) &rw_aux_tree(b, t)[t->size] - | |
1214 | (void *) &rw_aux_tree(b, t)[l]); | |
1215 | t->size++; | |
1216 | rw_aux_tree_set(b, t, l, k); | |
1217 | break; | |
1218 | } | |
1219 | } | |
1220 | } | |
1221 | ||
1222 | bch2_bset_verify_rw_aux_tree(b, t); | |
1223 | bset_aux_tree_verify(b); | |
1224 | } | |
1225 | ||
1226 | void bch2_bset_insert(struct btree *b, | |
1227 | struct btree_node_iter *iter, | |
1228 | struct bkey_packed *where, | |
1229 | struct bkey_i *insert, | |
1230 | unsigned clobber_u64s) | |
1231 | { | |
1232 | struct bkey_format *f = &b->format; | |
1233 | struct bset_tree *t = bset_tree_last(b); | |
1234 | struct bkey_packed packed, *src = bkey_to_packed(insert); | |
1235 | ||
1236 | bch2_bset_verify_rw_aux_tree(b, t); | |
1237 | ||
1238 | if (bch2_bkey_pack_key(&packed, &insert->k, f)) | |
1239 | src = &packed; | |
1240 | ||
1241 | if (!bkey_whiteout(&insert->k)) | |
1242 | btree_keys_account_key_add(&b->nr, t - b->set, src); | |
1243 | ||
1244 | if (src->u64s != clobber_u64s) { | |
1245 | u64 *src_p = where->_data + clobber_u64s; | |
1246 | u64 *dst_p = where->_data + src->u64s; | |
1247 | ||
1248 | EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < | |
1249 | (int) clobber_u64s - src->u64s); | |
1250 | ||
1251 | memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); | |
1252 | le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); | |
1253 | set_btree_bset_end(b, t); | |
1254 | } | |
1255 | ||
1256 | memcpy_u64s(where, src, | |
1257 | bkeyp_key_u64s(f, src)); | |
1258 | memcpy_u64s(bkeyp_val(f, where), &insert->v, | |
1259 | bkeyp_val_u64s(f, src)); | |
1260 | ||
1261 | bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); | |
1262 | ||
1263 | bch2_verify_key_order(b, iter, where); | |
1264 | bch2_verify_btree_nr_keys(b); | |
1265 | } | |
1266 | ||
1267 | void bch2_bset_delete(struct btree *b, | |
1268 | struct bkey_packed *where, | |
1269 | unsigned clobber_u64s) | |
1270 | { | |
1271 | struct bset_tree *t = bset_tree_last(b); | |
1272 | u64 *src_p = where->_data + clobber_u64s; | |
1273 | u64 *dst_p = where->_data; | |
1274 | ||
1275 | bch2_bset_verify_rw_aux_tree(b, t); | |
1276 | ||
1277 | EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); | |
1278 | ||
1279 | memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); | |
1280 | le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); | |
1281 | set_btree_bset_end(b, t); | |
1282 | ||
1283 | bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); | |
1284 | } | |
1285 | ||
1286 | /* Lookup */ | |
1287 | ||
1288 | __flatten | |
1289 | static struct bkey_packed *bset_search_write_set(const struct btree *b, | |
1290 | struct bset_tree *t, | |
1291 | struct bpos search, | |
1292 | const struct bkey_packed *packed_search) | |
1293 | { | |
1294 | unsigned l = 0, r = t->size; | |
1295 | ||
1296 | while (l + 1 != r) { | |
1297 | unsigned m = (l + r) >> 1; | |
1298 | ||
1299 | if (bkey_cmp(rw_aux_tree(b, t)[m].k, search) < 0) | |
1300 | l = m; | |
1301 | else | |
1302 | r = m; | |
1303 | } | |
1304 | ||
1305 | return rw_aux_to_bkey(b, t, l); | |
1306 | } | |
1307 | ||
1308 | noinline | |
1309 | static int bset_search_tree_slowpath(const struct btree *b, | |
1310 | struct bset_tree *t, struct bpos *search, | |
1311 | const struct bkey_packed *packed_search, | |
1312 | unsigned n) | |
1313 | { | |
1314 | return bkey_cmp_p_or_unp(b, tree_to_bkey(b, t, n), | |
1315 | packed_search, search) < 0; | |
1316 | } | |
1317 | ||
1318 | __flatten | |
1319 | static struct bkey_packed *bset_search_tree(const struct btree *b, | |
1320 | struct bset_tree *t, | |
1321 | struct bpos search, | |
1322 | const struct bkey_packed *packed_search) | |
1323 | { | |
1324 | struct ro_aux_tree *base = ro_aux_tree_base(b, t); | |
1325 | struct bkey_float *f = bkey_float_get(base, 1); | |
1326 | void *p; | |
1327 | unsigned inorder, n = 1; | |
1328 | ||
1329 | while (1) { | |
1330 | if (likely(n << 4 < t->size)) { | |
1331 | p = bkey_float_get(base, n << 4); | |
1332 | prefetch(p); | |
1333 | } else if (n << 3 < t->size) { | |
1334 | inorder = __eytzinger1_to_inorder(n, t->size, t->extra); | |
1335 | p = bset_cacheline(b, t, inorder); | |
1336 | #ifdef CONFIG_X86_64 | |
1337 | asm(".intel_syntax noprefix;" | |
1338 | "prefetcht0 [%0 - 127 + 64 * 0];" | |
1339 | "prefetcht0 [%0 - 127 + 64 * 1];" | |
1340 | "prefetcht0 [%0 - 127 + 64 * 2];" | |
1341 | "prefetcht0 [%0 - 127 + 64 * 3];" | |
1342 | ".att_syntax prefix;" | |
1343 | : | |
1344 | : "r" (p + 127)); | |
1345 | #else | |
1346 | prefetch(p + L1_CACHE_BYTES * 0); | |
1347 | prefetch(p + L1_CACHE_BYTES * 1); | |
1348 | prefetch(p + L1_CACHE_BYTES * 2); | |
1349 | prefetch(p + L1_CACHE_BYTES * 3); | |
1350 | #endif | |
1351 | } else if (n >= t->size) | |
1352 | break; | |
1353 | ||
1354 | f = bkey_float_get(base, n); | |
1355 | ||
1356 | if (packed_search && | |
1357 | likely(f->exponent < BFLOAT_FAILED)) | |
1358 | n = n * 2 + (bfloat_mantissa(f, n) < | |
1359 | bkey_mantissa(packed_search, f, n)); | |
1360 | else | |
1361 | n = n * 2 + bset_search_tree_slowpath(b, t, | |
1362 | &search, packed_search, n); | |
1363 | } | |
1364 | ||
1365 | inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); | |
1366 | ||
1367 | /* | |
1368 | * n would have been the node we recursed to - the low bit tells us if | |
1369 | * we recursed left or recursed right. | |
1370 | */ | |
1371 | if (n & 1) { | |
1372 | return cacheline_to_bkey(b, t, inorder, f->key_offset); | |
1373 | } else { | |
1374 | if (--inorder) { | |
1375 | n = eytzinger1_prev(n >> 1, t->size); | |
1376 | f = bkey_float_get(base, n); | |
1377 | return cacheline_to_bkey(b, t, inorder, f->key_offset); | |
1378 | } else | |
1379 | return btree_bkey_first(b, t); | |
1380 | } | |
1381 | } | |
1382 | ||
1383 | /* | |
1384 | * Returns the first key greater than or equal to @search | |
1385 | */ | |
1386 | __always_inline __flatten | |
1387 | static struct bkey_packed *bch2_bset_search(struct btree *b, | |
1388 | struct bset_tree *t, | |
1389 | struct bpos search, | |
1390 | struct bkey_packed *packed_search, | |
1391 | const struct bkey_packed *lossy_packed_search, | |
1392 | bool strictly_greater) | |
1393 | { | |
1394 | struct bkey_packed *m; | |
1395 | ||
1396 | /* | |
1397 | * First we search for the right cacheline, then we do a linear search | |
1398 | * within that cacheline. | |
1399 | * | |
1400 | * To search for the cacheline, there are three different possibilities: | |
1401 | * * The set is too small to have a search tree, so we just do a linear | |
1402 | * search over the whole set. | |
1403 | * * The set is the one we're currently inserting into; keeping a full | |
1404 | * auxiliary search tree up to date would be too expensive, so we | |
1405 | * use a much simpler lookup table to do a binary search - | |
1406 | * bset_search_write_set(). | |
1407 | * * Or we use the auxiliary search tree we constructed earlier - | |
1408 | * bset_search_tree() | |
1409 | */ | |
1410 | ||
1411 | switch (bset_aux_tree_type(t)) { | |
1412 | case BSET_NO_AUX_TREE: | |
1413 | m = btree_bkey_first(b, t); | |
1414 | break; | |
1415 | case BSET_RW_AUX_TREE: | |
1416 | m = bset_search_write_set(b, t, search, lossy_packed_search); | |
1417 | break; | |
1418 | case BSET_RO_AUX_TREE: | |
1419 | /* | |
1420 | * Each node in the auxiliary search tree covers a certain range | |
1421 | * of bits, and keys above and below the set it covers might | |
1422 | * differ outside those bits - so we have to special case the | |
1423 | * start and end - handle that here: | |
1424 | */ | |
1425 | ||
1426 | if (bkey_cmp(search, t->max_key) > 0) | |
1427 | return btree_bkey_last(b, t); | |
1428 | ||
1429 | m = bset_search_tree(b, t, search, lossy_packed_search); | |
1430 | break; | |
1431 | } | |
1432 | ||
1433 | if (lossy_packed_search) | |
1434 | while (m != btree_bkey_last(b, t) && | |
1435 | !btree_iter_pos_cmp_p_or_unp(b, search, lossy_packed_search, | |
1436 | m, strictly_greater)) | |
1437 | m = bkey_next(m); | |
1438 | ||
1439 | if (!packed_search) | |
1440 | while (m != btree_bkey_last(b, t) && | |
1441 | !btree_iter_pos_cmp_packed(b, &search, m, strictly_greater)) | |
1442 | m = bkey_next(m); | |
1443 | ||
1444 | if (btree_keys_expensive_checks(b)) { | |
1445 | struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); | |
1446 | ||
1447 | BUG_ON(prev && | |
1448 | btree_iter_pos_cmp_p_or_unp(b, search, packed_search, | |
1449 | prev, strictly_greater)); | |
1450 | } | |
1451 | ||
1452 | return m; | |
1453 | } | |
1454 | ||
1455 | /* Btree node iterator */ | |
1456 | ||
1457 | void bch2_btree_node_iter_push(struct btree_node_iter *iter, | |
1458 | struct btree *b, | |
1459 | const struct bkey_packed *k, | |
1460 | const struct bkey_packed *end) | |
1461 | { | |
1462 | __bch2_btree_node_iter_push(iter, b, k, end); | |
1463 | bch2_btree_node_iter_sort(iter, b); | |
1464 | } | |
1465 | ||
1466 | noinline __flatten __attribute__((cold)) | |
1467 | static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, | |
1468 | struct btree *b, struct bpos search, | |
1469 | bool strictly_greater, bool is_extents) | |
1470 | { | |
1471 | struct bset_tree *t; | |
1472 | ||
1473 | trace_bkey_pack_pos_fail(&search); | |
1474 | ||
1475 | for_each_bset(b, t) | |
1476 | __bch2_btree_node_iter_push(iter, b, | |
1477 | bch2_bset_search(b, t, search, NULL, NULL, | |
1478 | strictly_greater), | |
1479 | btree_bkey_last(b, t)); | |
1480 | ||
1481 | bch2_btree_node_iter_sort(iter, b); | |
1482 | } | |
1483 | ||
1484 | /** | |
1485 | * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a | |
1486 | * given position | |
1487 | * | |
1488 | * Main entry point to the lookup code for individual btree nodes: | |
1489 | * | |
1490 | * NOTE: | |
1491 | * | |
1492 | * When you don't filter out deleted keys, btree nodes _do_ contain duplicate | |
1493 | * keys. This doesn't matter for most code, but it does matter for lookups. | |
1494 | * | |
1495 | * Some adjacent keys with a string of equal keys: | |
1496 | * i j k k k k l m | |
1497 | * | |
1498 | * If you search for k, the lookup code isn't guaranteed to return you any | |
1499 | * specific k. The lookup code is conceptually doing a binary search, and | |
1500 | * iterating backwards is very expensive, so if the pivot happens to land at | |
1501 | * the last k, that's what you'll get. | |
1502 | * | |
1503 | * This works out ok, but it's something to be aware of: | |
1504 | * | |
1505 | * - For non extents, we guarantee that the live key comes last - see | |
1506 | * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't | |
1507 | * see will only be deleted keys you don't care about. | |
1508 | * | |
1509 | * - For extents, deleted keys sort last (see the comment at the top of this | |
1510 | * file). But when you're searching for extents, you actually want the first | |
1511 | * key strictly greater than your search key - an extent that compares equal | |
1512 | * to the search key is going to have 0 sectors after the search key. | |
1513 | * | |
1514 | * But this does mean that we can't just search for | |
1515 | * bkey_successor(start_of_range) to get the first extent that overlaps with | |
1516 | * the range we want - if we're unlucky and there's an extent that ends | |
1517 | * exactly where we searched, then there could be a deleted key at the same | |
1518 | * position and we'd get that when we search instead of the preceding extent | |
1519 | * we needed. | |
1520 | * | |
1521 | * So we've got to search for start_of_range, then after the lookup iterate | |
1522 | * past any extents that compare equal to the position we searched for. | |
1523 | */ | |
1524 | void bch2_btree_node_iter_init(struct btree_node_iter *iter, | |
1525 | struct btree *b, struct bpos search, | |
1526 | bool strictly_greater, bool is_extents) | |
1527 | { | |
1528 | struct bset_tree *t; | |
1529 | struct bkey_packed p, *packed_search = NULL; | |
1530 | ||
1531 | EBUG_ON(bkey_cmp(search, b->data->min_key) < 0); | |
1532 | bset_aux_tree_verify(b); | |
1533 | ||
1534 | __bch2_btree_node_iter_init(iter, is_extents); | |
1535 | ||
1536 | switch (bch2_bkey_pack_pos_lossy(&p, search, b)) { | |
1537 | case BKEY_PACK_POS_EXACT: | |
1538 | packed_search = &p; | |
1539 | break; | |
1540 | case BKEY_PACK_POS_SMALLER: | |
1541 | packed_search = NULL; | |
1542 | break; | |
1543 | case BKEY_PACK_POS_FAIL: | |
1544 | btree_node_iter_init_pack_failed(iter, b, search, | |
1545 | strictly_greater, is_extents); | |
1546 | return; | |
1547 | } | |
1548 | ||
1549 | for_each_bset(b, t) | |
1550 | __bch2_btree_node_iter_push(iter, b, | |
1551 | bch2_bset_search(b, t, search, | |
1552 | packed_search, &p, | |
1553 | strictly_greater), | |
1554 | btree_bkey_last(b, t)); | |
1555 | ||
1556 | bch2_btree_node_iter_sort(iter, b); | |
1557 | } | |
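/*
 * Editorial usage sketch, not part of the original file - walk every key
 * in @b from @pos onwards using only the iterator helpers above; marked
 * __maybe_unused since it exists purely as an illustration.
 */
static __maybe_unused void example_walk_node(struct btree *b,
					     struct bpos pos,
					     bool is_extents)
{
	struct btree_node_iter iter;
	struct bkey_packed *k;

	bch2_btree_node_iter_init(&iter, b, pos, false, is_extents);

	while ((k = bch2_btree_node_iter_peek_all(&iter, b)))
		/* process k here, then step to the next key across all bsets */
		bch2_btree_node_iter_advance(&iter, b);
}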
1558 | ||
1559 | void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, | |
1560 | struct btree *b, | |
1561 | bool is_extents) | |
1562 | { | |
1563 | struct bset_tree *t; | |
1564 | ||
1565 | __bch2_btree_node_iter_init(iter, is_extents); | |
1566 | ||
1567 | for_each_bset(b, t) | |
1568 | __bch2_btree_node_iter_push(iter, b, | |
1569 | btree_bkey_first(b, t), | |
1570 | btree_bkey_last(b, t)); | |
1571 | bch2_btree_node_iter_sort(iter, b); | |
1572 | } | |
1573 | ||
1574 | struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, | |
1575 | struct btree *b, | |
1576 | struct bset_tree *t) | |
1577 | { | |
1578 | struct btree_node_iter_set *set; | |
1579 | ||
1580 | btree_node_iter_for_each(iter, set) | |
1581 | if (set->end == t->end_offset) | |
1582 | return __btree_node_offset_to_key(b, set->k); | |
1583 | ||
1584 | return btree_bkey_last(b, t); | |
1585 | } | |
1586 | ||
1587 | static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, | |
1588 | struct btree *b, | |
1589 | unsigned first) | |
1590 | { | |
1591 | bool ret; | |
1592 | ||
1593 | if ((ret = (btree_node_iter_cmp(iter, b, | |
1594 | iter->data[first], | |
1595 | iter->data[first + 1]) > 0))) | |
1596 | swap(iter->data[first], iter->data[first + 1]); | |
1597 | return ret; | |
1598 | } | |
1599 | ||
1600 | void bch2_btree_node_iter_sort(struct btree_node_iter *iter, | |
1601 | struct btree *b) | |
1602 | { | |
1603 | /* unrolled bubble sort: */ | |
1604 | ||
1605 | if (!__btree_node_iter_set_end(iter, 2)) { | |
1606 | btree_node_iter_sort_two(iter, b, 0); | |
1607 | btree_node_iter_sort_two(iter, b, 1); | |
1608 | } | |
1609 | ||
1610 | if (!__btree_node_iter_set_end(iter, 1)) | |
1611 | btree_node_iter_sort_two(iter, b, 0); | |
1612 | } | |
1613 | ||
1614 | void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, | |
1615 | struct btree_node_iter_set *set) | |
1616 | { | |
1617 | struct btree_node_iter_set *last = | |
1618 | iter->data + ARRAY_SIZE(iter->data) - 1; | |
1619 | ||
1620 | memmove(&set[0], &set[1], (void *) last - (void *) set); | |
1621 | *last = (struct btree_node_iter_set) { 0, 0 }; | |
1622 | } | |
1623 | ||
1624 | static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, | |
1625 | struct btree *b) | |
1626 | { | |
1627 | iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; | |
1628 | ||
1629 | EBUG_ON(iter->data->k > iter->data->end); | |
1630 | ||
1631 | if (unlikely(__btree_node_iter_set_end(iter, 0))) { | |
1632 | bch2_btree_node_iter_set_drop(iter, iter->data); | |
1633 | return; | |
1634 | } | |
1635 | ||
1636 | if (__btree_node_iter_set_end(iter, 1)) | |
1637 | return; | |
1638 | ||
1639 | if (!btree_node_iter_sort_two(iter, b, 0)) | |
1640 | return; | |
1641 | ||
1642 | if (__btree_node_iter_set_end(iter, 2)) | |
1643 | return; | |
1644 | ||
1645 | btree_node_iter_sort_two(iter, b, 1); | |
1646 | } | |
1647 | ||
1648 | /** | |
1649 | * bch2_btree_node_iter_advance - advance @iter by one key | |
1650 | * | |
1651 | * Doesn't do debug checks - for cases (e.g. insert_fixup_extent()) where a | |
1652 | * bset might momentarily have out of order extents. | |
1653 | */ | |
1654 | void bch2_btree_node_iter_advance(struct btree_node_iter *iter, | |
1655 | struct btree *b) | |
1656 | { | |
1657 | #ifdef CONFIG_BCACHEFS_DEBUG | |
1658 | struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b); | |
1659 | ||
1660 | __bch2_btree_node_iter_advance(iter, b); | |
1661 | bch2_btree_node_iter_next_check(iter, b, k); | |
1662 | #else | |
1663 | __bch2_btree_node_iter_advance(iter, b); | |
1664 | #endif | |
1665 | } | |
1666 | ||
1667 | static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) | |
1668 | { | |
1669 | unsigned n = ARRAY_SIZE(iter->data); | |
1670 | ||
1671 | while (n && __btree_node_iter_set_end(iter, n - 1)) | |
1672 | --n; | |
1673 | ||
1674 | return n; | |
1675 | } | |
1676 | ||
1677 | /* | |
1678 | * Expensive: | |
1679 | */ | |
1680 | struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, | |
1681 | struct btree *b, | |
1682 | unsigned min_key_type) | |
1683 | { | |
1684 | struct bkey_packed *k, *prev = NULL; | |
1685 | struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b); | |
1686 | struct btree_node_iter_set *set; | |
1687 | struct bset_tree *t; | |
1688 | unsigned end; | |
1689 | ||
1690 | bch2_btree_node_iter_verify(iter, b); | |
1691 | ||
1692 | for_each_bset(b, t) { | |
1693 | k = bch2_bkey_prev_filter(b, t, | |
1694 | bch2_btree_node_iter_bset_pos(iter, b, t), | |
1695 | min_key_type); | |
1696 | if (k && | |
1697 | (!prev || __btree_node_iter_cmp(iter->is_extents, b, | |
1698 | k, prev) > 0)) { | |
1699 | prev = k; | |
1700 | end = t->end_offset; | |
1701 | } | |
1702 | } | |
1703 | ||
1704 | if (!prev) | |
1705 | goto out; | |
1706 | ||
1707 | /* | |
1708 | * We're manually memmoving instead of just calling sort() to ensure the | |
1709 | * prev we picked ends up in slot 0 - sort won't necessarily put it | |
1710 | * there because of duplicate deleted keys: | |
1711 | */ | |
1712 | btree_node_iter_for_each(iter, set) | |
1713 | if (set->end == end) | |
1714 | goto found; | |
1715 | ||
1716 | BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); | |
1717 | found: | |
1718 | BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); | |
1719 | ||
1720 | memmove(&iter->data[1], | |
1721 | &iter->data[0], | |
1722 | (void *) set - (void *) &iter->data[0]); | |
1723 | ||
1724 | iter->data[0].k = __btree_node_key_to_offset(b, prev); | |
1725 | iter->data[0].end = end; | |
1726 | out: | |
1727 | if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { | |
1728 | struct btree_node_iter iter2 = *iter; | |
1729 | ||
1730 | if (prev) | |
1731 | bch2_btree_node_iter_advance(&iter2, b); | |
1732 | ||
1733 | while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) { | |
1734 | BUG_ON(k->type >= min_key_type); | |
1735 | bch2_btree_node_iter_advance(&iter2, b); | |
1736 | } | |
1737 | } | |
1738 | ||
1739 | return prev; | |
1740 | } | |
1741 | ||
1742 | struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, | |
1743 | struct btree *b, | |
1744 | struct bkey *u) | |
1745 | { | |
1746 | struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); | |
1747 | ||
1748 | return k ? bkey_disassemble(b, k, u) : bkey_s_c_null; | |
1749 | } | |
1750 | ||
1751 | /* Mergesort */ | |
1752 | ||
1753 | void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) | |
1754 | { | |
1755 | struct bset_tree *t; | |
1756 | ||
1757 | for_each_bset(b, t) { | |
1758 | enum bset_aux_tree_type type = bset_aux_tree_type(t); | |
1759 | size_t j; | |
1760 | ||
1761 | stats->sets[type].nr++; | |
1762 | stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * | |
1763 | sizeof(u64); | |
1764 | ||
1765 | if (bset_has_ro_aux_tree(t)) { | |
1766 | stats->floats += t->size - 1; | |
1767 | ||
1768 | for (j = 1; j < t->size; j++) | |
1769 | switch (bkey_float(b, t, j)->exponent) { | |
1770 | case BFLOAT_FAILED_UNPACKED: | |
1771 | stats->failed_unpacked++; | |
1772 | break; | |
1773 | case BFLOAT_FAILED_PREV: | |
1774 | stats->failed_prev++; | |
1775 | break; | |
1776 | case BFLOAT_FAILED_OVERFLOW: | |
1777 | stats->failed_overflow++; | |
1778 | break; | |
1779 | } | |
1780 | } | |
1781 | } | |
1782 | } | |
1783 | ||
1784 | int bch2_bkey_print_bfloat(struct btree *b, struct bkey_packed *k, | |
1785 | char *buf, size_t size) | |
1786 | { | |
1787 | struct bset_tree *t = bch2_bkey_to_bset(b, k); | |
1788 | struct bkey_packed *l, *r, *p; | |
1789 | struct bkey uk, up; | |
1790 | char buf1[200], buf2[200]; | |
1791 | unsigned j; | |
1792 | ||
1793 | if (!size) | |
1794 | return 0; | |
1795 | ||
1796 | if (!bset_has_ro_aux_tree(t)) | |
1797 | goto out; | |
1798 | ||
1799 | j = __inorder_to_eytzinger1(bkey_to_cacheline(b, t, k), t->size, t->extra); | |
1800 | if (j && | |
1801 | j < t->size && | |
1802 | k == tree_to_bkey(b, t, j)) | |
1803 | switch (bkey_float(b, t, j)->exponent) { | |
1804 | case BFLOAT_FAILED_UNPACKED: | |
1805 | uk = bkey_unpack_key(b, k); | |
1806 | return scnprintf(buf, size, | |
1807 | " failed unpacked at depth %u\n" | |
1808 | "\t%llu:%llu\n", | |
1809 | ilog2(j), | |
1810 | uk.p.inode, uk.p.offset); | |
1811 | case BFLOAT_FAILED_PREV: | |
1812 | p = tree_to_prev_bkey(b, t, j); | |
1813 | l = is_power_of_2(j) | |
1814 | ? btree_bkey_first(b, t) | |
1815 | : tree_to_prev_bkey(b, t, j >> ffs(j)); | |
1816 | r = is_power_of_2(j + 1) | |
1817 | ? bch2_bkey_prev_all(b, t, btree_bkey_last(b, t)) | |
1818 | : tree_to_bkey(b, t, j >> (ffz(j) + 1)); | |
1819 | ||
1820 | up = bkey_unpack_key(b, p); | |
1821 | uk = bkey_unpack_key(b, k); | |
1822 | bch2_to_binary(buf1, high_word(&b->format, p), b->nr_key_bits); | |
1823 | bch2_to_binary(buf2, high_word(&b->format, k), b->nr_key_bits); | |
1824 | ||
1825 | return scnprintf(buf, size, | |
1826 | " failed prev at depth %u\n" | |
1827 | "\tkey starts at bit %u but first differing bit at %u\n" | |
1828 | "\t%llu:%llu\n" | |
1829 | "\t%llu:%llu\n" | |
1830 | "\t%s\n" | |
1831 | "\t%s\n", | |
1832 | ilog2(j), | |
1833 | bch2_bkey_greatest_differing_bit(b, l, r), | |
1834 | bch2_bkey_greatest_differing_bit(b, p, k), | |
1835 | uk.p.inode, uk.p.offset, | |
1836 | up.p.inode, up.p.offset, | |
1837 | buf1, buf2); | |
1838 | case BFLOAT_FAILED_OVERFLOW: | |
1839 | uk = bkey_unpack_key(b, k); | |
1840 | return scnprintf(buf, size, | |
1841 | " failed overflow at depth %u\n" | |
1842 | "\t%llu:%llu\n", | |
1843 | ilog2(j), | |
1844 | uk.p.inode, uk.p.offset); | |
1845 | } | |
1846 | out: | |
1847 | *buf = '\0'; | |
1848 | return 0; | |
1849 | } |