// SPDX-License-Identifier: GPL-2.0
/*
 * Code for working with individual keys, and sorted sets of keys within a
 * btree node
 *
 * Copyright 2012 Google, Inc.
 */
8 | ||
9 | #include "bcachefs.h" | |
10 | #include "btree_cache.h" | |
11 | #include "bset.h" | |
12 | #include "eytzinger.h" | |
13 | #include "trace.h" | |
14 | #include "util.h" | |
15 | ||
5f60d5f6 | 16 | #include <linux/unaligned.h> |
1c6fdbd8 KO |
17 | #include <linux/console.h> |
18 | #include <linux/random.h> | |
19 | #include <linux/prefetch.h> | |
20 | ||
271a3d3a KO |
21 | static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, |
22 | struct btree *); | |
23 | ||
63f1a598 KO |
24 | static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) |
25 | { | |
26 | unsigned n = ARRAY_SIZE(iter->data); | |
27 | ||
28 | while (n && __btree_node_iter_set_end(iter, n - 1)) | |
29 | --n; | |
30 | ||
31 | return n; | |
32 | } | |
33 | ||
1c6fdbd8 KO |
34 | struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) |
35 | { | |
216c9fac | 36 | return bch2_bkey_to_bset_inlined(b, k); |
1c6fdbd8 KO |
37 | } |
38 | ||
39 | /* | |
40 | * There are never duplicate live keys in the btree - but including keys that | |
41 | * have been flagged as deleted (and will be cleaned up later) we _will_ see | |
42 | * duplicates. | |
43 | * | |
44 | * Thus the sort order is: usual key comparison first, but for keys that compare | |
45 | * equal the deleted key(s) come first, and the (at most one) live version comes | |
46 | * last. | |
47 | * | |
48 | * The main reason for this is insertion: to handle overwrites, we first iterate | |
49 | * over keys that compare equal to our insert key, and then insert immediately | |
50 | * prior to the first key greater than the key we're inserting - our insert | |
51 | * position will be after all keys that compare equal to our insert key, which | |
52 | * by the time we actually do the insert will all be deleted. | |
53 | */ | |
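
/*
 * Illustrative sketch (editor's addition, not from the original source): with
 * two stale versions of a key A and one live one, a bset orders them as
 *
 *	A(deleted) A(deleted) A(live) B ...
 *
 * so inserting A means scanning forward past every key that compares equal to
 * A (all of which will be deleted by the time the insert happens) and placing
 * the new key immediately before B.
 */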
54 | ||
a34782a0 KO |
55 | void bch2_dump_bset(struct bch_fs *c, struct btree *b, |
56 | struct bset *i, unsigned set) | |
1c6fdbd8 KO |
57 | { |
58 | struct bkey_packed *_k, *_n; | |
a34782a0 KO |
59 | struct bkey uk, n; |
60 | struct bkey_s_c k; | |
fa8e94fa | 61 | struct printbuf buf = PRINTBUF; |
1c6fdbd8 KO |
62 | |
63 | if (!i->u64s) | |
64 | return; | |
65 | ||
a34782a0 | 66 | for (_k = i->start; |
1c6fdbd8 | 67 | _k < vstruct_last(i); |
a34782a0 | 68 | _k = _n) { |
ac2ccddc | 69 | _n = bkey_p_next(_k); |
1c6fdbd8 | 70 | |
d04d2727 KO |
71 | if (!_k->u64s) { |
72 | printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set, | |
73 | _k->_data - i->_data); | |
74 | break; | |
75 | } | |
76 | ||
a34782a0 | 77 | k = bkey_disassemble(b, _k, &uk); |
fa8e94fa KO |
78 | |
79 | printbuf_reset(&buf); | |
a34782a0 | 80 | if (c) |
fa8e94fa | 81 | bch2_bkey_val_to_text(&buf, c, k); |
a34782a0 | 82 | else |
fa8e94fa | 83 | bch2_bkey_to_text(&buf, k.k); |
24e0c3f8 | 84 | printk(KERN_ERR "block %u key %5zu: %s\n", set, |
fa8e94fa | 85 | _k->_data - i->_data, buf.buf); |
1c6fdbd8 KO |
86 | |
87 | if (_n == vstruct_last(i)) | |
88 | continue; | |
89 | ||
90 | n = bkey_unpack_key(b, _n); | |
91 | ||
e88a75eb | 92 | if (bpos_lt(n.p, k.k->p)) { |
1c6fdbd8 KO |
93 | printk(KERN_ERR "Key skipped backwards\n"); |
94 | continue; | |
95 | } | |
96 | ||
e88a75eb | 97 | if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p)) |
1c6fdbd8 KO |
98 | printk(KERN_ERR "Duplicate keys\n"); |
99 | } | |
fa8e94fa KO |
100 | |
101 | printbuf_exit(&buf); | |
1c6fdbd8 KO |
102 | } |
103 | ||
a34782a0 | 104 | void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) |
1c6fdbd8 | 105 | { |
1c6fdbd8 KO |
106 | console_lock(); |
107 | for_each_bset(b, t) | |
a34782a0 | 108 | bch2_dump_bset(c, b, bset(b, t), t - b->set); |
1c6fdbd8 KO |
109 | console_unlock(); |
110 | } | |
111 | ||
112 | void bch2_dump_btree_node_iter(struct btree *b, | |
113 | struct btree_node_iter *iter) | |
114 | { | |
115 | struct btree_node_iter_set *set; | |
fa8e94fa | 116 | struct printbuf buf = PRINTBUF; |
1c6fdbd8 | 117 | |
63f1a598 KO |
118 | printk(KERN_ERR "btree node iter with %u/%u sets:\n", |
119 | __btree_node_iter_used(iter), b->nsets); | |
1c6fdbd8 KO |
120 | |
121 | btree_node_iter_for_each(iter, set) { | |
122 | struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); | |
123 | struct bset_tree *t = bch2_bkey_to_bset(b, k); | |
124 | struct bkey uk = bkey_unpack_key(b, k); | |
1c6fdbd8 | 125 | |
fa8e94fa KO |
126 | printbuf_reset(&buf); |
127 | bch2_bkey_to_text(&buf, &uk); | |
63f1a598 | 128 | printk(KERN_ERR "set %zu key %u: %s\n", |
fa8e94fa | 129 | t - b->set, set->k, buf.buf); |
1c6fdbd8 | 130 | } |
fa8e94fa KO |
131 | |
132 | printbuf_exit(&buf); | |
1c6fdbd8 KO |
133 | } |
134 | ||
812a9297 | 135 | struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) |
1c6fdbd8 | 136 | { |
1c6fdbd8 | 137 | struct bkey_packed *k; |
812a9297 | 138 | struct btree_nr_keys nr = {}; |
1c6fdbd8 KO |
139 | |
140 | for_each_bset(b, t) | |
ad44bdc3 | 141 | bset_tree_for_each_key(b, t, k) |
c052cf82 | 142 | if (!bkey_deleted(k)) |
1c6fdbd8 | 143 | btree_keys_account_key_add(&nr, t - b->set, k); |
812a9297 KO |
144 | return nr; |
145 | } | |
146 | ||
812a9297 KO |
147 | void __bch2_verify_btree_nr_keys(struct btree *b) |
148 | { | |
149 | struct btree_nr_keys nr = bch2_btree_node_count_keys(b); | |
1c6fdbd8 KO |
150 | |
151 | BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); | |
152 | } | |
153 | ||
34aeb820 | 154 | static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, |
271a3d3a | 155 | struct btree *b) |
1c6fdbd8 | 156 | { |
271a3d3a KO |
157 | struct btree_node_iter iter = *_iter; |
158 | const struct bkey_packed *k, *n; | |
159 | ||
160 | k = bch2_btree_node_iter_peek_all(&iter, b); | |
161 | __bch2_btree_node_iter_advance(&iter, b); | |
162 | n = bch2_btree_node_iter_peek_all(&iter, b); | |
1c6fdbd8 KO |
163 | |
164 | bkey_unpack_key(b, k); | |
165 | ||
166 | if (n && | |
a00fd8c5 | 167 | bkey_iter_cmp(b, k, n) > 0) { |
271a3d3a | 168 | struct btree_node_iter_set *set; |
1c6fdbd8 KO |
169 | struct bkey ku = bkey_unpack_key(b, k); |
170 | struct bkey nu = bkey_unpack_key(b, n); | |
fa8e94fa KO |
171 | struct printbuf buf1 = PRINTBUF; |
172 | struct printbuf buf2 = PRINTBUF; | |
1c6fdbd8 | 173 | |
a34782a0 | 174 | bch2_dump_btree_node(NULL, b); |
fa8e94fa KO |
175 | bch2_bkey_to_text(&buf1, &ku); |
176 | bch2_bkey_to_text(&buf2, &nu); | |
271a3d3a | 177 | printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", |
fa8e94fa | 178 | buf1.buf, buf2.buf); |
271a3d3a KO |
179 | printk(KERN_ERR "iter was:"); |
180 | ||
181 | btree_node_iter_for_each(_iter, set) { | |
96dea3d5 KO |
182 | struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k); |
183 | struct bset_tree *t = bch2_bkey_to_bset(b, k2); | |
271a3d3a | 184 | printk(" [%zi %zi]", t - b->set, |
96dea3d5 | 185 | k2->_data - bset(b, t)->_data); |
271a3d3a KO |
186 | } |
187 | panic("\n"); | |
1c6fdbd8 KO |
188 | } |
189 | } | |
190 | ||
34aeb820 KO |
191 | void __bch2_btree_node_iter_verify(struct btree_node_iter *iter, |
192 | struct btree *b) | |
1c6fdbd8 | 193 | { |
1fe08f31 | 194 | struct btree_node_iter_set *set, *s2; |
63f1a598 | 195 | struct bkey_packed *k, *p; |
1c6fdbd8 | 196 | |
63f1a598 KO |
197 | if (bch2_btree_node_iter_end(iter)) |
198 | return; | |
199 | ||
1fe08f31 | 200 | /* Verify no duplicates: */ |
67e0dd8f KO |
201 | btree_node_iter_for_each(iter, set) { |
202 | BUG_ON(set->k > set->end); | |
1fe08f31 KO |
203 | btree_node_iter_for_each(iter, s2) |
204 | BUG_ON(set != s2 && set->end == s2->end); | |
67e0dd8f | 205 | } |
1c6fdbd8 | 206 | |
1fe08f31 | 207 | /* Verify that set->end is correct: */ |
1c6fdbd8 | 208 | btree_node_iter_for_each(iter, set) { |
1fe08f31 | 209 | for_each_bset(b, t) |
b6fb4269 KO |
210 | if (set->end == t->end_offset) { |
211 | BUG_ON(set->k < btree_bkey_first_offset(t) || | |
212 | set->k >= t->end_offset); | |
1fe08f31 | 213 | goto found; |
b6fb4269 | 214 | } |
1fe08f31 KO |
215 | BUG(); |
216 | found: | |
b6fb4269 | 217 | do {} while (0); |
1c6fdbd8 KO |
218 | } |
219 | ||
1fe08f31 KO |
220 | /* Verify iterator is sorted: */ |
221 | btree_node_iter_for_each(iter, set) | |
222 | BUG_ON(set != iter->data && | |
271a3d3a | 223 | btree_node_iter_cmp(b, set[-1], set[0]) > 0); |
63f1a598 KO |
224 | |
225 | k = bch2_btree_node_iter_peek_all(iter, b); | |
226 | ||
227 | for_each_bset(b, t) { | |
228 | if (iter->data[0].end == t->end_offset) | |
229 | continue; | |
230 | ||
231 | p = bch2_bkey_prev_all(b, t, | |
232 | bch2_btree_node_iter_bset_pos(iter, b, t)); | |
233 | ||
234 | BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); | |
235 | } | |
1c6fdbd8 KO |
236 | } |
237 | ||
34aeb820 KO |
238 | static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, |
239 | struct bkey_packed *insert, unsigned clobber_u64s) | |
1c6fdbd8 KO |
240 | { |
241 | struct bset_tree *t = bch2_bkey_to_bset(b, where); | |
271a3d3a | 242 | struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); |
5cfd6977 | 243 | struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s); |
fa8e94fa KO |
244 | struct printbuf buf1 = PRINTBUF; |
245 | struct printbuf buf2 = PRINTBUF; | |
271a3d3a KO |
246 | #if 0 |
247 | BUG_ON(prev && | |
a00fd8c5 | 248 | bkey_iter_cmp(b, prev, insert) > 0); |
271a3d3a KO |
249 | #else |
250 | if (prev && | |
a00fd8c5 | 251 | bkey_iter_cmp(b, prev, insert) > 0) { |
271a3d3a KO |
252 | struct bkey k1 = bkey_unpack_key(b, prev); |
253 | struct bkey k2 = bkey_unpack_key(b, insert); | |
1c6fdbd8 | 254 | |
a34782a0 | 255 | bch2_dump_btree_node(NULL, b); |
fa8e94fa KO |
256 | bch2_bkey_to_text(&buf1, &k1); |
257 | bch2_bkey_to_text(&buf2, &k2); | |
271a3d3a KO |
258 | |
259 | panic("prev > insert:\n" | |
c201e2d9 KO |
260 | "prev key %s\n" |
261 | "insert key %s\n", | |
fa8e94fa | 262 | buf1.buf, buf2.buf); |
1c6fdbd8 | 263 | } |
271a3d3a KO |
264 | #endif |
265 | #if 0 | |
266 | BUG_ON(next != btree_bkey_last(b, t) && | |
a00fd8c5 | 267 | bkey_iter_cmp(b, insert, next) > 0); |
271a3d3a KO |
268 | #else |
269 | if (next != btree_bkey_last(b, t) && | |
a00fd8c5 | 270 | bkey_iter_cmp(b, insert, next) > 0) { |
271a3d3a KO |
271 | struct bkey k1 = bkey_unpack_key(b, insert); |
272 | struct bkey k2 = bkey_unpack_key(b, next); | |
1c6fdbd8 | 273 | |
a34782a0 | 274 | bch2_dump_btree_node(NULL, b); |
fa8e94fa KO |
275 | bch2_bkey_to_text(&buf1, &k1); |
276 | bch2_bkey_to_text(&buf2, &k2); | |
271a3d3a KO |
277 | |
278 | panic("insert > next:\n" | |
c201e2d9 KO |
279 | "insert key %s\n" |
280 | "next key %s\n", | |
fa8e94fa | 281 | buf1.buf, buf2.buf); |
1c6fdbd8 | 282 | } |
271a3d3a KO |
283 | #endif |
284 | } | |
285 | ||
34aeb820 KO |
286 | static inline void bch2_verify_insert_pos(struct btree *b, |
287 | struct bkey_packed *where, | |
288 | struct bkey_packed *insert, | |
289 | unsigned clobber_u64s) | |
290 | { | |
291 | if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) | |
292 | __bch2_verify_insert_pos(b, where, insert, clobber_u64s); | |
293 | } | |

/* Auxiliary search trees */

#define BFLOAT_FAILED_UNPACKED	U8_MAX
#define BFLOAT_FAILED		U8_MAX

struct bkey_float {
	u8		exponent;
	u8		key_offset;
	u16		mantissa;
};
#define BKEY_MANTISSA_BITS	16

struct ro_aux_tree {
	u8			nothing[0];
	struct bkey_float	f[];
};

struct rw_aux_tree {
	u16		offset;
	struct bpos	k;
};
317 | ||
1c6fdbd8 KO |
318 | static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) |
319 | { | |
320 | BUG_ON(t->aux_data_offset == U16_MAX); | |
321 | ||
322 | switch (bset_aux_tree_type(t)) { | |
323 | case BSET_NO_AUX_TREE: | |
324 | return t->aux_data_offset; | |
325 | case BSET_RO_AUX_TREE: | |
326 | return t->aux_data_offset + | |
3130303b | 327 | DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8); |
1c6fdbd8 KO |
328 | case BSET_RW_AUX_TREE: |
329 | return t->aux_data_offset + | |
330 | DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); | |
331 | default: | |
332 | BUG(); | |
333 | } | |
334 | } | |
335 | ||
336 | static unsigned bset_aux_tree_buf_start(const struct btree *b, | |
337 | const struct bset_tree *t) | |
338 | { | |
339 | return t == b->set | |
340 | ? DIV_ROUND_UP(b->unpack_fn_len, 8) | |
341 | : bset_aux_tree_buf_end(t - 1); | |
342 | } | |
343 | ||
344 | static void *__aux_tree_base(const struct btree *b, | |
345 | const struct bset_tree *t) | |
346 | { | |
347 | return b->aux_data + t->aux_data_offset * 8; | |
348 | } | |
349 | ||
350 | static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, | |
351 | const struct bset_tree *t) | |
352 | { | |
353 | EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); | |
354 | ||
355 | return __aux_tree_base(b, t); | |
356 | } | |
357 | ||
1c6fdbd8 KO |
358 | static struct bkey_float *bkey_float(const struct btree *b, |
359 | const struct bset_tree *t, | |
360 | unsigned idx) | |
361 | { | |
b904a799 | 362 | return ro_aux_tree_base(b, t)->f + idx; |
1c6fdbd8 KO |
363 | } |
364 | ||
34aeb820 | 365 | static void __bset_aux_tree_verify(struct btree *b) |
1c6fdbd8 | 366 | { |
1c6fdbd8 KO |
367 | for_each_bset(b, t) { |
368 | if (t->aux_data_offset == U16_MAX) | |
369 | continue; | |
370 | ||
371 | BUG_ON(t != b->set && | |
372 | t[-1].aux_data_offset == U16_MAX); | |
373 | ||
374 | BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); | |
375 | BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); | |
376 | BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); | |
377 | } | |
34aeb820 KO |
378 | } |
379 | ||
380 | static inline void bset_aux_tree_verify(struct btree *b) | |
381 | { | |
382 | if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) | |
383 | __bset_aux_tree_verify(b); | |
1c6fdbd8 KO |
384 | } |
385 | ||
29364f34 | 386 | void bch2_btree_keys_init(struct btree *b) |
1c6fdbd8 KO |
387 | { |
388 | unsigned i; | |
389 | ||
390 | b->nsets = 0; | |
391 | memset(&b->nr, 0, sizeof(b->nr)); | |
29364f34 | 392 | |
1c6fdbd8 KO |
393 | for (i = 0; i < MAX_BSETS; i++) |
394 | b->set[i].data_offset = U16_MAX; | |
395 | ||
396 | bch2_bset_set_no_aux_tree(b, b->set); | |
397 | } | |
398 | ||
399 | /* Binary tree stuff for auxiliary search trees */ | |
400 | ||
401 | /* | |
402 | * Cacheline/offset <-> bkey pointer arithmetic: | |
403 | * | |
404 | * t->tree is a binary search tree in an array; each node corresponds to a key | |
405 | * in one cacheline in t->set (BSET_CACHELINE bytes). | |
406 | * | |
407 | * This means we don't have to store the full index of the key that a node in | |
408 | * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and | |
409 | * then bkey_float->m gives us the offset within that cacheline, in units of 8 | |
410 | * bytes. | |
411 | * | |
412 | * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to | |
413 | * make this work. | |
414 | * | |
415 | * To construct the bfloat for an arbitrary key we need to know what the key | |
416 | * immediately preceding it is: we have to check if the two keys differ in the | |
417 | * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size | |
418 | * of the previous key so we can walk backwards to it from t->tree[j]'s key. | |
419 | */ | |
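
/*
 * Worked example (editor's addition, assuming BSET_CACHELINE is 256 bytes):
 * if eytzinger1_to_inorder() maps tree node j to cacheline 3, and the node's
 * bkey_float->key_offset is 5, then the key it refers to lives at
 *
 *	bset_cacheline(b, t, 3) + 5 * 8
 *
 * i.e. 40 bytes into the fourth BSET_CACHELINE-sized chunk of the bset -
 * which is exactly the computation cacheline_to_bkey() below performs.
 */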
420 | ||
421 | static inline void *bset_cacheline(const struct btree *b, | |
422 | const struct bset_tree *t, | |
423 | unsigned cacheline) | |
424 | { | |
425 | return (void *) round_down((unsigned long) btree_bkey_first(b, t), | |
426 | L1_CACHE_BYTES) + | |
427 | cacheline * BSET_CACHELINE; | |
428 | } | |
429 | ||
430 | static struct bkey_packed *cacheline_to_bkey(const struct btree *b, | |
431 | const struct bset_tree *t, | |
432 | unsigned cacheline, | |
433 | unsigned offset) | |
434 | { | |
435 | return bset_cacheline(b, t, cacheline) + offset * 8; | |
436 | } | |
437 | ||
438 | static unsigned bkey_to_cacheline(const struct btree *b, | |
439 | const struct bset_tree *t, | |
440 | const struct bkey_packed *k) | |
441 | { | |
442 | return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; | |
443 | } | |
444 | ||
445 | static ssize_t __bkey_to_cacheline_offset(const struct btree *b, | |
446 | const struct bset_tree *t, | |
447 | unsigned cacheline, | |
448 | const struct bkey_packed *k) | |
449 | { | |
450 | return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); | |
451 | } | |
452 | ||
453 | static unsigned bkey_to_cacheline_offset(const struct btree *b, | |
454 | const struct bset_tree *t, | |
455 | unsigned cacheline, | |
456 | const struct bkey_packed *k) | |
457 | { | |
458 | size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); | |
459 | ||
460 | EBUG_ON(m > U8_MAX); | |
461 | return m; | |
462 | } | |
463 | ||
464 | static inline struct bkey_packed *tree_to_bkey(const struct btree *b, | |
465 | const struct bset_tree *t, | |
466 | unsigned j) | |
467 | { | |
468 | return cacheline_to_bkey(b, t, | |
72492d55 | 469 | __eytzinger1_to_inorder(j, t->size - 1, t->extra), |
1c6fdbd8 KO |
470 | bkey_float(b, t, j)->key_offset); |
471 | } | |
472 | ||
1c6fdbd8 KO |
473 | static struct rw_aux_tree *rw_aux_tree(const struct btree *b, |
474 | const struct bset_tree *t) | |
475 | { | |
476 | EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); | |
477 | ||
478 | return __aux_tree_base(b, t); | |
479 | } | |
480 | ||
481 | /* | |
482 | * For the write set - the one we're currently inserting keys into - we don't | |
483 | * maintain a full search tree, we just keep a simple lookup table in t->prev. | |
484 | */ | |
485 | static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, | |
486 | struct bset_tree *t, | |
487 | unsigned j) | |
488 | { | |
489 | return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); | |
490 | } | |
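
/*
 * Editor's note: the rw aux tree is nothing more than a sorted array of
 * (key offset, unpacked bpos) pairs, roughly one per cacheline's worth of
 * keys. Lookups binary search it (bset_search_write_set()) and inserts patch
 * it up incrementally (bch2_bset_fix_lookup_table()), which is far cheaper
 * than rebuilding a full eytzinger tree on every insert.
 */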
491 | ||
492 | static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, | |
493 | unsigned j, struct bkey_packed *k) | |
494 | { | |
495 | EBUG_ON(k >= btree_bkey_last(b, t)); | |
496 | ||
497 | rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { | |
498 | .offset = __btree_node_key_to_offset(b, k), | |
499 | .k = bkey_unpack_pos(b, k), | |
500 | }; | |
501 | } | |
502 | ||
34aeb820 | 503 | static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t) |
1c6fdbd8 KO |
504 | { |
505 | struct bkey_packed *k = btree_bkey_first(b, t); | |
506 | unsigned j = 0; | |
507 | ||
1c6fdbd8 KO |
508 | BUG_ON(bset_has_ro_aux_tree(t)); |
509 | ||
510 | if (!bset_has_rw_aux_tree(t)) | |
511 | return; | |
512 | ||
513 | BUG_ON(t->size < 1); | |
514 | BUG_ON(rw_aux_to_bkey(b, t, j) != k); | |
515 | ||
516 | goto start; | |
517 | while (1) { | |
518 | if (rw_aux_to_bkey(b, t, j) == k) { | |
e88a75eb | 519 | BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k, |
1c6fdbd8 KO |
520 | bkey_unpack_pos(b, k))); |
521 | start: | |
522 | if (++j == t->size) | |
523 | break; | |
524 | ||
525 | BUG_ON(rw_aux_tree(b, t)[j].offset <= | |
526 | rw_aux_tree(b, t)[j - 1].offset); | |
527 | } | |
528 | ||
ac2ccddc | 529 | k = bkey_p_next(k); |
1c6fdbd8 KO |
530 | BUG_ON(k >= btree_bkey_last(b, t)); |
531 | } | |
532 | } | |
533 | ||
34aeb820 KO |
534 | static inline void bch2_bset_verify_rw_aux_tree(struct btree *b, |
535 | struct bset_tree *t) | |
536 | { | |
537 | if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) | |
538 | __bch2_bset_verify_rw_aux_tree(b, t); | |
539 | } | |
540 | ||
1c6fdbd8 KO |
541 | /* returns idx of first entry >= offset: */ |
542 | static unsigned rw_aux_tree_bsearch(struct btree *b, | |
543 | struct bset_tree *t, | |
544 | unsigned offset) | |
545 | { | |
617391ba KO |
546 | unsigned bset_offs = offset - btree_bkey_first_offset(t); |
547 | unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); | |
548 | unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0; | |
1c6fdbd8 KO |
549 | |
550 | EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); | |
617391ba KO |
551 | EBUG_ON(!t->size); |
552 | EBUG_ON(idx > t->size); | |
1c6fdbd8 | 553 | |
617391ba KO |
554 | while (idx < t->size && |
555 | rw_aux_tree(b, t)[idx].offset < offset) | |
556 | idx++; | |
1c6fdbd8 | 557 | |
617391ba KO |
558 | while (idx && |
559 | rw_aux_tree(b, t)[idx - 1].offset >= offset) | |
560 | idx--; | |
1c6fdbd8 | 561 | |
617391ba KO |
562 | EBUG_ON(idx < t->size && |
563 | rw_aux_tree(b, t)[idx].offset < offset); | |
564 | EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); | |
565 | EBUG_ON(idx + 1 < t->size && | |
566 | rw_aux_tree(b, t)[idx].offset == | |
567 | rw_aux_tree(b, t)[idx + 1].offset); | |
1c6fdbd8 | 568 | |
617391ba | 569 | return idx; |
1c6fdbd8 KO |
570 | } |
571 | ||
1c6fdbd8 | 572 | static inline unsigned bkey_mantissa(const struct bkey_packed *k, |
2a463e94 | 573 | const struct bkey_float *f) |
1c6fdbd8 KO |
574 | { |
575 | u64 v; | |
576 | ||
577 | EBUG_ON(!bkey_packed(k)); | |
578 | ||
579 | v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); | |
580 | ||
581 | /* | |
582 | * In little endian, we're shifting off low bits (and then the bits we | |
583 | * want are at the low end), in big endian we're shifting off high bits | |
584 | * (and then the bits we want are at the high end, so we shift them | |
585 | * back down): | |
586 | */ | |
587 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |
588 | v >>= f->exponent & 7; | |
589 | #else | |
b904a799 | 590 | v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; |
1c6fdbd8 | 591 | #endif |
b904a799 | 592 | return (u16) v; |
1c6fdbd8 KO |
593 | } |
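
/*
 * Worked example (editor's addition): with f->exponent == 13, the u64 at
 * byte offset 13 >> 3 = 1 into the packed key is loaded, then on little
 * endian shifted right by 13 & 7 = 5 - leaving bits 13..28 of the key in
 * the low bits of v, of which the low BKEY_MANTISSA_BITS (16) are kept.
 */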
594 | ||
73bd774d KO |
595 | static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, |
596 | unsigned j, | |
597 | struct bkey_packed *min_key, | |
598 | struct bkey_packed *max_key) | |
1c6fdbd8 KO |
599 | { |
600 | struct bkey_float *f = bkey_float(b, t, j); | |
601 | struct bkey_packed *m = tree_to_bkey(b, t, j); | |
9ae82fe6 KO |
602 | struct bkey_packed *l = is_power_of_2(j) |
603 | ? min_key | |
5d011012 | 604 | : tree_to_bkey(b, t, j >> ffs(j)); |
9ae82fe6 KO |
605 | struct bkey_packed *r = is_power_of_2(j + 1) |
606 | ? max_key | |
607 | : tree_to_bkey(b, t, j >> (ffz(j) + 1)); | |
1c6fdbd8 KO |
608 | unsigned mantissa; |
609 | int shift, exponent, high_bit; | |
610 | ||
1c6fdbd8 KO |
611 | /* |
612 | * for failed bfloats, the lookup code falls back to comparing against | |
613 | * the original key. | |
614 | */ | |
615 | ||
1bdb67e8 | 616 | if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || |
1c6fdbd8 KO |
617 | !b->nr_key_bits) { |
618 | f->exponent = BFLOAT_FAILED_UNPACKED; | |
619 | return; | |
620 | } | |
621 | ||
622 | /* | |
623 | * The greatest differing bit of l and r is the first bit we must | |
624 | * include in the bfloat mantissa we're creating in order to do | |
625 | * comparisons - that bit always becomes the high bit of | |
626 | * bfloat->mantissa, and thus the exponent we're calculating here is | |
627 | * the position of what will become the low bit in bfloat->mantissa: | |
628 | * | |
629 | * Note that this may be negative - we may be running off the low end | |
630 | * of the key: we handle this later: | |
631 | */ | |
632 | high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), | |
b904a799 KO |
633 | min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); |
634 | exponent = high_bit - (BKEY_MANTISSA_BITS - 1); | |
1c6fdbd8 KO |
635 | |
636 | /* | |
637 | * Then we calculate the actual shift value, from the start of the key | |
638 | * (k->_data), to get the key bits starting at exponent: | |
639 | */ | |
640 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |
641 | shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; | |
642 | ||
b904a799 | 643 | EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); |
1c6fdbd8 KO |
644 | #else |
645 | shift = high_bit_offset + | |
646 | b->nr_key_bits - | |
647 | exponent - | |
b904a799 | 648 | BKEY_MANTISSA_BITS; |
1c6fdbd8 KO |
649 | |
650 | EBUG_ON(shift < KEY_PACKED_BITS_START); | |
651 | #endif | |
652 | EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); | |
653 | ||
654 | f->exponent = shift; | |
2a463e94 | 655 | mantissa = bkey_mantissa(m, f); |
1c6fdbd8 KO |
656 | |
657 | /* | |
658 | * If we've got garbage bits, set them to all 1s - it's legal for the | |
659 | * bfloat to compare larger than the original key, but not smaller: | |
660 | */ | |
661 | if (exponent < 0) | |
662 | mantissa |= ~(~0U << -exponent); | |
663 | ||
b904a799 | 664 | f->mantissa = mantissa; |
1c6fdbd8 KO |
665 | } |
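
/*
 * Editor's sketch of the arithmetic above: if l and r first differ at bit 40
 * of the key, then high_bit = 40 and exponent = 40 - 15 = 25, so the 16 bit
 * mantissa covers key bits 25..40. Keys that differ from m somewhere in that
 * range are decided by the mantissa comparison alone; ties fall back to a
 * full key comparison (the slowpath in bset_search_tree()).
 */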
666 | ||
667 | /* bytes remaining - only valid for last bset: */ | |
b6fb4269 | 668 | static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t) |
1c6fdbd8 KO |
669 | { |
670 | bset_aux_tree_verify(b); | |
671 | ||
672 | return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); | |
673 | } | |
674 | ||
b6fb4269 | 675 | static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t) |
1c6fdbd8 | 676 | { |
3130303b | 677 | return __bset_tree_capacity(b, t) / sizeof(struct bkey_float); |
1c6fdbd8 KO |
678 | } |
679 | ||
b6fb4269 | 680 | static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t) |
1c6fdbd8 KO |
681 | { |
682 | return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); | |
683 | } | |
684 | ||
9ae82fe6 | 685 | static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) |
1c6fdbd8 KO |
686 | { |
687 | struct bkey_packed *k; | |
688 | ||
689 | t->size = 1; | |
690 | t->extra = BSET_RW_AUX_TREE_VAL; | |
691 | rw_aux_tree(b, t)[0].offset = | |
692 | __btree_node_key_to_offset(b, btree_bkey_first(b, t)); | |
693 | ||
ad44bdc3 | 694 | bset_tree_for_each_key(b, t, k) { |
1c6fdbd8 KO |
695 | if (t->size == bset_rw_tree_capacity(b, t)) |
696 | break; | |
697 | ||
698 | if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > | |
699 | L1_CACHE_BYTES) | |
700 | rw_aux_tree_set(b, t, t->size++, k); | |
701 | } | |
702 | } | |
703 | ||
9ae82fe6 | 704 | static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) |
1c6fdbd8 | 705 | { |
3130303b | 706 | struct bkey_packed *k = btree_bkey_first(b, t); |
9ae82fe6 | 707 | struct bkey_i min_key, max_key; |
3fe8a186 | 708 | unsigned cacheline = 1; |
1c6fdbd8 | 709 | |
1c6fdbd8 KO |
710 | t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), |
711 | bset_ro_tree_capacity(b, t)); | |
712 | retry: | |
713 | if (t->size < 2) { | |
714 | t->size = 0; | |
715 | t->extra = BSET_NO_AUX_TREE_VAL; | |
716 | return; | |
717 | } | |
718 | ||
288a6690 | 719 | t->extra = eytzinger1_extra(t->size - 1); |
1c6fdbd8 KO |
720 | |
721 | /* First we figure out where the first key in each cacheline is */ | |
72492d55 | 722 | eytzinger1_for_each(j, t->size - 1) { |
1c6fdbd8 | 723 | while (bkey_to_cacheline(b, t, k) < cacheline) |
3130303b | 724 | k = bkey_p_next(k); |
1c6fdbd8 KO |
725 | |
726 | if (k >= btree_bkey_last(b, t)) { | |
727 | /* XXX: this path sucks */ | |
728 | t->size--; | |
729 | goto retry; | |
730 | } | |
731 | ||
1c6fdbd8 KO |
732 | bkey_float(b, t, j)->key_offset = |
733 | bkey_to_cacheline_offset(b, t, cacheline++, k); | |
734 | ||
1c6fdbd8 KO |
735 | EBUG_ON(tree_to_bkey(b, t, j) != k); |
736 | } | |
737 | ||
c7e04e22 KO |
738 | if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { |
739 | bkey_init(&min_key.k); | |
740 | min_key.k.p = b->data->min_key; | |
741 | } | |
742 | ||
743 | if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { | |
744 | bkey_init(&max_key.k); | |
3ce8b463 | 745 | max_key.k.p = b->data->max_key; |
c7e04e22 | 746 | } |
9ae82fe6 | 747 | |
1c6fdbd8 | 748 | /* Then we build the tree */ |
72492d55 | 749 | eytzinger1_for_each(j, t->size - 1) |
17563164 KO |
750 | make_bfloat(b, t, j, |
751 | bkey_to_packed(&min_key), | |
752 | bkey_to_packed(&max_key)); | |
1c6fdbd8 KO |
753 | } |
754 | ||
755 | static void bset_alloc_tree(struct btree *b, struct bset_tree *t) | |
756 | { | |
757 | struct bset_tree *i; | |
758 | ||
759 | for (i = b->set; i != t; i++) | |
760 | BUG_ON(bset_has_rw_aux_tree(i)); | |
761 | ||
762 | bch2_bset_set_no_aux_tree(b, t); | |
763 | ||
764 | /* round up to next cacheline: */ | |
765 | t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), | |
766 | SMP_CACHE_BYTES / sizeof(u64)); | |
767 | ||
768 | bset_aux_tree_verify(b); | |
769 | } | |
770 | ||
771 | void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, | |
772 | bool writeable) | |
773 | { | |
774 | if (writeable | |
775 | ? bset_has_rw_aux_tree(t) | |
776 | : bset_has_ro_aux_tree(t)) | |
777 | return; | |
778 | ||
779 | bset_alloc_tree(b, t); | |
780 | ||
781 | if (!__bset_tree_capacity(b, t)) | |
782 | return; | |
783 | ||
784 | if (writeable) | |
785 | __build_rw_aux_tree(b, t); | |
786 | else | |
787 | __build_ro_aux_tree(b, t); | |
788 | ||
789 | bset_aux_tree_verify(b); | |
790 | } | |
791 | ||
792 | void bch2_bset_init_first(struct btree *b, struct bset *i) | |
793 | { | |
794 | struct bset_tree *t; | |
795 | ||
796 | BUG_ON(b->nsets); | |
797 | ||
798 | memset(i, 0, sizeof(*i)); | |
799 | get_random_bytes(&i->seq, sizeof(i->seq)); | |
800 | SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); | |
801 | ||
802 | t = &b->set[b->nsets++]; | |
803 | set_btree_bset(b, t, i); | |
804 | } | |
805 | ||
ec4edd7b | 806 | void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne) |
1c6fdbd8 KO |
807 | { |
808 | struct bset *i = &bne->keys; | |
809 | struct bset_tree *t; | |
810 | ||
ec4edd7b | 811 | BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b)); |
1c6fdbd8 KO |
812 | BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); |
813 | BUG_ON(b->nsets >= MAX_BSETS); | |
814 | ||
815 | memset(i, 0, sizeof(*i)); | |
816 | i->seq = btree_bset_first(b)->seq; | |
817 | SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); | |
818 | ||
819 | t = &b->set[b->nsets++]; | |
820 | set_btree_bset(b, t, i); | |
821 | } | |
822 | ||
823 | /* | |
824 | * find _some_ key in the same bset as @k that precedes @k - not necessarily the | |
825 | * immediate predecessor: | |
826 | */ | |
827 | static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, | |
828 | struct bkey_packed *k) | |
829 | { | |
830 | struct bkey_packed *p; | |
831 | unsigned offset; | |
832 | int j; | |
833 | ||
834 | EBUG_ON(k < btree_bkey_first(b, t) || | |
835 | k > btree_bkey_last(b, t)); | |
836 | ||
837 | if (k == btree_bkey_first(b, t)) | |
838 | return NULL; | |
839 | ||
840 | switch (bset_aux_tree_type(t)) { | |
841 | case BSET_NO_AUX_TREE: | |
842 | p = btree_bkey_first(b, t); | |
843 | break; | |
844 | case BSET_RO_AUX_TREE: | |
845 | j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); | |
846 | ||
847 | do { | |
848 | p = j ? tree_to_bkey(b, t, | |
849 | __inorder_to_eytzinger1(j--, | |
72492d55 | 850 | t->size - 1, t->extra)) |
1c6fdbd8 KO |
851 | : btree_bkey_first(b, t); |
852 | } while (p >= k); | |
853 | break; | |
854 | case BSET_RW_AUX_TREE: | |
855 | offset = __btree_node_key_to_offset(b, k); | |
856 | j = rw_aux_tree_bsearch(b, t, offset); | |
857 | p = j ? rw_aux_to_bkey(b, t, j - 1) | |
858 | : btree_bkey_first(b, t); | |
859 | break; | |
860 | } | |
861 | ||
862 | return p; | |
863 | } | |
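
/*
 * Editor's note: __bkey_prev() only rewinds to _some_ earlier key - the
 * closest preceding aux tree entry - since keys are variable length and can't
 * be walked backwards directly. bch2_bkey_prev_filter() below scans forward
 * from that point to find the true closest predecessor.
 */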
864 | ||
865 | struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, | |
866 | struct bset_tree *t, | |
867 | struct bkey_packed *k, | |
868 | unsigned min_key_type) | |
869 | { | |
870 | struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; | |
871 | ||
872 | while ((p = __bkey_prev(b, t, k)) && !ret) { | |
ac2ccddc | 873 | for (i = p; i != k; i = bkey_p_next(i)) |
1c6fdbd8 KO |
874 | if (i->type >= min_key_type) |
875 | ret = i; | |
876 | ||
877 | k = p; | |
878 | } | |
879 | ||
34aeb820 | 880 | if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { |
1c6fdbd8 KO |
881 | BUG_ON(ret >= orig_k); |
882 | ||
ad44bdc3 | 883 | for (i = ret |
ac2ccddc | 884 | ? bkey_p_next(ret) |
ad44bdc3 | 885 | : btree_bkey_first(b, t); |
1c6fdbd8 | 886 | i != orig_k; |
ac2ccddc | 887 | i = bkey_p_next(i)) |
1c6fdbd8 KO |
888 | BUG_ON(i->type >= min_key_type); |
889 | } | |
890 | ||
891 | return ret; | |
892 | } | |
893 | ||
894 | /* Insert */ | |
895 | ||
94932a08 AH |
896 | static void rw_aux_tree_insert_entry(struct btree *b, |
897 | struct bset_tree *t, | |
898 | unsigned idx) | |
899 | { | |
900 | EBUG_ON(!idx || idx > t->size); | |
901 | struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1); | |
902 | struct bkey_packed *end = idx < t->size | |
903 | ? rw_aux_to_bkey(b, t, idx) | |
904 | : btree_bkey_last(b, t); | |
905 | ||
906 | if (t->size < bset_rw_tree_capacity(b, t) && | |
907 | (void *) end - (void *) start > L1_CACHE_BYTES) { | |
908 | struct bkey_packed *k = start; | |
909 | ||
910 | while (1) { | |
911 | k = bkey_p_next(k); | |
912 | if (k == end) | |
913 | break; | |
914 | ||
915 | if ((void *) k - (void *) start >= L1_CACHE_BYTES) { | |
916 | memmove(&rw_aux_tree(b, t)[idx + 1], | |
917 | &rw_aux_tree(b, t)[idx], | |
918 | (void *) &rw_aux_tree(b, t)[t->size] - | |
919 | (void *) &rw_aux_tree(b, t)[idx]); | |
920 | t->size++; | |
921 | rw_aux_tree_set(b, t, idx, k); | |
922 | break; | |
923 | } | |
924 | } | |
925 | } | |
926 | } | |
927 | ||
1c6fdbd8 KO |
928 | static void bch2_bset_fix_lookup_table(struct btree *b, |
929 | struct bset_tree *t, | |
930 | struct bkey_packed *_where, | |
931 | unsigned clobber_u64s, | |
932 | unsigned new_u64s) | |
933 | { | |
934 | int shift = new_u64s - clobber_u64s; | |
94932a08 | 935 | unsigned idx, j, where = __btree_node_key_to_offset(b, _where); |
1c6fdbd8 KO |
936 | |
937 | EBUG_ON(bset_has_ro_aux_tree(t)); | |
938 | ||
939 | if (!bset_has_rw_aux_tree(t)) | |
940 | return; | |
941 | ||
94932a08 AH |
942 | if (where > rw_aux_tree(b, t)[t->size - 1].offset) { |
943 | rw_aux_tree_insert_entry(b, t, t->size); | |
944 | goto verify; | |
945 | } | |
946 | ||
617391ba | 947 | /* returns first entry >= where */ |
94932a08 AH |
948 | idx = rw_aux_tree_bsearch(b, t, where); |
949 | ||
950 | if (rw_aux_tree(b, t)[idx].offset == where) { | |
951 | if (!idx) { /* never delete first entry */ | |
952 | idx++; | |
953 | } else if (where < t->end_offset) { | |
954 | rw_aux_tree_set(b, t, idx++, _where); | |
955 | } else { | |
956 | EBUG_ON(where != t->end_offset); | |
957 | rw_aux_tree_insert_entry(b, t, --t->size); | |
958 | goto verify; | |
959 | } | |
960 | } | |
1c6fdbd8 | 961 | |
94932a08 AH |
962 | EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where); |
963 | if (idx < t->size && | |
964 | rw_aux_tree(b, t)[idx].offset + shift == | |
965 | rw_aux_tree(b, t)[idx - 1].offset) { | |
966 | memmove(&rw_aux_tree(b, t)[idx], | |
967 | &rw_aux_tree(b, t)[idx + 1], | |
968 | (void *) &rw_aux_tree(b, t)[t->size] - | |
969 | (void *) &rw_aux_tree(b, t)[idx + 1]); | |
970 | t->size -= 1; | |
971 | } | |
1c6fdbd8 | 972 | |
94932a08 AH |
973 | for (j = idx; j < t->size; j++) |
974 | rw_aux_tree(b, t)[j].offset += shift; | |
1c6fdbd8 | 975 | |
94932a08 AH |
976 | EBUG_ON(idx < t->size && |
977 | rw_aux_tree(b, t)[idx].offset == | |
978 | rw_aux_tree(b, t)[idx - 1].offset); | |
1c6fdbd8 | 979 | |
94932a08 | 980 | rw_aux_tree_insert_entry(b, t, idx); |
1c6fdbd8 | 981 | |
94932a08 | 982 | verify: |
1c6fdbd8 KO |
983 | bch2_bset_verify_rw_aux_tree(b, t); |
984 | bset_aux_tree_verify(b); | |
985 | } | |
986 | ||
987 | void bch2_bset_insert(struct btree *b, | |
1c6fdbd8 KO |
988 | struct bkey_packed *where, |
989 | struct bkey_i *insert, | |
990 | unsigned clobber_u64s) | |
991 | { | |
992 | struct bkey_format *f = &b->format; | |
993 | struct bset_tree *t = bset_tree_last(b); | |
994 | struct bkey_packed packed, *src = bkey_to_packed(insert); | |
995 | ||
996 | bch2_bset_verify_rw_aux_tree(b, t); | |
271a3d3a | 997 | bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); |
1c6fdbd8 KO |
998 | |
999 | if (bch2_bkey_pack_key(&packed, &insert->k, f)) | |
1000 | src = &packed; | |
1001 | ||
c052cf82 | 1002 | if (!bkey_deleted(&insert->k)) |
1c6fdbd8 KO |
1003 | btree_keys_account_key_add(&b->nr, t - b->set, src); |
1004 | ||
1005 | if (src->u64s != clobber_u64s) { | |
5cfd6977 KO |
1006 | u64 *src_p = (u64 *) where->_data + clobber_u64s; |
1007 | u64 *dst_p = (u64 *) where->_data + src->u64s; | |
1c6fdbd8 KO |
1008 | |
1009 | EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < | |
1010 | (int) clobber_u64s - src->u64s); | |
1011 | ||
1012 | memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); | |
1013 | le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); | |
1014 | set_btree_bset_end(b, t); | |
1015 | } | |
1016 | ||
d598a9b7 | 1017 | memcpy_u64s_small(where, src, |
1c6fdbd8 KO |
1018 | bkeyp_key_u64s(f, src)); |
1019 | memcpy_u64s(bkeyp_val(f, where), &insert->v, | |
1020 | bkeyp_val_u64s(f, src)); | |
1021 | ||
fdf22400 KO |
1022 | if (src->u64s != clobber_u64s) |
1023 | bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); | |
1c6fdbd8 | 1024 | |
1c6fdbd8 KO |
1025 | bch2_verify_btree_nr_keys(b); |
1026 | } | |
1027 | ||
1028 | void bch2_bset_delete(struct btree *b, | |
1029 | struct bkey_packed *where, | |
1030 | unsigned clobber_u64s) | |
1031 | { | |
1032 | struct bset_tree *t = bset_tree_last(b); | |
5cfd6977 | 1033 | u64 *src_p = (u64 *) where->_data + clobber_u64s; |
1c6fdbd8 KO |
1034 | u64 *dst_p = where->_data; |
1035 | ||
1036 | bch2_bset_verify_rw_aux_tree(b, t); | |
1037 | ||
1038 | EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); | |
1039 | ||
1040 | memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); | |
1041 | le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); | |
1042 | set_btree_bset_end(b, t); | |
1043 | ||
1044 | bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); | |
1045 | } | |
1046 | ||
1047 | /* Lookup */ | |
1048 | ||
1049 | __flatten | |
1050 | static struct bkey_packed *bset_search_write_set(const struct btree *b, | |
1051 | struct bset_tree *t, | |
2649b514 | 1052 | struct bpos *search) |
1c6fdbd8 KO |
1053 | { |
1054 | unsigned l = 0, r = t->size; | |
1055 | ||
1056 | while (l + 1 != r) { | |
1057 | unsigned m = (l + r) >> 1; | |
1058 | ||
e88a75eb | 1059 | if (bpos_lt(rw_aux_tree(b, t)[m].k, *search)) |
1c6fdbd8 KO |
1060 | l = m; |
1061 | else | |
1062 | r = m; | |
1063 | } | |
1064 | ||
1065 | return rw_aux_to_bkey(b, t, l); | |
1066 | } | |
1067 | ||
c4537686 KO |
1068 | static inline void prefetch_four_cachelines(void *p) |
1069 | { | |
6f152b0f | 1070 | #ifdef CONFIG_X86_64 |
2eba51a6 BH |
1071 | asm("prefetcht0 (-127 + 64 * 0)(%0);" |
1072 | "prefetcht0 (-127 + 64 * 1)(%0);" | |
1073 | "prefetcht0 (-127 + 64 * 2)(%0);" | |
1074 | "prefetcht0 (-127 + 64 * 3)(%0);" | |
c4537686 KO |
1075 | : |
1076 | : "r" (p + 127)); | |
1077 | #else | |
1078 | prefetch(p + L1_CACHE_BYTES * 0); | |
1079 | prefetch(p + L1_CACHE_BYTES * 1); | |
1080 | prefetch(p + L1_CACHE_BYTES * 2); | |
1081 | prefetch(p + L1_CACHE_BYTES * 3); | |
1082 | #endif | |
1083 | } | |
1084 | ||
58404bb2 | 1085 | static inline bool bkey_mantissa_bits_dropped(const struct btree *b, |
89ae9a04 | 1086 | const struct bkey_float *f) |
58404bb2 KO |
1087 | { |
1088 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |
1089 | unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; | |
1090 | ||
1091 | return f->exponent > key_bits_start; | |
1092 | #else | |
1093 | unsigned key_bits_end = high_bit_offset + b->nr_key_bits; | |
58404bb2 | 1094 | |
b904a799 | 1095 | return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; |
58404bb2 KO |
1096 | #endif |
1097 | } | |
1098 | ||
1c6fdbd8 KO |
1099 | __flatten |
1100 | static struct bkey_packed *bset_search_tree(const struct btree *b, | |
d108efc2 KO |
1101 | const struct bset_tree *t, |
1102 | const struct bpos *search, | |
1c6fdbd8 KO |
1103 | const struct bkey_packed *packed_search) |
1104 | { | |
1105 | struct ro_aux_tree *base = ro_aux_tree_base(b, t); | |
c4537686 | 1106 | struct bkey_float *f; |
58404bb2 KO |
1107 | struct bkey_packed *k; |
1108 | unsigned inorder, n = 1, l, r; | |
1109 | int cmp; | |
1c6fdbd8 | 1110 | |
c4537686 KO |
1111 | do { |
1112 | if (likely(n << 4 < t->size)) | |
b904a799 | 1113 | prefetch(&base->f[n << 4]); |
1c6fdbd8 | 1114 | |
b904a799 | 1115 | f = &base->f[n]; |
58404bb2 KO |
1116 | if (unlikely(f->exponent >= BFLOAT_FAILED)) |
1117 | goto slowpath; | |
1118 | ||
b904a799 | 1119 | l = f->mantissa; |
2a463e94 | 1120 | r = bkey_mantissa(packed_search, f); |
58404bb2 | 1121 | |
89ae9a04 | 1122 | if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f)) |
58404bb2 KO |
1123 | goto slowpath; |
1124 | ||
1125 | n = n * 2 + (l < r); | |
1126 | continue; | |
1127 | slowpath: | |
1128 | k = tree_to_bkey(b, t, n); | |
1129 | cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); | |
1130 | if (!cmp) | |
1131 | return k; | |
1132 | ||
1133 | n = n * 2 + (cmp < 0); | |
1c6fdbd8 KO |
1134 | } while (n < t->size); |
1135 | ||
72492d55 | 1136 | inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); |
1c6fdbd8 KO |
1137 | |
1138 | /* | |
1139 | * n would have been the node we recursed to - the low bit tells us if | |
1140 | * we recursed left or recursed right. | |
1141 | */ | |
b904a799 KO |
1142 | if (likely(!(n & 1))) { |
1143 | --inorder; | |
1144 | if (unlikely(!inorder)) | |
1c6fdbd8 | 1145 | return btree_bkey_first(b, t); |
b904a799 | 1146 | |
72492d55 | 1147 | f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; |
1c6fdbd8 | 1148 | } |
b904a799 KO |
1149 | |
1150 | return cacheline_to_bkey(b, t, inorder, f->key_offset); | |
1c6fdbd8 KO |
1151 | } |
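
/*
 * Editor's walkthrough of the descent above: n = n * 2 + (l < r) steps to the
 * left child when the node's mantissa is >= the search key's, and to the
 * right child otherwise, touching one bkey_float per level - a lookup in a
 * tree of size 512 reads ~9 floats, mostly from prefetched cachelines. Once n
 * overflows t->size, n's low bit records whether the final step went left or
 * right, which the tail above uses to pick between the node's key and its
 * inorder predecessor.
 */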
1152 | ||
c4537686 KO |
1153 | static __always_inline __flatten |
1154 | struct bkey_packed *__bch2_bset_search(struct btree *b, | |
1c6fdbd8 | 1155 | struct bset_tree *t, |
a00fd8c5 | 1156 | struct bpos *search, |
a00fd8c5 | 1157 | const struct bkey_packed *lossy_packed_search) |
1c6fdbd8 | 1158 | { |
1c6fdbd8 KO |
1159 | |
1160 | /* | |
1161 | * First, we search for a cacheline, then lastly we do a linear search | |
1162 | * within that cacheline. | |
1163 | * | |
1164 | * To search for the cacheline, there's three different possibilities: | |
1165 | * * The set is too small to have a search tree, so we just do a linear | |
1166 | * search over the whole set. | |
1167 | * * The set is the one we're currently inserting into; keeping a full | |
1168 | * auxiliary search tree up to date would be too expensive, so we | |
1169 | * use a much simpler lookup table to do a binary search - | |
1170 | * bset_search_write_set(). | |
1171 | * * Or we use the auxiliary search tree we constructed earlier - | |
1172 | * bset_search_tree() | |
1173 | */ | |
1174 | ||
1175 | switch (bset_aux_tree_type(t)) { | |
1176 | case BSET_NO_AUX_TREE: | |
c4537686 | 1177 | return btree_bkey_first(b, t); |
1c6fdbd8 | 1178 | case BSET_RW_AUX_TREE: |
2649b514 | 1179 | return bset_search_write_set(b, t, search); |
1c6fdbd8 | 1180 | case BSET_RO_AUX_TREE: |
c4537686 KO |
1181 | return bset_search_tree(b, t, search, lossy_packed_search); |
1182 | default: | |
439c172b | 1183 | BUG(); |
1c6fdbd8 | 1184 | } |
c4537686 | 1185 | } |
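
/*
 * Editor's note: all three cases above only locate a position at cacheline
 * granularity - bch2_bset_search_linear() below finishes the job, walking
 * forward key by key to the first key >= the search key.
 */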

static __always_inline __flatten
struct bkey_packed *bch2_bset_search_linear(struct btree *b,
					    struct bset_tree *t,
					    struct bpos *search,
					    struct bkey_packed *packed_search,
					    const struct bkey_packed *lossy_packed_search,
					    struct bkey_packed *m)
{
	if (lossy_packed_search)
		while (m != btree_bkey_last(b, t) &&
		       bkey_iter_cmp_p_or_unp(b, m,
					lossy_packed_search, search) < 0)
			m = bkey_p_next(m);

	if (!packed_search)
		while (m != btree_bkey_last(b, t) &&
		       bkey_iter_pos_cmp(b, m, search) < 0)
			m = bkey_p_next(m);

	if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
		struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);

		BUG_ON(prev &&
		       bkey_iter_cmp_p_or_unp(b, prev,
					packed_search, search) >= 0);
	}

	return m;
}
1216 | ||
1217 | /* Btree node iterator */ | |
1218 | ||
a00fd8c5 KO |
1219 | static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, |
1220 | struct btree *b, | |
1221 | const struct bkey_packed *k, | |
1222 | const struct bkey_packed *end) | |
1223 | { | |
1224 | if (k != end) { | |
1225 | struct btree_node_iter_set *pos; | |
1226 | ||
1227 | btree_node_iter_for_each(iter, pos) | |
1228 | ; | |
1229 | ||
1230 | BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); | |
1231 | *pos = (struct btree_node_iter_set) { | |
1232 | __btree_node_key_to_offset(b, k), | |
1233 | __btree_node_key_to_offset(b, end) | |
1234 | }; | |
1235 | } | |
1236 | } | |
1237 | ||
1c6fdbd8 KO |
1238 | void bch2_btree_node_iter_push(struct btree_node_iter *iter, |
1239 | struct btree *b, | |
1240 | const struct bkey_packed *k, | |
1241 | const struct bkey_packed *end) | |
1242 | { | |
1243 | __bch2_btree_node_iter_push(iter, b, k, end); | |
1244 | bch2_btree_node_iter_sort(iter, b); | |
1245 | } | |
1246 | ||
3e3e02e6 | 1247 | noinline __flatten __cold |
1c6fdbd8 | 1248 | static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, |
a00fd8c5 | 1249 | struct btree *b, struct bpos *search) |
1c6fdbd8 | 1250 | { |
2649b514 | 1251 | struct bkey_packed *k; |
1c6fdbd8 | 1252 | |
a00fd8c5 | 1253 | trace_bkey_pack_pos_fail(search); |
1c6fdbd8 | 1254 | |
2649b514 | 1255 | bch2_btree_node_iter_init_from_start(iter, b); |
1c6fdbd8 | 1256 | |
2649b514 KO |
1257 | while ((k = bch2_btree_node_iter_peek(iter, b)) && |
1258 | bkey_iter_pos_cmp(b, k, search) < 0) | |
1259 | bch2_btree_node_iter_advance(iter, b); | |
1c6fdbd8 KO |
1260 | } |
1261 | ||
1262 | /** | |
96dea3d5 | 1263 | * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a |
1c6fdbd8 KO |
1264 | * given position |
1265 | * | |
96dea3d5 KO |
1266 | * @iter: iterator to initialize |
1267 | * @b: btree node to search | |
1268 | * @search: search key | |
1269 | * | |
1c6fdbd8 KO |
1270 | * Main entry point to the lookup code for individual btree nodes: |
1271 | * | |
1272 | * NOTE: | |
1273 | * | |
1274 | * When you don't filter out deleted keys, btree nodes _do_ contain duplicate | |
1275 | * keys. This doesn't matter for most code, but it does matter for lookups. | |
1276 | * | |
1277 | * Some adjacent keys with a string of equal keys: | |
1278 | * i j k k k k l m | |
1279 | * | |
1280 | * If you search for k, the lookup code isn't guaranteed to return you any | |
1281 | * specific k. The lookup code is conceptually doing a binary search and | |
1282 | * iterating backwards is very expensive so if the pivot happens to land at the | |
1283 | * last k that's what you'll get. | |
1284 | * | |
1285 | * This works out ok, but it's something to be aware of: | |
1286 | * | |
1287 | * - For non extents, we guarantee that the live key comes last - see | |
1288 | * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't | |
1289 | * see will only be deleted keys you don't care about. | |
1290 | * | |
1291 | * - For extents, deleted keys sort last (see the comment at the top of this | |
1292 | * file). But when you're searching for extents, you actually want the first | |
1293 | * key strictly greater than your search key - an extent that compares equal | |
1294 | * to the search key is going to have 0 sectors after the search key. | |
1295 | * | |
1296 | * But this does mean that we can't just search for | |
e751c01a | 1297 | * bpos_successor(start_of_range) to get the first extent that overlaps with |
1c6fdbd8 KO |
1298 | * the range we want - if we're unlucky and there's an extent that ends |
1299 | * exactly where we searched, then there could be a deleted key at the same | |
1300 | * position and we'd get that when we search instead of the preceding extent | |
1301 | * we needed. | |
1302 | * | |
1303 | * So we've got to search for start_of_range, then after the lookup iterate | |
1304 | * past any extents that compare equal to the position we searched for. | |
1305 | */ | |
__flatten
void bch2_btree_node_iter_init(struct btree_node_iter *iter,
			       struct btree *b, struct bpos *search)
{
	struct bkey_packed p, *packed_search = NULL;
	struct btree_node_iter_set *pos = iter->data;
	struct bkey_packed *k[MAX_BSETS];
	unsigned i;

	EBUG_ON(bpos_lt(*search, b->data->min_key));
	EBUG_ON(bpos_gt(*search, b->data->max_key));
	bset_aux_tree_verify(b);

	memset(iter, 0, sizeof(*iter));

	switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
	case BKEY_PACK_POS_EXACT:
		packed_search = &p;
		break;
	case BKEY_PACK_POS_SMALLER:
		packed_search = NULL;
		break;
	case BKEY_PACK_POS_FAIL:
		btree_node_iter_init_pack_failed(iter, b, search);
		return;
	}

	for (i = 0; i < b->nsets; i++) {
		k[i] = __bch2_bset_search(b, b->set + i, search, &p);
		prefetch_four_cachelines(k[i]);
	}

	for (i = 0; i < b->nsets; i++) {
		struct bset_tree *t = b->set + i;
		struct bkey_packed *end = btree_bkey_last(b, t);

		k[i] = bch2_bset_search_linear(b, t, search,
					       packed_search, &p, k[i]);
		if (k[i] != end)
			*pos++ = (struct btree_node_iter_set) {
				__btree_node_key_to_offset(b, k[i]),
				__btree_node_key_to_offset(b, end)
			};
	}

	bch2_btree_node_iter_sort(iter, b);
}
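
/*
 * Illustrative sketch (editor's addition) of the extents caveat in the
 * comment above: to find extents overlapping [start, end), search for start
 * itself - not bpos_successor(start) - and then iterate past any keys that
 * compare equal to start; otherwise a deleted key sitting exactly at start
 * could be returned in place of the preceding extent.
 */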
1353 | ||
1354 | void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, | |
271a3d3a | 1355 | struct btree *b) |
1c6fdbd8 | 1356 | { |
271a3d3a | 1357 | memset(iter, 0, sizeof(*iter)); |
1c6fdbd8 KO |
1358 | |
1359 | for_each_bset(b, t) | |
1360 | __bch2_btree_node_iter_push(iter, b, | |
1361 | btree_bkey_first(b, t), | |
1362 | btree_bkey_last(b, t)); | |
1363 | bch2_btree_node_iter_sort(iter, b); | |
1364 | } | |
1365 | ||
1366 | struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, | |
1367 | struct btree *b, | |
1368 | struct bset_tree *t) | |
1369 | { | |
1370 | struct btree_node_iter_set *set; | |
1371 | ||
1372 | btree_node_iter_for_each(iter, set) | |
1373 | if (set->end == t->end_offset) | |
1374 | return __btree_node_offset_to_key(b, set->k); | |
1375 | ||
1376 | return btree_bkey_last(b, t); | |
1377 | } | |
1378 | ||
1379 | static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, | |
1380 | struct btree *b, | |
1381 | unsigned first) | |
1382 | { | |
1383 | bool ret; | |
1384 | ||
271a3d3a | 1385 | if ((ret = (btree_node_iter_cmp(b, |
1c6fdbd8 KO |
1386 | iter->data[first], |
1387 | iter->data[first + 1]) > 0))) | |
1388 | swap(iter->data[first], iter->data[first + 1]); | |
1389 | return ret; | |
1390 | } | |
1391 | ||
1392 | void bch2_btree_node_iter_sort(struct btree_node_iter *iter, | |
1393 | struct btree *b) | |
1394 | { | |
1395 | /* unrolled bubble sort: */ | |
1396 | ||
1397 | if (!__btree_node_iter_set_end(iter, 2)) { | |
1398 | btree_node_iter_sort_two(iter, b, 0); | |
1399 | btree_node_iter_sort_two(iter, b, 1); | |
1400 | } | |
1401 | ||
1402 | if (!__btree_node_iter_set_end(iter, 1)) | |
1403 | btree_node_iter_sort_two(iter, b, 0); | |
1404 | } | |
1405 | ||
1406 | void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, | |
1407 | struct btree_node_iter_set *set) | |
1408 | { | |
1409 | struct btree_node_iter_set *last = | |
1410 | iter->data + ARRAY_SIZE(iter->data) - 1; | |
1411 | ||
1412 | memmove(&set[0], &set[1], (void *) last - (void *) set); | |
1413 | *last = (struct btree_node_iter_set) { 0, 0 }; | |
1414 | } | |
1415 | ||
1416 | static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, | |
1417 | struct btree *b) | |
1418 | { | |
1419 | iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; | |
1420 | ||
1421 | EBUG_ON(iter->data->k > iter->data->end); | |
1422 | ||
1423 | if (unlikely(__btree_node_iter_set_end(iter, 0))) { | |
005def8f KO |
1424 | /* avoid an expensive memmove call: */ |
1425 | iter->data[0] = iter->data[1]; | |
1426 | iter->data[1] = iter->data[2]; | |
1427 | iter->data[2] = (struct btree_node_iter_set) { 0, 0 }; | |
1c6fdbd8 KO |
1428 | return; |
1429 | } | |
1430 | ||
1431 | if (__btree_node_iter_set_end(iter, 1)) | |
1432 | return; | |
1433 | ||
1434 | if (!btree_node_iter_sort_two(iter, b, 0)) | |
1435 | return; | |
1436 | ||
1437 | if (__btree_node_iter_set_end(iter, 2)) | |
1438 | return; | |
1439 | ||
1440 | btree_node_iter_sort_two(iter, b, 1); | |
1441 | } | |
1442 | ||
1c6fdbd8 KO |
1443 | void bch2_btree_node_iter_advance(struct btree_node_iter *iter, |
1444 | struct btree *b) | |
1445 | { | |
34aeb820 KO |
1446 | if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { |
1447 | __bch2_btree_node_iter_verify(iter, b); | |
1448 | __bch2_btree_node_iter_next_check(iter, b); | |
f13f5a8c KO |
1449 | } |
1450 | ||
271a3d3a | 1451 | __bch2_btree_node_iter_advance(iter, b); |
1c6fdbd8 KO |
1452 | } |
1453 | ||
1c6fdbd8 KO |
1454 | /* |
1455 | * Expensive: | |
1456 | */ | |
e67ab045 KO |
1457 | struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, |
1458 | struct btree *b) | |
1c6fdbd8 KO |
1459 | { |
1460 | struct bkey_packed *k, *prev = NULL; | |
1c6fdbd8 | 1461 | struct btree_node_iter_set *set; |
ac10a961 | 1462 | unsigned end = 0; |
1c6fdbd8 | 1463 | |
34aeb820 | 1464 | bch2_btree_node_iter_verify(iter, b); |
1c6fdbd8 KO |
1465 | |
1466 | for_each_bset(b, t) { | |
e67ab045 KO |
1467 | k = bch2_bkey_prev_all(b, t, |
1468 | bch2_btree_node_iter_bset_pos(iter, b, t)); | |
1c6fdbd8 | 1469 | if (k && |
a00fd8c5 | 1470 | (!prev || bkey_iter_cmp(b, k, prev) > 0)) { |
1c6fdbd8 KO |
1471 | prev = k; |
1472 | end = t->end_offset; | |
1473 | } | |
1474 | } | |
1475 | ||
1476 | if (!prev) | |
e67ab045 | 1477 | return NULL; |
1c6fdbd8 KO |
1478 | |
1479 | /* | |
1480 | * We're manually memmoving instead of just calling sort() to ensure the | |
1481 | * prev we picked ends up in slot 0 - sort won't necessarily put it | |
1482 | * there because of duplicate deleted keys: | |
1483 | */ | |
1484 | btree_node_iter_for_each(iter, set) | |
1485 | if (set->end == end) | |
1486 | goto found; | |
1487 | ||
1488 | BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); | |
1489 | found: | |
1490 | BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); | |
1491 | ||
1492 | memmove(&iter->data[1], | |
1493 | &iter->data[0], | |
1494 | (void *) set - (void *) &iter->data[0]); | |
1495 | ||
1496 | iter->data[0].k = __btree_node_key_to_offset(b, prev); | |
1497 | iter->data[0].end = end; | |
1c6fdbd8 | 1498 | |
34aeb820 | 1499 | bch2_btree_node_iter_verify(iter, b); |
e67ab045 KO |
1500 | return prev; |
1501 | } | |
1c6fdbd8 | 1502 | |
c052cf82 KO |
1503 | struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, |
1504 | struct btree *b) | |
e67ab045 KO |
1505 | { |
1506 | struct bkey_packed *prev; | |
1507 | ||
1508 | do { | |
1509 | prev = bch2_btree_node_iter_prev_all(iter, b); | |
c052cf82 | 1510 | } while (prev && bkey_deleted(prev)); |
1c6fdbd8 KO |
1511 | |
1512 | return prev; | |
1513 | } | |
1514 | ||
1515 | struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, | |
1516 | struct btree *b, | |
1517 | struct bkey *u) | |
1518 | { | |
1519 | struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); | |
1520 | ||
1521 | return k ? bkey_disassemble(b, k, u) : bkey_s_c_null; | |
1522 | } | |
1523 | ||
1524 | /* Mergesort */ | |
1525 | ||
a345b0f3 | 1526 | void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) |
1c6fdbd8 | 1527 | { |
b6fb4269 | 1528 | for_each_bset_c(b, t) { |
1c6fdbd8 KO |
1529 | enum bset_aux_tree_type type = bset_aux_tree_type(t); |
1530 | size_t j; | |
1531 | ||
1532 | stats->sets[type].nr++; | |
1533 | stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * | |
1534 | sizeof(u64); | |
1535 | ||
1536 | if (bset_has_ro_aux_tree(t)) { | |
1537 | stats->floats += t->size - 1; | |
1538 | ||
1539 | for (j = 1; j < t->size; j++) | |
58404bb2 KO |
1540 | stats->failed += |
1541 | bkey_float(b, t, j)->exponent == | |
1542 | BFLOAT_FAILED; | |
1c6fdbd8 KO |
1543 | } |
1544 | } | |
1545 | } | |
1546 | ||
319f9ac3 KO |
1547 | void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, |
1548 | struct bkey_packed *k) | |
1c6fdbd8 KO |
1549 | { |
1550 | struct bset_tree *t = bch2_bkey_to_bset(b, k); | |
1bdb67e8 | 1551 | struct bkey uk; |
b564513c | 1552 | unsigned j, inorder; |
1c6fdbd8 | 1553 | |
1c6fdbd8 | 1554 | if (!bset_has_ro_aux_tree(t)) |
319f9ac3 | 1555 | return; |
1c6fdbd8 | 1556 | |
b564513c KO |
1557 | inorder = bkey_to_cacheline(b, t, k); |
1558 | if (!inorder || inorder >= t->size) | |
319f9ac3 | 1559 | return; |
b564513c | 1560 | |
72492d55 | 1561 | j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); |
b564513c | 1562 | if (k != tree_to_bkey(b, t, j)) |
319f9ac3 | 1563 | return; |
b564513c KO |
1564 | |
1565 | switch (bkey_float(b, t, j)->exponent) { | |
58404bb2 | 1566 | case BFLOAT_FAILED: |
b564513c | 1567 | uk = bkey_unpack_key(b, k); |
401ec4db | 1568 | prt_printf(out, |
319f9ac3 | 1569 | " failed unpacked at depth %u\n" |
f020bfcd KO |
1570 | "\t", |
1571 | ilog2(j)); | |
1572 | bch2_bpos_to_text(out, uk.p); | |
401ec4db | 1573 | prt_printf(out, "\n"); |
319f9ac3 | 1574 | break; |
b564513c | 1575 | } |
1c6fdbd8 | 1576 | } |