Commit | Line | Data |
---|---|---|
1c6fdbd8 KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Code for working with individual keys, and sorted sets of keys with in a | |
4 | * btree node | |
5 | * | |
6 | * Copyright 2012 Google, Inc. | |
7 | */ | |
8 | ||
9 | #include "bcachefs.h" | |
10 | #include "btree_cache.h" | |
11 | #include "bset.h" | |
12 | #include "eytzinger.h" | |
13 | #include "trace.h" | |
14 | #include "util.h" | |
15 | ||
16 | #include <asm/unaligned.h> | |
17 | #include <linux/console.h> | |
18 | #include <linux/random.h> | |
19 | #include <linux/prefetch.h> | |
20 | ||
271a3d3a KO |
21 | static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, |
22 | struct btree *); | |
23 | ||
63f1a598 KO |
24 | static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) |
25 | { | |
26 | unsigned n = ARRAY_SIZE(iter->data); | |
27 | ||
28 | while (n && __btree_node_iter_set_end(iter, n - 1)) | |
29 | --n; | |
30 | ||
31 | return n; | |
32 | } | |
33 | ||
1c6fdbd8 KO |
34 | struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) |
35 | { | |
216c9fac | 36 | return bch2_bkey_to_bset_inlined(b, k); |
1c6fdbd8 KO |
37 | } |
38 | ||
39 | /* | |
40 | * There are never duplicate live keys in the btree - but including keys that | |
41 | * have been flagged as deleted (and will be cleaned up later) we _will_ see | |
42 | * duplicates. | |
43 | * | |
44 | * Thus the sort order is: usual key comparison first, but for keys that compare | |
45 | * equal the deleted key(s) come first, and the (at most one) live version comes | |
46 | * last. | |
47 | * | |
48 | * The main reason for this is insertion: to handle overwrites, we first iterate | |
49 | * over keys that compare equal to our insert key, and then insert immediately | |
50 | * prior to the first key greater than the key we're inserting - our insert | |
51 | * position will be after all keys that compare equal to our insert key, which | |
52 | * by the time we actually do the insert will all be deleted. | |
53 | */ | |
54 | ||
a34782a0 KO |
55 | void bch2_dump_bset(struct bch_fs *c, struct btree *b, |
56 | struct bset *i, unsigned set) | |
1c6fdbd8 KO |
57 | { |
58 | struct bkey_packed *_k, *_n; | |
a34782a0 KO |
59 | struct bkey uk, n; |
60 | struct bkey_s_c k; | |
fa8e94fa | 61 | struct printbuf buf = PRINTBUF; |
1c6fdbd8 KO |
62 | |
63 | if (!i->u64s) | |
64 | return; | |
65 | ||
a34782a0 | 66 | for (_k = i->start; |
1c6fdbd8 | 67 | _k < vstruct_last(i); |
a34782a0 | 68 | _k = _n) { |
ac2ccddc | 69 | _n = bkey_p_next(_k); |
1c6fdbd8 | 70 | |
d04d2727 KO |
71 | if (!_k->u64s) { |
72 | printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set, | |
73 | _k->_data - i->_data); | |
74 | break; | |
75 | } | |
76 | ||
a34782a0 | 77 | k = bkey_disassemble(b, _k, &uk); |
fa8e94fa KO |
78 | |
79 | printbuf_reset(&buf); | |
a34782a0 | 80 | if (c) |
fa8e94fa | 81 | bch2_bkey_val_to_text(&buf, c, k); |
a34782a0 | 82 | else |
fa8e94fa | 83 | bch2_bkey_to_text(&buf, k.k); |
24e0c3f8 | 84 | printk(KERN_ERR "block %u key %5zu: %s\n", set, |
fa8e94fa | 85 | _k->_data - i->_data, buf.buf); |
1c6fdbd8 KO |
86 | |
87 | if (_n == vstruct_last(i)) | |
88 | continue; | |
89 | ||
90 | n = bkey_unpack_key(b, _n); | |
91 | ||
e88a75eb | 92 | if (bpos_lt(n.p, k.k->p)) { |
1c6fdbd8 KO |
93 | printk(KERN_ERR "Key skipped backwards\n"); |
94 | continue; | |
95 | } | |
96 | ||
e88a75eb | 97 | if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p)) |
1c6fdbd8 KO |
98 | printk(KERN_ERR "Duplicate keys\n"); |
99 | } | |
fa8e94fa KO |
100 | |
101 | printbuf_exit(&buf); | |
1c6fdbd8 KO |
102 | } |
103 | ||
a34782a0 | 104 | void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) |
1c6fdbd8 KO |
105 | { |
106 | struct bset_tree *t; | |
107 | ||
108 | console_lock(); | |
109 | for_each_bset(b, t) | |
a34782a0 | 110 | bch2_dump_bset(c, b, bset(b, t), t - b->set); |
1c6fdbd8 KO |
111 | console_unlock(); |
112 | } | |
113 | ||
114 | void bch2_dump_btree_node_iter(struct btree *b, | |
115 | struct btree_node_iter *iter) | |
116 | { | |
117 | struct btree_node_iter_set *set; | |
fa8e94fa | 118 | struct printbuf buf = PRINTBUF; |
1c6fdbd8 | 119 | |
63f1a598 KO |
120 | printk(KERN_ERR "btree node iter with %u/%u sets:\n", |
121 | __btree_node_iter_used(iter), b->nsets); | |
1c6fdbd8 KO |
122 | |
123 | btree_node_iter_for_each(iter, set) { | |
124 | struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); | |
125 | struct bset_tree *t = bch2_bkey_to_bset(b, k); | |
126 | struct bkey uk = bkey_unpack_key(b, k); | |
1c6fdbd8 | 127 | |
fa8e94fa KO |
128 | printbuf_reset(&buf); |
129 | bch2_bkey_to_text(&buf, &uk); | |
63f1a598 | 130 | printk(KERN_ERR "set %zu key %u: %s\n", |
fa8e94fa | 131 | t - b->set, set->k, buf.buf); |
1c6fdbd8 | 132 | } |
fa8e94fa KO |
133 | |
134 | printbuf_exit(&buf); | |
1c6fdbd8 KO |
135 | } |
136 | ||
812a9297 | 137 | struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) |
1c6fdbd8 KO |
138 | { |
139 | struct bset_tree *t; | |
140 | struct bkey_packed *k; | |
812a9297 | 141 | struct btree_nr_keys nr = {}; |
1c6fdbd8 KO |
142 | |
143 | for_each_bset(b, t) | |
ad44bdc3 | 144 | bset_tree_for_each_key(b, t, k) |
c052cf82 | 145 | if (!bkey_deleted(k)) |
1c6fdbd8 | 146 | btree_keys_account_key_add(&nr, t - b->set, k); |
812a9297 KO |
147 | return nr; |
148 | } | |
149 | ||
150 | #ifdef CONFIG_BCACHEFS_DEBUG | |
151 | ||
152 | void __bch2_verify_btree_nr_keys(struct btree *b) | |
153 | { | |
154 | struct btree_nr_keys nr = bch2_btree_node_count_keys(b); | |
1c6fdbd8 KO |
155 | |
156 | BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); | |
157 | } | |
158 | ||
271a3d3a KO |
159 | static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, |
160 | struct btree *b) | |
1c6fdbd8 | 161 | { |
271a3d3a KO |
162 | struct btree_node_iter iter = *_iter; |
163 | const struct bkey_packed *k, *n; | |
164 | ||
165 | k = bch2_btree_node_iter_peek_all(&iter, b); | |
166 | __bch2_btree_node_iter_advance(&iter, b); | |
167 | n = bch2_btree_node_iter_peek_all(&iter, b); | |
1c6fdbd8 KO |
168 | |
169 | bkey_unpack_key(b, k); | |
170 | ||
171 | if (n && | |
a00fd8c5 | 172 | bkey_iter_cmp(b, k, n) > 0) { |
271a3d3a | 173 | struct btree_node_iter_set *set; |
1c6fdbd8 KO |
174 | struct bkey ku = bkey_unpack_key(b, k); |
175 | struct bkey nu = bkey_unpack_key(b, n); | |
fa8e94fa KO |
176 | struct printbuf buf1 = PRINTBUF; |
177 | struct printbuf buf2 = PRINTBUF; | |
1c6fdbd8 | 178 | |
a34782a0 | 179 | bch2_dump_btree_node(NULL, b); |
fa8e94fa KO |
180 | bch2_bkey_to_text(&buf1, &ku); |
181 | bch2_bkey_to_text(&buf2, &nu); | |
271a3d3a | 182 | printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", |
fa8e94fa | 183 | buf1.buf, buf2.buf); |
271a3d3a KO |
184 | printk(KERN_ERR "iter was:"); |
185 | ||
186 | btree_node_iter_for_each(_iter, set) { | |
96dea3d5 KO |
187 | struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k); |
188 | struct bset_tree *t = bch2_bkey_to_bset(b, k2); | |
271a3d3a | 189 | printk(" [%zi %zi]", t - b->set, |
96dea3d5 | 190 | k2->_data - bset(b, t)->_data); |
271a3d3a KO |
191 | } |
192 | panic("\n"); | |
1c6fdbd8 KO |
193 | } |
194 | } | |
195 | ||
196 | void bch2_btree_node_iter_verify(struct btree_node_iter *iter, | |
271a3d3a | 197 | struct btree *b) |
1c6fdbd8 | 198 | { |
1fe08f31 | 199 | struct btree_node_iter_set *set, *s2; |
63f1a598 | 200 | struct bkey_packed *k, *p; |
1c6fdbd8 | 201 | struct bset_tree *t; |
1c6fdbd8 | 202 | |
63f1a598 KO |
203 | if (bch2_btree_node_iter_end(iter)) |
204 | return; | |
205 | ||
1fe08f31 | 206 | /* Verify no duplicates: */ |
67e0dd8f KO |
207 | btree_node_iter_for_each(iter, set) { |
208 | BUG_ON(set->k > set->end); | |
1fe08f31 KO |
209 | btree_node_iter_for_each(iter, s2) |
210 | BUG_ON(set != s2 && set->end == s2->end); | |
67e0dd8f | 211 | } |
1c6fdbd8 | 212 | |
1fe08f31 | 213 | /* Verify that set->end is correct: */ |
1c6fdbd8 | 214 | btree_node_iter_for_each(iter, set) { |
1fe08f31 KO |
215 | for_each_bset(b, t) |
216 | if (set->end == t->end_offset) | |
217 | goto found; | |
218 | BUG(); | |
219 | found: | |
220 | BUG_ON(set->k < btree_bkey_first_offset(t) || | |
221 | set->k >= t->end_offset); | |
1c6fdbd8 KO |
222 | } |
223 | ||
1fe08f31 KO |
224 | /* Verify iterator is sorted: */ |
225 | btree_node_iter_for_each(iter, set) | |
226 | BUG_ON(set != iter->data && | |
271a3d3a | 227 | btree_node_iter_cmp(b, set[-1], set[0]) > 0); |
63f1a598 KO |
228 | |
229 | k = bch2_btree_node_iter_peek_all(iter, b); | |
230 | ||
231 | for_each_bset(b, t) { | |
232 | if (iter->data[0].end == t->end_offset) | |
233 | continue; | |
234 | ||
235 | p = bch2_bkey_prev_all(b, t, | |
236 | bch2_btree_node_iter_bset_pos(iter, b, t)); | |
237 | ||
238 | BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); | |
239 | } | |
1c6fdbd8 KO |
240 | } |
241 | ||
271a3d3a KO |
242 | void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, |
243 | struct bkey_packed *insert, unsigned clobber_u64s) | |
1c6fdbd8 KO |
244 | { |
245 | struct bset_tree *t = bch2_bkey_to_bset(b, where); | |
271a3d3a | 246 | struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); |
5cfd6977 | 247 | struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s); |
fa8e94fa KO |
248 | struct printbuf buf1 = PRINTBUF; |
249 | struct printbuf buf2 = PRINTBUF; | |
271a3d3a KO |
250 | #if 0 |
251 | BUG_ON(prev && | |
a00fd8c5 | 252 | bkey_iter_cmp(b, prev, insert) > 0); |
271a3d3a KO |
253 | #else |
254 | if (prev && | |
a00fd8c5 | 255 | bkey_iter_cmp(b, prev, insert) > 0) { |
271a3d3a KO |
256 | struct bkey k1 = bkey_unpack_key(b, prev); |
257 | struct bkey k2 = bkey_unpack_key(b, insert); | |
1c6fdbd8 | 258 | |
a34782a0 | 259 | bch2_dump_btree_node(NULL, b); |
fa8e94fa KO |
260 | bch2_bkey_to_text(&buf1, &k1); |
261 | bch2_bkey_to_text(&buf2, &k2); | |
271a3d3a KO |
262 | |
263 | panic("prev > insert:\n" | |
c201e2d9 KO |
264 | "prev key %s\n" |
265 | "insert key %s\n", | |
fa8e94fa | 266 | buf1.buf, buf2.buf); |
1c6fdbd8 | 267 | } |
271a3d3a KO |
268 | #endif |
269 | #if 0 | |
270 | BUG_ON(next != btree_bkey_last(b, t) && | |
a00fd8c5 | 271 | bkey_iter_cmp(b, insert, next) > 0); |
271a3d3a KO |
272 | #else |
273 | if (next != btree_bkey_last(b, t) && | |
a00fd8c5 | 274 | bkey_iter_cmp(b, insert, next) > 0) { |
271a3d3a KO |
275 | struct bkey k1 = bkey_unpack_key(b, insert); |
276 | struct bkey k2 = bkey_unpack_key(b, next); | |
1c6fdbd8 | 277 | |
a34782a0 | 278 | bch2_dump_btree_node(NULL, b); |
fa8e94fa KO |
279 | bch2_bkey_to_text(&buf1, &k1); |
280 | bch2_bkey_to_text(&buf2, &k2); | |
271a3d3a KO |
281 | |
282 | panic("insert > next:\n" | |
c201e2d9 KO |
283 | "insert key %s\n" |
284 | "next key %s\n", | |
fa8e94fa | 285 | buf1.buf, buf2.buf); |
1c6fdbd8 | 286 | } |
271a3d3a KO |
287 | #endif |
288 | } | |
289 | ||
1c6fdbd8 KO |
290 | #else |
291 | ||
292 | static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, | |
271a3d3a | 293 | struct btree *b) {} |
1c6fdbd8 KO |
294 | |
295 | #endif | |
296 | ||
297 | /* Auxiliary search trees */ | |
298 | ||
58404bb2 KO |
299 | #define BFLOAT_FAILED_UNPACKED U8_MAX |
300 | #define BFLOAT_FAILED U8_MAX | |
1c6fdbd8 | 301 | |
1c6fdbd8 KO |
302 | struct bkey_float { |
303 | u8 exponent; | |
304 | u8 key_offset; | |
b904a799 KO |
305 | u16 mantissa; |
306 | }; | |
307 | #define BKEY_MANTISSA_BITS 16 | |
1c6fdbd8 KO |
308 | |
309 | static unsigned bkey_float_byte_offset(unsigned idx) | |
310 | { | |
b904a799 | 311 | return idx * sizeof(struct bkey_float); |
1c6fdbd8 KO |
312 | } |
313 | ||
314 | struct ro_aux_tree { | |
5cfd6977 KO |
315 | u8 nothing[0]; |
316 | struct bkey_float f[]; | |
1c6fdbd8 KO |
317 | }; |
318 | ||
319 | struct rw_aux_tree { | |
320 | u16 offset; | |
321 | struct bpos k; | |
322 | }; | |
323 | ||
1c6fdbd8 KO |
324 | static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) |
325 | { | |
326 | BUG_ON(t->aux_data_offset == U16_MAX); | |
327 | ||
328 | switch (bset_aux_tree_type(t)) { | |
329 | case BSET_NO_AUX_TREE: | |
330 | return t->aux_data_offset; | |
331 | case BSET_RO_AUX_TREE: | |
332 | return t->aux_data_offset + | |
b904a799 KO |
333 | DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + |
334 | t->size * sizeof(u8), 8); | |
1c6fdbd8 KO |
335 | case BSET_RW_AUX_TREE: |
336 | return t->aux_data_offset + | |
337 | DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); | |
338 | default: | |
339 | BUG(); | |
340 | } | |
341 | } | |
342 | ||
343 | static unsigned bset_aux_tree_buf_start(const struct btree *b, | |
344 | const struct bset_tree *t) | |
345 | { | |
346 | return t == b->set | |
347 | ? DIV_ROUND_UP(b->unpack_fn_len, 8) | |
348 | : bset_aux_tree_buf_end(t - 1); | |
349 | } | |
350 | ||
351 | static void *__aux_tree_base(const struct btree *b, | |
352 | const struct bset_tree *t) | |
353 | { | |
354 | return b->aux_data + t->aux_data_offset * 8; | |
355 | } | |
356 | ||
357 | static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, | |
358 | const struct bset_tree *t) | |
359 | { | |
360 | EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); | |
361 | ||
362 | return __aux_tree_base(b, t); | |
363 | } | |
364 | ||
365 | static u8 *ro_aux_tree_prev(const struct btree *b, | |
366 | const struct bset_tree *t) | |
367 | { | |
368 | EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); | |
369 | ||
370 | return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); | |
371 | } | |
372 | ||
1c6fdbd8 KO |
373 | static struct bkey_float *bkey_float(const struct btree *b, |
374 | const struct bset_tree *t, | |
375 | unsigned idx) | |
376 | { | |
b904a799 | 377 | return ro_aux_tree_base(b, t)->f + idx; |
1c6fdbd8 KO |
378 | } |
379 | ||
d108efc2 | 380 | static void bset_aux_tree_verify(const struct btree *b) |
1c6fdbd8 KO |
381 | { |
382 | #ifdef CONFIG_BCACHEFS_DEBUG | |
d108efc2 | 383 | const struct bset_tree *t; |
1c6fdbd8 KO |
384 | |
385 | for_each_bset(b, t) { | |
386 | if (t->aux_data_offset == U16_MAX) | |
387 | continue; | |
388 | ||
389 | BUG_ON(t != b->set && | |
390 | t[-1].aux_data_offset == U16_MAX); | |
391 | ||
392 | BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); | |
393 | BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); | |
394 | BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); | |
395 | } | |
396 | #endif | |
397 | } | |
398 | ||
29364f34 | 399 | void bch2_btree_keys_init(struct btree *b) |
1c6fdbd8 KO |
400 | { |
401 | unsigned i; | |
402 | ||
403 | b->nsets = 0; | |
404 | memset(&b->nr, 0, sizeof(b->nr)); | |
29364f34 | 405 | |
1c6fdbd8 KO |
406 | for (i = 0; i < MAX_BSETS; i++) |
407 | b->set[i].data_offset = U16_MAX; | |
408 | ||
409 | bch2_bset_set_no_aux_tree(b, b->set); | |
410 | } | |
411 | ||
412 | /* Binary tree stuff for auxiliary search trees */ | |
413 | ||
414 | /* | |
415 | * Cacheline/offset <-> bkey pointer arithmetic: | |
416 | * | |
417 | * t->tree is a binary search tree in an array; each node corresponds to a key | |
418 | * in one cacheline in t->set (BSET_CACHELINE bytes). | |
419 | * | |
420 | * This means we don't have to store the full index of the key that a node in | |
421 | * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and | |
422 | * then bkey_float->m gives us the offset within that cacheline, in units of 8 | |
423 | * bytes. | |
424 | * | |
425 | * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to | |
426 | * make this work. | |
427 | * | |
428 | * To construct the bfloat for an arbitrary key we need to know what the key | |
429 | * immediately preceding it is: we have to check if the two keys differ in the | |
430 | * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size | |
431 | * of the previous key so we can walk backwards to it from t->tree[j]'s key. | |
432 | */ | |
433 | ||
434 | static inline void *bset_cacheline(const struct btree *b, | |
435 | const struct bset_tree *t, | |
436 | unsigned cacheline) | |
437 | { | |
438 | return (void *) round_down((unsigned long) btree_bkey_first(b, t), | |
439 | L1_CACHE_BYTES) + | |
440 | cacheline * BSET_CACHELINE; | |
441 | } | |
442 | ||
443 | static struct bkey_packed *cacheline_to_bkey(const struct btree *b, | |
444 | const struct bset_tree *t, | |
445 | unsigned cacheline, | |
446 | unsigned offset) | |
447 | { | |
448 | return bset_cacheline(b, t, cacheline) + offset * 8; | |
449 | } | |
450 | ||
451 | static unsigned bkey_to_cacheline(const struct btree *b, | |
452 | const struct bset_tree *t, | |
453 | const struct bkey_packed *k) | |
454 | { | |
455 | return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; | |
456 | } | |
457 | ||
458 | static ssize_t __bkey_to_cacheline_offset(const struct btree *b, | |
459 | const struct bset_tree *t, | |
460 | unsigned cacheline, | |
461 | const struct bkey_packed *k) | |
462 | { | |
463 | return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); | |
464 | } | |
465 | ||
466 | static unsigned bkey_to_cacheline_offset(const struct btree *b, | |
467 | const struct bset_tree *t, | |
468 | unsigned cacheline, | |
469 | const struct bkey_packed *k) | |
470 | { | |
471 | size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); | |
472 | ||
473 | EBUG_ON(m > U8_MAX); | |
474 | return m; | |
475 | } | |
476 | ||
477 | static inline struct bkey_packed *tree_to_bkey(const struct btree *b, | |
478 | const struct bset_tree *t, | |
479 | unsigned j) | |
480 | { | |
481 | return cacheline_to_bkey(b, t, | |
72492d55 | 482 | __eytzinger1_to_inorder(j, t->size - 1, t->extra), |
1c6fdbd8 KO |
483 | bkey_float(b, t, j)->key_offset); |
484 | } | |
485 | ||
486 | static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, | |
487 | const struct bset_tree *t, | |
488 | unsigned j) | |
489 | { | |
490 | unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; | |
491 | ||
5cfd6977 | 492 | return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s); |
1c6fdbd8 KO |
493 | } |
494 | ||
495 | static struct rw_aux_tree *rw_aux_tree(const struct btree *b, | |
496 | const struct bset_tree *t) | |
497 | { | |
498 | EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); | |
499 | ||
500 | return __aux_tree_base(b, t); | |
501 | } | |
502 | ||
503 | /* | |
504 | * For the write set - the one we're currently inserting keys into - we don't | |
505 | * maintain a full search tree, we just keep a simple lookup table in t->prev. | |
506 | */ | |
507 | static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, | |
508 | struct bset_tree *t, | |
509 | unsigned j) | |
510 | { | |
511 | return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); | |
512 | } | |
513 | ||
514 | static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, | |
515 | unsigned j, struct bkey_packed *k) | |
516 | { | |
517 | EBUG_ON(k >= btree_bkey_last(b, t)); | |
518 | ||
519 | rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { | |
520 | .offset = __btree_node_key_to_offset(b, k), | |
521 | .k = bkey_unpack_pos(b, k), | |
522 | }; | |
523 | } | |
524 | ||
525 | static void bch2_bset_verify_rw_aux_tree(struct btree *b, | |
526 | struct bset_tree *t) | |
527 | { | |
528 | struct bkey_packed *k = btree_bkey_first(b, t); | |
529 | unsigned j = 0; | |
530 | ||
29364f34 | 531 | if (!bch2_expensive_debug_checks) |
1c6fdbd8 KO |
532 | return; |
533 | ||
534 | BUG_ON(bset_has_ro_aux_tree(t)); | |
535 | ||
536 | if (!bset_has_rw_aux_tree(t)) | |
537 | return; | |
538 | ||
539 | BUG_ON(t->size < 1); | |
540 | BUG_ON(rw_aux_to_bkey(b, t, j) != k); | |
541 | ||
542 | goto start; | |
543 | while (1) { | |
544 | if (rw_aux_to_bkey(b, t, j) == k) { | |
e88a75eb | 545 | BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k, |
1c6fdbd8 KO |
546 | bkey_unpack_pos(b, k))); |
547 | start: | |
548 | if (++j == t->size) | |
549 | break; | |
550 | ||
551 | BUG_ON(rw_aux_tree(b, t)[j].offset <= | |
552 | rw_aux_tree(b, t)[j - 1].offset); | |
553 | } | |
554 | ||
ac2ccddc | 555 | k = bkey_p_next(k); |
1c6fdbd8 KO |
556 | BUG_ON(k >= btree_bkey_last(b, t)); |
557 | } | |
558 | } | |
559 | ||
560 | /* returns idx of first entry >= offset: */ | |
561 | static unsigned rw_aux_tree_bsearch(struct btree *b, | |
562 | struct bset_tree *t, | |
563 | unsigned offset) | |
564 | { | |
617391ba KO |
565 | unsigned bset_offs = offset - btree_bkey_first_offset(t); |
566 | unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); | |
567 | unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0; | |
1c6fdbd8 KO |
568 | |
569 | EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); | |
617391ba KO |
570 | EBUG_ON(!t->size); |
571 | EBUG_ON(idx > t->size); | |
1c6fdbd8 | 572 | |
617391ba KO |
573 | while (idx < t->size && |
574 | rw_aux_tree(b, t)[idx].offset < offset) | |
575 | idx++; | |
1c6fdbd8 | 576 | |
617391ba KO |
577 | while (idx && |
578 | rw_aux_tree(b, t)[idx - 1].offset >= offset) | |
579 | idx--; | |
1c6fdbd8 | 580 | |
617391ba KO |
581 | EBUG_ON(idx < t->size && |
582 | rw_aux_tree(b, t)[idx].offset < offset); | |
583 | EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); | |
584 | EBUG_ON(idx + 1 < t->size && | |
585 | rw_aux_tree(b, t)[idx].offset == | |
586 | rw_aux_tree(b, t)[idx + 1].offset); | |
1c6fdbd8 | 587 | |
617391ba | 588 | return idx; |
1c6fdbd8 KO |
589 | } |
590 | ||
1c6fdbd8 KO |
591 | static inline unsigned bkey_mantissa(const struct bkey_packed *k, |
592 | const struct bkey_float *f, | |
593 | unsigned idx) | |
594 | { | |
595 | u64 v; | |
596 | ||
597 | EBUG_ON(!bkey_packed(k)); | |
598 | ||
599 | v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); | |
600 | ||
601 | /* | |
602 | * In little endian, we're shifting off low bits (and then the bits we | |
603 | * want are at the low end), in big endian we're shifting off high bits | |
604 | * (and then the bits we want are at the high end, so we shift them | |
605 | * back down): | |
606 | */ | |
607 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |
608 | v >>= f->exponent & 7; | |
609 | #else | |
b904a799 | 610 | v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; |
1c6fdbd8 | 611 | #endif |
b904a799 | 612 | return (u16) v; |
1c6fdbd8 KO |
613 | } |
614 | ||
73bd774d KO |
615 | static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, |
616 | unsigned j, | |
617 | struct bkey_packed *min_key, | |
618 | struct bkey_packed *max_key) | |
1c6fdbd8 KO |
619 | { |
620 | struct bkey_float *f = bkey_float(b, t, j); | |
621 | struct bkey_packed *m = tree_to_bkey(b, t, j); | |
9ae82fe6 KO |
622 | struct bkey_packed *l = is_power_of_2(j) |
623 | ? min_key | |
624 | : tree_to_prev_bkey(b, t, j >> ffs(j)); | |
625 | struct bkey_packed *r = is_power_of_2(j + 1) | |
626 | ? max_key | |
627 | : tree_to_bkey(b, t, j >> (ffz(j) + 1)); | |
1c6fdbd8 KO |
628 | unsigned mantissa; |
629 | int shift, exponent, high_bit; | |
630 | ||
1c6fdbd8 KO |
631 | /* |
632 | * for failed bfloats, the lookup code falls back to comparing against | |
633 | * the original key. | |
634 | */ | |
635 | ||
1bdb67e8 | 636 | if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || |
1c6fdbd8 KO |
637 | !b->nr_key_bits) { |
638 | f->exponent = BFLOAT_FAILED_UNPACKED; | |
639 | return; | |
640 | } | |
641 | ||
642 | /* | |
643 | * The greatest differing bit of l and r is the first bit we must | |
644 | * include in the bfloat mantissa we're creating in order to do | |
645 | * comparisons - that bit always becomes the high bit of | |
646 | * bfloat->mantissa, and thus the exponent we're calculating here is | |
647 | * the position of what will become the low bit in bfloat->mantissa: | |
648 | * | |
649 | * Note that this may be negative - we may be running off the low end | |
650 | * of the key: we handle this later: | |
651 | */ | |
652 | high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), | |
b904a799 KO |
653 | min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); |
654 | exponent = high_bit - (BKEY_MANTISSA_BITS - 1); | |
1c6fdbd8 KO |
655 | |
656 | /* | |
657 | * Then we calculate the actual shift value, from the start of the key | |
658 | * (k->_data), to get the key bits starting at exponent: | |
659 | */ | |
660 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |
661 | shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; | |
662 | ||
b904a799 | 663 | EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); |
1c6fdbd8 KO |
664 | #else |
665 | shift = high_bit_offset + | |
666 | b->nr_key_bits - | |
667 | exponent - | |
b904a799 | 668 | BKEY_MANTISSA_BITS; |
1c6fdbd8 KO |
669 | |
670 | EBUG_ON(shift < KEY_PACKED_BITS_START); | |
671 | #endif | |
672 | EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); | |
673 | ||
674 | f->exponent = shift; | |
675 | mantissa = bkey_mantissa(m, f, j); | |
676 | ||
677 | /* | |
678 | * If we've got garbage bits, set them to all 1s - it's legal for the | |
679 | * bfloat to compare larger than the original key, but not smaller: | |
680 | */ | |
681 | if (exponent < 0) | |
682 | mantissa |= ~(~0U << -exponent); | |
683 | ||
b904a799 | 684 | f->mantissa = mantissa; |
1c6fdbd8 KO |
685 | } |
686 | ||
687 | /* bytes remaining - only valid for last bset: */ | |
d108efc2 | 688 | static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) |
1c6fdbd8 KO |
689 | { |
690 | bset_aux_tree_verify(b); | |
691 | ||
692 | return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); | |
693 | } | |
694 | ||
d108efc2 | 695 | static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) |
1c6fdbd8 | 696 | { |
b904a799 KO |
697 | return __bset_tree_capacity(b, t) / |
698 | (sizeof(struct bkey_float) + sizeof(u8)); | |
1c6fdbd8 KO |
699 | } |
700 | ||
d108efc2 | 701 | static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) |
1c6fdbd8 KO |
702 | { |
703 | return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); | |
704 | } | |
705 | ||
9ae82fe6 | 706 | static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) |
1c6fdbd8 KO |
707 | { |
708 | struct bkey_packed *k; | |
709 | ||
710 | t->size = 1; | |
711 | t->extra = BSET_RW_AUX_TREE_VAL; | |
712 | rw_aux_tree(b, t)[0].offset = | |
713 | __btree_node_key_to_offset(b, btree_bkey_first(b, t)); | |
714 | ||
ad44bdc3 | 715 | bset_tree_for_each_key(b, t, k) { |
1c6fdbd8 KO |
716 | if (t->size == bset_rw_tree_capacity(b, t)) |
717 | break; | |
718 | ||
719 | if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > | |
720 | L1_CACHE_BYTES) | |
721 | rw_aux_tree_set(b, t, t->size++, k); | |
722 | } | |
723 | } | |
724 | ||
9ae82fe6 | 725 | static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) |
1c6fdbd8 KO |
726 | { |
727 | struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); | |
9ae82fe6 | 728 | struct bkey_i min_key, max_key; |
3fe8a186 | 729 | unsigned cacheline = 1; |
1c6fdbd8 | 730 | |
1c6fdbd8 KO |
731 | t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), |
732 | bset_ro_tree_capacity(b, t)); | |
733 | retry: | |
734 | if (t->size < 2) { | |
735 | t->size = 0; | |
736 | t->extra = BSET_NO_AUX_TREE_VAL; | |
737 | return; | |
738 | } | |
739 | ||
740 | t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; | |
741 | ||
742 | /* First we figure out where the first key in each cacheline is */ | |
72492d55 | 743 | eytzinger1_for_each(j, t->size - 1) { |
1c6fdbd8 | 744 | while (bkey_to_cacheline(b, t, k) < cacheline) |
ac2ccddc | 745 | prev = k, k = bkey_p_next(k); |
1c6fdbd8 KO |
746 | |
747 | if (k >= btree_bkey_last(b, t)) { | |
748 | /* XXX: this path sucks */ | |
749 | t->size--; | |
750 | goto retry; | |
751 | } | |
752 | ||
753 | ro_aux_tree_prev(b, t)[j] = prev->u64s; | |
754 | bkey_float(b, t, j)->key_offset = | |
755 | bkey_to_cacheline_offset(b, t, cacheline++, k); | |
756 | ||
757 | EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); | |
758 | EBUG_ON(tree_to_bkey(b, t, j) != k); | |
759 | } | |
760 | ||
ad44bdc3 | 761 | while (k != btree_bkey_last(b, t)) |
ac2ccddc | 762 | prev = k, k = bkey_p_next(k); |
1c6fdbd8 | 763 | |
c7e04e22 KO |
764 | if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { |
765 | bkey_init(&min_key.k); | |
766 | min_key.k.p = b->data->min_key; | |
767 | } | |
768 | ||
769 | if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { | |
770 | bkey_init(&max_key.k); | |
3ce8b463 | 771 | max_key.k.p = b->data->max_key; |
c7e04e22 | 772 | } |
9ae82fe6 | 773 | |
1c6fdbd8 | 774 | /* Then we build the tree */ |
72492d55 | 775 | eytzinger1_for_each(j, t->size - 1) |
17563164 KO |
776 | make_bfloat(b, t, j, |
777 | bkey_to_packed(&min_key), | |
778 | bkey_to_packed(&max_key)); | |
1c6fdbd8 KO |
779 | } |
780 | ||
781 | static void bset_alloc_tree(struct btree *b, struct bset_tree *t) | |
782 | { | |
783 | struct bset_tree *i; | |
784 | ||
785 | for (i = b->set; i != t; i++) | |
786 | BUG_ON(bset_has_rw_aux_tree(i)); | |
787 | ||
788 | bch2_bset_set_no_aux_tree(b, t); | |
789 | ||
790 | /* round up to next cacheline: */ | |
791 | t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), | |
792 | SMP_CACHE_BYTES / sizeof(u64)); | |
793 | ||
794 | bset_aux_tree_verify(b); | |
795 | } | |
796 | ||
797 | void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, | |
798 | bool writeable) | |
799 | { | |
800 | if (writeable | |
801 | ? bset_has_rw_aux_tree(t) | |
802 | : bset_has_ro_aux_tree(t)) | |
803 | return; | |
804 | ||
805 | bset_alloc_tree(b, t); | |
806 | ||
807 | if (!__bset_tree_capacity(b, t)) | |
808 | return; | |
809 | ||
810 | if (writeable) | |
811 | __build_rw_aux_tree(b, t); | |
812 | else | |
813 | __build_ro_aux_tree(b, t); | |
814 | ||
815 | bset_aux_tree_verify(b); | |
816 | } | |
817 | ||
818 | void bch2_bset_init_first(struct btree *b, struct bset *i) | |
819 | { | |
820 | struct bset_tree *t; | |
821 | ||
822 | BUG_ON(b->nsets); | |
823 | ||
824 | memset(i, 0, sizeof(*i)); | |
825 | get_random_bytes(&i->seq, sizeof(i->seq)); | |
826 | SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); | |
827 | ||
828 | t = &b->set[b->nsets++]; | |
829 | set_btree_bset(b, t, i); | |
830 | } | |
831 | ||
ec4edd7b | 832 | void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne) |
1c6fdbd8 KO |
833 | { |
834 | struct bset *i = &bne->keys; | |
835 | struct bset_tree *t; | |
836 | ||
ec4edd7b | 837 | BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b)); |
1c6fdbd8 KO |
838 | BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); |
839 | BUG_ON(b->nsets >= MAX_BSETS); | |
840 | ||
841 | memset(i, 0, sizeof(*i)); | |
842 | i->seq = btree_bset_first(b)->seq; | |
843 | SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); | |
844 | ||
845 | t = &b->set[b->nsets++]; | |
846 | set_btree_bset(b, t, i); | |
847 | } | |
848 | ||
849 | /* | |
850 | * find _some_ key in the same bset as @k that precedes @k - not necessarily the | |
851 | * immediate predecessor: | |
852 | */ | |
853 | static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, | |
854 | struct bkey_packed *k) | |
855 | { | |
856 | struct bkey_packed *p; | |
857 | unsigned offset; | |
858 | int j; | |
859 | ||
860 | EBUG_ON(k < btree_bkey_first(b, t) || | |
861 | k > btree_bkey_last(b, t)); | |
862 | ||
863 | if (k == btree_bkey_first(b, t)) | |
864 | return NULL; | |
865 | ||
866 | switch (bset_aux_tree_type(t)) { | |
867 | case BSET_NO_AUX_TREE: | |
868 | p = btree_bkey_first(b, t); | |
869 | break; | |
870 | case BSET_RO_AUX_TREE: | |
871 | j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); | |
872 | ||
873 | do { | |
874 | p = j ? tree_to_bkey(b, t, | |
875 | __inorder_to_eytzinger1(j--, | |
72492d55 | 876 | t->size - 1, t->extra)) |
1c6fdbd8 KO |
877 | : btree_bkey_first(b, t); |
878 | } while (p >= k); | |
879 | break; | |
880 | case BSET_RW_AUX_TREE: | |
881 | offset = __btree_node_key_to_offset(b, k); | |
882 | j = rw_aux_tree_bsearch(b, t, offset); | |
883 | p = j ? rw_aux_to_bkey(b, t, j - 1) | |
884 | : btree_bkey_first(b, t); | |
885 | break; | |
886 | } | |
887 | ||
888 | return p; | |
889 | } | |
890 | ||
891 | struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, | |
892 | struct bset_tree *t, | |
893 | struct bkey_packed *k, | |
894 | unsigned min_key_type) | |
895 | { | |
896 | struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; | |
897 | ||
898 | while ((p = __bkey_prev(b, t, k)) && !ret) { | |
ac2ccddc | 899 | for (i = p; i != k; i = bkey_p_next(i)) |
1c6fdbd8 KO |
900 | if (i->type >= min_key_type) |
901 | ret = i; | |
902 | ||
903 | k = p; | |
904 | } | |
905 | ||
29364f34 | 906 | if (bch2_expensive_debug_checks) { |
1c6fdbd8 KO |
907 | BUG_ON(ret >= orig_k); |
908 | ||
ad44bdc3 | 909 | for (i = ret |
ac2ccddc | 910 | ? bkey_p_next(ret) |
ad44bdc3 | 911 | : btree_bkey_first(b, t); |
1c6fdbd8 | 912 | i != orig_k; |
ac2ccddc | 913 | i = bkey_p_next(i)) |
1c6fdbd8 KO |
914 | BUG_ON(i->type >= min_key_type); |
915 | } | |
916 | ||
917 | return ret; | |
918 | } | |
919 | ||
920 | /* Insert */ | |
921 | ||
1c6fdbd8 KO |
922 | static void bch2_bset_fix_lookup_table(struct btree *b, |
923 | struct bset_tree *t, | |
924 | struct bkey_packed *_where, | |
925 | unsigned clobber_u64s, | |
926 | unsigned new_u64s) | |
927 | { | |
928 | int shift = new_u64s - clobber_u64s; | |
929 | unsigned l, j, where = __btree_node_key_to_offset(b, _where); | |
930 | ||
931 | EBUG_ON(bset_has_ro_aux_tree(t)); | |
932 | ||
933 | if (!bset_has_rw_aux_tree(t)) | |
934 | return; | |
935 | ||
617391ba | 936 | /* returns first entry >= where */ |
1c6fdbd8 KO |
937 | l = rw_aux_tree_bsearch(b, t, where); |
938 | ||
1c6fdbd8 KO |
939 | if (!l) /* never delete first entry */ |
940 | l++; | |
941 | else if (l < t->size && | |
942 | where < t->end_offset && | |
943 | rw_aux_tree(b, t)[l].offset == where) | |
944 | rw_aux_tree_set(b, t, l++, _where); | |
945 | ||
946 | /* l now > where */ | |
947 | ||
948 | for (j = l; | |
949 | j < t->size && | |
950 | rw_aux_tree(b, t)[j].offset < where + clobber_u64s; | |
951 | j++) | |
952 | ; | |
953 | ||
954 | if (j < t->size && | |
955 | rw_aux_tree(b, t)[j].offset + shift == | |
956 | rw_aux_tree(b, t)[l - 1].offset) | |
957 | j++; | |
958 | ||
959 | memmove(&rw_aux_tree(b, t)[l], | |
960 | &rw_aux_tree(b, t)[j], | |
961 | (void *) &rw_aux_tree(b, t)[t->size] - | |
962 | (void *) &rw_aux_tree(b, t)[j]); | |
963 | t->size -= j - l; | |
964 | ||
965 | for (j = l; j < t->size; j++) | |
3e3e02e6 | 966 | rw_aux_tree(b, t)[j].offset += shift; |
1c6fdbd8 KO |
967 | |
968 | EBUG_ON(l < t->size && | |
969 | rw_aux_tree(b, t)[l].offset == | |
970 | rw_aux_tree(b, t)[l - 1].offset); | |
971 | ||
972 | if (t->size < bset_rw_tree_capacity(b, t) && | |
973 | (l < t->size | |
974 | ? rw_aux_tree(b, t)[l].offset | |
975 | : t->end_offset) - | |
976 | rw_aux_tree(b, t)[l - 1].offset > | |
977 | L1_CACHE_BYTES / sizeof(u64)) { | |
978 | struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); | |
979 | struct bkey_packed *end = l < t->size | |
980 | ? rw_aux_to_bkey(b, t, l) | |
981 | : btree_bkey_last(b, t); | |
982 | struct bkey_packed *k = start; | |
983 | ||
984 | while (1) { | |
ac2ccddc | 985 | k = bkey_p_next(k); |
1c6fdbd8 KO |
986 | if (k == end) |
987 | break; | |
988 | ||
989 | if ((void *) k - (void *) start >= L1_CACHE_BYTES) { | |
990 | memmove(&rw_aux_tree(b, t)[l + 1], | |
991 | &rw_aux_tree(b, t)[l], | |
992 | (void *) &rw_aux_tree(b, t)[t->size] - | |
993 | (void *) &rw_aux_tree(b, t)[l]); | |
994 | t->size++; | |
995 | rw_aux_tree_set(b, t, l, k); | |
996 | break; | |
997 | } | |
998 | } | |
999 | } | |
1000 | ||
1001 | bch2_bset_verify_rw_aux_tree(b, t); | |
1002 | bset_aux_tree_verify(b); | |
1003 | } | |
1004 | ||
1005 | void bch2_bset_insert(struct btree *b, | |
1006 | struct btree_node_iter *iter, | |
1007 | struct bkey_packed *where, | |
1008 | struct bkey_i *insert, | |
1009 | unsigned clobber_u64s) | |
1010 | { | |
1011 | struct bkey_format *f = &b->format; | |
1012 | struct bset_tree *t = bset_tree_last(b); | |
1013 | struct bkey_packed packed, *src = bkey_to_packed(insert); | |
1014 | ||
1015 | bch2_bset_verify_rw_aux_tree(b, t); | |
271a3d3a | 1016 | bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); |
1c6fdbd8 KO |
1017 | |
1018 | if (bch2_bkey_pack_key(&packed, &insert->k, f)) | |
1019 | src = &packed; | |
1020 | ||
c052cf82 | 1021 | if (!bkey_deleted(&insert->k)) |
1c6fdbd8 KO |
1022 | btree_keys_account_key_add(&b->nr, t - b->set, src); |
1023 | ||
1024 | if (src->u64s != clobber_u64s) { | |
5cfd6977 KO |
1025 | u64 *src_p = (u64 *) where->_data + clobber_u64s; |
1026 | u64 *dst_p = (u64 *) where->_data + src->u64s; | |
1c6fdbd8 KO |
1027 | |
1028 | EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < | |
1029 | (int) clobber_u64s - src->u64s); | |
1030 | ||
1031 | memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); | |
1032 | le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); | |
1033 | set_btree_bset_end(b, t); | |
1034 | } | |
1035 | ||
d598a9b7 | 1036 | memcpy_u64s_small(where, src, |
1c6fdbd8 KO |
1037 | bkeyp_key_u64s(f, src)); |
1038 | memcpy_u64s(bkeyp_val(f, where), &insert->v, | |
1039 | bkeyp_val_u64s(f, src)); | |
1040 | ||
fdf22400 KO |
1041 | if (src->u64s != clobber_u64s) |
1042 | bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); | |
1c6fdbd8 | 1043 | |
1c6fdbd8 KO |
1044 | bch2_verify_btree_nr_keys(b); |
1045 | } | |
1046 | ||
1047 | void bch2_bset_delete(struct btree *b, | |
1048 | struct bkey_packed *where, | |
1049 | unsigned clobber_u64s) | |
1050 | { | |
1051 | struct bset_tree *t = bset_tree_last(b); | |
5cfd6977 | 1052 | u64 *src_p = (u64 *) where->_data + clobber_u64s; |
1c6fdbd8 KO |
1053 | u64 *dst_p = where->_data; |
1054 | ||
1055 | bch2_bset_verify_rw_aux_tree(b, t); | |
1056 | ||
1057 | EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); | |
1058 | ||
1059 | memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); | |
1060 | le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); | |
1061 | set_btree_bset_end(b, t); | |
1062 | ||
1063 | bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); | |
1064 | } | |
1065 | ||
1066 | /* Lookup */ | |
1067 | ||
1068 | __flatten | |
1069 | static struct bkey_packed *bset_search_write_set(const struct btree *b, | |
1070 | struct bset_tree *t, | |
2649b514 | 1071 | struct bpos *search) |
1c6fdbd8 KO |
1072 | { |
1073 | unsigned l = 0, r = t->size; | |
1074 | ||
1075 | while (l + 1 != r) { | |
1076 | unsigned m = (l + r) >> 1; | |
1077 | ||
e88a75eb | 1078 | if (bpos_lt(rw_aux_tree(b, t)[m].k, *search)) |
1c6fdbd8 KO |
1079 | l = m; |
1080 | else | |
1081 | r = m; | |
1082 | } | |
1083 | ||
1084 | return rw_aux_to_bkey(b, t, l); | |
1085 | } | |
1086 | ||
c4537686 KO |
1087 | static inline void prefetch_four_cachelines(void *p) |
1088 | { | |
6f152b0f | 1089 | #ifdef CONFIG_X86_64 |
2eba51a6 BH |
1090 | asm("prefetcht0 (-127 + 64 * 0)(%0);" |
1091 | "prefetcht0 (-127 + 64 * 1)(%0);" | |
1092 | "prefetcht0 (-127 + 64 * 2)(%0);" | |
1093 | "prefetcht0 (-127 + 64 * 3)(%0);" | |
c4537686 KO |
1094 | : |
1095 | : "r" (p + 127)); | |
1096 | #else | |
1097 | prefetch(p + L1_CACHE_BYTES * 0); | |
1098 | prefetch(p + L1_CACHE_BYTES * 1); | |
1099 | prefetch(p + L1_CACHE_BYTES * 2); | |
1100 | prefetch(p + L1_CACHE_BYTES * 3); | |
1101 | #endif | |
1102 | } | |
1103 | ||
58404bb2 KO |
1104 | static inline bool bkey_mantissa_bits_dropped(const struct btree *b, |
1105 | const struct bkey_float *f, | |
1106 | unsigned idx) | |
1107 | { | |
1108 | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ | |
1109 | unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; | |
1110 | ||
1111 | return f->exponent > key_bits_start; | |
1112 | #else | |
1113 | unsigned key_bits_end = high_bit_offset + b->nr_key_bits; | |
58404bb2 | 1114 | |
b904a799 | 1115 | return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; |
58404bb2 KO |
1116 | #endif |
1117 | } | |
1118 | ||
1c6fdbd8 KO |
1119 | __flatten |
1120 | static struct bkey_packed *bset_search_tree(const struct btree *b, | |
d108efc2 KO |
1121 | const struct bset_tree *t, |
1122 | const struct bpos *search, | |
1c6fdbd8 KO |
1123 | const struct bkey_packed *packed_search) |
1124 | { | |
1125 | struct ro_aux_tree *base = ro_aux_tree_base(b, t); | |
c4537686 | 1126 | struct bkey_float *f; |
58404bb2 KO |
1127 | struct bkey_packed *k; |
1128 | unsigned inorder, n = 1, l, r; | |
1129 | int cmp; | |
1c6fdbd8 | 1130 | |
c4537686 KO |
1131 | do { |
1132 | if (likely(n << 4 < t->size)) | |
b904a799 | 1133 | prefetch(&base->f[n << 4]); |
1c6fdbd8 | 1134 | |
b904a799 | 1135 | f = &base->f[n]; |
58404bb2 KO |
1136 | if (unlikely(f->exponent >= BFLOAT_FAILED)) |
1137 | goto slowpath; | |
1138 | ||
b904a799 | 1139 | l = f->mantissa; |
58404bb2 KO |
1140 | r = bkey_mantissa(packed_search, f, n); |
1141 | ||
1142 | if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) | |
1143 | goto slowpath; | |
1144 | ||
1145 | n = n * 2 + (l < r); | |
1146 | continue; | |
1147 | slowpath: | |
1148 | k = tree_to_bkey(b, t, n); | |
1149 | cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); | |
1150 | if (!cmp) | |
1151 | return k; | |
1152 | ||
1153 | n = n * 2 + (cmp < 0); | |
1c6fdbd8 KO |
1154 | } while (n < t->size); |
1155 | ||
72492d55 | 1156 | inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); |
1c6fdbd8 KO |
1157 | |
1158 | /* | |
1159 | * n would have been the node we recursed to - the low bit tells us if | |
1160 | * we recursed left or recursed right. | |
1161 | */ | |
b904a799 KO |
1162 | if (likely(!(n & 1))) { |
1163 | --inorder; | |
1164 | if (unlikely(!inorder)) | |
1c6fdbd8 | 1165 | return btree_bkey_first(b, t); |
b904a799 | 1166 | |
72492d55 | 1167 | f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; |
1c6fdbd8 | 1168 | } |
b904a799 KO |
1169 | |
1170 | return cacheline_to_bkey(b, t, inorder, f->key_offset); | |
1c6fdbd8 KO |
1171 | } |
1172 | ||
c4537686 KO |
1173 | static __always_inline __flatten |
1174 | struct bkey_packed *__bch2_bset_search(struct btree *b, | |
1c6fdbd8 | 1175 | struct bset_tree *t, |
a00fd8c5 | 1176 | struct bpos *search, |
a00fd8c5 | 1177 | const struct bkey_packed *lossy_packed_search) |
1c6fdbd8 | 1178 | { |
1c6fdbd8 KO |
1179 | |
1180 | /* | |
1181 | * First, we search for a cacheline, then lastly we do a linear search | |
1182 | * within that cacheline. | |
1183 | * | |
1184 | * To search for the cacheline, there's three different possibilities: | |
1185 | * * The set is too small to have a search tree, so we just do a linear | |
1186 | * search over the whole set. | |
1187 | * * The set is the one we're currently inserting into; keeping a full | |
1188 | * auxiliary search tree up to date would be too expensive, so we | |
1189 | * use a much simpler lookup table to do a binary search - | |
1190 | * bset_search_write_set(). | |
1191 | * * Or we use the auxiliary search tree we constructed earlier - | |
1192 | * bset_search_tree() | |
1193 | */ | |
1194 | ||
1195 | switch (bset_aux_tree_type(t)) { | |
1196 | case BSET_NO_AUX_TREE: | |
c4537686 | 1197 | return btree_bkey_first(b, t); |
1c6fdbd8 | 1198 | case BSET_RW_AUX_TREE: |
2649b514 | 1199 | return bset_search_write_set(b, t, search); |
1c6fdbd8 | 1200 | case BSET_RO_AUX_TREE: |
c4537686 KO |
1201 | return bset_search_tree(b, t, search, lossy_packed_search); |
1202 | default: | |
439c172b | 1203 | BUG(); |
1c6fdbd8 | 1204 | } |
c4537686 | 1205 | } |
1c6fdbd8 | 1206 | |
c4537686 KO |
1207 | static __always_inline __flatten |
1208 | struct bkey_packed *bch2_bset_search_linear(struct btree *b, | |
1209 | struct bset_tree *t, | |
1210 | struct bpos *search, | |
1211 | struct bkey_packed *packed_search, | |
1212 | const struct bkey_packed *lossy_packed_search, | |
1213 | struct bkey_packed *m) | |
1214 | { | |
1c6fdbd8 KO |
1215 | if (lossy_packed_search) |
1216 | while (m != btree_bkey_last(b, t) && | |
9626aeb1 KO |
1217 | bkey_iter_cmp_p_or_unp(b, m, |
1218 | lossy_packed_search, search) < 0) | |
ac2ccddc | 1219 | m = bkey_p_next(m); |
1c6fdbd8 KO |
1220 | |
1221 | if (!packed_search) | |
1222 | while (m != btree_bkey_last(b, t) && | |
9626aeb1 | 1223 | bkey_iter_pos_cmp(b, m, search) < 0) |
ac2ccddc | 1224 | m = bkey_p_next(m); |
1c6fdbd8 | 1225 | |
29364f34 | 1226 | if (bch2_expensive_debug_checks) { |
1c6fdbd8 KO |
1227 | struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); |
1228 | ||
1229 | BUG_ON(prev && | |
9626aeb1 KO |
1230 | bkey_iter_cmp_p_or_unp(b, prev, |
1231 | packed_search, search) >= 0); | |
1c6fdbd8 KO |
1232 | } |
1233 | ||
1234 | return m; | |
1235 | } | |
1236 | ||
1237 | /* Btree node iterator */ | |
1238 | ||
a00fd8c5 KO |
1239 | static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, |
1240 | struct btree *b, | |
1241 | const struct bkey_packed *k, | |
1242 | const struct bkey_packed *end) | |
1243 | { | |
1244 | if (k != end) { | |
1245 | struct btree_node_iter_set *pos; | |
1246 | ||
1247 | btree_node_iter_for_each(iter, pos) | |
1248 | ; | |
1249 | ||
1250 | BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); | |
1251 | *pos = (struct btree_node_iter_set) { | |
1252 | __btree_node_key_to_offset(b, k), | |
1253 | __btree_node_key_to_offset(b, end) | |
1254 | }; | |
1255 | } | |
1256 | } | |
1257 | ||
1c6fdbd8 KO |
1258 | void bch2_btree_node_iter_push(struct btree_node_iter *iter, |
1259 | struct btree *b, | |
1260 | const struct bkey_packed *k, | |
1261 | const struct bkey_packed *end) | |
1262 | { | |
1263 | __bch2_btree_node_iter_push(iter, b, k, end); | |
1264 | bch2_btree_node_iter_sort(iter, b); | |
1265 | } | |
1266 | ||
3e3e02e6 | 1267 | noinline __flatten __cold |
1c6fdbd8 | 1268 | static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, |
a00fd8c5 | 1269 | struct btree *b, struct bpos *search) |
1c6fdbd8 | 1270 | { |
2649b514 | 1271 | struct bkey_packed *k; |
1c6fdbd8 | 1272 | |
a00fd8c5 | 1273 | trace_bkey_pack_pos_fail(search); |
1c6fdbd8 | 1274 | |
2649b514 | 1275 | bch2_btree_node_iter_init_from_start(iter, b); |
1c6fdbd8 | 1276 | |
2649b514 KO |
1277 | while ((k = bch2_btree_node_iter_peek(iter, b)) && |
1278 | bkey_iter_pos_cmp(b, k, search) < 0) | |
1279 | bch2_btree_node_iter_advance(iter, b); | |
1c6fdbd8 KO |
1280 | } |
1281 | ||
1282 | /** | |
96dea3d5 | 1283 | * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a |
1c6fdbd8 KO |
1284 | * given position |
1285 | * | |
96dea3d5 KO |
1286 | * @iter: iterator to initialize |
1287 | * @b: btree node to search | |
1288 | * @search: search key | |
1289 | * | |
1c6fdbd8 KO |
1290 | * Main entry point to the lookup code for individual btree nodes: |
1291 | * | |
1292 | * NOTE: | |
1293 | * | |
1294 | * When you don't filter out deleted keys, btree nodes _do_ contain duplicate | |
1295 | * keys. This doesn't matter for most code, but it does matter for lookups. | |
1296 | * | |
1297 | * Some adjacent keys with a string of equal keys: | |
1298 | * i j k k k k l m | |
1299 | * | |
1300 | * If you search for k, the lookup code isn't guaranteed to return you any | |
1301 | * specific k. The lookup code is conceptually doing a binary search and | |
1302 | * iterating backwards is very expensive so if the pivot happens to land at the | |
1303 | * last k that's what you'll get. | |
1304 | * | |
1305 | * This works out ok, but it's something to be aware of: | |
1306 | * | |
1307 | * - For non extents, we guarantee that the live key comes last - see | |
1308 | * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't | |
1309 | * see will only be deleted keys you don't care about. | |
1310 | * | |
1311 | * - For extents, deleted keys sort last (see the comment at the top of this | |
1312 | * file). But when you're searching for extents, you actually want the first | |
1313 | * key strictly greater than your search key - an extent that compares equal | |
1314 | * to the search key is going to have 0 sectors after the search key. | |
1315 | * | |
1316 | * But this does mean that we can't just search for | |
e751c01a | 1317 | * bpos_successor(start_of_range) to get the first extent that overlaps with |
1c6fdbd8 KO |
1318 | * the range we want - if we're unlucky and there's an extent that ends |
1319 | * exactly where we searched, then there could be a deleted key at the same | |
1320 | * position and we'd get that when we search instead of the preceding extent | |
1321 | * we needed. | |
1322 | * | |
1323 | * So we've got to search for start_of_range, then after the lookup iterate | |
1324 | * past any extents that compare equal to the position we searched for. | |
1325 | */ | |
c4e065c2 | 1326 | __flatten |
1c6fdbd8 | 1327 | void bch2_btree_node_iter_init(struct btree_node_iter *iter, |
a00fd8c5 | 1328 | struct btree *b, struct bpos *search) |
1c6fdbd8 | 1329 | { |
1c6fdbd8 | 1330 | struct bkey_packed p, *packed_search = NULL; |
c4e065c2 | 1331 | struct btree_node_iter_set *pos = iter->data; |
c4537686 KO |
1332 | struct bkey_packed *k[MAX_BSETS]; |
1333 | unsigned i; | |
1c6fdbd8 | 1334 | |
e88a75eb KO |
1335 | EBUG_ON(bpos_lt(*search, b->data->min_key)); |
1336 | EBUG_ON(bpos_gt(*search, b->data->max_key)); | |
1c6fdbd8 KO |
1337 | bset_aux_tree_verify(b); |
1338 | ||
271a3d3a | 1339 | memset(iter, 0, sizeof(*iter)); |
1c6fdbd8 | 1340 | |
a00fd8c5 | 1341 | switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { |
1c6fdbd8 KO |
1342 | case BKEY_PACK_POS_EXACT: |
1343 | packed_search = &p; | |
1344 | break; | |
1345 | case BKEY_PACK_POS_SMALLER: | |
1346 | packed_search = NULL; | |
1347 | break; | |
1348 | case BKEY_PACK_POS_FAIL: | |
a00fd8c5 | 1349 | btree_node_iter_init_pack_failed(iter, b, search); |
1c6fdbd8 KO |
1350 | return; |
1351 | } | |
1352 | ||
c4537686 KO |
1353 | for (i = 0; i < b->nsets; i++) { |
1354 | k[i] = __bch2_bset_search(b, b->set + i, search, &p); | |
1355 | prefetch_four_cachelines(k[i]); | |
1356 | } | |
1357 | ||
1358 | for (i = 0; i < b->nsets; i++) { | |
1359 | struct bset_tree *t = b->set + i; | |
c4e065c2 KO |
1360 | struct bkey_packed *end = btree_bkey_last(b, t); |
1361 | ||
c4537686 KO |
1362 | k[i] = bch2_bset_search_linear(b, t, search, |
1363 | packed_search, &p, k[i]); | |
1364 | if (k[i] != end) | |
c4e065c2 | 1365 | *pos++ = (struct btree_node_iter_set) { |
c4537686 | 1366 | __btree_node_key_to_offset(b, k[i]), |
c4e065c2 KO |
1367 | __btree_node_key_to_offset(b, end) |
1368 | }; | |
1369 | } | |
1c6fdbd8 KO |
1370 | |
1371 | bch2_btree_node_iter_sort(iter, b); | |
1372 | } | |
1373 | ||
1374 | void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, | |
271a3d3a | 1375 | struct btree *b) |
1c6fdbd8 KO |
1376 | { |
1377 | struct bset_tree *t; | |
1378 | ||
271a3d3a | 1379 | memset(iter, 0, sizeof(*iter)); |
1c6fdbd8 KO |
1380 | |
1381 | for_each_bset(b, t) | |
1382 | __bch2_btree_node_iter_push(iter, b, | |
1383 | btree_bkey_first(b, t), | |
1384 | btree_bkey_last(b, t)); | |
1385 | bch2_btree_node_iter_sort(iter, b); | |
1386 | } | |
1387 | ||
1388 | struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, | |
1389 | struct btree *b, | |
1390 | struct bset_tree *t) | |
1391 | { | |
1392 | struct btree_node_iter_set *set; | |
1393 | ||
1394 | btree_node_iter_for_each(iter, set) | |
1395 | if (set->end == t->end_offset) | |
1396 | return __btree_node_offset_to_key(b, set->k); | |
1397 | ||
1398 | return btree_bkey_last(b, t); | |
1399 | } | |
1400 | ||
1401 | static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, | |
1402 | struct btree *b, | |
1403 | unsigned first) | |
1404 | { | |
1405 | bool ret; | |
1406 | ||
271a3d3a | 1407 | if ((ret = (btree_node_iter_cmp(b, |
1c6fdbd8 KO |
1408 | iter->data[first], |
1409 | iter->data[first + 1]) > 0))) | |
1410 | swap(iter->data[first], iter->data[first + 1]); | |
1411 | return ret; | |
1412 | } | |
1413 | ||
1414 | void bch2_btree_node_iter_sort(struct btree_node_iter *iter, | |
1415 | struct btree *b) | |
1416 | { | |
1417 | /* unrolled bubble sort: */ | |
1418 | ||
1419 | if (!__btree_node_iter_set_end(iter, 2)) { | |
1420 | btree_node_iter_sort_two(iter, b, 0); | |
1421 | btree_node_iter_sort_two(iter, b, 1); | |
1422 | } | |
1423 | ||
1424 | if (!__btree_node_iter_set_end(iter, 1)) | |
1425 | btree_node_iter_sort_two(iter, b, 0); | |
1426 | } | |
1427 | ||
1428 | void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, | |
1429 | struct btree_node_iter_set *set) | |
1430 | { | |
1431 | struct btree_node_iter_set *last = | |
1432 | iter->data + ARRAY_SIZE(iter->data) - 1; | |
1433 | ||
1434 | memmove(&set[0], &set[1], (void *) last - (void *) set); | |
1435 | *last = (struct btree_node_iter_set) { 0, 0 }; | |
1436 | } | |
1437 | ||
1438 | static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, | |
1439 | struct btree *b) | |
1440 | { | |
1441 | iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; | |
1442 | ||
1443 | EBUG_ON(iter->data->k > iter->data->end); | |
1444 | ||
1445 | if (unlikely(__btree_node_iter_set_end(iter, 0))) { | |
005def8f KO |
1446 | /* avoid an expensive memmove call: */ |
1447 | iter->data[0] = iter->data[1]; | |
1448 | iter->data[1] = iter->data[2]; | |
1449 | iter->data[2] = (struct btree_node_iter_set) { 0, 0 }; | |
1c6fdbd8 KO |
1450 | return; |
1451 | } | |
1452 | ||
1453 | if (__btree_node_iter_set_end(iter, 1)) | |
1454 | return; | |
1455 | ||
1456 | if (!btree_node_iter_sort_two(iter, b, 0)) | |
1457 | return; | |
1458 | ||
1459 | if (__btree_node_iter_set_end(iter, 2)) | |
1460 | return; | |
1461 | ||
1462 | btree_node_iter_sort_two(iter, b, 1); | |
1463 | } | |
1464 | ||
1c6fdbd8 KO |
1465 | void bch2_btree_node_iter_advance(struct btree_node_iter *iter, |
1466 | struct btree *b) | |
1467 | { | |
29364f34 | 1468 | if (bch2_expensive_debug_checks) { |
f13f5a8c KO |
1469 | bch2_btree_node_iter_verify(iter, b); |
1470 | bch2_btree_node_iter_next_check(iter, b); | |
1471 | } | |
1472 | ||
271a3d3a | 1473 | __bch2_btree_node_iter_advance(iter, b); |
1c6fdbd8 KO |
1474 | } |
1475 | ||
1c6fdbd8 KO |
1476 | /* |
1477 | * Expensive: | |
1478 | */ | |
e67ab045 KO |
1479 | struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, |
1480 | struct btree *b) | |
1c6fdbd8 KO |
1481 | { |
1482 | struct bkey_packed *k, *prev = NULL; | |
1c6fdbd8 KO |
1483 | struct btree_node_iter_set *set; |
1484 | struct bset_tree *t; | |
ac10a961 | 1485 | unsigned end = 0; |
1c6fdbd8 | 1486 | |
29364f34 | 1487 | if (bch2_expensive_debug_checks) |
2dac0eae | 1488 | bch2_btree_node_iter_verify(iter, b); |
1c6fdbd8 KO |
1489 | |
1490 | for_each_bset(b, t) { | |
e67ab045 KO |
1491 | k = bch2_bkey_prev_all(b, t, |
1492 | bch2_btree_node_iter_bset_pos(iter, b, t)); | |
1c6fdbd8 | 1493 | if (k && |
a00fd8c5 | 1494 | (!prev || bkey_iter_cmp(b, k, prev) > 0)) { |
1c6fdbd8 KO |
1495 | prev = k; |
1496 | end = t->end_offset; | |
1497 | } | |
1498 | } | |
1499 | ||
1500 | if (!prev) | |
e67ab045 | 1501 | return NULL; |
1c6fdbd8 KO |
1502 | |
1503 | /* | |
1504 | * We're manually memmoving instead of just calling sort() to ensure the | |
1505 | * prev we picked ends up in slot 0 - sort won't necessarily put it | |
1506 | * there because of duplicate deleted keys: | |
1507 | */ | |
1508 | btree_node_iter_for_each(iter, set) | |
1509 | if (set->end == end) | |
1510 | goto found; | |
1511 | ||
1512 | BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); | |
1513 | found: | |
1514 | BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); | |
1515 | ||
1516 | memmove(&iter->data[1], | |
1517 | &iter->data[0], | |
1518 | (void *) set - (void *) &iter->data[0]); | |
1519 | ||
1520 | iter->data[0].k = __btree_node_key_to_offset(b, prev); | |
1521 | iter->data[0].end = end; | |
1c6fdbd8 | 1522 | |
29364f34 | 1523 | if (bch2_expensive_debug_checks) |
2dac0eae | 1524 | bch2_btree_node_iter_verify(iter, b); |
e67ab045 KO |
1525 | return prev; |
1526 | } | |
1c6fdbd8 | 1527 | |
c052cf82 KO |
1528 | struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, |
1529 | struct btree *b) | |
e67ab045 KO |
1530 | { |
1531 | struct bkey_packed *prev; | |
1532 | ||
1533 | do { | |
1534 | prev = bch2_btree_node_iter_prev_all(iter, b); | |
c052cf82 | 1535 | } while (prev && bkey_deleted(prev)); |
1c6fdbd8 KO |
1536 | |
1537 | return prev; | |
1538 | } | |
1539 | ||
1540 | struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, | |
1541 | struct btree *b, | |
1542 | struct bkey *u) | |
1543 | { | |
1544 | struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); | |
1545 | ||
1546 | return k ? bkey_disassemble(b, k, u) : bkey_s_c_null; | |
1547 | } | |
1548 | ||
1549 | /* Mergesort */ | |
1550 | ||
a345b0f3 | 1551 | void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) |
1c6fdbd8 | 1552 | { |
a345b0f3 | 1553 | const struct bset_tree *t; |
1c6fdbd8 KO |
1554 | |
1555 | for_each_bset(b, t) { | |
1556 | enum bset_aux_tree_type type = bset_aux_tree_type(t); | |
1557 | size_t j; | |
1558 | ||
1559 | stats->sets[type].nr++; | |
1560 | stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * | |
1561 | sizeof(u64); | |
1562 | ||
1563 | if (bset_has_ro_aux_tree(t)) { | |
1564 | stats->floats += t->size - 1; | |
1565 | ||
1566 | for (j = 1; j < t->size; j++) | |
58404bb2 KO |
1567 | stats->failed += |
1568 | bkey_float(b, t, j)->exponent == | |
1569 | BFLOAT_FAILED; | |
1c6fdbd8 KO |
1570 | } |
1571 | } | |
1572 | } | |
1573 | ||
319f9ac3 KO |
1574 | void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, |
1575 | struct bkey_packed *k) | |
1c6fdbd8 KO |
1576 | { |
1577 | struct bset_tree *t = bch2_bkey_to_bset(b, k); | |
1bdb67e8 | 1578 | struct bkey uk; |
b564513c | 1579 | unsigned j, inorder; |
1c6fdbd8 | 1580 | |
1c6fdbd8 | 1581 | if (!bset_has_ro_aux_tree(t)) |
319f9ac3 | 1582 | return; |
1c6fdbd8 | 1583 | |
b564513c KO |
1584 | inorder = bkey_to_cacheline(b, t, k); |
1585 | if (!inorder || inorder >= t->size) | |
319f9ac3 | 1586 | return; |
b564513c | 1587 | |
72492d55 | 1588 | j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); |
b564513c | 1589 | if (k != tree_to_bkey(b, t, j)) |
319f9ac3 | 1590 | return; |
b564513c KO |
1591 | |
1592 | switch (bkey_float(b, t, j)->exponent) { | |
58404bb2 | 1593 | case BFLOAT_FAILED: |
b564513c | 1594 | uk = bkey_unpack_key(b, k); |
401ec4db | 1595 | prt_printf(out, |
319f9ac3 | 1596 | " failed unpacked at depth %u\n" |
f020bfcd KO |
1597 | "\t", |
1598 | ilog2(j)); | |
1599 | bch2_bpos_to_text(out, uk.p); | |
401ec4db | 1600 | prt_printf(out, "\n"); |
319f9ac3 | 1601 | break; |
b564513c | 1602 | } |
1c6fdbd8 | 1603 | } |