bcachefs: make struct btree_iter a bit smaller
[linux-block.git] fs/bcachefs/btree_types.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_TYPES_H
#define _BCACHEFS_BTREE_TYPES_H

#include <linux/list.h>
#include <linux/rhashtable.h>

#include "bkey_methods.h"
#include "journal_types.h"
#include "six.h"

struct open_bucket;
struct btree_update;

#define MAX_BSETS		3U

struct btree_nr_keys {

	/*
	 * Amount of live metadata (i.e. size of node after a compaction) in
	 * units of u64s
	 */
	u16			live_u64s;
	u16			bset_u64s[MAX_BSETS];

	/* live keys only: */
	u16			packed_keys;
	u16			unpacked_keys;
};
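
/*
 * Illustrative sketch, not part of the original header: assuming live_u64s is
 * kept equal to the sum of the per-bset counts above, a debug-only cross
 * check could look like this (the helper name is hypothetical):
 */
static inline void btree_nr_keys_verify_example(const struct btree_nr_keys *nr)
{
	unsigned i, sum = 0;

	for (i = 0; i < MAX_BSETS; i++)
		sum += nr->bset_u64s[i];

	/* hedged invariant: every live key is accounted to exactly one bset */
	EBUG_ON(nr->live_u64s != sum);
}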

struct bset_tree {
	/*
	 * We construct a binary tree in an array as if the array
	 * started at 1, so that things line up on the same cachelines
	 * better: see comments in bset.c at cacheline_to_bkey() for
	 * details
	 */

	/* size of the binary tree and prev array */
	u16			size;

	/* function of size - precalculated for to_inorder() */
	u16			extra;

	u16			data_offset;
	u16			aux_data_offset;
	u16			end_offset;

	struct bpos		max_key;
};
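
/*
 * Illustrative sketch, not from the original header: with a binary tree laid
 * out in an array "as if the array started at 1", parent/child navigation is
 * just bit shifts, which is what makes the cacheline layout described above
 * work out (the real helpers live in bset.c; these names are hypothetical):
 */
static inline unsigned bset_tree_child_example(unsigned j, bool right)
{
	/* node j's children sit at 2*j and 2*j + 1; node 1 is the root */
	return (j << 1) | right;
}

static inline unsigned bset_tree_parent_example(unsigned j)
{
	return j >> 1;
}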

struct btree_write {
	struct journal_entry_pin	journal;
	struct closure_waitlist		wait;
};

struct btree_ob_ref {
	u8			nr;
	u8			refs[BCH_REPLICAS_MAX];
};

struct btree_alloc {
	struct btree_ob_ref	ob;
	BKEY_PADDED(k);
};

struct btree {
	/* Hottest entries first */
	struct rhash_head	hash;

	/* Key/pointer for this btree node */
	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);

	struct six_lock		lock;

	unsigned long		flags;
	u16			written;
	u8			level;
	u8			btree_id;
	u8			nsets;
	u8			nr_key_bits;

	struct bkey_format	format;

	struct btree_node	*data;
	void			*aux_data;

	/*
	 * Sets of sorted keys - the real btree node - plus a binary search tree
	 *
	 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
	 * to the memory we have allocated for this btree node. Additionally,
	 * set[0]->data points to the entire btree node as it exists on disk.
	 */
	struct bset_tree	set[MAX_BSETS];

	struct btree_nr_keys	nr;
	u16			sib_u64s[2];
	u16			whiteout_u64s;
	u16			uncompacted_whiteout_u64s;
	u8			page_order;
	u8			unpack_fn_len;

	/*
	 * XXX: add a delete sequence number, so when bch2_btree_node_relock()
	 * fails because the lock sequence number has changed - i.e. the
	 * contents were modified - we can still relock the node if it's still
	 * the one we want, without redoing the traversal
	 */

	/*
	 * For asynchronous splits/interior node updates:
	 * When we do a split, we allocate new child nodes and update the parent
	 * node to point to them: we update the parent in memory immediately,
	 * but then we must wait until the children have been written out before
	 * the update to the parent can be written - this is a list of the
	 * btree_updates that are blocking this node from being
	 * written:
	 */
	struct list_head	write_blocked;

	/*
	 * Also for asynchronous splits/interior node updates:
	 * If a btree node isn't reachable yet, we don't want to kick off
	 * another write - because that write also won't yet be reachable and
	 * marking it as completed before it's reachable would be incorrect:
	 */
	unsigned long		will_make_reachable;

	struct btree_ob_ref	ob;

	/* lru list */
	struct list_head	list;

	struct btree_write	writes[2];

#ifdef CONFIG_BCACHEFS_DEBUG
	bool			*expensive_debug_checks;
#endif
};

struct btree_cache {
	struct rhashtable	table;
	bool			table_init_done;
	/*
	 * We never free a struct btree, except on shutdown - we just put it on
	 * the ->freed list and reuse it later. This simplifies the code, and
	 * it doesn't cost us much memory as the memory usage is dominated by
	 * buffers that hold the actual btree node data and those can be freed
	 * - and the number of struct btrees allocated is effectively bounded.
	 *
	 * The ->freeable list is effectively a small cache - we use it because
	 * high order page allocations can be rather expensive, and it's quite
	 * common to delete and allocate btree nodes in quick succession. It
	 * should never grow past ~2-3 nodes in practice.
	 */
	struct mutex		lock;
	struct list_head	live;
	struct list_head	freeable;
	struct list_head	freed;

	/* Number of elements in live + freeable lists */
	unsigned		used;
	unsigned		reserve;
	struct shrinker		shrink;

	/*
	 * If we need to allocate memory for a new btree node and that
	 * allocation fails, we can cannibalize another node in the btree cache
	 * to satisfy the allocation - lock to guarantee only one thread does
	 * this at a time:
	 */
	struct task_struct	*alloc_lock;
	struct closure_waitlist	alloc_wait;
};
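
/*
 * Illustrative sketch, not part of the original header: since alloc_lock is
 * just a task_struct pointer, a hypothetical trylock for the cannibalize path
 * can be built on cmpxchg (the real helpers live elsewhere in bcachefs and
 * may differ; this assumes <linux/sched.h>-style current and cmpxchg()):
 */
static inline bool btree_cache_cannibalize_trylock_example(struct btree_cache *bc)
{
	/* already held by this task: treat as success */
	if (bc->alloc_lock == current)
		return true;

	/* claim the lock only if nobody holds it */
	return cmpxchg(&bc->alloc_lock, NULL, current) == NULL;
}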

struct btree_node_iter {
	struct btree_node_iter_set {
		u16	k, end;
	} data[MAX_BSETS];
};

enum btree_iter_type {
	BTREE_ITER_KEYS,
	BTREE_ITER_SLOTS,
	BTREE_ITER_NODES,
};

#define BTREE_ITER_TYPE			((1 << 2) - 1)

#define BTREE_ITER_INTENT		(1 << 2)
#define BTREE_ITER_PREFETCH		(1 << 3)
/*
 * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
 * @pos or the first key strictly greater than @pos
 */
#define BTREE_ITER_IS_EXTENTS		(1 << 4)
/*
 * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
 */
#define BTREE_ITER_AT_END_OF_LEAF	(1 << 5)
#define BTREE_ITER_ERROR		(1 << 6)

enum btree_iter_uptodate {
	BTREE_ITER_UPTODATE	= 0,
	BTREE_ITER_NEED_PEEK	= 1,
	BTREE_ITER_NEED_RELOCK	= 2,
	BTREE_ITER_NEED_TRAVERSE = 3,
};

/*
 * @pos			- iterator's current position
 * @level		- current btree depth
 * @locks_want		- btree level below which we start taking intent locks
 * @nodes_locked	- bitmask indicating which nodes in @l are locked
 * @nodes_intent_locked	- bitmask indicating which locks are intent locks
 */
struct btree_iter {
	struct bch_fs		*c;
	struct bpos		pos;

	u8			flags;
	enum btree_iter_uptodate uptodate:4;
	enum btree_id		btree_id:4;
	unsigned		level:4,
				locks_want:4,
				nodes_locked:4,
				nodes_intent_locked:4;

	struct btree_iter_level {
		struct btree	*b;
		struct btree_node_iter iter;
		u32		lock_seq;
	} l[BTREE_MAX_DEPTH];

	/*
	 * Current unpacked key - so that bch2_btree_iter_next()/
	 * bch2_btree_iter_next_slot() can correctly advance pos.
	 */
	struct bkey		k;

	/*
	 * Circular linked list of linked iterators: linked iterators share
	 * locks (e.g. two linked iterators may have the same node intent
	 * locked, or read and write locked, at the same time), and insertions
	 * through one iterator won't invalidate the other linked iterators.
	 */

	/* Must come last: */
	struct btree_iter	*next;
};
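
/*
 * Illustrative sketch, not from the original header: because ->next forms a
 * circular list, visiting every iterator linked with @iter (including @iter
 * itself) is a simple do/while loop back to the starting point (the helper
 * name and callback are hypothetical):
 */
static inline void btree_iter_for_each_linked_example(struct btree_iter *iter,
						      void (*fn)(struct btree_iter *))
{
	struct btree_iter *linked = iter;

	do {
		fn(linked);
		linked = linked->next;
	} while (linked != iter);
}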

#define BTREE_ITER_MAX		8

struct btree_insert_entry {
	struct btree_iter	*iter;
	struct bkey_i		*k;
	unsigned		extra_res;
	/*
	 * true if entire key was inserted - can only be false for
	 * extents
	 */
	bool			done;
};

struct btree_trans {
	struct bch_fs		*c;
	size_t			nr_restarts;

	u8			nr_iters;
	u8			iters_live;
	u8			iters_linked;
	u8			nr_updates;

	unsigned		mem_top;
	unsigned		mem_bytes;
	void			*mem;

	struct btree_iter	*iters;
	u64			iter_ids[BTREE_ITER_MAX];

	struct btree_insert_entry updates[BTREE_ITER_MAX];

	struct btree_iter	iters_onstack[2];
};
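
/*
 * Illustrative sketch, not part of the original header: assuming iters_linked
 * is a bitmask of allocated slots in iters[] (BTREE_ITER_MAX == 8, so it fits
 * in a u8), a hypothetical slot allocator just claims the first clear bit;
 * whether the slot is also marked in iters_live is left to the caller:
 */
static inline int btree_trans_alloc_iter_slot_example(struct btree_trans *trans)
{
	unsigned idx;

	for (idx = 0; idx < BTREE_ITER_MAX; idx++)
		if (!(trans->iters_linked & (1U << idx))) {
			trans->iters_linked |= 1U << idx;
			return idx;
		}

	return -1;	/* all BTREE_ITER_MAX slots in use */
}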

#define BTREE_FLAG(flag)						\
static inline bool btree_node_ ## flag(struct btree *b)		\
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); }			\
									\
static inline void set_btree_node_ ## flag(struct btree *b)		\
{ set_bit(BTREE_NODE_ ## flag, &b->flags); }				\
									\
static inline void clear_btree_node_ ## flag(struct btree *b)		\
{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }

enum btree_flags {
	BTREE_NODE_read_in_flight,
	BTREE_NODE_read_error,
	BTREE_NODE_dirty,
	BTREE_NODE_need_write,
	BTREE_NODE_noevict,
	BTREE_NODE_write_idx,
	BTREE_NODE_accessed,
	BTREE_NODE_write_in_flight,
	BTREE_NODE_just_written,
	BTREE_NODE_dying,
	BTREE_NODE_fake,
};

BTREE_FLAG(read_in_flight);
BTREE_FLAG(read_error);
BTREE_FLAG(dirty);
BTREE_FLAG(need_write);
BTREE_FLAG(noevict);
BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
BTREE_FLAG(dying);
BTREE_FLAG(fake);
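
/*
 * For example, BTREE_FLAG(dirty) above expands to btree_node_dirty(),
 * set_btree_node_dirty() and clear_btree_node_dirty(), all operating on the
 * BTREE_NODE_dirty bit of b->flags, so callers can write:
 *
 *	if (btree_node_dirty(b))
 *		... queue the node for writeback ...
 */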

static inline struct btree_write *btree_current_write(struct btree *b)
{
	return b->writes + btree_node_write_idx(b);
}

static inline struct btree_write *btree_prev_write(struct btree *b)
{
	return b->writes + (btree_node_write_idx(b) ^ 1);
}
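
/*
 * Illustrative note, not from the original header: writes[] acts as a
 * two-entry double buffer selected by the BTREE_NODE_write_idx flag bit.
 * Toggling that bit swaps what btree_current_write()/btree_prev_write()
 * return, so the journal pin and waitlist of an in-flight write stay stable
 * while new state accumulates against the other entry.
 */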

static inline struct bset_tree *bset_tree_last(struct btree *b)
{
	EBUG_ON(!b->nsets);
	return b->set + b->nsets - 1;
}

static inline void *
__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
{
	return (void *) ((u64 *) b->data + 1 + offset);
}

static inline u16
__btree_node_ptr_to_offset(const struct btree *b, const void *p)
{
	u16 ret = (u64 *) p - 1 - (u64 *) b->data;

	EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
	return ret;
}
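
/*
 * Illustrative note, not from the original header: these offsets are in units
 * of u64s, measured from one u64 past the start of the node, so they round
 * trip as:
 *
 *	__btree_node_offset_to_ptr(b, 0) == (u64 *) b->data + 1
 *	__btree_node_ptr_to_offset(b, __btree_node_offset_to_ptr(b, N)) == N
 *
 * which is what lets a 16-bit offset address positions within a btree node.
 */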

static inline struct bset *bset(const struct btree *b,
				const struct bset_tree *t)
{
	return __btree_node_offset_to_ptr(b, t->data_offset);
}

static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
{
	t->end_offset =
		__btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
}

static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
				  const struct bset *i)
{
	t->data_offset = __btree_node_ptr_to_offset(b, i);
	set_btree_bset_end(b, t);
}

static inline struct bset *btree_bset_first(struct btree *b)
{
	return bset(b, b->set);
}

static inline struct bset *btree_bset_last(struct btree *b)
{
	return bset(b, bset_tree_last(b));
}

static inline u16
__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
{
	return __btree_node_ptr_to_offset(b, k);
}

static inline struct bkey_packed *
__btree_node_offset_to_key(const struct btree *b, u16 k)
{
	return __btree_node_offset_to_ptr(b, k);
}

static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
{
	return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
}

#define btree_bkey_first(_b, _t)					\
({									\
	EBUG_ON(bset(_b, _t)->start !=					\
		__btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
									\
	bset(_b, _t)->start;						\
})

#define btree_bkey_last(_b, _t)						\
({									\
	EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) !=	\
		vstruct_last(bset(_b, _t)));				\
									\
	__btree_node_offset_to_key(_b, (_t)->end_offset);		\
})
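
/*
 * Illustrative sketch, not part of the original header: a typical walk over
 * the packed keys of one bset uses these accessors as loop bounds (this
 * assumes the bkey_next() helper for stepping to the following packed key):
 *
 *	struct bkey_packed *k;
 *
 *	for (k = btree_bkey_first(b, t);
 *	     k != btree_bkey_last(b, t);
 *	     k = bkey_next(k))
 *		... inspect k ...
 */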

static inline unsigned bset_byte_offset(struct btree *b, void *i)
{
	return i - (void *) b->data;
}

/* Type of keys @b contains: */
static inline enum bkey_type btree_node_type(struct btree *b)
{
	return b->level ? BKEY_TYPE_BTREE : b->btree_id;
}

static inline const struct bkey_ops *btree_node_ops(struct btree *b)
{
	return &bch2_bkey_ops[btree_node_type(b)];
}

static inline bool btree_node_has_ptrs(struct btree *b)
{
	return btree_type_has_ptrs(btree_node_type(b));
}

static inline bool btree_node_is_extents(struct btree *b)
{
	return btree_node_type(b) == BKEY_TYPE_EXTENTS;
}

struct btree_root {
	struct btree		*b;

	struct btree_update	*as;

	/* On disk root - see async splits: */
	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
	u8			level;
	u8			alive;
};

/*
 * Optional hook that will be called just prior to a btree node update, when
 * we're holding the write lock and we know what key is about to be overwritten:
 */

enum btree_insert_ret {
	BTREE_INSERT_OK,
	/* extent spanned multiple leaf nodes: have to traverse to next node: */
	BTREE_INSERT_NEED_TRAVERSE,
	/* write lock held for too long */
	/* leaf node needs to be split */
	BTREE_INSERT_BTREE_NODE_FULL,
	BTREE_INSERT_JOURNAL_RES_FULL,
	BTREE_INSERT_ENOSPC,
	BTREE_INSERT_NEED_GC_LOCK,
};

struct extent_insert_hook {
	enum btree_insert_ret
	(*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
	      struct bkey_s_c, const struct bkey_i *);
};

enum btree_gc_coalesce_fail_reason {
	BTREE_GC_COALESCE_FAIL_RESERVE_GET,
	BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
	BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
};

enum btree_node_sibling {
	btree_prev_sib,
	btree_next_sib,
};

typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
							struct btree *,
							struct btree_node_iter *);

#endif /* _BCACHEFS_BTREE_TYPES_H */