/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_TYPES_H
#define _BCACHEFS_BTREE_TYPES_H

#include <linux/list.h>
#include <linux/rhashtable.h>

#include "bkey_methods.h"
#include "journal_types.h"
#include "six.h"

struct open_bucket;
struct btree_update;

#define MAX_BSETS		3U

struct btree_nr_keys {
	/*
	 * Amount of live metadata (i.e. size of node after a compaction) in
	 * units of u64s
	 */
	u16		live_u64s;
	u16		bset_u64s[MAX_BSETS];

	/* live keys only: */
	u16		packed_keys;
	u16		unpacked_keys;
};

struct bset_tree {
	/*
	 * We construct a binary tree in an array as if the array started at
	 * 1, so that things line up on the same cachelines better: see
	 * comments in bset.c at cacheline_to_bkey() for details
	 */

	/* size of the binary tree and prev array */
	u16		size;

	/* function of size - precalculated for to_inorder() */
	u16		extra;

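	/*
	 * The offsets below are in units of u64s from the start of the btree
	 * node's data, not bytes - see __btree_node_offset_to_ptr() below:
	 */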
	u16		data_offset;
	u16		aux_data_offset;
	u16		end_offset;

	struct bpos	max_key;
};

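/*
 * Per-write state: each btree node has two of these (b->writes[2]; see
 * btree_current_write()/btree_prev_write() below). The journal pin keeps
 * the journal entries containing this node's unwritten updates from being
 * reclaimed until the node is written, and the waitlist is for waiting on
 * a write in flight:
 */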
struct btree_write {
	struct journal_entry_pin	journal;
	struct closure_waitlist		wait;
};

struct btree_ob_ref {
	u8			nr;
	u8			refs[BCH_REPLICAS_MAX];
};

struct btree_alloc {
	struct btree_ob_ref	ob;
	BKEY_PADDED(k);
};

struct btree {
	/* Hottest entries first */
	struct rhash_head	hash;

	/* Key/pointer for this btree node */
	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);

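	/*
	 * A SIX lock (see six.h): shared/intent/exclusive. Intent locks
	 * conflict with other intent/exclusive locks but not with readers,
	 * so a node can be marked for update without blocking lookups:
	 */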
	struct six_lock		lock;

	unsigned long		flags;
	u16			written;
	u8			level;
	u8			btree_id;
	u8			nsets;
	u8			nr_key_bits;

	struct bkey_format	format;

	struct btree_node	*data;
	void			*aux_data;

	/*
	 * Sets of sorted keys - the real btree node - plus a binary search
	 * tree.
	 *
	 * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
	 * to the memory we have allocated for this btree node. Additionally,
	 * set[0]->data points to the entire btree node as it exists on disk.
	 */
	struct bset_tree	set[MAX_BSETS];

	struct btree_nr_keys	nr;
	u16			sib_u64s[2];
	u16			whiteout_u64s;
	u16			uncompacted_whiteout_u64s;
	u8			page_order;
	u8			unpack_fn_len;

	/*
	 * XXX: add a delete sequence number, so when bch2_btree_node_relock()
	 * fails because the lock sequence number has changed - i.e. the
	 * contents were modified - we can still relock the node if it's still
	 * the one we want, without redoing the traversal
	 */

	/*
	 * For asynchronous splits/interior node updates:
	 * When we do a split, we allocate new child nodes and update the
	 * parent node to point to them: we update the parent in memory
	 * immediately, but then we must wait until the children have been
	 * written out before the update to the parent can be written - this
	 * is a list of the btree_updates that are blocking this node from
	 * being written:
	 */
	struct list_head	write_blocked;

	/*
	 * Also for asynchronous splits/interior node updates:
	 * If a btree node isn't reachable yet, we don't want to kick off
	 * another write - because that write also won't yet be reachable and
	 * marking it as completed before it's reachable would be incorrect:
	 */
	unsigned long		will_make_reachable;

	struct btree_ob_ref	ob;

	/* lru list */
	struct list_head	list;

	struct btree_write	writes[2];

#ifdef CONFIG_BCACHEFS_DEBUG
	bool			*expensive_debug_checks;
#endif
};

struct btree_cache {
	struct rhashtable	table;
	bool			table_init_done;
	/*
	 * We never free a struct btree, except on shutdown - we just put it
	 * on the btree_cache_freed list and reuse it later. This simplifies
	 * the code, and it doesn't cost us much memory as the memory usage
	 * is dominated by buffers that hold the actual btree node data and
	 * those can be freed - and the number of struct btrees allocated is
	 * effectively bounded.
	 *
	 * btree_cache_freeable effectively is a small cache - we use it
	 * because high order page allocations can be rather expensive, and
	 * it's quite common to delete and allocate btree nodes in quick
	 * succession. It should never grow past ~2-3 nodes in practice.
	 */
	struct mutex		lock;
	struct list_head	live;
	struct list_head	freeable;
	struct list_head	freed;

	/* Number of elements in live + freeable lists */
	unsigned		used;
	unsigned		reserve;
	struct shrinker		shrink;

	/*
	 * If we need to allocate memory for a new btree node and that
	 * allocation fails, we can cannibalize another node in the btree
	 * cache to satisfy the allocation - lock to guarantee only one
	 * thread does this at a time:
	 */
	struct task_struct	*alloc_lock;
	struct closure_waitlist	alloc_wait;
};

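/*
 * Iterator over the keys in a single btree node: each (k, end) pair below
 * is a pair of u16 key offsets into the node, one range per bset - see
 * __btree_node_key_to_offset()/__btree_node_offset_to_key() further down:
 */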
struct btree_node_iter {
	struct btree_node_iter_set {
		u16	k, end;
	} data[MAX_BSETS];
};

enum btree_iter_type {
	BTREE_ITER_KEYS,
	BTREE_ITER_SLOTS,
	BTREE_ITER_NODES,
};

#define BTREE_ITER_TYPE			((1 << 2) - 1)

#define BTREE_ITER_INTENT		(1 << 2)
#define BTREE_ITER_PREFETCH		(1 << 3)
/*
 * Used in bch2_btree_iter_traverse(), to indicate whether we're searching
 * for @pos or the first key strictly greater than @pos
 */
#define BTREE_ITER_IS_EXTENTS		(1 << 4)
/*
 * indicates we need to call bch2_btree_iter_traverse() to revalidate
 * iterator:
 */
#define BTREE_ITER_AT_END_OF_LEAF	(1 << 5)
#define BTREE_ITER_ERROR		(1 << 6)
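
/*
 * The iterator type lives in the low two bits (mask BTREE_ITER_TYPE); the
 * remaining bits are flags, so e.g. (BTREE_ITER_SLOTS|BTREE_ITER_INTENT)
 * is a slots iterator that takes intent locks.
 */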

enum btree_iter_uptodate {
	BTREE_ITER_UPTODATE		= 0,
	BTREE_ITER_NEED_PEEK		= 1,
	BTREE_ITER_NEED_RELOCK		= 2,
	BTREE_ITER_NEED_TRAVERSE	= 3,
};
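
/*
 * These appear to be ordered by how much work is needed to make the
 * iterator usable again: peek < relock < full traversal.
 */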

/*
 * @pos			- iterator's current position
 * @level		- current btree depth
 * @locks_want		- btree level below which we start taking intent locks
 * @nodes_locked	- bitmask indicating which nodes in @l[] are locked
 * @nodes_intent_locked	- bitmask indicating which locks are intent locks
 */
struct btree_iter {
	struct bch_fs		*c;
	struct bpos		pos;

	u8			flags;
	enum btree_iter_uptodate uptodate:4;
	enum btree_id		btree_id:4;
	unsigned		level:4,
				locks_want:4,
				nodes_locked:4,
				nodes_intent_locked:4;

	struct btree_iter_level {
		struct btree	*b;
		struct btree_node_iter iter;
		u32		lock_seq;
	} l[BTREE_MAX_DEPTH];

	/*
	 * Current unpacked key - so that bch2_btree_iter_next()/
	 * bch2_btree_iter_next_slot() can correctly advance pos.
	 */
	struct bkey		k;

	/*
	 * Circular linked list of linked iterators: linked iterators share
	 * locks (e.g. two linked iterators may have the same node intent
	 * locked, or read and write locked, at the same time), and insertions
	 * through one iterator won't invalidate the other linked iterators.
	 */

	/* Must come last: */
	struct btree_iter	*next;
};

#define BTREE_ITER_MAX		8

struct btree_insert_entry {
	struct btree_iter	*iter;
	struct bkey_i		*k;
	unsigned		extra_res;
	/*
	 * true if entire key was inserted - can only be false for
	 * extents
	 */
	bool			done;
};

struct btree_trans {
	struct bch_fs		*c;
	size_t			nr_restarts;

	u8			nr_iters;
	u8			iters_live;
	u8			iters_linked;
	u8			nr_updates;

	unsigned		mem_top;
	unsigned		mem_bytes;
	void			*mem;

	struct btree_iter	*iters;
	u64			iter_ids[BTREE_ITER_MAX];

	struct btree_insert_entry updates[BTREE_ITER_MAX];

	struct btree_iter	iters_onstack[2];
};
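
/*
 * Note: @iters presumably points at iters_onstack[] for small transactions,
 * so a separate allocation is only needed once a transaction uses more than
 * two iterators.
 */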

#define BTREE_FLAG(flag)						\
static inline bool btree_node_ ## flag(struct btree *b)		\
{	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
									\
static inline void set_btree_node_ ## flag(struct btree *b)		\
{	set_bit(BTREE_NODE_ ## flag, &b->flags); }			\
									\
static inline void clear_btree_node_ ## flag(struct btree *b)		\
{	clear_bit(BTREE_NODE_ ## flag, &b->flags); }
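
/*
 * E.g. BTREE_FLAG(dirty) below defines btree_node_dirty(),
 * set_btree_node_dirty() and clear_btree_node_dirty(), all operating on
 * the BTREE_NODE_dirty bit in btree->flags.
 */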

enum btree_flags {
	BTREE_NODE_read_in_flight,
	BTREE_NODE_read_error,
	BTREE_NODE_dirty,
	BTREE_NODE_need_write,
	BTREE_NODE_noevict,
	BTREE_NODE_write_idx,
	BTREE_NODE_accessed,
	BTREE_NODE_write_in_flight,
	BTREE_NODE_just_written,
	BTREE_NODE_dying,
	BTREE_NODE_fake,
};

BTREE_FLAG(read_in_flight);
BTREE_FLAG(read_error);
BTREE_FLAG(dirty);
BTREE_FLAG(need_write);
BTREE_FLAG(noevict);
BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
BTREE_FLAG(dying);
BTREE_FLAG(fake);

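/*
 * The write_idx flag selects which of b->writes[2] is the current write;
 * the other, returned by btree_prev_write(), is the previous (possibly
 * still in flight) write:
 */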
static inline struct btree_write *btree_current_write(struct btree *b)
{
	return b->writes + btree_node_write_idx(b);
}

static inline struct btree_write *btree_prev_write(struct btree *b)
{
	return b->writes + (btree_node_write_idx(b) ^ 1);
}

static inline struct bset_tree *bset_tree_last(struct btree *b)
{
	EBUG_ON(!b->nsets);
	return b->set + b->nsets - 1;
}

static inline void *
__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
{
	return (void *) ((u64 *) b->data + 1 + offset);
}

static inline u16
__btree_node_ptr_to_offset(const struct btree *b, const void *p)
{
	u16 ret = (u64 *) p - 1 - (u64 *) b->data;

	EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
	return ret;
}

static inline struct bset *bset(const struct btree *b,
				const struct bset_tree *t)
{
	return __btree_node_offset_to_ptr(b, t->data_offset);
}

static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
{
	t->end_offset =
		__btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
}

static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
				  const struct bset *i)
{
	t->data_offset = __btree_node_ptr_to_offset(b, i);
	set_btree_bset_end(b, t);
}

static inline struct bset *btree_bset_first(struct btree *b)
{
	return bset(b, b->set);
}

static inline struct bset *btree_bset_last(struct btree *b)
{
	return bset(b, bset_tree_last(b));
}

static inline u16
__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
{
	return __btree_node_ptr_to_offset(b, k);
}

static inline struct bkey_packed *
__btree_node_offset_to_key(const struct btree *b, u16 k)
{
	return __btree_node_offset_to_ptr(b, k);
}

static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
{
	return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
}

#define btree_bkey_first(_b, _t)					\
({									\
	EBUG_ON(bset(_b, _t)->start !=					\
		__btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
									\
	bset(_b, _t)->start;						\
})

#define btree_bkey_last(_b, _t)						\
({									\
	EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) !=	\
		vstruct_last(bset(_b, _t)));				\
									\
	__btree_node_offset_to_key(_b, (_t)->end_offset);		\
})

static inline unsigned bset_byte_offset(struct btree *b, void *i)
{
	return i - (void *) b->data;
}

/* Type of keys @b contains: */
static inline enum bkey_type btree_node_type(struct btree *b)
{
	return b->level ? BKEY_TYPE_BTREE : b->btree_id;
}
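
/*
 * I.e. interior nodes always contain btree pointers; for leaf nodes the
 * key type matches the btree the node belongs to, so e.g.
 * btree_node_is_extents() below holds only for leaves of the extents
 * btree.
 */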

static inline const struct bkey_ops *btree_node_ops(struct btree *b)
{
	return &bch2_bkey_ops[btree_node_type(b)];
}

static inline bool btree_node_has_ptrs(struct btree *b)
{
	return btree_type_has_ptrs(btree_node_type(b));
}

static inline bool btree_node_is_extents(struct btree *b)
{
	return btree_node_type(b) == BKEY_TYPE_EXTENTS;
}

struct btree_root {
	struct btree		*b;

	struct btree_update	*as;

	/* On disk root - see async splits: */
	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
	u8			level;
	u8			alive;
};

enum btree_insert_ret {
	BTREE_INSERT_OK,
	/* extent spanned multiple leaf nodes: have to traverse to next node: */
	BTREE_INSERT_NEED_TRAVERSE,
	/* leaf node needs to be split */
	BTREE_INSERT_BTREE_NODE_FULL,
	BTREE_INSERT_JOURNAL_RES_FULL,
	BTREE_INSERT_ENOSPC,
	BTREE_INSERT_NEED_GC_LOCK,
};

/*
 * Optional hook that will be called just prior to a btree node update, when
 * we're holding the write lock and we know what key is about to be
 * overwritten:
 */
struct extent_insert_hook {
	enum btree_insert_ret
	(*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
	      struct bkey_s_c, const struct bkey_i *);
};

enum btree_gc_coalesce_fail_reason {
	BTREE_GC_COALESCE_FAIL_RESERVE_GET,
	BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
	BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
};

enum btree_node_sibling {
	btree_prev_sib,
	btree_next_sib,
};

typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
							struct btree *,
							struct btree_node_iter *);

#endif /* _BCACHEFS_BTREE_TYPES_H */