bcachefs: add missing bch2_btree_iter_node_drop() call
fs/bcachefs/btree_iter.c (linux-block.git)
1// SPDX-License-Identifier: GPL-2.0
2
3#include "bcachefs.h"
4#include "bkey_methods.h"
5#include "btree_cache.h"
6#include "btree_iter.h"
7#include "btree_locking.h"
8#include "debug.h"
9#include "extents.h"
10#include "trace.h"
11
12#include <linux/prefetch.h>
13
14static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
15 struct btree_iter_level *,
16 struct bkey *);
17
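/*
 * Sentinel for iter->l[].b: the level is within the iterator's depth but its
 * node pointer is not currently valid (not yet traversed to, or relock
 * failed). Distinct from NULL, which means past the end of the btree.
 */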
18#define BTREE_ITER_NOT_END ((struct btree *) 1)
19
20static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
21{
22 return l < BTREE_MAX_DEPTH &&
23 iter->l[l].b &&
24 iter->l[l].b != BTREE_ITER_NOT_END;
25}
26
27/* Returns < 0 if @k is before iter pos, > 0 if @k is after */
28static inline int __btree_iter_pos_cmp(struct btree_iter *iter,
29 const struct btree *b,
30 const struct bkey_packed *k,
31 bool interior_node)
32{
33 int cmp = bkey_cmp_left_packed(b, k, &iter->pos);
34
35 if (cmp)
36 return cmp;
37 if (bkey_deleted(k))
38 return -1;
39
40 /*
41 * Normally, for extents we want the first key strictly greater than
42 * the iterator position - with the exception that for interior nodes,
43 * we don't want to advance past the last key if the iterator position
44 * is POS_MAX:
45 */
46 if (iter->flags & BTREE_ITER_IS_EXTENTS &&
47 (!interior_node ||
48 bkey_cmp_left_packed_byval(b, k, POS_MAX)))
49 return -1;
50 return 1;
51}
52
53static inline int btree_iter_pos_cmp(struct btree_iter *iter,
54 const struct btree *b,
55 const struct bkey_packed *k)
56{
57 return __btree_iter_pos_cmp(iter, b, k, b->level != 0);
58}
59
60/* Btree node locking: */
61
62/*
63 * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
64 * succeed:
65 */
66void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
67{
68 struct btree_iter *linked;
69
70 EBUG_ON(iter->l[b->level].b != b);
71 EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq);
72
73 trans_for_each_iter_with_node(iter->trans, b, linked)
74 linked->l[b->level].lock_seq += 2;
75
76 six_unlock_write(&b->lock);
77}
78
79void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
80{
81 struct btree_iter *linked;
82 unsigned readers = 0;
83
84 EBUG_ON(btree_node_read_locked(iter, b->level));
85
86 trans_for_each_iter(iter->trans, linked)
87 if (linked->l[b->level].b == b &&
88 btree_node_read_locked(linked, b->level))
89 readers++;
90
91 /*
92 * Must drop our read locks before calling six_lock_write() -
93 * six_unlock() won't do wakeups until the reader count
94 * goes to 0, and it's safe because we have the node intent
95 * locked:
96 */
97 atomic64_sub(__SIX_VAL(read_lock, readers),
98 &b->lock.state.counter);
99 btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
100 atomic64_add(__SIX_VAL(read_lock, readers),
101 &b->lock.state.counter);
102}
103
104bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
105{
106 struct btree *b = btree_iter_node(iter, level);
107 int want = __btree_lock_want(iter, level);
108
109 if (!b || b == BTREE_ITER_NOT_END)
110 return false;
111
112 if (race_fault())
113 return false;
114
115 if (!six_relock_type(&b->lock, want, iter->l[level].lock_seq) &&
116 !(iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 &&
117 btree_node_lock_increment(iter, b, level, want)))
118 return false;
119
120 mark_btree_node_locked(iter, level, want);
121 return true;
122}
123
124static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
125{
126 struct btree *b = iter->l[level].b;
127
128 EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
129
130 if (!is_btree_node(iter, level))
131 return false;
132
133 if (btree_node_intent_locked(iter, level))
134 return true;
135
136 if (race_fault())
137 return false;
138
139 if (btree_node_locked(iter, level)
140 ? six_lock_tryupgrade(&b->lock)
141 : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq))
142 goto success;
143
144 if (iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 &&
145 btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
146 btree_node_unlock(iter, level);
147 goto success;
148 }
149
150 return false;
151success:
152 mark_btree_node_intent_locked(iter, level);
153 return true;
154}
155
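/*
 * Relock (or, with @upgrade, intent-lock) every level from iter->level up to
 * iter->locks_want. On failure the failed level and everything below it is
 * marked BTREE_ITER_NOT_END, so traverse has to start again from there.
 */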
156static inline bool btree_iter_get_locks(struct btree_iter *iter,
157 bool upgrade)
158{
159 unsigned l = iter->level;
160 int fail_idx = -1;
161
162 do {
163 if (!btree_iter_node(iter, l))
164 break;
165
166 if (!(upgrade
167 ? bch2_btree_node_upgrade(iter, l)
168 : bch2_btree_node_relock(iter, l))) {
169 fail_idx = l;
170 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
171 }
172
173 l++;
174 } while (l < iter->locks_want);
175
176 /*
177 * When we fail to get a lock, we have to ensure that any child nodes
178 * can't be relocked so bch2_btree_iter_traverse has to walk back up to
179 * the node that we failed to relock:
180 */
181 while (fail_idx >= 0) {
182 btree_node_unlock(iter, fail_idx);
183 iter->l[fail_idx].b = BTREE_ITER_NOT_END;
184 --fail_idx;
185 }
186
187 if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
188 iter->uptodate = BTREE_ITER_NEED_PEEK;
189
190 bch2_btree_trans_verify_locks(iter->trans);
191
192 return iter->uptodate < BTREE_ITER_NEED_RELOCK;
193}
194
195/* Slowpath: */
196bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
197 unsigned level,
198 struct btree_iter *iter,
199 enum six_lock_type type,
200 bool may_drop_locks)
201{
202 struct btree_iter *linked;
203 bool ret = true;
204
205 /* Check if it's safe to block: */
206 trans_for_each_iter(iter->trans, linked) {
207 if (!linked->nodes_locked)
208 continue;
209
210 /* Must lock btree nodes in key order: */
211 if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
212 ret = false;
213
214 /*
215 * Can't block taking an intent lock if we have _any_ nodes read
216 * locked:
217 *
218 * - Our read lock blocks another thread with an intent lock on
219 * the same node from getting a write lock, and thus from
220 * dropping its intent lock
221 *
222 * - And the other thread may have multiple nodes intent locked:
223 * both the node we want to intent lock, and the node we
224 * already have read locked - deadlock:
225 */
226 if (type == SIX_LOCK_intent &&
227 linked->nodes_locked != linked->nodes_intent_locked) {
228 if (may_drop_locks) {
229 linked->locks_want = max_t(unsigned,
230 linked->locks_want,
231 __fls(linked->nodes_locked) + 1);
232 btree_iter_get_locks(linked, true);
233 }
234 ret = false;
235 }
236
237 /*
238 * Interior nodes must be locked before their descendants: if
239 * another iterator has possible descendants locked of the node
240 * we're about to lock, it must have the ancestors locked too:
241 */
242 if (linked->btree_id == iter->btree_id &&
243 level > __fls(linked->nodes_locked)) {
244 if (may_drop_locks) {
245 linked->locks_want =
246 max(level + 1, max_t(unsigned,
247 linked->locks_want,
248 iter->locks_want));
249 btree_iter_get_locks(linked, true);
250 }
251 ret = false;
252 }
253 }
254
255 if (ret)
256 __btree_node_lock_type(iter->trans->c, b, type);
257 else
258 trans_restart();
259
260 return ret;
261}
262
263/* Btree iterator locking: */
264
265#ifdef CONFIG_BCACHEFS_DEBUG
266 void bch2_btree_iter_verify_locks(struct btree_iter *iter)
267{
268 unsigned l;
269
270 BUG_ON((iter->flags & BTREE_ITER_NOUNLOCK) &&
271 !btree_node_locked(iter, 0));
272
273 for (l = 0; btree_iter_node(iter, l); l++) {
274 if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
275 !btree_node_locked(iter, l))
276 continue;
277
278 BUG_ON(btree_lock_want(iter, l) !=
279 btree_node_locked_type(iter, l));
280 }
281}
282
283 void bch2_btree_trans_verify_locks(struct btree_trans *trans)
284 {
285 struct btree_iter *iter;
286
287 trans_for_each_iter(trans, iter)
288 bch2_btree_iter_verify_locks(iter);
289}
290#endif
291
292__flatten
293 static bool bch2_btree_iter_relock(struct btree_iter *iter)
294{
295 return iter->uptodate >= BTREE_ITER_NEED_RELOCK
296 ? btree_iter_get_locks(iter, false)
297 : true;
298}
299
300bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
301 unsigned new_locks_want)
302{
303 struct btree_iter *linked;
304
305 EBUG_ON(iter->locks_want >= new_locks_want);
306
307 iter->locks_want = new_locks_want;
308
309 if (btree_iter_get_locks(iter, true))
310 return true;
311
312 /*
313 * Ancestor nodes must be locked before child nodes, so set locks_want
314 * on iterators that might lock ancestors before us to avoid getting
315 * -EINTR later:
316 */
317 trans_for_each_iter(iter->trans, linked)
318 if (linked != iter &&
319 linked->btree_id == iter->btree_id &&
320 btree_iter_cmp(linked, iter) <= 0 &&
321 linked->locks_want < new_locks_want) {
322 linked->locks_want = new_locks_want;
323 btree_iter_get_locks(linked, true);
324 }
325
326 return false;
327}
328
329bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
330 unsigned new_locks_want)
331{
332 unsigned l = iter->level;
333
334 EBUG_ON(iter->locks_want >= new_locks_want);
335
336 iter->locks_want = new_locks_want;
337
338 do {
339 if (!btree_iter_node(iter, l))
340 break;
341
342 if (!bch2_btree_node_upgrade(iter, l)) {
343 iter->locks_want = l;
344 return false;
345 }
346
347 l++;
348 } while (l < iter->locks_want);
349
350 return true;
351}
352
353void __bch2_btree_iter_downgrade(struct btree_iter *iter,
354 unsigned downgrade_to)
355{
356 struct btree_iter *linked;
357 unsigned l;
358
359 /*
360 * We downgrade linked iterators as well because btree_iter_upgrade
361 * might have had to modify locks_want on linked iterators due to lock
362 * ordering:
363 */
364 trans_for_each_iter(iter->trans, linked) {
365 unsigned new_locks_want = downgrade_to ?:
366 (linked->flags & BTREE_ITER_INTENT ? 1 : 0);
367
368 if (linked->locks_want <= new_locks_want)
369 continue;
370
371 linked->locks_want = new_locks_want;
372
373 while (linked->nodes_locked &&
374 (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
375 if (l > linked->level) {
376 btree_node_unlock(linked, l);
377 } else {
378 if (btree_node_intent_locked(linked, l)) {
379 six_lock_downgrade(&linked->l[l].b->lock);
380 linked->nodes_intent_locked ^= 1 << l;
381 }
382 break;
383 }
384 }
385 }
386
387 bch2_btree_trans_verify_locks(iter->trans);
388}
389
390int bch2_btree_iter_unlock(struct btree_iter *iter)
391{
392 struct btree_iter *linked;
393
394 trans_for_each_iter(iter->trans, linked)
395 __bch2_btree_iter_unlock(linked);
396
397 return btree_iter_err(iter);
398}
399
400bool bch2_btree_trans_relock(struct btree_trans *trans)
401{
402 struct btree_iter *iter;
403 bool ret = true;
404
405 trans_for_each_iter(trans, iter)
406 ret &= bch2_btree_iter_relock(iter);
407
408 return ret;
409}
410
411void bch2_btree_trans_unlock(struct btree_trans *trans)
412{
413 struct btree_iter *iter;
414
415 trans_for_each_iter(trans, iter)
416 __bch2_btree_iter_unlock(iter);
417}
418
419/* Btree transaction locking: */
420
421/* Btree iterator: */
422
423#ifdef CONFIG_BCACHEFS_DEBUG
424
425static void __bch2_btree_iter_verify(struct btree_iter *iter,
426 struct btree *b)
427{
428 struct btree_iter_level *l = &iter->l[b->level];
429 struct btree_node_iter tmp = l->iter;
430 struct bkey_packed *k;
431
432 if (!debug_check_iterators(iter->trans->c))
433 return;
434
435 if (iter->uptodate > BTREE_ITER_NEED_PEEK)
436 return;
437
438 bch2_btree_node_iter_verify(&l->iter, b);
439
440 /*
441 * For interior nodes, the iterator will have skipped past
442 * deleted keys:
443 *
444 * For extents, the iterator may have skipped past deleted keys (but not
445 * whiteouts)
446 */
447 k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
448 ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
449 : bch2_btree_node_iter_prev_all(&tmp, b);
450 if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
451 char buf[100];
452 struct bkey uk = bkey_unpack_key(b, k);
453
454 bch2_bkey_to_text(&PBUF(buf), &uk);
455 panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
456 buf, iter->pos.inode, iter->pos.offset);
457 }
458
459 k = bch2_btree_node_iter_peek_all(&l->iter, b);
460 if (k && btree_iter_pos_cmp(iter, b, k) < 0) {
461 char buf[100];
462 struct bkey uk = bkey_unpack_key(b, k);
463
464 bch2_bkey_to_text(&PBUF(buf), &uk);
465 panic("iter should be after current key:\n"
466 "iter pos %llu:%llu\n"
467 "cur key %s\n",
468 iter->pos.inode, iter->pos.offset, buf);
469 }
470
471 BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
472 (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS &&
473 !bkey_whiteout(&iter->k) &&
474 bch2_btree_node_iter_end(&l->iter));
475}
476
477void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
478{
479 struct btree_iter *linked;
480
481 if (!debug_check_iterators(iter->trans->c))
482 return;
483
484 trans_for_each_iter_with_node(iter->trans, b, linked)
485 __bch2_btree_iter_verify(linked, b);
486}
487
488#else
489
490static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
491 struct btree *b) {}
492
493#endif
494
495static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
496 struct btree *b,
497 struct btree_node_iter *node_iter,
498 struct bset_tree *t,
499 struct bkey_packed *where,
500 unsigned clobber_u64s,
501 unsigned new_u64s)
502{
503 const struct bkey_packed *end = btree_bkey_last(b, t);
504 struct btree_node_iter_set *set;
505 unsigned offset = __btree_node_key_to_offset(b, where);
506 int shift = new_u64s - clobber_u64s;
507 unsigned old_end = t->end_offset - shift;
508
509 btree_node_iter_for_each(node_iter, set)
510 if (set->end == old_end)
511 goto found;
512
513 /* didn't find the bset in the iterator - might have to readd it: */
514 if (new_u64s &&
515 btree_iter_pos_cmp(iter, b, where) > 0) {
516 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
517
518 bch2_btree_node_iter_push(node_iter, b, where, end);
519
520 if (!b->level &&
521 node_iter == &iter->l[0].iter)
522 bkey_disassemble(b,
523 bch2_btree_node_iter_peek_all(node_iter, b),
524 &iter->k);
525 }
526 return;
527found:
528 set->end = t->end_offset;
529
530 /* Iterator hasn't gotten to the key that changed yet: */
531 if (set->k < offset)
532 return;
533
534 if (new_u64s &&
535 btree_iter_pos_cmp(iter, b, where) > 0) {
536 set->k = offset;
537 } else if (set->k < offset + clobber_u64s) {
538 set->k = offset + new_u64s;
539 if (set->k == set->end)
540 bch2_btree_node_iter_set_drop(node_iter, set);
541 } else {
542 set->k = (int) set->k + shift;
543 goto iter_current_key_not_modified;
544 }
545
546 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
547
548 bch2_btree_node_iter_sort(node_iter, b);
549 if (!b->level && node_iter == &iter->l[0].iter) {
550 /*
551 * not legal to call bkey_debugcheck() here, because we're
552 * called midway through the update path after update has been
553 * marked but before deletes have actually happened:
554 */
555#if 0
556 __btree_iter_peek_all(iter, &iter->l[0], &iter->k);
557#endif
558 struct btree_iter_level *l = &iter->l[0];
559 struct bkey_packed *k =
560 bch2_btree_node_iter_peek_all(&l->iter, l->b);
561
562 if (unlikely(!k))
563 iter->k.type = KEY_TYPE_deleted;
564 else
565 bkey_disassemble(l->b, k, &iter->k);
566 }
567iter_current_key_not_modified:
568
569 /*
570 * Interior nodes are special because iterators for interior nodes don't
571 * obey the usual invariants regarding the iterator position:
572 *
573 * We may have whiteouts that compare greater than the iterator
574 * position, and logically should be in the iterator, but that we
575 * skipped past to find the first live key greater than the iterator
576 * position. This becomes an issue when we insert a new key that is
577 * greater than the current iterator position, but smaller than the
578 * whiteouts we've already skipped past - this happens in the course of
579 * a btree split.
580 *
581 * We have to rewind the iterator back to before those whiteouts here,
582 * else bkey_node_iter_prev() is not going to work and who knows what
583 * else would happen. And we have to do it manually, because here we've
584 * already done the insert and the iterator is currently inconsistent:
585 *
586 * We've got multiple competing invariants, here - we have to be careful
587 * about rewinding iterators for interior nodes, because they should
588 * always point to the key for the child node the btree iterator points
589 * to.
590 */
591 if (b->level && new_u64s &&
592 btree_iter_pos_cmp(iter, b, where) > 0) {
593 struct bset_tree *t, *where_set = bch2_bkey_to_bset_inlined(b, where);
594 struct bkey_packed *k;
595
596 for_each_bset(b, t) {
597 if (where_set == t)
598 continue;
599
600 k = bch2_bkey_prev_all(b, t,
601 bch2_btree_node_iter_bset_pos(node_iter, b, t));
602 if (k &&
603 bkey_iter_cmp(b, k, where) > 0) {
604 struct btree_node_iter_set *set;
605 unsigned offset =
606 __btree_node_key_to_offset(b, bkey_next(k));
607
608 btree_node_iter_for_each(node_iter, set)
609 if (set->k == offset) {
610 set->k = __btree_node_key_to_offset(b, k);
611 bch2_btree_node_iter_sort(node_iter, b);
612 goto next_bset;
613 }
614
615 bch2_btree_node_iter_push(node_iter, b, k,
616 btree_bkey_last(b, t));
617 }
618next_bset:
619 t = t;
620 }
621 }
622}
623
624void bch2_btree_node_iter_fix(struct btree_iter *iter,
625 struct btree *b,
626 struct btree_node_iter *node_iter,
627 struct bkey_packed *where,
628 unsigned clobber_u64s,
629 unsigned new_u64s)
630{
631 struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
632 struct btree_iter *linked;
633
634 if (node_iter != &iter->l[b->level].iter)
635 __bch2_btree_node_iter_fix(iter, b, node_iter, t,
636 where, clobber_u64s, new_u64s);
637
638 trans_for_each_iter_with_node(iter->trans, b, linked)
639 __bch2_btree_node_iter_fix(linked, b,
640 &linked->l[b->level].iter, t,
641 where, clobber_u64s, new_u64s);
642}
643
644static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
645 struct btree_iter_level *l,
646 struct bkey *u,
647 struct bkey_packed *k)
648{
649 struct bkey_s_c ret;
650
651 if (unlikely(!k)) {
652 /*
653 * signal to bch2_btree_iter_peek_slot() that we're currently at
654 * a hole
655 */
656 u->type = KEY_TYPE_deleted;
657 return bkey_s_c_null;
658 }
659
660 ret = bkey_disassemble(l->b, k, u);
661
662 if (debug_check_bkeys(iter->trans->c))
663 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
664
665 return ret;
666}
667
668/* peek_all() doesn't skip deleted keys */
669static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
670 struct btree_iter_level *l,
671 struct bkey *u)
672{
673 return __btree_iter_unpack(iter, l, u,
674 bch2_btree_node_iter_peek_all(&l->iter, l->b));
675}
676
677static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
678 struct btree_iter_level *l)
679{
680 return __btree_iter_unpack(iter, l, &iter->k,
681 bch2_btree_node_iter_peek(&l->iter, l->b));
682}
683
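/*
 * Advance the node iterator at @l until it is at or after the iterator
 * position. Gives up and returns false after @max_advance steps (callers can
 * then reinit the node iterator instead); max_advance < 0 means no limit.
 */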
684static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
685 struct btree_iter_level *l,
686 int max_advance)
687{
688 struct bkey_packed *k;
689 int nr_advanced = 0;
690
691 while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
692 btree_iter_pos_cmp(iter, l->b, k) < 0) {
693 if (max_advance > 0 && nr_advanced >= max_advance)
694 return false;
695
696 bch2_btree_node_iter_advance(&l->iter, l->b);
697 nr_advanced++;
698 }
699
700 return true;
701}
702
703/*
704 * Verify that iterator for parent node points to child node:
705 */
706static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
707{
708 struct btree_iter_level *l;
709 unsigned plevel;
710 bool parent_locked;
711 struct bkey_packed *k;
712
713 if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
714 return;
715
716 plevel = b->level + 1;
717 if (!btree_iter_node(iter, plevel))
718 return;
719
720 parent_locked = btree_node_locked(iter, plevel);
721
722 if (!bch2_btree_node_relock(iter, plevel))
723 return;
724
725 l = &iter->l[plevel];
726 k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
727 if (!k ||
728 bkey_deleted(k) ||
729 bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
730 char buf[100];
731 struct bkey uk = bkey_unpack_key(b, k);
732
733 bch2_bkey_to_text(&PBUF(buf), &uk);
734 panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
735 buf, b->key.k.p.inode, b->key.k.p.offset);
736 }
737
738 if (!parent_locked)
739 btree_node_unlock(iter, b->level + 1);
740}
741
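/*
 * b->key.k.p is the last key this node covers: if the iterator position
 * compares after it, the position is past this node.
 */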
742static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
743 struct btree *b)
744{
745 return __btree_iter_pos_cmp(iter, NULL,
746 bkey_to_packed(&b->key), true) < 0;
747}
748
749static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
750 struct btree *b)
751{
752 return iter->btree_id == b->btree_id &&
753 bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
754 !btree_iter_pos_after_node(iter, b);
755}
756
757static inline void __btree_iter_init(struct btree_iter *iter,
758 unsigned level)
759{
760 struct btree_iter_level *l = &iter->l[level];
761
762 bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos);
763
764 if (iter->flags & BTREE_ITER_IS_EXTENTS)
765 btree_iter_advance_to_pos(iter, l, -1);
766
767 /* Skip to first non whiteout: */
768 if (level)
769 bch2_btree_node_iter_peek(&l->iter, l->b);
770
771 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
772}
773
774static inline void btree_iter_node_set(struct btree_iter *iter,
775 struct btree *b)
776{
777 btree_iter_verify_new_node(iter, b);
778
779 EBUG_ON(!btree_iter_pos_in_node(iter, b));
780 EBUG_ON(b->lock.state.seq & 1);
781
782 iter->l[b->level].lock_seq = b->lock.state.seq;
783 iter->l[b->level].b = b;
784 __btree_iter_init(iter, b->level);
785}
786
787/*
788 * A btree node is being replaced - update the iterator to point to the new
789 * node:
790 */
791void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
792{
793 enum btree_node_locked_type t;
794 struct btree_iter *linked;
795
796 trans_for_each_iter(iter->trans, linked)
797 if (btree_iter_pos_in_node(linked, b)) {
798 /*
799 * bch2_btree_iter_node_drop() has already been called -
800 * the old node we're replacing has already been
801 * unlocked and the pointer invalidated
802 */
803 BUG_ON(btree_node_locked(linked, b->level));
804
805 t = btree_lock_want(linked, b->level);
806 if (t != BTREE_NODE_UNLOCKED) {
807 six_lock_increment(&b->lock, (enum six_lock_type) t);
808 mark_btree_node_locked(linked, b->level, (enum six_lock_type) t);
809 }
810
811 btree_iter_node_set(linked, b);
812 }
813
814 six_unlock_intent(&b->lock);
815}
816
817void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
818{
819 struct btree_iter *linked;
820 unsigned level = b->level;
821
822 trans_for_each_iter(iter->trans, linked)
823 if (linked->l[level].b == b) {
824 __btree_node_unlock(linked, level);
825 linked->l[level].b = BTREE_ITER_NOT_END;
826 }
827}
828
829/*
830 * A btree node has been modified in such a way as to invalidate iterators - fix
831 * them:
832 */
833void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
834{
835 struct btree_iter *linked;
836
837 trans_for_each_iter_with_node(iter->trans, b, linked)
838 __btree_iter_init(linked, b->level);
839}
840
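/*
 * Lock the current btree root, retrying if the root was replaced underneath
 * us. Returns 1 if the tree is shallower than @depth_want (nothing to lock at
 * that depth), -EINTR if the lock can't be taken without violating lock
 * ordering.
 */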
841static inline int btree_iter_lock_root(struct btree_iter *iter,
842 unsigned depth_want)
843{
844 struct bch_fs *c = iter->trans->c;
845 struct btree *b;
846 enum six_lock_type lock_type;
847 unsigned i;
848
849 EBUG_ON(iter->nodes_locked);
850
851 while (1) {
852 b = READ_ONCE(c->btree_roots[iter->btree_id].b);
853 iter->level = READ_ONCE(b->level);
854
855 if (unlikely(iter->level < depth_want)) {
856 /*
857 * the root is at a lower depth than the depth we want:
858 * got to the end of the btree, or we're walking nodes
859 * greater than some depth and there are no nodes >=
860 * that depth
861 */
862 iter->level = depth_want;
863 iter->l[iter->level].b = NULL;
864 return 1;
865 }
866
867 lock_type = __btree_lock_want(iter, iter->level);
868 if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
869 iter, lock_type, true)))
870 return -EINTR;
871
872 if (likely(b == c->btree_roots[iter->btree_id].b &&
873 b->level == iter->level &&
874 !race_fault())) {
875 for (i = 0; i < iter->level; i++)
876 iter->l[i].b = BTREE_ITER_NOT_END;
877 iter->l[iter->level].b = b;
878
879 mark_btree_node_locked(iter, iter->level, lock_type);
880 btree_iter_node_set(iter, b);
881 return 0;
882
883 }
884
885 six_unlock_type(&b->lock, lock_type);
886 }
887}
888
889noinline
890static void btree_iter_prefetch(struct btree_iter *iter)
891{
892 struct bch_fs *c = iter->trans->c;
893 struct btree_iter_level *l = &iter->l[iter->level];
894 struct btree_node_iter node_iter = l->iter;
895 struct bkey_packed *k;
896 BKEY_PADDED(k) tmp;
897 unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
898 ? (iter->level > 1 ? 0 : 2)
899 : (iter->level > 1 ? 1 : 16);
900 bool was_locked = btree_node_locked(iter, iter->level);
901
902 while (nr) {
903 if (!bch2_btree_node_relock(iter, iter->level))
904 return;
905
906 bch2_btree_node_iter_advance(&node_iter, l->b);
907 k = bch2_btree_node_iter_peek(&node_iter, l->b);
908 if (!k)
909 break;
910
911 bch2_bkey_unpack(l->b, &tmp.k, k);
912 bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
913 }
914
915 if (!was_locked)
916 btree_node_unlock(iter, iter->level);
917}
918
919static inline int btree_iter_down(struct btree_iter *iter)
920{
921 struct bch_fs *c = iter->trans->c;
922 struct btree_iter_level *l = &iter->l[iter->level];
923 struct btree *b;
924 unsigned level = iter->level - 1;
925 enum six_lock_type lock_type = __btree_lock_want(iter, level);
926 BKEY_PADDED(k) tmp;
927
928 BUG_ON(!btree_node_locked(iter, iter->level));
929
930 bch2_bkey_unpack(l->b, &tmp.k,
931 bch2_btree_node_iter_peek(&l->iter, l->b));
932
933 b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, true);
934 if (unlikely(IS_ERR(b)))
935 return PTR_ERR(b);
936
937 mark_btree_node_locked(iter, level, lock_type);
938 btree_iter_node_set(iter, b);
939
940 if (iter->flags & BTREE_ITER_PREFETCH)
941 btree_iter_prefetch(iter);
942
943 iter->level = level;
944
945 return 0;
946}
947
948static void btree_iter_up(struct btree_iter *iter)
949{
950 btree_node_unlock(iter, iter->level++);
951}
952
953int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
954
955static int __btree_iter_traverse_all(struct btree_trans *trans,
956 struct btree_iter *iter, int ret)
957{
958 struct bch_fs *c = trans->c;
959 u8 sorted[BTREE_ITER_MAX];
960 unsigned i, nr_sorted = 0;
961
962 trans_for_each_iter(trans, iter)
963 sorted[nr_sorted++] = iter - trans->iters;
964
965#define btree_iter_cmp_by_idx(_l, _r) \
966 btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
967
968 bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
969#undef btree_iter_cmp_by_idx
970
971retry_all:
972 bch2_btree_trans_unlock(trans);
973
974 if (unlikely(ret == -ENOMEM)) {
975 struct closure cl;
976
977 closure_init_stack(&cl);
978
979 do {
980 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
981 closure_sync(&cl);
982 } while (ret);
983 }
984
985 if (unlikely(ret == -EIO)) {
986 iter->flags |= BTREE_ITER_ERROR;
987 iter->l[iter->level].b = BTREE_ITER_NOT_END;
988 goto out;
989 }
990
991 BUG_ON(ret && ret != -EINTR);
992
993 /* Now, redo traversals in correct order: */
994 for (i = 0; i < nr_sorted; i++) {
995 iter = &trans->iters[sorted[i]];
996
997 do {
998 ret = __bch2_btree_iter_traverse(iter);
999 } while (ret == -EINTR);
1000
1001 if (ret)
1002 goto retry_all;
1003 }
1004
1005 ret = btree_trans_has_multiple_iters(trans) ? -EINTR : 0;
1006out:
1007 bch2_btree_cache_cannibalize_unlock(c);
1008 return ret;
1009}
1010
1011int bch2_btree_iter_traverse_all(struct btree_trans *trans)
1012{
1013 return __btree_iter_traverse_all(trans, NULL, 0);
1014}
1015
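/*
 * Walk up from iter->level, dropping levels that can't be relocked (or, if
 * @check_pos, that no longer cover the iterator position); returns the first
 * level that could be kept locked.
 */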
1016static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
1017 bool check_pos)
1018{
1019 unsigned l = iter->level;
1020
1021 while (btree_iter_node(iter, l) &&
1022 !(is_btree_node(iter, l) &&
1023 bch2_btree_node_relock(iter, l) &&
1024 (!check_pos ||
1025 btree_iter_pos_in_node(iter, iter->l[l].b)))) {
1026 btree_node_unlock(iter, l);
1027 iter->l[l].b = BTREE_ITER_NOT_END;
1028 l++;
1029 }
1030
1031 return l;
1032}
1033
1034/*
1035 * This is the main state machine for walking down the btree - walks down to a
1036 * specified depth
1037 *
1038 * Returns 0 on success, -EIO on error (error reading in a btree node).
1039 *
1040 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1041 * stashed in the iterator and returned from bch2_btree_iter_unlock().
1042 */
1043int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
1044{
1045 unsigned depth_want = iter->level;
1046
1047 if (unlikely(iter->level >= BTREE_MAX_DEPTH))
1048 return 0;
1049
1050 if (bch2_btree_iter_relock(iter))
1051 return 0;
1052
1053 /*
1054 * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
1055 * here unnecessary
1056 */
1057 iter->level = btree_iter_up_until_locked(iter, true);
1058
1059 /*
1060 * If we've got a btree node locked (i.e. we aren't about to relock the
1061 * root) - advance its node iterator if necessary:
1062 *
1063 * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
1064 */
1065 if (btree_iter_node(iter, iter->level))
1066 btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);
1067
1068 /*
1069 * Note: iter->l[iter->level].b may be temporarily NULL here - that
1070 * would indicate to other code that we got to the end of the btree,
1071 * here it indicates that relocking the root failed - it's critical that
1072 * btree_iter_lock_root() comes next and that it can't fail
1073 */
1074 while (iter->level > depth_want) {
1075 int ret = btree_iter_node(iter, iter->level)
1076 ? btree_iter_down(iter)
1077 : btree_iter_lock_root(iter, depth_want);
1078 if (unlikely(ret)) {
1079 if (ret == 1)
1080 return 0;
1081
1082 iter->level = depth_want;
1083 iter->l[iter->level].b = BTREE_ITER_NOT_END;
1084 return ret;
1085 }
1086 }
1087
1088 iter->uptodate = BTREE_ITER_NEED_PEEK;
1089
1090 bch2_btree_trans_verify_locks(iter->trans);
1091 __bch2_btree_iter_verify(iter, iter->l[iter->level].b);
1092 return 0;
1093}
1094
1095int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
1096{
1097 int ret;
1098
1099 ret = __bch2_btree_iter_traverse(iter);
1100 if (unlikely(ret))
1101 ret = __btree_iter_traverse_all(iter->trans, iter, ret);
1102
1103 BUG_ON(ret == -EINTR && !btree_trans_has_multiple_iters(iter->trans));
1104
1105 return ret;
1106}
1107
1108static inline void bch2_btree_iter_checks(struct btree_iter *iter,
1109 enum btree_iter_type type)
1110{
1111 EBUG_ON(iter->btree_id >= BTREE_ID_NR);
1112 EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
1113 (iter->btree_id == BTREE_ID_EXTENTS &&
1114 type != BTREE_ITER_NODES));
1115
1116 bch2_btree_trans_verify_locks(iter->trans);
1117}
1118
1119/* Iterate across nodes (leaf and interior nodes) */
1120
1121struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1122{
1123 struct btree *b;
1124 int ret;
1125
1126 bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
1127
1128 if (iter->uptodate == BTREE_ITER_UPTODATE)
1129 return iter->l[iter->level].b;
1130
1131 ret = bch2_btree_iter_traverse(iter);
1132 if (ret)
1133 return NULL;
1134
1135 b = btree_iter_node(iter, iter->level);
1136 if (!b)
1137 return NULL;
1138
1139 BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
1140
1141 iter->pos = b->key.k.p;
1142 iter->uptodate = BTREE_ITER_UPTODATE;
1143
1144 return b;
1145}
1146
1147struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
1148{
1149 struct btree *b;
1150 int ret;
1151
1152 bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
1153
1154 /* already got to end? */
1155 if (!btree_iter_node(iter, iter->level))
1156 return NULL;
1157
1158 btree_iter_up(iter);
1159
1160 if (!bch2_btree_node_relock(iter, iter->level))
1161 btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
1162
1163 ret = bch2_btree_iter_traverse(iter);
1164 if (ret)
1165 return NULL;
1166
1167 /* got to end? */
1168 b = btree_iter_node(iter, iter->level);
1169 if (!b)
1170 return NULL;
1171
1172 if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
1173 /*
1174 * Haven't gotten to the end of the parent node: go back down to
1175 * the next child node
1176 */
1177
1178 /*
1179 * We don't really want to be unlocking here except we can't
1180 * directly tell btree_iter_traverse() "traverse to this level"
1181 * except by setting iter->level, so we have to unlock so we
1182 * don't screw up our lock invariants:
1183 */
1184 if (btree_node_read_locked(iter, iter->level))
1185 btree_node_unlock(iter, iter->level);
1186
1187 /* ick: */
1188 iter->pos = iter->btree_id == BTREE_ID_INODES
1189 ? btree_type_successor(iter->btree_id, iter->pos)
1190 : bkey_successor(iter->pos);
1191 iter->level = depth;
1192
1193 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1194 ret = bch2_btree_iter_traverse(iter);
1195 if (ret)
1196 return NULL;
1197
1198 b = iter->l[iter->level].b;
1199 }
1200
1201 iter->pos = b->key.k.p;
1202 iter->uptodate = BTREE_ITER_UPTODATE;
1203
1204 return b;
1205}
1206
1207/* Iterate across keys (in leaf nodes only) */
1208
1209void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
1210{
1211 struct btree_iter_level *l = &iter->l[0];
1212
1213 EBUG_ON(iter->level != 0);
1214 EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
1215 EBUG_ON(!btree_node_locked(iter, 0));
1216 EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
1217
1218 iter->pos = new_pos;
1219 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1220
1221 btree_iter_advance_to_pos(iter, l, -1);
1222
1223 if (bch2_btree_node_iter_end(&l->iter) &&
1224 btree_iter_pos_after_node(iter, l->b))
1225 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1226}
1227
1228void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
1229{
1230 int cmp = bkey_cmp(new_pos, iter->pos);
1231 unsigned level;
1232
1233 if (!cmp)
1234 return;
1235
1236 iter->pos = new_pos;
1237
1238 level = btree_iter_up_until_locked(iter, true);
1239
1240 if (btree_iter_node(iter, level)) {
1241 /*
1242 * We might have to skip over many keys, or just a few: try
1243 * advancing the node iterator, and if we have to skip over too
1244 * many keys just reinit it (or if we're rewinding, since that
1245 * is expensive).
1246 */
1247 if (cmp < 0 ||
1248 !btree_iter_advance_to_pos(iter, &iter->l[level], 8))
1249 __btree_iter_init(iter, level);
1250
1251 /* Don't leave it locked if we're not supposed to: */
1252 if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED)
1253 btree_node_unlock(iter, level);
1254 }
1255
1256 if (level != iter->level)
1257 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1258 else
1259 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1260}
1261
1262static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
1263{
1264 struct btree_iter_level *l = &iter->l[0];
1265 struct bkey_s_c ret = { .k = &iter->k };
1266
1267 if (!bkey_deleted(&iter->k)) {
1268 EBUG_ON(bch2_btree_node_iter_end(&l->iter));
1269 ret.v = bkeyp_val(&l->b->format,
1270 __bch2_btree_node_iter_peek_all(&l->iter, l->b));
1271 }
1272
1273 if (debug_check_bkeys(iter->trans->c) &&
1274 !bkey_deleted(ret.k))
1275 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
1276 return ret;
1277}
1278
1279struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
1280{
1281 struct btree_iter_level *l = &iter->l[0];
1282 struct bkey_s_c k;
1283 int ret;
1284
1285 bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1286
1287 if (iter->uptodate == BTREE_ITER_UPTODATE)
1288 return btree_iter_peek_uptodate(iter);
1289
1290 while (1) {
1291 ret = bch2_btree_iter_traverse(iter);
1292 if (unlikely(ret))
1293 return bkey_s_c_err(ret);
1294
1295 k = __btree_iter_peek(iter, l);
1296 if (likely(k.k))
1297 break;
1298
1299 /* got to the end of the leaf, iterator needs to be traversed: */
1300 iter->pos = l->b->key.k.p;
1301 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1302
1303 if (!bkey_cmp(iter->pos, POS_MAX))
1304 return bkey_s_c_null;
1305
1306 iter->pos = btree_type_successor(iter->btree_id, iter->pos);
1307 }
1308
1309 /*
1310 * iter->pos should always be equal to the key we just
1311 * returned - except extents can straddle iter->pos:
1312 */
1313 if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
1314 bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1315 iter->pos = bkey_start_pos(k.k);
1316
1317 iter->uptodate = BTREE_ITER_UPTODATE;
1318 return k;
1319}
1320
1321static noinline
1322struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
1323{
1324 struct btree_iter_level *l = &iter->l[0];
1325
1326 iter->pos = l->b->key.k.p;
1327 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1328
1329 if (!bkey_cmp(iter->pos, POS_MAX))
1330 return bkey_s_c_null;
1331
1332 iter->pos = btree_type_successor(iter->btree_id, iter->pos);
1333
1334 return bch2_btree_iter_peek(iter);
1335}
1336
1337struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
1338{
1339 struct btree_iter_level *l = &iter->l[0];
1340 struct bkey_packed *p;
1341 struct bkey_s_c k;
1342
1343 bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1344
1345 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1346 k = bch2_btree_iter_peek(iter);
1347 if (IS_ERR_OR_NULL(k.k))
1348 return k;
1349 }
1350
1351 do {
1352 bch2_btree_node_iter_advance(&l->iter, l->b);
1353 p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1354 if (unlikely(!p))
1355 return bch2_btree_iter_peek_next_leaf(iter);
1356 } while (bkey_whiteout(p));
1357
1358 k = __btree_iter_unpack(iter, l, &iter->k, p);
1359
1360 EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
1361 iter->pos = bkey_start_pos(k.k);
1362 return k;
1363}
1364
1365struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
1366{
1367 struct btree_iter_level *l = &iter->l[0];
1368 struct bkey_packed *p;
1369 struct bkey_s_c k;
1370 int ret;
1371
1372 bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1373
1374 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1375 k = bch2_btree_iter_peek(iter);
1376 if (IS_ERR(k.k))
1377 return k;
1378 }
1379
1380 while (1) {
1381 p = bch2_btree_node_iter_prev(&l->iter, l->b);
1382 if (likely(p))
1383 break;
1384
1385 iter->pos = l->b->data->min_key;
1386 if (!bkey_cmp(iter->pos, POS_MIN))
1387 return bkey_s_c_null;
1388
1389 bch2_btree_iter_set_pos(iter,
1390 btree_type_predecessor(iter->btree_id, iter->pos));
1391
1392 ret = bch2_btree_iter_traverse(iter);
1393 if (unlikely(ret))
1394 return bkey_s_c_err(ret);
1395
1396 p = bch2_btree_node_iter_peek(&l->iter, l->b);
1397 if (p)
1398 break;
1399 }
1400
1401 k = __btree_iter_unpack(iter, l, &iter->k, p);
1402
1403 EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
1404
1405 iter->pos = bkey_start_pos(k.k);
1406 iter->uptodate = BTREE_ITER_UPTODATE;
1407 return k;
1408}
1409
1410static inline struct bkey_s_c
1411 __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
1412{
1413 struct btree_iter_level *l = &iter->l[0];
1414 struct btree_node_iter node_iter;
1415 struct bkey_s_c k;
1416 struct bkey n;
1417 int ret;
1418
1419recheck:
1420 while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
1421 bkey_deleted(k.k) &&
1422 bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
1423 bch2_btree_node_iter_advance(&l->iter, l->b);
1424
1425 /*
1426 * iterator is now at the correct position for inserting at iter->pos,
1427 * but we need to keep iterating until we find the first non whiteout so
1428 * we know how big a hole we have, if any:
1429 */
1430
1431 node_iter = l->iter;
1432 if (k.k && bkey_whiteout(k.k))
1433 k = __btree_iter_unpack(iter, l, &iter->k,
1434 bch2_btree_node_iter_peek(&node_iter, l->b));
1435
1436 /*
1437 * If we got to the end of the node, check if we need to traverse to the
1438 * next node:
1439 */
1440 if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
1441 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1442 ret = bch2_btree_iter_traverse(iter);
1443 if (unlikely(ret))
1444 return bkey_s_c_err(ret);
1445
1446 goto recheck;
1447 }
1448
1449 if (k.k &&
1450 !bkey_whiteout(k.k) &&
1451 bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
1452 /*
1453 * if we skipped forward to find the first non whiteout and
1454 * there _wasn't_ actually a hole, we want the iterator to be
1455 * pointed at the key we found:
1456 */
1457 l->iter = node_iter;
1458
1459 EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
1460 EBUG_ON(bkey_deleted(k.k));
1461 iter->uptodate = BTREE_ITER_UPTODATE;
1462 return k;
1463 }
1464
1465 /* hole */
1466
1467 /* holes can't span inode numbers: */
1468 if (iter->pos.offset == KEY_OFFSET_MAX) {
1469 if (iter->pos.inode == KEY_INODE_MAX)
1470 return bkey_s_c_null;
1471
1472 iter->pos = bkey_successor(iter->pos);
1473 goto recheck;
1474 }
1475
1476 if (!k.k)
1477 k.k = &l->b->key.k;
1478
1479 bkey_init(&n);
1480 n.p = iter->pos;
1481 bch2_key_resize(&n,
1482 min_t(u64, KEY_SIZE_MAX,
1483 (k.k->p.inode == n.p.inode
1484 ? bkey_start_offset(k.k)
1485 : KEY_OFFSET_MAX) -
1486 n.p.offset));
1487
1488 EBUG_ON(!n.size);
1489
1490 iter->k = n;
1491 iter->uptodate = BTREE_ITER_UPTODATE;
1492 return (struct bkey_s_c) { &iter->k, NULL };
1493}
1494
1495static inline struct bkey_s_c
1496__bch2_btree_iter_peek_slot(struct btree_iter *iter)
1497{
1498 struct btree_iter_level *l = &iter->l[0];
1499 struct bkey_s_c k;
1500 int ret;
1501
1502 if (iter->flags & BTREE_ITER_IS_EXTENTS)
1503 return __bch2_btree_iter_peek_slot_extents(iter);
1504
1505recheck:
1506 while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
1507 bkey_deleted(k.k) &&
1508 bkey_cmp(k.k->p, iter->pos) == 0)
1509 bch2_btree_node_iter_advance(&l->iter, l->b);
1510
1511 /*
1512 * If we got to the end of the node, check if we need to traverse to the
1513 * next node:
1514 */
1515 if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
1516 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1517 ret = bch2_btree_iter_traverse(iter);
1518 if (unlikely(ret))
1519 return bkey_s_c_err(ret);
1520
1521 goto recheck;
1522 }
1523
1524 if (k.k &&
1525 !bkey_deleted(k.k) &&
1526 !bkey_cmp(iter->pos, k.k->p)) {
1527 iter->uptodate = BTREE_ITER_UPTODATE;
1528 return k;
1529 } else {
1530 /* hole */
1531 bkey_init(&iter->k);
1532 iter->k.p = iter->pos;
1533
1534 iter->uptodate = BTREE_ITER_UPTODATE;
1535 return (struct bkey_s_c) { &iter->k, NULL };
1536 }
1537}
1538
1539struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
1540{
1541 int ret;
1542
1543 bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
1544
1545 if (iter->uptodate == BTREE_ITER_UPTODATE)
1546 return btree_iter_peek_uptodate(iter);
1547
1548 ret = bch2_btree_iter_traverse(iter);
1549 if (unlikely(ret))
1550 return bkey_s_c_err(ret);
1551
1552 return __bch2_btree_iter_peek_slot(iter);
1553}
1554
1555struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
1556{
1557 bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
1558
1559 iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
1560
1561 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1562 /*
1563 * XXX: when we just need to relock we should be able to avoid
1564 * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
1565 * for that to work
1566 */
1567 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1568
1569 return bch2_btree_iter_peek_slot(iter);
1570 }
1571
1572 if (!bkey_deleted(&iter->k))
1573 bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b);
1574
1575 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1576
1577 return __bch2_btree_iter_peek_slot(iter);
1578}
1579
1580static inline void bch2_btree_iter_init(struct btree_trans *trans,
1581 struct btree_iter *iter, enum btree_id btree_id,
1582 struct bpos pos, unsigned flags)
1583{
1584 struct bch_fs *c = trans->c;
1585 unsigned i;
1586
1587 if (btree_id == BTREE_ID_EXTENTS &&
1588 !(flags & BTREE_ITER_NODES))
1589 flags |= BTREE_ITER_IS_EXTENTS;
1590
1591 iter->trans = trans;
1592 iter->pos = pos;
1593 bkey_init(&iter->k);
1594 iter->k.p = pos;
1595 iter->flags = flags;
1596 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1597 iter->btree_id = btree_id;
1598 iter->level = 0;
1599 iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0;
1600 iter->nodes_locked = 0;
1601 iter->nodes_intent_locked = 0;
1602 for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1603 iter->l[i].b = NULL;
1604 iter->l[iter->level].b = BTREE_ITER_NOT_END;
1605
1606 prefetch(c->btree_roots[btree_id].b);
1607}
1608
1609/* new transactional stuff: */
1610
1611int bch2_trans_iter_put(struct btree_trans *trans,
1612 struct btree_iter *iter)
1613{
1614 int ret = btree_iter_err(iter);
1615
1616 trans->iters_live &= ~(1ULL << iter->idx);
1617 return ret;
1618}
1619
1620static inline void __bch2_trans_iter_free(struct btree_trans *trans,
1621 unsigned idx)
1622{
1623 __bch2_btree_iter_unlock(&trans->iters[idx]);
1624 trans->iters_linked &= ~(1ULL << idx);
1625 trans->iters_live &= ~(1ULL << idx);
1626 trans->iters_touched &= ~(1ULL << idx);
1627 trans->iters_unlink_on_restart &= ~(1ULL << idx);
1628 trans->iters_unlink_on_commit &= ~(1ULL << idx);
1629}
1630
1631int bch2_trans_iter_free(struct btree_trans *trans,
1632 struct btree_iter *iter)
1633{
1634 int ret = btree_iter_err(iter);
1635
1636 __bch2_trans_iter_free(trans, iter->idx);
1637 return ret;
1638}
1639
1640int bch2_trans_iter_free_on_commit(struct btree_trans *trans,
1641 struct btree_iter *iter)
1642{
1643 int ret = btree_iter_err(iter);
1644
1645 trans->iters_unlink_on_commit |= 1ULL << iter->idx;
1646 return ret;
1647}
1648
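/*
 * Grow the transaction's iterator and update arrays, falling back to the
 * preallocated mempool (fixed at BTREE_ITER_MAX entries) if the allocation
 * fails. If iterators were already live, restart the transaction, presumably
 * because callers may still hold pointers into the old array.
 */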
1649static int btree_trans_realloc_iters(struct btree_trans *trans,
1650 unsigned new_size)
1651{
1652 void *new_iters, *new_updates;
1653
1654 BUG_ON(new_size > BTREE_ITER_MAX);
1655
1656 if (new_size <= trans->size)
1657 return 0;
1658
1659 BUG_ON(trans->used_mempool);
1660
1661 bch2_trans_unlock(trans);
1662
1663 new_iters = kmalloc(sizeof(struct btree_iter) * new_size +
1664 sizeof(struct btree_insert_entry) * (new_size + 4),
1665 GFP_NOFS);
1666 if (new_iters)
1667 goto success;
1668
1669 new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
1670 new_size = BTREE_ITER_MAX;
1671
1672 trans->used_mempool = true;
1673success:
1674 new_updates = new_iters + sizeof(struct btree_iter) * new_size;
1675
1676 memcpy(new_iters, trans->iters,
1677 sizeof(struct btree_iter) * trans->nr_iters);
1678 memcpy(new_updates, trans->updates,
1679 sizeof(struct btree_insert_entry) * trans->nr_updates);
1680
1681 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
1682 memset(trans->iters, POISON_FREE,
1683 sizeof(struct btree_iter) * trans->nr_iters +
1684 sizeof(struct btree_insert_entry) * trans->nr_iters);
1685
1686 if (trans->iters != trans->iters_onstack)
1687 kfree(trans->iters);
1688
1689 trans->iters = new_iters;
1690 trans->updates = new_updates;
1691 trans->size = new_size;
1692
1693 if (trans->iters_live) {
1694 trans_restart();
1695 return -EINTR;
1696 }
1697
1698 return 0;
1699}
1700
1701 void bch2_trans_preload_iters(struct btree_trans *trans)
1702{
1703 btree_trans_realloc_iters(trans, BTREE_ITER_MAX);
1704}
1705
1706static int btree_trans_iter_alloc(struct btree_trans *trans)
1707{
1708 unsigned idx = ffz(trans->iters_linked);
1709
1710 if (idx < trans->nr_iters)
1711 goto got_slot;
1712
1713 if (trans->nr_iters == trans->size) {
1714 int ret = btree_trans_realloc_iters(trans, trans->size * 2);
1715 if (ret)
1716 return ret;
1717 }
1718
1719 idx = trans->nr_iters++;
1720 BUG_ON(trans->nr_iters > trans->size);
1721
1722 trans->iters[idx].idx = idx;
1723got_slot:
1724 BUG_ON(trans->iters_linked & (1ULL << idx));
1725 trans->iters_linked |= 1ULL << idx;
1726 return idx;
1727}
1728
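/*
 * Look for an existing transaction iterator to reuse - matched by @iter_id if
 * one was given, otherwise by btree id and position - falling back to
 * allocating a new slot; the returned iterator is marked live and touched.
 */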
1729 static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
1730 unsigned btree_id, struct bpos pos,
1731 unsigned flags, u64 iter_id)
1732{
1733 struct btree_iter *iter;
1734 int idx;
1735
1736 BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
1737
1738 for (idx = 0; idx < trans->nr_iters; idx++) {
1739 if (!(trans->iters_linked & (1ULL << idx)))
1740 continue;
1741
1742 iter = &trans->iters[idx];
1743 if (iter_id
1744 ? iter->id == iter_id
1745 : (iter->btree_id == btree_id &&
1746 !bkey_cmp(iter->pos, pos)))
1747 goto found;
1748 }
1749 idx = -1;
1750found:
1751 if (idx < 0) {
1752 idx = btree_trans_iter_alloc(trans);
1753 if (idx < 0)
1754 return ERR_PTR(idx);
1755
1756 iter = &trans->iters[idx];
1757 iter->id = iter_id;
1758
1759 bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
1760 } else {
1761 iter = &trans->iters[idx];
1762
1763 iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
1764 iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
1765 }
1766
1767 BUG_ON(iter->btree_id != btree_id);
1768 BUG_ON(trans->iters_live & (1ULL << idx));
1769 trans->iters_live |= 1ULL << idx;
1770 trans->iters_touched |= 1ULL << idx;
1771
1772 BUG_ON(iter->btree_id != btree_id);
1773 BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
1774
1775 return iter;
1776}
1777
1778struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
1779 enum btree_id btree_id,
1780 struct bpos pos, unsigned flags,
1781 u64 iter_id)
1782{
1783 struct btree_iter *iter =
1784 __btree_trans_get_iter(trans, btree_id, pos, flags, iter_id);
1785
1786 if (!IS_ERR(iter))
1787 bch2_btree_iter_set_pos(iter, pos);
1788 return iter;
1789}
1790
1791struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
1792 enum btree_id btree_id,
1793 struct bpos pos,
1794 unsigned locks_want,
1795 unsigned depth,
1796 unsigned flags)
1797{
1798 struct btree_iter *iter =
1799 __btree_trans_get_iter(trans, btree_id, pos,
1800 flags|BTREE_ITER_NODES, 0);
1801 unsigned i;
1802
1803 BUG_ON(IS_ERR(iter));
1804 BUG_ON(bkey_cmp(iter->pos, pos));
1805
1806 iter->locks_want = locks_want;
1807 iter->level = depth;
1808
1809 for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1810 iter->l[i].b = NULL;
1811 iter->l[iter->level].b = BTREE_ITER_NOT_END;
1812
1813 return iter;
1814}
1815
1816struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
1817 struct btree_iter *src)
1818{
1819 struct btree_iter *iter;
1820 unsigned offset = offsetof(struct btree_iter, trans);
1821 int i, idx;
1822
1823 idx = btree_trans_iter_alloc(trans);
1824 if (idx < 0)
1825 return ERR_PTR(idx);
1826
1827 trans->iters_live |= 1ULL << idx;
1828 trans->iters_touched |= 1ULL << idx;
1829 trans->iters_unlink_on_restart |= 1ULL << idx;
1830
1831 iter = &trans->iters[idx];
1832
1833 memcpy((void *) iter + offset,
1834 (void *) src + offset,
1835 sizeof(*iter) - offset);
1836
1837 for (i = 0; i < BTREE_MAX_DEPTH; i++)
1838 if (btree_node_locked(iter, i))
1839 six_lock_increment(&iter->l[i].b->lock,
1840 __btree_lock_want(iter, i));
1841
1842 return &trans->iters[idx];
1843}
1844
1845void *bch2_trans_kmalloc(struct btree_trans *trans,
1846 size_t size)
1847{
1848 void *ret;
1849
1850 if (trans->mem_top + size > trans->mem_bytes) {
1851 size_t old_bytes = trans->mem_bytes;
1852 size_t new_bytes = roundup_pow_of_two(trans->mem_top + size);
1853 void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
1854
1855 if (!new_mem)
1856 return ERR_PTR(-ENOMEM);
1857
1858 trans->mem = new_mem;
1859 trans->mem_bytes = new_bytes;
1860
1861 if (old_bytes) {
1862 trans_restart();
1863 return ERR_PTR(-EINTR);
1864 }
1865 }
1866
1867 ret = trans->mem + trans->mem_top;
1868 trans->mem_top += size;
1869 return ret;
1870}
1871
1872int bch2_trans_unlock(struct btree_trans *trans)
1873{
1874 unsigned iters = trans->iters_linked;
1875 int ret = 0;
1876
1877 while (iters) {
1878 unsigned idx = __ffs(iters);
1879 struct btree_iter *iter = &trans->iters[idx];
1880
1881 ret = ret ?: btree_iter_err(iter);
1882
1883 __bch2_btree_iter_unlock(iter);
1884 iters ^= 1 << idx;
1885 }
1886
1887 return ret;
1888}
1889
1890inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters)
1891{
1892 iters &= trans->iters_linked;
1893 iters &= ~trans->iters_live;
1894
1895 while (iters) {
1896 unsigned idx = __ffs64(iters);
1897
1898 iters &= ~(1ULL << idx);
1899 __bch2_trans_iter_free(trans, idx);
1900 }
1901}
1902
1903 void __bch2_trans_begin(struct btree_trans *trans)
1904{
1905 u64 iters_to_unlink;
1906
1907 /*
1908 * On transaction restart, the transaction isn't required to allocate
1909 * all the same iterators it used on the last iteration:
1910 *
1911 * Unlink any iterators it didn't use this iteration, assuming it got
1912 * further (allocated an iter with a higher idx) than where the iter
1913 * was originally allocated:
1914 */
1915 iters_to_unlink = ~trans->iters_live &
1916 ((1ULL << fls64(trans->iters_live)) - 1);
1917
1918 iters_to_unlink |= trans->iters_unlink_on_restart;
1919 iters_to_unlink |= trans->iters_unlink_on_commit;
1920
1921 trans->iters_live = 0;
1922
1923 bch2_trans_unlink_iters(trans, iters_to_unlink);
1924
1925 trans->iters_touched = 0;
1926 trans->iters_unlink_on_restart = 0;
1927 trans->iters_unlink_on_commit = 0;
1928 trans->nr_updates = 0;
1929 trans->mem_top = 0;
1930
1931 bch2_btree_iter_traverse_all(trans);
1932}
1933
1934void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c)
1935{
1936 memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
1937
1938 trans->c = c;
1939 trans->size = ARRAY_SIZE(trans->iters_onstack);
1940 trans->iters = trans->iters_onstack;
1941 trans->updates = trans->updates_onstack;
1942}
1943
1944int bch2_trans_exit(struct btree_trans *trans)
1945{
1946 int ret = bch2_trans_unlock(trans);
1947
1948 kfree(trans->mem);
1949 if (trans->used_mempool)
1950 mempool_free(trans->iters, &trans->c->btree_iters_pool);
1951 else if (trans->iters != trans->iters_onstack)
1952 kfree(trans->iters);
1953 trans->mem = (void *) 0x1;
1954 trans->iters = (void *) 0x1;
1955 return ret;
1956}