// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"

#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>

static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
static inline void btree_trans_sort_iters(struct btree_trans *);
static struct btree_iter *btree_iter_child_alloc(struct btree_trans *,
					struct btree_iter *, unsigned long);
static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *,
					struct btree_iter *);
static void btree_iter_copy(struct btree_trans *, struct btree_iter *, struct btree_iter *);
static inline int btree_iter_cmp(const struct btree_iter *l,
				 const struct btree_iter *r)
{
	return   cmp_int(l->btree_id, r->btree_id) ?:
		-cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
		 bkey_cmp(l->real_pos, r->real_pos);
}
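/*
 * Illustrative summary of the order this gives: iterators sort by btree id
 * first, then - within one btree - cached iterators before non cached ones,
 * then by ascending real_pos. This is also the order node locks must be
 * acquired in, which is what makes the sorted iterator list usable for
 * deadlock avoidance in __bch2_btree_node_lock().
 */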
static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
{
	EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);

	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
		p = bpos_successor(p);
	} else {
		p = bpos_nosnap_successor(p);
		p.snapshot = iter->snapshot;
	}

	return p;
}
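/*
 * Illustrative example (assumed semantics, not exercised in this file): with
 * BTREE_ITER_ALL_SNAPSHOTS, the successor of (inode 1, offset 2, snapshot 3)
 * is (inode 1, offset 2, snapshot 4) - the snapshot field is the least
 * significant part of the search key. Without it, we advance to
 * (inode 1, offset 3) and force the snapshot field back to iter->snapshot,
 * so iteration stays within the iterator's snapshot.
 */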
static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
{
	EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);

	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
		p = bpos_predecessor(p);
	} else {
		p = bpos_nosnap_predecessor(p);
		p.snapshot = iter->snapshot;
	}

	return p;
}

static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
{
	return l < BTREE_MAX_DEPTH &&
		(unsigned long) iter->l[l].b >= 128;
}
static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
	struct bpos pos = iter->pos;

	if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	    bkey_cmp(pos, POS_MAX))
		pos = bkey_successor(iter, pos);
	return pos;
}

static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
					      struct btree *b)
{
	return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
}

static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
					     struct btree *b)
{
	return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
}

static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
					  struct btree *b)
{
	return iter->btree_id == b->c.btree_id &&
		!btree_iter_pos_before_node(iter, b) &&
		!btree_iter_pos_after_node(iter, b);
}
/* Btree node locking: */

void bch2_btree_node_unlock_write(struct btree_trans *trans,
				  struct btree_iter *iter, struct btree *b)
{
	bch2_btree_node_unlock_write_inlined(trans, iter, b);
}
void __bch2_btree_node_lock_write(struct btree_trans *trans,
				  struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;
	unsigned readers = 0;

	EBUG_ON(!btree_node_intent_locked(iter, b->c.level));

	trans_for_each_iter(trans, linked)
		if (linked->l[b->c.level].b == b &&
		    btree_node_read_locked(linked, b->c.level))
			readers++;

	/*
	 * Must drop our read locks before calling six_lock_write() -
	 * six_unlock() won't do wakeups until the reader count
	 * goes to 0, and it's safe because we have the node intent
	 * locked:
	 */
	if (!b->c.lock.readers)
		atomic64_sub(__SIX_VAL(read_lock, readers),
			     &b->c.lock.state.counter);
	else
		this_cpu_sub(*b->c.lock.readers, readers);

	btree_node_lock_type(trans->c, b, SIX_LOCK_write);

	if (!b->c.lock.readers)
		atomic64_add(__SIX_VAL(read_lock, readers),
			     &b->c.lock.state.counter);
	else
		this_cpu_add(*b->c.lock.readers, readers);
}
bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{
	struct btree *b = btree_iter_node(iter, level);
	int want = __btree_lock_want(iter, level);

	if (!is_btree_node(iter, level))
		return false;

	if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
	    (btree_node_lock_seq_matches(iter, b, level) &&
	     btree_node_lock_increment(iter->trans, b, level, want))) {
		mark_btree_node_locked(iter, level, want);
		return true;
	}

	return false;
}
static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
{
	struct btree *b = iter->l[level].b;

	EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);

	if (!is_btree_node(iter, level))
		return false;

	if (btree_node_intent_locked(iter, level))
		return true;

	if (btree_node_locked(iter, level)
	    ? six_lock_tryupgrade(&b->c.lock)
	    : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
		goto success;

	if (btree_node_lock_seq_matches(iter, b, level) &&
	    btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
		btree_node_unlock(iter, level);
		goto success;
	}

	return false;
success:
	mark_btree_node_intent_locked(iter, level);
	return true;
}
static inline bool btree_iter_get_locks(struct btree_trans *trans,
					struct btree_iter *iter,
					bool upgrade, unsigned long trace_ip)
{
	unsigned l = iter->level;
	int fail_idx = -1;

	do {
		if (!btree_iter_node(iter, l))
			break;

		if (!(upgrade
		      ? bch2_btree_node_upgrade(iter, l)
		      : bch2_btree_node_relock(iter, l))) {
			(upgrade
			 ? trace_node_upgrade_fail
			 : trace_node_relock_fail)(trans->ip, trace_ip,
					btree_iter_type(iter) == BTREE_ITER_CACHED,
					iter->btree_id, &iter->real_pos,
					l, iter->l[l].lock_seq,
					is_btree_node(iter, l)
					? 0
					: (unsigned long) iter->l[l].b,
					is_btree_node(iter, l)
					? iter->l[l].b->c.lock.state.seq
					: 0);
			fail_idx = l;
			btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		}

		l++;
	} while (l < iter->locks_want);

	/*
	 * When we fail to get a lock, we have to ensure that any child nodes
	 * can't be relocked so bch2_btree_iter_traverse has to walk back up to
	 * the node that we failed to relock:
	 */
	while (fail_idx >= 0) {
		btree_node_unlock(iter, fail_idx);
		iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
		--fail_idx;
	}

	if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
		iter->uptodate = BTREE_ITER_NEED_PEEK;

	bch2_btree_trans_verify_locks(trans);

	return iter->uptodate < BTREE_ITER_NEED_RELOCK;
}
static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
				  enum btree_iter_type type)
{
	return type != BTREE_ITER_CACHED
		? container_of(_b, struct btree, c)->key.k.p
		: container_of(_b, struct bkey_cached, c)->key.pos;
}
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
			    unsigned level, struct btree_iter *iter,
			    enum six_lock_type type,
			    six_lock_should_sleep_fn should_sleep_fn, void *p,
			    unsigned long ip)
{
	struct btree_trans *trans = iter->trans;
	struct btree_iter *linked, *deadlock_iter = NULL;
	u64 start_time = local_clock();
	unsigned reason = 9;
	bool ret;

	/* Check if it's safe to block: */
	trans_for_each_iter(trans, linked) {
		if (!linked->nodes_locked)
			continue;

		/*
		 * Can't block taking an intent lock if we have _any_ nodes read
		 * locked:
		 *
		 * - Our read lock blocks another thread with an intent lock on
		 *   the same node from getting a write lock, and thus from
		 *   dropping its intent lock
		 *
		 * - And the other thread may have multiple nodes intent locked:
		 *   both the node we want to intent lock, and the node we
		 *   already have read locked - deadlock:
		 */
		if (type == SIX_LOCK_intent &&
		    linked->nodes_locked != linked->nodes_intent_locked) {
			deadlock_iter = linked;
			reason = 1;
		}

		if (linked->btree_id != iter->btree_id) {
			if (linked->btree_id > iter->btree_id) {
				deadlock_iter = linked;
				reason = 3;
			}
			continue;
		}

		/*
		 * Within the same btree, cached iterators come before non
		 * cached iterators:
		 */
		if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
			if (btree_iter_is_cached(iter)) {
				deadlock_iter = linked;
				reason = 4;
			}
			continue;
		}

		/*
		 * Interior nodes must be locked before their descendants: if
		 * another iterator has possible descendants locked of the node
		 * we're about to lock, it must have the ancestors locked too:
		 */
		if (level > __fls(linked->nodes_locked)) {
			deadlock_iter = linked;
			reason = 5;
		}

		/* Must lock btree nodes in key order: */
		if (btree_node_locked(linked, level) &&
		    bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
						 btree_iter_type(linked))) <= 0) {
			deadlock_iter = linked;
			reason = 7;
		}
	}

	if (unlikely(deadlock_iter)) {
		trace_trans_restart_would_deadlock(trans->ip, ip,
				trans->in_traverse_all, reason,
				deadlock_iter->btree_id,
				btree_iter_type(deadlock_iter),
				&deadlock_iter->real_pos,
				iter->btree_id,
				btree_iter_type(iter),
				&pos);
		btree_trans_restart(trans);
		return false;
	}

	if (six_trylock_type(&b->c.lock, type))
		return true;

#ifdef CONFIG_BCACHEFS_DEBUG
	trans->locking_iter_idx = iter->idx;
	trans->locking_pos	= pos;
	trans->locking_btree_id	= iter->btree_id;
	trans->locking_level	= level;
	trans->locking		= b;
#endif

	ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;

#ifdef CONFIG_BCACHEFS_DEBUG
	trans->locking = NULL;
#endif
	if (ret)
		bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
				       start_time);
	return ret;
}
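/*
 * A minimal usage sketch of what the ordering rules above buy us (assumed
 * example - example_lock_order() is hypothetical, not part of bcachefs):
 * within one transaction, iterators acquire node locks in btree_iter_cmp()
 * order. Traversing them in ascending key order means the would-deadlock
 * checks above never fire; traversing in descending order gets a transaction
 * restart (-EINTR) instead of a deadlock:
 */
#if 0
static int example_lock_order(struct btree_trans *trans, u64 inum)
{
	struct btree_iter *i1, *i2;
	int ret;

	/* ok: nodes get locked in btree_iter_cmp() order on traverse: */
	i1 = bch2_trans_get_iter(trans, BTREE_ID_extents,
				 POS(inum, 0), BTREE_ITER_INTENT);
	i2 = bch2_trans_get_iter(trans, BTREE_ID_extents,
				 POS(inum, 4096), BTREE_ITER_INTENT);

	ret   = bch2_btree_iter_traverse(i1) ?:
		bch2_btree_iter_traverse(i2);

	/*
	 * Traversing i2 before i1 could instead trip the "Must lock btree
	 * nodes in key order" check and restart the transaction.
	 */
	bch2_trans_iter_put(trans, i2);
	bch2_trans_iter_put(trans, i1);
	return ret;
}
#endif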
/* Btree iterator locking: */

#ifdef CONFIG_BCACHEFS_DEBUG
static void bch2_btree_iter_verify_locks(struct btree_trans *trans,
					 struct btree_iter *iter)
{
	unsigned l;

	if (!(trans->iters_linked & (1ULL << iter->idx))) {
		BUG_ON(iter->nodes_locked);
		return;
	}

	for (l = 0; btree_iter_node(iter, l); l++) {
		if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
		    !btree_node_locked(iter, l))
			continue;

		BUG_ON(btree_lock_want(iter, l) !=
		       btree_node_locked_type(iter, l));
	}
}

void bch2_btree_trans_verify_locks(struct btree_trans *trans)
{
	struct btree_iter *iter;

	trans_for_each_iter(trans, iter)
		bch2_btree_iter_verify_locks(trans, iter);
}
#else
static inline void bch2_btree_iter_verify_locks(struct btree_trans *trans,
						struct btree_iter *iter) {}
#endif
/*
 * Only for btree_cache.c - only relocks intent locks
 */
bool bch2_btree_iter_relock_intent(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;
	unsigned l;

	for (l = iter->level;
	     l < iter->locks_want && btree_iter_node(iter, l);
	     l++) {
		if (!bch2_btree_node_relock(iter, l)) {
			trace_node_relock_fail(trans->ip, _RET_IP_,
					btree_iter_type(iter) == BTREE_ITER_CACHED,
					iter->btree_id, &iter->real_pos,
					l, iter->l[l].lock_seq,
					is_btree_node(iter, l)
					? 0
					: (unsigned long) iter->l[l].b,
					is_btree_node(iter, l)
					? iter->l[l].b->c.lock.state.seq
					: 0);
			btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
			btree_trans_restart(trans);
			return false;
		}
	}

	return true;
}
static bool bch2_btree_iter_relock(struct btree_trans *trans,
				   struct btree_iter *iter, unsigned long trace_ip)
{
	bool ret = btree_iter_get_locks(trans, iter, false, trace_ip);

	if (!ret)
		btree_trans_restart(trans);
	return ret;
}
bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
			       unsigned new_locks_want)
{
	struct btree_trans *trans = iter->trans;
	struct btree_iter *linked;

	EBUG_ON(iter->locks_want >= new_locks_want);

	iter->locks_want = new_locks_want;

	if (btree_iter_get_locks(trans, iter, true, _THIS_IP_))
		return true;

	/*
	 * XXX: this is ugly - we'd prefer to not be mucking with other
	 * iterators in the btree_trans here.
	 *
	 * On failure to upgrade the iterator, setting iter->locks_want and
	 * calling get_locks() is sufficient to make bch2_btree_iter_traverse()
	 * get the locks we want on transaction restart.
	 *
	 * But if this iterator was a clone, on transaction restart what we did
	 * to this iterator isn't going to be preserved.
	 *
	 * Possibly we could add an iterator field for the parent iterator when
	 * an iterator is a copy - for now, we'll just upgrade any other
	 * iterators with the same btree id.
	 *
	 * The code below used to be needed to ensure ancestor nodes get locked
	 * before interior nodes - now that's handled by
	 * bch2_btree_iter_traverse_all().
	 */
	trans_for_each_iter(trans, linked)
		if (linked != iter &&
		    btree_iter_type(linked) == btree_iter_type(iter) &&
		    linked->btree_id == iter->btree_id &&
		    linked->locks_want < new_locks_want) {
			linked->locks_want = new_locks_want;
			btree_iter_get_locks(trans, linked, true, _THIS_IP_);
		}

	if (iter->should_be_locked)
		btree_trans_restart(trans);

	return false;
}
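/*
 * Usage sketch (illustrative, assuming the bch2_btree_iter_upgrade() wrapper
 * in btree_iter.h): operations that update interior nodes - e.g. a node
 * split - upgrade the iterator to take intent locks further up the tree, and
 * treat failure as a transaction restart:
 */
#if 0
	if (!bch2_btree_iter_upgrade(iter, U8_MAX))
		return -EINTR;	/* transaction was restarted */
#endif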
void __bch2_btree_iter_downgrade(struct btree_iter *iter,
				 unsigned new_locks_want)
{
	unsigned l;

	EBUG_ON(iter->locks_want < new_locks_want);

	iter->locks_want = new_locks_want;

	while (iter->nodes_locked &&
	       (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
		if (l > iter->level) {
			btree_node_unlock(iter, l);
		} else {
			if (btree_node_intent_locked(iter, l)) {
				six_lock_downgrade(&iter->l[l].b->c.lock);
				iter->nodes_intent_locked ^= 1 << l;
			}
			break;
		}
	}

	bch2_btree_trans_verify_locks(iter->trans);
}

void bch2_trans_downgrade(struct btree_trans *trans)
{
	struct btree_iter *iter;

	trans_for_each_iter(trans, iter)
		bch2_btree_iter_downgrade(iter);
}
/* Btree transaction locking: */

static inline bool btree_iter_should_be_locked(struct btree_iter *iter)
{
	return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) ||
		iter->should_be_locked;
}

bool bch2_trans_relock(struct btree_trans *trans)
{
	struct btree_iter *iter;

	if (unlikely(trans->restarted))
		return false;

	trans_for_each_iter(trans, iter)
		if (btree_iter_should_be_locked(iter) &&
		    !bch2_btree_iter_relock(trans, iter, _RET_IP_)) {
			trace_trans_restart_relock(trans->ip, _RET_IP_,
					iter->btree_id, &iter->real_pos);
			BUG_ON(!trans->restarted);
			return false;
		}
	return true;
}
void bch2_trans_unlock(struct btree_trans *trans)
{
	struct btree_iter *iter;

	trans_for_each_iter(trans, iter)
		__bch2_btree_iter_unlock(iter);
}
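/*
 * A minimal sketch of the unlock/relock cycle (illustrative;
 * do_blocking_work() is hypothetical): long-running operations drop locks
 * with bch2_trans_unlock(), and bch2_trans_relock() only succeeds if every
 * iterator marked should_be_locked could be relocked - otherwise the
 * transaction has been restarted and must be retried from the top:
 */
#if 0
	bch2_trans_unlock(trans);

	do_blocking_work();

	if (!bch2_trans_relock(trans))
		return -EINTR;	/* restart: go back to bch2_trans_begin() */
#endif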
/* Btree iterator: */

#ifdef CONFIG_BCACHEFS_DEBUG

static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
{
	struct bkey_cached *ck;
	bool locked = btree_node_locked(iter, 0);

	if (!bch2_btree_node_relock(iter, 0))
		return;

	ck = (void *) iter->l[0].b;
	BUG_ON(ck->key.btree_id != iter->btree_id ||
	       bkey_cmp(ck->key.pos, iter->pos));

	if (!locked)
		btree_node_unlock(iter, 0);
}
static void bch2_btree_iter_verify_level(struct btree_iter *iter,
					 unsigned level)
{
	struct btree_iter_level *l;
	struct btree_node_iter tmp;
	bool locked;
	struct bkey_packed *p, *k;
	char buf1[100], buf2[100], buf3[100];
	const char *msg;

	if (!bch2_debug_check_iterators)
		return;

	l	= &iter->l[level];
	tmp	= l->iter;
	locked	= btree_node_locked(iter, level);

	if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
		if (!level)
			bch2_btree_iter_verify_cached(iter);
		return;
	}

	BUG_ON(iter->level < iter->min_depth);

	if (!btree_iter_node(iter, level))
		return;

	if (!bch2_btree_node_relock(iter, level))
		return;

	BUG_ON(!btree_iter_pos_in_node(iter, l->b));

	/*
	 * node iterators don't use leaf node iterator:
	 */
	if (btree_iter_type(iter) == BTREE_ITER_NODES &&
	    level <= iter->min_depth)
		goto unlock;

	bch2_btree_node_iter_verify(&l->iter, l->b);

	/*
	 * For interior nodes, the iterator will have skipped past
	 * deleted keys:
	 *
	 * For extents, the iterator may have skipped past deleted keys (but not
	 * whiteouts)
	 */
	p = level || btree_node_type_is_extents(iter->btree_id)
		? bch2_btree_node_iter_prev(&tmp, l->b)
		: bch2_btree_node_iter_prev_all(&tmp, l->b);
	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);

	if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) {
		msg = "before";
		goto err;
	}

	if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
		msg = "after";
		goto err;
	}
unlock:
	if (!locked)
		btree_node_unlock(iter, level);
	return;
err:
	strcpy(buf2, "(none)");
	strcpy(buf3, "(none)");

	bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);

	if (p) {
		struct bkey uk = bkey_unpack_key(l->b, p);
		bch2_bkey_to_text(&PBUF(buf2), &uk);
	}

	if (k) {
		struct bkey uk = bkey_unpack_key(l->b, k);
		bch2_bkey_to_text(&PBUF(buf3), &uk);
	}

	panic("iterator should be %s key at level %u:\n"
	      "iter pos %s\n"
	      "prev key %s\n"
	      "cur  key %s\n",
	      msg, level, buf1, buf2, buf3);
}
static void bch2_btree_iter_verify(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;
	struct bch_fs *c = trans->c;
	enum btree_iter_type type = btree_iter_type(iter);
	unsigned i;

	EBUG_ON(iter->btree_id >= BTREE_ID_NR);

	BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
	       iter->pos.snapshot != iter->snapshot);

	BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));

	BUG_ON(type == BTREE_ITER_NODES &&
	       !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));

	BUG_ON(type != BTREE_ITER_NODES &&
	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
	       !btree_type_has_snapshots(iter->btree_id));

	for (i = 0; i < (type != BTREE_ITER_CACHED ? BTREE_MAX_DEPTH : 1); i++) {
		if (!iter->l[i].b) {
			BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i);
			break;
		}

		bch2_btree_iter_verify_level(iter, i);
	}

	bch2_btree_iter_verify_locks(trans, iter);
}
static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
	enum btree_iter_type type = btree_iter_type(iter);

	BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
	       iter->pos.snapshot != iter->snapshot);

	BUG_ON((type == BTREE_ITER_KEYS ||
		type == BTREE_ITER_CACHED) &&
	       (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
		bkey_cmp(iter->pos, iter->k.p) > 0));
}

void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
{
	struct btree_iter *iter;

	if (!bch2_debug_check_iterators)
		return;

	trans_for_each_iter_with_node(trans, b, iter)
		bch2_btree_iter_verify_level(iter, b->c.level);
}

#else

static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}

#endif
static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
					struct btree *b,
					struct bset_tree *t,
					struct bkey_packed *k)
{
	struct btree_node_iter_set *set;

	btree_node_iter_for_each(iter, set)
		if (set->end == t->end_offset) {
			set->k = __btree_node_key_to_offset(b, k);
			bch2_btree_node_iter_sort(iter, b);
			return;
		}

	bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
}
static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
					       struct btree *b,
					       struct bkey_packed *where)
{
	struct btree_iter_level *l = &iter->l[b->c.level];

	if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
		return;

	if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0)
		bch2_btree_node_iter_advance(&l->iter, l->b);

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}

void bch2_btree_iter_fix_key_modified(struct btree_trans *trans,
				      struct btree_iter *iter,
				      struct btree *b,
				      struct bkey_packed *where)
{
	struct btree_iter *linked;

	trans_for_each_iter_with_node(trans, b, linked) {
		__bch2_btree_iter_fix_key_modified(linked, b, where);
		bch2_btree_iter_verify_level(linked, b->c.level);
	}
}
static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
				       struct btree *b,
				       struct btree_node_iter *node_iter,
				       struct bset_tree *t,
				       struct bkey_packed *where,
				       unsigned clobber_u64s,
				       unsigned new_u64s)
{
	const struct bkey_packed *end = btree_bkey_last(b, t);
	struct btree_node_iter_set *set;
	unsigned offset = __btree_node_key_to_offset(b, where);
	int shift = new_u64s - clobber_u64s;
	unsigned old_end = t->end_offset - shift;
	unsigned orig_iter_pos = node_iter->data[0].k;
	bool iter_current_key_modified =
		orig_iter_pos >= offset &&
		orig_iter_pos <= offset + clobber_u64s;

	btree_node_iter_for_each(node_iter, set)
		if (set->end == old_end)
			goto found;

	/* didn't find the bset in the iterator - might have to readd it: */
	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
		bch2_btree_node_iter_push(node_iter, b, where, end);
		goto fixup_done;
	} else {
		/* Iterator is after key that changed */
		return;
	}
found:
	set->end = t->end_offset;

	/* Iterator hasn't gotten to the key that changed yet: */
	if (set->k < offset)
		return;

	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
		set->k = offset;
	} else if (set->k < offset + clobber_u64s) {
		set->k = offset + new_u64s;
		if (set->k == set->end)
			bch2_btree_node_iter_set_drop(node_iter, set);
	} else {
		/* Iterator is after key that changed */
		set->k = (int) set->k + shift;
		return;
	}

	bch2_btree_node_iter_sort(node_iter, b);
fixup_done:
	if (node_iter->data[0].k != orig_iter_pos)
		iter_current_key_modified = true;

	/*
	 * When a new key is added, and the node iterator now points to that
	 * key, the iterator might have skipped past deleted keys that should
	 * come after the key the iterator now points to. We have to rewind to
	 * before those deleted keys - otherwise
	 * bch2_btree_node_iter_prev_all() breaks:
	 */
	if (!bch2_btree_node_iter_end(node_iter) &&
	    iter_current_key_modified &&
	    (b->c.level ||
	     btree_node_type_is_extents(iter->btree_id))) {
		struct bset_tree *t;
		struct bkey_packed *k, *k2, *p;

		k = bch2_btree_node_iter_peek_all(node_iter, b);

		for_each_bset(b, t) {
			bool set_pos = false;

			if (node_iter->data[0].end == t->end_offset)
				continue;

			k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);

			while ((p = bch2_bkey_prev_all(b, t, k2)) &&
			       bkey_iter_cmp(b, k, p) < 0) {
				k2 = p;
				set_pos = true;
			}

			if (set_pos)
				btree_node_iter_set_set_pos(node_iter,
							    b, t, k2);
		}
	}

	if (!b->c.level &&
	    node_iter == &iter->l[0].iter &&
	    iter_current_key_modified)
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
void bch2_btree_node_iter_fix(struct btree_trans *trans,
			      struct btree_iter *iter,
			      struct btree *b,
			      struct btree_node_iter *node_iter,
			      struct bkey_packed *where,
			      unsigned clobber_u64s,
			      unsigned new_u64s)
{
	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
	struct btree_iter *linked;

	if (node_iter != &iter->l[b->c.level].iter) {
		__bch2_btree_node_iter_fix(iter, b, node_iter, t,
					   where, clobber_u64s, new_u64s);

		if (bch2_debug_check_iterators)
			bch2_btree_node_iter_verify(node_iter, b);
	}

	trans_for_each_iter_with_node(trans, b, linked) {
		__bch2_btree_node_iter_fix(linked, b,
					   &linked->l[b->c.level].iter, t,
					   where, clobber_u64s, new_u64s);
		bch2_btree_iter_verify_level(linked, b->c.level);
	}
}
static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
						  struct btree_iter_level *l,
						  struct bkey *u,
						  struct bkey_packed *k)
{
	struct bkey_s_c ret;

	if (unlikely(!k)) {
		/*
		 * signal to bch2_btree_iter_peek_slot() that we're currently at
		 * a hole
		 */
		u->type = KEY_TYPE_deleted;
		return bkey_s_c_null;
	}

	ret = bkey_disassemble(l->b, k, u);

	/*
	 * XXX: bch2_btree_bset_insert_key() generates invalid keys when we
	 * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key
	 * being overwritten but doesn't change k->size. But this is ok, because
	 * those keys are never written out, we just have to avoid a spurious
	 * assertion here:
	 */
	if (bch2_debug_check_bkeys && !bkey_deleted(ret.k))
		bch2_bkey_debugcheck(iter->trans->c, l->b, ret);

	return ret;
}
/* peek_all() doesn't skip deleted keys */
static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter,
							struct btree_iter_level *l)
{
	return __btree_iter_unpack(iter, l, &iter->k,
			bch2_btree_node_iter_peek_all(&l->iter, l->b));
}

static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter,
						    struct btree_iter_level *l)
{
	struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
			bch2_btree_node_iter_peek(&l->iter, l->b));

	iter->real_pos = k.k ? k.k->p : l->b->key.k.p;
	iter->trans->iters_sorted = false;
	return k;
}

static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter,
						    struct btree_iter_level *l)
{
	struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
			bch2_btree_node_iter_prev(&l->iter, l->b));

	iter->real_pos = k.k ? k.k->p : l->b->data->min_key;
	iter->trans->iters_sorted = false;
	return k;
}
static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
					     struct btree_iter_level *l,
					     int max_advance)
{
	struct bkey_packed *k;
	int nr_advanced = 0;

	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
	       bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return false;

		bch2_btree_node_iter_advance(&l->iter, l->b);
		nr_advanced++;
	}

	return true;
}
/*
 * Verify that iterator for parent node points to child node:
 */
static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter_level *l;
	unsigned plevel;
	bool parent_locked;
	struct bkey_packed *k;

	if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		return;

	plevel = b->c.level + 1;
	if (!btree_iter_node(iter, plevel))
		return;

	parent_locked = btree_node_locked(iter, plevel);

	if (!bch2_btree_node_relock(iter, plevel))
		return;

	l = &iter->l[plevel];
	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
	if (!k ||
	    bkey_deleted(k) ||
	    bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
		char buf1[100];
		char buf2[100];
		char buf3[100];
		char buf4[100];
		struct bkey uk = bkey_unpack_key(b, k);

		bch2_dump_btree_node(iter->trans->c, l->b);
		bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
		bch2_bkey_to_text(&PBUF(buf2), &uk);
		bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
		bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
		panic("parent iter doesn't point to new node:\n"
		      "iter pos %s %s\n"
		      "iter key %s\n"
		      "new node %s-%s\n",
		      bch2_btree_ids[iter->btree_id], buf1,
		      buf2, buf3, buf4);
	}

	if (!parent_locked)
		btree_node_unlock(iter, b->c.level + 1);
}
static inline void __btree_iter_init(struct btree_iter *iter,
				     unsigned level)
{
	struct btree_iter_level *l = &iter->l[level];

	bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos);

	/*
	 * Iterators to interior nodes should always be pointed at the first non
	 * whiteout:
	 */
	if (level)
		bch2_btree_node_iter_peek(&l->iter, l->b);

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}

static inline void btree_iter_node_set(struct btree_iter *iter,
				       struct btree *b)
{
	BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);

	btree_iter_verify_new_node(iter, b);

	EBUG_ON(!btree_iter_pos_in_node(iter, b));
	EBUG_ON(b->c.lock.state.seq & 1);

	iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
	iter->l[b->c.level].b = b;
	__btree_iter_init(iter, b->c.level);
}
/*
 * A btree node is being replaced - update the iterator to point to the new
 * node:
 */
void bch2_btree_iter_node_replace(struct btree_trans *trans,
				  struct btree_iter *iter, struct btree *b)
{
	enum btree_node_locked_type t;
	struct btree_iter *linked;

	trans_for_each_iter(trans, linked)
		if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
		    btree_iter_pos_in_node(linked, b)) {
			/*
			 * bch2_btree_iter_node_drop() has already been called -
			 * the old node we're replacing has already been
			 * unlocked and the pointer invalidated
			 */
			BUG_ON(btree_node_locked(linked, b->c.level));

			t = btree_lock_want(linked, b->c.level);
			if (t != BTREE_NODE_UNLOCKED) {
				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
				mark_btree_node_locked(linked, b->c.level, (enum six_lock_type) t);
			}

			btree_iter_node_set(linked, b);
		}
}

void bch2_btree_iter_node_drop(struct btree_trans *trans,
			       struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;
	unsigned level = b->c.level;

	trans_for_each_iter(trans, linked)
		if (linked->l[level].b == b) {
			btree_node_unlock(linked, level);
			linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
		}
}
/*
 * A btree node has been modified in such a way as to invalidate iterators - fix
 * them:
 */
void bch2_btree_iter_reinit_node(struct btree_trans *trans,
				 struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;

	trans_for_each_iter_with_node(trans, b, linked)
		__btree_iter_init(linked, b->c.level);
}

static int lock_root_check_fn(struct six_lock *lock, void *p)
{
	struct btree *b = container_of(lock, struct btree, c.lock);
	struct btree **rootp = p;

	return b == *rootp ? 0 : -1;
}
static inline int btree_iter_lock_root(struct btree_trans *trans,
				       struct btree_iter *iter,
				       unsigned depth_want,
				       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
	enum six_lock_type lock_type;
	unsigned i;

	EBUG_ON(iter->nodes_locked);

	while (1) {
		b = READ_ONCE(*rootp);
		iter->level = READ_ONCE(b->c.level);

		if (unlikely(iter->level < depth_want)) {
			/*
			 * the root is at a lower depth than the depth we want:
			 * got to the end of the btree, or we're walking nodes
			 * greater than some depth and there are no nodes >=
			 * that depth
			 */
			iter->level = depth_want;
			for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
				iter->l[i].b = NULL;
			return 1;
		}

		lock_type = __btree_lock_want(iter, iter->level);
		if (unlikely(!btree_node_lock(b, SPOS_MAX, iter->level,
					      iter, lock_type,
					      lock_root_check_fn, rootp,
					      trace_ip))) {
			if (trans->restarted)
				return -EINTR;
			continue;
		}

		if (likely(b == READ_ONCE(*rootp) &&
			   b->c.level == iter->level &&
			   !race_fault())) {
			for (i = 0; i < iter->level; i++)
				iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
			iter->l[iter->level].b = b;
			for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
				iter->l[i].b = NULL;

			mark_btree_node_locked(iter, iter->level, lock_type);
			btree_iter_node_set(iter, b);
			return 0;
		}

		six_unlock_type(&b->c.lock, lock_type);
	}
}
static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *iter)
{
	struct bch_fs *c = trans->c;
	struct btree_iter_level *l = &iter->l[iter->level];
	struct btree_node_iter node_iter = l->iter;
	struct bkey_packed *k;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
		? (iter->level > 1 ? 0 :  2)
		: (iter->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(iter, iter->level);
	int ret = 0;

	bch2_bkey_buf_init(&tmp);

	/* nr must be decremented here, or this loop never terminates: */
	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(iter, iter->level))
			break;

		bch2_btree_node_iter_advance(&node_iter, l->b);
		k = bch2_btree_node_iter_peek(&node_iter, l->b);
		if (!k)
			break;

		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
		ret = bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
					       iter->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(iter, iter->level);

	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}
static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
					    unsigned plevel, struct btree *b)
{
	struct btree_iter_level *l = &iter->l[plevel];
	bool locked = btree_node_locked(iter, plevel);
	struct bkey_packed *k;
	struct bch_btree_ptr_v2 *bp;

	if (!bch2_btree_node_relock(iter, plevel))
		return;

	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
	BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);

	bp = (void *) bkeyp_val(&l->b->format, k);
	bp->mem_ptr = (unsigned long)b;

	if (!locked)
		btree_node_unlock(iter, plevel);
}
static __always_inline int btree_iter_down(struct btree_trans *trans,
					   struct btree_iter *iter,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_iter_level *l = &iter->l[iter->level];
	struct btree *b;
	unsigned level = iter->level - 1;
	enum six_lock_type lock_type = __btree_lock_want(iter, level);
	struct bkey_buf tmp;
	int ret;

	EBUG_ON(!btree_node_locked(iter, iter->level));

	bch2_bkey_buf_init(&tmp);
	bch2_bkey_buf_unpack(&tmp, c, l->b,
			 bch2_btree_node_iter_peek(&l->iter, l->b));

	b = bch2_btree_node_get(trans, iter, tmp.k, level, lock_type, trace_ip);
	ret = PTR_ERR_OR_ZERO(b);
	if (unlikely(ret))
		goto err;

	mark_btree_node_locked(iter, level, lock_type);
	btree_iter_node_set(iter, b);

	if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
	    unlikely(b != btree_node_mem_ptr(tmp.k)))
		btree_node_mem_ptr_set(iter, level + 1, b);

	if (iter->flags & BTREE_ITER_PREFETCH)
		ret = btree_iter_prefetch(trans, iter);

	if (btree_node_read_locked(iter, level + 1))
		btree_node_unlock(iter, level + 1);
	iter->level = level;

	bch2_btree_iter_verify_locks(trans, iter);
err:
	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}
static int btree_iter_traverse_one(struct btree_trans *,
				   struct btree_iter *, unsigned long);

static int __btree_iter_traverse_all(struct btree_trans *trans, int ret,
				     unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_iter *iter, *prev = NULL;
	int i;

	if (trans->in_traverse_all)
		return -EINTR;

	trans->in_traverse_all = true;
retry_all:
	trans->restarted = false;

	trans_for_each_iter(trans, iter)
		iter->should_be_locked = false;

	btree_trans_sort_iters(trans);

	trans_for_each_iter_inorder_reverse(trans, iter, i) {
		if (prev) {
			if (iter->btree_id == prev->btree_id &&
			    iter->locks_want < prev->locks_want)
				__bch2_btree_iter_upgrade(iter, prev->locks_want);
			else if (!iter->locks_want && prev->locks_want)
				__bch2_btree_iter_upgrade(iter, 1);
		}

		prev = iter;
	}

	bch2_trans_unlock(trans);

	if (unlikely(ret == -ENOMEM)) {
		struct closure cl;

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
			closure_sync(&cl);
		} while (ret);
	}

	if (unlikely(ret == -EIO)) {
		trans->error = true;
		goto out;
	}

	BUG_ON(ret && ret != -EINTR);

	/* Now, redo traversals in correct order: */
	i = 0;
	while (i < trans->nr_sorted) {
		iter = trans->iters + trans->sorted[i];

		EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));

		ret = btree_iter_traverse_one(trans, iter, _THIS_IP_);
		if (ret)
			goto retry_all;

		EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));

		if (iter->nodes_locked)
			i++;
	}

	/*
	 * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
	 * and relock(), relock() won't relock since iter->should_be_locked
	 * isn't set yet, which is all fine
	 */
	trans_for_each_iter(trans, iter)
		BUG_ON(iter->uptodate >= BTREE_ITER_NEED_TRAVERSE);
out:
	bch2_btree_cache_cannibalize_unlock(c);

	trans->in_traverse_all = false;

	trace_trans_traverse_all(trans->ip, trace_ip);
	return ret;
}
static int bch2_btree_iter_traverse_all(struct btree_trans *trans)
{
	return __btree_iter_traverse_all(trans, 0, _RET_IP_);
}

static inline bool btree_iter_good_node(struct btree_iter *iter,
					unsigned l, int check_pos)
{
	if (!is_btree_node(iter, l) ||
	    !bch2_btree_node_relock(iter, l))
		return false;

	if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
		return false;
	if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
		return false;
	return true;
}

static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
						     int check_pos)
{
	unsigned l = iter->level;

	while (btree_iter_node(iter, l) &&
	       !btree_iter_good_node(iter, l, check_pos)) {
		btree_node_unlock(iter, l);
		iter->l[l].b = BTREE_ITER_NO_NODE_UP;
		l++;
	}

	return l;
}
/*
 * This is the main state machine for walking down the btree - walks down to a
 * specified depth
 *
 * Returns 0 on success, -EIO on error (error reading in a btree node).
 *
 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
 * stashed in the iterator and returned from bch2_trans_exit().
 */
static int btree_iter_traverse_one(struct btree_trans *trans,
				   struct btree_iter *iter,
				   unsigned long trace_ip)
{
	unsigned l, depth_want = iter->level;
	int ret = 0;

	/*
	 * Ensure we obey iter->should_be_locked: if it's set, we can't unlock
	 * and re-traverse the iterator without a transaction restart:
	 */
	if (iter->should_be_locked) {
		ret = bch2_btree_iter_relock(trans, iter, trace_ip) ? 0 : -EINTR;
		goto out;
	}

	if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
		ret = bch2_btree_iter_traverse_cached(iter);
		goto out;
	}

	if (unlikely(iter->level >= BTREE_MAX_DEPTH))
		goto out;

	iter->level = btree_iter_up_until_good_node(iter, 0);

	/* If we need intent locks, take them too: */
	for (l = iter->level + 1;
	     l < iter->locks_want && btree_iter_node(iter, l);
	     l++)
		if (!bch2_btree_node_relock(iter, l))
			while (iter->level <= l) {
				btree_node_unlock(iter, iter->level);
				iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
				iter->level++;
			}

	/*
	 * Note: iter->nodes[iter->level] may be temporarily NULL here - that
	 * would indicate to other code that we got to the end of the btree,
	 * here it indicates that relocking the root failed - it's critical that
	 * btree_iter_lock_root() comes next and that it can't fail
	 */
	while (iter->level > depth_want) {
		ret = btree_iter_node(iter, iter->level)
			? btree_iter_down(trans, iter, trace_ip)
			: btree_iter_lock_root(trans, iter, depth_want, trace_ip);
		if (unlikely(ret)) {
			if (ret == 1) {
				/*
				 * Got to the end of the btree (in
				 * BTREE_ITER_NODES mode)
				 */
				ret = 0;
				goto out;
			}

			__bch2_btree_iter_unlock(iter);
			iter->level = depth_want;

			if (ret == -EIO) {
				iter->flags |= BTREE_ITER_ERROR;
				iter->l[iter->level].b =
					BTREE_ITER_NO_NODE_ERROR;
			} else {
				iter->l[iter->level].b =
					BTREE_ITER_NO_NODE_DOWN;
			}
			goto out;
		}
	}

	iter->uptodate = BTREE_ITER_NEED_PEEK;
out:
	BUG_ON((ret == -EINTR) != !!trans->restarted);
	trace_iter_traverse(trans->ip, trace_ip,
			    btree_iter_type(iter) == BTREE_ITER_CACHED,
			    iter->btree_id, &iter->real_pos, ret);
	bch2_btree_iter_verify(iter);
	return ret;
}
static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;
	int ret;

	ret =   bch2_trans_cond_resched(trans) ?:
		btree_iter_traverse_one(trans, iter, _RET_IP_);
	if (unlikely(ret) && hweight64(trans->iters_linked) == 1) {
		ret = __btree_iter_traverse_all(trans, ret, _RET_IP_);
		BUG_ON(ret == -EINTR);
	}

	return ret;
}

/*
 * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is
 * for internal btree iterator users
 *
 * bch2_btree_iter_traverse sets iter->real_pos to iter->pos,
 * btree_iter_traverse() does not:
 */
static inline int __must_check
btree_iter_traverse(struct btree_iter *iter)
{
	return iter->uptodate >= BTREE_ITER_NEED_RELOCK
		? __bch2_btree_iter_traverse(iter)
		: 0;
}

int __must_check
bch2_btree_iter_traverse(struct btree_iter *iter)
{
	int ret;

	btree_iter_set_search_pos(iter, btree_iter_search_key(iter));

	ret = btree_iter_traverse(iter);
	if (ret)
		return ret;

	iter->should_be_locked = true;
	return 0;
}
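/*
 * Usage sketch (illustrative): external code calls bch2_btree_iter_traverse()
 * to get the iterator positioned and locked at iter->pos without peeking a
 * key - e.g. before an insert at that position:
 */
#if 0
	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		return ret;	/* typically -EINTR: transaction restarted */

	/* iterator is now traversed, with btree nodes locked, at iter->pos */
#endif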
/* Iterate across nodes (leaf and interior nodes) */

struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
{
	struct btree *b;
	int ret;

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
	bch2_btree_iter_verify(iter);

	ret = btree_iter_traverse(iter);
	if (ret)
		return NULL;

	b = btree_iter_node(iter, iter->level);
	if (!b)
		return NULL;

	BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);

	iter->pos = iter->real_pos = b->key.k.p;
	iter->trans->iters_sorted = false;

	bch2_btree_iter_verify(iter);
	iter->should_be_locked = true;

	return b;
}
struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
{
	struct btree *b;
	int ret;

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
	bch2_btree_iter_verify(iter);

	/* already got to end? */
	if (!btree_iter_node(iter, iter->level))
		return NULL;

	bch2_trans_cond_resched(iter->trans);

	btree_node_unlock(iter, iter->level);
	iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
	iter->level++;

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
	ret = btree_iter_traverse(iter);
	if (ret)
		return NULL;

	/* got to end? */
	b = btree_iter_node(iter, iter->level);
	if (!b)
		return NULL;

	if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
		/*
		 * Haven't gotten to the end of the parent node: go back down to
		 * the next child node
		 */
		btree_iter_set_search_pos(iter, bpos_successor(iter->pos));

		/* Unlock to avoid screwing up our lock invariants: */
		btree_node_unlock(iter, iter->level);

		iter->level = iter->min_depth;
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		bch2_btree_iter_verify(iter);

		ret = btree_iter_traverse(iter);
		if (ret)
			return NULL;

		b = iter->l[iter->level].b;
	}

	iter->pos = iter->real_pos = b->key.k.p;
	iter->trans->iters_sorted = false;

	bch2_btree_iter_verify(iter);
	iter->should_be_locked = true;

	return b;
}
/* Iterate across keys (in leaf nodes only) */

static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
{
	struct btree_trans *trans = iter->trans;
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bpos old_pos = iter->real_pos;
#endif
	int cmp = bpos_cmp(new_pos, iter->real_pos);
	unsigned l = iter->level;

	EBUG_ON(trans->restarted);

	if (!cmp)
		goto out;

	iter->real_pos = new_pos;
	iter->should_be_locked = false;
	trans->iters_sorted = false;

	if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
		btree_node_unlock(iter, 0);
		iter->l[0].b = BTREE_ITER_NO_NODE_CACHED;
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		return;
	}

	l = btree_iter_up_until_good_node(iter, cmp);

	if (btree_iter_node(iter, l)) {
		/*
		 * We might have to skip over many keys, or just a few: try
		 * advancing the node iterator, and if we have to skip over too
		 * many keys just reinit it (or if we're rewinding, since that
		 * is expensive):
		 */
		if (cmp < 0 ||
		    !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
			__btree_iter_init(iter, l);

		/* Don't leave it locked if we're not supposed to: */
		if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
			btree_node_unlock(iter, l);
	}
out:
	if (l != iter->level)
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
	else
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

	bch2_btree_iter_verify(iter);
#ifdef CONFIG_BCACHEFS_DEBUG
	trace_iter_set_search_pos(trans->ip, _RET_IP_,
				  iter->btree_id,
				  &old_pos, &new_pos, l);
#endif
}
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
	struct bpos pos = iter->k.p;
	bool ret = bpos_cmp(pos, SPOS_MAX) != 0;

	if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
		pos = bkey_successor(iter, pos);
	bch2_btree_iter_set_pos(iter, pos);
	return ret;
}

inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
{
	struct bpos pos = bkey_start_pos(&iter->k);
	bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
		    ? bpos_cmp(pos, POS_MIN)
		    : bkey_cmp(pos, POS_MIN)) != 0;

	if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
		pos = bkey_predecessor(iter, pos);
	bch2_btree_iter_set_pos(iter, pos);
	return ret;
}
static noinline struct bkey_i *__btree_trans_peek_updates(struct btree_iter *iter)
{
	struct btree_insert_entry *i;
	struct bkey_i *ret = NULL;

	trans_for_each_update(iter->trans, i) {
		if (i->btree_id < iter->btree_id)
			continue;
		if (i->btree_id > iter->btree_id)
			break;
		if (bpos_cmp(i->k->k.p, iter->real_pos) < 0)
			continue;
		if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0)
			ret = i->k;
	}

	return ret;
}

static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
{
	return iter->flags & BTREE_ITER_WITH_UPDATES
		? __btree_trans_peek_updates(iter)
		: NULL;
}
/**
 * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
 * current position
 */
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bpos search_key = btree_iter_search_key(iter);
	struct bkey_i *next_update;
	struct bkey_s_c k;
	int ret;

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
	bch2_btree_iter_verify(iter);
	bch2_btree_iter_verify_entry_exit(iter);

	while (1) {
		btree_iter_set_search_pos(iter, search_key);

		ret = btree_iter_traverse(iter);
		if (unlikely(ret)) {
			/* ensure that iter->k is consistent with iter->pos: */
			bch2_btree_iter_set_pos(iter, iter->pos);
			k = bkey_s_c_err(ret);
			goto out;
		}

		next_update = btree_trans_peek_updates(iter);
		k = btree_iter_level_peek_all(iter, l);

		/* In the btree, deleted keys sort before non deleted: */
		if (k.k && bkey_deleted(k.k) &&
		    (!next_update ||
		     bpos_cmp(k.k->p, next_update->k.p) <= 0)) {
			search_key = k.k->p;
			continue;
		}

		if (next_update &&
		    bpos_cmp(next_update->k.p,
			     k.k ? k.k->p : l->b->key.k.p) <= 0) {
			iter->k = next_update->k;
			k = bkey_i_to_s_c(next_update);
		}

		if (likely(k.k)) {
			if (likely(!bkey_deleted(k.k)))
				break;

			/* Advance to next key: */
			search_key = bkey_successor(iter, k.k->p);
		} else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) {
			/* Advance to next leaf node: */
			search_key = bpos_successor(l->b->key.k.p);
		} else {
			/* End of btree: */
			bch2_btree_iter_set_pos(iter, SPOS_MAX);
			iter->real_pos = SPOS_MAX;
			k = bkey_s_c_null;
			goto out;
		}
	}

	/*
	 * iter->pos should be monotonically increasing, and always be equal to
	 * the key we just returned - except extents can straddle iter->pos:
	 */
	if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
		iter->pos = k.k->p;
	else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
		iter->pos = bkey_start_pos(k.k);
	iter->real_pos = k.k->p;
out:
	iter->should_be_locked = true;
	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(iter);
	return k;
}
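/*
 * Iteration sketch (illustrative; this is the same shape the
 * for_each_btree_key() macros expand to): walk keys from the iterator's
 * position with peek/next, checking for errors and for the end of the btree:
 */
#if 0
	for (k = bch2_btree_iter_peek(iter);
	     k.k && !(ret = bkey_err(k));
	     k = bch2_btree_iter_next(iter)) {
		/* process k */
	}
#endif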
/**
 * bch2_btree_iter_next: returns first key greater than iterator's current
 * position
 */
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
	if (!bch2_btree_iter_advance(iter))
		return bkey_s_c_null;

	return bch2_btree_iter_peek(iter);
}
/**
 * bch2_btree_iter_peek_prev: returns first key less than or equal to
 * iterator's current position
 */
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
	struct bpos search_key = iter->pos;
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c k;
	int ret;

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
	EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
	bch2_btree_iter_verify(iter);
	bch2_btree_iter_verify_entry_exit(iter);

	while (1) {
		btree_iter_set_search_pos(iter, search_key);

		ret = btree_iter_traverse(iter);
		if (unlikely(ret)) {
			/* ensure that iter->k is consistent with iter->pos: */
			bch2_btree_iter_set_pos(iter, iter->pos);
			k = bkey_s_c_err(ret);
			goto out;
		}

		k = btree_iter_level_peek(iter, l);
		if (!k.k ||
		    ((iter->flags & BTREE_ITER_IS_EXTENTS)
		     ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
		     : bkey_cmp(k.k->p, iter->pos) > 0))
			k = btree_iter_level_prev(iter, l);

		if (likely(k.k)) {
			break;
		} else if (likely(bpos_cmp(l->b->data->min_key, POS_MIN))) {
			/* Advance to previous leaf node: */
			search_key = bpos_predecessor(l->b->data->min_key);
		} else {
			/* Start of btree: */
			bch2_btree_iter_set_pos(iter, POS_MIN);
			k = bkey_s_c_null;
			goto out;
		}
	}

	EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);

	/* Extents can straddle iter->pos: */
	if (bkey_cmp(k.k->p, iter->pos) < 0)
		iter->pos = k.k->p;
out:
	iter->should_be_locked = true;
	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(iter);
	return k;
}
/**
 * bch2_btree_iter_prev: returns first key less than iterator's current
 * position
 */
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
{
	if (!bch2_btree_iter_rewind(iter))
		return bkey_s_c_null;

	return bch2_btree_iter_peek_prev(iter);
}
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;
	struct bpos search_key;
	struct bkey_s_c k;
	int ret;

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS &&
		btree_iter_type(iter) != BTREE_ITER_CACHED);
	bch2_btree_iter_verify(iter);
	bch2_btree_iter_verify_entry_exit(iter);

	/* extents can't span inode numbers: */
	if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	    unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
		if (iter->pos.inode == KEY_INODE_MAX)
			return bkey_s_c_null;

		bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
	}

	search_key = btree_iter_search_key(iter);
	btree_iter_set_search_pos(iter, search_key);

	ret = btree_iter_traverse(iter);
	if (unlikely(ret))
		return bkey_s_c_err(ret);

	if (btree_iter_type(iter) == BTREE_ITER_CACHED ||
	    !(iter->flags & BTREE_ITER_IS_EXTENTS)) {
		struct bkey_i *next_update;
		struct bkey_cached *ck;

		next_update = btree_trans_peek_updates(iter);

		switch (btree_iter_type(iter)) {
		case BTREE_ITER_KEYS:
			k = btree_iter_level_peek_all(iter, &iter->l[0]);
			EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0);
			break;
		case BTREE_ITER_CACHED:
			ck = (void *) iter->l[0].b;
			EBUG_ON(iter->btree_id != ck->key.btree_id ||
				bkey_cmp(iter->pos, ck->key.pos));

			k = bkey_i_to_s_c(ck->k);
			break;
		case BTREE_ITER_NODES:
			BUG();
		}

		if (next_update &&
		    (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) {
			iter->k = next_update->k;
			k = bkey_i_to_s_c(next_update);
		}

		if (!k.k ||
		    ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
		     ? bpos_cmp(iter->pos, k.k->p)
		     : bkey_cmp(iter->pos, k.k->p))) {
			bkey_init(&iter->k);
			iter->k.p = iter->pos;
			k = (struct bkey_s_c) { &iter->k, NULL };
		}
	} else {
		struct bpos next;

		if (iter->flags & BTREE_ITER_INTENT) {
			struct btree_iter *child =
				btree_iter_child_alloc(trans, iter, _THIS_IP_);

			btree_iter_copy(trans, child, iter);
			k = bch2_btree_iter_peek(child);

			if (k.k && !bkey_err(k))
				iter->k = child->k;
		} else {
			struct bpos pos = iter->pos;

			k = bch2_btree_iter_peek(iter);
			iter->pos = pos;
		}

		if (unlikely(bkey_err(k)))
			return k;

		next = k.k ? bkey_start_pos(k.k) : POS_MAX;

		if (bkey_cmp(iter->pos, next) < 0) {
			bkey_init(&iter->k);
			iter->k.p = iter->pos;
			bch2_key_resize(&iter->k,
					min_t(u64, KEY_SIZE_MAX,
					      (next.inode == iter->pos.inode
					       ? next.offset
					       : KEY_OFFSET_MAX) -
					      iter->pos.offset));

			k = (struct bkey_s_c) { &iter->k, NULL };
			EBUG_ON(!k.k->size);
		}
	}

	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(iter);
	iter->should_be_locked = true;

	return k;
}
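/*
 * Semantics sketch (illustrative): unlike peek(), peek_slot() always returns
 * a key for iter->pos - if nothing exists there it synthesizes a deleted
 * "hole" key, and for extents the hole key is sized to span up to the next
 * real extent, so callers can handle holes and existing keys uniformly:
 */
#if 0
	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k);
	if (ret)
		return ret;

	if (bkey_deleted(k.k)) {
		/* hole at iter->pos; for extents, k.k->size spans the hole */
	}
#endif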
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
	if (!bch2_btree_iter_advance(iter))
		return bkey_s_c_null;

	return bch2_btree_iter_peek_slot(iter);
}

struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
{
	if (!bch2_btree_iter_rewind(iter))
		return bkey_s_c_null;

	return bch2_btree_iter_peek_slot(iter);
}
static inline void bch2_btree_iter_init(struct btree_trans *trans,
			struct btree_iter *iter, enum btree_id btree_id)
{
	struct bch_fs *c = trans->c;
	unsigned i;

	iter->trans			= trans;
	iter->uptodate			= BTREE_ITER_NEED_TRAVERSE;
	iter->btree_id			= btree_id;
	iter->real_pos			= POS_MIN;
	iter->level			= 0;
	iter->min_depth			= 0;
	iter->locks_want		= 0;
	iter->nodes_locked		= 0;
	iter->nodes_intent_locked	= 0;
	for (i = 0; i < ARRAY_SIZE(iter->l); i++)
		iter->l[i].b		= BTREE_ITER_NO_NODE_INIT;

	prefetch(c->btree_roots[btree_id].b);
}
/* new transactional stuff: */

#ifdef CONFIG_BCACHEFS_DEBUG
static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
{
	struct btree_iter *iter;
	unsigned i;

	BUG_ON(trans->nr_sorted != hweight64(trans->iters_linked));

	trans_for_each_iter(trans, iter) {
		BUG_ON(iter->sorted_idx >= trans->nr_sorted);
		BUG_ON(trans->sorted[iter->sorted_idx] != iter->idx);
	}

	for (i = 0; i < trans->nr_sorted; i++) {
		unsigned idx = trans->sorted[i];

		EBUG_ON(!(trans->iters_linked & (1ULL << idx)));
		BUG_ON(trans->iters[idx].sorted_idx != i);
	}
}
#else
static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
#endif
static void btree_trans_verify_sorted(struct btree_trans *trans)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct btree_iter *iter, *prev = NULL;
	unsigned i;

	trans_for_each_iter_inorder(trans, iter, i) {
		BUG_ON(prev && btree_iter_cmp(prev, iter) > 0);
		prev = iter;
	}
#endif
}
static noinline void __btree_trans_sort_iters(struct btree_trans *trans)
{
	int i, l = 0, r = trans->nr_sorted, inc = 1;
	bool swapped;

	/*
	 * Cocktail shaker sort: this is efficient because iterators will be
	 * mostly sorted.
	 */
	do {
		swapped = false;

		for (i = inc > 0 ? l : r - 2;
		     i + 1 < r && i >= l;
		     i += inc) {
			if (btree_iter_cmp(trans->iters + trans->sorted[i],
					   trans->iters + trans->sorted[i + 1]) > 0) {
				swap(trans->sorted[i], trans->sorted[i + 1]);
				trans->iters[trans->sorted[i]].sorted_idx = i;
				trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1;
				swapped = true;
			}
		}

		if (inc > 0)
			--r;
		else
			++l;
		inc = -inc;
	} while (swapped);

	trans->iters_sorted = true;

	btree_trans_verify_sorted(trans);
}
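/*
 * Standalone illustration of the cocktail shaker sort used above (assumed
 * example, not part of bcachefs): alternating forward and backward bubble
 * passes, shrinking the unsorted window from both ends. On nearly sorted
 * input - the common case for the iterator list - it terminates after one or
 * two passes:
 */
#if 0
static void shaker_sort(int *a, int n)
{
	int l = 0, r = n, inc = 1, i;
	bool swapped;

	do {
		swapped = false;

		/* forward pass bubbles the max to a[r - 1], backward the min to a[l]: */
		for (i = inc > 0 ? l : r - 2;
		     i + 1 < r && i >= l;
		     i += inc)
			if (a[i] > a[i + 1]) {
				swap(a[i], a[i + 1]);
				swapped = true;
			}

		if (inc > 0)
			--r;
		else
			++l;
		inc = -inc;
	} while (swapped);
}
#endif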
static inline void btree_trans_sort_iters(struct btree_trans *trans)
{
	btree_trans_verify_sorted_refs(trans);

	if (trans->iters_sorted) {
		if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
			btree_trans_verify_sorted(trans);
		return;
	}
	__btree_trans_sort_iters(trans);
}
static inline void btree_iter_list_remove(struct btree_trans *trans,
					  struct btree_iter *iter)
{
	unsigned i;

	EBUG_ON(iter->sorted_idx >= trans->nr_sorted);
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	trans->nr_sorted--;
	memmove_u64s_down_small(trans->sorted + iter->sorted_idx,
				trans->sorted + iter->sorted_idx + 1,
				DIV_ROUND_UP(trans->nr_sorted - iter->sorted_idx, 8));
#else
	array_remove_item(trans->sorted, trans->nr_sorted, iter->sorted_idx);
#endif
	for (i = iter->sorted_idx; i < trans->nr_sorted; i++)
		trans->iters[trans->sorted[i]].sorted_idx = i;

	iter->sorted_idx = U8_MAX;
}

static inline void btree_iter_list_add(struct btree_trans *trans,
				       struct btree_iter *pos,
				       struct btree_iter *iter)
{
	unsigned i;

	iter->sorted_idx = pos ? pos->sorted_idx + 1 : 0;

#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	memmove_u64s_up_small(trans->sorted + iter->sorted_idx + 1,
			      trans->sorted + iter->sorted_idx,
			      DIV_ROUND_UP(trans->nr_sorted - iter->sorted_idx, 8));
	trans->nr_sorted++;
	trans->sorted[iter->sorted_idx] = iter->idx;
#else
	array_insert_item(trans->sorted, trans->nr_sorted, iter->sorted_idx, iter->idx);
#endif

	for (i = iter->sorted_idx; i < trans->nr_sorted; i++)
		trans->iters[trans->sorted[i]].sorted_idx = i;

	btree_trans_verify_sorted_refs(trans);
}
static void btree_iter_child_free(struct btree_trans *trans, struct btree_iter *iter)
{
	struct btree_iter *child = btree_iter_child(trans, iter);

	if (child) {
		bch2_trans_iter_free(trans, child);
		iter->child_idx = U8_MAX;
	}
}

static struct btree_iter *btree_iter_child_alloc(struct btree_trans *trans,
						 struct btree_iter *iter,
						 unsigned long ip)
{
	struct btree_iter *child = btree_iter_child(trans, iter);

	if (!child) {
		child = btree_trans_iter_alloc(trans, iter);
		child->ip_allocated	= ip;
		iter->child_idx		= child->idx;

		trans->iters_live	|= 1ULL << child->idx;
		trans->iters_touched	|= 1ULL << child->idx;
	}

	return child;
}
static inline void __bch2_trans_iter_free(struct btree_trans *trans,
					  unsigned idx)
{
	btree_iter_child_free(trans, &trans->iters[idx]);

	btree_iter_list_remove(trans, &trans->iters[idx]);

	__bch2_btree_iter_unlock(&trans->iters[idx]);
	trans->iters_linked		&= ~(1ULL << idx);
	trans->iters_live		&= ~(1ULL << idx);
	trans->iters_touched		&= ~(1ULL << idx);

	btree_trans_verify_sorted_refs(trans);
}

static bool have_iter_at_pos(struct btree_trans *trans,
			     struct btree_iter *iter)
{
	struct btree_iter *n;

	n = prev_btree_iter(trans, iter);
	if (n && !btree_iter_cmp(n, iter))
		return true;

	n = next_btree_iter(trans, iter);
	if (n && !btree_iter_cmp(n, iter))
		return true;

	return false;
}
int bch2_trans_iter_put(struct btree_trans *trans,
			struct btree_iter *iter)
{
	int ret;

	if (IS_ERR_OR_NULL(iter))
		return 0;

	BUG_ON(trans->iters + iter->idx != iter);
	BUG_ON(!btree_iter_live(trans, iter));

	ret = btree_iter_err(iter);

	if (!(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) &&
	    (!(trans->iters_touched & (1ULL << iter->idx)) ||
	     have_iter_at_pos(trans, iter)))
		__bch2_trans_iter_free(trans, iter->idx);

	trans->iters_live	&= ~(1ULL << iter->idx);
	return ret;
}

int bch2_trans_iter_free(struct btree_trans *trans,
			 struct btree_iter *iter)
{
	if (IS_ERR_OR_NULL(iter))
		return 0;

	set_btree_iter_dontneed(trans, iter);

	return bch2_trans_iter_put(trans, iter);
}
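/*
 * Lifecycle sketch (illustrative; do_something() is hypothetical): iterators
 * are tracked per transaction, gets and puts must balance, and put returns
 * any error stashed in the iterator so it isn't lost:
 */
#if 0
	iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos, 0);
	k = bch2_btree_iter_peek_slot(iter);
	ret = bkey_err(k) ?: do_something(k);
	ret = bch2_trans_iter_put(trans, iter) ?: ret;
#endif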
void bch2_dump_trans_iters_updates(struct btree_trans *trans)
{
	struct btree_iter *iter;
	struct btree_insert_entry *i;
	unsigned idx;
	char buf1[300], buf2[100];

	btree_trans_sort_iters(trans);

	trans_for_each_iter_inorder(trans, iter, idx)
		printk(KERN_ERR "iter: btree %s pos %s real_pos %s%s%s%s %pS\n",
		       bch2_btree_ids[iter->btree_id],
		       (bch2_bpos_to_text(&PBUF(buf1), iter->pos), buf1),
		       (bch2_bpos_to_text(&PBUF(buf2), iter->real_pos), buf2),
		       btree_iter_live(trans, iter) ? " live" : "",
		       (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
		       iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
		       (void *) iter->ip_allocated);

	trans_for_each_update(trans, i)
		printk(KERN_ERR "update: btree %s %s %pS\n",
		       bch2_btree_ids[i->btree_id],
		       (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, bkey_i_to_s_c(i->k)), buf1),
		       (void *) i->ip_allocated);
}
static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans,
						 struct btree_iter *pos)
{
	struct btree_iter *iter;
	unsigned idx;

	btree_trans_verify_sorted_refs(trans);

	if (unlikely(trans->iters_linked ==
		     ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) {
		bch2_dump_trans_iters_updates(trans);
		panic("trans iter overflow\n");
	}

	idx = __ffs64(~trans->iters_linked);
	iter = &trans->iters[idx];

	iter->trans		= trans;
	iter->idx		= idx;
	iter->child_idx		= U8_MAX;
	iter->sorted_idx	= U8_MAX;
	iter->nodes_locked	= 0;
	iter->nodes_intent_locked = 0;
	trans->iters_linked	|= 1ULL << idx;

	btree_iter_list_add(trans, pos, iter);
	return iter;
}
static void btree_iter_copy(struct btree_trans *trans, struct btree_iter *dst,
			    struct btree_iter *src)
{
	unsigned i, offset = offsetof(struct btree_iter, flags);

	__bch2_btree_iter_unlock(dst);
	btree_iter_child_free(trans, dst);

	memcpy((void *) dst + offset,
	       (void *) src + offset,
	       sizeof(struct btree_iter) - offset);

	for (i = 0; i < BTREE_MAX_DEPTH; i++)
		if (btree_node_locked(dst, i))
			six_lock_increment(&dst->l[i].b->c.lock,
					   __btree_lock_want(dst, i));

	dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
	dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
	trans->iters_sorted = false;
}
struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
					 enum btree_id btree_id, struct bpos pos,
					 unsigned locks_want,
					 unsigned depth,
					 unsigned flags)
{
	struct btree_iter *iter, *best = NULL;
	struct bpos real_pos, pos_min = POS_MIN;

	EBUG_ON(trans->restarted);

	if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
	    btree_node_type_is_extents(btree_id) &&
	    !(flags & BTREE_ITER_NOT_EXTENTS) &&
	    !(flags & BTREE_ITER_ALL_SNAPSHOTS))
		flags |= BTREE_ITER_IS_EXTENTS;

	if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
	    !btree_type_has_snapshots(btree_id))
		flags &= ~BTREE_ITER_ALL_SNAPSHOTS;

	if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
		pos.snapshot = btree_type_has_snapshots(btree_id)
			? U32_MAX : 0;

	real_pos = pos;

	if ((flags & BTREE_ITER_IS_EXTENTS) &&
	    bkey_cmp(pos, POS_MAX))
		real_pos = bpos_nosnap_successor(pos);

	trans_for_each_iter(trans, iter) {
		if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
			continue;

		if (iter->btree_id != btree_id)
			continue;

		if (best) {
			int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos),
					   bpos_diff(iter->real_pos, real_pos));

			if (cmp < 0 ||
			    (cmp == 0 && btree_iter_keep(trans, iter)))
				continue;
		}

		best = iter;
	}

	if (!best) {
		iter = btree_trans_iter_alloc(trans, best);
		bch2_btree_iter_init(trans, iter, btree_id);
	} else if (btree_iter_keep(trans, best)) {
		iter = btree_trans_iter_alloc(trans, best);
		btree_iter_copy(trans, iter, best);
	} else {
		iter = best;
	}

	trans->iters_live	|= 1ULL << iter->idx;
	trans->iters_touched	|= 1ULL << iter->idx;

	iter->flags = flags;

	iter->snapshot = pos.snapshot;

	/*
	 * If the iterator has locks_want greater than requested, we explicitly
	 * do not downgrade it here - on a transaction restart caused by a
	 * btree node split needing to upgrade locks, we might be
	 * putting/getting the iterator again. Downgrading iterators only
	 * happens via an explicit bch2_trans_downgrade().
	 */

	locks_want = min(locks_want, BTREE_MAX_DEPTH);
	if (locks_want > iter->locks_want) {
		iter->locks_want = locks_want;
		btree_iter_get_locks(trans, iter, true, _THIS_IP_);
	}

	while (iter->level != depth) {
		btree_node_unlock(iter, iter->level);
		iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
		iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
		if (iter->level < depth)
			iter->level++;
		else
			iter->level--;
	}

	iter->min_depth	= depth;

	bch2_btree_iter_set_pos(iter, pos);
	btree_iter_set_search_pos(iter, real_pos);

	trace_trans_get_iter(_RET_IP_, trans->ip,
			     btree_id,
			     &real_pos, locks_want, iter->uptodate,
			     best ? &best->real_pos : &pos_min,
			     best ? best->locks_want : U8_MAX,
			     best ? best->uptodate : U8_MAX);

	return iter;
}

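/*
 * A minimal usage sketch, assuming the bch2_trans_get_iter() and
 * bch2_trans_iter_put() wrappers declared in btree_iter.h (btree id and
 * flag names are illustrative; error handling abbreviated):
 *
 *	struct btree_iter *iter;
 *	struct bkey_s_c k;
 *	int ret;
 *
 *	iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos,
 *				   BTREE_ITER_SLOTS);
 *	k = bch2_btree_iter_peek_slot(iter);
 *	ret = bkey_err(k);
 *	if (!ret)
 *		... use k ...
 *	bch2_trans_iter_put(trans, iter);
 *
 * A second get at the same btree/position may then reuse this iterator
 * instead of allocating a fresh one - that is what the "best" matching
 * above is for.
 */
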
struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bpos pos,
					    unsigned locks_want,
					    unsigned depth,
					    unsigned flags)
{
	struct btree_iter *iter =
		__bch2_trans_get_iter(trans, btree_id, pos,
				      locks_want, depth,
				      BTREE_ITER_NODES|
				      BTREE_ITER_NOT_EXTENTS|
				      BTREE_ITER_ALL_SNAPSHOTS|
				      flags);

	BUG_ON(bkey_cmp(iter->pos, pos));
	BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
	BUG_ON(iter->level	!= depth);
	BUG_ON(iter->min_depth	!= depth);
	iter->ip_allocated = _RET_IP_;

	return iter;
}

struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
					  struct btree_iter *src)
{
	struct btree_iter *iter;

	iter = btree_trans_iter_alloc(trans, src);
	btree_iter_copy(trans, iter, src);

	trans->iters_live |= 1ULL << iter->idx;

	/*
	 * We don't need to preserve this iter since it's cheap to copy it
	 * again - this will cause trans_iter_put() to free it right away:
	 */
	set_btree_iter_dontneed(trans, iter);

	return iter;
}

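/*
 * A sketch of the intended pattern, assuming the bch2_trans_copy_iter()
 * wrapper from btree_iter.h: pin the current position with a copy, scan
 * ahead with it, then drop it:
 *
 *	struct btree_iter *copy = bch2_trans_copy_iter(trans, iter);
 *
 *	... peek/advance with copy ...
 *	bch2_trans_iter_put(trans, copy);
 */
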
void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
	size_t new_top = trans->mem_top + size;
	void *p;

	if (new_top > trans->mem_bytes) {
		size_t old_bytes = trans->mem_bytes;
		size_t new_bytes = roundup_pow_of_two(new_top);
		void *new_mem;

		WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);

		new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
			new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
			new_bytes = BTREE_TRANS_MEM_MAX;
			kfree(trans->mem);
		}

		if (!new_mem)
			return ERR_PTR(-ENOMEM);

		trans->mem = new_mem;
		trans->mem_bytes = new_bytes;

		if (old_bytes) {
			trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes);
			btree_trans_restart(trans);
			return ERR_PTR(-EINTR);
		}
	}

	p = trans->mem + trans->mem_top;
	trans->mem_top += size;
	return p;
}

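/*
 * Allocations made here live until the transaction is reset. If the
 * buffer has to be reallocated mid-transaction, the transaction is
 * restarted and this returns ERR_PTR(-EINTR), so callers must always
 * check the result - a sketch:
 *
 *	struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
 *
 *	if (IS_ERR(k))
 *		return PTR_ERR(k);
 */
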
inline void bch2_trans_unlink_iters(struct btree_trans *trans)
{
	u64 iters = trans->iters_linked &
		~trans->iters_touched &
		~trans->iters_live;

	while (iters) {
		unsigned idx = __ffs64(iters);

		iters &= ~(1ULL << idx);
		__bch2_trans_iter_free(trans, idx);
	}
}

/**
 * bch2_trans_begin() - reset a transaction after an interrupted attempt
 * @trans: transaction to reset
 *
 * While iterating over nodes or updating nodes, an attempt to lock a btree
 * node may return EINTR when the trylock fails. When this occurs
 * bch2_trans_begin() should be called and the transaction retried.
 */
void bch2_trans_begin(struct btree_trans *trans)
{
	struct btree_iter *iter;

	trans_for_each_iter(trans, iter)
		iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
				 BTREE_ITER_SET_POS_AFTER_COMMIT);

	/*
	 * XXX: we shouldn't be doing this if the transaction was restarted, but
	 * currently we still overflow transaction iterators if we do that
	 */
	bch2_trans_unlink_iters(trans);
	trans->iters_touched &= trans->iters_live;

	trans->extra_journal_res	= 0;
	trans->nr_updates		= 0;
	trans->mem_top			= 0;

	trans->hooks			= NULL;
	trans->extra_journal_entries	= NULL;
	trans->extra_journal_entry_u64s	= 0;

	if (trans->fs_usage_deltas) {
		trans->fs_usage_deltas->used = 0;
		memset((void *) trans->fs_usage_deltas +
		       offsetof(struct replicas_delta_list, memset_start), 0,
		       (void *) &trans->fs_usage_deltas->memset_end -
		       (void *) &trans->fs_usage_deltas->memset_start);
	}

	bch2_trans_cond_resched(trans);

	if (trans->restarted)
		bch2_btree_iter_traverse_all(trans);

	trans->restarted = false;
}

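/*
 * The retry loop this enables typically looks like the sketch below
 * (do_stuff() standing in for the caller's transactional work); most
 * callers go through the bch2_trans_do()/__bch2_trans_do() helpers
 * rather than open-coding it:
 *
 *	do {
 *		bch2_trans_begin(&trans);
 *		ret = do_stuff(&trans);
 *	} while (ret == -EINTR);
 */
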
static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
{
	size_t iters_bytes	= sizeof(struct btree_iter) * BTREE_ITER_MAX;
	size_t updates_bytes	= sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
	void *p = NULL;

	BUG_ON(trans->used_mempool);

#ifdef __KERNEL__
	p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
#endif
	if (!p)
		p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);

	trans->iters		= p; p += iters_bytes;
	trans->updates		= p; p += updates_bytes;
}

void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
		     unsigned expected_nr_iters,
		     size_t expected_mem_bytes)
	__acquires(&c->btree_trans_barrier)
{
	memset(trans, 0, sizeof(*trans));
	trans->c	= c;
	trans->ip	= _RET_IP_;

	/*
	 * reallocating iterators currently completely breaks
	 * bch2_trans_iter_put(), we always allocate the max:
	 */
	bch2_trans_alloc_iters(trans, c);

	if (expected_mem_bytes) {
		expected_mem_bytes = roundup_pow_of_two(expected_mem_bytes);
		trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);

		if (unlikely(!trans->mem)) {
			trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
			trans->mem_bytes = BTREE_TRANS_MEM_MAX;
		} else {
			trans->mem_bytes = expected_mem_bytes;
		}
	}

	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);

#ifdef CONFIG_BCACHEFS_DEBUG
	trans->pid = current->pid;
	mutex_lock(&c->btree_trans_lock);
	list_add(&trans->list, &c->btree_trans_list);
	mutex_unlock(&c->btree_trans_lock);
#endif
}

int bch2_trans_exit(struct btree_trans *trans)
	__releases(&c->btree_trans_barrier)
{
	struct bch_fs *c = trans->c;

	bch2_trans_unlock(trans);

#ifdef CONFIG_BCACHEFS_DEBUG
	if (trans->iters_live) {
		struct btree_iter *iter;

		trans_for_each_iter(trans, iter)
			btree_iter_child_free(trans, iter);
	}

	if (trans->iters_live) {
		struct btree_iter *iter;

		bch_err(c, "btree iterators leaked!");
		trans_for_each_iter(trans, iter)
			if (btree_iter_live(trans, iter))
				printk(KERN_ERR "  btree %s allocated at %pS\n",
				       bch2_btree_ids[iter->btree_id],
				       (void *) iter->ip_allocated);
		/* Be noisy about this: */
		bch2_fatal_error(c);
	}

	mutex_lock(&trans->c->btree_trans_lock);
	list_del(&trans->list);
	mutex_unlock(&trans->c->btree_trans_lock);
#endif

	srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);

	bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);

	if (trans->fs_usage_deltas) {
		if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
		    REPLICAS_DELTA_LIST_MAX)
			mempool_free(trans->fs_usage_deltas,
				     &trans->c->replicas_delta_pool);
		else
			kfree(trans->fs_usage_deltas);
	}

	if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
		mempool_free(trans->mem, &trans->c->btree_trans_mem_pool);
	else
		kfree(trans->mem);

#ifdef __KERNEL__
	/*
	 * Userspace doesn't have a real percpu implementation:
	 */
	trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
#endif

	if (trans->iters)
		mempool_free(trans->iters, &trans->c->btree_iters_pool);

	trans->mem = (void *) 0x1;
	trans->iters = (void *) 0x1;

	return trans->error ? -EIO : 0;
}

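/*
 * Together with bch2_trans_init() and bch2_trans_begin(), the full
 * lifecycle is sketched below (do_stuff() again stands in for the
 * caller's transactional work):
 *
 *	struct btree_trans trans;
 *	int ret;
 *
 *	bch2_trans_init(&trans, c, 0, 0);
 *	do {
 *		bch2_trans_begin(&trans);
 *		ret = do_stuff(&trans);
 *	} while (ret == -EINTR);
 *	ret = bch2_trans_exit(&trans) ?: ret;
 */
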
static void __maybe_unused
bch2_btree_iter_node_to_text(struct printbuf *out,
			     struct btree_bkey_cached_common *_b,
			     enum btree_iter_type type)
{
	pr_buf(out, " l=%u %s:",
	       _b->level, bch2_btree_ids[_b->btree_id]);
	bch2_bpos_to_text(out, btree_node_pos(_b, type));
}

#ifdef CONFIG_BCACHEFS_DEBUG
static bool trans_has_btree_nodes_locked(struct btree_trans *trans)
{
	struct btree_iter *iter;

	trans_for_each_iter(trans, iter)
		if (btree_iter_type(iter) != BTREE_ITER_CACHED &&
		    iter->nodes_locked)
			return true;
	return false;
}
#endif

void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct btree_trans *trans;
	struct btree_iter *iter;
	struct btree *b;
	unsigned l;

	mutex_lock(&c->btree_trans_lock);
	list_for_each_entry(trans, &c->btree_trans_list, list) {
		if (!trans_has_btree_nodes_locked(trans))
			continue;

		pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);

		trans_for_each_iter(trans, iter) {
			if (!iter->nodes_locked)
				continue;

			pr_buf(out, " iter %u %c %s:",
			       iter->idx,
			       btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
			       bch2_btree_ids[iter->btree_id]);
			bch2_bpos_to_text(out, iter->pos);
			pr_buf(out, "\n");

			for (l = 0; l < BTREE_MAX_DEPTH; l++) {
				if (btree_node_locked(iter, l)) {
					pr_buf(out, " %s l=%u ",
					       btree_node_intent_locked(iter, l) ? "i" : "r", l);
					bch2_btree_iter_node_to_text(out,
							(void *) iter->l[l].b,
							btree_iter_type(iter));
					pr_buf(out, "\n");
				}
			}
		}

		b = READ_ONCE(trans->locking);
		if (b) {
			iter = &trans->iters[trans->locking_iter_idx];
			pr_buf(out, " locking iter %u %c l=%u %s:",
			       trans->locking_iter_idx,
			       btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
			       trans->locking_level,
			       bch2_btree_ids[trans->locking_btree_id]);
			bch2_bpos_to_text(out, trans->locking_pos);

			pr_buf(out, " node ");
			bch2_btree_iter_node_to_text(out,
					(void *) &b->c,
					btree_iter_type(iter));
			pr_buf(out, "\n");
		}
	}
	mutex_unlock(&c->btree_trans_lock);
#endif
}

void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
	mempool_exit(&c->btree_trans_mem_pool);
	mempool_exit(&c->btree_iters_pool);
	cleanup_srcu_struct(&c->btree_trans_barrier);
}

int bch2_fs_btree_iter_init(struct bch_fs *c)
{
	unsigned nr = BTREE_ITER_MAX;

	INIT_LIST_HEAD(&c->btree_trans_list);
	mutex_init(&c->btree_trans_lock);

	return  init_srcu_struct(&c->btree_trans_barrier) ?:
		mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
			sizeof(struct btree_iter) * nr +
			sizeof(struct btree_insert_entry) * nr) ?:
		mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
					  BTREE_TRANS_MEM_MAX);
}