bcachefs: Reduce iter->trans usage
fs/bcachefs/btree_iter.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_methods.h"
5 #include "bkey_buf.h"
6 #include "btree_cache.h"
7 #include "btree_iter.h"
8 #include "btree_key_cache.h"
9 #include "btree_locking.h"
10 #include "btree_update.h"
11 #include "debug.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "journal.h"
15 #include "replicas.h"
16 #include "trace.h"
17
18 #include <linux/prefetch.h>
19
20 static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
21 static inline void btree_trans_sort_iters(struct btree_trans *);
22 static struct btree_iter *btree_iter_child_alloc(struct btree_trans *,
23                                         struct btree_iter *, unsigned long);
24 static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *,
25                                                  struct btree_iter *);
26 static void btree_iter_copy(struct btree_trans *, struct btree_iter *, struct btree_iter *);
27
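/*
 * Iterator/lock ordering: iterators compare by btree id, then cached before
 * non-cached, then by key position - the same ordering the would-deadlock
 * checks in __bch2_btree_node_lock() enforce.
 */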
28 static inline int btree_iter_cmp(const struct btree_iter *l,
29                                  const struct btree_iter *r)
30 {
31         return   cmp_int(l->btree_id, r->btree_id) ?:
32                 -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
33                  bkey_cmp(l->real_pos, r->real_pos);
34 }
35
36 static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
37 {
38         EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
39
40         /* Are we iterating over keys in all snapshots? */
41         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
42                 p = bpos_successor(p);
43         } else {
44                 p = bpos_nosnap_successor(p);
45                 p.snapshot = iter->snapshot;
46         }
47
48         return p;
49 }
50
51 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
52 {
53         EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
54
55         /* Are we iterating over keys in all snapshots? */
56         if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
57                 p = bpos_predecessor(p);
58         } else {
59                 p = bpos_nosnap_predecessor(p);
60                 p.snapshot = iter->snapshot;
61         }
62
63         return p;
64 }
65
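/*
 * iter->l[l].b is either a real node pointer, NULL, or one of the small
 * BTREE_ITER_NO_NODE_* sentinel values - anything below 128 isn't a node we
 * can lock or relock.
 */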
66 static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
67 {
68         return l < BTREE_MAX_DEPTH &&
69                 (unsigned long) iter->l[l].b >= 128;
70 }
71
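/*
 * Extents are indexed by the end of the range they cover, so to find the
 * extent covering iter->pos we have to search from the next position:
 */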
72 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
73 {
74         struct bpos pos = iter->pos;
75
76         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
77             bkey_cmp(pos, POS_MAX))
78                 pos = bkey_successor(iter, pos);
79         return pos;
80 }
81
82 static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
83                                               struct btree *b)
84 {
85         return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
86 }
87
88 static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
89                                              struct btree *b)
90 {
91         return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
92 }
93
94 static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
95                                           struct btree *b)
96 {
97         return iter->btree_id == b->c.btree_id &&
98                 !btree_iter_pos_before_node(iter, b) &&
99                 !btree_iter_pos_after_node(iter, b);
100 }
101
102 /* Btree node locking: */
103
104 void bch2_btree_node_unlock_write(struct btree_trans *trans,
105                         struct btree_iter *iter, struct btree *b)
106 {
107         bch2_btree_node_unlock_write_inlined(trans, iter, b);
108 }
109
110 void __bch2_btree_node_lock_write(struct btree_trans *trans,
111                         struct btree_iter *iter, struct btree *b)
112 {
113         struct btree_iter *linked;
114         unsigned readers = 0;
115
116         EBUG_ON(!btree_node_intent_locked(iter, b->c.level));
117
118         trans_for_each_iter(trans, linked)
119                 if (linked->l[b->c.level].b == b &&
120                     btree_node_read_locked(linked, b->c.level))
121                         readers++;
122
123         /*
124          * Must drop our read locks before calling six_lock_write() -
125          * six_unlock() won't do wakeups until the reader count
126          * goes to 0, and it's safe because we have the node intent
127          * locked:
128          */
129         if (!b->c.lock.readers)
130                 atomic64_sub(__SIX_VAL(read_lock, readers),
131                              &b->c.lock.state.counter);
132         else
133                 this_cpu_sub(*b->c.lock.readers, readers);
134
135         btree_node_lock_type(trans->c, b, SIX_LOCK_write);
136
137         if (!b->c.lock.readers)
138                 atomic64_add(__SIX_VAL(read_lock, readers),
139                              &b->c.lock.state.counter);
140         else
141                 this_cpu_add(*b->c.lock.readers, readers);
142 }
143
144 bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
145 {
146         struct btree *b = btree_iter_node(iter, level);
147         int want = __btree_lock_want(iter, level);
148
149         if (!is_btree_node(iter, level))
150                 return false;
151
152         if (race_fault())
153                 return false;
154
155         if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
156             (btree_node_lock_seq_matches(iter, b, level) &&
157              btree_node_lock_increment(iter->trans, b, level, want))) {
158                 mark_btree_node_locked(iter, level, want);
159                 return true;
160         } else {
161                 return false;
162         }
163 }
164
165 static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
166 {
167         struct btree *b = iter->l[level].b;
168
169         EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
170
171         if (!is_btree_node(iter, level))
172                 return false;
173
174         if (btree_node_intent_locked(iter, level))
175                 return true;
176
177         if (race_fault())
178                 return false;
179
180         if (btree_node_locked(iter, level)
181             ? six_lock_tryupgrade(&b->c.lock)
182             : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
183                 goto success;
184
185         if (btree_node_lock_seq_matches(iter, b, level) &&
186             btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
187                 btree_node_unlock(iter, level);
188                 goto success;
189         }
190
191         return false;
192 success:
193         mark_btree_node_intent_locked(iter, level);
194         return true;
195 }
196
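/*
 * Relock (or, if @upgrade, take intent locks on) every level from iter->level
 * up to iter->locks_want. On failure, unlock that level and everything below
 * it, so the next traverse has to walk back down from above the node we
 * failed to relock.
 */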
197 static inline bool btree_iter_get_locks(struct btree_trans *trans,
198                                         struct btree_iter *iter,
199                                         bool upgrade, unsigned long trace_ip)
200 {
201         unsigned l = iter->level;
202         int fail_idx = -1;
203
204         do {
205                 if (!btree_iter_node(iter, l))
206                         break;
207
208                 if (!(upgrade
209                       ? bch2_btree_node_upgrade(iter, l)
210                       : bch2_btree_node_relock(iter, l))) {
211                         (upgrade
212                          ? trace_node_upgrade_fail
213                          : trace_node_relock_fail)(trans->ip, trace_ip,
214                                         btree_iter_type(iter) == BTREE_ITER_CACHED,
215                                         iter->btree_id, &iter->real_pos,
216                                         l, iter->l[l].lock_seq,
217                                         is_btree_node(iter, l)
218                                         ? 0
219                                         : (unsigned long) iter->l[l].b,
220                                         is_btree_node(iter, l)
221                                         ? iter->l[l].b->c.lock.state.seq
222                                         : 0);
223                         fail_idx = l;
224                         btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
225                 }
226
227                 l++;
228         } while (l < iter->locks_want);
229
230         /*
231          * When we fail to get a lock, we have to ensure that any child nodes
232          * can't be relocked, so bch2_btree_iter_traverse() has to walk back up to
233          * the node that we failed to relock:
234          */
235         while (fail_idx >= 0) {
236                 btree_node_unlock(iter, fail_idx);
237                 iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
238                 --fail_idx;
239         }
240
241         if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
242                 iter->uptodate = BTREE_ITER_NEED_PEEK;
243
244         bch2_btree_trans_verify_locks(trans);
245
246         return iter->uptodate < BTREE_ITER_NEED_RELOCK;
247 }
248
249 static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
250                                   enum btree_iter_type type)
251 {
252         return  type != BTREE_ITER_CACHED
253                 ? container_of(_b, struct btree, c)->key.k.p
254                 : container_of(_b, struct bkey_cached, c)->key.pos;
255 }
256
257 /* Slowpath: */
258 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
259                             unsigned level, struct btree_iter *iter,
260                             enum six_lock_type type,
261                             six_lock_should_sleep_fn should_sleep_fn, void *p,
262                             unsigned long ip)
263 {
264         struct btree_trans *trans = iter->trans;
265         struct btree_iter *linked, *deadlock_iter = NULL;
266         u64 start_time = local_clock();
267         unsigned reason = 9;
268         bool ret;
269
270         /* Check if it's safe to block: */
271         trans_for_each_iter(trans, linked) {
272                 if (!linked->nodes_locked)
273                         continue;
274
275                 /*
276                  * Can't block taking an intent lock if we have _any_ nodes read
277                  * locked:
278                  *
279                  * - Our read lock blocks another thread with an intent lock on
280                  *   the same node from getting a write lock, and thus from
281                  *   dropping its intent lock
282                  *
283                  * - And the other thread may have multiple nodes intent locked:
284                  *   both the node we want to intent lock, and the node we
285                  *   already have read locked - deadlock:
286                  */
287                 if (type == SIX_LOCK_intent &&
288                     linked->nodes_locked != linked->nodes_intent_locked) {
289                         deadlock_iter = linked;
290                         reason = 1;
291                 }
292
293                 if (linked->btree_id != iter->btree_id) {
294                         if (linked->btree_id > iter->btree_id) {
295                                 deadlock_iter = linked;
296                                 reason = 3;
297                         }
298                         continue;
299                 }
300
301                 /*
302          * Within the same btree, cached iterators come before non-cached
303          * iterators:
304                  */
305                 if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
306                         if (btree_iter_is_cached(iter)) {
307                                 deadlock_iter = linked;
308                                 reason = 4;
309                         }
310                         continue;
311                 }
312
313                 /*
314                  * Interior nodes must be locked before their descendants: if
315          * another iterator has locked possible descendants of the node
316                  * we're about to lock, it must have the ancestors locked too:
317                  */
318                 if (level > __fls(linked->nodes_locked)) {
319                         deadlock_iter = linked;
320                         reason = 5;
321                 }
322
323                 /* Must lock btree nodes in key order: */
324                 if (btree_node_locked(linked, level) &&
325                     bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
326                                                  btree_iter_type(linked))) <= 0) {
327                         deadlock_iter = linked;
328                         reason = 7;
329                 }
330         }
331
332         if (unlikely(deadlock_iter)) {
333                 trace_trans_restart_would_deadlock(trans->ip, ip,
334                                 trans->in_traverse_all, reason,
335                                 deadlock_iter->btree_id,
336                                 btree_iter_type(deadlock_iter),
337                                 &deadlock_iter->real_pos,
338                                 iter->btree_id,
339                                 btree_iter_type(iter),
340                                 &pos);
341                 btree_trans_restart(trans);
342                 return false;
343         }
344
345         if (six_trylock_type(&b->c.lock, type))
346                 return true;
347
348 #ifdef CONFIG_BCACHEFS_DEBUG
349         trans->locking_iter_idx = iter->idx;
350         trans->locking_pos      = pos;
351         trans->locking_btree_id = iter->btree_id;
352         trans->locking_level    = level;
353         trans->locking          = b;
354 #endif
355
356         ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
357
358 #ifdef CONFIG_BCACHEFS_DEBUG
359         trans->locking = NULL;
360 #endif
361         if (ret)
362                 bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
363                                        start_time);
364         return ret;
365 }
366
367 /* Btree iterator locking: */
368
369 #ifdef CONFIG_BCACHEFS_DEBUG
370 static void bch2_btree_iter_verify_locks(struct btree_trans *trans,
371                                          struct btree_iter *iter)
372 {
373         unsigned l;
374
375         if (!(trans->iters_linked & (1ULL << iter->idx))) {
376                 BUG_ON(iter->nodes_locked);
377                 return;
378         }
379
380         for (l = 0; btree_iter_node(iter, l); l++) {
381                 if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
382                     !btree_node_locked(iter, l))
383                         continue;
384
385                 BUG_ON(btree_lock_want(iter, l) !=
386                        btree_node_locked_type(iter, l));
387         }
388 }
389
390 void bch2_btree_trans_verify_locks(struct btree_trans *trans)
391 {
392         struct btree_iter *iter;
393
394         trans_for_each_iter(trans, iter)
395                 bch2_btree_iter_verify_locks(trans, iter);
396 }
397 #else
398 static inline void bch2_btree_iter_verify_locks(struct btree_trans *trans,
399                                                 struct btree_iter *iter) {}
400 #endif
401
402 /*
403  * Only for btree_cache.c - only relocks intent locks
404  */
405 bool bch2_btree_iter_relock_intent(struct btree_iter *iter)
406 {
407         struct btree_trans *trans = iter->trans;
408         unsigned l;
409
410         for (l = iter->level;
411              l < iter->locks_want && btree_iter_node(iter, l);
412              l++) {
413                 if (!bch2_btree_node_relock(iter, l)) {
414                         trace_node_relock_fail(trans->ip, _RET_IP_,
415                                         btree_iter_type(iter) == BTREE_ITER_CACHED,
416                                         iter->btree_id, &iter->real_pos,
417                                         l, iter->l[l].lock_seq,
418                                         is_btree_node(iter, l)
419                                         ? 0
420                                         : (unsigned long) iter->l[l].b,
421                                         is_btree_node(iter, l)
422                                         ? iter->l[l].b->c.lock.state.seq
423                                         : 0);
424                         btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
425                         btree_trans_restart(trans);
426                         return false;
427                 }
428         }
429
430         return true;
431 }
432
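/* Relock all of this iterator's nodes; on failure, restart the transaction: */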
433 __flatten
434 static bool bch2_btree_iter_relock(struct btree_trans *trans,
435                         struct btree_iter *iter, unsigned long trace_ip)
436 {
437         bool ret = btree_iter_get_locks(trans, iter, false, trace_ip);
438
439         if (!ret)
440                 btree_trans_restart(trans);
441         return ret;
442 }
443
444 bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
445                                unsigned new_locks_want)
446 {
447         struct btree_trans *trans = iter->trans;
448         struct btree_iter *linked;
449
450         EBUG_ON(iter->locks_want >= new_locks_want);
451
452         iter->locks_want = new_locks_want;
453
454         if (btree_iter_get_locks(trans, iter, true, _THIS_IP_))
455                 return true;
456
457         /*
458          * XXX: this is ugly - we'd prefer to not be mucking with other
459          * iterators in the btree_trans here.
460          *
461          * On failure to upgrade the iterator, setting iter->locks_want and
462          * calling get_locks() is sufficient to make bch2_btree_iter_traverse()
463          * get the locks we want on transaction restart.
464          *
465          * But if this iterator was a clone, on transaction restart what we did
466          * to this iterator isn't going to be preserved.
467          *
468          * Possibly we could add an iterator field for the parent iterator when
469          * an iterator is a copy - for now, we'll just upgrade any other
470          * iterators with the same btree id.
471          *
472          * The code below used to be needed to ensure ancestor nodes get locked
473          * before interior nodes - now that's handled by
474          * bch2_btree_iter_traverse_all().
475          */
476         trans_for_each_iter(trans, linked)
477                 if (linked != iter &&
478                     btree_iter_type(linked) == btree_iter_type(iter) &&
479                     linked->btree_id == iter->btree_id &&
480                     linked->locks_want < new_locks_want) {
481                         linked->locks_want = new_locks_want;
482                         btree_iter_get_locks(trans, linked, true, _THIS_IP_);
483                 }
484
485         if (iter->should_be_locked)
486                 btree_trans_restart(trans);
487         return false;
488 }
489
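/*
 * Unlock levels that no longer need to be locked (>= new_locks_want); if that
 * includes iter->level itself, downgrade its intent lock to a read lock
 * instead of dropping it.
 */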
490 void __bch2_btree_iter_downgrade(struct btree_iter *iter,
491                                  unsigned new_locks_want)
492 {
493         unsigned l;
494
495         EBUG_ON(iter->locks_want < new_locks_want);
496
497         iter->locks_want = new_locks_want;
498
499         while (iter->nodes_locked &&
500                (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
501                 if (l > iter->level) {
502                         btree_node_unlock(iter, l);
503                 } else {
504                         if (btree_node_intent_locked(iter, l)) {
505                                 six_lock_downgrade(&iter->l[l].b->c.lock);
506                                 iter->nodes_intent_locked ^= 1 << l;
507                         }
508                         break;
509                 }
510         }
511
512         bch2_btree_trans_verify_locks(iter->trans);
513 }
514
515 void bch2_trans_downgrade(struct btree_trans *trans)
516 {
517         struct btree_iter *iter;
518
519         trans_for_each_iter(trans, iter)
520                 bch2_btree_iter_downgrade(iter);
521 }
522
523 /* Btree transaction locking: */
524
525 static inline bool btree_iter_should_be_locked(struct btree_iter *iter)
526 {
527         return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) ||
528                 iter->should_be_locked;
529 }
530
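/*
 * Relock every iterator that must still be locked (should_be_locked, or
 * KEEP_UNTIL_COMMIT); if any of them can't be relocked the transaction has
 * been restarted and we return false:
 */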
531 bool bch2_trans_relock(struct btree_trans *trans)
532 {
533         struct btree_iter *iter;
534
535         if (unlikely(trans->restarted))
536                 return false;
537
538         trans_for_each_iter(trans, iter)
539                 if (btree_iter_should_be_locked(iter) &&
540                     !bch2_btree_iter_relock(trans, iter, _RET_IP_)) {
541                         trace_trans_restart_relock(trans->ip, _RET_IP_,
542                                         iter->btree_id, &iter->real_pos);
543                         BUG_ON(!trans->restarted);
544                         return false;
545                 }
546         return true;
547 }
548
549 void bch2_trans_unlock(struct btree_trans *trans)
550 {
551         struct btree_iter *iter;
552
553         trans_for_each_iter(trans, iter)
554                 __bch2_btree_iter_unlock(iter);
555 }
556
557 /* Btree iterator: */
558
559 #ifdef CONFIG_BCACHEFS_DEBUG
560
561 static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
562 {
563         struct bkey_cached *ck;
564         bool locked = btree_node_locked(iter, 0);
565
566         if (!bch2_btree_node_relock(iter, 0))
567                 return;
568
569         ck = (void *) iter->l[0].b;
570         BUG_ON(ck->key.btree_id != iter->btree_id ||
571                bkey_cmp(ck->key.pos, iter->pos));
572
573         if (!locked)
574                 btree_node_unlock(iter, 0);
575 }
576
577 static void bch2_btree_iter_verify_level(struct btree_iter *iter,
578                                          unsigned level)
579 {
580         struct btree_iter_level *l;
581         struct btree_node_iter tmp;
582         bool locked;
583         struct bkey_packed *p, *k;
584         char buf1[100], buf2[100], buf3[100];
585         const char *msg;
586
587         if (!bch2_debug_check_iterators)
588                 return;
589
590         l       = &iter->l[level];
591         tmp     = l->iter;
592         locked  = btree_node_locked(iter, level);
593
594         if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
595                 if (!level)
596                         bch2_btree_iter_verify_cached(iter);
597                 return;
598         }
599
600         BUG_ON(iter->level < iter->min_depth);
601
602         if (!btree_iter_node(iter, level))
603                 return;
604
605         if (!bch2_btree_node_relock(iter, level))
606                 return;
607
608         BUG_ON(!btree_iter_pos_in_node(iter, l->b));
609
610         /*
611          * node iterators don't use leaf node iterator:
612          */
613         if (btree_iter_type(iter) == BTREE_ITER_NODES &&
614             level <= iter->min_depth)
615                 goto unlock;
616
617         bch2_btree_node_iter_verify(&l->iter, l->b);
618
619         /*
620          * For interior nodes, the iterator will have skipped past
621          * deleted keys:
622          *
623          * For extents, the iterator may have skipped past deleted keys (but not
624          * whiteouts)
625          */
626         p = level || btree_node_type_is_extents(iter->btree_id)
627                 ? bch2_btree_node_iter_prev(&tmp, l->b)
628                 : bch2_btree_node_iter_prev_all(&tmp, l->b);
629         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
630
631         if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) {
632                 msg = "before";
633                 goto err;
634         }
635
636         if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
637                 msg = "after";
638                 goto err;
639         }
640 unlock:
641         if (!locked)
642                 btree_node_unlock(iter, level);
643         return;
644 err:
645         strcpy(buf2, "(none)");
646         strcpy(buf3, "(none)");
647
648         bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
649
650         if (p) {
651                 struct bkey uk = bkey_unpack_key(l->b, p);
652                 bch2_bkey_to_text(&PBUF(buf2), &uk);
653         }
654
655         if (k) {
656                 struct bkey uk = bkey_unpack_key(l->b, k);
657                 bch2_bkey_to_text(&PBUF(buf3), &uk);
658         }
659
660         panic("iterator should be %s key at level %u:\n"
661               "iter pos %s\n"
662               "prev key %s\n"
663               "cur  key %s\n",
664               msg, level, buf1, buf2, buf3);
665 }
666
667 static void bch2_btree_iter_verify(struct btree_iter *iter)
668 {
669         struct btree_trans *trans = iter->trans;
670         struct bch_fs *c = trans->c;
671         enum btree_iter_type type = btree_iter_type(iter);
672         unsigned i;
673
674         EBUG_ON(iter->btree_id >= BTREE_ID_NR);
675
676         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
677                iter->pos.snapshot != iter->snapshot);
678
679         BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
680                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
681
682         BUG_ON(type == BTREE_ITER_NODES &&
683                !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
684
685         BUG_ON(type != BTREE_ITER_NODES &&
686                (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
687                !btree_type_has_snapshots(iter->btree_id));
688
689         for (i = 0; i < (type != BTREE_ITER_CACHED ? BTREE_MAX_DEPTH : 1); i++) {
690                 if (!iter->l[i].b) {
691                         BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i);
692                         break;
693                 }
694
695                 bch2_btree_iter_verify_level(iter, i);
696         }
697
698         bch2_btree_iter_verify_locks(trans, iter);
699 }
700
701 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
702 {
703         enum btree_iter_type type = btree_iter_type(iter);
704
705         BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
706                iter->pos.snapshot != iter->snapshot);
707
708         BUG_ON((type == BTREE_ITER_KEYS ||
709                 type == BTREE_ITER_CACHED) &&
710                (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
711                 bkey_cmp(iter->pos, iter->k.p) > 0));
712 }
713
714 void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
715 {
716         struct btree_iter *iter;
717
718         if (!bch2_debug_check_iterators)
719                 return;
720
721         trans_for_each_iter_with_node(trans, b, iter)
722                 bch2_btree_iter_verify_level(iter, b->c.level);
723 }
724
725 #else
726
727 static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
728 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
729 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
730
731 #endif
732
733 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
734                                         struct btree *b,
735                                         struct bset_tree *t,
736                                         struct bkey_packed *k)
737 {
738         struct btree_node_iter_set *set;
739
740         btree_node_iter_for_each(iter, set)
741                 if (set->end == t->end_offset) {
742                         set->k = __btree_node_key_to_offset(b, k);
743                         bch2_btree_node_iter_sort(iter, b);
744                         return;
745                 }
746
747         bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
748 }
749
750 static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
751                                                struct btree *b,
752                                                struct bkey_packed *where)
753 {
754         struct btree_iter_level *l = &iter->l[b->c.level];
755
756         if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
757                 return;
758
759         if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0)
760                 bch2_btree_node_iter_advance(&l->iter, l->b);
761
762         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
763 }
764
765 void bch2_btree_iter_fix_key_modified(struct btree_trans *trans,
766                                       struct btree_iter *iter,
767                                       struct btree *b,
768                                       struct bkey_packed *where)
769 {
770         struct btree_iter *linked;
771
772         trans_for_each_iter_with_node(trans, b, linked) {
773                 __bch2_btree_iter_fix_key_modified(linked, b, where);
774                 bch2_btree_iter_verify_level(linked, b->c.level);
775         }
776 }
777
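/*
 * A key at @where in bset @t was changed: @clobber_u64s u64s were replaced by
 * @new_u64s, shifting everything after it. Fix up this node iterator's
 * position in @t so it still points at the same place in the node, and mark
 * the btree iterator dirty if its current key changed.
 */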
778 static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
779                                       struct btree *b,
780                                       struct btree_node_iter *node_iter,
781                                       struct bset_tree *t,
782                                       struct bkey_packed *where,
783                                       unsigned clobber_u64s,
784                                       unsigned new_u64s)
785 {
786         const struct bkey_packed *end = btree_bkey_last(b, t);
787         struct btree_node_iter_set *set;
788         unsigned offset = __btree_node_key_to_offset(b, where);
789         int shift = new_u64s - clobber_u64s;
790         unsigned old_end = t->end_offset - shift;
791         unsigned orig_iter_pos = node_iter->data[0].k;
792         bool iter_current_key_modified =
793                 orig_iter_pos >= offset &&
794                 orig_iter_pos <= offset + clobber_u64s;
795
796         btree_node_iter_for_each(node_iter, set)
797                 if (set->end == old_end)
798                         goto found;
799
800         /* didn't find the bset in the iterator - might have to re-add it: */
801         if (new_u64s &&
802             bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
803                 bch2_btree_node_iter_push(node_iter, b, where, end);
804                 goto fixup_done;
805         } else {
806                 /* Iterator is after key that changed */
807                 return;
808         }
809 found:
810         set->end = t->end_offset;
811
812         /* Iterator hasn't gotten to the key that changed yet: */
813         if (set->k < offset)
814                 return;
815
816         if (new_u64s &&
817             bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
818                 set->k = offset;
819         } else if (set->k < offset + clobber_u64s) {
820                 set->k = offset + new_u64s;
821                 if (set->k == set->end)
822                         bch2_btree_node_iter_set_drop(node_iter, set);
823         } else {
824                 /* Iterator is after key that changed */
825                 set->k = (int) set->k + shift;
826                 return;
827         }
828
829         bch2_btree_node_iter_sort(node_iter, b);
830 fixup_done:
831         if (node_iter->data[0].k != orig_iter_pos)
832                 iter_current_key_modified = true;
833
834         /*
835          * When a new key is added, and the node iterator now points to that
836          * key, the iterator might have skipped past deleted keys that should
837          * come after the key the iterator now points to. We have to rewind to
838          * before those deleted keys - otherwise
839          * bch2_btree_node_iter_prev_all() breaks:
840          */
841         if (!bch2_btree_node_iter_end(node_iter) &&
842             iter_current_key_modified &&
843             (b->c.level ||
844              btree_node_type_is_extents(iter->btree_id))) {
845                 struct bset_tree *t;
846                 struct bkey_packed *k, *k2, *p;
847
848                 k = bch2_btree_node_iter_peek_all(node_iter, b);
849
850                 for_each_bset(b, t) {
851                         bool set_pos = false;
852
853                         if (node_iter->data[0].end == t->end_offset)
854                                 continue;
855
856                         k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
857
858                         while ((p = bch2_bkey_prev_all(b, t, k2)) &&
859                                bkey_iter_cmp(b, k, p) < 0) {
860                                 k2 = p;
861                                 set_pos = true;
862                         }
863
864                         if (set_pos)
865                                 btree_node_iter_set_set_pos(node_iter,
866                                                             b, t, k2);
867                 }
868         }
869
870         if (!b->c.level &&
871             node_iter == &iter->l[0].iter &&
872             iter_current_key_modified)
873                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
874 }
875
876 void bch2_btree_node_iter_fix(struct btree_trans *trans,
877                               struct btree_iter *iter,
878                               struct btree *b,
879                               struct btree_node_iter *node_iter,
880                               struct bkey_packed *where,
881                               unsigned clobber_u64s,
882                               unsigned new_u64s)
883 {
884         struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
885         struct btree_iter *linked;
886
887         if (node_iter != &iter->l[b->c.level].iter) {
888                 __bch2_btree_node_iter_fix(iter, b, node_iter, t,
889                                            where, clobber_u64s, new_u64s);
890
891                 if (bch2_debug_check_iterators)
892                         bch2_btree_node_iter_verify(node_iter, b);
893         }
894
895         trans_for_each_iter_with_node(trans, b, linked) {
896                 __bch2_btree_node_iter_fix(linked, b,
897                                            &linked->l[b->c.level].iter, t,
898                                            where, clobber_u64s, new_u64s);
899                 bch2_btree_iter_verify_level(linked, b->c.level);
900         }
901 }
902
903 static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
904                                                   struct btree_iter_level *l,
905                                                   struct bkey *u,
906                                                   struct bkey_packed *k)
907 {
908         struct bkey_s_c ret;
909
910         if (unlikely(!k)) {
911                 /*
912                  * signal to bch2_btree_iter_peek_slot() that we're currently at
913                  * a hole
914                  */
915                 u->type = KEY_TYPE_deleted;
916                 return bkey_s_c_null;
917         }
918
919         ret = bkey_disassemble(l->b, k, u);
920
921         /*
922          * XXX: bch2_btree_bset_insert_key() generates invalid keys when we
923          * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key
924          * being overwritten but doesn't change k->size. But this is ok, because
925          * those keys are never written out, we just have to avoid a spurious
926          * assertion here:
927          */
928         if (bch2_debug_check_bkeys && !bkey_deleted(ret.k))
929                 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
930
931         return ret;
932 }
933
934 /* peek_all() doesn't skip deleted keys */
935 static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter,
936                                                         struct btree_iter_level *l)
937 {
938         return __btree_iter_unpack(iter, l, &iter->k,
939                         bch2_btree_node_iter_peek_all(&l->iter, l->b));
940 }
941
942 static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter,
943                                                     struct btree_iter_level *l)
944 {
945         struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
946                         bch2_btree_node_iter_peek(&l->iter, l->b));
947
948         iter->real_pos = k.k ? k.k->p : l->b->key.k.p;
949         iter->trans->iters_sorted = false;
950         return k;
951 }
952
953 static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter,
954                                                     struct btree_iter_level *l)
955 {
956         struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
957                         bch2_btree_node_iter_prev(&l->iter, l->b));
958
959         iter->real_pos = k.k ? k.k->p : l->b->data->min_key;
960         iter->trans->iters_sorted = false;
961         return k;
962 }
963
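/*
 * Advance the node iterator past keys that compare strictly before
 * iter->real_pos; gives up and returns false after @max_advance steps
 * (if max_advance is positive).
 */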
964 static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
965                                              struct btree_iter_level *l,
966                                              int max_advance)
967 {
968         struct bkey_packed *k;
969         int nr_advanced = 0;
970
971         while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
972                bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
973                 if (max_advance > 0 && nr_advanced >= max_advance)
974                         return false;
975
976                 bch2_btree_node_iter_advance(&l->iter, l->b);
977                 nr_advanced++;
978         }
979
980         return true;
981 }
982
983 /*
984  * Verify that iterator for parent node points to child node:
985  */
986 static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
987 {
988         struct btree_iter_level *l;
989         unsigned plevel;
990         bool parent_locked;
991         struct bkey_packed *k;
992
993         if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
994                 return;
995
996         plevel = b->c.level + 1;
997         if (!btree_iter_node(iter, plevel))
998                 return;
999
1000         parent_locked = btree_node_locked(iter, plevel);
1001
1002         if (!bch2_btree_node_relock(iter, plevel))
1003                 return;
1004
1005         l = &iter->l[plevel];
1006         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1007         if (!k ||
1008             bkey_deleted(k) ||
1009             bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
1010                 char buf1[100];
1011                 char buf2[100];
1012                 char buf3[100];
1013                 char buf4[100];
1014                 struct bkey uk = bkey_unpack_key(b, k);
1015
1016                 bch2_dump_btree_node(iter->trans->c, l->b);
1017                 bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
1018                 bch2_bkey_to_text(&PBUF(buf2), &uk);
1019                 bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
1020         bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
1021                 panic("parent iter doesn't point to new node:\n"
1022                       "iter pos %s %s\n"
1023                       "iter key %s\n"
1024                       "new node %s-%s\n",
1025                       bch2_btree_ids[iter->btree_id], buf1,
1026                       buf2, buf3, buf4);
1027         }
1028
1029         if (!parent_locked)
1030                 btree_node_unlock(iter, b->c.level + 1);
1031 }
1032
1033 static inline void __btree_iter_init(struct btree_iter *iter,
1034                                      unsigned level)
1035 {
1036         struct btree_iter_level *l = &iter->l[level];
1037
1038         bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos);
1039
1040         /*
1041          * Iterators to interior nodes should always be pointed at the first non
1042          * whiteout:
1043          */
1044         if (level)
1045                 bch2_btree_node_iter_peek(&l->iter, l->b);
1046
1047         btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1048 }
1049
1050 static inline void btree_iter_node_set(struct btree_iter *iter,
1051                                        struct btree *b)
1052 {
1053         BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
1054
1055         btree_iter_verify_new_node(iter, b);
1056
1057         EBUG_ON(!btree_iter_pos_in_node(iter, b));
1058         EBUG_ON(b->c.lock.state.seq & 1);
1059
1060         iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
1061         iter->l[b->c.level].b = b;
1062         __btree_iter_init(iter, b->c.level);
1063 }
1064
1065 /*
1066  * A btree node is being replaced - update the iterator to point to the new
1067  * node:
1068  */
1069 void bch2_btree_iter_node_replace(struct btree_trans *trans,
1070                         struct btree_iter *iter, struct btree *b)
1071 {
1072         enum btree_node_locked_type t;
1073         struct btree_iter *linked;
1074
1075         trans_for_each_iter(trans, linked)
1076                 if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
1077                     btree_iter_pos_in_node(linked, b)) {
1078                         /*
1079                          * bch2_btree_iter_node_drop() has already been called -
1080                          * the old node we're replacing has already been
1081                          * unlocked and the pointer invalidated
1082                          */
1083                         BUG_ON(btree_node_locked(linked, b->c.level));
1084
1085                         t = btree_lock_want(linked, b->c.level);
1086                         if (t != BTREE_NODE_UNLOCKED) {
1087                                 six_lock_increment(&b->c.lock, (enum six_lock_type) t);
1088                                 mark_btree_node_locked(linked, b->c.level, (enum six_lock_type) t);
1089                         }
1090
1091                         btree_iter_node_set(linked, b);
1092                 }
1093 }
1094
1095 void bch2_btree_iter_node_drop(struct btree_trans *trans,
1096                         struct btree_iter *iter, struct btree *b)
1097 {
1098         struct btree_iter *linked;
1099         unsigned level = b->c.level;
1100
1101         trans_for_each_iter(trans, linked)
1102                 if (linked->l[level].b == b) {
1103                         btree_node_unlock(linked, level);
1104                         linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
1105                 }
1106 }
1107
1108 /*
1109  * A btree node has been modified in such a way as to invalidate iterators - fix
1110  * them:
1111  */
1112 void bch2_btree_iter_reinit_node(struct btree_trans *trans,
1113                         struct btree_iter *iter, struct btree *b)
1114 {
1115         struct btree_iter *linked;
1116
1117         trans_for_each_iter_with_node(trans, b, linked)
1118                 __btree_iter_init(linked, b->c.level);
1119 }
1120
1121 static int lock_root_check_fn(struct six_lock *lock, void *p)
1122 {
1123         struct btree *b = container_of(lock, struct btree, c.lock);
1124         struct btree **rootp = p;
1125
1126         return b == *rootp ? 0 : -1;
1127 }
1128
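/*
 * Lock the btree root: the root can be freed and replaced while we're waiting
 * for the lock, so re-check and retry if it changed. Returns 1 if the tree is
 * shallower than the depth we want, i.e. the iterator is at the end of the
 * btree.
 */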
1129 static inline int btree_iter_lock_root(struct btree_trans *trans,
1130                                        struct btree_iter *iter,
1131                                        unsigned depth_want,
1132                                        unsigned long trace_ip)
1133 {
1134         struct bch_fs *c = trans->c;
1135         struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
1136         enum six_lock_type lock_type;
1137         unsigned i;
1138
1139         EBUG_ON(iter->nodes_locked);
1140
1141         while (1) {
1142                 b = READ_ONCE(*rootp);
1143                 iter->level = READ_ONCE(b->c.level);
1144
1145                 if (unlikely(iter->level < depth_want)) {
1146                         /*
1147                          * the root is at a lower depth than the depth we want:
1148                          * got to the end of the btree, or we're walking nodes
1149                          * greater than some depth and there are no nodes >=
1150                          * that depth
1151                          */
1152                         iter->level = depth_want;
1153                         for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
1154                                 iter->l[i].b = NULL;
1155                         return 1;
1156                 }
1157
1158                 lock_type = __btree_lock_want(iter, iter->level);
1159                 if (unlikely(!btree_node_lock(b, SPOS_MAX, iter->level,
1160                                               iter, lock_type,
1161                                               lock_root_check_fn, rootp,
1162                                               trace_ip))) {
1163                         if (trans->restarted)
1164                                 return -EINTR;
1165                         continue;
1166                 }
1167
1168                 if (likely(b == READ_ONCE(*rootp) &&
1169                            b->c.level == iter->level &&
1170                            !race_fault())) {
1171                         for (i = 0; i < iter->level; i++)
1172                                 iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
1173                         iter->l[iter->level].b = b;
1174                         for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
1175                                 iter->l[i].b = NULL;
1176
1177                         mark_btree_node_locked(iter, iter->level, lock_type);
1178                         btree_iter_node_set(iter, b);
1179                         return 0;
1180                 }
1181
1182                 six_unlock_type(&b->c.lock, lock_type);
1183         }
1184 }
1185
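/*
 * Kick off reads for the child nodes we're likely to descend into next, so
 * they're in the cache by the time we get there. We prefetch more aggressively
 * while BCH_FS_STARTED isn't yet set (i.e. while the filesystem is still
 * starting up/recovering), when whole btrees tend to get walked:
 */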
1186 noinline
1187 static int btree_iter_prefetch(struct btree_trans *trans, struct btree_iter *iter)
1188 {
1189         struct bch_fs *c = trans->c;
1190         struct btree_iter_level *l = &iter->l[iter->level];
1191         struct btree_node_iter node_iter = l->iter;
1192         struct bkey_packed *k;
1193         struct bkey_buf tmp;
1194         unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
1195                 ? (iter->level > 1 ? 0 :  2)
1196                 : (iter->level > 1 ? 1 : 16);
1197         bool was_locked = btree_node_locked(iter, iter->level);
1198         int ret = 0;
1199
1200         bch2_bkey_buf_init(&tmp);
1201
1202         while (nr && !ret) {
1203                 if (!bch2_btree_node_relock(iter, iter->level))
1204                         break;
1205
1206                 bch2_btree_node_iter_advance(&node_iter, l->b);
1207                 k = bch2_btree_node_iter_peek(&node_iter, l->b);
1208                 if (!k)
1209                         break;
1210
1211                 bch2_bkey_buf_unpack(&tmp, c, l->b, k);
1212                 ret = bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
1213                                                iter->level - 1);
1214         }
1215
1216         if (!was_locked)
1217                 btree_node_unlock(iter, iter->level);
1218
1219         bch2_bkey_buf_exit(&tmp, c);
1220         return ret;
1221 }
1222
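/*
 * Cache the in-memory address of the child node we just got in its parent's
 * btree_ptr_v2 key, so later lookups of this node can find it without a hash
 * table lookup:
 */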
1223 static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
1224                                             unsigned plevel, struct btree *b)
1225 {
1226         struct btree_iter_level *l = &iter->l[plevel];
1227         bool locked = btree_node_locked(iter, plevel);
1228         struct bkey_packed *k;
1229         struct bch_btree_ptr_v2 *bp;
1230
1231         if (!bch2_btree_node_relock(iter, plevel))
1232                 return;
1233
1234         k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1235         BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
1236
1237         bp = (void *) bkeyp_val(&l->b->format, k);
1238         bp->mem_ptr = (unsigned long)b;
1239
1240         if (!locked)
1241                 btree_node_unlock(iter, plevel);
1242 }
1243
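/*
 * Descend one level: read and lock the child node the iterator currently
 * points at, and make it the iterator's new current level:
 */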
1244 static __always_inline int btree_iter_down(struct btree_trans *trans,
1245                                            struct btree_iter *iter,
1246                                            unsigned long trace_ip)
1247 {
1248         struct bch_fs *c = trans->c;
1249         struct btree_iter_level *l = &iter->l[iter->level];
1250         struct btree *b;
1251         unsigned level = iter->level - 1;
1252         enum six_lock_type lock_type = __btree_lock_want(iter, level);
1253         struct bkey_buf tmp;
1254         int ret;
1255
1256         EBUG_ON(!btree_node_locked(iter, iter->level));
1257
1258         bch2_bkey_buf_init(&tmp);
1259         bch2_bkey_buf_unpack(&tmp, c, l->b,
1260                          bch2_btree_node_iter_peek(&l->iter, l->b));
1261
1262         b = bch2_btree_node_get(trans, iter, tmp.k, level, lock_type, trace_ip);
1263         ret = PTR_ERR_OR_ZERO(b);
1264         if (unlikely(ret))
1265                 goto err;
1266
1267         mark_btree_node_locked(iter, level, lock_type);
1268         btree_iter_node_set(iter, b);
1269
1270         if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
1271             unlikely(b != btree_node_mem_ptr(tmp.k)))
1272                 btree_node_mem_ptr_set(iter, level + 1, b);
1273
1274         if (iter->flags & BTREE_ITER_PREFETCH)
1275                 ret = btree_iter_prefetch(trans, iter);
1276
1277         if (btree_node_read_locked(iter, level + 1))
1278                 btree_node_unlock(iter, level + 1);
1279         iter->level = level;
1280
1281         bch2_btree_iter_verify_locks(trans, iter);
1282 err:
1283         bch2_bkey_buf_exit(&tmp, c);
1284         return ret;
1285 }
1286
1287 static int btree_iter_traverse_one(struct btree_trans *,
1288                         struct btree_iter *, unsigned long);
1289
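/*
 * Called when we fail to traverse or lock an iterator: unlock everything,
 * then re-traverse every iterator in sorted (lock ordering) order, so that
 * locks end up being taken in an order that can't deadlock:
 */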
1290 static int __btree_iter_traverse_all(struct btree_trans *trans, int ret,
1291                                      unsigned long trace_ip)
1292 {
1293         struct bch_fs *c = trans->c;
1294         struct btree_iter *iter, *prev = NULL;
1295         int i;
1296
1297         if (trans->in_traverse_all)
1298                 return -EINTR;
1299
1300         trans->in_traverse_all = true;
1301 retry_all:
1302         trans->restarted = false;
1303
1304         trans_for_each_iter(trans, iter)
1305                 iter->should_be_locked = false;
1306
1307         btree_trans_sort_iters(trans);
1308
1309         trans_for_each_iter_inorder_reverse(trans, iter, i) {
1310                 if (prev) {
1311                         if (iter->btree_id == prev->btree_id &&
1312                             iter->locks_want < prev->locks_want)
1313                                 __bch2_btree_iter_upgrade(iter, prev->locks_want);
1314                         else if (!iter->locks_want && prev->locks_want)
1315                                 __bch2_btree_iter_upgrade(iter, 1);
1316                 }
1317
1318                 prev = iter;
1319         }
1320
1321         bch2_trans_unlock(trans);
1322         cond_resched();
1323
1324         if (unlikely(ret == -ENOMEM)) {
1325                 struct closure cl;
1326
1327                 closure_init_stack(&cl);
1328
1329                 do {
1330                         ret = bch2_btree_cache_cannibalize_lock(c, &cl);
1331                         closure_sync(&cl);
1332                 } while (ret);
1333         }
1334
1335         if (unlikely(ret == -EIO)) {
1336                 trans->error = true;
1337                 goto out;
1338         }
1339
1340         BUG_ON(ret && ret != -EINTR);
1341
1342         /* Now, redo traversals in correct order: */
1343         i = 0;
1344         while (i < trans->nr_sorted) {
1345                 iter = trans->iters + trans->sorted[i];
1346
1347                 EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
1348
1349                 ret = btree_iter_traverse_one(trans, iter, _THIS_IP_);
1350                 if (ret)
1351                         goto retry_all;
1352
1353                 EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
1354
1355                 if (iter->nodes_locked)
1356                         i++;
1357         }
1358
1359         /*
1360          * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
1361          * and relock(), relock() won't relock since iter->should_be_locked
1362          * isn't set yet, which is all fine
1363          */
1364         trans_for_each_iter(trans, iter)
1365                 BUG_ON(iter->uptodate >= BTREE_ITER_NEED_TRAVERSE);
1366 out:
1367         bch2_btree_cache_cannibalize_unlock(c);
1368
1369         trans->in_traverse_all = false;
1370
1371         trace_trans_traverse_all(trans->ip, trace_ip);
1372         return ret;
1373 }
1374
1375 static int bch2_btree_iter_traverse_all(struct btree_trans *trans)
1376 {
1377         return __btree_iter_traverse_all(trans, 0, _RET_IP_);
1378 }
1379
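/*
 * Returns true if the node at level @l is still usable for this iterator's
 * position: it can be relocked, and (depending on which direction we're
 * moving, @check_pos) the position we're searching for is inside it:
 */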
1380 static inline bool btree_iter_good_node(struct btree_iter *iter,
1381                                         unsigned l, int check_pos)
1382 {
1383         if (!is_btree_node(iter, l) ||
1384             !bch2_btree_node_relock(iter, l))
1385                 return false;
1386
1387         if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
1388                 return false;
1389         if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
1390                 return false;
1391         return true;
1392 }
1393
1394 static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
1395                                                      int check_pos)
1396 {
1397         unsigned l = iter->level;
1398
1399         while (btree_iter_node(iter, l) &&
1400                !btree_iter_good_node(iter, l, check_pos)) {
1401                 btree_node_unlock(iter, l);
1402                 iter->l[l].b = BTREE_ITER_NO_NODE_UP;
1403                 l++;
1404         }
1405
1406         return l;
1407 }
1408
1409 /*
1410  * This is the main state machine for walking down the btree - walks down to a
1411  * specified depth
1412  *
1413  * Returns 0 on success, -EIO on error (error reading in a btree node).
1414  *
1415  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1416  * stashed in the iterator and returned from bch2_trans_exit().
1417  */
1418 static int btree_iter_traverse_one(struct btree_trans *trans,
1419                                    struct btree_iter *iter,
1420                                    unsigned long trace_ip)
1421 {
1422         unsigned l, depth_want = iter->level;
1423         int ret = 0;
1424
1425         /*
1426          * Ensure we obey iter->should_be_locked: if it's set, we can't unlock
1427          * and re-traverse the iterator without a transaction restart:
1428          */
1429         if (iter->should_be_locked) {
1430                 ret = bch2_btree_iter_relock(trans, iter, trace_ip) ? 0 : -EINTR;
1431                 goto out;
1432         }
1433
1434         if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
1435                 ret = bch2_btree_iter_traverse_cached(iter);
1436                 goto out;
1437         }
1438
1439         if (unlikely(iter->level >= BTREE_MAX_DEPTH))
1440                 goto out;
1441
1442         iter->level = btree_iter_up_until_good_node(iter, 0);
1443
1444         /* If we need intent locks, take them too: */
1445         for (l = iter->level + 1;
1446              l < iter->locks_want && btree_iter_node(iter, l);
1447              l++)
1448                 if (!bch2_btree_node_relock(iter, l))
1449                         while (iter->level <= l) {
1450                                 btree_node_unlock(iter, iter->level);
1451                                 iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
1452                                 iter->level++;
1453                         }
1454
1455         /*
1456          * Note: iter->l[iter->level].b may be temporarily NULL here - that
1457          * would indicate to other code that we got to the end of the btree,
1458          * here it indicates that relocking the root failed - it's critical that
1459          * btree_iter_lock_root() comes next and that it can't fail
1460          */
1461         while (iter->level > depth_want) {
1462                 ret = btree_iter_node(iter, iter->level)
1463                         ? btree_iter_down(trans, iter, trace_ip)
1464                         : btree_iter_lock_root(trans, iter, depth_want, trace_ip);
1465                 if (unlikely(ret)) {
1466                         if (ret == 1) {
1467                                 /*
1468                                  * Got to the end of the btree (in
1469                                  * BTREE_ITER_NODES mode)
1470                                  */
1471                                 ret = 0;
1472                                 goto out;
1473                         }
1474
1475                         __bch2_btree_iter_unlock(iter);
1476                         iter->level = depth_want;
1477
1478                         if (ret == -EIO) {
1479                                 iter->flags |= BTREE_ITER_ERROR;
1480                                 iter->l[iter->level].b =
1481                                         BTREE_ITER_NO_NODE_ERROR;
1482                         } else {
1483                                 iter->l[iter->level].b =
1484                                         BTREE_ITER_NO_NODE_DOWN;
1485                         }
1486                         goto out;
1487                 }
1488         }
1489
1490         iter->uptodate = BTREE_ITER_NEED_PEEK;
1491 out:
1492         BUG_ON((ret == -EINTR) != !!trans->restarted);
1493         trace_iter_traverse(trans->ip, trace_ip,
1494                             btree_iter_type(iter) == BTREE_ITER_CACHED,
1495                             iter->btree_id, &iter->real_pos, ret);
1496         bch2_btree_iter_verify(iter);
1497         return ret;
1498 }
1499
1500 static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
1501 {
1502         struct btree_trans *trans = iter->trans;
1503         int ret;
1504
1505         ret =   bch2_trans_cond_resched(trans) ?:
1506                 btree_iter_traverse_one(trans, iter, _RET_IP_);
1507         if (unlikely(ret) && hweight64(trans->iters_linked) == 1) {
1508                 ret = __btree_iter_traverse_all(trans, ret, _RET_IP_);
1509                 BUG_ON(ret == -EINTR);
1510         }
1511
1512         return ret;
1513 }
1514
1515 /*
1516  * Note:
1517  * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is
1518  * for internal btree iterator users
1519  *
1520  * bch2_btree_iter_traverse sets iter->real_pos to iter->pos,
1521  * btree_iter_traverse() does not:
1522  */
1523 static inline int __must_check
1524 btree_iter_traverse(struct btree_iter *iter)
1525 {
1526         return iter->uptodate >= BTREE_ITER_NEED_RELOCK
1527                 ? __bch2_btree_iter_traverse(iter)
1528                 : 0;
1529 }
1530
1531 int __must_check
1532 bch2_btree_iter_traverse(struct btree_iter *iter)
1533 {
1534         int ret;
1535
1536         btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
1537
1538         ret = btree_iter_traverse(iter);
1539         if (ret)
1540                 return ret;
1541
1542         iter->should_be_locked = true;
1543         return 0;
1544 }
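/*
 * Illustrative sketch (not compiled): a caller that only needs the iterator
 * positioned and locked - e.g. before emitting an update - can call
 * bch2_btree_iter_traverse() directly instead of one of the peek functions.
 * The helper below is hypothetical and assumes the bch2_trans_get_iter()
 * wrapper from btree_iter.h:
 */
#if 0
static int example_position_iter(struct btree_trans *trans,
				 enum btree_id id, struct bpos pos,
				 struct btree_iter **iter_out)
{
	struct btree_iter *iter =
		bch2_trans_get_iter(trans, id, pos, BTREE_ITER_INTENT);
	int ret = bch2_btree_iter_traverse(iter);

	if (ret) {
		bch2_trans_iter_put(trans, iter);
		return ret;	/* -EINTR means: restart the transaction */
	}

	*iter_out = iter;	/* iter->should_be_locked is now set */
	return 0;
}
#endif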
1545
1546 /* Iterate across nodes (leaf and interior nodes) */
1547
1548 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1549 {
1550         struct btree *b;
1551         int ret;
1552
1553         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
1554         bch2_btree_iter_verify(iter);
1555
1556         ret = btree_iter_traverse(iter);
1557         if (ret)
1558                 return NULL;
1559
1560         b = btree_iter_node(iter, iter->level);
1561         if (!b)
1562                 return NULL;
1563
1564         BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
1565
1566         iter->pos = iter->real_pos = b->key.k.p;
1567         iter->trans->iters_sorted = false;
1568
1569         bch2_btree_iter_verify(iter);
1570         iter->should_be_locked = true;
1571
1572         return b;
1573 }
1574
1575 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
1576 {
1577         struct btree *b;
1578         int ret;
1579
1580         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
1581         bch2_btree_iter_verify(iter);
1582
1583         /* already got to end? */
1584         if (!btree_iter_node(iter, iter->level))
1585                 return NULL;
1586
1587         bch2_trans_cond_resched(iter->trans);
1588
1589         btree_node_unlock(iter, iter->level);
1590         iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
1591         iter->level++;
1592
1593         btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1594         ret = btree_iter_traverse(iter);
1595         if (ret)
1596                 return NULL;
1597
1598         /* got to end? */
1599         b = btree_iter_node(iter, iter->level);
1600         if (!b)
1601                 return NULL;
1602
1603         if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
1604                 /*
1605                  * Haven't gotten to the end of the parent node: go back down to
1606                  * the next child node
1607                  */
1608                 btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
1609
1610                 /* Unlock to avoid screwing up our lock invariants: */
1611                 btree_node_unlock(iter, iter->level);
1612
1613                 iter->level = iter->min_depth;
1614                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1615                 bch2_btree_iter_verify(iter);
1616
1617                 ret = btree_iter_traverse(iter);
1618                 if (ret)
1619                         return NULL;
1620
1621                 b = iter->l[iter->level].b;
1622         }
1623
1624         iter->pos = iter->real_pos = b->key.k.p;
1625         iter->trans->iters_sorted = false;
1626
1627         bch2_btree_iter_verify(iter);
1628         iter->should_be_locked = true;
1629
1630         return b;
1631 }
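/*
 * Illustrative sketch (not compiled) of walking every node at a given depth
 * with the two helpers above; callers typically use the for_each_btree_node()
 * wrapper, but the underlying loop looks like this:
 */
#if 0
static void example_walk_nodes(struct btree_trans *trans,
			       enum btree_id id, unsigned depth)
{
	struct btree_iter *iter =
		bch2_trans_get_node_iter(trans, id, POS_MIN, 0, depth, 0);
	struct btree *b;

	for (b = bch2_btree_iter_peek_node(iter);
	     b;
	     b = bch2_btree_iter_next_node(iter)) {
		/* inspect @b: it stays locked until the next iteration */
	}

	bch2_trans_iter_put(trans, iter);
}
#endif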
1632
1633 /* Iterate across keys (in leaf nodes only) */
1634
1635 static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
1636 {
1637         struct btree_trans *trans = iter->trans;
1638 #ifdef CONFIG_BCACHEFS_DEBUG
1639         struct bpos old_pos = iter->real_pos;
1640 #endif
1641         int cmp = bpos_cmp(new_pos, iter->real_pos);
1642         unsigned l = iter->level;
1643
1644         EBUG_ON(trans->restarted);
1645
1646         if (!cmp)
1647                 goto out;
1648
1649         iter->real_pos = new_pos;
1650         iter->should_be_locked = false;
1651         trans->iters_sorted = false;
1652
1653         if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
1654                 btree_node_unlock(iter, 0);
1655                 iter->l[0].b = BTREE_ITER_NO_NODE_CACHED;
1656                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1657                 return;
1658         }
1659
1660         l = btree_iter_up_until_good_node(iter, cmp);
1661
1662         if (btree_iter_node(iter, l)) {
1663                 /*
1664                  * We might have to skip over many keys, or just a few: try
1665                  * advancing the node iterator, and if we have to skip over too
1666                  * many keys just reinit it (or if we're rewinding, since that
1667                  * is expensive).
1668                  */
1669                 if (cmp < 0 ||
1670                     !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
1671                         __btree_iter_init(iter, l);
1672
1673                 /* Don't leave it locked if we're not supposed to: */
1674                 if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
1675                         btree_node_unlock(iter, l);
1676         }
1677 out:
1678         if (l != iter->level)
1679                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1680         else
1681                 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1682
1683         bch2_btree_iter_verify(iter);
1684 #ifdef CONFIG_BCACHEFS_DEBUG
1685         trace_iter_set_search_pos(trans->ip, _RET_IP_,
1686                                   iter->btree_id,
1687                                   &old_pos, &new_pos, l);
1688 #endif
1689 }
1690
1691 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
1692 {
1693         struct bpos pos = iter->k.p;
1694         bool ret = bpos_cmp(pos, SPOS_MAX) != 0;
1695
1696         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1697                 pos = bkey_successor(iter, pos);
1698         bch2_btree_iter_set_pos(iter, pos);
1699         return ret;
1700 }
1701
1702 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
1703 {
1704         struct bpos pos = bkey_start_pos(&iter->k);
1705         bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
1706                     ? bpos_cmp(pos, POS_MIN)
1707                     : bkey_cmp(pos, POS_MIN)) != 0;
1708
1709         if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
1710                 pos = bkey_predecessor(iter, pos);
1711         bch2_btree_iter_set_pos(iter, pos);
1712         return ret;
1713 }
1714
1715 static noinline struct bkey_i *__btree_trans_peek_updates(struct btree_iter *iter)
1716 {
1717         struct btree_insert_entry *i;
1718         struct bkey_i *ret = NULL;
1719
1720         trans_for_each_update(iter->trans, i) {
1721                 if (i->btree_id < iter->btree_id)
1722                         continue;
1723                 if (i->btree_id > iter->btree_id)
1724                         break;
1725                 if (bpos_cmp(i->k->k.p, iter->real_pos) < 0)
1726                         continue;
1727                 if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0)
1728                         ret = i->k;
1729         }
1730
1731         return ret;
1732 }
1733
1734 static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
1735 {
1736         return iter->flags & BTREE_ITER_WITH_UPDATES
1737                 ? __btree_trans_peek_updates(iter)
1738                 : NULL;
1739 }
1740
1741 /**
1742  * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
1743  * current position
1744  */
1745 struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
1746 {
1747         struct btree_iter_level *l = &iter->l[0];
1748         struct bpos search_key = btree_iter_search_key(iter);
1749         struct bkey_i *next_update;
1750         struct bkey_s_c k;
1751         int ret;
1752
1753         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
1754         bch2_btree_iter_verify(iter);
1755         bch2_btree_iter_verify_entry_exit(iter);
1756
1757         while (1) {
1758                 btree_iter_set_search_pos(iter, search_key);
1759
1760                 ret = btree_iter_traverse(iter);
1761                 if (unlikely(ret)) {
1762                         /* ensure that iter->k is consistent with iter->pos: */
1763                         bch2_btree_iter_set_pos(iter, iter->pos);
1764                         k = bkey_s_c_err(ret);
1765                         goto out;
1766                 }
1767
1768                 next_update = btree_trans_peek_updates(iter);
1769                 k = btree_iter_level_peek_all(iter, l);
1770
1771                 /* In the btree, deleted keys sort before non-deleted keys: */
1772                 if (k.k && bkey_deleted(k.k) &&
1773                     (!next_update ||
1774                      bpos_cmp(k.k->p, next_update->k.p) <= 0)) {
1775                         search_key = k.k->p;
1776                         continue;
1777                 }
1778
1779                 if (next_update &&
1780                     bpos_cmp(next_update->k.p,
1781                              k.k ? k.k->p : l->b->key.k.p) <= 0) {
1782                         iter->k = next_update->k;
1783                         k = bkey_i_to_s_c(next_update);
1784                 }
1785
1786                 if (likely(k.k)) {
1787                         if (likely(!bkey_deleted(k.k)))
1788                                 break;
1789
1790                         /* Advance to next key: */
1791                         search_key = bkey_successor(iter, k.k->p);
1792                 } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) {
1793                         /* Advance to next leaf node: */
1794                         search_key = bpos_successor(l->b->key.k.p);
1795                 } else {
1796                         /* End of btree: */
1797                         bch2_btree_iter_set_pos(iter, SPOS_MAX);
1798                         iter->real_pos = SPOS_MAX;
1799                         k = bkey_s_c_null;
1800                         goto out;
1801                 }
1802         }
1803
1804         /*
1805          * iter->pos should be monotonically increasing, and always be equal to
1806          * the key we just returned - except extents can straddle iter->pos:
1807          */
1808         if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
1809                 iter->pos = k.k->p;
1810         else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1811                 iter->pos = bkey_start_pos(k.k);
1812         iter->real_pos = k.k->p;
1813 out:
1814         iter->should_be_locked = true;
1815         bch2_btree_iter_verify_entry_exit(iter);
1816         bch2_btree_iter_verify(iter);
1817         return k;
1818 }
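/*
 * Illustrative sketch (not compiled): iterating a range of keys with peek()
 * and next(). This is essentially what the for_each_btree_key() macro expands
 * to; -EINTR from bkey_err() means the transaction must be restarted. The
 * helper name is hypothetical:
 */
#if 0
static int example_scan_keys(struct btree_trans *trans, enum btree_id id,
			     struct bpos start, struct bpos end)
{
	struct btree_iter *iter =
		bch2_trans_get_iter(trans, id, start, BTREE_ITER_PREFETCH);
	struct bkey_s_c k;
	int ret = 0;

	for (k = bch2_btree_iter_peek(iter);
	     k.k && !(ret = bkey_err(k)) && bkey_cmp(k.k->p, end) < 0;
	     k = bch2_btree_iter_next(iter)) {
		/* use @k - valid until the next peek/next/traverse */
	}

	bch2_trans_iter_put(trans, iter);
	return ret;
}
#endif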
1819
1820 /**
1821  * bch2_btree_iter_next: returns first key greater than iterator's current
1822  * position
1823  */
1824 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
1825 {
1826         if (!bch2_btree_iter_advance(iter))
1827                 return bkey_s_c_null;
1828
1829         return bch2_btree_iter_peek(iter);
1830 }
1831
1832 /**
1833  * bch2_btree_iter_peek_prev: returns first key less than or equal to
1834  * iterator's current position
1835  */
1836 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
1837 {
1838         struct bpos search_key = iter->pos;
1839         struct btree_iter_level *l = &iter->l[0];
1840         struct bkey_s_c k;
1841         int ret;
1842
1843         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
1844         EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
1845         bch2_btree_iter_verify(iter);
1846         bch2_btree_iter_verify_entry_exit(iter);
1847
1848         while (1) {
1849                 btree_iter_set_search_pos(iter, search_key);
1850
1851                 ret = btree_iter_traverse(iter);
1852                 if (unlikely(ret)) {
1853                         /* ensure that iter->k is consistent with iter->pos: */
1854                         bch2_btree_iter_set_pos(iter, iter->pos);
1855                         k = bkey_s_c_err(ret);
1856                         goto out;
1857                 }
1858
1859                 k = btree_iter_level_peek(iter, l);
1860                 if (!k.k ||
1861                     ((iter->flags & BTREE_ITER_IS_EXTENTS)
1862                      ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
1863                      : bkey_cmp(k.k->p, iter->pos) > 0))
1864                         k = btree_iter_level_prev(iter, l);
1865
1866                 if (likely(k.k)) {
1867                         break;
1868                 } else if (likely(bpos_cmp(l->b->data->min_key, POS_MIN))) {
1869                         /* Advance to previous leaf node: */
1870                         search_key = bpos_predecessor(l->b->data->min_key);
1871                 } else {
1872                         /* Start of btree: */
1873                         bch2_btree_iter_set_pos(iter, POS_MIN);
1874                         k = bkey_s_c_null;
1875                         goto out;
1876                 }
1877         }
1878
1879         EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
1880
1881         /* Extents can straddle iter->pos: */
1882         if (bkey_cmp(k.k->p, iter->pos) < 0)
1883                 iter->pos = k.k->p;
1884 out:
1885         iter->should_be_locked = true;
1886         bch2_btree_iter_verify_entry_exit(iter);
1887         bch2_btree_iter_verify(iter);
1888         return k;
1889 }
1890
1891 /**
1892  * bch2_btree_iter_prev: returns first key less than iterator's current
1893  * position
1894  */
1895 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
1896 {
1897         if (!bch2_btree_iter_rewind(iter))
1898                 return bkey_s_c_null;
1899
1900         return bch2_btree_iter_peek_prev(iter);
1901 }
1902
1903 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
1904 {
1905         struct btree_trans *trans = iter->trans;
1906         struct bpos search_key;
1907         struct bkey_s_c k;
1908         int ret;
1909
1910         EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS &&
1911                 btree_iter_type(iter) != BTREE_ITER_CACHED);
1912         bch2_btree_iter_verify(iter);
1913         bch2_btree_iter_verify_entry_exit(iter);
1914
1915         /* extents can't span inode numbers: */
1916         if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
1917             unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
1918                 if (iter->pos.inode == KEY_INODE_MAX)
1919                         return bkey_s_c_null;
1920
1921                 bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
1922         }
1923
1924         search_key = btree_iter_search_key(iter);
1925         btree_iter_set_search_pos(iter, search_key);
1926
1927         ret = btree_iter_traverse(iter);
1928         if (unlikely(ret))
1929                 return bkey_s_c_err(ret);
1930
1931         if (btree_iter_type(iter) == BTREE_ITER_CACHED ||
1932             !(iter->flags & BTREE_ITER_IS_EXTENTS)) {
1933                 struct bkey_i *next_update;
1934                 struct bkey_cached *ck;
1935
1936                 next_update = btree_trans_peek_updates(iter);
1937
1938                 switch (btree_iter_type(iter)) {
1939                 case BTREE_ITER_KEYS:
1940                         k = btree_iter_level_peek_all(iter, &iter->l[0]);
1941                         EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0);
1942                         break;
1943                 case BTREE_ITER_CACHED:
1944                         ck = (void *) iter->l[0].b;
1945                         EBUG_ON(iter->btree_id != ck->key.btree_id ||
1946                                 bkey_cmp(iter->pos, ck->key.pos));
1947                         BUG_ON(!ck->valid);
1948
1949                         k = bkey_i_to_s_c(ck->k);
1950                         break;
1951                 case BTREE_ITER_NODES:
1952                         BUG();
1953                 }
1954
1955                 if (next_update &&
1956                     (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) {
1957                         iter->k = next_update->k;
1958                         k = bkey_i_to_s_c(next_update);
1959                 }
1960
1961                 if (!k.k ||
1962                     ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
1963                      ? bpos_cmp(iter->pos, k.k->p)
1964                      : bkey_cmp(iter->pos, k.k->p))) {
1965                         bkey_init(&iter->k);
1966                         iter->k.p = iter->pos;
1967                         k = (struct bkey_s_c) { &iter->k, NULL };
1968                 }
1969         } else {
1970                 struct bpos next;
1971
1972                 if (iter->flags & BTREE_ITER_INTENT) {
1973                         struct btree_iter *child =
1974                                 btree_iter_child_alloc(trans, iter, _THIS_IP_);
1975
1976                         btree_iter_copy(trans, child, iter);
1977                         k = bch2_btree_iter_peek(child);
1978
1979                         if (k.k && !bkey_err(k))
1980                                 iter->k = child->k;
1981                 } else {
1982                         struct bpos pos = iter->pos;
1983
1984                         k = bch2_btree_iter_peek(iter);
1985                         iter->pos = pos;
1986                 }
1987
1988                 if (unlikely(bkey_err(k)))
1989                         return k;
1990
1991                 next = k.k ? bkey_start_pos(k.k) : POS_MAX;
1992
1993                 if (bkey_cmp(iter->pos, next) < 0) {
1994                         bkey_init(&iter->k);
1995                         iter->k.p = iter->pos;
1996                         bch2_key_resize(&iter->k,
1997                                         min_t(u64, KEY_SIZE_MAX,
1998                                               (next.inode == iter->pos.inode
1999                                                ? next.offset
2000                                                : KEY_OFFSET_MAX) -
2001                                               iter->pos.offset));
2002
2003                         k = (struct bkey_s_c) { &iter->k, NULL };
2004                         EBUG_ON(!k.k->size);
2005                 }
2006         }
2007
2008         bch2_btree_iter_verify_entry_exit(iter);
2009         bch2_btree_iter_verify(iter);
2010         iter->should_be_locked = true;
2011
2012         return k;
2013 }
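/*
 * Illustrative sketch (not compiled): unlike peek(), peek_slot() returns a
 * key for the iterator's exact position even when nothing is stored there -
 * k.k is then a synthesized deleted key (or, for extents, a hole extending
 * to the next key). A hypothetical point lookup:
 */
#if 0
static int example_lookup(struct btree_trans *trans,
			  enum btree_id id, struct bpos pos, bool *found)
{
	struct btree_iter *iter = bch2_trans_get_iter(trans, id, pos, 0);
	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
	int ret = bkey_err(k);

	if (!ret)
		*found = k.k && !bkey_deleted(k.k);

	bch2_trans_iter_put(trans, iter);
	return ret;
}
#endif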
2014
2015 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
2016 {
2017         if (!bch2_btree_iter_advance(iter))
2018                 return bkey_s_c_null;
2019
2020         return bch2_btree_iter_peek_slot(iter);
2021 }
2022
2023 struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
2024 {
2025         if (!bch2_btree_iter_rewind(iter))
2026                 return bkey_s_c_null;
2027
2028         return bch2_btree_iter_peek_slot(iter);
2029 }
2030
2031 static inline void bch2_btree_iter_init(struct btree_trans *trans,
2032                         struct btree_iter *iter, enum btree_id btree_id)
2033 {
2034         struct bch_fs *c = trans->c;
2035         unsigned i;
2036
2037         iter->trans                     = trans;
2038         iter->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
2039         iter->btree_id                  = btree_id;
2040         iter->real_pos                  = POS_MIN;
2041         iter->level                     = 0;
2042         iter->min_depth                 = 0;
2043         iter->locks_want                = 0;
2044         iter->nodes_locked              = 0;
2045         iter->nodes_intent_locked       = 0;
2046         for (i = 0; i < ARRAY_SIZE(iter->l); i++)
2047                 iter->l[i].b            = BTREE_ITER_NO_NODE_INIT;
2048
2049         prefetch(c->btree_roots[btree_id].b);
2050 }
2051
2052 /* new transactional stuff: */
2053
2054 #ifdef CONFIG_BCACHEFS_DEBUG
2055 static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
2056 {
2057         struct btree_iter *iter;
2058         unsigned i;
2059
2060         BUG_ON(trans->nr_sorted != hweight64(trans->iters_linked));
2061
2062         trans_for_each_iter(trans, iter) {
2063                 BUG_ON(iter->sorted_idx >= trans->nr_sorted);
2064                 BUG_ON(trans->sorted[iter->sorted_idx] != iter->idx);
2065         }
2066
2067         for (i = 0; i < trans->nr_sorted; i++) {
2068                 unsigned idx = trans->sorted[i];
2069
2070                 EBUG_ON(!(trans->iters_linked & (1ULL << idx)));
2071                 BUG_ON(trans->iters[idx].sorted_idx != i);
2072         }
2073 }
2074 #else
2075 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {}
2076 #endif
2077
2078 static void btree_trans_verify_sorted(struct btree_trans *trans)
2079 {
2080 #ifdef CONFIG_BCACHEFS_DEBUG
2081         struct btree_iter *iter, *prev = NULL;
2082         unsigned i;
2083
2084         trans_for_each_iter_inorder(trans, iter, i) {
2085                 BUG_ON(prev && btree_iter_cmp(prev, iter) > 0);
2086                 prev = iter;
2087         }
2088 #endif
2089 }
2090
2091 static noinline void __btree_trans_sort_iters(struct btree_trans *trans)
2092 {
2093         int i, l = 0, r = trans->nr_sorted, inc = 1;
2094         bool swapped;
2095
2096         /*
2097          * Cocktail shaker sort: this is efficient because iterators will be
2098          * mostly sorted.
2099          */
2100         do {
2101                 swapped = false;
2102
2103                 for (i = inc > 0 ? l : r - 2;
2104                      i + 1 < r && i >= l;
2105                      i += inc) {
2106                         if (btree_iter_cmp(trans->iters + trans->sorted[i],
2107                                            trans->iters + trans->sorted[i + 1]) > 0) {
2108                                 swap(trans->sorted[i], trans->sorted[i + 1]);
2109                                 trans->iters[trans->sorted[i]].sorted_idx = i;
2110                                 trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1;
2111                                 swapped = true;
2112                         }
2113                 }
2114
2115                 if (inc > 0)
2116                         --r;
2117                 else
2118                         l++;
2119                 inc = -inc;
2120         } while (swapped);
2121
2122         trans->iters_sorted = true;
2123
2124         btree_trans_verify_sorted(trans);
2125 }
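/*
 * For reference, the same cocktail shaker pattern on a plain array
 * (illustrative only, not compiled): alternating forward and backward bubble
 * passes, so elements that are only slightly out of place reach their slots
 * quickly when the input is nearly sorted:
 */
#if 0
static void example_shaker_sort(int *a, int n)
{
	int i, l = 0, r = n, inc = 1;
	bool swapped;

	do {
		swapped = false;
		for (i = inc > 0 ? l : r - 2; i + 1 < r && i >= l; i += inc)
			if (a[i] > a[i + 1]) {
				swap(a[i], a[i + 1]);
				swapped = true;
			}
		if (inc > 0)
			--r;
		else
			l++;
		inc = -inc;
	} while (swapped);
}
#endif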
2126
2127 static inline void btree_trans_sort_iters(struct btree_trans *trans)
2128 {
2129         btree_trans_verify_sorted_refs(trans);
2130
2131         if (trans->iters_sorted) {
2132                 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
2133                         btree_trans_verify_sorted(trans);
2134                 return;
2135         }
2136         __btree_trans_sort_iters(trans);
2137 }
2138
2139 static inline void btree_iter_list_remove(struct btree_trans *trans,
2140                                           struct btree_iter *iter)
2141 {
2142         unsigned i;
2143
2144         EBUG_ON(iter->sorted_idx >= trans->nr_sorted);
2145 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
2146         trans->nr_sorted--;
2147         memmove_u64s_down_small(trans->sorted + iter->sorted_idx,
2148                                 trans->sorted + iter->sorted_idx + 1,
2149                                 DIV_ROUND_UP(trans->nr_sorted - iter->sorted_idx, 8));
2150 #else
2151         array_remove_item(trans->sorted, trans->nr_sorted, iter->sorted_idx);
2152 #endif
2153         for (i = iter->sorted_idx; i < trans->nr_sorted; i++)
2154                 trans->iters[trans->sorted[i]].sorted_idx = i;
2155
2156         iter->sorted_idx = U8_MAX;
2157 }
2158
2159 static inline void btree_iter_list_add(struct btree_trans *trans,
2160                                        struct btree_iter *pos,
2161                                        struct btree_iter *iter)
2162 {
2163         unsigned i;
2164
2165         iter->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
2166
2167 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
2168         memmove_u64s_up_small(trans->sorted + iter->sorted_idx + 1,
2169                               trans->sorted + iter->sorted_idx,
2170                               DIV_ROUND_UP(trans->nr_sorted - iter->sorted_idx, 8));
2171         trans->nr_sorted++;
2172         trans->sorted[iter->sorted_idx] = iter->idx;
2173 #else
2174         array_insert_item(trans->sorted, trans->nr_sorted, iter->sorted_idx, iter->idx);
2175 #endif
2176
2177         for (i = iter->sorted_idx; i < trans->nr_sorted; i++)
2178                 trans->iters[trans->sorted[i]].sorted_idx = i;
2179
2180         btree_trans_verify_sorted_refs(trans);
2181 }
2182
2183 static void btree_iter_child_free(struct btree_trans *trans, struct btree_iter *iter)
2184 {
2185         struct btree_iter *child = btree_iter_child(trans, iter);
2186
2187         if (child) {
2188                 bch2_trans_iter_free(trans, child);
2189                 iter->child_idx = U8_MAX;
2190         }
2191 }
2192
2193 static struct btree_iter *btree_iter_child_alloc(struct btree_trans *trans,
2194                                                  struct btree_iter *iter,
2195                                                  unsigned long ip)
2196 {
2197         struct btree_iter *child = btree_iter_child(trans, iter);
2198
2199         if (!child) {
2200                 child = btree_trans_iter_alloc(trans, iter);
2201                 child->ip_allocated     = ip;
2202                 iter->child_idx         = child->idx;
2203
2204                 trans->iters_live       |= 1ULL << child->idx;
2205                 trans->iters_touched    |= 1ULL << child->idx;
2206         }
2207
2208         return child;
2209 }
2210
2211 static inline void __bch2_trans_iter_free(struct btree_trans *trans,
2212                                           unsigned idx)
2213 {
2214         btree_iter_child_free(trans, &trans->iters[idx]);
2215
2216         btree_iter_list_remove(trans, &trans->iters[idx]);
2217
2218         __bch2_btree_iter_unlock(&trans->iters[idx]);
2219         trans->iters_linked             &= ~(1ULL << idx);
2220         trans->iters_live               &= ~(1ULL << idx);
2221         trans->iters_touched            &= ~(1ULL << idx);
2222
2223         btree_trans_verify_sorted_refs(trans);
2224 }
2225
2226 static bool have_iter_at_pos(struct btree_trans *trans,
2227                              struct btree_iter *iter)
2228 {
2229         struct btree_iter *n;
2230
2231         n = prev_btree_iter(trans, iter);
2232         if (n && !btree_iter_cmp(n, iter))
2233                 return true;
2234
2235         n = next_btree_iter(trans, iter);
2236         if (n && !btree_iter_cmp(n, iter))
2237                 return true;
2238
2239         return false;
2240 }
2241
2242 int bch2_trans_iter_put(struct btree_trans *trans,
2243                         struct btree_iter *iter)
2244 {
2245         int ret;
2246
2247         if (IS_ERR_OR_NULL(iter))
2248                 return 0;
2249
2250         BUG_ON(trans->iters + iter->idx != iter);
2251         BUG_ON(!btree_iter_live(trans, iter));
2252
2253         ret = btree_iter_err(iter);
2254
2255         if (!(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) &&
2256             (!(trans->iters_touched & (1ULL << iter->idx)) ||
2257              have_iter_at_pos(trans, iter)))
2258                 __bch2_trans_iter_free(trans, iter->idx);
2259
2260         trans->iters_live       &= ~(1ULL << iter->idx);
2261         return ret;
2262 }
2263
2264 int bch2_trans_iter_free(struct btree_trans *trans,
2265                          struct btree_iter *iter)
2266 {
2267         if (IS_ERR_OR_NULL(iter))
2268                 return 0;
2269
2270         set_btree_iter_dontneed(trans, iter);
2271
2272         return bch2_trans_iter_put(trans, iter);
2273 }
2274
2275 noinline __cold
2276 void bch2_dump_trans_iters_updates(struct btree_trans *trans)
2277 {
2278         struct btree_iter *iter;
2279         struct btree_insert_entry *i;
2280         unsigned idx;
2281         char buf1[300], buf2[100];
2282
2283         btree_trans_sort_iters(trans);
2284
2285         trans_for_each_iter_inorder(trans, iter, idx)
2286                 printk(KERN_ERR "iter: btree %s pos %s real_pos %s%s%s%s %pS\n",
2287                        bch2_btree_ids[iter->btree_id],
2288                        (bch2_bpos_to_text(&PBUF(buf1), iter->pos), buf1),
2289                        (bch2_bpos_to_text(&PBUF(buf2), iter->real_pos), buf2),
2290                        btree_iter_live(trans, iter) ? " live" : "",
2291                        (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
2292                        iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
2293                        (void *) iter->ip_allocated);
2294
2295         trans_for_each_update(trans, i)
2296                 printk(KERN_ERR "update: btree %s %s %pS\n",
2297                        bch2_btree_ids[i->btree_id],
2298                        (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, bkey_i_to_s_c(i->k)), buf1),
2299                        (void *) i->ip_allocated);
2300 }
2301
2302 static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans,
2303                                                  struct btree_iter *pos)
2304 {
2305         struct btree_iter *iter;
2306         unsigned idx;
2307
2308         btree_trans_verify_sorted_refs(trans);
2309
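	/* Mask of all BTREE_ITER_MAX low bits; the double shift avoids shifting a u64 by 64: */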
2310         if (unlikely(trans->iters_linked ==
2311                      ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) {
2312                 bch2_dump_trans_iters_updates(trans);
2313                 panic("trans iter overflow\n");
2314         }
2315
2316         idx = __ffs64(~trans->iters_linked);
2317         iter = &trans->iters[idx];
2318
2319         iter->trans             = trans;
2320         iter->idx               = idx;
2321         iter->child_idx         = U8_MAX;
2322         iter->sorted_idx        = U8_MAX;
2323         iter->flags             = 0;
2324         iter->nodes_locked      = 0;
2325         iter->nodes_intent_locked = 0;
2326         trans->iters_linked     |= 1ULL << idx;
2327
2328         btree_iter_list_add(trans, pos, iter);
2329         return iter;
2330 }
2331
2332 static void btree_iter_copy(struct btree_trans *trans, struct btree_iter *dst,
2333                             struct btree_iter *src)
2334 {
2335         unsigned i, offset = offsetof(struct btree_iter, flags);
2336
2337         __bch2_btree_iter_unlock(dst);
2338         btree_iter_child_free(trans, dst);
2339
2340         memcpy((void *) dst + offset,
2341                (void *) src + offset,
2342                sizeof(struct btree_iter) - offset);
2343
2344         for (i = 0; i < BTREE_MAX_DEPTH; i++)
2345                 if (btree_node_locked(dst, i))
2346                         six_lock_increment(&dst->l[i].b->c.lock,
2347                                            __btree_lock_want(dst, i));
2348
2349         dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
2350         dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
2351         trans->iters_sorted = false;
2352 }
2353
2354 struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
2355                                          enum btree_id btree_id, struct bpos pos,
2356                                          unsigned locks_want,
2357                                          unsigned depth,
2358                                          unsigned flags)
2359 {
2360         struct btree_iter *iter, *best = NULL;
2361         struct bpos real_pos, pos_min = POS_MIN;
2362
2363         EBUG_ON(trans->restarted);
2364
2365         if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
2366             btree_node_type_is_extents(btree_id) &&
2367             !(flags & BTREE_ITER_NOT_EXTENTS) &&
2368             !(flags & BTREE_ITER_ALL_SNAPSHOTS))
2369                 flags |= BTREE_ITER_IS_EXTENTS;
2370
2371         if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
2372             !btree_type_has_snapshots(btree_id))
2373                 flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
2374
2375         if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
2376                 pos.snapshot = btree_type_has_snapshots(btree_id)
2377                         ? U32_MAX : 0;
2378
2379         real_pos = pos;
2380
2381         if ((flags & BTREE_ITER_IS_EXTENTS) &&
2382             bkey_cmp(pos, POS_MAX))
2383                 real_pos = bpos_nosnap_successor(pos);
2384
2385         trans_for_each_iter(trans, iter) {
2386                 if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
2387                         continue;
2388
2389                 if (iter->btree_id != btree_id)
2390                         continue;
2391
2392                 if (best) {
2393                         int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos),
2394                                            bpos_diff(iter->real_pos, real_pos));
2395
2396                         if (cmp < 0 ||
2397                             ((cmp == 0 && btree_iter_keep(trans, iter))))
2398                                 continue;
2399                 }
2400
2401                 best = iter;
2402         }
2403
2404         if (!best) {
2405                 iter = btree_trans_iter_alloc(trans, best);
2406                 bch2_btree_iter_init(trans, iter, btree_id);
2407         } else if (btree_iter_keep(trans, best)) {
2408                 iter = btree_trans_iter_alloc(trans, best);
2409                 btree_iter_copy(trans, iter, best);
2410         } else {
2411                 iter = best;
2412         }
2413
2414         trans->iters_live       |= 1ULL << iter->idx;
2415         trans->iters_touched    |= 1ULL << iter->idx;
2416
2417         iter->flags = flags;
2418
2419         iter->snapshot = pos.snapshot;
2420
2421         /*
2422          * If the iterator has locks_want greater than requested, we explicitly
2423          * do not downgrade it here - on transaction restart because btree node
2424          * split needs to upgrade locks, we might be putting/getting the
2425          * iterator again. Downgrading iterators only happens via an explicit
2426          * bch2_trans_downgrade().
2427          */
2428
2429         locks_want = min(locks_want, BTREE_MAX_DEPTH);
2430         if (locks_want > iter->locks_want) {
2431                 iter->locks_want = locks_want;
2432                 btree_iter_get_locks(trans, iter, true, _THIS_IP_);
2433         }
2434
2435         while (iter->level != depth) {
2436                 btree_node_unlock(iter, iter->level);
2437                 iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
2438                 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
2439                 if (iter->level < depth)
2440                         iter->level++;
2441                 else
2442                         iter->level--;
2443         }
2444
2445         iter->min_depth = depth;
2446
2447         bch2_btree_iter_set_pos(iter, pos);
2448         btree_iter_set_search_pos(iter, real_pos);
2449
2450         trace_trans_get_iter(_RET_IP_, trans->ip,
2451                              btree_id,
2452                              &real_pos, locks_want, iter->uptodate,
2453                              best ? &best->real_pos     : &pos_min,
2454                              best ? best->locks_want    : U8_MAX,
2455                              best ? best->uptodate      : U8_MAX);
2456
2457         return iter;
2458 }
2459
2460 struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
2461                                             enum btree_id btree_id,
2462                                             struct bpos pos,
2463                                             unsigned locks_want,
2464                                             unsigned depth,
2465                                             unsigned flags)
2466 {
2467         struct btree_iter *iter =
2468                 __bch2_trans_get_iter(trans, btree_id, pos,
2469                                       locks_want, depth,
2470                                       BTREE_ITER_NODES|
2471                                       BTREE_ITER_NOT_EXTENTS|
2472                                       BTREE_ITER_ALL_SNAPSHOTS|
2473                                       flags);
2474
2475         BUG_ON(bkey_cmp(iter->pos, pos));
2476         BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
2477         BUG_ON(iter->level      != depth);
2478         BUG_ON(iter->min_depth  != depth);
2479         iter->ip_allocated = _RET_IP_;
2480
2481         return iter;
2482 }
2483
2484 struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
2485                                           struct btree_iter *src)
2486 {
2487         struct btree_iter *iter;
2488
2489         iter = btree_trans_iter_alloc(trans, src);
2490         btree_iter_copy(trans, iter, src);
2491
2492         trans->iters_live |= 1ULL << iter->idx;
2493         /*
2494          * We don't need to preserve this iter since it's cheap to copy it
2495          * again - this will cause trans_iter_put() to free it right away:
2496          */
2497         set_btree_iter_dontneed(trans, iter);
2498
2499         return iter;
2500 }
2501
2502 void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
2503 {
2504         size_t new_top = trans->mem_top + size;
2505         void *p;
2506
2507         if (new_top > trans->mem_bytes) {
2508                 size_t old_bytes = trans->mem_bytes;
2509                 size_t new_bytes = roundup_pow_of_two(new_top);
2510                 void *new_mem;
2511
2512                 WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
2513
2514                 new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
2515                 if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
2516                         new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
2517                         new_bytes = BTREE_TRANS_MEM_MAX;
2518                         kfree(trans->mem);
2519                 }
2520
2521                 if (!new_mem)
2522                         return ERR_PTR(-ENOMEM);
2523
2524                 trans->mem = new_mem;
2525                 trans->mem_bytes = new_bytes;
2526
2527                 if (old_bytes) {
2528                         trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes);
2529                         btree_trans_restart(trans);
2530                         return ERR_PTR(-EINTR);
2531                 }
2532         }
2533
2534         p = trans->mem + trans->mem_top;
2535         trans->mem_top += size;
2536         memset(p, 0, size);
2537         return p;
2538 }
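/*
 * Illustrative sketch (not compiled): transaction memory is bump-allocated
 * and lives until the next bch2_trans_begin()/bch2_trans_exit(); if the
 * buffer had to be grown, the allocation succeeds on the retry after the
 * -EINTR restart. Hypothetical caller:
 */
#if 0
static struct bkey_i *example_alloc_key(struct btree_trans *trans)
{
	struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));

	if (IS_ERR(k))
		return k;	/* -EINTR: restart transaction; -ENOMEM: fail */

	bkey_init(&k->k);
	return k;
}
#endif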
2539
2540 inline void bch2_trans_unlink_iters(struct btree_trans *trans)
2541 {
2542         u64 iters = trans->iters_linked &
2543                 ~trans->iters_touched &
2544                 ~trans->iters_live;
2545
2546         while (iters) {
2547                 unsigned idx = __ffs64(iters);
2548
2549                 iters &= ~(1ULL << idx);
2550                 __bch2_trans_iter_free(trans, idx);
2551         }
2552 }
2553
2554 /**
2555  * bch2_trans_begin() - reset a transaction after an interrupted attempt
2556  * @trans: transaction to reset
2557  *
2558  * While iterating over nodes or updating nodes, an attempt to lock a btree
2559  * node may return EINTR when the trylock fails. When this occurs,
2560  * bch2_trans_begin() should be called and the transaction retried.
2561  */
2562 void bch2_trans_begin(struct btree_trans *trans)
2563 {
2564         struct btree_iter *iter;
2565
2566         trans_for_each_iter(trans, iter)
2567                 iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
2568                                  BTREE_ITER_SET_POS_AFTER_COMMIT);
2569
2570         /*
2571          * XXX: we shouldn't be doing this if the transaction was restarted, but
2572          * currently we still overflow transaction iterators if we do that
2573          */
2574         bch2_trans_unlink_iters(trans);
2575         trans->iters_touched &= trans->iters_live;
2576
2577         trans->extra_journal_res        = 0;
2578         trans->nr_updates               = 0;
2579         trans->mem_top                  = 0;
2580
2581         trans->hooks                    = NULL;
2582         trans->extra_journal_entries    = NULL;
2583         trans->extra_journal_entry_u64s = 0;
2584
2585         if (trans->fs_usage_deltas) {
2586                 trans->fs_usage_deltas->used = 0;
2587                 memset((void *) trans->fs_usage_deltas +
2588                        offsetof(struct replicas_delta_list, memset_start), 0,
2589                        (void *) &trans->fs_usage_deltas->memset_end -
2590                        (void *) &trans->fs_usage_deltas->memset_start);
2591         }
2592
2593         bch2_trans_cond_resched(trans);
2594
2595         if (trans->restarted)
2596                 bch2_btree_iter_traverse_all(trans);
2597
2598         trans->restarted = false;
2599 }
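/*
 * Illustrative sketch (not compiled) of the retry loop described above;
 * callers normally go through helpers such as bch2_trans_do(), but the
 * underlying pattern is:
 */
#if 0
static int example_retry_loop(struct bch_fs *c)
{
	struct btree_trans trans;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
	do {
		bch2_trans_begin(&trans);

		ret = 0; /* ... do btree work here, setting ret ... */
	} while (ret == -EINTR);

	return bch2_trans_exit(&trans) ?: ret;
}
#endif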
2600
2601 static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
2602 {
2603         size_t iters_bytes      = sizeof(struct btree_iter) * BTREE_ITER_MAX;
2604         size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
2605         void *p = NULL;
2606
2607         BUG_ON(trans->used_mempool);
2608
2609 #ifdef __KERNEL__
2610         p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
2611 #endif
2612         if (!p)
2613                 p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
2614
2615         trans->iters            = p; p += iters_bytes;
2616         trans->updates          = p; p += updates_bytes;
2617 }
2618
2619 void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
2620                      unsigned expected_nr_iters,
2621                      size_t expected_mem_bytes)
2622         __acquires(&c->btree_trans_barrier)
2623 {
2624         memset(trans, 0, sizeof(*trans));
2625         trans->c                = c;
2626         trans->ip               = _RET_IP_;
2627
2628         /*
2629          * reallocating iterators currently completely breaks
2630          * bch2_trans_iter_put(), we always allocate the max:
2631          */
2632         bch2_trans_alloc_iters(trans, c);
2633
2634         if (expected_mem_bytes) {
2635                 expected_mem_bytes = roundup_pow_of_two(expected_mem_bytes);
2636                 trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
2637
2638                 if (unlikely(!trans->mem)) {
2639                         trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
2640                         trans->mem_bytes = BTREE_TRANS_MEM_MAX;
2641                 } else {
2642                         trans->mem_bytes = expected_mem_bytes;
2643                 }
2644         }
2645
2646         trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
2647
2648 #ifdef CONFIG_BCACHEFS_DEBUG
2649         trans->pid = current->pid;
2650         mutex_lock(&c->btree_trans_lock);
2651         list_add(&trans->list, &c->btree_trans_list);
2652         mutex_unlock(&c->btree_trans_lock);
2653 #endif
2654 }
2655
2656 int bch2_trans_exit(struct btree_trans *trans)
2657         __releases(&c->btree_trans_barrier)
2658 {
2659         struct bch_fs *c = trans->c;
2660
2661         bch2_trans_unlock(trans);
2662
2663 #ifdef CONFIG_BCACHEFS_DEBUG
2664         if (trans->iters_live) {
2665                 struct btree_iter *iter;
2666
2667                 trans_for_each_iter(trans, iter)
2668                         btree_iter_child_free(trans, iter);
2669         }
2670
2671         if (trans->iters_live) {
2672                 struct btree_iter *iter;
2673
2674                 bch_err(c, "btree iterators leaked!");
2675                 trans_for_each_iter(trans, iter)
2676                         if (btree_iter_live(trans, iter))
2677                                 printk(KERN_ERR "  btree %s allocated at %pS\n",
2678                                        bch2_btree_ids[iter->btree_id],
2679                                        (void *) iter->ip_allocated);
2680                 /* Be noisy about this: */
2681                 bch2_fatal_error(c);
2682         }
2683
2684         mutex_lock(&trans->c->btree_trans_lock);
2685         list_del(&trans->list);
2686         mutex_unlock(&trans->c->btree_trans_lock);
2687 #endif
2688
2689         srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
2690
2691         bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
2692
2693         if (trans->fs_usage_deltas) {
2694                 if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
2695                     REPLICAS_DELTA_LIST_MAX)
2696                         mempool_free(trans->fs_usage_deltas,
2697                                      &trans->c->replicas_delta_pool);
2698                 else
2699                         kfree(trans->fs_usage_deltas);
2700         }
2701
2702         if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
2703                 mempool_free(trans->mem, &trans->c->btree_trans_mem_pool);
2704         else
2705                 kfree(trans->mem);
2706
2707 #ifdef __KERNEL__
2708         /*
2709          * Userspace doesn't have a real percpu implementation:
2710          */
2711         trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
2712 #endif
2713
2714         if (trans->iters)
2715                 mempool_free(trans->iters, &trans->c->btree_iters_pool);
2716
2717         trans->mem      = (void *) 0x1;
2718         trans->iters    = (void *) 0x1;
2719
2720         return trans->error ? -EIO : 0;
2721 }
2722
2723 static void __maybe_unused
2724 bch2_btree_iter_node_to_text(struct printbuf *out,
2725                              struct btree_bkey_cached_common *_b,
2726                              enum btree_iter_type type)
2727 {
2728         pr_buf(out, "    l=%u %s:",
2729                _b->level, bch2_btree_ids[_b->btree_id]);
2730         bch2_bpos_to_text(out, btree_node_pos(_b, type));
2731 }
2732
2733 #ifdef CONFIG_BCACHEFS_DEBUG
2734 static bool trans_has_btree_nodes_locked(struct btree_trans *trans)
2735 {
2736         struct btree_iter *iter;
2737
2738         trans_for_each_iter(trans, iter)
2739                 if (btree_iter_type(iter) != BTREE_ITER_CACHED &&
2740                     iter->nodes_locked)
2741                         return true;
2742         return false;
2743 }
2744 #endif
2745
2746 void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
2747 {
2748 #ifdef CONFIG_BCACHEFS_DEBUG
2749         struct btree_trans *trans;
2750         struct btree_iter *iter;
2751         struct btree *b;
2752         unsigned l;
2753
2754         mutex_lock(&c->btree_trans_lock);
2755         list_for_each_entry(trans, &c->btree_trans_list, list) {
2756                 if (!trans_has_btree_nodes_locked(trans))
2757                         continue;
2758
2759                 pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
2760
2761                 trans_for_each_iter(trans, iter) {
2762                         if (!iter->nodes_locked)
2763                                 continue;
2764
2765                         pr_buf(out, "  iter %u %c %s:",
2766                                iter->idx,
2767                                btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
2768                                bch2_btree_ids[iter->btree_id]);
2769                         bch2_bpos_to_text(out, iter->pos);
2770                         pr_buf(out, "\n");
2771
2772                         for (l = 0; l < BTREE_MAX_DEPTH; l++) {
2773                                 if (btree_node_locked(iter, l)) {
2774                                         pr_buf(out, "    %s l=%u ",
2775                                                btree_node_intent_locked(iter, l) ? "i" : "r", l);
2776                                         bch2_btree_iter_node_to_text(out,
2777                                                         (void *) iter->l[l].b,
2778                                                         btree_iter_type(iter));
2779                                         pr_buf(out, "\n");
2780                                 }
2781                         }
2782                 }
2783
2784                 b = READ_ONCE(trans->locking);
2785                 if (b) {
2786                         iter = &trans->iters[trans->locking_iter_idx];
2787                         pr_buf(out, "  locking iter %u %c l=%u %s:",
2788                                trans->locking_iter_idx,
2789                                btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
2790                                trans->locking_level,
2791                                bch2_btree_ids[trans->locking_btree_id]);
2792                         bch2_bpos_to_text(out, trans->locking_pos);
2793
2794                         pr_buf(out, " node ");
2795                         bch2_btree_iter_node_to_text(out,
2796                                         (void *) b,
2797                                         btree_iter_type(iter));
2798                         pr_buf(out, "\n");
2799                 }
2800         }
2801         mutex_unlock(&c->btree_trans_lock);
2802 #endif
2803 }
2804
2805 void bch2_fs_btree_iter_exit(struct bch_fs *c)
2806 {
2807         mempool_exit(&c->btree_trans_mem_pool);
2808         mempool_exit(&c->btree_iters_pool);
2809         cleanup_srcu_struct(&c->btree_trans_barrier);
2810 }
2811
2812 int bch2_fs_btree_iter_init(struct bch_fs *c)
2813 {
2814         unsigned nr = BTREE_ITER_MAX;
2815
2816         INIT_LIST_HEAD(&c->btree_trans_list);
2817         mutex_init(&c->btree_trans_lock);
2818
2819         return  init_srcu_struct(&c->btree_trans_barrier) ?:
2820                 mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
2821                         sizeof(struct btree_iter) * nr +
2822                         sizeof(struct btree_insert_entry) * nr) ?:
2823                 mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
2824                                           BTREE_TRANS_MEM_MAX);
2825 }