bcachefs: Change btree_iter_traverse_error() to not use iter->next
fs/bcachefs/btree_iter.c
1// SPDX-License-Identifier: GPL-2.0
2
3#include "bcachefs.h"
4#include "bkey_methods.h"
5#include "btree_cache.h"
6#include "btree_iter.h"
7#include "btree_locking.h"
8#include "debug.h"
9#include "extents.h"
10#include "trace.h"
11
12#include <linux/prefetch.h>
13
14static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
15 struct btree_iter_level *,
16 struct bkey *);
17
18#define BTREE_ITER_NOT_END ((struct btree *) 1)
19
20static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
21{
22 return l < BTREE_MAX_DEPTH &&
23 iter->l[l].b &&
24 iter->l[l].b != BTREE_ITER_NOT_END;
25}
26
27/* Returns < 0 if @k is before iter pos, > 0 if @k is after */
28static inline int __btree_iter_pos_cmp(struct btree_iter *iter,
29 const struct btree *b,
30 const struct bkey_packed *k,
31 bool interior_node)
32{
33 int cmp = bkey_cmp_left_packed(b, k, &iter->pos);
34
35 if (cmp)
36 return cmp;
37 if (bkey_deleted(k))
38 return -1;
39
40 /*
41 * Normally, for extents we want the first key strictly greater than
42 * the iterator position - with the exception that for interior nodes,
43 * we don't want to advance past the last key if the iterator position
44 * is POS_MAX:
45 */
46 if (iter->flags & BTREE_ITER_IS_EXTENTS &&
47 (!interior_node ||
48 bkey_cmp_left_packed_byval(b, k, POS_MAX)))
49 return -1;
50 return 1;
51}
52
53static inline int btree_iter_pos_cmp(struct btree_iter *iter,
54 const struct btree *b,
55 const struct bkey_packed *k)
56{
57 return __btree_iter_pos_cmp(iter, b, k, b->level != 0);
58}
59
60/* Btree node locking: */
61
62/*
63 * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
64 * succeed:
65 */
66void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
67{
68 struct btree_iter *linked;
69
70 EBUG_ON(iter->l[b->level].b != b);
 71 EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq);
 72
 73 trans_for_each_iter_with_node(iter->trans, b, linked)
 74 linked->l[b->level].lock_seq += 2;
75
76 six_unlock_write(&b->lock);
77}
78
79void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
80{
81 struct btree_iter *linked;
82 unsigned readers = 0;
83
84 EBUG_ON(btree_node_read_locked(iter, b->level));
85
 86 trans_for_each_iter(iter->trans, linked)
87 if (linked->l[b->level].b == b &&
88 btree_node_read_locked(linked, b->level))
89 readers++;
90
91 /*
92 * Must drop our read locks before calling six_lock_write() -
93 * six_unlock() won't do wakeups until the reader count
94 * goes to 0, and it's safe because we have the node intent
95 * locked:
96 */
97 atomic64_sub(__SIX_VAL(read_lock, readers),
98 &b->lock.state.counter);
 99 btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
100 atomic64_add(__SIX_VAL(read_lock, readers),
101 &b->lock.state.counter);
102}
103
104bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
105{
106 struct btree *b = btree_iter_node(iter, level);
107 int want = __btree_lock_want(iter, level);
108
109 if (!b || b == BTREE_ITER_NOT_END)
110 return false;
111
112 if (race_fault())
113 return false;
114
115 if (!six_relock_type(&b->lock, want, iter->l[level].lock_seq) &&
116 !(iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 &&
117 btree_node_lock_increment(iter, b, level, want)))
118 return false;
119
120 mark_btree_node_locked(iter, level, want);
121 return true;
122}
123
124static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
125{
126 struct btree *b = iter->l[level].b;
127
128 EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
129
130 if (!is_btree_node(iter, level))
131 return false;
132
133 if (btree_node_intent_locked(iter, level))
134 return true;
135
136 if (race_fault())
137 return false;
138
139 if (btree_node_locked(iter, level)
140 ? six_lock_tryupgrade(&b->lock)
 141 : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq))
142 goto success;
143
 144 if (iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 &&
145 btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
146 btree_node_unlock(iter, level);
147 goto success;
148 }
149
150 return false;
151success:
152 mark_btree_node_intent_locked(iter, level);
153 return true;
154}
155
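/*
 * Relock (or, if @upgrade, intent-lock) every level from iter->level up to
 * locks_want; if any level fails, mark it and everything below it as
 * unlocked so that bch2_btree_iter_traverse() restarts from that node.
 */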
156static inline bool btree_iter_get_locks(struct btree_iter *iter,
157 bool upgrade)
158{
159 unsigned l = iter->level;
160 int fail_idx = -1;
161
162 do {
163 if (!btree_iter_node(iter, l))
164 break;
165
166 if (!(upgrade
167 ? bch2_btree_node_upgrade(iter, l)
168 : bch2_btree_node_relock(iter, l))) {
169 fail_idx = l;
170 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
171 }
172
173 l++;
174 } while (l < iter->locks_want);
175
176 /*
177 * When we fail to get a lock, we have to ensure that any child nodes
178 * can't be relocked so bch2_btree_iter_traverse has to walk back up to
179 * the node that we failed to relock:
180 */
181 while (fail_idx >= 0) {
182 btree_node_unlock(iter, fail_idx);
183 iter->l[fail_idx].b = BTREE_ITER_NOT_END;
184 --fail_idx;
185 }
186
187 if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
188 iter->uptodate = BTREE_ITER_NEED_PEEK;
189
190 bch2_btree_trans_verify_locks(iter->trans);
191
192 return iter->uptodate < BTREE_ITER_NEED_RELOCK;
193}
194
195/* Slowpath: */
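/*
 * Returns false if taking this lock would violate lock ordering with
 * another iterator in the transaction (or would risk deadlock against a
 * read-locked node); the caller must then restart the transaction.
 */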
196bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
197 unsigned level,
198 struct btree_iter *iter,
199 enum six_lock_type type,
200 bool may_drop_locks)
201{
202 struct btree_iter *linked;
203 bool ret = true;
204
 205 /* Check if it's safe to block: */
 206 trans_for_each_iter(iter->trans, linked) {
207 if (!linked->nodes_locked)
208 continue;
209
 210 /* Must lock btree nodes in key order: */
211 if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
212 ret = false;
213
214 /*
215 * Can't block taking an intent lock if we have _any_ nodes read
216 * locked:
217 *
218 * - Our read lock blocks another thread with an intent lock on
219 * the same node from getting a write lock, and thus from
220 * dropping its intent lock
221 *
222 * - And the other thread may have multiple nodes intent locked:
223 * both the node we want to intent lock, and the node we
224 * already have read locked - deadlock:
225 */
226 if (type == SIX_LOCK_intent &&
227 linked->nodes_locked != linked->nodes_intent_locked) {
228 if (may_drop_locks) {
229 linked->locks_want = max_t(unsigned,
230 linked->locks_want,
231 __fls(linked->nodes_locked) + 1);
232 btree_iter_get_locks(linked, true);
233 }
234 ret = false;
235 }
236
237 /*
238 * Interior nodes must be locked before their descendants: if
239 * another iterator has possible descendants locked of the node
240 * we're about to lock, it must have the ancestors locked too:
241 */
242 if (linked->btree_id == iter->btree_id &&
243 level > __fls(linked->nodes_locked)) {
244 if (may_drop_locks) {
245 linked->locks_want =
246 max(level + 1, max_t(unsigned,
247 linked->locks_want,
248 iter->locks_want));
249 btree_iter_get_locks(linked, true);
250 }
251 ret = false;
252 }
253 }
254
255 if (ret)
 256 __btree_node_lock_type(iter->trans->c, b, type);
257 else
258 trans_restart();
259
260 return ret;
261}
262
263/* Btree iterator locking: */
264
265#ifdef CONFIG_BCACHEFS_DEBUG
 266 void bch2_btree_iter_verify_locks(struct btree_iter *iter)
267{
268 unsigned l;
269
270 BUG_ON((iter->flags & BTREE_ITER_NOUNLOCK) &&
271 !btree_node_locked(iter, 0));
272
273 for (l = 0; btree_iter_node(iter, l); l++) {
274 if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
275 !btree_node_locked(iter, l))
276 continue;
277
278 BUG_ON(btree_lock_want(iter, l) !=
279 btree_node_locked_type(iter, l));
280 }
281}
 282
 283 void bch2_btree_trans_verify_locks(struct btree_trans *trans)
 284 {
 285 struct btree_iter *iter;
 286
 287 trans_for_each_iter(trans, iter)
 288 bch2_btree_iter_verify_locks(iter);
 289}
290#endif
291
292__flatten
 293static bool bch2_btree_iter_relock(struct btree_iter *iter)
294{
295 return iter->uptodate >= BTREE_ITER_NEED_RELOCK
296 ? btree_iter_get_locks(iter, false)
297 : true;
298}
299
300bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
301 unsigned new_locks_want)
302{
303 struct btree_iter *linked;
304
305 EBUG_ON(iter->locks_want >= new_locks_want);
306
307 iter->locks_want = new_locks_want;
308
309 if (btree_iter_get_locks(iter, true))
310 return true;
311
312 /*
313 * Ancestor nodes must be locked before child nodes, so set locks_want
314 * on iterators that might lock ancestors before us to avoid getting
315 * -EINTR later:
316 */
317 trans_for_each_iter(iter->trans, linked)
318 if (linked != iter &&
319 linked->btree_id == iter->btree_id &&
320 btree_iter_cmp(linked, iter) <= 0 &&
321 linked->locks_want < new_locks_want) {
322 linked->locks_want = new_locks_want;
323 btree_iter_get_locks(linked, true);
324 }
325
326 return false;
327}
328
329bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
330 unsigned new_locks_want)
331{
332 unsigned l = iter->level;
333
334 EBUG_ON(iter->locks_want >= new_locks_want);
335
336 iter->locks_want = new_locks_want;
337
338 do {
339 if (!btree_iter_node(iter, l))
340 break;
341
342 if (!bch2_btree_node_upgrade(iter, l)) {
343 iter->locks_want = l;
344 return false;
345 }
346
347 l++;
348 } while (l < iter->locks_want);
349
350 return true;
351}
352
353void __bch2_btree_iter_downgrade(struct btree_iter *iter,
354 unsigned downgrade_to)
355{
356 struct btree_iter *linked;
357 unsigned l;
358
359 /*
360 * We downgrade linked iterators as well because btree_iter_upgrade
361 * might have had to modify locks_want on linked iterators due to lock
362 * ordering:
363 */
 364 trans_for_each_iter(iter->trans, linked) {
365 unsigned new_locks_want = downgrade_to ?:
366 (linked->flags & BTREE_ITER_INTENT ? 1 : 0);
367
368 if (linked->locks_want <= new_locks_want)
369 continue;
370
371 linked->locks_want = new_locks_want;
372
373 while (linked->nodes_locked &&
374 (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
375 if (l > linked->level) {
376 btree_node_unlock(linked, l);
377 } else {
378 if (btree_node_intent_locked(linked, l)) {
379 six_lock_downgrade(&linked->l[l].b->lock);
380 linked->nodes_intent_locked ^= 1 << l;
381 }
382 break;
383 }
384 }
 385 }
 386
 387 bch2_btree_trans_verify_locks(iter->trans);
388}
389
390int bch2_btree_iter_unlock(struct btree_iter *iter)
391{
392 struct btree_iter *linked;
393
 394 trans_for_each_iter(iter->trans, linked)
395 __bch2_btree_iter_unlock(linked);
396
 397 return btree_iter_err(iter);
398}
399
400bool bch2_btree_trans_relock(struct btree_trans *trans)
401{
402 struct btree_iter *iter;
403 bool ret = true;
404
405 trans_for_each_iter(trans, iter)
406 ret &= bch2_btree_iter_relock(iter);
407
408 return ret;
409}
410
411void bch2_btree_trans_unlock(struct btree_trans *trans)
412{
413 struct btree_iter *iter;
414
415 trans_for_each_iter(trans, iter)
416 __bch2_btree_iter_unlock(iter);
417}
418
419/* Btree transaction locking: */
420
421/* Btree iterator: */
422
423#ifdef CONFIG_BCACHEFS_DEBUG
424
425static void __bch2_btree_iter_verify(struct btree_iter *iter,
426 struct btree *b)
427{
428 struct btree_iter_level *l = &iter->l[b->level];
429 struct btree_node_iter tmp = l->iter;
430 struct bkey_packed *k;
431
432 if (iter->uptodate > BTREE_ITER_NEED_PEEK)
433 return;
434
435 bch2_btree_node_iter_verify(&l->iter, b);
436
437 /*
438 * For interior nodes, the iterator will have skipped past
439 * deleted keys:
440 *
441 * For extents, the iterator may have skipped past deleted keys (but not
442 * whiteouts)
 443 */
 444 k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
 445 ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
 446 : bch2_btree_node_iter_prev_all(&tmp, b);
 447 if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
448 char buf[100];
449 struct bkey uk = bkey_unpack_key(b, k);
450
 451 bch2_bkey_to_text(&PBUF(buf), &uk);
 452 panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
453 buf, iter->pos.inode, iter->pos.offset);
454 }
455
456 k = bch2_btree_node_iter_peek_all(&l->iter, b);
 457 if (k && btree_iter_pos_cmp(iter, b, k) < 0) {
458 char buf[100];
459 struct bkey uk = bkey_unpack_key(b, k);
460
 461 bch2_bkey_to_text(&PBUF(buf), &uk);
462 panic("iter should be after current key:\n"
463 "iter pos %llu:%llu\n"
464 "cur key %s\n",
465 iter->pos.inode, iter->pos.offset, buf);
466 }
467
468 BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
469 (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS &&
470 !bkey_whiteout(&iter->k) &&
471 bch2_btree_node_iter_end(&l->iter));
472}
473
474void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
475{
476 struct btree_iter *linked;
477
 478 trans_for_each_iter_with_node(iter->trans, b, linked)
479 __bch2_btree_iter_verify(linked, b);
480}
481
482#else
483
484static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
485 struct btree *b) {}
486
487#endif
488
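/*
 * A key was inserted or overwritten at @where in bset @t: adjust the given
 * node iterator so it still points at the correct position, re-adding the
 * bset to the iterator if it had been dropped.
 */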
489static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
490 struct btree *b,
491 struct btree_node_iter *node_iter,
492 struct bset_tree *t,
493 struct bkey_packed *where,
494 unsigned clobber_u64s,
495 unsigned new_u64s)
496{
497 const struct bkey_packed *end = btree_bkey_last(b, t);
498 struct btree_node_iter_set *set;
499 unsigned offset = __btree_node_key_to_offset(b, where);
500 int shift = new_u64s - clobber_u64s;
 501 unsigned old_end = t->end_offset - shift;
502
503 btree_node_iter_for_each(node_iter, set)
504 if (set->end == old_end)
505 goto found;
506
507 /* didn't find the bset in the iterator - might have to readd it: */
508 if (new_u64s &&
 509 btree_iter_pos_cmp(iter, b, where) > 0) {
510 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
511
512 bch2_btree_node_iter_push(node_iter, b, where, end);
513
514 if (!b->level &&
515 node_iter == &iter->l[0].iter)
516 bkey_disassemble(b,
517 bch2_btree_node_iter_peek_all(node_iter, b),
518 &iter->k);
519 }
520 return;
521found:
 522 set->end = t->end_offset;
523
524 /* Iterator hasn't gotten to the key that changed yet: */
525 if (set->k < offset)
526 return;
527
528 if (new_u64s &&
 529 btree_iter_pos_cmp(iter, b, where) > 0) {
530 set->k = offset;
531 } else if (set->k < offset + clobber_u64s) {
532 set->k = offset + new_u64s;
533 if (set->k == set->end)
534 bch2_btree_node_iter_set_drop(node_iter, set);
535 } else {
536 set->k = (int) set->k + shift;
537 goto iter_current_key_not_modified;
538 }
539
540 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
541
542 bch2_btree_node_iter_sort(node_iter, b);
543 if (!b->level && node_iter == &iter->l[0].iter) {
544 /*
545 * not legal to call bkey_debugcheck() here, because we're
546 * called midway through the update path after update has been
547 * marked but before deletes have actually happened:
548 */
549#if 0
 550 __btree_iter_peek_all(iter, &iter->l[0], &iter->k);
551#endif
552 struct btree_iter_level *l = &iter->l[0];
553 struct bkey_packed *k =
554 bch2_btree_node_iter_peek_all(&l->iter, l->b);
555
556 if (unlikely(!k))
557 iter->k.type = KEY_TYPE_deleted;
558 else
559 bkey_disassemble(l->b, k, &iter->k);
560 }
561iter_current_key_not_modified:
562
563 /*
564 * Interior nodes are special because iterators for interior nodes don't
565 * obey the usual invariants regarding the iterator position:
566 *
567 * We may have whiteouts that compare greater than the iterator
568 * position, and logically should be in the iterator, but that we
569 * skipped past to find the first live key greater than the iterator
570 * position. This becomes an issue when we insert a new key that is
571 * greater than the current iterator position, but smaller than the
572 * whiteouts we've already skipped past - this happens in the course of
573 * a btree split.
574 *
575 * We have to rewind the iterator past to before those whiteouts here,
576 * else bkey_node_iter_prev() is not going to work and who knows what
577 * else would happen. And we have to do it manually, because here we've
578 * already done the insert and the iterator is currently inconsistent:
579 *
580 * We've got multiple competing invariants, here - we have to be careful
581 * about rewinding iterators for interior nodes, because they should
582 * always point to the key for the child node the btree iterator points
583 * to.
584 */
585 if (b->level && new_u64s &&
586 btree_iter_pos_cmp(iter, b, where) > 0) {
 587 struct bset_tree *t, *where_set = bch2_bkey_to_bset_inlined(b, where);
588 struct bkey_packed *k;
589
590 for_each_bset(b, t) {
 591 if (where_set == t)
592 continue;
593
594 k = bch2_bkey_prev_all(b, t,
595 bch2_btree_node_iter_bset_pos(node_iter, b, t));
596 if (k &&
 597 bkey_iter_cmp(b, k, where) > 0) {
598 struct btree_node_iter_set *set;
599 unsigned offset =
600 __btree_node_key_to_offset(b, bkey_next(k));
601
602 btree_node_iter_for_each(node_iter, set)
603 if (set->k == offset) {
604 set->k = __btree_node_key_to_offset(b, k);
605 bch2_btree_node_iter_sort(node_iter, b);
606 goto next_bset;
607 }
608
609 bch2_btree_node_iter_push(node_iter, b, k,
610 btree_bkey_last(b, t));
611 }
612next_bset:
613 t = t;
614 }
615 }
616}
617
618void bch2_btree_node_iter_fix(struct btree_iter *iter,
619 struct btree *b,
620 struct btree_node_iter *node_iter,
621 struct bkey_packed *where,
622 unsigned clobber_u64s,
623 unsigned new_u64s)
 624{
 625 struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
626 struct btree_iter *linked;
627
628 if (node_iter != &iter->l[b->level].iter)
629 __bch2_btree_node_iter_fix(iter, b, node_iter, t,
630 where, clobber_u64s, new_u64s);
631
 632 trans_for_each_iter_with_node(iter->trans, b, linked)
633 __bch2_btree_node_iter_fix(linked, b,
634 &linked->l[b->level].iter, t,
635 where, clobber_u64s, new_u64s);
636}
637
638static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
639 struct btree_iter_level *l,
640 struct bkey *u,
641 struct bkey_packed *k)
642{
643 struct bkey_s_c ret;
644
645 if (unlikely(!k)) {
646 /*
647 * signal to bch2_btree_iter_peek_slot() that we're currently at
648 * a hole
649 */
 650 u->type = KEY_TYPE_deleted;
651 return bkey_s_c_null;
652 }
653
654 ret = bkey_disassemble(l->b, k, u);
655
656 if (debug_check_bkeys(iter->trans->c))
657 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
658
659 return ret;
660}
661
662/* peek_all() doesn't skip deleted keys */
663static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
664 struct btree_iter_level *l,
665 struct bkey *u)
666{
667 return __btree_iter_unpack(iter, l, u,
668 bch2_btree_node_iter_peek_all(&l->iter, l->b));
669}
670
671static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
672 struct btree_iter_level *l)
673{
674 return __btree_iter_unpack(iter, l, &iter->k,
675 bch2_btree_node_iter_peek(&l->iter, l->b));
676}
677
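/*
 * Advance the node iterator until it points at a key >= the iterator
 * position, giving up after @max_advance steps; a negative @max_advance
 * means advance unconditionally. Returns false if the limit was hit.
 */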
678static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
679 struct btree_iter_level *l,
680 int max_advance)
1c6fdbd8 681{
682 struct bkey_packed *k;
683 int nr_advanced = 0;
684
685 while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
686 btree_iter_pos_cmp(iter, l->b, k) < 0) {
687 if (max_advance > 0 && nr_advanced >= max_advance)
688 return false;
689
690 bch2_btree_node_iter_advance(&l->iter, l->b);
691 nr_advanced++;
692 }
693
694 return true;
695}
696
697/*
698 * Verify that iterator for parent node points to child node:
699 */
700static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
701{
702 struct btree_iter_level *l;
703 unsigned plevel;
704 bool parent_locked;
705 struct bkey_packed *k;
706
707 if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
708 return;
709
710 plevel = b->level + 1;
711 if (!btree_iter_node(iter, plevel))
712 return;
713
714 parent_locked = btree_node_locked(iter, plevel);
715
716 if (!bch2_btree_node_relock(iter, plevel))
717 return;
718
719 l = &iter->l[plevel];
720 k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
721 if (!k ||
722 bkey_deleted(k) ||
723 bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
724 char buf[100];
725 struct bkey uk = bkey_unpack_key(b, k);
726
 727 bch2_bkey_to_text(&PBUF(buf), &uk);
728 panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
729 buf, b->key.k.p.inode, b->key.k.p.offset);
730 }
731
732 if (!parent_locked)
733 btree_node_unlock(iter, b->level + 1);
734}
735
736static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
737 struct btree *b)
738{
 739 return __btree_iter_pos_cmp(iter, NULL,
 740 bkey_to_packed(&b->key), true) < 0;
741}
742
743static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
744 struct btree *b)
745{
746 return iter->btree_id == b->btree_id &&
747 bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
748 !btree_iter_pos_after_node(iter, b);
749}
750
751static inline void __btree_iter_init(struct btree_iter *iter,
 752 unsigned level)
 753{
754 struct btree_iter_level *l = &iter->l[level];
755
756 bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos);
 757
758 if (iter->flags & BTREE_ITER_IS_EXTENTS)
759 btree_iter_advance_to_pos(iter, l, -1);
760
761 /* Skip to first non whiteout: */
762 if (level)
763 bch2_btree_node_iter_peek(&l->iter, l->b);
764
765 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
766}
767
768static inline void btree_iter_node_set(struct btree_iter *iter,
769 struct btree *b)
770{
771 btree_iter_verify_new_node(iter, b);
772
773 EBUG_ON(!btree_iter_pos_in_node(iter, b));
774 EBUG_ON(b->lock.state.seq & 1);
775
 776 iter->l[b->level].lock_seq = b->lock.state.seq;
 777 iter->l[b->level].b = b;
 778 __btree_iter_init(iter, b->level);
779}
780
781/*
782 * A btree node is being replaced - update the iterator to point to the new
783 * node:
784 */
785void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
786{
787 enum btree_node_locked_type t;
788 struct btree_iter *linked;
789
 790 trans_for_each_iter(iter->trans, linked)
791 if (btree_iter_pos_in_node(linked, b)) {
792 /*
793 * bch2_btree_iter_node_drop() has already been called -
794 * the old node we're replacing has already been
795 * unlocked and the pointer invalidated
796 */
797 BUG_ON(btree_node_locked(linked, b->level));
798
799 t = btree_lock_want(linked, b->level);
800 if (t != BTREE_NODE_UNLOCKED) {
801 six_lock_increment(&b->lock, (enum six_lock_type) t);
802 mark_btree_node_locked(linked, b->level, (enum six_lock_type) t);
803 }
804
805 btree_iter_node_set(linked, b);
806 }
807
808 six_unlock_intent(&b->lock);
809}
810
811void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
812{
813 struct btree_iter *linked;
814 unsigned level = b->level;
815
816 /* caller now responsible for unlocking @b */
817
818 BUG_ON(iter->l[level].b != b);
819 BUG_ON(!btree_node_intent_locked(iter, level));
820
821 iter->l[level].b = BTREE_ITER_NOT_END;
822 mark_btree_node_unlocked(iter, level);
823
 824 trans_for_each_iter(iter->trans, linked)
 825 if (linked->l[level].b == b) {
 826 __btree_node_unlock(linked, level);
827 linked->l[level].b = BTREE_ITER_NOT_END;
828 }
829}
830
831/*
832 * A btree node has been modified in such a way as to invalidate iterators - fix
833 * them:
834 */
835void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
836{
837 struct btree_iter *linked;
838
 839 trans_for_each_iter_with_node(iter->trans, b, linked)
 840 __btree_iter_init(linked, b->level);
841}
842
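/*
 * Lock the btree root, looping until the node we locked is still the root
 * (it may have been replaced while we slept on the lock). Returns 1 if the
 * root is shallower than the depth the iterator wants.
 */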
843static inline int btree_iter_lock_root(struct btree_iter *iter,
844 unsigned depth_want)
845{
 846 struct bch_fs *c = iter->trans->c;
847 struct btree *b;
848 enum six_lock_type lock_type;
849 unsigned i;
850
851 EBUG_ON(iter->nodes_locked);
852
853 while (1) {
854 b = READ_ONCE(c->btree_roots[iter->btree_id].b);
855 iter->level = READ_ONCE(b->level);
856
857 if (unlikely(iter->level < depth_want)) {
858 /*
859 * the root is at a lower depth than the depth we want:
860 * got to the end of the btree, or we're walking nodes
861 * greater than some depth and there are no nodes >=
862 * that depth
863 */
864 iter->level = depth_want;
865 iter->l[iter->level].b = NULL;
 866 return 1;
867 }
868
869 lock_type = __btree_lock_want(iter, iter->level);
870 if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
871 iter, lock_type, true)))
872 return -EINTR;
873
874 if (likely(b == c->btree_roots[iter->btree_id].b &&
875 b->level == iter->level &&
876 !race_fault())) {
877 for (i = 0; i < iter->level; i++)
878 iter->l[i].b = BTREE_ITER_NOT_END;
879 iter->l[iter->level].b = b;
880
881 mark_btree_node_locked(iter, iter->level, lock_type);
882 btree_iter_node_set(iter, b);
883 return 0;
884
885 }
886
887 six_unlock_type(&b->lock, lock_type);
888 }
889}
890
891noinline
892static void btree_iter_prefetch(struct btree_iter *iter)
893{
 894 struct bch_fs *c = iter->trans->c;
895 struct btree_iter_level *l = &iter->l[iter->level];
896 struct btree_node_iter node_iter = l->iter;
897 struct bkey_packed *k;
898 BKEY_PADDED(k) tmp;
 899 unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
900 ? (iter->level > 1 ? 0 : 2)
901 : (iter->level > 1 ? 1 : 16);
902 bool was_locked = btree_node_locked(iter, iter->level);
903
904 while (nr) {
905 if (!bch2_btree_node_relock(iter, iter->level))
906 return;
907
908 bch2_btree_node_iter_advance(&node_iter, l->b);
909 k = bch2_btree_node_iter_peek(&node_iter, l->b);
910 if (!k)
911 break;
912
913 bch2_bkey_unpack(l->b, &tmp.k, k);
 914 bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
915 }
916
917 if (!was_locked)
918 btree_node_unlock(iter, iter->level);
919}
920
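/*
 * Descend one level: unpack the key the current node iterator points at,
 * get and lock the child node it points to, and make that the iterator's
 * current level.
 */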
921static inline int btree_iter_down(struct btree_iter *iter)
922{
 923 struct bch_fs *c = iter->trans->c;
924 struct btree_iter_level *l = &iter->l[iter->level];
925 struct btree *b;
926 unsigned level = iter->level - 1;
927 enum six_lock_type lock_type = __btree_lock_want(iter, level);
928 BKEY_PADDED(k) tmp;
929
930 BUG_ON(!btree_node_locked(iter, iter->level));
931
932 bch2_bkey_unpack(l->b, &tmp.k,
933 bch2_btree_node_iter_peek(&l->iter, l->b));
934
 935 b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, true);
936 if (unlikely(IS_ERR(b)))
937 return PTR_ERR(b);
938
939 mark_btree_node_locked(iter, level, lock_type);
940 btree_iter_node_set(iter, b);
941
942 if (iter->flags & BTREE_ITER_PREFETCH)
943 btree_iter_prefetch(iter);
944
945 iter->level = level;
946
947 return 0;
948}
949
950static void btree_iter_up(struct btree_iter *iter)
951{
952 btree_node_unlock(iter, iter->level++);
953}
954
955int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
956
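/*
 * Traverse hit -EINTR or -ENOMEM: drop all locks, handle -ENOMEM by taking
 * the btree cache cannibalize lock, then redo every traversal in this
 * transaction in iterator order, restarting the whole sequence if any of
 * them fail again.
 */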
957static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
958{
959 struct btree_trans *trans = iter->trans;
960 struct bch_fs *c = trans->c;
961 u8 sorted[BTREE_ITER_MAX];
962 unsigned i, nr_sorted = 0;
963
964 trans_for_each_iter(trans, iter)
965 sorted[nr_sorted++] = iter - trans->iters;
966
967#define btree_iter_cmp_by_idx(_l, _r) \
968 btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
969
970 bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
971#undef btree_iter_cmp_by_idx
972
 973retry_all:
 974 bch2_btree_trans_unlock(trans);
975
976 if (ret != -ENOMEM && ret != -EINTR)
977 goto io_error;
978
979 if (ret == -ENOMEM) {
980 struct closure cl;
981
982 closure_init_stack(&cl);
983
984 do {
985 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
986 closure_sync(&cl);
987 } while (ret);
988 }
989
 990 /* Now, redo traversals in correct order: */
 991 for (i = 0; i < nr_sorted; i++) {
 992 iter = &trans->iters[sorted[i]];
 993
994 do {
995 ret = __bch2_btree_iter_traverse(iter);
996 } while (ret == -EINTR);
 997
998 if (ret)
999 goto retry_all;
1000 }
 1001
 1002 ret = btree_trans_has_multiple_iters(trans) ? -EINTR : 0;
1003out:
1004 bch2_btree_cache_cannibalize_unlock(c);
1005 return ret;
1006io_error:
1007 BUG_ON(ret != -EIO);
1008
1009 iter->flags |= BTREE_ITER_ERROR;
1010 iter->l[iter->level].b = BTREE_ITER_NOT_END;
1011 goto out;
1012}
1013
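/*
 * Walk up from iter->level, unlocking as we go, until we reach a level
 * whose node is still valid, can be relocked, and (if @check_pos) still
 * covers the iterator position; returns that level.
 */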
1014static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
1015 bool check_pos)
1016{
1017 unsigned l = iter->level;
1018
1019 while (btree_iter_node(iter, l) &&
1020 !(is_btree_node(iter, l) &&
1021 bch2_btree_node_relock(iter, l) &&
1022 (!check_pos ||
1023 btree_iter_pos_in_node(iter, iter->l[l].b)))) {
1024 btree_node_unlock(iter, l);
1025 iter->l[l].b = BTREE_ITER_NOT_END;
1026 l++;
1027 }
1028
1029 return l;
1030}
1031
1032/*
1033 * This is the main state machine for walking down the btree - walks down to a
1034 * specified depth
1035 *
1036 * Returns 0 on success, -EIO on error (error reading in a btree node).
1037 *
1038 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
1039 * stashed in the iterator and returned from bch2_btree_iter_unlock().
1040 */
1041int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
1042{
1043 unsigned depth_want = iter->level;
1044
1045 if (unlikely(iter->level >= BTREE_MAX_DEPTH))
1046 return 0;
1047
 1048 if (bch2_btree_iter_relock(iter))
1049 return 0;
1050
1051 /*
1052 * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
1053 * here unnecessary
1054 */
1055 iter->level = btree_iter_up_until_locked(iter, true);
1056
1057 /*
1058 * If we've got a btree node locked (i.e. we aren't about to relock the
1059 * root) - advance its node iterator if necessary:
1060 *
1061 * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
1062 */
1063 if (btree_iter_node(iter, iter->level))
1064 btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);
1065
1066 /*
1067 * Note: iter->nodes[iter->level] may be temporarily NULL here - that
1068 * would indicate to other code that we got to the end of the btree,
1069 * here it indicates that relocking the root failed - it's critical that
1070 * btree_iter_lock_root() comes next and that it can't fail
1071 */
1072 while (iter->level > depth_want) {
1073 int ret = btree_iter_node(iter, iter->level)
1074 ? btree_iter_down(iter)
1075 : btree_iter_lock_root(iter, depth_want);
1076 if (unlikely(ret)) {
1077 if (ret == 1)
1078 return 0;
1079
1080 iter->level = depth_want;
1081 iter->l[iter->level].b = BTREE_ITER_NOT_END;
1082 return ret;
1083 }
1084 }
1085
1086 iter->uptodate = BTREE_ITER_NEED_PEEK;
 1087
 1088 bch2_btree_trans_verify_locks(iter->trans);
 1089 __bch2_btree_iter_verify(iter, iter->l[iter->level].b);
1090 return 0;
1091}
1092
1093int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
1094{
1095 int ret;
1096
1097 ret = __bch2_btree_iter_traverse(iter);
1098 if (unlikely(ret))
1099 ret = btree_iter_traverse_error(iter, ret);
1100
 1101 BUG_ON(ret == -EINTR && !btree_trans_has_multiple_iters(iter->trans));
1102
1103 return ret;
1104}
1105
1106static inline void bch2_btree_iter_checks(struct btree_iter *iter,
1107 enum btree_iter_type type)
1108{
1109 EBUG_ON(iter->btree_id >= BTREE_ID_NR);
1110 EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
1111 (iter->btree_id == BTREE_ID_EXTENTS &&
1112 type != BTREE_ITER_NODES));
1113
 1114 bch2_btree_trans_verify_locks(iter->trans);
1115}
1116
1117/* Iterate across nodes (leaf and interior nodes) */
1118
1119struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1120{
1121 struct btree *b;
1122 int ret;
1123
1124 bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
1125
1126 if (iter->uptodate == BTREE_ITER_UPTODATE)
1127 return iter->l[iter->level].b;
1128
1129 ret = bch2_btree_iter_traverse(iter);
1130 if (ret)
1131 return NULL;
1132
1133 b = btree_iter_node(iter, iter->level);
1134 if (!b)
1135 return NULL;
1136
1137 BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
1138
1139 iter->pos = b->key.k.p;
1140 iter->uptodate = BTREE_ITER_UPTODATE;
1141
1142 return b;
1143}
1144
1145struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
1146{
1147 struct btree *b;
1148 int ret;
1149
1150 bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
1151
1152 /* already got to end? */
1153 if (!btree_iter_node(iter, iter->level))
1154 return NULL;
1155
1156 btree_iter_up(iter);
1157
1158 if (!bch2_btree_node_relock(iter, iter->level))
1159 btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
1160
1161 ret = bch2_btree_iter_traverse(iter);
1162 if (ret)
1163 return NULL;
1164
1165 /* got to end? */
1166 b = btree_iter_node(iter, iter->level);
1167 if (!b)
1168 return NULL;
1169
1170 if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
1171 /*
1172 * Haven't gotten to the end of the parent node: go back down to
1173 * the next child node
1174 */
1175
1176 /*
1177 * We don't really want to be unlocking here except we can't
1178 * directly tell btree_iter_traverse() "traverse to this level"
1179 * except by setting iter->level, so we have to unlock so we
1180 * don't screw up our lock invariants:
1181 */
1182 if (btree_node_read_locked(iter, iter->level))
1183 btree_node_unlock(iter, iter->level);
1184
1185 /* ick: */
1186 iter->pos = iter->btree_id == BTREE_ID_INODES
1187 ? btree_type_successor(iter->btree_id, iter->pos)
1188 : bkey_successor(iter->pos);
1189 iter->level = depth;
1190
1191 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1192 ret = bch2_btree_iter_traverse(iter);
1193 if (ret)
1194 return NULL;
1195
1196 b = iter->l[iter->level].b;
1197 }
1198
1199 iter->pos = b->key.k.p;
1200 iter->uptodate = BTREE_ITER_UPTODATE;
1201
1202 return b;
1203}
1204
1205/* Iterate across keys (in leaf nodes only) */
1206
1207void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
1208{
1209 struct btree_iter_level *l = &iter->l[0];
1210
1211 EBUG_ON(iter->level != 0);
1212 EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
1213 EBUG_ON(!btree_node_locked(iter, 0));
1214 EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
1215
1216 iter->pos = new_pos;
1217 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1218
 1219 btree_iter_advance_to_pos(iter, l, -1);
 1220
1221 if (bch2_btree_node_iter_end(&l->iter) &&
1222 btree_iter_pos_after_node(iter, l->b))
 1223 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1224}
1225
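/*
 * Move the iterator to @new_pos: walk up to a locked node that still
 * covers the new position, then either advance its node iterator a few
 * steps or reinitialize it, and mark the iterator as needing a re-peek or
 * a full re-traverse.
 */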
1226void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
1227{
1228 int cmp = bkey_cmp(new_pos, iter->pos);
1229 unsigned level;
1230
1231 if (!cmp)
1232 return;
1233
1234 iter->pos = new_pos;
1235
1236 level = btree_iter_up_until_locked(iter, true);
1237
1238 if (btree_iter_node(iter, level)) {
1239 /*
1240 * We might have to skip over many keys, or just a few: try
1241 * advancing the node iterator, and if we have to skip over too
1242 * many keys just reinit it (or if we're rewinding, since that
1243 * is expensive).
1244 */
1245 if (cmp < 0 ||
1246 !btree_iter_advance_to_pos(iter, &iter->l[level], 8))
1247 __btree_iter_init(iter, level);
1248
1249 /* Don't leave it locked if we're not supposed to: */
1250 if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED)
1251 btree_node_unlock(iter, level);
1252 }
1253
1254 if (level != iter->level)
1255 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1256 else
1257 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1258}
1259
1260static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
1261{
1262 struct btree_iter_level *l = &iter->l[0];
1263 struct bkey_s_c ret = { .k = &iter->k };
1264
1265 if (!bkey_deleted(&iter->k)) {
1266 EBUG_ON(bch2_btree_node_iter_end(&l->iter));
1267 ret.v = bkeyp_val(&l->b->format,
1268 __bch2_btree_node_iter_peek_all(&l->iter, l->b));
1269 }
1270
 1271 if (debug_check_bkeys(iter->trans->c) &&
 1272 !bkey_deleted(ret.k))
 1273 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
1274 return ret;
1275}
1276
1277struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
1278{
1279 struct btree_iter_level *l = &iter->l[0];
1280 struct bkey_s_c k;
1281 int ret;
1282
1283 bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1284
1285 if (iter->uptodate == BTREE_ITER_UPTODATE)
1286 return btree_iter_peek_uptodate(iter);
1287
1288 while (1) {
1289 ret = bch2_btree_iter_traverse(iter);
1290 if (unlikely(ret))
1291 return bkey_s_c_err(ret);
1292
1293 k = __btree_iter_peek(iter, l);
1294 if (likely(k.k))
1295 break;
1296
1297 /* got to the end of the leaf, iterator needs to be traversed: */
1298 iter->pos = l->b->key.k.p;
1299 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1300
1301 if (!bkey_cmp(iter->pos, POS_MAX))
1302 return bkey_s_c_null;
1303
1304 iter->pos = btree_type_successor(iter->btree_id, iter->pos);
1305 }
1306
1307 /*
1308 * iter->pos should always be equal to the key we just
1309 * returned - except extents can straddle iter->pos:
1310 */
1311 if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
1312 bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1313 iter->pos = bkey_start_pos(k.k);
1314
1315 iter->uptodate = BTREE_ITER_UPTODATE;
1316 return k;
1317}
1318
1319static noinline
1320struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
1321{
1322 struct btree_iter_level *l = &iter->l[0];
1323
1324 iter->pos = l->b->key.k.p;
1325 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1326
1327 if (!bkey_cmp(iter->pos, POS_MAX))
1328 return bkey_s_c_null;
1329
1330 iter->pos = btree_type_successor(iter->btree_id, iter->pos);
1331
1332 return bch2_btree_iter_peek(iter);
1333}
1334
1335struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
1336{
1337 struct btree_iter_level *l = &iter->l[0];
1338 struct bkey_packed *p;
1339 struct bkey_s_c k;
1340
1341 bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1342
1343 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1344 k = bch2_btree_iter_peek(iter);
1345 if (IS_ERR_OR_NULL(k.k))
1346 return k;
1347 }
1348
1349 do {
 1350 bch2_btree_node_iter_advance(&l->iter, l->b);
1351 p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1352 if (unlikely(!p))
1353 return bch2_btree_iter_peek_next_leaf(iter);
1354 } while (bkey_whiteout(p));
1355
1356 k = __btree_iter_unpack(iter, l, &iter->k, p);
1357
1358 EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
1359 iter->pos = bkey_start_pos(k.k);
1360 return k;
1361}
1362
1363struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
1364{
1365 struct btree_iter_level *l = &iter->l[0];
1366 struct bkey_packed *p;
1367 struct bkey_s_c k;
1368 int ret;
1369
1370 bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1371
1372 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1373 k = bch2_btree_iter_peek(iter);
1374 if (IS_ERR(k.k))
1375 return k;
1376 }
1377
1378 while (1) {
1379 p = bch2_btree_node_iter_prev(&l->iter, l->b);
1380 if (likely(p))
1381 break;
1382
1383 iter->pos = l->b->data->min_key;
1384 if (!bkey_cmp(iter->pos, POS_MIN))
1385 return bkey_s_c_null;
1386
1387 bch2_btree_iter_set_pos(iter,
1388 btree_type_predecessor(iter->btree_id, iter->pos));
1389
1390 ret = bch2_btree_iter_traverse(iter);
1391 if (unlikely(ret))
1392 return bkey_s_c_err(ret);
1393
1394 p = bch2_btree_node_iter_peek(&l->iter, l->b);
1395 if (p)
1396 break;
1397 }
1398
1399 k = __btree_iter_unpack(iter, l, &iter->k, p);
1400
1401 EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
1402
1403 iter->pos = bkey_start_pos(k.k);
1404 iter->uptodate = BTREE_ITER_UPTODATE;
1405 return k;
1406}
1407
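/*
 * peek_slot() for extents: return the extent overlapping iter->pos if one
 * exists; otherwise synthesize a deleted key for the hole, sized from
 * iter->pos up to the start of the next key (capped at KEY_SIZE_MAX and
 * never crossing an inode boundary).
 */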
1408static inline struct bkey_s_c
 1409__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
1410{
1411 struct btree_iter_level *l = &iter->l[0];
 1412 struct btree_node_iter node_iter;
1413 struct bkey_s_c k;
1414 struct bkey n;
1415 int ret;
1416
1417recheck:
1418 while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
1419 bkey_deleted(k.k) &&
1420 bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
 1421 bch2_btree_node_iter_advance(&l->iter, l->b);
 1422
1423 /*
1424 * iterator is now at the correct position for inserting at iter->pos,
1425 * but we need to keep iterating until we find the first non whiteout so
1426 * we know how big a hole we have, if any:
1427 */
1428
1429 node_iter = l->iter;
1430 if (k.k && bkey_whiteout(k.k))
1431 k = __btree_iter_unpack(iter, l, &iter->k,
1432 bch2_btree_node_iter_peek(&node_iter, l->b));
1433
1434 /*
1435 * If we got to the end of the node, check if we need to traverse to the
1436 * next node:
1437 */
1438 if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
1439 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1440 ret = bch2_btree_iter_traverse(iter);
1441 if (unlikely(ret))
1442 return bkey_s_c_err(ret);
1443
1444 goto recheck;
1445 }
1446
1447 if (k.k &&
1448 !bkey_whiteout(k.k) &&
1449 bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
1450 /*
1451 * if we skipped forward to find the first non whiteout and
1452 * there _wasn't_ actually a hole, we want the iterator to be
1453 * pointed at the key we found:
1454 */
1455 l->iter = node_iter;
1456
1457 EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
1458 EBUG_ON(bkey_deleted(k.k));
1459 iter->uptodate = BTREE_ITER_UPTODATE;
1460 return k;
1461 }
1462
1463 /* hole */
1464
1465 /* holes can't span inode numbers: */
1466 if (iter->pos.offset == KEY_OFFSET_MAX) {
1467 if (iter->pos.inode == KEY_INODE_MAX)
1468 return bkey_s_c_null;
1469
1470 iter->pos = bkey_successor(iter->pos);
1471 goto recheck;
1472 }
1473
1474 if (!k.k)
1475 k.k = &l->b->key.k;
1476
1477 bkey_init(&n);
1478 n.p = iter->pos;
1479 bch2_key_resize(&n,
1480 min_t(u64, KEY_SIZE_MAX,
1481 (k.k->p.inode == n.p.inode
1482 ? bkey_start_offset(k.k)
1483 : KEY_OFFSET_MAX) -
1484 n.p.offset));
1485
 1486 EBUG_ON(!n.size);
 1487
1488 iter->k = n;
1489 iter->uptodate = BTREE_ITER_UPTODATE;
1490 return (struct bkey_s_c) { &iter->k, NULL };
1491}
 1492
1493static inline struct bkey_s_c
1494__bch2_btree_iter_peek_slot(struct btree_iter *iter)
1495{
1496 struct btree_iter_level *l = &iter->l[0];
1497 struct bkey_s_c k;
1498 int ret;
 1499
1500 if (iter->flags & BTREE_ITER_IS_EXTENTS)
1501 return __bch2_btree_iter_peek_slot_extents(iter);
 1502
1503recheck:
1504 while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
1505 bkey_deleted(k.k) &&
1506 bkey_cmp(k.k->p, iter->pos) == 0)
 1507 bch2_btree_node_iter_advance(&l->iter, l->b);
 1508
1509 /*
1510 * If we got to the end of the node, check if we need to traverse to the
1511 * next node:
1512 */
1513 if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
1514 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1515 ret = bch2_btree_iter_traverse(iter);
1516 if (unlikely(ret))
1517 return bkey_s_c_err(ret);
 1518
 1519 goto recheck;
1520 }
1521
1522 if (k.k &&
1523 !bkey_deleted(k.k) &&
1524 !bkey_cmp(iter->pos, k.k->p)) {
1525 iter->uptodate = BTREE_ITER_UPTODATE;
1526 return k;
1527 } else {
1528 /* hole */
1529 bkey_init(&iter->k);
1530 iter->k.p = iter->pos;
1531
1532 iter->uptodate = BTREE_ITER_UPTODATE;
1533 return (struct bkey_s_c) { &iter->k, NULL };
1534 }
1535}
1536
1537struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
1538{
1539 int ret;
1540
1541 bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
1542
1543 if (iter->uptodate == BTREE_ITER_UPTODATE)
1544 return btree_iter_peek_uptodate(iter);
1545
1546 ret = bch2_btree_iter_traverse(iter);
1547 if (unlikely(ret))
1548 return bkey_s_c_err(ret);
1549
1550 return __bch2_btree_iter_peek_slot(iter);
1551}
1552
1553struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
1554{
1555 bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
1556
1557 iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
1558
1559 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1560 /*
1561 * XXX: when we just need to relock we should be able to avoid
1562 * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
1563 * for that to work
1564 */
1565 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1566
1567 return bch2_btree_iter_peek_slot(iter);
1568 }
1569
1570 if (!bkey_deleted(&iter->k))
 1571 bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b);
1572
1573 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1574
1575 return __bch2_btree_iter_peek_slot(iter);
1576}
1577
1578static inline void bch2_btree_iter_init(struct btree_trans *trans,
1579 struct btree_iter *iter, enum btree_id btree_id,
 1580 struct bpos pos, unsigned flags)
 1581{
 1582 struct bch_fs *c = trans->c;
1583 unsigned i;
1584
1585 if (btree_id == BTREE_ID_EXTENTS &&
1586 !(flags & BTREE_ITER_NODES))
1587 flags |= BTREE_ITER_IS_EXTENTS;
 1588
 1589 iter->trans = trans;
1590 iter->pos = pos;
1591 bkey_init(&iter->k);
1592 iter->k.p = pos;
1593 iter->flags = flags;
1594 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1595 iter->btree_id = btree_id;
1596 iter->level = 0;
1597 iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0;
1598 iter->nodes_locked = 0;
1599 iter->nodes_intent_locked = 0;
1600 for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1601 iter->l[i].b = NULL;
1602 iter->l[iter->level].b = BTREE_ITER_NOT_END;
1603
1604 prefetch(c->btree_roots[btree_id].b);
1605}
1606
 1607 static void bch2_btree_iter_unlink(struct btree_iter *iter)
1608{
1609 struct btree_iter *linked;
1610
1611 __bch2_btree_iter_unlock(iter);
1612
1613 if (!btree_iter_linked(iter))
1614 return;
1615
 1616 trans_for_each_iter(iter->trans, linked)
1617 if (linked->next == iter) {
1618 linked->next = iter->next;
1619 iter->next = iter;
1620 return;
1621 }
1622
1623 BUG();
1624}
1625
 1626 static void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
1627{
1628 BUG_ON(btree_iter_linked(new));
1629
1630 new->next = iter->next;
1631 iter->next = new;
1632}
1633
1634static void __bch2_btree_iter_copy(struct btree_iter *dst,
1635 struct btree_iter *src)
1636{
1637 unsigned i;
1638
1639 memcpy(dst, src, offsetof(struct btree_iter, next));
1640
1641 for (i = 0; i < BTREE_MAX_DEPTH; i++)
1642 if (btree_node_locked(dst, i))
1643 six_lock_increment(&dst->l[i].b->lock,
1644 __btree_lock_want(dst, i));
1645}
1646
1647void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
1648{
1649 __bch2_btree_iter_unlock(dst);
1650 __bch2_btree_iter_copy(dst, src);
1651}
1652
1653/* new transactional stuff: */
1654
1655static void btree_trans_verify(struct btree_trans *trans)
1656{
1657 unsigned i;
1658
1659 for (i = 0; i < trans->nr_iters; i++) {
1660 struct btree_iter *iter = &trans->iters[i];
1661
1662 BUG_ON(btree_iter_linked(iter) !=
1663 ((trans->iters_linked & (1 << i)) &&
1664 !is_power_of_2(trans->iters_linked)));
1665 }
1666}
1667
1668static inline unsigned btree_trans_iter_idx(struct btree_trans *trans,
1669 struct btree_iter *iter)
1670{
1671 ssize_t idx = iter - trans->iters;
1672
1673 EBUG_ON(idx < 0 || idx >= trans->nr_iters);
1674 EBUG_ON(!(trans->iters_linked & (1ULL << idx)));
1675
1676 return idx;
1677}
1678
1679int bch2_trans_iter_put(struct btree_trans *trans,
1680 struct btree_iter *iter)
1681{
1682 ssize_t idx = btree_trans_iter_idx(trans, iter);
 1683 int ret = btree_iter_err(iter);
 1684
 1685 trans->iters_live &= ~(1ULL << idx);
 1686 return ret;
1687}
1688
1689static inline void __bch2_trans_iter_free(struct btree_trans *trans,
1690 unsigned idx)
1691{
1692 trans->iters_linked &= ~(1ULL << idx);
1693 trans->iters_live &= ~(1ULL << idx);
1694 trans->iters_touched &= ~(1ULL << idx);
1695 trans->iters_unlink_on_restart &= ~(1ULL << idx);
1696 trans->iters_unlink_on_commit &= ~(1ULL << idx);
1697 bch2_btree_iter_unlink(&trans->iters[idx]);
1698}
1699
1700int bch2_trans_iter_free(struct btree_trans *trans,
1701 struct btree_iter *iter)
1c6fdbd8 1702{
 1703 int ret = btree_iter_err(iter);
 1704
 1705 __bch2_trans_iter_free(trans, btree_trans_iter_idx(trans, iter));
 1706 return ret;
 1707}
 1708
1709int bch2_trans_iter_free_on_commit(struct btree_trans *trans,
1710 struct btree_iter *iter)
 1711{
 1712 int ret = btree_iter_err(iter);
 1713
1714 trans->iters_unlink_on_commit |=
1715 1ULL << btree_trans_iter_idx(trans, iter);
 1716 return ret;
1717}
1718
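/*
 * Grow the transaction's iterator and update arrays: try kmalloc first,
 * falling back to the BTREE_ITER_MAX-sized mempool. Returns -EINTR if the
 * transaction already had live iterators, since pointers to them are now
 * stale and the transaction must be restarted.
 */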
1719static int btree_trans_realloc_iters(struct btree_trans *trans,
1720 unsigned new_size)
1c6fdbd8 1721{
 1722 void *new_iters, *new_updates;
1723 unsigned i;
1724
1725 BUG_ON(new_size > BTREE_ITER_MAX);
1726
1727 if (new_size <= trans->size)
1728 return 0;
1729
1730 BUG_ON(trans->used_mempool);
1731
1732 bch2_trans_unlock(trans);
1733
1734 new_iters = kmalloc(sizeof(struct btree_iter) * new_size +
1735 sizeof(struct btree_insert_entry) * (new_size + 4),
1736 GFP_NOFS);
1737 if (new_iters)
1738 goto success;
1739
 1740 new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
1741 new_size = BTREE_ITER_MAX;
1742
1743 trans->used_mempool = true;
1744success:
1745 new_updates = new_iters + sizeof(struct btree_iter) * new_size;
1746
1747 memcpy(new_iters, trans->iters,
1748 sizeof(struct btree_iter) * trans->nr_iters);
1749 memcpy(new_updates, trans->updates,
1750 sizeof(struct btree_insert_entry) * trans->nr_updates);
1751
1752 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
1753 memset(trans->iters, POISON_FREE,
1754 sizeof(struct btree_iter) * trans->nr_iters +
1755 sizeof(struct btree_insert_entry) * trans->nr_iters);
1756
1757 if (trans->iters != trans->iters_onstack)
1758 kfree(trans->iters);
1759
1760 trans->iters = new_iters;
1761 trans->updates = new_updates;
1762 trans->size = new_size;
1763
1764 for (i = 0; i < trans->nr_iters; i++)
1765 trans->iters[i].next = &trans->iters[i];
1766
1767 if (trans->iters_linked) {
1768 unsigned first_linked = __ffs(trans->iters_linked);
1769
1770 for (i = first_linked + 1; i < trans->nr_iters; i++)
1771 if (trans->iters_linked & (1 << i))
1772 bch2_btree_iter_link(&trans->iters[first_linked],
1773 &trans->iters[i]);
1774 }
1775
1776 btree_trans_verify(trans);
1777
1778 if (trans->iters_live) {
1779 trans_restart();
1780 return -EINTR;
1781 }
1782
1783 return 0;
1784}
1785
 1786 void bch2_trans_preload_iters(struct btree_trans *trans)
 1787{
 1788 btree_trans_realloc_iters(trans, BTREE_ITER_MAX);
1789}
1790
1791static int btree_trans_iter_alloc(struct btree_trans *trans)
1792{
1793 struct btree_iter *iter;
1794 unsigned idx = ffz(trans->iters_linked);
1795
1796 if (idx < trans->nr_iters)
1797 goto got_slot;
1798
1799 if (trans->nr_iters == trans->size) {
1800 int ret = btree_trans_realloc_iters(trans, trans->size * 2);
1801 if (ret)
1802 return ret;
1803 }
1804
1805 idx = trans->nr_iters++;
1806 BUG_ON(trans->nr_iters > trans->size);
1807got_slot:
1808 iter = &trans->iters[idx];
1809 iter->next = iter;
1810
1811 BUG_ON(trans->iters_linked & (1ULL << idx));
1812
1813 if (trans->iters_linked)
1814 bch2_btree_iter_link(&trans->iters[__ffs(trans->iters_linked)],
1815 &trans->iters[idx]);
1816 trans->iters_linked |= 1ULL << idx;
1817 return idx;
1818}
1819
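/*
 * Get an iterator for (btree_id, pos): reuse an existing one that matches
 * @iter_id (or btree_id + pos when iter_id is 0), otherwise allocate a new
 * slot; the returned iterator is marked live and touched.
 */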
1c6fdbd8 1820static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
 1821 unsigned btree_id, struct bpos pos,
1822 unsigned flags, u64 iter_id)
1823{
1824 struct btree_iter *iter;
1825 int idx;
1826
1827 BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
1828
5df4be3f 1829 for (idx = 0; idx < trans->nr_iters; idx++) {
1830 if (!(trans->iters_linked & (1ULL << idx)))
1831 continue;
1832
1833 iter = &trans->iters[idx];
1834 if (iter_id
1835 ? iter->id == iter_id
1836 : (iter->btree_id == btree_id &&
1837 !bkey_cmp(iter->pos, pos)))
1c6fdbd8 1838 goto found;
 1839 }
1840 idx = -1;
1841found:
1842 if (idx < 0) {
1843 idx = btree_trans_iter_alloc(trans);
1844 if (idx < 0)
1845 return ERR_PTR(idx);
 1846
 1847 iter = &trans->iters[idx];
 1848 iter->id = iter_id;
 1849
 1850 bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
1851 } else {
1852 iter = &trans->iters[idx];
1853
1854 iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
1855 iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
1856 }
1857
 1858 BUG_ON(iter->btree_id != btree_id);
 1859 BUG_ON(trans->iters_live & (1ULL << idx));
 1860 trans->iters_live |= 1ULL << idx;
 1861 trans->iters_touched |= 1ULL << idx;
 1862
1863 btree_trans_verify(trans);
1864
1865 BUG_ON(iter->btree_id != btree_id);
1866 BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
1867
1868 return iter;
1869}
1870
1871struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
1872 enum btree_id btree_id,
1873 struct bpos pos, unsigned flags,
1874 u64 iter_id)
1875{
1876 struct btree_iter *iter =
 1877 __btree_trans_get_iter(trans, btree_id, pos, flags, iter_id);
1878
1879 if (!IS_ERR(iter))
1880 bch2_btree_iter_set_pos(iter, pos);
1881 return iter;
1882}
1883
1884struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
1885 enum btree_id btree_id,
1886 struct bpos pos,
1887 unsigned locks_want,
1888 unsigned depth,
1889 unsigned flags)
1890{
1891 struct btree_iter *iter =
1892 __btree_trans_get_iter(trans, btree_id, pos,
1893 flags|BTREE_ITER_NODES, 0);
1894 unsigned i;
1895
1896 BUG_ON(IS_ERR(iter));
1897 BUG_ON(bkey_cmp(iter->pos, pos));
1898
1899 iter->locks_want = locks_want;
1900 iter->level = depth;
1901
1902 for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1903 iter->l[i].b = NULL;
1904 iter->l[iter->level].b = BTREE_ITER_NOT_END;
1905
1906 return iter;
1907}
1908
1909struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
1910 struct btree_iter *src)
1c6fdbd8 1911{
 1912 int idx;
 1913
1914 idx = btree_trans_iter_alloc(trans);
1915 if (idx < 0)
1916 return ERR_PTR(idx);
1917
1918 trans->iters_live |= 1ULL << idx;
1919 trans->iters_touched |= 1ULL << idx;
1920 trans->iters_unlink_on_restart |= 1ULL << idx;
1921
1922 __bch2_btree_iter_copy(&trans->iters[idx], src);
1923
1924 return &trans->iters[idx];
1925}
1926
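/*
 * Bump allocator for memory that lives as long as the transaction: carve
 * allocations out of trans->mem, growing it (to a power of two) on demand.
 * Growing after the first allocation returns -EINTR, since earlier
 * pointers may have been invalidated by krealloc().
 */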
1927void *bch2_trans_kmalloc(struct btree_trans *trans,
1928 size_t size)
1929{
1930 void *ret;
1931
1932 if (trans->mem_top + size > trans->mem_bytes) {
1933 size_t old_bytes = trans->mem_bytes;
1934 size_t new_bytes = roundup_pow_of_two(trans->mem_top + size);
1935 void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
1936
1937 if (!new_mem)
1938 return ERR_PTR(-ENOMEM);
1939
1940 trans->mem = new_mem;
1941 trans->mem_bytes = new_bytes;
1942
1943 if (old_bytes) {
1944 trans_restart();
 1945 return ERR_PTR(-EINTR);
 1946 }
1947 }
1948
1949 ret = trans->mem + trans->mem_top;
1950 trans->mem_top += size;
1951 return ret;
1952}
1953
1954int bch2_trans_unlock(struct btree_trans *trans)
1955{
1956 unsigned iters = trans->iters_linked;
1957 int ret = 0;
1958
1959 while (iters) {
1960 unsigned idx = __ffs(iters);
1961 struct btree_iter *iter = &trans->iters[idx];
1962
 1963 ret = ret ?: btree_iter_err(iter);
1964
1965 __bch2_btree_iter_unlock(iter);
1966 iters ^= 1 << idx;
1967 }
1968
1969 return ret;
1970}
1971
1972inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters)
1973{
1974 iters &= trans->iters_linked;
1975
1976 while (iters) {
1977 unsigned idx = __ffs64(iters);
1978
1979 iters &= ~(1ULL << idx);
1980 __bch2_trans_iter_free(trans, idx);
1981 }
1982}
1983
 1984 void __bch2_trans_begin(struct btree_trans *trans)
 1985{
 1986 u64 iters_to_unlink;
1987
1988 btree_trans_verify(trans);
1989
1990 /*
1991 * On transaction restart, the transaction isn't required to allocate
1992 * all the same iterators it on the last iteration:
1993 *
1994 * Unlink any iterators it didn't use this iteration, assuming it got
1995 * further (allocated an iter with a higher idx) than where the iter
1996 * was originally allocated:
1997 */
1998 iters_to_unlink = ~trans->iters_live &
1999 ((1ULL << fls64(trans->iters_live)) - 1);
 2000
2001 iters_to_unlink |= trans->iters_unlink_on_restart;
2002 iters_to_unlink |= trans->iters_unlink_on_commit;
 2003
 2004 bch2_trans_unlink_iters(trans, iters_to_unlink);
 2005
2006 trans->iters_live = 0;
2007 trans->iters_touched = 0;
2008 trans->iters_unlink_on_restart = 0;
2009 trans->iters_unlink_on_commit = 0;
2010 trans->nr_updates = 0;
2011 trans->mem_top = 0;
2012
2013 btree_trans_verify(trans);
2014}
2015
2016void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c)
2017{
2018 memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
2019
1c6fdbd8 2020 trans->c = c;
 2021 trans->size = ARRAY_SIZE(trans->iters_onstack);
 2022 trans->iters = trans->iters_onstack;
 2023 trans->updates = trans->updates_onstack;
2024}
2025
2026int bch2_trans_exit(struct btree_trans *trans)
2027{
2028 int ret = bch2_trans_unlock(trans);
2029
2030 kfree(trans->mem);
 2031 if (trans->used_mempool)
 2032 mempool_free(trans->iters, &trans->c->btree_iters_pool);
2033 else if (trans->iters != trans->iters_onstack)
2034 kfree(trans->iters);
2035 trans->mem = (void *) 0x1;
2036 trans->iters = (void *) 0x1;
2037 return ret;
2038}