bcachefs: Kill remaining bch2_btree_iter_unlock() uses
[linux-block.git] / fs / bcachefs / btree_iter.c
1// SPDX-License-Identifier: GPL-2.0
2
3#include "bcachefs.h"
4#include "bkey_methods.h"
5#include "btree_cache.h"
6#include "btree_iter.h"
7#include "btree_locking.h"
8#include "debug.h"
9#include "extents.h"
10#include "trace.h"
11
12#include <linux/prefetch.h>
13
14static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
15 struct btree_iter_level *,
16 struct bkey *);
17
18#define BTREE_ITER_NOT_END ((struct btree *) 1)
19
20static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
21{
22 return l < BTREE_MAX_DEPTH &&
23 iter->l[l].b &&
24 iter->l[l].b != BTREE_ITER_NOT_END;
25}
26
27/* Returns < 0 if @k is before iter pos, > 0 if @k is after */
28static inline int __btree_iter_pos_cmp(struct btree_iter *iter,
29 const struct btree *b,
30 const struct bkey_packed *k,
31 bool interior_node)
32{
33 int cmp = bkey_cmp_left_packed(b, k, &iter->pos);
34
35 if (cmp)
36 return cmp;
37 if (bkey_deleted(k))
38 return -1;
39
40 /*
41 * Normally, for extents we want the first key strictly greater than
42 * the iterator position - with the exception that for interior nodes,
43 * we don't want to advance past the last key if the iterator position
44 * is POS_MAX:
45 */
46 if (iter->flags & BTREE_ITER_IS_EXTENTS &&
47 (!interior_node ||
48 bkey_cmp_left_packed_byval(b, k, POS_MAX)))
49 return -1;
50 return 1;
51}
52
53static inline int btree_iter_pos_cmp(struct btree_iter *iter,
54 const struct btree *b,
55 const struct bkey_packed *k)
56{
 57 return __btree_iter_pos_cmp(iter, b, k, b->c.level != 0);
58}
59
60/* Btree node locking: */
61
62/*
63 * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
64 * succeed:
65 */
66void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
67{
68 struct btree_iter *linked;
69
70 EBUG_ON(iter->l[b->c.level].b != b);
71 EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
 72
 73 trans_for_each_iter_with_node(iter->trans, b, linked)
 74 linked->l[b->c.level].lock_seq += 2;
 75
 76 six_unlock_write(&b->c.lock);
77}
78
79void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
80{
81 struct btree_iter *linked;
82 unsigned readers = 0;
83
 84 EBUG_ON(btree_node_read_locked(iter, b->c.level));
 85
 86 trans_for_each_iter(iter->trans, linked)
87 if (linked->l[b->c.level].b == b &&
88 btree_node_read_locked(linked, b->c.level))
89 readers++;
90
91 /*
92 * Must drop our read locks before calling six_lock_write() -
93 * six_unlock() won't do wakeups until the reader count
94 * goes to 0, and it's safe because we have the node intent
95 * locked:
96 */
97 atomic64_sub(__SIX_VAL(read_lock, readers),
 98 &b->c.lock.state.counter);
 99 btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
 100 atomic64_add(__SIX_VAL(read_lock, readers),
 101 &b->c.lock.state.counter);
102}
103
104bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
105{
106 struct btree *b = btree_iter_node(iter, level);
107 int want = __btree_lock_want(iter, level);
108
109 if (!b || b == BTREE_ITER_NOT_END)
110 return false;
111
112 if (race_fault())
113 return false;
114
115 if (!six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) &&
116 !(iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1 &&
117 btree_node_lock_increment(iter, b, level, want)))
118 return false;
119
120 mark_btree_node_locked(iter, level, want);
121 return true;
122}
123
124static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
125{
126 struct btree *b = iter->l[level].b;
127
128 EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
129
130 if (!is_btree_node(iter, level))
131 return false;
132
133 if (btree_node_intent_locked(iter, level))
134 return true;
135
136 if (race_fault())
137 return false;
138
139 if (btree_node_locked(iter, level)
140 ? six_lock_tryupgrade(&b->c.lock)
141 : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
142 goto success;
143
 144 if (iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1 &&
145 btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
146 btree_node_unlock(iter, level);
147 goto success;
148 }
149
150 return false;
151success:
152 mark_btree_node_intent_locked(iter, level);
153 return true;
154}
155
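/*
 * Try to relock (or, with @upgrade, intent-lock) every level from iter->level
 * up to iter->locks_want.  If a level fails, all levels at and below it are
 * unlocked and marked unrelockable so the next traverse restarts from there.
 */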
156static inline bool btree_iter_get_locks(struct btree_iter *iter,
157 bool upgrade)
158{
159 unsigned l = iter->level;
160 int fail_idx = -1;
161
162 do {
163 if (!btree_iter_node(iter, l))
164 break;
165
166 if (!(upgrade
167 ? bch2_btree_node_upgrade(iter, l)
168 : bch2_btree_node_relock(iter, l))) {
169 fail_idx = l;
170 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
171 }
172
173 l++;
174 } while (l < iter->locks_want);
175
176 /*
177 * When we fail to get a lock, we have to ensure that any child nodes
178 * can't be relocked so bch2_btree_iter_traverse has to walk back up to
179 * the node that we failed to relock:
180 */
181 while (fail_idx >= 0) {
182 btree_node_unlock(iter, fail_idx);
183 iter->l[fail_idx].b = BTREE_ITER_NOT_END;
184 --fail_idx;
185 }
186
187 if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
188 iter->uptodate = BTREE_ITER_NEED_PEEK;
189
190 bch2_btree_trans_verify_locks(iter->trans);
191
192 return iter->uptodate < BTREE_ITER_NEED_RELOCK;
193}
194
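/*
 * May block taking @type lock on @b, so first check every other iterator in
 * the transaction: btree nodes must be locked in key order, and ancestors
 * before descendants.  If blocking here could deadlock, raise locks_want on
 * the offending iterators (when may_drop_locks allows it) and return false
 * so the transaction restarts rather than deadlocking.
 */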
195/* Slowpath: */
196bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
197 unsigned level,
198 struct btree_iter *iter,
199 enum six_lock_type type,
200 bool may_drop_locks)
201{
202 struct btree_iter *linked;
203 bool ret = true;
204
 205 /* Check if it's safe to block: */
 206 trans_for_each_iter(iter->trans, linked) {
207 if (!linked->nodes_locked)
208 continue;
209
 210 /* Must lock btree nodes in key order: */
211 if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
212 ret = false;
213
214 /*
215 * Can't block taking an intent lock if we have _any_ nodes read
216 * locked:
217 *
218 * - Our read lock blocks another thread with an intent lock on
219 * the same node from getting a write lock, and thus from
220 * dropping its intent lock
221 *
222 * - And the other thread may have multiple nodes intent locked:
223 * both the node we want to intent lock, and the node we
224 * already have read locked - deadlock:
225 */
226 if (type == SIX_LOCK_intent &&
227 linked->nodes_locked != linked->nodes_intent_locked) {
228 if (may_drop_locks) {
229 linked->locks_want = max_t(unsigned,
230 linked->locks_want,
231 __fls(linked->nodes_locked) + 1);
232 btree_iter_get_locks(linked, true);
233 }
234 ret = false;
235 }
236
237 /*
238 * Interior nodes must be locked before their descendants: if
239 * another iterator has possible descendants locked of the node
240 * we're about to lock, it must have the ancestors locked too:
241 */
242 if (linked->btree_id == iter->btree_id &&
243 level > __fls(linked->nodes_locked)) {
244 if (may_drop_locks) {
245 linked->locks_want =
246 max(level + 1, max_t(unsigned,
247 linked->locks_want,
248 iter->locks_want));
249 btree_iter_get_locks(linked, true);
250 }
251 ret = false;
252 }
253 }
254
 255 if (unlikely(!ret)) {
 256 trans_restart();
257 trace_trans_restart_would_deadlock(iter->trans->c,
258 iter->trans->ip);
259 return false;
260 }
 261
262 __btree_node_lock_type(iter->trans->c, b, type);
263 return true;
264}
265
266/* Btree iterator locking: */
267
268#ifdef CONFIG_BCACHEFS_DEBUG
 269void bch2_btree_iter_verify_locks(struct btree_iter *iter)
270{
271 unsigned l;
272
273 BUG_ON((iter->flags & BTREE_ITER_NOUNLOCK) &&
274 !btree_node_locked(iter, 0));
275
276 for (l = 0; btree_iter_node(iter, l); l++) {
277 if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
278 !btree_node_locked(iter, l))
279 continue;
280
281 BUG_ON(btree_lock_want(iter, l) !=
282 btree_node_locked_type(iter, l));
283 }
284}
 285
 286void bch2_btree_trans_verify_locks(struct btree_trans *trans)
 287{
 288 struct btree_iter *iter;
 289
290 trans_for_each_iter(trans, iter)
291 bch2_btree_iter_verify_locks(iter);
 292}
293#endif
294
295__flatten
 296static bool bch2_btree_iter_relock(struct btree_iter *iter)
297{
298 return iter->uptodate >= BTREE_ITER_NEED_RELOCK
299 ? btree_iter_get_locks(iter, false)
300 : true;
301}
302
303bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
304 unsigned new_locks_want)
305{
306 struct btree_iter *linked;
307
308 EBUG_ON(iter->locks_want >= new_locks_want);
309
310 iter->locks_want = new_locks_want;
311
312 if (btree_iter_get_locks(iter, true))
313 return true;
314
315 /*
316 * Ancestor nodes must be locked before child nodes, so set locks_want
317 * on iterators that might lock ancestors before us to avoid getting
318 * -EINTR later:
319 */
320 trans_for_each_iter(iter->trans, linked)
321 if (linked != iter &&
322 linked->btree_id == iter->btree_id &&
323 linked->locks_want < new_locks_want) {
324 linked->locks_want = new_locks_want;
325 btree_iter_get_locks(linked, true);
326 }
327
328 return false;
329}
330
331bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
332 unsigned new_locks_want)
333{
334 unsigned l = iter->level;
335
336 EBUG_ON(iter->locks_want >= new_locks_want);
337
338 iter->locks_want = new_locks_want;
339
340 do {
341 if (!btree_iter_node(iter, l))
342 break;
343
344 if (!bch2_btree_node_upgrade(iter, l)) {
345 iter->locks_want = l;
346 return false;
347 }
348
349 l++;
350 } while (l < iter->locks_want);
351
352 return true;
353}
354
355void __bch2_btree_iter_downgrade(struct btree_iter *iter,
356 unsigned downgrade_to)
357{
358 struct btree_iter *linked;
359 unsigned l;
360
361 /*
362 * We downgrade linked iterators as well because btree_iter_upgrade
363 * might have had to modify locks_want on linked iterators due to lock
364 * ordering:
365 */
 366 trans_for_each_iter(iter->trans, linked) {
367 unsigned new_locks_want = downgrade_to ?:
368 (linked->flags & BTREE_ITER_INTENT ? 1 : 0);
369
370 if (linked->locks_want <= new_locks_want)
371 continue;
372
373 linked->locks_want = new_locks_want;
374
375 while (linked->nodes_locked &&
376 (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
377 if (l > linked->level) {
378 btree_node_unlock(linked, l);
379 } else {
380 if (btree_node_intent_locked(linked, l)) {
 381 six_lock_downgrade(&linked->l[l].b->c.lock);
382 linked->nodes_intent_locked ^= 1 << l;
383 }
384 break;
385 }
386 }
 387 }
 388
 389 bch2_btree_trans_verify_locks(iter->trans);
390}
391
392bool bch2_btree_trans_relock(struct btree_trans *trans)
393{
394 struct btree_iter *iter;
395 bool ret = true;
396
397 trans_for_each_iter(trans, iter)
398 ret &= bch2_btree_iter_relock(iter);
399
400 return ret;
401}
402
403void bch2_btree_trans_unlock(struct btree_trans *trans)
404{
405 struct btree_iter *iter;
406
407 trans_for_each_iter(trans, iter)
408 __bch2_btree_iter_unlock(iter);
409}
410
411/* Btree transaction locking: */
412
413/* Btree iterator: */
414
415#ifdef CONFIG_BCACHEFS_DEBUG
416
417static void __bch2_btree_iter_verify(struct btree_iter *iter,
418 struct btree *b)
419{
 420 struct btree_iter_level *l = &iter->l[b->c.level];
421 struct btree_node_iter tmp = l->iter;
422 struct bkey_packed *k;
423
424 if (!debug_check_iterators(iter->trans->c))
425 return;
426
427 if (iter->uptodate > BTREE_ITER_NEED_PEEK)
428 return;
429
430 bch2_btree_node_iter_verify(&l->iter, b);
431
432 /*
433 * For interior nodes, the iterator will have skipped past
434 * deleted keys:
435 *
436 * For extents, the iterator may have skipped past deleted keys (but not
437 * whiteouts)
 438 */
 439 k = b->c.level || iter->flags & BTREE_ITER_IS_EXTENTS
 440 ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
 441 : bch2_btree_node_iter_prev_all(&tmp, b);
 442 if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
443 char buf[100];
444 struct bkey uk = bkey_unpack_key(b, k);
445
 446 bch2_bkey_to_text(&PBUF(buf), &uk);
 447 panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
448 buf, iter->pos.inode, iter->pos.offset);
449 }
450
451 k = bch2_btree_node_iter_peek_all(&l->iter, b);
 452 if (k && btree_iter_pos_cmp(iter, b, k) < 0) {
453 char buf[100];
454 struct bkey uk = bkey_unpack_key(b, k);
455
 456 bch2_bkey_to_text(&PBUF(buf), &uk);
457 panic("iter should be after current key:\n"
458 "iter pos %llu:%llu\n"
459 "cur key %s\n",
460 iter->pos.inode, iter->pos.offset, buf);
461 }
462
463 BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
464 (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS &&
465 !bkey_whiteout(&iter->k) &&
466 bch2_btree_node_iter_end(&l->iter));
467}
468
469void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
470{
471 struct btree_iter *linked;
472
473 if (!debug_check_iterators(iter->trans->c))
474 return;
475
 476 trans_for_each_iter_with_node(iter->trans, b, linked)
477 __bch2_btree_iter_verify(linked, b);
478}
479
480#else
481
482static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
483 struct btree *b) {}
484
485#endif
486
487static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
488 struct btree *b,
489 struct btree_node_iter *node_iter,
490 struct bset_tree *t,
491 struct bkey_packed *where,
492 unsigned clobber_u64s,
493 unsigned new_u64s)
494{
495 const struct bkey_packed *end = btree_bkey_last(b, t);
496 struct btree_node_iter_set *set;
497 unsigned offset = __btree_node_key_to_offset(b, where);
498 int shift = new_u64s - clobber_u64s;
271a3d3a 499 unsigned old_end = t->end_offset - shift;
1c6fdbd8
KO
500
501 btree_node_iter_for_each(node_iter, set)
502 if (set->end == old_end)
503 goto found;
504
505 /* didn't find the bset in the iterator - might have to readd it: */
506 if (new_u64s &&
a00fd8c5 507 btree_iter_pos_cmp(iter, b, where) > 0) {
1c6fdbd8
KO
508 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
509
510 bch2_btree_node_iter_push(node_iter, b, where, end);
511
c43a6ef9 512 if (!b->c.level &&
1c6fdbd8
KO
513 node_iter == &iter->l[0].iter)
514 bkey_disassemble(b,
515 bch2_btree_node_iter_peek_all(node_iter, b),
516 &iter->k);
517 }
518 return;
519found:
271a3d3a 520 set->end = t->end_offset;
1c6fdbd8
KO
521
522 /* Iterator hasn't gotten to the key that changed yet: */
523 if (set->k < offset)
524 return;
525
526 if (new_u64s &&
a00fd8c5 527 btree_iter_pos_cmp(iter, b, where) > 0) {
1c6fdbd8
KO
528 set->k = offset;
529 } else if (set->k < offset + clobber_u64s) {
530 set->k = offset + new_u64s;
531 if (set->k == set->end)
532 bch2_btree_node_iter_set_drop(node_iter, set);
533 } else {
534 set->k = (int) set->k + shift;
535 goto iter_current_key_not_modified;
536 }
537
538 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
539
540 bch2_btree_node_iter_sort(node_iter, b);
c43a6ef9 541 if (!b->c.level && node_iter == &iter->l[0].iter) {
56e0e7c7
KO
542 /*
543 * not legal to call bkey_debugcheck() here, because we're
544 * called midway through the update path after update has been
545 * marked but before deletes have actually happened:
546 */
547#if 0
1c6fdbd8 548 __btree_iter_peek_all(iter, &iter->l[0], &iter->k);
56e0e7c7
KO
549#endif
550 struct btree_iter_level *l = &iter->l[0];
551 struct bkey_packed *k =
552 bch2_btree_node_iter_peek_all(&l->iter, l->b);
553
554 if (unlikely(!k))
555 iter->k.type = KEY_TYPE_deleted;
556 else
557 bkey_disassemble(l->b, k, &iter->k);
558 }
1c6fdbd8
KO
559iter_current_key_not_modified:
560
561 /*
562 * Interior nodes are special because iterators for interior nodes don't
563 * obey the usual invariants regarding the iterator position:
564 *
565 * We may have whiteouts that compare greater than the iterator
566 * position, and logically should be in the iterator, but that we
567 * skipped past to find the first live key greater than the iterator
568 * position. This becomes an issue when we insert a new key that is
569 * greater than the current iterator position, but smaller than the
570 * whiteouts we've already skipped past - this happens in the course of
571 * a btree split.
572 *
573 * We have to rewind the iterator past to before those whiteouts here,
574 * else bkey_node_iter_prev() is not going to work and who knows what
575 * else would happen. And we have to do it manually, because here we've
576 * already done the insert and the iterator is currently inconsistent:
577 *
578 * We've got multiple competing invariants, here - we have to be careful
579 * about rewinding iterators for interior nodes, because they should
580 * always point to the key for the child node the btree iterator points
581 * to.
582 */
c43a6ef9 583 if (b->c.level && new_u64s &&
a00fd8c5 584 btree_iter_pos_cmp(iter, b, where) > 0) {
216c9fac 585 struct bset_tree *t, *where_set = bch2_bkey_to_bset_inlined(b, where);
1c6fdbd8
KO
586 struct bkey_packed *k;
587
588 for_each_bset(b, t) {
216c9fac 589 if (where_set == t)
1c6fdbd8
KO
590 continue;
591
592 k = bch2_bkey_prev_all(b, t,
593 bch2_btree_node_iter_bset_pos(node_iter, b, t));
594 if (k &&
a00fd8c5 595 bkey_iter_cmp(b, k, where) > 0) {
1c6fdbd8
KO
596 struct btree_node_iter_set *set;
597 unsigned offset =
598 __btree_node_key_to_offset(b, bkey_next(k));
599
600 btree_node_iter_for_each(node_iter, set)
601 if (set->k == offset) {
602 set->k = __btree_node_key_to_offset(b, k);
603 bch2_btree_node_iter_sort(node_iter, b);
604 goto next_bset;
605 }
606
607 bch2_btree_node_iter_push(node_iter, b, k,
608 btree_bkey_last(b, t));
609 }
610next_bset:
611 t = t;
612 }
613 }
614}
615
616void bch2_btree_node_iter_fix(struct btree_iter *iter,
216c9fac
KO
617 struct btree *b,
618 struct btree_node_iter *node_iter,
619 struct bkey_packed *where,
620 unsigned clobber_u64s,
621 unsigned new_u64s)
1c6fdbd8 622{
216c9fac 623 struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
1c6fdbd8
KO
624 struct btree_iter *linked;
625
c43a6ef9 626 if (node_iter != &iter->l[b->c.level].iter)
1c6fdbd8
KO
627 __bch2_btree_node_iter_fix(iter, b, node_iter, t,
628 where, clobber_u64s, new_u64s);
629
 630 trans_for_each_iter_with_node(iter->trans, b, linked)
 631 __bch2_btree_node_iter_fix(linked, b,
 632 &linked->l[b->c.level].iter, t,
 633 where, clobber_u64s, new_u64s);
634}
635
636static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
637 struct btree_iter_level *l,
638 struct bkey *u,
639 struct bkey_packed *k)
640{
641 struct bkey_s_c ret;
642
643 if (unlikely(!k)) {
644 /*
645 * signal to bch2_btree_iter_peek_slot() that we're currently at
646 * a hole
647 */
26609b61 648 u->type = KEY_TYPE_deleted;
1c6fdbd8
KO
649 return bkey_s_c_null;
650 }
651
652 ret = bkey_disassemble(l->b, k, u);
653
9e5e5b9e
KO
654 if (debug_check_bkeys(iter->trans->c))
655 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
1c6fdbd8
KO
656
657 return ret;
658}
659
660/* peek_all() doesn't skip deleted keys */
661static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
662 struct btree_iter_level *l,
663 struct bkey *u)
664{
665 return __btree_iter_unpack(iter, l, u,
666 bch2_btree_node_iter_peek_all(&l->iter, l->b));
667}
668
669static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
670 struct btree_iter_level *l)
671{
672 return __btree_iter_unpack(iter, l, &iter->k,
673 bch2_btree_node_iter_peek(&l->iter, l->b));
674}
675
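/*
 * Advance the node iterator at @l until it is at or past the iterator
 * position, giving up (and returning false) if more than @max_advance keys
 * would have to be skipped; a negative @max_advance means no limit.
 */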
676static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
677 struct btree_iter_level *l,
678 int max_advance)
 679{
680 struct bkey_packed *k;
681 int nr_advanced = 0;
682
683 while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
684 btree_iter_pos_cmp(iter, l->b, k) < 0) {
685 if (max_advance > 0 && nr_advanced >= max_advance)
686 return false;
687
688 bch2_btree_node_iter_advance(&l->iter, l->b);
689 nr_advanced++;
690 }
691
692 return true;
693}
694
695/*
696 * Verify that iterator for parent node points to child node:
697 */
698static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
699{
700 struct btree_iter_level *l;
701 unsigned plevel;
702 bool parent_locked;
703 struct bkey_packed *k;
704
705 if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
706 return;
707
c43a6ef9 708 plevel = b->c.level + 1;
1c6fdbd8
KO
709 if (!btree_iter_node(iter, plevel))
710 return;
711
712 parent_locked = btree_node_locked(iter, plevel);
713
714 if (!bch2_btree_node_relock(iter, plevel))
715 return;
716
717 l = &iter->l[plevel];
718 k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
719 if (!k ||
720 bkey_deleted(k) ||
721 bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
722 char buf[100];
723 struct bkey uk = bkey_unpack_key(b, k);
724
319f9ac3 725 bch2_bkey_to_text(&PBUF(buf), &uk);
1c6fdbd8
KO
726 panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
727 buf, b->key.k.p.inode, b->key.k.p.offset);
728 }
729
730 if (!parent_locked)
c43a6ef9 731 btree_node_unlock(iter, b->c.level + 1);
1c6fdbd8
KO
732}
733
1c6fdbd8
KO
734static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
735 struct btree *b)
736{
 737 return __btree_iter_pos_cmp(iter, NULL,
 738 bkey_to_packed(&b->key), true) < 0;
739}
740
741static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
742 struct btree *b)
743{
c43a6ef9 744 return iter->btree_id == b->c.btree_id &&
1c6fdbd8
KO
745 bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
746 !btree_iter_pos_after_node(iter, b);
747}
748
749static inline void __btree_iter_init(struct btree_iter *iter,
a00fd8c5 750 unsigned level)
1c6fdbd8 751{
a00fd8c5
KO
752 struct btree_iter_level *l = &iter->l[level];
753
754 bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos);
1c6fdbd8 755
a00fd8c5
KO
756 if (iter->flags & BTREE_ITER_IS_EXTENTS)
757 btree_iter_advance_to_pos(iter, l, -1);
1c6fdbd8
KO
758
759 /* Skip to first non whiteout: */
a00fd8c5
KO
760 if (level)
761 bch2_btree_node_iter_peek(&l->iter, l->b);
1c6fdbd8
KO
762
763 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
764}
765
766static inline void btree_iter_node_set(struct btree_iter *iter,
767 struct btree *b)
768{
769 btree_iter_verify_new_node(iter, b);
770
771 EBUG_ON(!btree_iter_pos_in_node(iter, b));
c43a6ef9 772 EBUG_ON(b->c.lock.state.seq & 1);
1c6fdbd8 773
c43a6ef9
KO
774 iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
775 iter->l[b->c.level].b = b;
776 __btree_iter_init(iter, b->c.level);
1c6fdbd8
KO
777}
778
779/*
780 * A btree node is being replaced - update the iterator to point to the new
781 * node:
782 */
783void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
784{
785 enum btree_node_locked_type t;
786 struct btree_iter *linked;
787
0f238367 788 trans_for_each_iter(iter->trans, linked)
1c6fdbd8
KO
789 if (btree_iter_pos_in_node(linked, b)) {
790 /*
791 * bch2_btree_iter_node_drop() has already been called -
792 * the old node we're replacing has already been
793 * unlocked and the pointer invalidated
794 */
c43a6ef9 795 BUG_ON(btree_node_locked(linked, b->c.level));
1c6fdbd8 796
c43a6ef9 797 t = btree_lock_want(linked, b->c.level);
1c6fdbd8 798 if (t != BTREE_NODE_UNLOCKED) {
c43a6ef9
KO
799 six_lock_increment(&b->c.lock, (enum six_lock_type) t);
800 mark_btree_node_locked(linked, b->c.level, (enum six_lock_type) t);
1c6fdbd8
KO
801 }
802
803 btree_iter_node_set(linked, b);
804 }
805
c43a6ef9 806 six_unlock_intent(&b->c.lock);
1c6fdbd8
KO
807}
808
809void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
810{
811 struct btree_iter *linked;
 812 unsigned level = b->c.level;
 813
 814 trans_for_each_iter(iter->trans, linked)
 815 if (linked->l[level].b == b) {
 816 __btree_node_unlock(linked, level);
817 linked->l[level].b = BTREE_ITER_NOT_END;
818 }
819}
820
821/*
822 * A btree node has been modified in such a way as to invalidate iterators - fix
823 * them:
824 */
825void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
826{
827 struct btree_iter *linked;
828
 829 trans_for_each_iter_with_node(iter->trans, b, linked)
 830 __btree_iter_init(linked, b->c.level);
831}
832
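/*
 * Lock the btree root.  The root may be freed or replaced while we wait for
 * the lock, so after locking we re-check that it is still the current root
 * at the expected level, and retry if not.  Returns 1 if the tree is
 * shallower than @depth_want.
 */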
833static inline int btree_iter_lock_root(struct btree_iter *iter,
834 unsigned depth_want)
835{
 836 struct bch_fs *c = iter->trans->c;
837 struct btree *b;
838 enum six_lock_type lock_type;
839 unsigned i;
840
841 EBUG_ON(iter->nodes_locked);
842
843 while (1) {
844 b = READ_ONCE(c->btree_roots[iter->btree_id].b);
c43a6ef9 845 iter->level = READ_ONCE(b->c.level);
1c6fdbd8
KO
846
847 if (unlikely(iter->level < depth_want)) {
848 /*
849 * the root is at a lower depth than the depth we want:
850 * got to the end of the btree, or we're walking nodes
851 * greater than some depth and there are no nodes >=
852 * that depth
853 */
854 iter->level = depth_want;
855 iter->l[iter->level].b = NULL;
8812600c 856 return 1;
1c6fdbd8
KO
857 }
858
859 lock_type = __btree_lock_want(iter, iter->level);
860 if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
861 iter, lock_type, true)))
862 return -EINTR;
863
864 if (likely(b == c->btree_roots[iter->btree_id].b &&
c43a6ef9 865 b->c.level == iter->level &&
1c6fdbd8
KO
866 !race_fault())) {
867 for (i = 0; i < iter->level; i++)
868 iter->l[i].b = BTREE_ITER_NOT_END;
869 iter->l[iter->level].b = b;
870
871 mark_btree_node_locked(iter, iter->level, lock_type);
872 btree_iter_node_set(iter, b);
873 return 0;
874
875 }
876
c43a6ef9 877 six_unlock_type(&b->c.lock, lock_type);
1c6fdbd8
KO
878 }
879}
880
881noinline
882static void btree_iter_prefetch(struct btree_iter *iter)
883{
9e5e5b9e 884 struct bch_fs *c = iter->trans->c;
1c6fdbd8
KO
885 struct btree_iter_level *l = &iter->l[iter->level];
886 struct btree_node_iter node_iter = l->iter;
887 struct bkey_packed *k;
888 BKEY_PADDED(k) tmp;
9e5e5b9e 889 unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
1c6fdbd8
KO
890 ? (iter->level > 1 ? 0 : 2)
891 : (iter->level > 1 ? 1 : 16);
892 bool was_locked = btree_node_locked(iter, iter->level);
893
894 while (nr) {
895 if (!bch2_btree_node_relock(iter, iter->level))
896 return;
897
898 bch2_btree_node_iter_advance(&node_iter, l->b);
899 k = bch2_btree_node_iter_peek(&node_iter, l->b);
900 if (!k)
901 break;
902
903 bch2_bkey_unpack(l->b, &tmp.k, k);
9e5e5b9e 904 bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
1c6fdbd8
KO
905 }
906
907 if (!was_locked)
908 btree_node_unlock(iter, iter->level);
909}
910
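/*
 * Descend one level: unpack the key the current node iterator points at,
 * look up and lock the child node it points to, and make that child the
 * iterator's current level.
 */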
911static inline int btree_iter_down(struct btree_iter *iter)
912{
 913 struct bch_fs *c = iter->trans->c;
914 struct btree_iter_level *l = &iter->l[iter->level];
915 struct btree *b;
916 unsigned level = iter->level - 1;
917 enum six_lock_type lock_type = __btree_lock_want(iter, level);
918 BKEY_PADDED(k) tmp;
919
920 BUG_ON(!btree_node_locked(iter, iter->level));
921
922 bch2_bkey_unpack(l->b, &tmp.k,
923 bch2_btree_node_iter_peek(&l->iter, l->b));
924
 925 b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, true);
926 if (unlikely(IS_ERR(b)))
927 return PTR_ERR(b);
928
929 mark_btree_node_locked(iter, level, lock_type);
930 btree_iter_node_set(iter, b);
931
932 if (iter->flags & BTREE_ITER_PREFETCH)
933 btree_iter_prefetch(iter);
934
935 iter->level = level;
936
937 return 0;
938}
939
940static void btree_iter_up(struct btree_iter *iter)
941{
942 btree_node_unlock(iter, iter->level++);
943}
944
945int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
946
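/*
 * Slow path for traversal failures: drop all locks, sort the transaction's
 * iterators into lock order, then re-traverse each one in that order, going
 * back to the start if any of them fails again.
 */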
947static int __btree_iter_traverse_all(struct btree_trans *trans,
948 struct btree_iter *iter, int ret)
1c6fdbd8 949{
e542029e
KO
950 struct bch_fs *c = trans->c;
951 u8 sorted[BTREE_ITER_MAX];
952 unsigned i, nr_sorted = 0;
953
954 trans_for_each_iter(trans, iter)
955 sorted[nr_sorted++] = iter - trans->iters;
956
957#define btree_iter_cmp_by_idx(_l, _r) \
958 btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
959
960 bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
961#undef btree_iter_cmp_by_idx
962
 963retry_all:
 964 bch2_btree_trans_unlock(trans);
 965
 966 if (unlikely(ret == -ENOMEM)) {
967 struct closure cl;
968
969 closure_init_stack(&cl);
970
971 do {
972 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
973 closure_sync(&cl);
974 } while (ret);
975 }
976
 977 if (unlikely(ret == -EIO)) {
 978 trans->error = true;
979 iter->flags |= BTREE_ITER_ERROR;
980 iter->l[iter->level].b = BTREE_ITER_NOT_END;
981 goto out;
982 }
983
984 BUG_ON(ret && ret != -EINTR);
985
1c6fdbd8 986 /* Now, redo traversals in correct order: */
e542029e
KO
987 for (i = 0; i < nr_sorted; i++) {
988 iter = &trans->iters[sorted[i]];
1c6fdbd8 989
e542029e
KO
990 do {
991 ret = __bch2_btree_iter_traverse(iter);
992 } while (ret == -EINTR);
1c6fdbd8 993
e542029e
KO
994 if (ret)
995 goto retry_all;
996 }
 997
 998 ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0;
999out:
1000 bch2_btree_cache_cannibalize_unlock(c);
1001 return ret;
 1002}
 1003
1004int bch2_btree_iter_traverse_all(struct btree_trans *trans)
1005{
1006 return __btree_iter_traverse_all(trans, NULL, 0);
1007}
1008
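/*
 * Walk up the iterator, unlocking and invalidating any levels that can no
 * longer be relocked (or, if @check_pos, whose node no longer covers the
 * iterator position); returns the first level, walking up from iter->level,
 * that is still usable.
 */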
1009static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
1010 bool check_pos)
1011{
1012 unsigned l = iter->level;
1013
1014 while (btree_iter_node(iter, l) &&
1015 !(is_btree_node(iter, l) &&
1016 bch2_btree_node_relock(iter, l) &&
1017 (!check_pos ||
1018 btree_iter_pos_in_node(iter, iter->l[l].b)))) {
1019 btree_node_unlock(iter, l);
1020 iter->l[l].b = BTREE_ITER_NOT_END;
1021 l++;
1022 }
1023
1024 return l;
1025}
1026
1027/*
1028 * This is the main state machine for walking down the btree - walks down to a
1029 * specified depth
1030 *
1031 * Returns 0 on success, -EIO on error (error reading in a btree node).
1032 *
1033 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
b7607ce9 1034 * stashed in the iterator and returned from bch2_trans_exit().
1c6fdbd8
KO
1035 */
1036int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
1037{
1038 unsigned depth_want = iter->level;
1039
1040 if (unlikely(iter->level >= BTREE_MAX_DEPTH))
1041 return 0;
1042
0f238367 1043 if (bch2_btree_iter_relock(iter))
1c6fdbd8
KO
1044 return 0;
1045
1c6fdbd8
KO
1046 /*
1047 * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
1048 * here unnecessary
1049 */
1050 iter->level = btree_iter_up_until_locked(iter, true);
1051
1052 /*
1053 * If we've got a btree node locked (i.e. we aren't about to relock the
1054 * root) - advance its node iterator if necessary:
1055 *
1056 * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
1057 */
a00fd8c5
KO
1058 if (btree_iter_node(iter, iter->level))
1059 btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);
1c6fdbd8
KO
1060
1061 /*
1062 * Note: iter->nodes[iter->level] may be temporarily NULL here - that
1063 * would indicate to other code that we got to the end of the btree,
1064 * here it indicates that relocking the root failed - it's critical that
1065 * btree_iter_lock_root() comes next and that it can't fail
1066 */
1067 while (iter->level > depth_want) {
1068 int ret = btree_iter_node(iter, iter->level)
1069 ? btree_iter_down(iter)
1070 : btree_iter_lock_root(iter, depth_want);
1071 if (unlikely(ret)) {
8812600c
KO
1072 if (ret == 1)
1073 return 0;
1074
1c6fdbd8
KO
1075 iter->level = depth_want;
1076 iter->l[iter->level].b = BTREE_ITER_NOT_END;
1077 return ret;
1078 }
1079 }
1080
1081 iter->uptodate = BTREE_ITER_NEED_PEEK;
 1082
 1083 bch2_btree_trans_verify_locks(iter->trans);
 1084 __bch2_btree_iter_verify(iter, iter->l[iter->level].b);
1085 return 0;
1086}
1087
1088int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
1089{
1090 int ret;
1091
1092 ret = __bch2_btree_iter_traverse(iter);
1093 if (unlikely(ret))
bf7b87a4 1094 ret = __btree_iter_traverse_all(iter->trans, iter, ret);
1c6fdbd8 1095
1c6fdbd8
KO
1096 return ret;
1097}
1098
1099static inline void bch2_btree_iter_checks(struct btree_iter *iter,
1100 enum btree_iter_type type)
1101{
1102 EBUG_ON(iter->btree_id >= BTREE_ID_NR);
1c6fdbd8 1103 EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
ab5c63f5 1104 (btree_node_type_is_extents(iter->btree_id) &&
1c6fdbd8
KO
1105 type != BTREE_ITER_NODES));
1106
0f238367 1107 bch2_btree_trans_verify_locks(iter->trans);
1c6fdbd8
KO
1108}
1109
1110/* Iterate across nodes (leaf and interior nodes) */
1111
1112struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1113{
1114 struct btree *b;
1115 int ret;
1116
1117 bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
1118
1119 if (iter->uptodate == BTREE_ITER_UPTODATE)
1120 return iter->l[iter->level].b;
1121
1122 ret = bch2_btree_iter_traverse(iter);
1123 if (ret)
1124 return NULL;
1125
1126 b = btree_iter_node(iter, iter->level);
1127 if (!b)
1128 return NULL;
1129
1130 BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
1131
1132 iter->pos = b->key.k.p;
1133 iter->uptodate = BTREE_ITER_UPTODATE;
1134
1135 return b;
1136}
1137
1138struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
1139{
1140 struct btree *b;
1141 int ret;
1142
1143 bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
1144
1145 /* already got to end? */
1146 if (!btree_iter_node(iter, iter->level))
1147 return NULL;
1148
1dd7f9d9
KO
1149 bch2_trans_cond_resched(iter->trans);
1150
1c6fdbd8
KO
1151 btree_iter_up(iter);
1152
1153 if (!bch2_btree_node_relock(iter, iter->level))
1154 btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
1155
1156 ret = bch2_btree_iter_traverse(iter);
1157 if (ret)
1158 return NULL;
1159
1160 /* got to end? */
1161 b = btree_iter_node(iter, iter->level);
1162 if (!b)
1163 return NULL;
1164
1165 if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
1166 /*
1167 * Haven't gotten to the end of the parent node: go back down to
1168 * the next child node
1169 */
1170
1171 /*
1172 * We don't really want to be unlocking here except we can't
1173 * directly tell btree_iter_traverse() "traverse to this level"
1174 * except by setting iter->level, so we have to unlock so we
1175 * don't screw up our lock invariants:
1176 */
1177 if (btree_node_read_locked(iter, iter->level))
1178 btree_node_unlock(iter, iter->level);
1179
1180 /* ick: */
1181 iter->pos = iter->btree_id == BTREE_ID_INODES
1182 ? btree_type_successor(iter->btree_id, iter->pos)
1183 : bkey_successor(iter->pos);
1184 iter->level = depth;
1185
1186 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1187 ret = bch2_btree_iter_traverse(iter);
1188 if (ret)
1189 return NULL;
1190
1191 b = iter->l[iter->level].b;
1192 }
1193
1194 iter->pos = b->key.k.p;
1195 iter->uptodate = BTREE_ITER_UPTODATE;
1196
1197 return b;
1198}
1199
1200/* Iterate across keys (in leaf nodes only) */
1201
1202void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
1203{
1204 struct btree_iter_level *l = &iter->l[0];
1c6fdbd8
KO
1205
1206 EBUG_ON(iter->level != 0);
1207 EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
1208 EBUG_ON(!btree_node_locked(iter, 0));
1209 EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
1210
1211 iter->pos = new_pos;
1212 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1213
a00fd8c5 1214 btree_iter_advance_to_pos(iter, l, -1);
1c6fdbd8 1215
a00fd8c5
KO
1216 if (bch2_btree_node_iter_end(&l->iter) &&
1217 btree_iter_pos_after_node(iter, l->b))
1c6fdbd8 1218 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1c6fdbd8
KO
1219}
1220
1221void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
1222{
1223 int cmp = bkey_cmp(new_pos, iter->pos);
1224 unsigned level;
1225
1226 if (!cmp)
1227 return;
1228
1229 iter->pos = new_pos;
1230
1231 level = btree_iter_up_until_locked(iter, true);
1232
1233 if (btree_iter_node(iter, level)) {
1c6fdbd8
KO
1234 /*
1235 * We might have to skip over many keys, or just a few: try
1236 * advancing the node iterator, and if we have to skip over too
1237 * many keys just reinit it (or if we're rewinding, since that
1238 * is expensive).
1239 */
a00fd8c5
KO
1240 if (cmp < 0 ||
1241 !btree_iter_advance_to_pos(iter, &iter->l[level], 8))
1242 __btree_iter_init(iter, level);
1c6fdbd8
KO
1243
1244 /* Don't leave it locked if we're not supposed to: */
1245 if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED)
1246 btree_node_unlock(iter, level);
1247 }
1248
1249 if (level != iter->level)
1250 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1251 else
1252 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1253}
1254
1255static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
1256{
1257 struct btree_iter_level *l = &iter->l[0];
1258 struct bkey_s_c ret = { .k = &iter->k };
1259
1260 if (!bkey_deleted(&iter->k)) {
1261 EBUG_ON(bch2_btree_node_iter_end(&l->iter));
1262 ret.v = bkeyp_val(&l->b->format,
1263 __bch2_btree_node_iter_peek_all(&l->iter, l->b));
1264 }
1265
9e5e5b9e 1266 if (debug_check_bkeys(iter->trans->c) &&
1c6fdbd8 1267 !bkey_deleted(ret.k))
9e5e5b9e 1268 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
1c6fdbd8
KO
1269 return ret;
1270}
1271
1272struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
1273{
1274 struct btree_iter_level *l = &iter->l[0];
1275 struct bkey_s_c k;
1276 int ret;
1277
1278 bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1279
1280 if (iter->uptodate == BTREE_ITER_UPTODATE)
1281 return btree_iter_peek_uptodate(iter);
1282
1283 while (1) {
1284 ret = bch2_btree_iter_traverse(iter);
1285 if (unlikely(ret))
1286 return bkey_s_c_err(ret);
1287
1288 k = __btree_iter_peek(iter, l);
1289 if (likely(k.k))
1290 break;
1291
1292 /* got to the end of the leaf, iterator needs to be traversed: */
1293 iter->pos = l->b->key.k.p;
1294 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1295
1296 if (!bkey_cmp(iter->pos, POS_MAX))
1297 return bkey_s_c_null;
1298
1299 iter->pos = btree_type_successor(iter->btree_id, iter->pos);
1300 }
1301
1302 /*
1303 * iter->pos should always be equal to the key we just
1304 * returned - except extents can straddle iter->pos:
1305 */
1306 if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
1307 bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1308 iter->pos = bkey_start_pos(k.k);
1309
1310 iter->uptodate = BTREE_ITER_UPTODATE;
1311 return k;
1312}
1313
1314static noinline
1315struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
1316{
1317 struct btree_iter_level *l = &iter->l[0];
1318
1319 iter->pos = l->b->key.k.p;
1320 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1321
1322 if (!bkey_cmp(iter->pos, POS_MAX))
1323 return bkey_s_c_null;
1324
1325 iter->pos = btree_type_successor(iter->btree_id, iter->pos);
1326
1327 return bch2_btree_iter_peek(iter);
1328}
1329
1330struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
1331{
1332 struct btree_iter_level *l = &iter->l[0];
1333 struct bkey_packed *p;
1334 struct bkey_s_c k;
1335
1336 bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1337
1338 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1339 k = bch2_btree_iter_peek(iter);
1340 if (IS_ERR_OR_NULL(k.k))
1341 return k;
1342 }
1343
1344 do {
a00fd8c5 1345 bch2_btree_node_iter_advance(&l->iter, l->b);
1c6fdbd8
KO
1346 p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1347 if (unlikely(!p))
1348 return bch2_btree_iter_peek_next_leaf(iter);
1349 } while (bkey_whiteout(p));
1350
1351 k = __btree_iter_unpack(iter, l, &iter->k, p);
1352
1353 EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
1354 iter->pos = bkey_start_pos(k.k);
1355 return k;
1356}
1357
1358struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
1359{
1360 struct btree_iter_level *l = &iter->l[0];
1361 struct bkey_packed *p;
1362 struct bkey_s_c k;
1363 int ret;
1364
1365 bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1366
1367 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1368 k = bch2_btree_iter_peek(iter);
1369 if (IS_ERR(k.k))
1370 return k;
1371 }
1372
1373 while (1) {
1374 p = bch2_btree_node_iter_prev(&l->iter, l->b);
1375 if (likely(p))
1376 break;
1377
1378 iter->pos = l->b->data->min_key;
1379 if (!bkey_cmp(iter->pos, POS_MIN))
1380 return bkey_s_c_null;
1381
1382 bch2_btree_iter_set_pos(iter,
1383 btree_type_predecessor(iter->btree_id, iter->pos));
1384
1385 ret = bch2_btree_iter_traverse(iter);
1386 if (unlikely(ret))
1387 return bkey_s_c_err(ret);
1388
1389 p = bch2_btree_node_iter_peek(&l->iter, l->b);
1390 if (p)
1391 break;
1392 }
1393
1394 k = __btree_iter_unpack(iter, l, &iter->k, p);
1395
1396 EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
1397
1398 iter->pos = bkey_start_pos(k.k);
1399 iter->uptodate = BTREE_ITER_UPTODATE;
1400 return k;
1401}
1402
1403static inline struct bkey_s_c
271a3d3a 1404__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
1c6fdbd8
KO
1405{
1406 struct btree_iter_level *l = &iter->l[0];
271a3d3a 1407 struct btree_node_iter node_iter;
1c6fdbd8
KO
1408 struct bkey_s_c k;
1409 struct bkey n;
1410 int ret;
1411
1412recheck:
1413 while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
1414 bkey_deleted(k.k) &&
1415 bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
a00fd8c5 1416 bch2_btree_node_iter_advance(&l->iter, l->b);
1c6fdbd8 1417
271a3d3a
KO
1418 /*
1419 * iterator is now at the correct position for inserting at iter->pos,
1420 * but we need to keep iterating until we find the first non whiteout so
1421 * we know how big a hole we have, if any:
1422 */
1423
1424 node_iter = l->iter;
1425 if (k.k && bkey_whiteout(k.k))
1426 k = __btree_iter_unpack(iter, l, &iter->k,
1427 bch2_btree_node_iter_peek(&node_iter, l->b));
1428
1c6fdbd8
KO
1429 /*
1430 * If we got to the end of the node, check if we need to traverse to the
1431 * next node:
1432 */
1433 if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
1434 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1435 ret = bch2_btree_iter_traverse(iter);
1436 if (unlikely(ret))
1437 return bkey_s_c_err(ret);
1438
1439 goto recheck;
1440 }
1441
1442 if (k.k &&
1443 !bkey_whiteout(k.k) &&
1444 bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
271a3d3a
KO
1445 /*
1446 * if we skipped forward to find the first non whiteout and
1447 * there _wasn't_ actually a hole, we want the iterator to be
1448 * pointed at the key we found:
1449 */
1450 l->iter = node_iter;
1451
1c6fdbd8
KO
1452 EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
1453 EBUG_ON(bkey_deleted(k.k));
1454 iter->uptodate = BTREE_ITER_UPTODATE;
1455 return k;
1456 }
1457
1458 /* hole */
271a3d3a
KO
1459
1460 /* holes can't span inode numbers: */
1461 if (iter->pos.offset == KEY_OFFSET_MAX) {
1462 if (iter->pos.inode == KEY_INODE_MAX)
1463 return bkey_s_c_null;
1464
1465 iter->pos = bkey_successor(iter->pos);
1466 goto recheck;
1467 }
1468
1469 if (!k.k)
1470 k.k = &l->b->key.k;
1471
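	/*
	 * We're in a hole: synthesize a key in iter->k covering the gap from
	 * iter->pos up to the start of the next key (capped at KEY_SIZE_MAX),
	 * and return it with a NULL value.
	 */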
1472 bkey_init(&n);
1473 n.p = iter->pos;
1474 bch2_key_resize(&n,
1475 min_t(u64, KEY_SIZE_MAX,
1476 (k.k->p.inode == n.p.inode
1477 ? bkey_start_offset(k.k)
1478 : KEY_OFFSET_MAX) -
1479 n.p.offset));
1480
 1481 EBUG_ON(!n.size);
 1482
1483 iter->k = n;
1484 iter->uptodate = BTREE_ITER_UPTODATE;
1485 return (struct bkey_s_c) { &iter->k, NULL };
1486}
1c6fdbd8 1487
271a3d3a
KO
1488static inline struct bkey_s_c
1489__bch2_btree_iter_peek_slot(struct btree_iter *iter)
1490{
1491 struct btree_iter_level *l = &iter->l[0];
1492 struct bkey_s_c k;
1493 int ret;
1c6fdbd8 1494
271a3d3a
KO
1495 if (iter->flags & BTREE_ITER_IS_EXTENTS)
1496 return __bch2_btree_iter_peek_slot_extents(iter);
1c6fdbd8 1497
271a3d3a
KO
1498recheck:
1499 while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
1500 bkey_deleted(k.k) &&
1501 bkey_cmp(k.k->p, iter->pos) == 0)
a00fd8c5 1502 bch2_btree_node_iter_advance(&l->iter, l->b);
1c6fdbd8 1503
271a3d3a
KO
1504 /*
1505 * If we got to the end of the node, check if we need to traverse to the
1506 * next node:
1507 */
1508 if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
1509 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1510 ret = bch2_btree_iter_traverse(iter);
1511 if (unlikely(ret))
1512 return bkey_s_c_err(ret);
1c6fdbd8 1513
271a3d3a 1514 goto recheck;
1c6fdbd8
KO
1515 }
1516
271a3d3a
KO
1517 if (k.k &&
1518 !bkey_deleted(k.k) &&
1519 !bkey_cmp(iter->pos, k.k->p)) {
1520 iter->uptodate = BTREE_ITER_UPTODATE;
1521 return k;
1522 } else {
1523 /* hole */
1524 bkey_init(&iter->k);
1525 iter->k.p = iter->pos;
1526
1527 iter->uptodate = BTREE_ITER_UPTODATE;
1528 return (struct bkey_s_c) { &iter->k, NULL };
1529 }
1c6fdbd8
KO
1530}
1531
1532struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
1533{
1534 int ret;
1535
1536 bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
1537
1538 if (iter->uptodate == BTREE_ITER_UPTODATE)
1539 return btree_iter_peek_uptodate(iter);
1540
1541 ret = bch2_btree_iter_traverse(iter);
1542 if (unlikely(ret))
1543 return bkey_s_c_err(ret);
1544
1545 return __bch2_btree_iter_peek_slot(iter);
1546}
1547
1548struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
1549{
1550 bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
1551
1552 iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
1553
1554 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1555 /*
1556 * XXX: when we just need to relock we should be able to avoid
1557 * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
1558 * for that to work
1559 */
1560 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1561
1562 return bch2_btree_iter_peek_slot(iter);
1563 }
1564
1565 if (!bkey_deleted(&iter->k))
a00fd8c5 1566 bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b);
1c6fdbd8
KO
1567
1568 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1569
1570 return __bch2_btree_iter_peek_slot(iter);
1571}
1572
9e5e5b9e
KO
1573static inline void bch2_btree_iter_init(struct btree_trans *trans,
1574 struct btree_iter *iter, enum btree_id btree_id,
424eb881 1575 struct bpos pos, unsigned flags)
1c6fdbd8 1576{
9e5e5b9e 1577 struct bch_fs *c = trans->c;
1c6fdbd8
KO
1578 unsigned i;
1579
ab5c63f5 1580 if (btree_node_type_is_extents(btree_id) &&
424eb881
KO
1581 !(flags & BTREE_ITER_NODES))
1582 flags |= BTREE_ITER_IS_EXTENTS;
1c6fdbd8 1583
9e5e5b9e 1584 iter->trans = trans;
1c6fdbd8
KO
1585 iter->pos = pos;
1586 bkey_init(&iter->k);
1587 iter->k.p = pos;
1588 iter->flags = flags;
1589 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1590 iter->btree_id = btree_id;
424eb881
KO
1591 iter->level = 0;
1592 iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0;
1c6fdbd8
KO
1593 iter->nodes_locked = 0;
1594 iter->nodes_intent_locked = 0;
1595 for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1596 iter->l[i].b = NULL;
1597 iter->l[iter->level].b = BTREE_ITER_NOT_END;
1c6fdbd8
KO
1598
1599 prefetch(c->btree_roots[btree_id].b);
1600}
1601
1c6fdbd8
KO
1602/* new transactional stuff: */
1603
424eb881
KO
1604int bch2_trans_iter_put(struct btree_trans *trans,
1605 struct btree_iter *iter)
 1606{
 1607 int ret = btree_iter_err(iter);
 1608
 1609 trans->iters_live &= ~(1ULL << iter->idx);
 1610 return ret;
1611}
1612
1613static inline void __bch2_trans_iter_free(struct btree_trans *trans,
1614 unsigned idx)
1615{
ecc892e4 1616 __bch2_btree_iter_unlock(&trans->iters[idx]);
5df4be3f
KO
1617 trans->iters_linked &= ~(1ULL << idx);
1618 trans->iters_live &= ~(1ULL << idx);
1619 trans->iters_touched &= ~(1ULL << idx);
1620 trans->iters_unlink_on_restart &= ~(1ULL << idx);
1621 trans->iters_unlink_on_commit &= ~(1ULL << idx);
5df4be3f
KO
1622}
1623
424eb881
KO
1624int bch2_trans_iter_free(struct btree_trans *trans,
1625 struct btree_iter *iter)
 1626{
 1627 int ret = btree_iter_err(iter);
 1628
 1629 __bch2_trans_iter_free(trans, iter->idx);
 1630 return ret;
 1631}
 1632
1633int bch2_trans_iter_free_on_commit(struct btree_trans *trans,
1634 struct btree_iter *iter)
 1635{
 1636 int ret = btree_iter_err(iter);
 1637
 1638 trans->iters_unlink_on_commit |= 1ULL << iter->idx;
 1639 return ret;
1640}
1641
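/*
 * Grow the transaction's iterator and update arrays, falling back to the
 * btree_iters_pool mempool at BTREE_ITER_MAX.  If any iterators were live
 * when the arrays moved, the caller holds stale pointers and the transaction
 * must restart (-EINTR).
 */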
1642static int btree_trans_realloc_iters(struct btree_trans *trans,
1643 unsigned new_size)
 1644{
 1645 void *new_iters, *new_updates;
 1646
1647 BUG_ON(new_size > BTREE_ITER_MAX);
1648
1649 if (new_size <= trans->size)
1650 return 0;
1651
1652 BUG_ON(trans->used_mempool);
1653
1c6fdbd8
KO
1654 bch2_trans_unlock(trans);
1655
a8e00bd4
KO
1656 new_iters = kmalloc(sizeof(struct btree_iter) * new_size +
1657 sizeof(struct btree_insert_entry) * (new_size + 4),
1658 GFP_NOFS);
1659 if (new_iters)
1660 goto success;
1661
581edb63 1662 new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
a8e00bd4
KO
1663 new_size = BTREE_ITER_MAX;
1664
1665 trans->used_mempool = true;
1666success:
1667 new_updates = new_iters + sizeof(struct btree_iter) * new_size;
1c6fdbd8
KO
1668
1669 memcpy(new_iters, trans->iters,
1670 sizeof(struct btree_iter) * trans->nr_iters);
a8e00bd4
KO
1671 memcpy(new_updates, trans->updates,
1672 sizeof(struct btree_insert_entry) * trans->nr_updates);
1673
5df4be3f
KO
1674 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
1675 memset(trans->iters, POISON_FREE,
1676 sizeof(struct btree_iter) * trans->nr_iters +
1677 sizeof(struct btree_insert_entry) * trans->nr_iters);
1678
a8e00bd4
KO
1679 if (trans->iters != trans->iters_onstack)
1680 kfree(trans->iters);
1681
1682 trans->iters = new_iters;
1683 trans->updates = new_updates;
1684 trans->size = new_size;
1c6fdbd8 1685
1c7a0adf
KO
1686 if (trans->iters_live) {
1687 trans_restart();
ba5c6557 1688 trace_trans_restart_iters_realloced(trans->c, trans->ip);
1c7a0adf
KO
1689 return -EINTR;
1690 }
1691
1692 return 0;
1c6fdbd8
KO
1693}
1694
581edb63 1695void bch2_trans_preload_iters(struct btree_trans *trans)
1c6fdbd8 1696{
a8e00bd4 1697 btree_trans_realloc_iters(trans, BTREE_ITER_MAX);
1c6fdbd8
KO
1698}
1699
7c26ecae
KO
1700static int btree_trans_iter_alloc(struct btree_trans *trans)
1701{
4c1c1e39 1702 unsigned idx = __ffs64(~trans->iters_linked);
7c26ecae
KO
1703
1704 if (idx < trans->nr_iters)
1705 goto got_slot;
1706
1707 if (trans->nr_iters == trans->size) {
1708 int ret = btree_trans_realloc_iters(trans, trans->size * 2);
1709 if (ret)
1710 return ret;
1711 }
1712
1713 idx = trans->nr_iters++;
1714 BUG_ON(trans->nr_iters > trans->size);
e1120a4c
KO
1715
1716 trans->iters[idx].idx = idx;
7c26ecae 1717got_slot:
7c26ecae 1718 BUG_ON(trans->iters_linked & (1ULL << idx));
7c26ecae
KO
1719 trans->iters_linked |= 1ULL << idx;
1720 return idx;
1721}
1722
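/*
 * Look for an existing iterator in the transaction matching @iter_id (or, if
 * no id was given, the same btree and position); reuse it if found, otherwise
 * allocate a fresh slot and initialize it.  Either way the iterator is marked
 * live and touched before being returned.
 */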
 1723static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
 1724 unsigned btree_id, struct bpos pos,
1725 unsigned flags, u64 iter_id)
1726{
1727 struct btree_iter *iter;
1728 int idx;
1729
1730 BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
1731
5df4be3f 1732 for (idx = 0; idx < trans->nr_iters; idx++) {
7c26ecae
KO
1733 if (!(trans->iters_linked & (1ULL << idx)))
1734 continue;
1735
5df4be3f
KO
1736 iter = &trans->iters[idx];
1737 if (iter_id
1738 ? iter->id == iter_id
1739 : (iter->btree_id == btree_id &&
1740 !bkey_cmp(iter->pos, pos)))
1c6fdbd8 1741 goto found;
5df4be3f 1742 }
1c6fdbd8
KO
1743 idx = -1;
1744found:
1745 if (idx < 0) {
7c26ecae
KO
1746 idx = btree_trans_iter_alloc(trans);
1747 if (idx < 0)
1748 return ERR_PTR(idx);
1c6fdbd8 1749
1c6fdbd8 1750 iter = &trans->iters[idx];
a8e00bd4 1751 iter->id = iter_id;
1c6fdbd8 1752
9e5e5b9e 1753 bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
1c6fdbd8
KO
1754 } else {
1755 iter = &trans->iters[idx];
1756
1c6fdbd8
KO
1757 iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
1758 iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
1759 }
1760
5df4be3f 1761 BUG_ON(iter->btree_id != btree_id);
a8e00bd4 1762 BUG_ON(trans->iters_live & (1ULL << idx));
5df4be3f
KO
1763 trans->iters_live |= 1ULL << idx;
1764 trans->iters_touched |= 1ULL << idx;
1c6fdbd8 1765
08af47df
KO
1766 BUG_ON(iter->btree_id != btree_id);
1767 BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
1768
1c6fdbd8
KO
1769 return iter;
1770}
1771
1772struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
1773 enum btree_id btree_id,
1774 struct bpos pos, unsigned flags,
1775 u64 iter_id)
1776{
1777 struct btree_iter *iter =
5df4be3f 1778 __btree_trans_get_iter(trans, btree_id, pos, flags, iter_id);
1c6fdbd8
KO
1779
1780 if (!IS_ERR(iter))
1781 bch2_btree_iter_set_pos(iter, pos);
1782 return iter;
1783}
1784
424eb881
KO
1785struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
1786 enum btree_id btree_id,
1787 struct bpos pos,
1788 unsigned locks_want,
1789 unsigned depth,
1790 unsigned flags)
1791{
1792 struct btree_iter *iter =
1793 __btree_trans_get_iter(trans, btree_id, pos,
1794 flags|BTREE_ITER_NODES, 0);
1795 unsigned i;
1796
1797 BUG_ON(IS_ERR(iter));
1798 BUG_ON(bkey_cmp(iter->pos, pos));
1799
1800 iter->locks_want = locks_want;
1801 iter->level = depth;
1802
1803 for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1804 iter->l[i].b = NULL;
1805 iter->l[iter->level].b = BTREE_ITER_NOT_END;
1806
1807 return iter;
1808}
1809
7c26ecae
KO
1810struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
1811 struct btree_iter *src)
 1812{
 1813 struct btree_iter *iter;
 1814 unsigned offset = offsetof(struct btree_iter, trans);
 1815 int i, idx;
 1816
1817 idx = btree_trans_iter_alloc(trans);
1818 if (idx < 0)
1819 return ERR_PTR(idx);
1820
1821 trans->iters_live |= 1ULL << idx;
1822 trans->iters_touched |= 1ULL << idx;
1823 trans->iters_unlink_on_restart |= 1ULL << idx;
1824
ecc892e4 1825 iter = &trans->iters[idx];
e1120a4c
KO
1826
1827 memcpy((void *) iter + offset,
1828 (void *) src + offset,
1829 sizeof(*iter) - offset);
ecc892e4
KO
1830
1831 for (i = 0; i < BTREE_MAX_DEPTH; i++)
1832 if (btree_node_locked(iter, i))
c43a6ef9 1833 six_lock_increment(&iter->l[i].b->c.lock,
ecc892e4 1834 __btree_lock_want(iter, i));
7c26ecae
KO
1835
1836 return &trans->iters[idx];
1837}
1838
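/*
 * Bump allocator for memory that lives until the transaction is reset.  A
 * minimal usage sketch (the variable and caller here are hypothetical):
 *
 *	struct bkey_i *new_key = bch2_trans_kmalloc(trans, sizeof(*new_key));
 *	if (IS_ERR(new_key))
 *		return PTR_ERR(new_key);
 *
 * Growing the buffer while the transaction already holds pointers into it
 * forces a restart, so callers must be prepared for ERR_PTR(-EINTR).
 */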
1839void *bch2_trans_kmalloc(struct btree_trans *trans,
1840 size_t size)
1841{
1842 void *ret;
1843
1844 if (trans->mem_top + size > trans->mem_bytes) {
1845 size_t old_bytes = trans->mem_bytes;
1846 size_t new_bytes = roundup_pow_of_two(trans->mem_top + size);
1847 void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
1848
1849 if (!new_mem)
1850 return ERR_PTR(-ENOMEM);
1851
1852 trans->mem = new_mem;
1853 trans->mem_bytes = new_bytes;
1854
1855 if (old_bytes) {
1856 trans_restart();
 1857 trace_trans_restart_mem_realloced(trans->c, trans->ip);
 1858 return ERR_PTR(-EINTR);
 1859 }
1860 }
1861
1862 ret = trans->mem + trans->mem_top;
1863 trans->mem_top += size;
1864 return ret;
1865}
1866
1867int bch2_trans_unlock(struct btree_trans *trans)
1868{
4c1c1e39 1869 u64 iters = trans->iters_linked;
1c6fdbd8
KO
1870 int ret = 0;
1871
1872 while (iters) {
4c1c1e39 1873 unsigned idx = __ffs64(iters);
1c6fdbd8
KO
1874 struct btree_iter *iter = &trans->iters[idx];
1875
0f238367 1876 ret = ret ?: btree_iter_err(iter);
1c6fdbd8
KO
1877
1878 __bch2_btree_iter_unlock(iter);
4c1c1e39 1879 iters ^= 1ULL << idx;
1c6fdbd8
KO
1880 }
1881
1882 return ret;
1883}
1884
5df4be3f
KO
1885inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters)
1886{
1887 iters &= trans->iters_linked;
4afe7000 1888 iters &= ~trans->iters_live;
5df4be3f
KO
1889
1890 while (iters) {
1891 unsigned idx = __ffs64(iters);
1892
1893 iters &= ~(1ULL << idx);
1894 __bch2_trans_iter_free(trans, idx);
1895 }
1896}
1897
 1898void __bch2_trans_begin(struct btree_trans *trans)
 1899{
 1900 u64 iters_to_unlink;
 1901
1902 /*
1903 * On transaction restart, the transaction isn't required to allocate
 1904 * all the same iterators it did on the last iteration:
1905 *
1906 * Unlink any iterators it didn't use this iteration, assuming it got
1907 * further (allocated an iter with a higher idx) than where the iter
1908 * was originally allocated:
1909 */
1910 iters_to_unlink = ~trans->iters_live &
1911 ((1ULL << fls64(trans->iters_live)) - 1);
a8e00bd4 1912
5df4be3f
KO
1913 iters_to_unlink |= trans->iters_unlink_on_restart;
1914 iters_to_unlink |= trans->iters_unlink_on_commit;
a8e00bd4 1915
4afe7000
KO
1916 trans->iters_live = 0;
1917
5df4be3f 1918 bch2_trans_unlink_iters(trans, iters_to_unlink);
1c6fdbd8 1919
5df4be3f
KO
1920 trans->iters_touched = 0;
1921 trans->iters_unlink_on_restart = 0;
1922 trans->iters_unlink_on_commit = 0;
1923 trans->nr_updates = 0;
1924 trans->mem_top = 0;
bf7b87a4
KO
1925
1926 bch2_btree_iter_traverse_all(trans);
1c6fdbd8
KO
1927}
1928
1929void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c)
1930{
a8e00bd4
KO
1931 memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
1932
 1933 trans->c = c;
 1934 trans->ip = _RET_IP_;
 1935 trans->size = ARRAY_SIZE(trans->iters_onstack);
 1936 trans->iters = trans->iters_onstack;
 1937 trans->updates = trans->updates_onstack;
1938}
1939
1940int bch2_trans_exit(struct btree_trans *trans)
1941{
 1942 bch2_trans_unlock(trans);
1943
1944 kfree(trans->mem);
 1945 if (trans->used_mempool)
 1946 mempool_free(trans->iters, &trans->c->btree_iters_pool);
1947 else if (trans->iters != trans->iters_onstack)
1948 kfree(trans->iters);
1949 trans->mem = (void *) 0x1;
1950 trans->iters = (void *) 0x1;
1951
1952 return trans->error ? -EIO : 0;
 1953}