bcachefs: fix for_each_btree_key()
fs/bcachefs/btree_iter.c [linux-block.git]
1// SPDX-License-Identifier: GPL-2.0
2
3#include "bcachefs.h"
4#include "bkey_methods.h"
5#include "btree_cache.h"
6#include "btree_iter.h"
7#include "btree_locking.h"
8#include "debug.h"
9#include "extents.h"
10#include "trace.h"
11
12#include <linux/prefetch.h>
13
14static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
15 struct btree_iter_level *,
16 struct bkey *);
17
18#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
19#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2)
20#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3)
21#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4)
22#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5)
23#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6)
24#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7)
25
26static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
27{
28 return l < BTREE_MAX_DEPTH &&
29 (unsigned long) iter->l[l].b >= 128;
30}
31
32/* Returns < 0 if @k is before iter pos, > 0 if @k is after */
33static inline int __btree_iter_pos_cmp(struct btree_iter *iter,
34 const struct btree *b,
35 const struct bkey_packed *k,
36 bool interior_node)
37{
38 int cmp = bkey_cmp_left_packed(b, k, &iter->pos);
39
40 if (cmp)
41 return cmp;
42 if (bkey_deleted(k))
43 return -1;
44
45 /*
46 * Normally, for extents we want the first key strictly greater than
47 * the iterator position - with the exception that for interior nodes,
48 * we don't want to advance past the last key if the iterator position
49 * is POS_MAX:
50 */
51 if (iter->flags & BTREE_ITER_IS_EXTENTS &&
52 (!interior_node ||
53 bkey_cmp_left_packed_byval(b, k, POS_MAX)))
54 return -1;
55 return 1;
56}
57
58static inline int btree_iter_pos_cmp(struct btree_iter *iter,
59 const struct btree *b,
60 const struct bkey_packed *k)
61{
62 return __btree_iter_pos_cmp(iter, b, k, b->c.level != 0);
63}
64
65/* Btree node locking: */
66
67/*
68 * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
69 * succeed:
70 */
71void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
72{
73 struct btree_iter *linked;
74
75 EBUG_ON(iter->l[b->c.level].b != b);
76 EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
77
78 trans_for_each_iter_with_node(iter->trans, b, linked)
79 linked->l[b->c.level].lock_seq += 2;
80
81 six_unlock_write(&b->c.lock);
82}
83
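/*
 * Take the write lock on a node this iterator already has intent locked:
 * read locks held on the node by other iterators in the same transaction
 * would make six_lock_write() block, so their count is subtracted from the
 * SIX lock's reader count before blocking and added back afterwards.
 */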
84void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
85{
86 struct btree_iter *linked;
87 unsigned readers = 0;
88
89 EBUG_ON(btree_node_read_locked(iter, b->c.level));
90
91 trans_for_each_iter(iter->trans, linked)
92 if (linked->l[b->c.level].b == b &&
93 btree_node_read_locked(linked, b->c.level))
94 readers++;
95
96 /*
97 * Must drop our read locks before calling six_lock_write() -
98 * six_unlock() won't do wakeups until the reader count
99 * goes to 0, and it's safe because we have the node intent
100 * locked:
101 */
102 atomic64_sub(__SIX_VAL(read_lock, readers),
103 &b->c.lock.state.counter);
104 btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
105 atomic64_add(__SIX_VAL(read_lock, readers),
106 &b->c.lock.state.counter);
107}
108
109bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
110{
111 struct btree *b = btree_iter_node(iter, level);
112 int want = __btree_lock_want(iter, level);
113
114 if (!is_btree_node(iter, level))
115 return false;
116
117 if (race_fault())
118 return false;
119
120 if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
121 (btree_node_lock_seq_matches(iter, b, level) &&
122 btree_node_lock_increment(iter, b, level, want))) {
123 mark_btree_node_locked(iter, level, want);
124 return true;
125 } else {
126 return false;
127 }
128}
129
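/*
 * Try to convert the lock held at @level into an intent lock without
 * dropping it: by upgrading a held read lock, relocking at the saved lock
 * sequence number, or taking an extra reference on an intent lock already
 * held by another iterator in this transaction.
 */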
130static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
131{
132 struct btree *b = iter->l[level].b;
133
134 EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
135
136 if (!is_btree_node(iter, level))
137 return false;
138
139 if (btree_node_intent_locked(iter, level))
140 return true;
141
142 if (race_fault())
143 return false;
144
145 if (btree_node_locked(iter, level)
146 ? six_lock_tryupgrade(&b->c.lock)
147 : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
148 goto success;
149
150 if (btree_node_lock_seq_matches(iter, b, level) &&
151 btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
152 btree_node_unlock(iter, level);
153 goto success;
154 }
155
156 return false;
157success:
158 mark_btree_node_intent_locked(iter, level);
159 return true;
160}
161
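/*
 * Walk from iter->level up to locks_want, relocking (or upgrading, if
 * @upgrade) each level; on failure the iterator is marked as needing a full
 * traverse and the failed level plus everything below it is unlocked and
 * cleared, so child nodes can't be relocked past the failure point.
 */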
162static inline bool btree_iter_get_locks(struct btree_iter *iter,
163 bool upgrade, bool trace)
164{
165 unsigned l = iter->level;
166 int fail_idx = -1;
167
168 do {
169 if (!btree_iter_node(iter, l))
170 break;
171
172 if (!(upgrade
173 ? bch2_btree_node_upgrade(iter, l)
174 : bch2_btree_node_relock(iter, l))) {
175 if (trace)
176 (upgrade
177 ? trace_node_upgrade_fail
178 : trace_node_relock_fail)(l, iter->l[l].lock_seq,
179 is_btree_node(iter, l)
180 ? 0
181 : (unsigned long) iter->l[l].b,
182 is_btree_node(iter, l)
183 ? iter->l[l].b->c.lock.state.seq
184 : 0);
185
186 fail_idx = l;
187 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
188 }
189
190 l++;
191 } while (l < iter->locks_want);
192
193 /*
194 * When we fail to get a lock, we have to ensure that any child nodes
195 * can't be relocked so bch2_btree_iter_traverse has to walk back up to
196 * the node that we failed to relock:
197 */
198 while (fail_idx >= 0) {
199 btree_node_unlock(iter, fail_idx);
200 iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
201 --fail_idx;
202 }
203
204 if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
205 iter->uptodate = BTREE_ITER_NEED_PEEK;
206
207 bch2_btree_trans_verify_locks(iter->trans);
208
209 return iter->uptodate < BTREE_ITER_NEED_RELOCK;
210}
211
212/* Slowpath: */
213bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
214 unsigned level,
215 struct btree_iter *iter,
216 enum six_lock_type type)
217{
218 struct btree_iter *linked;
219 bool ret = true;
220
221 /* Check if it's safe to block: */
222 trans_for_each_iter(iter->trans, linked) {
223 if (!linked->nodes_locked)
224 continue;
225
226 /* Must lock btree nodes in key order: */
227 if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
228 ret = false;
229
230 /*
231 * Can't block taking an intent lock if we have _any_ nodes read
232 * locked:
233 *
234 * - Our read lock blocks another thread with an intent lock on
235 * the same node from getting a write lock, and thus from
236 * dropping its intent lock
237 *
238 * - And the other thread may have multiple nodes intent locked:
239 * both the node we want to intent lock, and the node we
240 * already have read locked - deadlock:
241 */
242 if (type == SIX_LOCK_intent &&
243 linked->nodes_locked != linked->nodes_intent_locked) {
244 if (!(iter->trans->nounlock)) {
245 linked->locks_want = max_t(unsigned,
246 linked->locks_want,
247 __fls(linked->nodes_locked) + 1);
248 btree_iter_get_locks(linked, true, false);
249 }
250 ret = false;
251 }
252
253 /*
254 * Interior nodes must be locked before their descendants: if
255 * another iterator has possible descendants locked of the node
256 * we're about to lock, it must have the ancestors locked too:
257 */
258 if (linked->btree_id == iter->btree_id &&
259 level > __fls(linked->nodes_locked)) {
260 if (!(iter->trans->nounlock)) {
261 linked->locks_want =
262 max(level + 1, max_t(unsigned,
263 linked->locks_want,
264 iter->locks_want));
265 btree_iter_get_locks(linked, true, false);
266 }
267 ret = false;
268 }
269 }
270
271 if (unlikely(!ret)) {
272 trace_trans_restart_would_deadlock(iter->trans->ip);
273 return false;
274 }
275
276 __btree_node_lock_type(iter->trans->c, b, type);
277 return true;
278}
279
280/* Btree iterator locking: */
281
282#ifdef CONFIG_BCACHEFS_DEBUG
283void bch2_btree_iter_verify_locks(struct btree_iter *iter)
284{
285 unsigned l;
286
287 for (l = 0; btree_iter_node(iter, l); l++) {
288 if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
289 !btree_node_locked(iter, l))
290 continue;
291
292 BUG_ON(btree_lock_want(iter, l) !=
293 btree_node_locked_type(iter, l));
294 }
295}
296
297void bch2_btree_trans_verify_locks(struct btree_trans *trans)
298{
299 struct btree_iter *iter;
300
301 trans_for_each_iter(trans, iter)
302 bch2_btree_iter_verify_locks(iter);
303}
304#endif
305
306__flatten
307static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
308{
309 return iter->uptodate >= BTREE_ITER_NEED_RELOCK
310 ? btree_iter_get_locks(iter, false, trace)
311 : true;
312}
313
314bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
315 unsigned new_locks_want)
316{
317 struct btree_iter *linked;
318
319 EBUG_ON(iter->locks_want >= new_locks_want);
320
321 iter->locks_want = new_locks_want;
322
323 if (btree_iter_get_locks(iter, true, true))
324 return true;
325
326 /*
327 * Ancestor nodes must be locked before child nodes, so set locks_want
328 * on iterators that might lock ancestors before us to avoid getting
329 * -EINTR later:
330 */
331 trans_for_each_iter(iter->trans, linked)
332 if (linked != iter &&
333 linked->btree_id == iter->btree_id &&
334 linked->locks_want < new_locks_want) {
335 linked->locks_want = new_locks_want;
336 btree_iter_get_locks(linked, true, false);
337 }
338
339 return false;
340}
341
342bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
343 unsigned new_locks_want)
344{
345 unsigned l = iter->level;
346
347 EBUG_ON(iter->locks_want >= new_locks_want);
348
349 iter->locks_want = new_locks_want;
350
351 do {
352 if (!btree_iter_node(iter, l))
353 break;
354
355 if (!bch2_btree_node_upgrade(iter, l)) {
356 iter->locks_want = l;
357 return false;
358 }
359
360 l++;
361 } while (l < iter->locks_want);
362
363 return true;
364}
365
366void __bch2_btree_iter_downgrade(struct btree_iter *iter,
367 unsigned downgrade_to)
368{
369 struct btree_iter *linked;
370 unsigned l;
371
372 /*
373 * We downgrade linked iterators as well because btree_iter_upgrade
374 * might have had to modify locks_want on linked iterators due to lock
375 * ordering:
376 */
0f238367 377 trans_for_each_iter(iter->trans, linked) {
1c6fdbd8
KO
378 unsigned new_locks_want = downgrade_to ?:
379 (linked->flags & BTREE_ITER_INTENT ? 1 : 0);
380
381 if (linked->locks_want <= new_locks_want)
382 continue;
383
384 linked->locks_want = new_locks_want;
385
386 while (linked->nodes_locked &&
387 (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
388 if (l > linked->level) {
389 btree_node_unlock(linked, l);
390 } else {
391 if (btree_node_intent_locked(linked, l)) {
c43a6ef9 392 six_lock_downgrade(&linked->l[l].b->c.lock);
1c6fdbd8
KO
393 linked->nodes_intent_locked ^= 1 << l;
394 }
395 break;
396 }
397 }
1c6fdbd8 398 }
ad7ae8d6 399
0f238367 400 bch2_btree_trans_verify_locks(iter->trans);
1c6fdbd8
KO
401}
402
58fbf808
KO
403/* Btree transaction locking: */
404
405bool bch2_trans_relock(struct btree_trans *trans)
0f238367
KO
406{
407 struct btree_iter *iter;
408 bool ret = true;
409
410 trans_for_each_iter(trans, iter)
7d825866
KO
411 if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
412 ret &= bch2_btree_iter_relock(iter, true);
0f238367
KO
413
414 return ret;
415}
416
58fbf808 417void bch2_trans_unlock(struct btree_trans *trans)
0f238367
KO
418{
419 struct btree_iter *iter;
420
421 trans_for_each_iter(trans, iter)
422 __bch2_btree_iter_unlock(iter);
423}
424
1c6fdbd8
KO
425/* Btree iterator: */
426
427#ifdef CONFIG_BCACHEFS_DEBUG
428
429static void __bch2_btree_iter_verify(struct btree_iter *iter,
430 struct btree *b)
431{
c43a6ef9 432 struct btree_iter_level *l = &iter->l[b->c.level];
1c6fdbd8
KO
433 struct btree_node_iter tmp = l->iter;
434 struct bkey_packed *k;
435
f13f5a8c
KO
436 if (!debug_check_iterators(iter->trans->c))
437 return;
438
271a3d3a
KO
439 if (iter->uptodate > BTREE_ITER_NEED_PEEK)
440 return;
441
1c6fdbd8
KO
442 bch2_btree_node_iter_verify(&l->iter, b);
443
444 /*
445 * For interior nodes, the iterator will have skipped past
446 * deleted keys:
271a3d3a
KO
447 *
448 * For extents, the iterator may have skipped past deleted keys (but not
449 * whiteouts)
1c6fdbd8 450 */
c43a6ef9 451 k = b->c.level || iter->flags & BTREE_ITER_IS_EXTENTS
26609b61 452 ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
1c6fdbd8 453 : bch2_btree_node_iter_prev_all(&tmp, b);
a00fd8c5 454 if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
1c6fdbd8
KO
455 char buf[100];
456 struct bkey uk = bkey_unpack_key(b, k);
457
319f9ac3 458 bch2_bkey_to_text(&PBUF(buf), &uk);
271a3d3a 459 panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
1c6fdbd8
KO
460 buf, iter->pos.inode, iter->pos.offset);
461 }
462
463 k = bch2_btree_node_iter_peek_all(&l->iter, b);
a00fd8c5 464 if (k && btree_iter_pos_cmp(iter, b, k) < 0) {
1c6fdbd8
KO
465 char buf[100];
466 struct bkey uk = bkey_unpack_key(b, k);
467
319f9ac3 468 bch2_bkey_to_text(&PBUF(buf), &uk);
271a3d3a
KO
469 panic("iter should be after current key:\n"
470 "iter pos %llu:%llu\n"
471 "cur key %s\n",
1c6fdbd8
KO
472 iter->pos.inode, iter->pos.offset, buf);
473 }
474
271a3d3a
KO
475 BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
476 (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS &&
477 !bkey_whiteout(&iter->k) &&
478 bch2_btree_node_iter_end(&l->iter));
1c6fdbd8
KO
479}
480
481void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
482{
483 struct btree_iter *linked;
484
f13f5a8c
KO
485 if (!debug_check_iterators(iter->trans->c))
486 return;
487
0f238367 488 trans_for_each_iter_with_node(iter->trans, b, linked)
1c6fdbd8
KO
489 __bch2_btree_iter_verify(linked, b);
490}
491
271a3d3a
KO
492#else
493
494static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
495 struct btree *b) {}
496
1c6fdbd8
KO
497#endif
498
499static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
500 struct btree *b,
501 struct btree_node_iter *node_iter,
502 struct bset_tree *t,
503 struct bkey_packed *where,
504 unsigned clobber_u64s,
505 unsigned new_u64s)
506{
507 const struct bkey_packed *end = btree_bkey_last(b, t);
508 struct btree_node_iter_set *set;
509 unsigned offset = __btree_node_key_to_offset(b, where);
510 int shift = new_u64s - clobber_u64s;
271a3d3a 511 unsigned old_end = t->end_offset - shift;
1c6fdbd8
KO
512
513 btree_node_iter_for_each(node_iter, set)
514 if (set->end == old_end)
515 goto found;
516
517 /* didn't find the bset in the iterator - might have to re-add it: */
518 if (new_u64s &&
a00fd8c5 519 btree_iter_pos_cmp(iter, b, where) > 0) {
1c6fdbd8
KO
520 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
521
522 bch2_btree_node_iter_push(node_iter, b, where, end);
523
c43a6ef9 524 if (!b->c.level &&
1c6fdbd8
KO
525 node_iter == &iter->l[0].iter)
526 bkey_disassemble(b,
527 bch2_btree_node_iter_peek_all(node_iter, b),
528 &iter->k);
529 }
530 return;
531found:
271a3d3a 532 set->end = t->end_offset;
1c6fdbd8
KO
533
534 /* Iterator hasn't gotten to the key that changed yet: */
535 if (set->k < offset)
536 return;
537
538 if (new_u64s &&
a00fd8c5 539 btree_iter_pos_cmp(iter, b, where) > 0) {
1c6fdbd8
KO
540 set->k = offset;
541 } else if (set->k < offset + clobber_u64s) {
542 set->k = offset + new_u64s;
543 if (set->k == set->end)
544 bch2_btree_node_iter_set_drop(node_iter, set);
545 } else {
546 set->k = (int) set->k + shift;
547 goto iter_current_key_not_modified;
548 }
549
550 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
551
552 bch2_btree_node_iter_sort(node_iter, b);
c43a6ef9 553 if (!b->c.level && node_iter == &iter->l[0].iter) {
56e0e7c7
KO
554 /*
555 * not legal to call bkey_debugcheck() here, because we're
556 * called midway through the update path after update has been
557 * marked but before deletes have actually happened:
558 */
559#if 0
1c6fdbd8 560 __btree_iter_peek_all(iter, &iter->l[0], &iter->k);
56e0e7c7
KO
561#endif
562 struct btree_iter_level *l = &iter->l[0];
563 struct bkey_packed *k =
564 bch2_btree_node_iter_peek_all(&l->iter, l->b);
565
566 if (unlikely(!k))
567 iter->k.type = KEY_TYPE_deleted;
568 else
569 bkey_disassemble(l->b, k, &iter->k);
570 }
1c6fdbd8
KO
571iter_current_key_not_modified:
572
573 /*
574 * Interior nodes are special because iterators for interior nodes don't
575 * obey the usual invariants regarding the iterator position:
576 *
577 * We may have whiteouts that compare greater than the iterator
578 * position, and logically should be in the iterator, but that we
579 * skipped past to find the first live key greater than the iterator
580 * position. This becomes an issue when we insert a new key that is
581 * greater than the current iterator position, but smaller than the
582 * whiteouts we've already skipped past - this happens in the course of
583 * a btree split.
584 *
585 * We have to rewind the iterator back to before those whiteouts here,
586 * else bkey_node_iter_prev() is not going to work and who knows what
587 * else would happen. And we have to do it manually, because here we've
588 * already done the insert and the iterator is currently inconsistent:
589 *
590 * We've got multiple competing invariants, here - we have to be careful
591 * about rewinding iterators for interior nodes, because they should
592 * always point to the key for the child node the btree iterator points
593 * to.
594 */
c43a6ef9 595 if (b->c.level && new_u64s &&
a00fd8c5 596 btree_iter_pos_cmp(iter, b, where) > 0) {
216c9fac 597 struct bset_tree *t, *where_set = bch2_bkey_to_bset_inlined(b, where);
1c6fdbd8
KO
598 struct bkey_packed *k;
599
600 for_each_bset(b, t) {
216c9fac 601 if (where_set == t)
1c6fdbd8
KO
602 continue;
603
604 k = bch2_bkey_prev_all(b, t,
605 bch2_btree_node_iter_bset_pos(node_iter, b, t));
606 if (k &&
a00fd8c5 607 bkey_iter_cmp(b, k, where) > 0) {
1c6fdbd8
KO
608 struct btree_node_iter_set *set;
609 unsigned offset =
610 __btree_node_key_to_offset(b, bkey_next(k));
611
612 btree_node_iter_for_each(node_iter, set)
613 if (set->k == offset) {
614 set->k = __btree_node_key_to_offset(b, k);
615 bch2_btree_node_iter_sort(node_iter, b);
616 goto next_bset;
617 }
618
619 bch2_btree_node_iter_push(node_iter, b, k,
620 btree_bkey_last(b, t));
621 }
622next_bset:
623 t = t;
624 }
625 }
626}
627
628void bch2_btree_node_iter_fix(struct btree_iter *iter,
216c9fac
KO
629 struct btree *b,
630 struct btree_node_iter *node_iter,
631 struct bkey_packed *where,
632 unsigned clobber_u64s,
633 unsigned new_u64s)
1c6fdbd8 634{
216c9fac 635 struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
1c6fdbd8
KO
636 struct btree_iter *linked;
637
c43a6ef9 638 if (node_iter != &iter->l[b->c.level].iter)
1c6fdbd8
KO
639 __bch2_btree_node_iter_fix(iter, b, node_iter, t,
640 where, clobber_u64s, new_u64s);
641
0f238367 642 trans_for_each_iter_with_node(iter->trans, b, linked)
1c6fdbd8 643 __bch2_btree_node_iter_fix(linked, b,
c43a6ef9 644 &linked->l[b->c.level].iter, t,
1c6fdbd8 645 where, clobber_u64s, new_u64s);
1c6fdbd8
KO
646}
647
648static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
649 struct btree_iter_level *l,
650 struct bkey *u,
651 struct bkey_packed *k)
652{
653 struct bkey_s_c ret;
654
655 if (unlikely(!k)) {
656 /*
657 * signal to bch2_btree_iter_peek_slot() that we're currently at
658 * a hole
659 */
26609b61 660 u->type = KEY_TYPE_deleted;
1c6fdbd8
KO
661 return bkey_s_c_null;
662 }
663
664 ret = bkey_disassemble(l->b, k, u);
665
9e5e5b9e
KO
666 if (debug_check_bkeys(iter->trans->c))
667 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
1c6fdbd8
KO
668
669 return ret;
670}
671
672/* peek_all() doesn't skip deleted keys */
673static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
674 struct btree_iter_level *l,
675 struct bkey *u)
676{
677 return __btree_iter_unpack(iter, l, u,
678 bch2_btree_node_iter_peek_all(&l->iter, l->b));
679}
680
681static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
682 struct btree_iter_level *l)
683{
684 return __btree_iter_unpack(iter, l, &iter->k,
685 bch2_btree_node_iter_peek(&l->iter, l->b));
686}
687
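/*
 * Advance the node iterator at @l until it points at a key at or after the
 * iterator position (extent semantics when applicable); gives up and
 * returns false after @max_advance steps (callers pass -1 for no limit).
 */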
688static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
689 struct btree_iter_level *l,
690 int max_advance)
1c6fdbd8 691{
a00fd8c5
KO
692 struct bkey_packed *k;
693 int nr_advanced = 0;
694
695 while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
696 btree_iter_pos_cmp(iter, l->b, k) < 0) {
697 if (max_advance > 0 && nr_advanced >= max_advance)
698 return false;
699
700 bch2_btree_node_iter_advance(&l->iter, l->b);
701 nr_advanced++;
702 }
703
704 return true;
1c6fdbd8
KO
705}
706
707/*
708 * Verify that iterator for parent node points to child node:
709 */
710static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
711{
712 struct btree_iter_level *l;
713 unsigned plevel;
714 bool parent_locked;
715 struct bkey_packed *k;
716
717 if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
718 return;
719
c43a6ef9 720 plevel = b->c.level + 1;
1c6fdbd8
KO
721 if (!btree_iter_node(iter, plevel))
722 return;
723
724 parent_locked = btree_node_locked(iter, plevel);
725
726 if (!bch2_btree_node_relock(iter, plevel))
727 return;
728
729 l = &iter->l[plevel];
730 k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
731 if (!k ||
732 bkey_deleted(k) ||
733 bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
734 char buf[100];
735 struct bkey uk = bkey_unpack_key(b, k);
736
319f9ac3 737 bch2_bkey_to_text(&PBUF(buf), &uk);
1c6fdbd8
KO
738 panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
739 buf, b->key.k.p.inode, b->key.k.p.offset);
740 }
741
742 if (!parent_locked)
c43a6ef9 743 btree_node_unlock(iter, b->c.level + 1);
1c6fdbd8
KO
744}
745
1c6fdbd8
KO
746static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
747 struct btree *b)
748{
a00fd8c5 749 return __btree_iter_pos_cmp(iter, NULL,
cbdf24ce 750 bkey_to_packed(&b->key), true) < 0;
1c6fdbd8
KO
751}
752
753static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
754 struct btree *b)
755{
c43a6ef9 756 return iter->btree_id == b->c.btree_id &&
1c6fdbd8
KO
757 bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
758 !btree_iter_pos_after_node(iter, b);
759}
760
761static inline void __btree_iter_init(struct btree_iter *iter,
a00fd8c5 762 unsigned level)
1c6fdbd8 763{
a00fd8c5
KO
764 struct btree_iter_level *l = &iter->l[level];
765
766 bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos);
1c6fdbd8 767
a00fd8c5
KO
768 if (iter->flags & BTREE_ITER_IS_EXTENTS)
769 btree_iter_advance_to_pos(iter, l, -1);
1c6fdbd8
KO
770
771 /* Skip to first non whiteout: */
a00fd8c5
KO
772 if (level)
773 bch2_btree_node_iter_peek(&l->iter, l->b);
1c6fdbd8
KO
774
775 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
776}
777
778static inline void btree_iter_node_set(struct btree_iter *iter,
779 struct btree *b)
780{
781 btree_iter_verify_new_node(iter, b);
782
783 EBUG_ON(!btree_iter_pos_in_node(iter, b));
c43a6ef9 784 EBUG_ON(b->c.lock.state.seq & 1);
1c6fdbd8 785
c43a6ef9
KO
786 iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
787 iter->l[b->c.level].b = b;
788 __btree_iter_init(iter, b->c.level);
1c6fdbd8
KO
789}
790
791/*
792 * A btree node is being replaced - update the iterator to point to the new
793 * node:
794 */
795void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
796{
797 enum btree_node_locked_type t;
798 struct btree_iter *linked;
799
0f238367 800 trans_for_each_iter(iter->trans, linked)
1c6fdbd8
KO
801 if (btree_iter_pos_in_node(linked, b)) {
802 /*
803 * bch2_btree_iter_node_drop() has already been called -
804 * the old node we're replacing has already been
805 * unlocked and the pointer invalidated
806 */
c43a6ef9 807 BUG_ON(btree_node_locked(linked, b->c.level));
1c6fdbd8 808
c43a6ef9 809 t = btree_lock_want(linked, b->c.level);
1c6fdbd8 810 if (t != BTREE_NODE_UNLOCKED) {
c43a6ef9
KO
811 six_lock_increment(&b->c.lock, (enum six_lock_type) t);
812 mark_btree_node_locked(linked, b->c.level, (enum six_lock_type) t);
1c6fdbd8
KO
813 }
814
815 btree_iter_node_set(linked, b);
816 }
817
c43a6ef9 818 six_unlock_intent(&b->c.lock);
1c6fdbd8
KO
819}
820
821void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
822{
823 struct btree_iter *linked;
c43a6ef9 824 unsigned level = b->c.level;
1c6fdbd8 825
0f238367 826 trans_for_each_iter(iter->trans, linked)
1c6fdbd8 827 if (linked->l[level].b == b) {
ad7ae8d6 828 __btree_node_unlock(linked, level);
ed8413fd 829 linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
1c6fdbd8
KO
830 }
831}
832
833/*
834 * A btree node has been modified in such a way as to invalidate iterators - fix
835 * them:
836 */
837void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
838{
839 struct btree_iter *linked;
840
0f238367 841 trans_for_each_iter_with_node(iter->trans, b, linked)
c43a6ef9 842 __btree_iter_init(linked, b->c.level);
1c6fdbd8
KO
843}
844
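/*
 * Lock the current btree root: the root can be freed or replaced while we
 * wait for the lock, so after acquiring it re-check that it's still the
 * root at the expected level and retry if not. Returns 1 if the root is
 * shallower than the depth asked for (i.e. we hit the top of the btree).
 */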
845static inline int btree_iter_lock_root(struct btree_iter *iter,
846 unsigned depth_want)
847{
9e5e5b9e 848 struct bch_fs *c = iter->trans->c;
1c6fdbd8
KO
849 struct btree *b;
850 enum six_lock_type lock_type;
851 unsigned i;
852
853 EBUG_ON(iter->nodes_locked);
854
855 while (1) {
856 b = READ_ONCE(c->btree_roots[iter->btree_id].b);
c43a6ef9 857 iter->level = READ_ONCE(b->c.level);
1c6fdbd8
KO
858
859 if (unlikely(iter->level < depth_want)) {
860 /*
861 * the root is at a lower depth than the depth we want:
862 * got to the end of the btree, or we're walking nodes
863 * greater than some depth and there are no nodes >=
864 * that depth
865 */
866 iter->level = depth_want;
ed8413fd
KO
867 for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
868 iter->l[i].b = NULL;
8812600c 869 return 1;
1c6fdbd8
KO
870 }
871
872 lock_type = __btree_lock_want(iter, iter->level);
873 if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
b03b81df 874 iter, lock_type)))
1c6fdbd8
KO
875 return -EINTR;
876
877 if (likely(b == c->btree_roots[iter->btree_id].b &&
c43a6ef9 878 b->c.level == iter->level &&
1c6fdbd8
KO
879 !race_fault())) {
880 for (i = 0; i < iter->level; i++)
ed8413fd 881 iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
1c6fdbd8 882 iter->l[iter->level].b = b;
ed8413fd
KO
883 for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
884 iter->l[i].b = NULL;
1c6fdbd8
KO
885
886 mark_btree_node_locked(iter, iter->level, lock_type);
887 btree_iter_node_set(iter, b);
888 return 0;
1c6fdbd8
KO
889 }
890
c43a6ef9 891 six_unlock_type(&b->c.lock, lock_type);
1c6fdbd8
KO
892 }
893}
894
895noinline
896static void btree_iter_prefetch(struct btree_iter *iter)
897{
9e5e5b9e 898 struct bch_fs *c = iter->trans->c;
1c6fdbd8
KO
899 struct btree_iter_level *l = &iter->l[iter->level];
900 struct btree_node_iter node_iter = l->iter;
901 struct bkey_packed *k;
902 BKEY_PADDED(k) tmp;
9e5e5b9e 903 unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
1c6fdbd8
KO
904 ? (iter->level > 1 ? 0 : 2)
905 : (iter->level > 1 ? 1 : 16);
906 bool was_locked = btree_node_locked(iter, iter->level);
907
908 while (nr) {
909 if (!bch2_btree_node_relock(iter, iter->level))
910 return;
911
912 bch2_btree_node_iter_advance(&node_iter, l->b);
913 k = bch2_btree_node_iter_peek(&node_iter, l->b);
914 if (!k)
915 break;
916
917 bch2_bkey_unpack(l->b, &tmp.k, k);
9e5e5b9e 918 bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
1c6fdbd8
KO
919 }
920
921 if (!was_locked)
922 btree_node_unlock(iter, iter->level);
923}
924
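/*
 * Descend one level: unpack the key the current node iterator points at,
 * get and lock the child node it references, and make that the iterator's
 * current level (prefetching sibling nodes if BTREE_ITER_PREFETCH is set).
 */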
925static inline int btree_iter_down(struct btree_iter *iter)
926{
9e5e5b9e 927 struct bch_fs *c = iter->trans->c;
1c6fdbd8
KO
928 struct btree_iter_level *l = &iter->l[iter->level];
929 struct btree *b;
930 unsigned level = iter->level - 1;
931 enum six_lock_type lock_type = __btree_lock_want(iter, level);
932 BKEY_PADDED(k) tmp;
933
934 BUG_ON(!btree_node_locked(iter, iter->level));
935
936 bch2_bkey_unpack(l->b, &tmp.k,
937 bch2_btree_node_iter_peek(&l->iter, l->b));
938
b03b81df 939 b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type);
1c6fdbd8
KO
940 if (unlikely(IS_ERR(b)))
941 return PTR_ERR(b);
942
943 mark_btree_node_locked(iter, level, lock_type);
944 btree_iter_node_set(iter, b);
945
946 if (iter->flags & BTREE_ITER_PREFETCH)
947 btree_iter_prefetch(iter);
948
949 iter->level = level;
950
951 return 0;
952}
953
954static void btree_iter_up(struct btree_iter *iter)
955{
956 btree_node_unlock(iter, iter->level++);
957}
958
959int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
960
bf7b87a4 961static int __btree_iter_traverse_all(struct btree_trans *trans,
37dd7834 962 struct btree_iter *orig_iter, int ret)
1c6fdbd8 963{
e542029e 964 struct bch_fs *c = trans->c;
37dd7834 965 struct btree_iter *iter;
e542029e
KO
966 u8 sorted[BTREE_ITER_MAX];
967 unsigned i, nr_sorted = 0;
968
969 trans_for_each_iter(trans, iter)
970 sorted[nr_sorted++] = iter - trans->iters;
971
972#define btree_iter_cmp_by_idx(_l, _r) \
973 btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
974
975 bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
976#undef btree_iter_cmp_by_idx
977
1c6fdbd8 978retry_all:
58fbf808 979 bch2_trans_unlock(trans);
1c6fdbd8 980
bf7b87a4 981 if (unlikely(ret == -ENOMEM)) {
1c6fdbd8
KO
982 struct closure cl;
983
984 closure_init_stack(&cl);
985
986 do {
987 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
988 closure_sync(&cl);
989 } while (ret);
990 }
991
bf7b87a4 992 if (unlikely(ret == -EIO)) {
ece254b2 993 trans->error = true;
37dd7834
KO
994 orig_iter->flags |= BTREE_ITER_ERROR;
995 orig_iter->l[orig_iter->level].b = BTREE_ITER_NO_NODE_ERROR;
bf7b87a4
KO
996 goto out;
997 }
998
999 BUG_ON(ret && ret != -EINTR);
1000
1c6fdbd8 1001 /* Now, redo traversals in correct order: */
e542029e
KO
1002 for (i = 0; i < nr_sorted; i++) {
1003 iter = &trans->iters[sorted[i]];
1c6fdbd8 1004
e542029e
KO
1005 do {
1006 ret = __bch2_btree_iter_traverse(iter);
1007 } while (ret == -EINTR);
1c6fdbd8 1008
e542029e
KO
1009 if (ret)
1010 goto retry_all;
1011 }
1c6fdbd8 1012
932aa837 1013 ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0;
1c6fdbd8
KO
1014out:
1015 bch2_btree_cache_cannibalize_unlock(c);
1016 return ret;
bf7b87a4 1017}
1c6fdbd8 1018
bf7b87a4
KO
1019int bch2_btree_iter_traverse_all(struct btree_trans *trans)
1020{
1021 return __btree_iter_traverse_all(trans, NULL, 0);
1c6fdbd8
KO
1022}
1023
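/*
 * Walk up from the current level, unlocking and clearing levels whose nodes
 * can no longer be relocked (or, if @check_pos, no longer cover the
 * iterator position), and return the level we stopped at.
 */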
1024static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
1025 bool check_pos)
1026{
1027 unsigned l = iter->level;
1028
1029 while (btree_iter_node(iter, l) &&
ed8413fd
KO
1030 (!is_btree_node(iter, l) ||
1031 !bch2_btree_node_relock(iter, l) ||
1032 (check_pos &&
1033 !btree_iter_pos_in_node(iter, iter->l[l].b)))) {
1c6fdbd8 1034 btree_node_unlock(iter, l);
ed8413fd 1035 iter->l[l].b = BTREE_ITER_NO_NODE_UP;
1c6fdbd8
KO
1036 l++;
1037 }
1038
1039 return l;
1040}
1041
1042/*
1043 * This is the main state machine for walking down the btree - walks down to a
1044 * specified depth
1045 *
1046 * Returns 0 on success, -EIO on error (error reading in a btree node).
1047 *
1048 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
b7607ce9 1049 * stashed in the iterator and returned from bch2_trans_exit().
1c6fdbd8
KO
1050 */
1051int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
1052{
1053 unsigned depth_want = iter->level;
1054
1055 if (unlikely(iter->level >= BTREE_MAX_DEPTH))
1056 return 0;
1057
7d825866 1058 if (bch2_btree_iter_relock(iter, false))
1c6fdbd8
KO
1059 return 0;
1060
1c6fdbd8
KO
1061 /*
1062 * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
1063 * here unnecessary
1064 */
1065 iter->level = btree_iter_up_until_locked(iter, true);
1066
1067 /*
1068 * If we've got a btree node locked (i.e. we aren't about to relock the
1069 * root) - advance its node iterator if necessary:
1070 *
1071 * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
1072 */
a00fd8c5
KO
1073 if (btree_iter_node(iter, iter->level))
1074 btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);
1c6fdbd8
KO
1075
1076 /*
1077 * Note: iter->nodes[iter->level] may be temporarily NULL here - that
1078 * would indicate to other code that we got to the end of the btree,
1079 * here it indicates that relocking the root failed - it's critical that
1080 * btree_iter_lock_root() comes next and that it can't fail
1081 */
1082 while (iter->level > depth_want) {
1083 int ret = btree_iter_node(iter, iter->level)
1084 ? btree_iter_down(iter)
1085 : btree_iter_lock_root(iter, depth_want);
1086 if (unlikely(ret)) {
8812600c
KO
1087 if (ret == 1)
1088 return 0;
1089
1c6fdbd8 1090 iter->level = depth_want;
ed8413fd 1091 iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN;
1c6fdbd8
KO
1092 return ret;
1093 }
1094 }
1095
1096 iter->uptodate = BTREE_ITER_NEED_PEEK;
271a3d3a 1097
0f238367 1098 bch2_btree_trans_verify_locks(iter->trans);
271a3d3a 1099 __bch2_btree_iter_verify(iter, iter->l[iter->level].b);
1c6fdbd8
KO
1100 return 0;
1101}
1102
1103int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
1104{
1105 int ret;
1106
0e6dd8fb
KO
1107 ret = bch2_trans_cond_resched(iter->trans) ?:
1108 __bch2_btree_iter_traverse(iter);
1c6fdbd8 1109 if (unlikely(ret))
bf7b87a4 1110 ret = __btree_iter_traverse_all(iter->trans, iter, ret);
1c6fdbd8 1111
1c6fdbd8
KO
1112 return ret;
1113}
1114
1115static inline void bch2_btree_iter_checks(struct btree_iter *iter,
1116 enum btree_iter_type type)
1117{
1118 EBUG_ON(iter->btree_id >= BTREE_ID_NR);
1c6fdbd8 1119 EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
ab5c63f5 1120 (btree_node_type_is_extents(iter->btree_id) &&
1c6fdbd8
KO
1121 type != BTREE_ITER_NODES));
1122
0f238367 1123 bch2_btree_trans_verify_locks(iter->trans);
1c6fdbd8
KO
1124}
1125
1126/* Iterate across nodes (leaf and interior nodes) */
1127
1128struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
1129{
1130 struct btree *b;
1131 int ret;
1132
1133 bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
1134
1135 if (iter->uptodate == BTREE_ITER_UPTODATE)
1136 return iter->l[iter->level].b;
1137
1138 ret = bch2_btree_iter_traverse(iter);
1139 if (ret)
1140 return NULL;
1141
1142 b = btree_iter_node(iter, iter->level);
1143 if (!b)
1144 return NULL;
1145
1146 BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
1147
1148 iter->pos = b->key.k.p;
1149 iter->uptodate = BTREE_ITER_UPTODATE;
1150
1151 return b;
1152}
1153
1154struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
1155{
1156 struct btree *b;
1157 int ret;
1158
1159 bch2_btree_iter_checks(iter, BTREE_ITER_NODES);
1160
1161 /* already got to end? */
1162 if (!btree_iter_node(iter, iter->level))
1163 return NULL;
1164
1dd7f9d9
KO
1165 bch2_trans_cond_resched(iter->trans);
1166
1c6fdbd8
KO
1167 btree_iter_up(iter);
1168
1169 if (!bch2_btree_node_relock(iter, iter->level))
1170 btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
1171
1172 ret = bch2_btree_iter_traverse(iter);
1173 if (ret)
1174 return NULL;
1175
1176 /* got to end? */
1177 b = btree_iter_node(iter, iter->level);
1178 if (!b)
1179 return NULL;
1180
1181 if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
1182 /*
1183 * Haven't gotten to the end of the parent node: go back down to
1184 * the next child node
1185 */
1186
1187 /*
1188 * We don't really want to be unlocking here except we can't
1189 * directly tell btree_iter_traverse() "traverse to this level"
1190 * except by setting iter->level, so we have to unlock so we
1191 * don't screw up our lock invariants:
1192 */
1193 if (btree_node_read_locked(iter, iter->level))
1194 btree_node_unlock(iter, iter->level);
1195
1196 /* ick: */
1197 iter->pos = iter->btree_id == BTREE_ID_INODES
1198 ? btree_type_successor(iter->btree_id, iter->pos)
1199 : bkey_successor(iter->pos);
1200 iter->level = depth;
1201
1202 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1203 ret = bch2_btree_iter_traverse(iter);
1204 if (ret)
1205 return NULL;
1206
1207 b = iter->l[iter->level].b;
1208 }
1209
1210 iter->pos = b->key.k.p;
1211 iter->uptodate = BTREE_ITER_UPTODATE;
1212
1213 return b;
1214}
1215
1216/* Iterate across keys (in leaf nodes only) */
1217
1218void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
1219{
1220 struct btree_iter_level *l = &iter->l[0];
1c6fdbd8
KO
1221
1222 EBUG_ON(iter->level != 0);
1223 EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
1224 EBUG_ON(!btree_node_locked(iter, 0));
1225 EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);
1226
1227 iter->pos = new_pos;
1228 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1229
a00fd8c5 1230 btree_iter_advance_to_pos(iter, l, -1);
1c6fdbd8 1231
a00fd8c5
KO
1232 if (bch2_btree_node_iter_end(&l->iter) &&
1233 btree_iter_pos_after_node(iter, l->b))
1c6fdbd8 1234 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1c6fdbd8
KO
1235}
1236
1237void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
1238{
1239 int cmp = bkey_cmp(new_pos, iter->pos);
1240 unsigned level;
1241
1242 if (!cmp)
1243 return;
1244
1245 iter->pos = new_pos;
1246
1247 level = btree_iter_up_until_locked(iter, true);
1248
1249 if (btree_iter_node(iter, level)) {
1c6fdbd8
KO
1250 /*
1251 * We might have to skip over many keys, or just a few: try
1252 * advancing the node iterator, and if we have to skip over too
1253 * many keys just reinit it (or if we're rewinding, since that
1254 * is expensive).
1255 */
a00fd8c5
KO
1256 if (cmp < 0 ||
1257 !btree_iter_advance_to_pos(iter, &iter->l[level], 8))
1258 __btree_iter_init(iter, level);
1c6fdbd8
KO
1259
1260 /* Don't leave it locked if we're not supposed to: */
1261 if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED)
1262 btree_node_unlock(iter, level);
1263 }
1264
1265 if (level != iter->level)
1266 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1267 else
1268 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1269}
1270
1271static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
1272{
1273 struct btree_iter_level *l = &iter->l[0];
1274 struct bkey_s_c ret = { .k = &iter->k };
1275
1276 if (!bkey_deleted(&iter->k)) {
1277 EBUG_ON(bch2_btree_node_iter_end(&l->iter));
1278 ret.v = bkeyp_val(&l->b->format,
1279 __bch2_btree_node_iter_peek_all(&l->iter, l->b));
1280 }
1281
9e5e5b9e 1282 if (debug_check_bkeys(iter->trans->c) &&
1c6fdbd8 1283 !bkey_deleted(ret.k))
9e5e5b9e 1284 bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
1c6fdbd8
KO
1285 return ret;
1286}
1287
1288struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
1289{
1290 struct btree_iter_level *l = &iter->l[0];
1291 struct bkey_s_c k;
1292 int ret;
1293
1294 bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1295
1296 if (iter->uptodate == BTREE_ITER_UPTODATE)
1297 return btree_iter_peek_uptodate(iter);
1298
1299 while (1) {
0e6dd8fb
KO
1300 if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) {
1301 ret = bch2_btree_iter_traverse(iter);
1302 if (unlikely(ret))
1303 return bkey_s_c_err(ret);
1304 }
1c6fdbd8
KO
1305
1306 k = __btree_iter_peek(iter, l);
1307 if (likely(k.k))
1308 break;
1309
1310 /* got to the end of the leaf, iterator needs to be traversed: */
1311 iter->pos = l->b->key.k.p;
1312 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1313
1314 if (!bkey_cmp(iter->pos, POS_MAX))
1315 return bkey_s_c_null;
1316
1317 iter->pos = btree_type_successor(iter->btree_id, iter->pos);
1318 }
1319
1320 /*
1321 * iter->pos should always be equal to the key we just
1322 * returned - except extents can straddle iter->pos:
1323 */
1324 if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
1325 bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
1326 iter->pos = bkey_start_pos(k.k);
1327
1328 iter->uptodate = BTREE_ITER_UPTODATE;
1329 return k;
1330}
1331
1332static noinline
1333struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
1334{
1335 struct btree_iter_level *l = &iter->l[0];
1336
1337 iter->pos = l->b->key.k.p;
1338 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1339
1340 if (!bkey_cmp(iter->pos, POS_MAX))
1341 return bkey_s_c_null;
1342
1343 iter->pos = btree_type_successor(iter->btree_id, iter->pos);
1344
1345 return bch2_btree_iter_peek(iter);
1346}
1347
1348struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
1349{
1350 struct btree_iter_level *l = &iter->l[0];
1351 struct bkey_packed *p;
1352 struct bkey_s_c k;
1353
1354 bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1355
0e6dd8fb
KO
1356 iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
1357
1c6fdbd8 1358 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
0e6dd8fb
KO
1359 /*
1360 * XXX: when we just need to relock we should be able to avoid
1361 * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
1362 * for that to work
1363 */
1364 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1365
1366 return bch2_btree_iter_peek(iter);
1c6fdbd8
KO
1367 }
1368
1369 do {
a00fd8c5 1370 bch2_btree_node_iter_advance(&l->iter, l->b);
1c6fdbd8
KO
1371 p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
1372 if (unlikely(!p))
1373 return bch2_btree_iter_peek_next_leaf(iter);
1374 } while (bkey_whiteout(p));
1375
1376 k = __btree_iter_unpack(iter, l, &iter->k, p);
1377
1378 EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
1379 iter->pos = bkey_start_pos(k.k);
1380 return k;
1381}
1382
1383struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
1384{
1385 struct btree_iter_level *l = &iter->l[0];
1386 struct bkey_packed *p;
1387 struct bkey_s_c k;
1388 int ret;
1389
1390 bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);
1391
1392 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1393 k = bch2_btree_iter_peek(iter);
1394 if (IS_ERR(k.k))
1395 return k;
1396 }
1397
1398 while (1) {
1399 p = bch2_btree_node_iter_prev(&l->iter, l->b);
1400 if (likely(p))
1401 break;
1402
1403 iter->pos = l->b->data->min_key;
1404 if (!bkey_cmp(iter->pos, POS_MIN))
1405 return bkey_s_c_null;
1406
1407 bch2_btree_iter_set_pos(iter,
1408 btree_type_predecessor(iter->btree_id, iter->pos));
1409
1410 ret = bch2_btree_iter_traverse(iter);
1411 if (unlikely(ret))
1412 return bkey_s_c_err(ret);
1413
1414 p = bch2_btree_node_iter_peek(&l->iter, l->b);
1415 if (p)
1416 break;
1417 }
1418
1419 k = __btree_iter_unpack(iter, l, &iter->k, p);
1420
1421 EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
1422
1423 iter->pos = bkey_start_pos(k.k);
1424 iter->uptodate = BTREE_ITER_UPTODATE;
1425 return k;
1426}
1427
1428static inline struct bkey_s_c
271a3d3a 1429__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
1c6fdbd8
KO
1430{
1431 struct btree_iter_level *l = &iter->l[0];
271a3d3a 1432 struct btree_node_iter node_iter;
1c6fdbd8
KO
1433 struct bkey_s_c k;
1434 struct bkey n;
1435 int ret;
1436
1437recheck:
1438 while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
1439 bkey_deleted(k.k) &&
1440 bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
a00fd8c5 1441 bch2_btree_node_iter_advance(&l->iter, l->b);
1c6fdbd8 1442
271a3d3a
KO
1443 /*
1444 * iterator is now at the correct position for inserting at iter->pos,
1445 * but we need to keep iterating until we find the first non whiteout so
1446 * we know how big a hole we have, if any:
1447 */
1448
1449 node_iter = l->iter;
1450 if (k.k && bkey_whiteout(k.k))
1451 k = __btree_iter_unpack(iter, l, &iter->k,
1452 bch2_btree_node_iter_peek(&node_iter, l->b));
1453
1c6fdbd8
KO
1454 /*
1455 * If we got to the end of the node, check if we need to traverse to the
1456 * next node:
1457 */
1458 if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
1459 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1460 ret = bch2_btree_iter_traverse(iter);
1461 if (unlikely(ret))
1462 return bkey_s_c_err(ret);
1463
1464 goto recheck;
1465 }
1466
1467 if (k.k &&
1468 !bkey_whiteout(k.k) &&
1469 bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
271a3d3a
KO
1470 /*
1471 * if we skipped forward to find the first non whiteout and
1472 * there _wasn't_ actually a hole, we want the iterator to be
1473 * pointed at the key we found:
1474 */
1475 l->iter = node_iter;
1476
1c6fdbd8
KO
1477 EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
1478 EBUG_ON(bkey_deleted(k.k));
1479 iter->uptodate = BTREE_ITER_UPTODATE;
1480 return k;
1481 }
1482
1483 /* hole */
271a3d3a
KO
1484
1485 /* holes can't span inode numbers: */
1486 if (iter->pos.offset == KEY_OFFSET_MAX) {
1487 if (iter->pos.inode == KEY_INODE_MAX)
1488 return bkey_s_c_null;
1489
1490 iter->pos = bkey_successor(iter->pos);
1491 goto recheck;
1492 }
1493
1494 if (!k.k)
1495 k.k = &l->b->key.k;
1496
1c6fdbd8
KO
1497 bkey_init(&n);
1498 n.p = iter->pos;
271a3d3a
KO
1499 bch2_key_resize(&n,
1500 min_t(u64, KEY_SIZE_MAX,
1501 (k.k->p.inode == n.p.inode
1502 ? bkey_start_offset(k.k)
1503 : KEY_OFFSET_MAX) -
1504 n.p.offset));
1505
319f9ac3 1506 EBUG_ON(!n.size);
1c6fdbd8 1507
271a3d3a
KO
1508 iter->k = n;
1509 iter->uptodate = BTREE_ITER_UPTODATE;
1510 return (struct bkey_s_c) { &iter->k, NULL };
1511}
1c6fdbd8 1512
271a3d3a
KO
1513static inline struct bkey_s_c
1514__bch2_btree_iter_peek_slot(struct btree_iter *iter)
1515{
1516 struct btree_iter_level *l = &iter->l[0];
1517 struct bkey_s_c k;
1518 int ret;
1c6fdbd8 1519
271a3d3a
KO
1520 if (iter->flags & BTREE_ITER_IS_EXTENTS)
1521 return __bch2_btree_iter_peek_slot_extents(iter);
1c6fdbd8 1522
271a3d3a
KO
1523recheck:
1524 while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
1525 bkey_deleted(k.k) &&
1526 bkey_cmp(k.k->p, iter->pos) == 0)
a00fd8c5 1527 bch2_btree_node_iter_advance(&l->iter, l->b);
1c6fdbd8 1528
271a3d3a
KO
1529 /*
1530 * If we got to the end of the node, check if we need to traverse to the
1531 * next node:
1532 */
1533 if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
1534 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1535 ret = bch2_btree_iter_traverse(iter);
1536 if (unlikely(ret))
1537 return bkey_s_c_err(ret);
1c6fdbd8 1538
271a3d3a 1539 goto recheck;
1c6fdbd8
KO
1540 }
1541
271a3d3a
KO
1542 if (k.k &&
1543 !bkey_deleted(k.k) &&
1544 !bkey_cmp(iter->pos, k.k->p)) {
1545 iter->uptodate = BTREE_ITER_UPTODATE;
1546 return k;
1547 } else {
1548 /* hole */
1549 bkey_init(&iter->k);
1550 iter->k.p = iter->pos;
1551
1552 iter->uptodate = BTREE_ITER_UPTODATE;
1553 return (struct bkey_s_c) { &iter->k, NULL };
1554 }
1c6fdbd8
KO
1555}
1556
1557struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
1558{
1559 int ret;
1560
1561 bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
1562
1563 if (iter->uptodate == BTREE_ITER_UPTODATE)
1564 return btree_iter_peek_uptodate(iter);
1565
0e6dd8fb
KO
1566 if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) {
1567 ret = bch2_btree_iter_traverse(iter);
1568 if (unlikely(ret))
1569 return bkey_s_c_err(ret);
1570 }
1c6fdbd8
KO
1571
1572 return __bch2_btree_iter_peek_slot(iter);
1573}
1574
1575struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
1576{
1577 bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);
1578
1579 iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
1580
1581 if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
1582 /*
1583 * XXX: when we just need to relock we should be able to avoid
1584 * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
1585 * for that to work
1586 */
1587 btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
1588
1589 return bch2_btree_iter_peek_slot(iter);
1590 }
1591
1592 if (!bkey_deleted(&iter->k))
a00fd8c5 1593 bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b);
1c6fdbd8
KO
1594
1595 btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
1596
1597 return __bch2_btree_iter_peek_slot(iter);
1598}
1599
9e5e5b9e
KO
1600static inline void bch2_btree_iter_init(struct btree_trans *trans,
1601 struct btree_iter *iter, enum btree_id btree_id,
424eb881 1602 struct bpos pos, unsigned flags)
1c6fdbd8 1603{
9e5e5b9e 1604 struct bch_fs *c = trans->c;
1c6fdbd8
KO
1605 unsigned i;
1606
ab5c63f5 1607 if (btree_node_type_is_extents(btree_id) &&
424eb881
KO
1608 !(flags & BTREE_ITER_NODES))
1609 flags |= BTREE_ITER_IS_EXTENTS;
1c6fdbd8 1610
9e5e5b9e 1611 iter->trans = trans;
1c6fdbd8
KO
1612 iter->pos = pos;
1613 bkey_init(&iter->k);
1614 iter->k.p = pos;
1615 iter->flags = flags;
1616 iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
1617 iter->btree_id = btree_id;
424eb881
KO
1618 iter->level = 0;
1619 iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0;
1c6fdbd8
KO
1620 iter->nodes_locked = 0;
1621 iter->nodes_intent_locked = 0;
1622 for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1623 iter->l[i].b = NULL;
ed8413fd 1624 iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
1c6fdbd8
KO
1625
1626 prefetch(c->btree_roots[btree_id].b);
1627}
1628
1c6fdbd8
KO
1629/* new transactional stuff: */
1630
424eb881
KO
1631int bch2_trans_iter_put(struct btree_trans *trans,
1632 struct btree_iter *iter)
08af47df 1633{
0f238367 1634 int ret = btree_iter_err(iter);
08af47df 1635
e1120a4c 1636 trans->iters_live &= ~(1ULL << iter->idx);
424eb881 1637 return ret;
08af47df
KO
1638}
1639
5df4be3f
KO
1640static inline void __bch2_trans_iter_free(struct btree_trans *trans,
1641 unsigned idx)
1642{
ecc892e4 1643 __bch2_btree_iter_unlock(&trans->iters[idx]);
5df4be3f
KO
1644 trans->iters_linked &= ~(1ULL << idx);
1645 trans->iters_live &= ~(1ULL << idx);
1646 trans->iters_touched &= ~(1ULL << idx);
1647 trans->iters_unlink_on_restart &= ~(1ULL << idx);
1648 trans->iters_unlink_on_commit &= ~(1ULL << idx);
5df4be3f
KO
1649}
1650
424eb881
KO
1651int bch2_trans_iter_free(struct btree_trans *trans,
1652 struct btree_iter *iter)
1c6fdbd8 1653{
0f238367 1654 int ret = btree_iter_err(iter);
424eb881 1655
e1120a4c 1656 __bch2_trans_iter_free(trans, iter->idx);
424eb881 1657 return ret;
5df4be3f 1658}
1c6fdbd8 1659
424eb881
KO
1660int bch2_trans_iter_free_on_commit(struct btree_trans *trans,
1661 struct btree_iter *iter)
5df4be3f 1662{
0f238367 1663 int ret = btree_iter_err(iter);
424eb881 1664
e1120a4c 1665 trans->iters_unlink_on_commit |= 1ULL << iter->idx;
424eb881 1666 return ret;
1c6fdbd8
KO
1667}
1668
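/*
 * Grow the transaction's iterator and update arrays to @new_size, falling
 * back to the btree_iters mempool at BTREE_ITER_MAX. If any iterators were
 * live when the arrays moved, return -EINTR so the transaction restarts
 * against the new allocation.
 */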
1669static int bch2_trans_realloc_iters(struct btree_trans *trans,
1670 unsigned new_size)
1671{
a8e00bd4 1672 void *new_iters, *new_updates;
1c6fdbd8 1673
7d825866
KO
1674 new_size = roundup_pow_of_two(new_size);
1675
a8e00bd4
KO
1676 BUG_ON(new_size > BTREE_ITER_MAX);
1677
1678 if (new_size <= trans->size)
1679 return 0;
1680
1681 BUG_ON(trans->used_mempool);
1682
1c6fdbd8
KO
1683 bch2_trans_unlock(trans);
1684
a8e00bd4
KO
1685 new_iters = kmalloc(sizeof(struct btree_iter) * new_size +
1686 sizeof(struct btree_insert_entry) * (new_size + 4),
1687 GFP_NOFS);
1688 if (new_iters)
1689 goto success;
1690
581edb63 1691 new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
a8e00bd4
KO
1692 new_size = BTREE_ITER_MAX;
1693
1694 trans->used_mempool = true;
1695success:
1696 new_updates = new_iters + sizeof(struct btree_iter) * new_size;
1c6fdbd8
KO
1697
1698 memcpy(new_iters, trans->iters,
1699 sizeof(struct btree_iter) * trans->nr_iters);
a8e00bd4
KO
1700 memcpy(new_updates, trans->updates,
1701 sizeof(struct btree_insert_entry) * trans->nr_updates);
1702
5df4be3f
KO
1703 if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
1704 memset(trans->iters, POISON_FREE,
1705 sizeof(struct btree_iter) * trans->nr_iters +
1706 sizeof(struct btree_insert_entry) * trans->nr_iters);
1707
a8e00bd4
KO
1708 if (trans->iters != trans->iters_onstack)
1709 kfree(trans->iters);
1710
1711 trans->iters = new_iters;
1712 trans->updates = new_updates;
1713 trans->size = new_size;
1c6fdbd8 1714
1c7a0adf 1715 if (trans->iters_live) {
20bceecb 1716 trace_trans_restart_iters_realloced(trans->ip, trans->size);
1c7a0adf
KO
1717 return -EINTR;
1718 }
1719
1720 return 0;
1c6fdbd8
KO
1721}
1722
7c26ecae
KO
1723static int btree_trans_iter_alloc(struct btree_trans *trans)
1724{
4c1c1e39 1725 unsigned idx = __ffs64(~trans->iters_linked);
7c26ecae
KO
1726
1727 if (idx < trans->nr_iters)
1728 goto got_slot;
1729
1730 if (trans->nr_iters == trans->size) {
7d825866 1731 int ret = bch2_trans_realloc_iters(trans, trans->size * 2);
7c26ecae
KO
1732 if (ret)
1733 return ret;
1734 }
1735
1736 idx = trans->nr_iters++;
1737 BUG_ON(trans->nr_iters > trans->size);
e1120a4c
KO
1738
1739 trans->iters[idx].idx = idx;
7c26ecae 1740got_slot:
7c26ecae 1741 BUG_ON(trans->iters_linked & (1ULL << idx));
7c26ecae
KO
1742 trans->iters_linked |= 1ULL << idx;
1743 return idx;
1744}
1745
1c6fdbd8 1746static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
5df4be3f 1747 unsigned btree_id, struct bpos pos,
1c6fdbd8
KO
1748 unsigned flags, u64 iter_id)
1749{
1750 struct btree_iter *iter;
1751 int idx;
1752
1753 BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
1754
5df4be3f 1755 for (idx = 0; idx < trans->nr_iters; idx++) {
7c26ecae
KO
1756 if (!(trans->iters_linked & (1ULL << idx)))
1757 continue;
1758
5df4be3f
KO
1759 iter = &trans->iters[idx];
1760 if (iter_id
1761 ? iter->id == iter_id
1762 : (iter->btree_id == btree_id &&
1763 !bkey_cmp(iter->pos, pos)))
1c6fdbd8 1764 goto found;
5df4be3f 1765 }
1c6fdbd8
KO
1766 idx = -1;
1767found:
1768 if (idx < 0) {
7c26ecae
KO
1769 idx = btree_trans_iter_alloc(trans);
1770 if (idx < 0)
1771 return ERR_PTR(idx);
1c6fdbd8 1772
1c6fdbd8 1773 iter = &trans->iters[idx];
a8e00bd4 1774 iter->id = iter_id;
1c6fdbd8 1775
9e5e5b9e 1776 bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
1c6fdbd8
KO
1777 } else {
1778 iter = &trans->iters[idx];
1779
1c6fdbd8
KO
1780 iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
1781 iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
1782 }
1783
5df4be3f 1784 BUG_ON(iter->btree_id != btree_id);
a8e00bd4 1785 BUG_ON(trans->iters_live & (1ULL << idx));
5df4be3f
KO
1786 trans->iters_live |= 1ULL << idx;
1787 trans->iters_touched |= 1ULL << idx;
1c6fdbd8 1788
08af47df
KO
1789 BUG_ON(iter->btree_id != btree_id);
1790 BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
1791
1c6fdbd8
KO
1792 return iter;
1793}
1794
1795struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
1796 enum btree_id btree_id,
1797 struct bpos pos, unsigned flags,
1798 u64 iter_id)
1799{
1800 struct btree_iter *iter =
5df4be3f 1801 __btree_trans_get_iter(trans, btree_id, pos, flags, iter_id);
1c6fdbd8
KO
1802
1803 if (!IS_ERR(iter))
1804 bch2_btree_iter_set_pos(iter, pos);
1805 return iter;
1806}
1807
424eb881
KO
1808struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
1809 enum btree_id btree_id,
1810 struct bpos pos,
1811 unsigned locks_want,
1812 unsigned depth,
1813 unsigned flags)
1814{
1815 struct btree_iter *iter =
1816 __btree_trans_get_iter(trans, btree_id, pos,
1817 flags|BTREE_ITER_NODES, 0);
1818 unsigned i;
1819
1820 BUG_ON(IS_ERR(iter));
1821 BUG_ON(bkey_cmp(iter->pos, pos));
1822
1823 iter->locks_want = locks_want;
1824 iter->level = depth;
1825
1826 for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1827 iter->l[i].b = NULL;
ed8413fd 1828 iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
424eb881
KO
1829
1830 return iter;
1831}
1832
7c26ecae
KO
1833struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
1834 struct btree_iter *src)
1c6fdbd8 1835{
ecc892e4 1836 struct btree_iter *iter;
e1120a4c 1837 unsigned offset = offsetof(struct btree_iter, trans);
ecc892e4 1838 int i, idx;
1c6fdbd8 1839
7c26ecae
KO
1840 idx = btree_trans_iter_alloc(trans);
1841 if (idx < 0)
1842 return ERR_PTR(idx);
1843
1844 trans->iters_live |= 1ULL << idx;
1845 trans->iters_touched |= 1ULL << idx;
1846 trans->iters_unlink_on_restart |= 1ULL << idx;
1847
ecc892e4 1848 iter = &trans->iters[idx];
e1120a4c
KO
1849
1850 memcpy((void *) iter + offset,
1851 (void *) src + offset,
1852 sizeof(*iter) - offset);
ecc892e4
KO
1853
1854 for (i = 0; i < BTREE_MAX_DEPTH; i++)
1855 if (btree_node_locked(iter, i))
c43a6ef9 1856 six_lock_increment(&iter->l[i].b->c.lock,
ecc892e4 1857 __btree_lock_want(iter, i));
7c26ecae
KO
1858
1859 return &trans->iters[idx];
1c6fdbd8
KO
1860}
1861
20bceecb 1862static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size)
1c6fdbd8 1863{
20bceecb 1864 if (size > trans->mem_bytes) {
1c6fdbd8 1865 size_t old_bytes = trans->mem_bytes;
20bceecb 1866 size_t new_bytes = roundup_pow_of_two(size);
1c6fdbd8
KO
1867 void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
1868
1869 if (!new_mem)
20bceecb 1870 return -ENOMEM;
1c6fdbd8
KO
1871
1872 trans->mem = new_mem;
1873 trans->mem_bytes = new_bytes;
1874
1c7a0adf 1875 if (old_bytes) {
20bceecb
KO
1876 trace_trans_restart_mem_realloced(trans->ip, new_bytes);
1877 return -EINTR;
1c7a0adf 1878 }
1c6fdbd8
KO
1879 }
1880
20bceecb
KO
1881 return 0;
1882}
1883
1884void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
1885{
1886 void *p;
1887 int ret;
1888
1889 ret = bch2_trans_preload_mem(trans, trans->mem_top + size);
1890 if (ret)
1891 return ERR_PTR(ret);
1892
1893 p = trans->mem + trans->mem_top;
1c6fdbd8 1894 trans->mem_top += size;
20bceecb 1895 return p;
1c6fdbd8
KO
1896}
1897
5df4be3f
KO
1898inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters)
1899{
1900 iters &= trans->iters_linked;
4afe7000 1901 iters &= ~trans->iters_live;
5df4be3f
KO
1902
1903 while (iters) {
1904 unsigned idx = __ffs64(iters);
1905
1906 iters &= ~(1ULL << idx);
1907 __bch2_trans_iter_free(trans, idx);
1908 }
1909}
1910
61011ea2 1911void bch2_trans_begin(struct btree_trans *trans)
1c6fdbd8 1912{
5df4be3f 1913 u64 iters_to_unlink;
1c6fdbd8 1914
1c6fdbd8
KO
1915 /*
1916 * On transaction restart, the transaction isn't required to allocate
1917 * all the same iterators it did on the last iteration:
1918 *
1919 * Unlink any iterators it didn't use this iteration, assuming it got
1920 * further (allocated an iter with a higher idx) than where the iter
1921 * was originally allocated:
1922 */
5df4be3f
KO
1923 iters_to_unlink = ~trans->iters_live &
1924 ((1ULL << fls64(trans->iters_live)) - 1);
a8e00bd4 1925
5df4be3f
KO
1926 iters_to_unlink |= trans->iters_unlink_on_restart;
1927 iters_to_unlink |= trans->iters_unlink_on_commit;
a8e00bd4 1928
4afe7000
KO
1929 trans->iters_live = 0;
1930
5df4be3f 1931 bch2_trans_unlink_iters(trans, iters_to_unlink);
1c6fdbd8 1932
5df4be3f
KO
1933 trans->iters_touched = 0;
1934 trans->iters_unlink_on_restart = 0;
1935 trans->iters_unlink_on_commit = 0;
1936 trans->nr_updates = 0;
1937 trans->mem_top = 0;
bf7b87a4
KO
1938
1939 bch2_btree_iter_traverse_all(trans);
1c6fdbd8
KO
1940}
1941
20bceecb
KO
1942void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
1943 unsigned expected_nr_iters,
1944 size_t expected_mem_bytes)
1c6fdbd8 1945{
a8e00bd4
KO
1946 memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
1947
1c6fdbd8 1948 trans->c = c;
ba5c6557 1949 trans->ip = _RET_IP_;
a8e00bd4 1950 trans->size = ARRAY_SIZE(trans->iters_onstack);
1c6fdbd8 1951 trans->iters = trans->iters_onstack;
a8e00bd4 1952 trans->updates = trans->updates_onstack;
3838be78 1953 trans->fs_usage_deltas = NULL;
20bceecb
KO
1954
1955 if (expected_nr_iters > trans->size)
1956 bch2_trans_realloc_iters(trans, expected_nr_iters);
1957
1958 if (expected_mem_bytes)
1959 bch2_trans_preload_mem(trans, expected_mem_bytes);
1c6fdbd8
KO
1960}
1961
1962int bch2_trans_exit(struct btree_trans *trans)
1963{
ece254b2 1964 bch2_trans_unlock(trans);
1c6fdbd8 1965
3838be78 1966 kfree(trans->fs_usage_deltas);
1c6fdbd8 1967 kfree(trans->mem);
a8e00bd4 1968 if (trans->used_mempool)
581edb63 1969 mempool_free(trans->iters, &trans->c->btree_iters_pool);
a8e00bd4
KO
1970 else if (trans->iters != trans->iters_onstack)
1971 kfree(trans->iters);
1c6fdbd8
KO
1972 trans->mem = (void *) 0x1;
1973 trans->iters = (void *) 0x1;
ece254b2
KO
1974
1975 return trans->error ? -EIO : 0;
1c6fdbd8 1976}