// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "extents.h"
#include "trace.h"

#include <linux/prefetch.h>

static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
						    struct btree_iter_level *,
						    struct bkey *);

#define BTREE_ITER_NOT_END	((struct btree *) 1)

static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
{
	return l < BTREE_MAX_DEPTH &&
		iter->l[l].b &&
		iter->l[l].b != BTREE_ITER_NOT_END;
}

/* Returns < 0 if @k is before iter pos, > 0 if @k is after */
static inline int __btree_iter_pos_cmp(struct btree_iter *iter,
				       const struct btree *b,
				       const struct bkey_packed *k,
				       bool interior_node)
{
	int cmp = bkey_cmp_left_packed(b, k, &iter->pos);

	if (cmp)
		return cmp;
	if (bkey_deleted(k))
		return -1;

	/*
	 * Normally, for extents we want the first key strictly greater than
	 * the iterator position - with the exception that for interior nodes,
	 * we don't want to advance past the last key if the iterator position
	 * is POS_MAX:
	 */
	if (iter->flags & BTREE_ITER_IS_EXTENTS &&
	    (!interior_node ||
	     bkey_cmp_left_packed_byval(b, k, POS_MAX)))
		return -1;
	return 1;
}

static inline int btree_iter_pos_cmp(struct btree_iter *iter,
				     const struct btree *b,
				     const struct bkey_packed *k)
{
	return __btree_iter_pos_cmp(iter, b, k, b->level != 0);
}

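/*
 * Example (added for clarity; this restates btree_iter_advance_to_pos()
 * further down in this file): a node iterator is positioned with respect to
 * the comparator above by advancing while the current key still compares
 * before the iterator position:
 *
 *	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
 *	       btree_iter_pos_cmp(iter, l->b, k) < 0)
 *		bch2_btree_node_iter_advance(&l->iter, l->b);
 *
 * For extents this leaves the iterator at the first key strictly greater than
 * iter->pos; for everything else, at the first key >= iter->pos.
 */
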
/* Btree node locking: */

/*
 * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
 * succeed:
 */
void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
{
	struct btree_iter *linked;

	EBUG_ON(iter->l[b->level].b != b);
	EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq);

	trans_for_each_iter_with_node(iter->trans, b, linked)
		linked->l[b->level].lock_seq += 2;

	six_unlock_write(&b->lock);
}

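/*
 * Worked example (added; not in the original): six lock sequence numbers are
 * even when the write lock is not held and odd while it is. If an iterator
 * saved lock_seq == 4, then while we hold the write lock b->lock.state.seq ==
 * 5 (hence the EBUG_ON above), and after six_unlock_write() it will be 6 -
 * which is why each linked iterator's lock_seq is bumped by 2.
 */
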
void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
{
	struct btree_iter *linked;
	unsigned readers = 0;

	EBUG_ON(btree_node_read_locked(iter, b->level));

	trans_for_each_iter(iter->trans, linked)
		if (linked->l[b->level].b == b &&
		    btree_node_read_locked(linked, b->level))
			readers++;

	/*
	 * Must drop our read locks before calling six_lock_write() -
	 * six_unlock() won't do wakeups until the reader count
	 * goes to 0, and it's safe because we have the node intent
	 * locked:
	 */
	atomic64_sub(__SIX_VAL(read_lock, readers),
		     &b->lock.state.counter);
	btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
	atomic64_add(__SIX_VAL(read_lock, readers),
		     &b->lock.state.counter);
}

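/*
 * Note (added): __SIX_VAL(read_lock, readers) encodes @readers in the
 * read-count bitfield of the lock word, so the atomic64_sub() above
 * temporarily hides all read locks held by this transaction's own iterators
 * (which would otherwise block six_lock_write() forever), and the
 * atomic64_add() restores them once the write lock is held.
 */
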
bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{
	struct btree *b = btree_iter_node(iter, level);
	int want = __btree_lock_want(iter, level);

	if (!b || b == BTREE_ITER_NOT_END)
		return false;

	if (race_fault())
		return false;

	if (!six_relock_type(&b->lock, want, iter->l[level].lock_seq) &&
	    !(iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 &&
	      btree_node_lock_increment(iter, b, level, want)))
		return false;

	mark_btree_node_locked(iter, level, want);
	return true;
}

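/*
 * Note (added): the seq >> 1 comparison discards the low bit, which is set
 * only while a write lock is held. A saved lock_seq of 6 therefore still
 * matches a current seq of 6 or 7 - but not 8, since once a write lock has
 * been released the node may have been modified and a relock must fail.
 */
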
static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
{
	struct btree *b = iter->l[level].b;

	EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);

	if (!is_btree_node(iter, level))
		return false;

	if (btree_node_intent_locked(iter, level))
		return true;

	if (race_fault())
		return false;

	if (btree_node_locked(iter, level)
	    ? six_lock_tryupgrade(&b->lock)
	    : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq))
		goto success;

	if (iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 &&
	    btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
		btree_node_unlock(iter, level);
		goto success;
	}

	return false;
success:
	mark_btree_node_intent_locked(iter, level);
	return true;
}

static inline bool btree_iter_get_locks(struct btree_iter *iter,
					bool upgrade)
{
	unsigned l = iter->level;
	int fail_idx = -1;

	do {
		if (!btree_iter_node(iter, l))
			break;

		if (!(upgrade
		      ? bch2_btree_node_upgrade(iter, l)
		      : bch2_btree_node_relock(iter, l))) {
			fail_idx = l;
			btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		}

		l++;
	} while (l < iter->locks_want);

	/*
	 * When we fail to get a lock, we have to ensure that any child nodes
	 * can't be relocked, so that bch2_btree_iter_traverse() has to walk
	 * back up to the node that we failed to relock:
	 */
	while (fail_idx >= 0) {
		btree_node_unlock(iter, fail_idx);
		iter->l[fail_idx].b = BTREE_ITER_NOT_END;
		--fail_idx;
	}

	if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
		iter->uptodate = BTREE_ITER_NEED_PEEK;

	bch2_btree_trans_verify_locks(iter->trans);

	return iter->uptodate < BTREE_ITER_NEED_RELOCK;
}

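/*
 * Worked example (added): with locks_want == 3, if relocking succeeds at
 * levels 0 and 1 but fails at level 2, fail_idx is left at 2 and the cleanup
 * loop above unlocks levels 2, 1 and 0 and marks them BTREE_ITER_NOT_END -
 * so the next traverse must walk back up past the level we failed on instead
 * of trusting a child lock whose ancestor was lost.
 */
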
/* Slowpath: */
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
			    unsigned level,
			    struct btree_iter *iter,
			    enum six_lock_type type,
			    bool may_drop_locks)
{
	struct btree_iter *linked;
	bool ret = true;

	/* Check if it's safe to block: */
	trans_for_each_iter(iter->trans, linked) {
		if (!linked->nodes_locked)
			continue;

		/* Must lock btree nodes in key order: */
		if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
			ret = false;

		/*
		 * Can't block taking an intent lock if we have _any_ nodes read
		 * locked:
		 *
		 * - Our read lock blocks another thread with an intent lock on
		 *   the same node from getting a write lock, and thus from
		 *   dropping its intent lock
		 *
		 * - And the other thread may have multiple nodes intent locked:
		 *   both the node we want to intent lock, and the node we
		 *   already have read locked - deadlock:
		 */
		if (type == SIX_LOCK_intent &&
		    linked->nodes_locked != linked->nodes_intent_locked) {
			if (may_drop_locks) {
				linked->locks_want = max_t(unsigned,
						linked->locks_want,
						__fls(linked->nodes_locked) + 1);
				btree_iter_get_locks(linked, true);
			}
			ret = false;
		}

		/*
		 * Interior nodes must be locked before their descendants: if
		 * another iterator has possible descendants of the node we're
		 * about to lock already locked, it must have the ancestors
		 * locked too:
		 */
		if (linked->btree_id == iter->btree_id &&
		    level > __fls(linked->nodes_locked)) {
			if (may_drop_locks) {
				linked->locks_want =
					max(level + 1, max_t(unsigned,
					    linked->locks_want,
					    iter->locks_want));
				btree_iter_get_locks(linked, true);
			}
			ret = false;
		}
	}

	if (ret)
		__btree_node_lock_type(iter->trans->c, b, type);
	else
		trans_restart();

	return ret;
}

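/*
 * Note (added): nodes_locked is a bitmask with bit l set when level l is
 * locked; e.g. nodes_locked == 0x6 means levels 1 and 2 are held, and
 * __fls(nodes_locked) == 2 is the highest locked level. The
 * "level > __fls(linked->nodes_locked)" test above therefore catches attempts
 * to take a lock on an ancestor of nodes another iterator already holds.
 */
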
/* Btree iterator locking: */

#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_iter_verify_locks(struct btree_iter *iter)
{
	unsigned l;

	BUG_ON((iter->flags & BTREE_ITER_NOUNLOCK) &&
	       !btree_node_locked(iter, 0));

	for (l = 0; btree_iter_node(iter, l); l++) {
		if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
		    !btree_node_locked(iter, l))
			continue;

		BUG_ON(btree_lock_want(iter, l) !=
		       btree_node_locked_type(iter, l));
	}
}

void bch2_btree_trans_verify_locks(struct btree_trans *trans)
{
	struct btree_iter *iter;

	trans_for_each_iter(trans, iter)
		bch2_btree_iter_verify_locks(iter);
}
#endif

__flatten
static bool bch2_btree_iter_relock(struct btree_iter *iter)
{
	return iter->uptodate >= BTREE_ITER_NEED_RELOCK
		? btree_iter_get_locks(iter, false)
		: true;
}

bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
			       unsigned new_locks_want)
{
	struct btree_iter *linked;

	EBUG_ON(iter->locks_want >= new_locks_want);

	iter->locks_want = new_locks_want;

	if (btree_iter_get_locks(iter, true))
		return true;

	/*
	 * Ancestor nodes must be locked before child nodes, so set locks_want
	 * on iterators that might lock ancestors before us to avoid getting
	 * -EINTR later:
	 */
	trans_for_each_iter(iter->trans, linked)
		if (linked != iter &&
		    linked->btree_id == iter->btree_id &&
		    btree_iter_cmp(linked, iter) <= 0 &&
		    linked->locks_want < new_locks_want) {
			linked->locks_want = new_locks_want;
			btree_iter_get_locks(linked, true);
		}

	return false;
}

bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
					unsigned new_locks_want)
{
	unsigned l = iter->level;

	EBUG_ON(iter->locks_want >= new_locks_want);

	iter->locks_want = new_locks_want;

	do {
		if (!btree_iter_node(iter, l))
			break;

		if (!bch2_btree_node_upgrade(iter, l)) {
			iter->locks_want = l;
			return false;
		}

		l++;
	} while (l < iter->locks_want);

	return true;
}

void __bch2_btree_iter_downgrade(struct btree_iter *iter,
				 unsigned downgrade_to)
{
	struct btree_iter *linked;
	unsigned l;

	/*
	 * We downgrade linked iterators as well because btree_iter_upgrade
	 * might have had to modify locks_want on linked iterators due to lock
	 * ordering:
	 */
	trans_for_each_iter(iter->trans, linked) {
		unsigned new_locks_want = downgrade_to ?:
			(linked->flags & BTREE_ITER_INTENT ? 1 : 0);

		if (linked->locks_want <= new_locks_want)
			continue;

		linked->locks_want = new_locks_want;

		while (linked->nodes_locked &&
		       (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
			if (l > linked->level) {
				btree_node_unlock(linked, l);
			} else {
				if (btree_node_intent_locked(linked, l)) {
					six_lock_downgrade(&linked->l[l].b->lock);
					linked->nodes_intent_locked ^= 1 << l;
				}
				break;
			}
		}
	}

	bch2_btree_trans_verify_locks(iter->trans);
}

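/*
 * Usage note (added): downgrade_to == 0 means "downgrade to the minimum this
 * iterator needs" - one intent lock (on the leaf) for BTREE_ITER_INTENT
 * iterators, no intent locks otherwise - which is exactly what the
 * "downgrade_to ?: ..." expression above computes.
 */
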
int bch2_btree_iter_unlock(struct btree_iter *iter)
{
	struct btree_iter *linked;

	trans_for_each_iter(iter->trans, linked)
		__bch2_btree_iter_unlock(linked);

	return btree_iter_err(iter);
}

bool bch2_btree_trans_relock(struct btree_trans *trans)
{
	struct btree_iter *iter;
	bool ret = true;

	trans_for_each_iter(trans, iter)
		ret &= bch2_btree_iter_relock(iter);

	return ret;
}

void bch2_btree_trans_unlock(struct btree_trans *trans)
{
	struct btree_iter *iter;

	trans_for_each_iter(trans, iter)
		__bch2_btree_iter_unlock(iter);
}

/* Btree transaction locking: */

/* Btree iterator: */

#ifdef CONFIG_BCACHEFS_DEBUG

static void __bch2_btree_iter_verify(struct btree_iter *iter,
				     struct btree *b)
{
	struct btree_iter_level *l = &iter->l[b->level];
	struct btree_node_iter tmp = l->iter;
	struct bkey_packed *k;

	if (!debug_check_iterators(iter->trans->c))
		return;

	if (iter->uptodate > BTREE_ITER_NEED_PEEK)
		return;

	bch2_btree_node_iter_verify(&l->iter, b);

	/*
	 * For interior nodes, the iterator will have skipped past
	 * deleted keys:
	 *
	 * For extents, the iterator may have skipped past deleted keys (but
	 * not whiteouts)
	 */
	k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
		? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
		: bch2_btree_node_iter_prev_all(&tmp, b);
	if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
		char buf[100];
		struct bkey uk = bkey_unpack_key(b, k);

		bch2_bkey_to_text(&PBUF(buf), &uk);
		panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
		      buf, iter->pos.inode, iter->pos.offset);
	}

	k = bch2_btree_node_iter_peek_all(&l->iter, b);
	if (k && btree_iter_pos_cmp(iter, b, k) < 0) {
		char buf[100];
		struct bkey uk = bkey_unpack_key(b, k);

		bch2_bkey_to_text(&PBUF(buf), &uk);
		panic("iter should be after current key:\n"
		      "iter pos %llu:%llu\n"
		      "cur key %s\n",
		      iter->pos.inode, iter->pos.offset, buf);
	}

	BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
	       (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS &&
	       !bkey_whiteout(&iter->k) &&
	       bch2_btree_node_iter_end(&l->iter));
}

void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;

	if (!debug_check_iterators(iter->trans->c))
		return;

	trans_for_each_iter_with_node(iter->trans, b, linked)
		__bch2_btree_iter_verify(linked, b);
}

#else

static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
					    struct btree *b) {}

#endif

static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
				       struct btree *b,
				       struct btree_node_iter *node_iter,
				       struct bset_tree *t,
				       struct bkey_packed *where,
				       unsigned clobber_u64s,
				       unsigned new_u64s)
{
	const struct bkey_packed *end = btree_bkey_last(b, t);
	struct btree_node_iter_set *set;
	unsigned offset = __btree_node_key_to_offset(b, where);
	int shift = new_u64s - clobber_u64s;
	unsigned old_end = t->end_offset - shift;

	btree_node_iter_for_each(node_iter, set)
		if (set->end == old_end)
			goto found;

	/* didn't find the bset in the iterator - might have to re-add it: */
	if (new_u64s &&
	    btree_iter_pos_cmp(iter, b, where) > 0) {
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

		bch2_btree_node_iter_push(node_iter, b, where, end);

		if (!b->level &&
		    node_iter == &iter->l[0].iter)
			bkey_disassemble(b,
				bch2_btree_node_iter_peek_all(node_iter, b),
				&iter->k);
	}
	return;
found:
	set->end = t->end_offset;

	/* Iterator hasn't gotten to the key that changed yet: */
	if (set->k < offset)
		return;

	if (new_u64s &&
	    btree_iter_pos_cmp(iter, b, where) > 0) {
		set->k = offset;
	} else if (set->k < offset + clobber_u64s) {
		set->k = offset + new_u64s;
		if (set->k == set->end)
			bch2_btree_node_iter_set_drop(node_iter, set);
	} else {
		set->k = (int) set->k + shift;
		goto iter_current_key_not_modified;
	}

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

	bch2_btree_node_iter_sort(node_iter, b);
	if (!b->level && node_iter == &iter->l[0].iter) {
		/*
		 * not legal to call bkey_debugcheck() here, because we're
		 * called midway through the update path after update has been
		 * marked but before deletes have actually happened:
		 */
#if 0
		__btree_iter_peek_all(iter, &iter->l[0], &iter->k);
#endif
		struct btree_iter_level *l = &iter->l[0];
		struct bkey_packed *k =
			bch2_btree_node_iter_peek_all(&l->iter, l->b);

		if (unlikely(!k))
			iter->k.type = KEY_TYPE_deleted;
		else
			bkey_disassemble(l->b, k, &iter->k);
	}
iter_current_key_not_modified:

	/*
	 * Interior nodes are special because iterators for interior nodes don't
	 * obey the usual invariants regarding the iterator position:
	 *
	 * We may have whiteouts that compare greater than the iterator
	 * position, and logically should be in the iterator, but that we
	 * skipped past to find the first live key greater than the iterator
	 * position. This becomes an issue when we insert a new key that is
	 * greater than the current iterator position, but smaller than the
	 * whiteouts we've already skipped past - this happens in the course of
	 * a btree split.
	 *
	 * We have to rewind the iterator to before those whiteouts here, else
	 * bkey_node_iter_prev() is not going to work and who knows what else
	 * would happen. And we have to do it manually, because here we've
	 * already done the insert and the iterator is currently inconsistent:
	 *
	 * We've got multiple competing invariants, here - we have to be careful
	 * about rewinding iterators for interior nodes, because they should
	 * always point to the key for the child node the btree iterator points
	 * to.
	 */
	if (b->level && new_u64s &&
	    btree_iter_pos_cmp(iter, b, where) > 0) {
		struct bset_tree *t, *where_set = bch2_bkey_to_bset_inlined(b, where);
		struct bkey_packed *k;

		for_each_bset(b, t) {
			if (where_set == t)
				continue;

			k = bch2_bkey_prev_all(b, t,
				bch2_btree_node_iter_bset_pos(node_iter, b, t));
			if (k &&
			    bkey_iter_cmp(b, k, where) > 0) {
				struct btree_node_iter_set *set;
				unsigned offset =
					__btree_node_key_to_offset(b, bkey_next(k));

				btree_node_iter_for_each(node_iter, set)
					if (set->k == offset) {
						set->k = __btree_node_key_to_offset(b, k);
						bch2_btree_node_iter_sort(node_iter, b);
						goto next_bset;
					}

				bch2_btree_node_iter_push(node_iter, b, k,
						btree_bkey_last(b, t));
			}
next_bset:
			t = t;	/* no-op so the label has a statement to bind to */
		}
	}
}

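/*
 * Worked example (added): if a key occupying 3 u64s is replaced in place by
 * one occupying 5, then clobber_u64s == 3, new_u64s == 5 and shift == +2;
 * every node iterator position in the same bset past the change must then be
 * shifted by +2, repointed at the new key, or dropped - the three cases of
 * the set->k adjustment above.
 */
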
void bch2_btree_node_iter_fix(struct btree_iter *iter,
			      struct btree *b,
			      struct btree_node_iter *node_iter,
			      struct bkey_packed *where,
			      unsigned clobber_u64s,
			      unsigned new_u64s)
{
	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
	struct btree_iter *linked;

	if (node_iter != &iter->l[b->level].iter)
		__bch2_btree_node_iter_fix(iter, b, node_iter, t,
					   where, clobber_u64s, new_u64s);

	trans_for_each_iter_with_node(iter->trans, b, linked)
		__bch2_btree_node_iter_fix(linked, b,
					   &linked->l[b->level].iter, t,
					   where, clobber_u64s, new_u64s);
}

static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
						  struct btree_iter_level *l,
						  struct bkey *u,
						  struct bkey_packed *k)
{
	struct bkey_s_c ret;

	if (unlikely(!k)) {
		/*
		 * signal to bch2_btree_iter_peek_slot() that we're currently at
		 * a hole
		 */
		u->type = KEY_TYPE_deleted;
		return bkey_s_c_null;
	}

	ret = bkey_disassemble(l->b, k, u);

	if (debug_check_bkeys(iter->trans->c))
		bch2_bkey_debugcheck(iter->trans->c, l->b, ret);

	return ret;
}

/* peek_all() doesn't skip deleted keys */
static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
						    struct btree_iter_level *l,
						    struct bkey *u)
{
	return __btree_iter_unpack(iter, l, u,
			bch2_btree_node_iter_peek_all(&l->iter, l->b));
}

static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
						struct btree_iter_level *l)
{
	return __btree_iter_unpack(iter, l, &iter->k,
			bch2_btree_node_iter_peek(&l->iter, l->b));
}

static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
					     struct btree_iter_level *l,
					     int max_advance)
{
	struct bkey_packed *k;
	int nr_advanced = 0;

	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
	       btree_iter_pos_cmp(iter, l->b, k) < 0) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return false;

		bch2_btree_node_iter_advance(&l->iter, l->b);
		nr_advanced++;
	}

	return true;
}

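/*
 * Note (added): max_advance < 0 means "advance as far as necessary"; a small
 * positive bound (bch2_btree_iter_set_pos() below uses 8) caps the walk so
 * the caller can fall back to reinitializing the node iterator when the
 * target position turns out to be far away.
 */
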
/*
 * Verify that iterator for parent node points to child node:
 */
static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter_level *l;
	unsigned plevel;
	bool parent_locked;
	struct bkey_packed *k;

	if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		return;

	plevel = b->level + 1;
	if (!btree_iter_node(iter, plevel))
		return;

	parent_locked = btree_node_locked(iter, plevel);

	if (!bch2_btree_node_relock(iter, plevel))
		return;

	l = &iter->l[plevel];
	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
	if (!k ||
	    bkey_deleted(k) ||
	    bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
		char buf[100];
		struct bkey uk = bkey_unpack_key(b, k);

		bch2_bkey_to_text(&PBUF(buf), &uk);
		panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
		      buf, b->key.k.p.inode, b->key.k.p.offset);
	}

	if (!parent_locked)
		btree_node_unlock(iter, b->level + 1);
}

static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
					     struct btree *b)
{
	return __btree_iter_pos_cmp(iter, NULL,
				    bkey_to_packed(&b->key), true) < 0;
}

static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
					  struct btree *b)
{
	return iter->btree_id == b->btree_id &&
		bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
		!btree_iter_pos_after_node(iter, b);
}

static inline void __btree_iter_init(struct btree_iter *iter,
				     unsigned level)
{
	struct btree_iter_level *l = &iter->l[level];

	bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos);

	if (iter->flags & BTREE_ITER_IS_EXTENTS)
		btree_iter_advance_to_pos(iter, l, -1);

	/* Skip to first non whiteout: */
	if (level)
		bch2_btree_node_iter_peek(&l->iter, l->b);

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}

static inline void btree_iter_node_set(struct btree_iter *iter,
				       struct btree *b)
{
	btree_iter_verify_new_node(iter, b);

	EBUG_ON(!btree_iter_pos_in_node(iter, b));
	EBUG_ON(b->lock.state.seq & 1);

	iter->l[b->level].lock_seq = b->lock.state.seq;
	iter->l[b->level].b = b;
	__btree_iter_init(iter, b->level);
}

/*
 * A btree node is being replaced - update the iterator to point to the new
 * node:
 */
void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
{
	enum btree_node_locked_type t;
	struct btree_iter *linked;

	trans_for_each_iter(iter->trans, linked)
		if (btree_iter_pos_in_node(linked, b)) {
			/*
			 * bch2_btree_iter_node_drop() has already been called -
			 * the old node we're replacing has already been
			 * unlocked and the pointer invalidated
			 */
			BUG_ON(btree_node_locked(linked, b->level));

			t = btree_lock_want(linked, b->level);
			if (t != BTREE_NODE_UNLOCKED) {
				six_lock_increment(&b->lock, (enum six_lock_type) t);
				mark_btree_node_locked(linked, b->level, (enum six_lock_type) t);
			}

			btree_iter_node_set(linked, b);
		}

	six_unlock_intent(&b->lock);
}

void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;
	unsigned level = b->level;

	trans_for_each_iter(iter->trans, linked)
		if (linked->l[level].b == b) {
			__btree_node_unlock(linked, level);
			linked->l[level].b = BTREE_ITER_NOT_END;
		}
}

/*
 * A btree node has been modified in such a way as to invalidate iterators - fix
 * them:
 */
void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;

	trans_for_each_iter_with_node(iter->trans, b, linked)
		__btree_iter_init(linked, b->level);
}

static inline int btree_iter_lock_root(struct btree_iter *iter,
				       unsigned depth_want)
{
	struct bch_fs *c = iter->trans->c;
	struct btree *b;
	enum six_lock_type lock_type;
	unsigned i;

	EBUG_ON(iter->nodes_locked);

	while (1) {
		b = READ_ONCE(c->btree_roots[iter->btree_id].b);
		iter->level = READ_ONCE(b->level);

		if (unlikely(iter->level < depth_want)) {
			/*
			 * the root is at a lower depth than the depth we want:
			 * got to the end of the btree, or we're walking nodes
			 * greater than some depth and there are no nodes >=
			 * that depth
			 */
			iter->level = depth_want;
			iter->l[iter->level].b = NULL;
			return 1;
		}

		lock_type = __btree_lock_want(iter, iter->level);
		if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
					      iter, lock_type, true)))
			return -EINTR;

		if (likely(b == c->btree_roots[iter->btree_id].b &&
			   b->level == iter->level &&
			   !race_fault())) {
			for (i = 0; i < iter->level; i++)
				iter->l[i].b = BTREE_ITER_NOT_END;
			iter->l[iter->level].b = b;

			mark_btree_node_locked(iter, iter->level, lock_type);
			btree_iter_node_set(iter, b);
			return 0;
		}

		six_unlock_type(&b->lock, lock_type);
	}
}

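/*
 * Note (added): the root pointer is re-read after locking because the root
 * may have been split or freed between the READ_ONCE() and the lock being
 * granted; on mismatch (or an injected race_fault()) we unlock and retry
 * rather than use a stale node.
 */
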
noinline
static void btree_iter_prefetch(struct btree_iter *iter)
{
	struct bch_fs *c = iter->trans->c;
	struct btree_iter_level *l = &iter->l[iter->level];
	struct btree_node_iter node_iter = l->iter;
	struct bkey_packed *k;
	BKEY_PADDED(k) tmp;
	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
		? (iter->level > 1 ? 0 : 2)
		: (iter->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(iter, iter->level);

	while (nr--) {
		if (!bch2_btree_node_relock(iter, iter->level))
			return;

		bch2_btree_node_iter_advance(&node_iter, l->b);
		k = bch2_btree_node_iter_peek(&node_iter, l->b);
		if (!k)
			break;

		bch2_bkey_unpack(l->b, &tmp.k, k);
		bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(iter, iter->level);
}

static inline int btree_iter_down(struct btree_iter *iter)
{
	struct bch_fs *c = iter->trans->c;
	struct btree_iter_level *l = &iter->l[iter->level];
	struct btree *b;
	unsigned level = iter->level - 1;
	enum six_lock_type lock_type = __btree_lock_want(iter, level);
	BKEY_PADDED(k) tmp;

	BUG_ON(!btree_node_locked(iter, iter->level));

	bch2_bkey_unpack(l->b, &tmp.k,
			 bch2_btree_node_iter_peek(&l->iter, l->b));

	b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, true);
	if (unlikely(IS_ERR(b)))
		return PTR_ERR(b);

	mark_btree_node_locked(iter, level, lock_type);
	btree_iter_node_set(iter, b);

	if (iter->flags & BTREE_ITER_PREFETCH)
		btree_iter_prefetch(iter);

	iter->level = level;

	return 0;
}

static void btree_iter_up(struct btree_iter *iter)
{
	btree_node_unlock(iter, iter->level++);
}

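/*
 * Interpretation (added, and somewhat speculative): the nr heuristic in
 * btree_iter_prefetch() reads as "once the filesystem is fully started,
 * prefetch at most 2 leaves and no interior nodes; before that (recovery,
 * journal replay) prefetch much more aggressively - 16 leaves, or 1 interior
 * node - since startup walks tend to be sequential".
 */
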
int __must_check __bch2_btree_iter_traverse(struct btree_iter *);

static int __btree_iter_traverse_all(struct btree_trans *trans,
				     struct btree_iter *iter, int ret)
{
	struct bch_fs *c = trans->c;
	u8 sorted[BTREE_ITER_MAX];
	unsigned i, nr_sorted = 0;

	trans_for_each_iter(trans, iter)
		sorted[nr_sorted++] = iter - trans->iters;

#define btree_iter_cmp_by_idx(_l, _r)				\
		btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])

	bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
#undef btree_iter_cmp_by_idx

retry_all:
	bch2_btree_trans_unlock(trans);

	if (unlikely(ret == -ENOMEM)) {
		struct closure cl;

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
			closure_sync(&cl);
		} while (ret);
	}

	if (unlikely(ret == -EIO)) {
		iter->flags |= BTREE_ITER_ERROR;
		iter->l[iter->level].b = BTREE_ITER_NOT_END;
		goto out;
	}

	BUG_ON(ret && ret != -EINTR);

	/* Now, redo traversals in correct order: */
	for (i = 0; i < nr_sorted; i++) {
		iter = &trans->iters[sorted[i]];

		do {
			ret = __bch2_btree_iter_traverse(iter);
		} while (ret == -EINTR);

		if (ret)
			goto retry_all;
	}

	ret = btree_trans_has_multiple_iters(trans) ? -EINTR : 0;
out:
	bch2_btree_cache_cannibalize_unlock(c);
	return ret;
}

int bch2_btree_iter_traverse_all(struct btree_trans *trans)
{
	return __btree_iter_traverse_all(trans, NULL, 0);
}

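/*
 * Note (added): sorting the iterators with btree_iter_cmp() before
 * re-traversing means all locks are reacquired in a single global (key)
 * order - the same ordering rule __bch2_btree_node_lock() enforces - so a
 * full restart cannot deadlock against its own iterators.
 */
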
static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
					   bool check_pos)
{
	unsigned l = iter->level;

	while (btree_iter_node(iter, l) &&
	       !(is_btree_node(iter, l) &&
		 bch2_btree_node_relock(iter, l) &&
		 (!check_pos ||
		  btree_iter_pos_in_node(iter, iter->l[l].b)))) {
		btree_node_unlock(iter, l);
		iter->l[l].b = BTREE_ITER_NOT_END;
		l++;
	}

	return l;
}

/*
 * This is the main state machine for walking down the btree - walks down to a
 * specified depth
 *
 * Returns 0 on success, -EIO on error (error reading in a btree node).
 *
 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
 * stashed in the iterator and returned from bch2_btree_iter_unlock().
 */
int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
	unsigned depth_want = iter->level;

	if (unlikely(iter->level >= BTREE_MAX_DEPTH))
		return 0;

	if (bch2_btree_iter_relock(iter))
		return 0;

	/*
	 * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
	 * here unnecessary
	 */
	iter->level = btree_iter_up_until_locked(iter, true);

	/*
	 * If we've got a btree node locked (i.e. we aren't about to relock the
	 * root) - advance its node iterator if necessary:
	 *
	 * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
	 */
	if (btree_iter_node(iter, iter->level))
		btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);

	/*
	 * Note: iter->nodes[iter->level] may be temporarily NULL here - that
	 * would indicate to other code that we got to the end of the btree;
	 * here it indicates that relocking the root failed - it's critical that
	 * btree_iter_lock_root() comes next and that it can't fail
	 */
	while (iter->level > depth_want) {
		int ret = btree_iter_node(iter, iter->level)
			? btree_iter_down(iter)
			: btree_iter_lock_root(iter, depth_want);
		if (unlikely(ret)) {
			if (ret == 1)
				return 0;

			iter->level = depth_want;
			iter->l[iter->level].b = BTREE_ITER_NOT_END;
			return ret;
		}
	}

	iter->uptodate = BTREE_ITER_NEED_PEEK;

	bch2_btree_trans_verify_locks(iter->trans);
	__bch2_btree_iter_verify(iter, iter->l[iter->level].b);
	return 0;
}

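/*
 * Note (added): traversal is a two-phase state machine - first walk *up*,
 * unlocking levels we can't relock or that no longer cover iter->pos, then
 * walk back *down* via btree_iter_down()/btree_iter_lock_root() until
 * depth_want is reached; iter->uptodate records how much of that work is
 * still valid and can be skipped next time.
 */
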
int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
{
	int ret;

	ret = __bch2_btree_iter_traverse(iter);
	if (unlikely(ret))
		ret = __btree_iter_traverse_all(iter->trans, iter, ret);

	BUG_ON(ret == -EINTR && !btree_trans_has_multiple_iters(iter->trans));

	return ret;
}

static inline void bch2_btree_iter_checks(struct btree_iter *iter,
					  enum btree_iter_type type)
{
	EBUG_ON(iter->btree_id >= BTREE_ID_NR);
	EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
		(iter->btree_id == BTREE_ID_EXTENTS &&
		 type != BTREE_ITER_NODES));

	bch2_btree_trans_verify_locks(iter->trans);
}

/* Iterate across nodes (leaf and interior nodes) */

struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
{
	struct btree *b;
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_NODES);

	if (iter->uptodate == BTREE_ITER_UPTODATE)
		return iter->l[iter->level].b;

	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		return NULL;

	b = btree_iter_node(iter, iter->level);
	if (!b)
		return NULL;

	BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);

	iter->pos = b->key.k.p;
	iter->uptodate = BTREE_ITER_UPTODATE;

	return b;
}

struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
{
	struct btree *b;
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_NODES);

	/* already got to end? */
	if (!btree_iter_node(iter, iter->level))
		return NULL;

	btree_iter_up(iter);

	if (!bch2_btree_node_relock(iter, iter->level))
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);

	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		return NULL;

	/* got to end? */
	b = btree_iter_node(iter, iter->level);
	if (!b)
		return NULL;

	if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
		/*
		 * Haven't gotten to the end of the parent node: go back down to
		 * the next child node
		 */

		/*
		 * We don't really want to be unlocking here except we can't
		 * directly tell btree_iter_traverse() "traverse to this level"
		 * except by setting iter->level, so we have to unlock so we
		 * don't screw up our lock invariants:
		 */
		if (btree_node_read_locked(iter, iter->level))
			btree_node_unlock(iter, iter->level);

		/* ick: */
		iter->pos = iter->btree_id == BTREE_ID_INODES
			? btree_type_successor(iter->btree_id, iter->pos)
			: bkey_successor(iter->pos);
		iter->level = depth;

		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		ret = bch2_btree_iter_traverse(iter);
		if (ret)
			return NULL;

		b = iter->l[iter->level].b;
	}

	iter->pos = b->key.k.p;
	iter->uptodate = BTREE_ITER_UPTODATE;

	return b;
}

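/*
 * Illustrative usage (added; process() is a stand-in for caller code):
 * walking every node of a btree at a given depth looks roughly like
 *
 *	for (b = bch2_btree_iter_peek_node(iter);
 *	     b;
 *	     b = bch2_btree_iter_next_node(iter, depth))
 *		process(b);
 *
 * with depth == 0 visiting leaves and larger depths visiting interior nodes.
 */
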
/* Iterate across keys (in leaf nodes only) */

void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
{
	struct btree_iter_level *l = &iter->l[0];

	EBUG_ON(iter->level != 0);
	EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
	EBUG_ON(!btree_node_locked(iter, 0));
	EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);

	iter->pos = new_pos;
	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

	btree_iter_advance_to_pos(iter, l, -1);

	if (bch2_btree_node_iter_end(&l->iter) &&
	    btree_iter_pos_after_node(iter, l->b))
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
}

void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
	int cmp = bkey_cmp(new_pos, iter->pos);
	unsigned level;

	if (!cmp)
		return;

	iter->pos = new_pos;

	level = btree_iter_up_until_locked(iter, true);

	if (btree_iter_node(iter, level)) {
		/*
		 * We might have to skip over many keys, or just a few: try
		 * advancing the node iterator, and if we have to skip over too
		 * many keys just reinit it (or if we're rewinding, since that
		 * is expensive).
		 */
		if (cmp < 0 ||
		    !btree_iter_advance_to_pos(iter, &iter->l[level], 8))
			__btree_iter_init(iter, level);

		/* Don't leave it locked if we're not supposed to: */
		if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED)
			btree_node_unlock(iter, level);
	}

	if (level != iter->level)
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
	else
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}

static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c ret = { .k = &iter->k };

	if (!bkey_deleted(&iter->k)) {
		EBUG_ON(bch2_btree_node_iter_end(&l->iter));
		ret.v = bkeyp_val(&l->b->format,
			__bch2_btree_node_iter_peek_all(&l->iter, l->b));
	}

	if (debug_check_bkeys(iter->trans->c) &&
	    !bkey_deleted(ret.k))
		bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
	return ret;
}

struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c k;
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);

	if (iter->uptodate == BTREE_ITER_UPTODATE)
		return btree_iter_peek_uptodate(iter);

	while (1) {
		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);

		k = __btree_iter_peek(iter, l);
		if (likely(k.k))
			break;

		/* got to the end of the leaf, iterator needs to be traversed: */
		iter->pos = l->b->key.k.p;
		iter->uptodate = BTREE_ITER_NEED_TRAVERSE;

		if (!bkey_cmp(iter->pos, POS_MAX))
			return bkey_s_c_null;

		iter->pos = btree_type_successor(iter->btree_id, iter->pos);
	}

	/*
	 * iter->pos should always be equal to the key we just
	 * returned - except extents can straddle iter->pos:
	 */
	if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
	    bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
		iter->pos = bkey_start_pos(k.k);

	iter->uptodate = BTREE_ITER_UPTODATE;
	return k;
}

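/*
 * Illustrative usage (added; use() is a stand-in for caller code): scanning
 * keys from iter->pos to some end position looks roughly like
 *
 *	for (k = bch2_btree_iter_peek(iter);
 *	     !IS_ERR_OR_NULL(k.k) && bkey_cmp(k.k->p, end) < 0;
 *	     k = bch2_btree_iter_next(iter))
 *		use(k);
 *
 * where k.k == NULL signals the end of the btree and bkey_s_c_err() wraps a
 * traversal error.
 */
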
static noinline
struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];

	iter->pos = l->b->key.k.p;
	iter->uptodate = BTREE_ITER_NEED_TRAVERSE;

	if (!bkey_cmp(iter->pos, POS_MAX))
		return bkey_s_c_null;

	iter->pos = btree_type_successor(iter->btree_id, iter->pos);

	return bch2_btree_iter_peek(iter);
}

struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_packed *p;
	struct bkey_s_c k;

	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);

	if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
		k = bch2_btree_iter_peek(iter);
		if (IS_ERR_OR_NULL(k.k))
			return k;
	}

	do {
		bch2_btree_node_iter_advance(&l->iter, l->b);
		p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
		if (unlikely(!p))
			return bch2_btree_iter_peek_next_leaf(iter);
	} while (bkey_whiteout(p));

	k = __btree_iter_unpack(iter, l, &iter->k, p);

	EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
	iter->pos = bkey_start_pos(k.k);
	return k;
}

struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_packed *p;
	struct bkey_s_c k;
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);

	if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
		k = bch2_btree_iter_peek(iter);
		if (IS_ERR(k.k))
			return k;
	}

	while (1) {
		p = bch2_btree_node_iter_prev(&l->iter, l->b);
		if (likely(p))
			break;

		iter->pos = l->b->data->min_key;
		if (!bkey_cmp(iter->pos, POS_MIN))
			return bkey_s_c_null;

		bch2_btree_iter_set_pos(iter,
			btree_type_predecessor(iter->btree_id, iter->pos));

		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);

		p = bch2_btree_node_iter_peek(&l->iter, l->b);
		if (p)
			break;
	}

	k = __btree_iter_unpack(iter, l, &iter->k, p);

	EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);

	iter->pos = bkey_start_pos(k.k);
	iter->uptodate = BTREE_ITER_UPTODATE;
	return k;
}

static inline struct bkey_s_c
__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct btree_node_iter node_iter;
	struct bkey_s_c k;
	struct bkey n;
	int ret;

recheck:
	while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
	       bkey_deleted(k.k) &&
	       bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
		bch2_btree_node_iter_advance(&l->iter, l->b);

	/*
	 * iterator is now at the correct position for inserting at iter->pos,
	 * but we need to keep iterating until we find the first non whiteout
	 * so we know how big a hole we have, if any:
	 */

	node_iter = l->iter;
	if (k.k && bkey_whiteout(k.k))
		k = __btree_iter_unpack(iter, l, &iter->k,
			bch2_btree_node_iter_peek(&node_iter, l->b));

	/*
	 * If we got to the end of the node, check if we need to traverse to the
	 * next node:
	 */
	if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);

		goto recheck;
	}

	if (k.k &&
	    !bkey_whiteout(k.k) &&
	    bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
		/*
		 * if we skipped forward to find the first non whiteout and
		 * there _wasn't_ actually a hole, we want the iterator to be
		 * pointed at the key we found:
		 */
		l->iter = node_iter;

		EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
		EBUG_ON(bkey_deleted(k.k));
		iter->uptodate = BTREE_ITER_UPTODATE;
		return k;
	}

	/* hole */

	/* holes can't span inode numbers: */
	if (iter->pos.offset == KEY_OFFSET_MAX) {
		if (iter->pos.inode == KEY_INODE_MAX)
			return bkey_s_c_null;

		iter->pos = bkey_successor(iter->pos);
		goto recheck;
	}

	if (!k.k)
		k.k = &l->b->key.k;

	bkey_init(&n);
	n.p = iter->pos;
	bch2_key_resize(&n,
			min_t(u64, KEY_SIZE_MAX,
			      (k.k->p.inode == n.p.inode
			       ? bkey_start_offset(k.k)
			       : KEY_OFFSET_MAX) -
			      n.p.offset));

	EBUG_ON(!n.size);

	iter->k = n;
	iter->uptodate = BTREE_ITER_UPTODATE;
	return (struct bkey_s_c) { &iter->k, NULL };
}

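/*
 * Worked example (added): with iter->pos == POS(1, 100) and the next live
 * extent in inode 1 starting at offset 164, the synthesized hole key above
 * covers offsets 100..164 of inode 1 (size 64) - peek_slot() thus tells the
 * caller exactly how much free space precedes the next extent.
 */
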
static inline struct bkey_s_c
__bch2_btree_iter_peek_slot(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c k;
	int ret;

	if (iter->flags & BTREE_ITER_IS_EXTENTS)
		return __bch2_btree_iter_peek_slot_extents(iter);

recheck:
	while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
	       bkey_deleted(k.k) &&
	       bkey_cmp(k.k->p, iter->pos) == 0)
		bch2_btree_node_iter_advance(&l->iter, l->b);

	/*
	 * If we got to the end of the node, check if we need to traverse to the
	 * next node:
	 */
	if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);

		goto recheck;
	}

	if (k.k &&
	    !bkey_deleted(k.k) &&
	    !bkey_cmp(iter->pos, k.k->p)) {
		iter->uptodate = BTREE_ITER_UPTODATE;
		return k;
	} else {
		/* hole */
		bkey_init(&iter->k);
		iter->k.p = iter->pos;

		iter->uptodate = BTREE_ITER_UPTODATE;
		return (struct bkey_s_c) { &iter->k, NULL };
	}
}

struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
{
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);

	if (iter->uptodate == BTREE_ITER_UPTODATE)
		return btree_iter_peek_uptodate(iter);

	ret = bch2_btree_iter_traverse(iter);
	if (unlikely(ret))
		return bkey_s_c_err(ret);

	return __bch2_btree_iter_peek_slot(iter);
}

struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
	bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);

	iter->pos = btree_type_successor(iter->btree_id, iter->k.p);

	if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
		/*
		 * XXX: when we just need to relock we should be able to avoid
		 * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
		 * for that to work
		 */
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);

		return bch2_btree_iter_peek_slot(iter);
	}

	if (!bkey_deleted(&iter->k))
		bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b);

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

	return __bch2_btree_iter_peek_slot(iter);
}

static inline void bch2_btree_iter_init(struct btree_trans *trans,
			struct btree_iter *iter, enum btree_id btree_id,
			struct bpos pos, unsigned flags)
{
	struct bch_fs *c = trans->c;
	unsigned i;

	if (btree_id == BTREE_ID_EXTENTS &&
	    !(flags & BTREE_ITER_NODES))
		flags |= BTREE_ITER_IS_EXTENTS;

	iter->trans = trans;
	iter->pos = pos;
	bkey_init(&iter->k);
	iter->k.p = pos;
	iter->flags = flags;
	iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
	iter->btree_id = btree_id;
	iter->level = 0;
	iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0;
	iter->nodes_locked = 0;
	iter->nodes_intent_locked = 0;
	for (i = 0; i < ARRAY_SIZE(iter->l); i++)
		iter->l[i].b = NULL;
	iter->l[iter->level].b = BTREE_ITER_NOT_END;

	prefetch(c->btree_roots[btree_id].b);
}

1c6fdbd8 KO |
1609 | /* new transactional stuff: */ |
1610 | ||
424eb881 KO |
1611 | int bch2_trans_iter_put(struct btree_trans *trans, |
1612 | struct btree_iter *iter) | |
08af47df | 1613 | { |
0f238367 | 1614 | int ret = btree_iter_err(iter); |
08af47df | 1615 | |
e1120a4c | 1616 | trans->iters_live &= ~(1ULL << iter->idx); |
424eb881 | 1617 | return ret; |
08af47df KO |
1618 | } |
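/*
 * Putting an iterator only clears its live bit - it stays linked, so a
 * later get_iter call in the same transaction can find and reuse the slot
 * (see __btree_trans_get_iter()); freeing unlinks it entirely.
 */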
1619 | ||
5df4be3f KO |
1620 | static inline void __bch2_trans_iter_free(struct btree_trans *trans, |
1621 | unsigned idx) | |
1622 | { | |
ecc892e4 | 1623 | __bch2_btree_iter_unlock(&trans->iters[idx]); |
5df4be3f KO |
1624 | trans->iters_linked &= ~(1ULL << idx); |
1625 | trans->iters_live &= ~(1ULL << idx); | |
1626 | trans->iters_touched &= ~(1ULL << idx); | |
1627 | trans->iters_unlink_on_restart &= ~(1ULL << idx); | |
1628 | trans->iters_unlink_on_commit &= ~(1ULL << idx); | |
5df4be3f KO |
1629 | } |
1630 | ||
424eb881 KO |
1631 | int bch2_trans_iter_free(struct btree_trans *trans, |
1632 | struct btree_iter *iter) | |
1c6fdbd8 | 1633 | { |
0f238367 | 1634 | int ret = btree_iter_err(iter); |
424eb881 | 1635 | |
e1120a4c | 1636 | __bch2_trans_iter_free(trans, iter->idx); |
424eb881 | 1637 | return ret; |
5df4be3f | 1638 | } |
1c6fdbd8 | 1639 | |
424eb881 KO |
1640 | int bch2_trans_iter_free_on_commit(struct btree_trans *trans, |
1641 | struct btree_iter *iter) | |
5df4be3f | 1642 | { |
0f238367 | 1643 | int ret = btree_iter_err(iter); |
424eb881 | 1644 | |
e1120a4c | 1645 | trans->iters_unlink_on_commit |= 1ULL << iter->idx; |
424eb881 | 1646 | return ret; |
1c6fdbd8 KO |
1647 | } |
1648 | ||
a8e00bd4 KO |
1649 | static int btree_trans_realloc_iters(struct btree_trans *trans, |
1650 | unsigned new_size) | |
1c6fdbd8 | 1651 | { |
a8e00bd4 | 1652 | void *new_iters, *new_updates; |
1c6fdbd8 | 1653 | |
a8e00bd4 KO |
1654 | BUG_ON(new_size > BTREE_ITER_MAX); |
1655 | ||
1656 | if (new_size <= trans->size) | |
1657 | return 0; | |
1658 | ||
1659 | BUG_ON(trans->used_mempool); | |
1660 | ||
1c6fdbd8 KO |
1661 | bch2_trans_unlock(trans); |
1662 | ||
a8e00bd4 KO |
1663 | new_iters = kmalloc(sizeof(struct btree_iter) * new_size + |
1664 | sizeof(struct btree_insert_entry) * (new_size + 4), | |
1665 | GFP_NOFS); | |
1666 | if (new_iters) | |
1667 | goto success; | |
1668 | ||
581edb63 | 1669 | new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); |
a8e00bd4 KO |
1670 | new_size = BTREE_ITER_MAX; |
1671 | ||
1672 | trans->used_mempool = true; | |
1673 | success: | |
1674 | new_updates = new_iters + sizeof(struct btree_iter) * new_size; | |
1c6fdbd8 KO |
1675 | |
1676 | memcpy(new_iters, trans->iters, | |
1677 | sizeof(struct btree_iter) * trans->nr_iters); | |
a8e00bd4 KO |
1678 | memcpy(new_updates, trans->updates, |
1679 | sizeof(struct btree_insert_entry) * trans->nr_updates); | |
1680 | ||
5df4be3f KO |
1681 | if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) |
1682 | memset(trans->iters, POISON_FREE, | |
1683 | sizeof(struct btree_iter) * trans->nr_iters + | |
1684 | sizeof(struct btree_insert_entry) * trans->nr_iters); | |
1685 | ||
a8e00bd4 KO |
1686 | if (trans->iters != trans->iters_onstack) |
1687 | kfree(trans->iters); | |
1688 | ||
1689 | trans->iters = new_iters; | |
1690 | trans->updates = new_updates; | |
1691 | trans->size = new_size; | |
1c6fdbd8 | 1692 | |
1c7a0adf KO |
1693 | if (trans->iters_live) { |
1694 | trans_restart(); | |
1695 | return -EINTR; | |
1696 | } | |
1697 | ||
1698 | return 0; | |
1c6fdbd8 KO |
1699 | } |
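/*
 * The -EINTR above is because any btree_iter pointers previously handed
 * out now point into the old, freed array: the transaction has to restart
 * so callers reacquire their iterators from the new one.
 */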
1700 | ||
581edb63 | 1701 | void bch2_trans_preload_iters(struct btree_trans *trans) |
1c6fdbd8 | 1702 | { |
a8e00bd4 | 1703 | btree_trans_realloc_iters(trans, BTREE_ITER_MAX); |
1c6fdbd8 KO |
1704 | } |
1705 | ||
7c26ecae KO |
1706 | static int btree_trans_iter_alloc(struct btree_trans *trans) |
1707 | { | |
7c26ecae KO |
1708 | unsigned idx = ffz(trans->iters_linked); |
1709 | ||
1710 | if (idx < trans->nr_iters) | |
1711 | goto got_slot; | |
1712 | ||
1713 | if (trans->nr_iters == trans->size) { | |
1714 | int ret = btree_trans_realloc_iters(trans, trans->size * 2); | |
1715 | if (ret) | |
1716 | return ret; | |
1717 | } | |
1718 | ||
1719 | idx = trans->nr_iters++; | |
1720 | BUG_ON(trans->nr_iters > trans->size); | |
e1120a4c KO |
1721 | |
1722 | trans->iters[idx].idx = idx; | |
7c26ecae | 1723 | got_slot: |
7c26ecae | 1724 | BUG_ON(trans->iters_linked & (1ULL << idx)); |
7c26ecae KO |
1725 | trans->iters_linked |= 1ULL << idx; |
1726 | return idx; | |
1727 | } | |
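/*
 * Slot allocation is just finding the first zero bit in iters_linked,
 * doubling the backing arrays (via btree_trans_realloc_iters()) once every
 * slot up to trans->size is in use.
 */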
1728 | ||
1c6fdbd8 | 1729 | static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, |
5df4be3f | 1730 | unsigned btree_id, struct bpos pos, |
1c6fdbd8 KO |
1731 | unsigned flags, u64 iter_id) |
1732 | { | |
1733 | struct btree_iter *iter; | |
1734 | int idx; | |
1735 | ||
1736 | BUG_ON(trans->nr_iters > BTREE_ITER_MAX); | |
1737 | ||
5df4be3f | 1738 | for (idx = 0; idx < trans->nr_iters; idx++) { |
7c26ecae KO |
1739 | if (!(trans->iters_linked & (1ULL << idx))) |
1740 | continue; | |
1741 | ||
5df4be3f KO |
1742 | iter = &trans->iters[idx]; |
1743 | if (iter_id | |
1744 | ? iter->id == iter_id | |
1745 | : (iter->btree_id == btree_id && | |
1746 | !bkey_cmp(iter->pos, pos))) | |
1c6fdbd8 | 1747 | goto found; |
5df4be3f | 1748 | } |
1c6fdbd8 KO |
1749 | idx = -1; |
1750 | found: | |
1751 | if (idx < 0) { | |
7c26ecae KO |
1752 | idx = btree_trans_iter_alloc(trans); |
1753 | if (idx < 0) | |
1754 | return ERR_PTR(idx); | |
1c6fdbd8 | 1755 | |
1c6fdbd8 | 1756 | iter = &trans->iters[idx]; |
a8e00bd4 | 1757 | iter->id = iter_id; |
1c6fdbd8 | 1758 | |
9e5e5b9e | 1759 | bch2_btree_iter_init(trans, iter, btree_id, pos, flags); |
1c6fdbd8 KO |
1760 | } else { |
1761 | iter = &trans->iters[idx]; | |
1762 | ||
1c6fdbd8 KO |
1763 | iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); |
1764 | iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); | |
1765 | } | |
1766 | ||
5df4be3f | 1767 | BUG_ON(iter->btree_id != btree_id); |
a8e00bd4 | 1768 | BUG_ON(trans->iters_live & (1ULL << idx)); |
5df4be3f KO |
1769 | trans->iters_live |= 1ULL << idx; |
1770 | trans->iters_touched |= 1ULL << idx; | |
1c6fdbd8 | 1771 | |
08af47df KO |
1773 | BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); | |
1774 | ||
1c6fdbd8 KO |
1775 | return iter; |
1776 | } | |
1777 | ||
1778 | struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, | |
1779 | enum btree_id btree_id, | |
1780 | struct bpos pos, unsigned flags, | |
1781 | u64 iter_id) | |
1782 | { | |
1783 | struct btree_iter *iter = | |
5df4be3f | 1784 | __btree_trans_get_iter(trans, btree_id, pos, flags, iter_id); |
1c6fdbd8 KO |
1785 | |
1786 | if (!IS_ERR(iter)) | |
1787 | bch2_btree_iter_set_pos(iter, pos); | |
1788 | return iter; | |
1789 | } | |
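/*
 * Sketch of typical usage inside a transaction (hypothetical caller; an
 * iter_id of 0 means "match by btree_id and position"):
 *
 *	struct btree_iter *iter;
 *
 *	iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, pos,
 *				     BTREE_ITER_INTENT, 0);
 *	if (IS_ERR(iter))
 *		return PTR_ERR(iter);
 *
 * After bch2_trans_iter_put(), a later call with the same btree_id and
 * position hands back the same slot instead of allocating a new one.
 */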
1790 | ||
424eb881 KO |
1791 | struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, |
1792 | enum btree_id btree_id, | |
1793 | struct bpos pos, | |
1794 | unsigned locks_want, | |
1795 | unsigned depth, | |
1796 | unsigned flags) | |
1797 | { | |
1798 | struct btree_iter *iter = | |
1799 | __btree_trans_get_iter(trans, btree_id, pos, | |
1800 | flags|BTREE_ITER_NODES, 0); | |
1801 | unsigned i; | |
1802 | ||
1803 | BUG_ON(IS_ERR(iter)); | |
1804 | BUG_ON(bkey_cmp(iter->pos, pos)); | |
1805 | ||
1806 | iter->locks_want = locks_want; | |
1807 | iter->level = depth; | |
1808 | ||
1809 | for (i = 0; i < ARRAY_SIZE(iter->l); i++) | |
1810 | iter->l[i].b = NULL; | |
1811 | iter->l[iter->level].b = BTREE_ITER_NOT_END; | |
1812 | ||
1813 | return iter; | |
1814 | } | |
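/*
 * Node iterators (BTREE_ITER_NODES) walk whole btree nodes rather than
 * keys; locks_want and depth let the caller start at an interior level,
 * with intent locks requested that far up.
 */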
1815 | ||
7c26ecae KO |
1816 | struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, |
1817 | struct btree_iter *src) | |
1c6fdbd8 | 1818 | { |
ecc892e4 | 1819 | struct btree_iter *iter; |
e1120a4c | 1820 | unsigned offset = offsetof(struct btree_iter, trans); |
ecc892e4 | 1821 | int i, idx; |
1c6fdbd8 | 1822 | |
7c26ecae KO |
1823 | idx = btree_trans_iter_alloc(trans); |
1824 | if (idx < 0) | |
1825 | return ERR_PTR(idx); | |
1826 | ||
1827 | trans->iters_live |= 1ULL << idx; | |
1828 | trans->iters_touched |= 1ULL << idx; | |
1829 | trans->iters_unlink_on_restart |= 1ULL << idx; | |
1830 | ||
ecc892e4 | 1831 | iter = &trans->iters[idx]; |
e1120a4c KO |
1832 | |
1833 | memcpy((void *) iter + offset, | |
1834 | (void *) src + offset, | |
1835 | sizeof(*iter) - offset); | |
ecc892e4 KO |
1836 | |
1837 | for (i = 0; i < BTREE_MAX_DEPTH; i++) | |
1838 | if (btree_node_locked(iter, i)) | |
1839 | six_lock_increment(&iter->l[i].b->lock, | |
1840 | __btree_lock_want(iter, i)); | |
7c26ecae KO |
1841 | |
1842 | return &trans->iters[idx]; | |
1c6fdbd8 KO |
1843 | } |
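/*
 * The six_lock_increment() loop is what makes the copy safe: source and
 * copy will each drop their node locks independently, so every lock src
 * holds needs its hold count bumped for the new iterator.
 */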
1844 | ||
1845 | void *bch2_trans_kmalloc(struct btree_trans *trans, | |
1846 | size_t size) | |
1847 | { | |
1848 | void *ret; | |
1849 | ||
1850 | if (trans->mem_top + size > trans->mem_bytes) { | |
1851 | size_t old_bytes = trans->mem_bytes; | |
1852 | size_t new_bytes = roundup_pow_of_two(trans->mem_top + size); | |
1853 | void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); | |
1854 | ||
1855 | if (!new_mem) | |
1856 | return ERR_PTR(-ENOMEM); | |
1857 | ||
1858 | trans->mem = new_mem; | |
1859 | trans->mem_bytes = new_bytes; | |
1860 | ||
1c7a0adf KO |
1861 | if (old_bytes) { |
1862 | trans_restart(); | |
1c6fdbd8 | 1863 | return ERR_PTR(-EINTR); |
1c7a0adf | 1864 | } |
1c6fdbd8 KO |
1865 | } |
1866 | ||
1867 | ret = trans->mem + trans->mem_top; | |
1868 | trans->mem_top += size; | |
1869 | return ret; | |
1870 | } | |
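/*
 * Example (hypothetical caller): allocating per-transaction scratch space,
 * valid until the transaction restarts or exits:
 *
 *	struct bkey_i *scratch = bch2_trans_kmalloc(trans, sizeof(*scratch));
 *
 *	if (IS_ERR(scratch))
 *		return PTR_ERR(scratch);
 *
 * Growing the pool uses krealloc(), so if anything was already handed out
 * (old_bytes != 0) previously returned pointers may be stale - hence the
 * -EINTR transaction restart above.
 */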
1871 | ||
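/*
 * Unlock every linked iterator, returning the first iterator error seen so
 * it isn't lost when the locks are dropped:
 */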
1872 | int bch2_trans_unlock(struct btree_trans *trans) | |
1873 | { | |
1874 | u64 iters = trans->iters_linked; | |
1875 | int ret = 0; | |
1876 | ||
1877 | while (iters) { | |
1878 | unsigned idx = __ffs64(iters); | |
1879 | struct btree_iter *iter = &trans->iters[idx]; | |
1880 | ||
0f238367 | 1881 | ret = ret ?: btree_iter_err(iter); |
1c6fdbd8 KO |
1882 | |
1883 | __bch2_btree_iter_unlock(iter); | |
1884 | iters ^= 1ULL << idx; | |
1885 | } | |
1886 | ||
1887 | return ret; | |
1888 | } | |
1889 | ||
5df4be3f KO |
1890 | inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) |
1891 | { | |
1892 | iters &= trans->iters_linked; | |
4afe7000 | 1893 | iters &= ~trans->iters_live; |
5df4be3f KO |
1894 | |
1895 | while (iters) { | |
1896 | unsigned idx = __ffs64(iters); | |
1897 | ||
1898 | iters &= ~(1ULL << idx); | |
1899 | __bch2_trans_iter_free(trans, idx); | |
1900 | } | |
1901 | } | |
1902 | ||
1c7a0adf | 1903 | void __bch2_trans_begin(struct btree_trans *trans) |
1c6fdbd8 | 1904 | { |
5df4be3f | 1905 | u64 iters_to_unlink; |
1c6fdbd8 | 1906 | |
1c6fdbd8 KO |
1907 | /* |
1908 | * On transaction restart, the transaction isn't required to allocate | |
1909 | * all the same iterators it did on the last iteration: | |
1910 | * | |
1911 | * Unlink any iterators it didn't use this iteration, assuming it got | |
1912 | * further (allocated an iter with a higher idx) than where the iter | |
1913 | * was originally allocated: | |
1914 | */ | |
5df4be3f KO |
1915 | iters_to_unlink = ~trans->iters_live & |
1916 | ((1ULL << fls64(trans->iters_live)) - 1); | |
a8e00bd4 | 1917 | |
5df4be3f KO |
1918 | iters_to_unlink |= trans->iters_unlink_on_restart; |
1919 | iters_to_unlink |= trans->iters_unlink_on_commit; | |
a8e00bd4 | 1920 | |
4afe7000 KO |
1921 | trans->iters_live = 0; |
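/* Clear live bits first: bch2_trans_unlink_iters() skips live iterators */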
1922 | ||
5df4be3f | 1923 | bch2_trans_unlink_iters(trans, iters_to_unlink); |
1c6fdbd8 | 1924 | |
5df4be3f KO |
1925 | trans->iters_touched = 0; |
1926 | trans->iters_unlink_on_restart = 0; | |
1927 | trans->iters_unlink_on_commit = 0; | |
1928 | trans->nr_updates = 0; | |
1929 | trans->mem_top = 0; | |
bf7b87a4 KO |
1930 | |
1931 | bch2_btree_iter_traverse_all(trans); | |
1c6fdbd8 KO |
1932 | } |
1933 | ||
1934 | void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c) | |
1935 | { | |
a8e00bd4 KO |
1936 | memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); |
1937 | ||
1c6fdbd8 | 1938 | trans->c = c; |
a8e00bd4 | 1939 | trans->size = ARRAY_SIZE(trans->iters_onstack); |
1c6fdbd8 | 1940 | trans->iters = trans->iters_onstack; |
a8e00bd4 | 1941 | trans->updates = trans->updates_onstack; |
1c6fdbd8 KO |
1942 | } |
1943 | ||
1944 | int bch2_trans_exit(struct btree_trans *trans) | |
1945 | { | |
1946 | int ret = bch2_trans_unlock(trans); | |
1947 | ||
1948 | kfree(trans->mem); | |
a8e00bd4 | 1949 | if (trans->used_mempool) |
581edb63 | 1950 | mempool_free(trans->iters, &trans->c->btree_iters_pool); |
a8e00bd4 KO |
1951 | else if (trans->iters != trans->iters_onstack) |
1952 | kfree(trans->iters); | |
1c6fdbd8 KO |
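/* poison these pointers so any use after exit fails fast: */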
1953 | trans->mem = (void *) 0x1; |
1954 | trans->iters = (void *) 0x1; | |
1955 | return ret; | |
1956 | } |
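/*
 * Putting it all together - the expected lifecycle, as a sketch (assuming
 * an operation that returns -EINTR when the transaction must restart;
 * "do_operation()" is a placeholder for the caller's get_iter/peek/update
 * sequence):
 *
 *	struct btree_trans trans;
 *	int ret;
 *
 *	bch2_trans_init(&trans, c);
 *	do {
 *		__bch2_trans_begin(&trans);
 *		ret = do_operation(&trans);
 *	} while (ret == -EINTR);
 *
 *	ret = bch2_trans_exit(&trans) ?: ret;
 */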