// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "extents.h"
#include "trace.h"

#include <linux/prefetch.h>

static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
						    struct btree_iter_level *,
						    struct bkey *);

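/*
 * Dummy non-NULL pointer stored in iter->l[level].b: distinguishes "this
 * level's lock was dropped and the node must be re-traversed" from NULL,
 * which means we got to the end of the btree:
 */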
#define BTREE_ITER_NOT_END	((struct btree *) 1)

static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
{
	return l < BTREE_MAX_DEPTH &&
		iter->l[l].b &&
		iter->l[l].b != BTREE_ITER_NOT_END;
}

/* Returns < 0 if @k is before iter pos, > 0 if @k is after */
static inline int __btree_iter_pos_cmp(struct btree_iter *iter,
				       const struct btree *b,
				       const struct bkey_packed *k,
				       bool interior_node)
{
	int cmp = bkey_cmp_left_packed(b, k, &iter->pos);

	if (cmp)
		return cmp;
	if (bkey_deleted(k))
		return -1;

	/*
	 * Normally, for extents we want the first key strictly greater than
	 * the iterator position - with the exception that for interior nodes,
	 * we don't want to advance past the last key if the iterator position
	 * is POS_MAX:
	 */
	if (iter->flags & BTREE_ITER_IS_EXTENTS &&
	    (!interior_node ||
	     bkey_cmp_left_packed_byval(b, k, POS_MAX)))
		return -1;
	return 1;
}

static inline int btree_iter_pos_cmp(struct btree_iter *iter,
				     const struct btree *b,
				     const struct bkey_packed *k)
{
	return __btree_iter_pos_cmp(iter, b, k, b->level != 0);
}

/* Btree node locking: */

/*
 * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
 * succeed:
 */
void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
{
	struct btree_iter *linked;

	EBUG_ON(iter->l[b->level].b != b);
	EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq);

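	/*
	 * The six lock's sequence number is odd while write locked, even
	 * otherwise; six_unlock_write() below bumps it again, so advancing
	 * each linked iterator's saved copy by 2 gives the value the lock
	 * will have once it's released:
	 */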
	trans_for_each_iter_with_node(iter->trans, b, linked)
		linked->l[b->level].lock_seq += 2;

	six_unlock_write(&b->lock);
}

void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
{
	struct btree_iter *linked;
	unsigned readers = 0;

	EBUG_ON(btree_node_read_locked(iter, b->level));

	trans_for_each_iter(iter->trans, linked)
		if (linked->l[b->level].b == b &&
		    btree_node_read_locked(linked, b->level))
			readers++;

	/*
	 * Must drop our read locks before calling six_lock_write() -
	 * six_unlock() won't do wakeups until the reader count
	 * goes to 0, and it's safe because we have the node intent
	 * locked:
	 */
	atomic64_sub(__SIX_VAL(read_lock, readers),
		     &b->lock.state.counter);
	btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
	atomic64_add(__SIX_VAL(read_lock, readers),
		     &b->lock.state.counter);
}

bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{
	struct btree *b = btree_iter_node(iter, level);
	int want = __btree_lock_want(iter, level);

	if (!b || b == BTREE_ITER_NOT_END)
		return false;

	if (race_fault())
		return false;

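	/*
	 * If six_relock_type() fails we may still be able to take the lock,
	 * if another iterator in this transaction already holds it: the
	 * seq >> 1 comparison ignores the sequence number's low bit (the
	 * write-locked flag), checking only that the node hasn't been
	 * written to since we saved our copy:
	 */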
	if (!six_relock_type(&b->lock, want, iter->l[level].lock_seq) &&
	    !(iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 &&
	      btree_node_lock_increment(iter, b, level, want)))
		return false;

	mark_btree_node_locked(iter, level, want);
	return true;
}

static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
{
	struct btree *b = iter->l[level].b;

	EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);

	if (!is_btree_node(iter, level))
		return false;

	if (btree_node_intent_locked(iter, level))
		return true;

	if (race_fault())
		return false;

	if (btree_node_locked(iter, level)
	    ? six_lock_tryupgrade(&b->lock)
	    : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq))
		goto success;

	if (iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 &&
	    btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
		btree_node_unlock(iter, level);
		goto success;
	}

	return false;
success:
	mark_btree_node_intent_locked(iter, level);
	return true;
}

static inline bool btree_iter_get_locks(struct btree_iter *iter,
					bool upgrade)
{
	unsigned l = iter->level;
	int fail_idx = -1;

	do {
		if (!btree_iter_node(iter, l))
			break;

		if (!(upgrade
		      ? bch2_btree_node_upgrade(iter, l)
		      : bch2_btree_node_relock(iter, l))) {
			fail_idx = l;
			btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		}

		l++;
	} while (l < iter->locks_want);

	/*
	 * When we fail to get a lock, we have to ensure that any child nodes
	 * can't be relocked so bch2_btree_iter_traverse has to walk back up to
	 * the node that we failed to relock:
	 */
	while (fail_idx >= 0) {
		btree_node_unlock(iter, fail_idx);
		iter->l[fail_idx].b = BTREE_ITER_NOT_END;
		--fail_idx;
	}

	if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
		iter->uptodate = BTREE_ITER_NEED_PEEK;

	bch2_btree_trans_verify_locks(iter->trans);

	return iter->uptodate < BTREE_ITER_NEED_RELOCK;
}

/* Slowpath: */
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
			    unsigned level,
			    struct btree_iter *iter,
			    enum six_lock_type type,
			    bool may_drop_locks)
{
	struct btree_iter *linked;
	bool ret = true;

	/* Check if it's safe to block: */
	trans_for_each_iter(iter->trans, linked) {
		if (!linked->nodes_locked)
			continue;

		/* Must lock btree nodes in key order: */
		if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
			ret = false;

		/*
		 * Can't block taking an intent lock if we have _any_ nodes read
		 * locked:
		 *
		 * - Our read lock blocks another thread with an intent lock on
		 *   the same node from getting a write lock, and thus from
		 *   dropping its intent lock
		 *
		 * - And the other thread may have multiple nodes intent locked:
		 *   both the node we want to intent lock, and the node we
		 *   already have read locked - deadlock:
		 */
		if (type == SIX_LOCK_intent &&
		    linked->nodes_locked != linked->nodes_intent_locked) {
			if (may_drop_locks) {
				linked->locks_want = max_t(unsigned,
						linked->locks_want,
						__fls(linked->nodes_locked) + 1);
				btree_iter_get_locks(linked, true);
			}
			ret = false;
		}

		/*
		 * Interior nodes must be locked before their descendants: if
		 * another iterator has possible descendants locked of the node
		 * we're about to lock, it must have the ancestors locked too:
		 */
		if (linked->btree_id == iter->btree_id &&
		    level > __fls(linked->nodes_locked)) {
			if (may_drop_locks) {
				linked->locks_want =
					max(level + 1, max_t(unsigned,
					    linked->locks_want,
					    iter->locks_want));
				btree_iter_get_locks(linked, true);
			}
			ret = false;
		}
	}

	if (ret)
		__btree_node_lock_type(iter->trans->c, b, type);
	else
		trans_restart();

	return ret;
}

/* Btree iterator locking: */

#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_iter_verify_locks(struct btree_iter *iter)
{
	unsigned l;

	BUG_ON((iter->flags & BTREE_ITER_NOUNLOCK) &&
	       !btree_node_locked(iter, 0));

	for (l = 0; btree_iter_node(iter, l); l++) {
		if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
		    !btree_node_locked(iter, l))
			continue;

		BUG_ON(btree_lock_want(iter, l) !=
		       btree_node_locked_type(iter, l));
	}
}

void bch2_btree_trans_verify_locks(struct btree_trans *trans)
{
	struct btree_iter *iter;

	trans_for_each_iter(trans, iter)
		bch2_btree_iter_verify_locks(iter);
}
#endif

__flatten
static bool bch2_btree_iter_relock(struct btree_iter *iter)
{
	return iter->uptodate >= BTREE_ITER_NEED_RELOCK
		? btree_iter_get_locks(iter, false)
		: true;
}

bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
			       unsigned new_locks_want)
{
	struct btree_iter *linked;

	EBUG_ON(iter->locks_want >= new_locks_want);

	iter->locks_want = new_locks_want;

	if (btree_iter_get_locks(iter, true))
		return true;

	/*
	 * Ancestor nodes must be locked before child nodes, so set locks_want
	 * on iterators that might lock ancestors before us to avoid getting
	 * -EINTR later:
	 */
	trans_for_each_iter(iter->trans, linked)
		if (linked != iter &&
		    linked->btree_id == iter->btree_id &&
		    btree_iter_cmp(linked, iter) <= 0 &&
		    linked->locks_want < new_locks_want) {
			linked->locks_want = new_locks_want;
			btree_iter_get_locks(linked, true);
		}

	return false;
}

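/*
 * Upgrade-only variant for callers that can't afford to drop any locks they
 * already hold: on failure, rather than unlocking, locks_want is clamped
 * back down to the levels that were successfully upgraded.
 */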
bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
					unsigned new_locks_want)
{
	unsigned l = iter->level;

	EBUG_ON(iter->locks_want >= new_locks_want);

	iter->locks_want = new_locks_want;

	do {
		if (!btree_iter_node(iter, l))
			break;

		if (!bch2_btree_node_upgrade(iter, l)) {
			iter->locks_want = l;
			return false;
		}

		l++;
	} while (l < iter->locks_want);

	return true;
}

void __bch2_btree_iter_downgrade(struct btree_iter *iter,
				 unsigned downgrade_to)
{
	struct btree_iter *linked;
	unsigned l;

	/*
	 * We downgrade linked iterators as well because btree_iter_upgrade
	 * might have had to modify locks_want on linked iterators due to lock
	 * ordering:
	 */
	trans_for_each_iter(iter->trans, linked) {
		unsigned new_locks_want = downgrade_to ?:
			(linked->flags & BTREE_ITER_INTENT ? 1 : 0);

		if (linked->locks_want <= new_locks_want)
			continue;

		linked->locks_want = new_locks_want;

		while (linked->nodes_locked &&
		       (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
			if (l > linked->level) {
				btree_node_unlock(linked, l);
			} else {
				if (btree_node_intent_locked(linked, l)) {
					six_lock_downgrade(&linked->l[l].b->lock);
					linked->nodes_intent_locked ^= 1 << l;
				}
				break;
			}
		}
	}

	bch2_btree_trans_verify_locks(iter->trans);
}

int bch2_btree_iter_unlock(struct btree_iter *iter)
{
	struct btree_iter *linked;

	trans_for_each_iter(iter->trans, linked)
		__bch2_btree_iter_unlock(linked);

	return btree_iter_err(iter);
}

bool bch2_btree_trans_relock(struct btree_trans *trans)
{
	struct btree_iter *iter;
	bool ret = true;

	trans_for_each_iter(trans, iter)
		ret &= bch2_btree_iter_relock(iter);

	return ret;
}

void bch2_btree_trans_unlock(struct btree_trans *trans)
{
	struct btree_iter *iter;

	trans_for_each_iter(trans, iter)
		__bch2_btree_iter_unlock(iter);
}

/* Btree transaction locking: */

/* Btree iterator: */

#ifdef CONFIG_BCACHEFS_DEBUG

static void __bch2_btree_iter_verify(struct btree_iter *iter,
				     struct btree *b)
{
	struct btree_iter_level *l = &iter->l[b->level];
	struct btree_node_iter tmp = l->iter;
	struct bkey_packed *k;

	if (iter->uptodate > BTREE_ITER_NEED_PEEK)
		return;

	bch2_btree_node_iter_verify(&l->iter, b);

	/*
	 * For interior nodes, the iterator will have skipped past
	 * deleted keys:
	 *
	 * For extents, the iterator may have skipped past deleted keys (but not
	 * whiteouts)
	 */
	k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
		? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
		: bch2_btree_node_iter_prev_all(&tmp, b);
	if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
		char buf[100];
		struct bkey uk = bkey_unpack_key(b, k);

		bch2_bkey_to_text(&PBUF(buf), &uk);
		panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
		      buf, iter->pos.inode, iter->pos.offset);
	}

	k = bch2_btree_node_iter_peek_all(&l->iter, b);
	if (k && btree_iter_pos_cmp(iter, b, k) < 0) {
		char buf[100];
		struct bkey uk = bkey_unpack_key(b, k);

		bch2_bkey_to_text(&PBUF(buf), &uk);
		panic("iter should be after current key:\n"
		      "iter pos %llu:%llu\n"
		      "cur key %s\n",
		      iter->pos.inode, iter->pos.offset, buf);
	}

	BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
	       (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS &&
	       !bkey_whiteout(&iter->k) &&
	       bch2_btree_node_iter_end(&l->iter));
}

void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;

	trans_for_each_iter_with_node(iter->trans, b, linked)
		__bch2_btree_iter_verify(linked, b);
}

#else

static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
					    struct btree *b) {}

#endif

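/*
 * A key was inserted (or removed) at @where in bset @t of @b, replacing
 * @clobber_u64s worth of key with @new_u64s: fix up @node_iter's offsets
 * into the node - and, if this is the iterator's current leaf, iter->k -
 * to match.
 */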
static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
				       struct btree *b,
				       struct btree_node_iter *node_iter,
				       struct bset_tree *t,
				       struct bkey_packed *where,
				       unsigned clobber_u64s,
				       unsigned new_u64s)
{
	const struct bkey_packed *end = btree_bkey_last(b, t);
	struct btree_node_iter_set *set;
	unsigned offset = __btree_node_key_to_offset(b, where);
	int shift = new_u64s - clobber_u64s;
	unsigned old_end = t->end_offset - shift;

	btree_node_iter_for_each(node_iter, set)
		if (set->end == old_end)
			goto found;

	/* didn't find the bset in the iterator - might have to re-add it: */
	if (new_u64s &&
	    btree_iter_pos_cmp(iter, b, where) > 0) {
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

		bch2_btree_node_iter_push(node_iter, b, where, end);

		if (!b->level &&
		    node_iter == &iter->l[0].iter)
			bkey_disassemble(b,
				bch2_btree_node_iter_peek_all(node_iter, b),
				&iter->k);
	}
	return;
found:
	set->end = t->end_offset;

	/* Iterator hasn't gotten to the key that changed yet: */
	if (set->k < offset)
		return;

	if (new_u64s &&
	    btree_iter_pos_cmp(iter, b, where) > 0) {
		set->k = offset;
	} else if (set->k < offset + clobber_u64s) {
		set->k = offset + new_u64s;
		if (set->k == set->end)
			bch2_btree_node_iter_set_drop(node_iter, set);
	} else {
		set->k = (int) set->k + shift;
		goto iter_current_key_not_modified;
	}

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

	bch2_btree_node_iter_sort(node_iter, b);
	if (!b->level && node_iter == &iter->l[0].iter) {
		/*
		 * not legal to call bkey_debugcheck() here, because we're
		 * called midway through the update path after update has been
		 * marked but before deletes have actually happened:
		 */
#if 0
		__btree_iter_peek_all(iter, &iter->l[0], &iter->k);
#endif
		struct btree_iter_level *l = &iter->l[0];
		struct bkey_packed *k =
			bch2_btree_node_iter_peek_all(&l->iter, l->b);

		if (unlikely(!k))
			iter->k.type = KEY_TYPE_deleted;
		else
			bkey_disassemble(l->b, k, &iter->k);
	}
iter_current_key_not_modified:

	/*
	 * Interior nodes are special because iterators for interior nodes don't
	 * obey the usual invariants regarding the iterator position:
	 *
	 * We may have whiteouts that compare greater than the iterator
	 * position, and logically should be in the iterator, but that we
	 * skipped past to find the first live key greater than the iterator
	 * position. This becomes an issue when we insert a new key that is
	 * greater than the current iterator position, but smaller than the
	 * whiteouts we've already skipped past - this happens in the course of
	 * a btree split.
	 *
	 * We have to rewind the iterator to before those whiteouts here, else
	 * bkey_node_iter_prev() is not going to work and who knows what else
	 * would happen. And we have to do it manually, because here we've
	 * already done the insert and the iterator is currently inconsistent:
	 *
	 * We've got multiple competing invariants, here - we have to be careful
	 * about rewinding iterators for interior nodes, because they should
	 * always point to the key for the child node the btree iterator points
	 * to.
	 */
	if (b->level && new_u64s &&
	    btree_iter_pos_cmp(iter, b, where) > 0) {
		struct bset_tree *t, *where_set = bch2_bkey_to_bset_inlined(b, where);
		struct bkey_packed *k;

		for_each_bset(b, t) {
			if (where_set == t)
				continue;

			k = bch2_bkey_prev_all(b, t,
				bch2_btree_node_iter_bset_pos(node_iter, b, t));
			if (k &&
			    bkey_iter_cmp(b, k, where) > 0) {
				struct btree_node_iter_set *set;
				unsigned offset =
					__btree_node_key_to_offset(b, bkey_next(k));

				btree_node_iter_for_each(node_iter, set)
					if (set->k == offset) {
						set->k = __btree_node_key_to_offset(b, k);
						bch2_btree_node_iter_sort(node_iter, b);
						goto next_bset;
					}

				bch2_btree_node_iter_push(node_iter, b, k,
						btree_bkey_last(b, t));
			}
next_bset:
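			/* no-op: a label must be followed by a statement */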
			t = t;
		}
	}
}

void bch2_btree_node_iter_fix(struct btree_iter *iter,
			      struct btree *b,
			      struct btree_node_iter *node_iter,
			      struct bkey_packed *where,
			      unsigned clobber_u64s,
			      unsigned new_u64s)
{
	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
	struct btree_iter *linked;

	if (node_iter != &iter->l[b->level].iter)
		__bch2_btree_node_iter_fix(iter, b, node_iter, t,
					   where, clobber_u64s, new_u64s);

	trans_for_each_iter_with_node(iter->trans, b, linked)
		__bch2_btree_node_iter_fix(linked, b,
					   &linked->l[b->level].iter, t,
					   where, clobber_u64s, new_u64s);
}

static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
						  struct btree_iter_level *l,
						  struct bkey *u,
						  struct bkey_packed *k)
{
	struct bkey_s_c ret;

	if (unlikely(!k)) {
		/*
		 * signal to bch2_btree_iter_peek_slot() that we're currently at
		 * a hole
		 */
		u->type = KEY_TYPE_deleted;
		return bkey_s_c_null;
	}

	ret = bkey_disassemble(l->b, k, u);

	if (debug_check_bkeys(iter->trans->c))
		bch2_bkey_debugcheck(iter->trans->c, l->b, ret);

	return ret;
}

/* peek_all() doesn't skip deleted keys */
static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
						    struct btree_iter_level *l,
						    struct bkey *u)
{
	return __btree_iter_unpack(iter, l, u,
			bch2_btree_node_iter_peek_all(&l->iter, l->b));
}

static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
						struct btree_iter_level *l)
{
	return __btree_iter_unpack(iter, l, &iter->k,
			bch2_btree_node_iter_peek(&l->iter, l->b));
}

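/*
 * Advance the node iterator to the first key at or after the iterator
 * position; returns false if that would take more than @max_advance steps
 * (pass a negative @max_advance for no limit):
 */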
static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
					     struct btree_iter_level *l,
					     int max_advance)
{
	struct bkey_packed *k;
	int nr_advanced = 0;

	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
	       btree_iter_pos_cmp(iter, l->b, k) < 0) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return false;

		bch2_btree_node_iter_advance(&l->iter, l->b);
		nr_advanced++;
	}

	return true;
}

/*
 * Verify that iterator for parent node points to child node:
 */
static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter_level *l;
	unsigned plevel;
	bool parent_locked;
	struct bkey_packed *k;

	if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		return;

	plevel = b->level + 1;
	if (!btree_iter_node(iter, plevel))
		return;

	parent_locked = btree_node_locked(iter, plevel);

	if (!bch2_btree_node_relock(iter, plevel))
		return;

	l = &iter->l[plevel];
	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
	if (!k ||
	    bkey_deleted(k) ||
	    bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
		char buf[100];
		struct bkey uk = bkey_unpack_key(b, k);

		bch2_bkey_to_text(&PBUF(buf), &uk);
		panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
		      buf, b->key.k.p.inode, b->key.k.p.offset);
	}

	if (!parent_locked)
		btree_node_unlock(iter, b->level + 1);
}

static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
					     struct btree *b)
{
	return __btree_iter_pos_cmp(iter, NULL,
			bkey_to_packed(&b->key), true) < 0;
}

static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
					  struct btree *b)
{
	return iter->btree_id == b->btree_id &&
		bkey_cmp(iter->pos, b->data->min_key) >= 0 &&
		!btree_iter_pos_after_node(iter, b);
}

static inline void __btree_iter_init(struct btree_iter *iter,
				     unsigned level)
{
	struct btree_iter_level *l = &iter->l[level];

	bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos);

	if (iter->flags & BTREE_ITER_IS_EXTENTS)
		btree_iter_advance_to_pos(iter, l, -1);

	/* Skip to first non whiteout: */
	if (level)
		bch2_btree_node_iter_peek(&l->iter, l->b);

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}

static inline void btree_iter_node_set(struct btree_iter *iter,
				       struct btree *b)
{
	btree_iter_verify_new_node(iter, b);

	EBUG_ON(!btree_iter_pos_in_node(iter, b));
	EBUG_ON(b->lock.state.seq & 1);

	iter->l[b->level].lock_seq = b->lock.state.seq;
	iter->l[b->level].b = b;
	__btree_iter_init(iter, b->level);
}

/*
 * A btree node is being replaced - update the iterator to point to the new
 * node:
 */
void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
{
	enum btree_node_locked_type t;
	struct btree_iter *linked;

	trans_for_each_iter(iter->trans, linked)
		if (btree_iter_pos_in_node(linked, b)) {
			/*
			 * bch2_btree_iter_node_drop() has already been called -
			 * the old node we're replacing has already been
			 * unlocked and the pointer invalidated
			 */
			BUG_ON(btree_node_locked(linked, b->level));

			t = btree_lock_want(linked, b->level);
			if (t != BTREE_NODE_UNLOCKED) {
				six_lock_increment(&b->lock, (enum six_lock_type) t);
				mark_btree_node_locked(linked, b->level, (enum six_lock_type) t);
			}

			btree_iter_node_set(linked, b);
		}

	six_unlock_intent(&b->lock);
}

void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;
	unsigned level = b->level;

	/* caller now responsible for unlocking @b */

	BUG_ON(iter->l[level].b != b);
	BUG_ON(!btree_node_intent_locked(iter, level));

	iter->l[level].b = BTREE_ITER_NOT_END;
	mark_btree_node_unlocked(iter, level);

	trans_for_each_iter(iter->trans, linked)
		if (linked->l[level].b == b) {
			__btree_node_unlock(linked, level);
			linked->l[level].b = BTREE_ITER_NOT_END;
		}
}

/*
 * A btree node has been modified in such a way as to invalidate iterators - fix
 * them:
 */
void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;

	trans_for_each_iter_with_node(iter->trans, b, linked)
		__btree_iter_init(linked, b->level);
}

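/*
 * Lock the btree root, retrying if it was split or freed before we got the
 * lock (hence rechecking under the lock). Returns 0 on success, 1 if the
 * root is shallower than the depth we wanted, -EINTR if blocking on the
 * lock would have deadlocked:
 */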
static inline int btree_iter_lock_root(struct btree_iter *iter,
				       unsigned depth_want)
{
	struct bch_fs *c = iter->trans->c;
	struct btree *b;
	enum six_lock_type lock_type;
	unsigned i;

	EBUG_ON(iter->nodes_locked);

	while (1) {
		b = READ_ONCE(c->btree_roots[iter->btree_id].b);
		iter->level = READ_ONCE(b->level);

		if (unlikely(iter->level < depth_want)) {
			/*
			 * the root is at a lower depth than the depth we want:
			 * got to the end of the btree, or we're walking nodes
			 * greater than some depth and there are no nodes >=
			 * that depth
			 */
			iter->level = depth_want;
			iter->l[iter->level].b = NULL;
			return 1;
		}

		lock_type = __btree_lock_want(iter, iter->level);
		if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
					      iter, lock_type, true)))
			return -EINTR;

		if (likely(b == c->btree_roots[iter->btree_id].b &&
			   b->level == iter->level &&
			   !race_fault())) {
			for (i = 0; i < iter->level; i++)
				iter->l[i].b = BTREE_ITER_NOT_END;
			iter->l[iter->level].b = b;

			mark_btree_node_locked(iter, iter->level, lock_type);
			btree_iter_node_set(iter, b);
			return 0;
		}

		six_unlock_type(&b->lock, lock_type);
	}
}

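/*
 * Prefetch the next few child nodes at the level we're descending to; done
 * more aggressively before the filesystem is fully started (BCH_FS_STARTED
 * unset), presumably because the traversals done during recovery are more
 * sequential:
 */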
noinline
static void btree_iter_prefetch(struct btree_iter *iter)
{
	struct bch_fs *c = iter->trans->c;
	struct btree_iter_level *l = &iter->l[iter->level];
	struct btree_node_iter node_iter = l->iter;
	struct bkey_packed *k;
	BKEY_PADDED(k) tmp;
	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
		? (iter->level > 1 ? 0 :  2)
		: (iter->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(iter, iter->level);

	while (nr--) {
		if (!bch2_btree_node_relock(iter, iter->level))
			return;

		bch2_btree_node_iter_advance(&node_iter, l->b);
		k = bch2_btree_node_iter_peek(&node_iter, l->b);
		if (!k)
			break;

		bch2_bkey_unpack(l->b, &tmp.k, k);
		bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(iter, iter->level);
}

static inline int btree_iter_down(struct btree_iter *iter)
{
	struct bch_fs *c = iter->trans->c;
	struct btree_iter_level *l = &iter->l[iter->level];
	struct btree *b;
	unsigned level = iter->level - 1;
	enum six_lock_type lock_type = __btree_lock_want(iter, level);
	BKEY_PADDED(k) tmp;

	BUG_ON(!btree_node_locked(iter, iter->level));

	bch2_bkey_unpack(l->b, &tmp.k,
			 bch2_btree_node_iter_peek(&l->iter, l->b));

	b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, true);
	if (unlikely(IS_ERR(b)))
		return PTR_ERR(b);

	mark_btree_node_locked(iter, level, lock_type);
	btree_iter_node_set(iter, b);

	if (iter->flags & BTREE_ITER_PREFETCH)
		btree_iter_prefetch(iter);

	iter->level = level;

	return 0;
}

static void btree_iter_up(struct btree_iter *iter)
{
	btree_node_unlock(iter, iter->level++);
}

int __must_check __bch2_btree_iter_traverse(struct btree_iter *);

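/*
 * Traversal failed: unlock everything and redo all the traversals, in
 * sorted (btree/key) order so that the iterators can't deadlock against
 * each other; -ENOMEM from the btree node cache is handled by taking the
 * cannibalize lock and retrying:
 */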
static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
{
	struct btree_trans *trans = iter->trans;
	struct bch_fs *c = trans->c;
	u8 sorted[BTREE_ITER_MAX];
	unsigned i, nr_sorted = 0;

	trans_for_each_iter(trans, iter)
		sorted[nr_sorted++] = iter - trans->iters;

#define btree_iter_cmp_by_idx(_l, _r)				\
		btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])

	bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
#undef btree_iter_cmp_by_idx

retry_all:
	bch2_btree_trans_unlock(trans);

	if (ret != -ENOMEM && ret != -EINTR)
		goto io_error;

	if (ret == -ENOMEM) {
		struct closure cl;

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
			closure_sync(&cl);
		} while (ret);
	}

	/* Now, redo traversals in correct order: */
	for (i = 0; i < nr_sorted; i++) {
		iter = &trans->iters[sorted[i]];

		do {
			ret = __bch2_btree_iter_traverse(iter);
		} while (ret == -EINTR);

		if (ret)
			goto retry_all;
	}

	ret = btree_trans_has_multiple_iters(trans) ? -EINTR : 0;
out:
	bch2_btree_cache_cannibalize_unlock(c);
	return ret;
io_error:
	BUG_ON(ret != -EIO);

	iter->flags |= BTREE_ITER_ERROR;
	iter->l[iter->level].b = BTREE_ITER_NOT_END;
	goto out;
}

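/*
 * Walk up, unlocking as we go, until we find a level that is still locked
 * or relockable - and, if @check_pos, whose node covers the iterator
 * position; returns that level:
 */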
static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
					   bool check_pos)
{
	unsigned l = iter->level;

	while (btree_iter_node(iter, l) &&
	       !(is_btree_node(iter, l) &&
		 bch2_btree_node_relock(iter, l) &&
		 (!check_pos ||
		  btree_iter_pos_in_node(iter, iter->l[l].b)))) {
		btree_node_unlock(iter, l);
		iter->l[l].b = BTREE_ITER_NOT_END;
		l++;
	}

	return l;
}

/*
 * This is the main state machine for walking down the btree - walks down to a
 * specified depth
 *
 * Returns 0 on success, -EIO on error (error reading in a btree node).
 *
 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
 * stashed in the iterator and returned from bch2_btree_iter_unlock().
 */
int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
	unsigned depth_want = iter->level;

	if (unlikely(iter->level >= BTREE_MAX_DEPTH))
		return 0;

	if (bch2_btree_iter_relock(iter))
		return 0;

	/*
	 * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
	 * here unnecessary
	 */
	iter->level = btree_iter_up_until_locked(iter, true);

	/*
	 * If we've got a btree node locked (i.e. we aren't about to relock the
	 * root) - advance its node iterator if necessary:
	 *
	 * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
	 */
	if (btree_iter_node(iter, iter->level))
		btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);

	/*
	 * Note: iter->nodes[iter->level] may be temporarily NULL here - that
	 * would indicate to other code that we got to the end of the btree,
	 * here it indicates that relocking the root failed - it's critical that
	 * btree_iter_lock_root() comes next and that it can't fail
	 */
	while (iter->level > depth_want) {
		int ret = btree_iter_node(iter, iter->level)
			? btree_iter_down(iter)
			: btree_iter_lock_root(iter, depth_want);
		if (unlikely(ret)) {
			if (ret == 1)
				return 0;

			iter->level = depth_want;
			iter->l[iter->level].b = BTREE_ITER_NOT_END;
			return ret;
		}
	}

	iter->uptodate = BTREE_ITER_NEED_PEEK;

	bch2_btree_trans_verify_locks(iter->trans);
	__bch2_btree_iter_verify(iter, iter->l[iter->level].b);
	return 0;
}

int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
{
	int ret;

	ret = __bch2_btree_iter_traverse(iter);
	if (unlikely(ret))
		ret = btree_iter_traverse_error(iter, ret);

	BUG_ON(ret == -EINTR && !btree_trans_has_multiple_iters(iter->trans));

	return ret;
}

static inline void bch2_btree_iter_checks(struct btree_iter *iter,
					  enum btree_iter_type type)
{
	EBUG_ON(iter->btree_id >= BTREE_ID_NR);
	EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
		(iter->btree_id == BTREE_ID_EXTENTS &&
		 type != BTREE_ITER_NODES));

	bch2_btree_trans_verify_locks(iter->trans);
}

/* Iterate across nodes (leaf and interior nodes) */

struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
{
	struct btree *b;
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_NODES);

	if (iter->uptodate == BTREE_ITER_UPTODATE)
		return iter->l[iter->level].b;

	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		return NULL;

	b = btree_iter_node(iter, iter->level);
	if (!b)
		return NULL;

	BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);

	iter->pos = b->key.k.p;
	iter->uptodate = BTREE_ITER_UPTODATE;

	return b;
}

struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
{
	struct btree *b;
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_NODES);

	/* already got to end? */
	if (!btree_iter_node(iter, iter->level))
		return NULL;

	btree_iter_up(iter);

	if (!bch2_btree_node_relock(iter, iter->level))
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);

	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		return NULL;

	/* got to end? */
	b = btree_iter_node(iter, iter->level);
	if (!b)
		return NULL;

	if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
		/*
		 * Haven't gotten to the end of the parent node: go back down to
		 * the next child node
		 */

		/*
		 * We don't really want to be unlocking here except we can't
		 * directly tell btree_iter_traverse() "traverse to this level"
		 * except by setting iter->level, so we have to unlock so we
		 * don't screw up our lock invariants:
		 */
		if (btree_node_read_locked(iter, iter->level))
			btree_node_unlock(iter, iter->level);

		/* ick: */
		iter->pos = iter->btree_id == BTREE_ID_INODES
			? btree_type_successor(iter->btree_id, iter->pos)
			: bkey_successor(iter->pos);
		iter->level = depth;

		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		ret = bch2_btree_iter_traverse(iter);
		if (ret)
			return NULL;

		b = iter->l[iter->level].b;
	}

	iter->pos = b->key.k.p;
	iter->uptodate = BTREE_ITER_UPTODATE;

	return b;
}

/* Iterate across keys (in leaf nodes only) */

void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
{
	struct btree_iter_level *l = &iter->l[0];

	EBUG_ON(iter->level != 0);
	EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
	EBUG_ON(!btree_node_locked(iter, 0));
	EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);

	iter->pos = new_pos;
	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

	btree_iter_advance_to_pos(iter, l, -1);

	if (bch2_btree_node_iter_end(&l->iter) &&
	    btree_iter_pos_after_node(iter, l->b))
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
}

void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
	int cmp = bkey_cmp(new_pos, iter->pos);
	unsigned level;

	if (!cmp)
		return;

	iter->pos = new_pos;

	level = btree_iter_up_until_locked(iter, true);

	if (btree_iter_node(iter, level)) {
		/*
		 * We might have to skip over many keys, or just a few: try
		 * advancing the node iterator, and if we have to skip over too
		 * many keys just reinit it (or if we're rewinding, since that
		 * is expensive).
		 */
		if (cmp < 0 ||
		    !btree_iter_advance_to_pos(iter, &iter->l[level], 8))
			__btree_iter_init(iter, level);

		/* Don't leave it locked if we're not supposed to: */
		if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED)
			btree_node_unlock(iter, level);
	}

	if (level != iter->level)
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
	else
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}

static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c ret = { .k = &iter->k };

	if (!bkey_deleted(&iter->k)) {
		EBUG_ON(bch2_btree_node_iter_end(&l->iter));
		ret.v = bkeyp_val(&l->b->format,
			__bch2_btree_node_iter_peek_all(&l->iter, l->b));
	}

	if (debug_check_bkeys(iter->trans->c) &&
	    !bkey_deleted(ret.k))
		bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
	return ret;
}

struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c k;
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);

	if (iter->uptodate == BTREE_ITER_UPTODATE)
		return btree_iter_peek_uptodate(iter);

	while (1) {
		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);

		k = __btree_iter_peek(iter, l);
		if (likely(k.k))
			break;

		/* got to the end of the leaf, iterator needs to be traversed: */
		iter->pos = l->b->key.k.p;
		iter->uptodate = BTREE_ITER_NEED_TRAVERSE;

		if (!bkey_cmp(iter->pos, POS_MAX))
			return bkey_s_c_null;

		iter->pos = btree_type_successor(iter->btree_id, iter->pos);
	}

	/*
	 * iter->pos should always be equal to the key we just
	 * returned - except extents can straddle iter->pos:
	 */
	if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
	    bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
		iter->pos = bkey_start_pos(k.k);

	iter->uptodate = BTREE_ITER_UPTODATE;
	return k;
}

static noinline
struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];

	iter->pos = l->b->key.k.p;
	iter->uptodate = BTREE_ITER_NEED_TRAVERSE;

	if (!bkey_cmp(iter->pos, POS_MAX))
		return bkey_s_c_null;

	iter->pos = btree_type_successor(iter->btree_id, iter->pos);

	return bch2_btree_iter_peek(iter);
}

struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_packed *p;
	struct bkey_s_c k;

	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);

	if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
		k = bch2_btree_iter_peek(iter);
		if (IS_ERR_OR_NULL(k.k))
			return k;
	}

	do {
		bch2_btree_node_iter_advance(&l->iter, l->b);
		p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
		if (unlikely(!p))
			return bch2_btree_iter_peek_next_leaf(iter);
	} while (bkey_whiteout(p));

	k = __btree_iter_unpack(iter, l, &iter->k, p);

	EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
	iter->pos = bkey_start_pos(k.k);
	return k;
}

struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_packed *p;
	struct bkey_s_c k;
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);

	if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
		k = bch2_btree_iter_peek(iter);
		if (IS_ERR(k.k))
			return k;
	}

	while (1) {
		p = bch2_btree_node_iter_prev(&l->iter, l->b);
		if (likely(p))
			break;

		iter->pos = l->b->data->min_key;
		if (!bkey_cmp(iter->pos, POS_MIN))
			return bkey_s_c_null;

		bch2_btree_iter_set_pos(iter,
			btree_type_predecessor(iter->btree_id, iter->pos));

		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);

		p = bch2_btree_node_iter_peek(&l->iter, l->b);
		if (p)
			break;
	}

	k = __btree_iter_unpack(iter, l, &iter->k, p);

	EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);

	iter->pos = bkey_start_pos(k.k);
	iter->uptodate = BTREE_ITER_UPTODATE;
	return k;
}

static inline struct bkey_s_c
__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct btree_node_iter node_iter;
	struct bkey_s_c k;
	struct bkey n;
	int ret;

recheck:
	while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
	       bkey_deleted(k.k) &&
	       bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
		bch2_btree_node_iter_advance(&l->iter, l->b);

	/*
	 * iterator is now at the correct position for inserting at iter->pos,
	 * but we need to keep iterating until we find the first non whiteout so
	 * we know how big a hole we have, if any:
	 */

	node_iter = l->iter;
	if (k.k && bkey_whiteout(k.k))
		k = __btree_iter_unpack(iter, l, &iter->k,
			bch2_btree_node_iter_peek(&node_iter, l->b));

	/*
	 * If we got to the end of the node, check if we need to traverse to the
	 * next node:
	 */
	if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);

		goto recheck;
	}

	if (k.k &&
	    !bkey_whiteout(k.k) &&
	    bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
		/*
		 * if we skipped forward to find the first non whiteout and
		 * there _wasn't_ actually a hole, we want the iterator to be
		 * pointed at the key we found:
		 */
		l->iter = node_iter;

		EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
		EBUG_ON(bkey_deleted(k.k));
		iter->uptodate = BTREE_ITER_UPTODATE;
		return k;
	}

	/* hole */

	/* holes can't span inode numbers: */
	if (iter->pos.offset == KEY_OFFSET_MAX) {
		if (iter->pos.inode == KEY_INODE_MAX)
			return bkey_s_c_null;

		iter->pos = bkey_successor(iter->pos);
		goto recheck;
	}

	if (!k.k)
		k.k = &l->b->key.k;

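	/*
	 * Synthesize a key for the hole, extending from iter->pos to the
	 * start of the next key - capped at KEY_SIZE_MAX, and not crossing
	 * into the next inode:
	 */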
1c6fdbd8 KO |
1477 | bkey_init(&n); |
1478 | n.p = iter->pos; | |
271a3d3a KO |
1479 | bch2_key_resize(&n, |
1480 | min_t(u64, KEY_SIZE_MAX, | |
1481 | (k.k->p.inode == n.p.inode | |
1482 | ? bkey_start_offset(k.k) | |
1483 | : KEY_OFFSET_MAX) - | |
1484 | n.p.offset)); | |
1485 | ||
319f9ac3 | 1486 | EBUG_ON(!n.size); |
1c6fdbd8 | 1487 | |
271a3d3a KO |
1488 | iter->k = n; |
1489 | iter->uptodate = BTREE_ITER_UPTODATE; | |
1490 | return (struct bkey_s_c) { &iter->k, NULL }; | |
1491 | } | |
1c6fdbd8 | 1492 | |
271a3d3a KO |
1493 | static inline struct bkey_s_c |
1494 | __bch2_btree_iter_peek_slot(struct btree_iter *iter) | |
1495 | { | |
1496 | struct btree_iter_level *l = &iter->l[0]; | |
1497 | struct bkey_s_c k; | |
1498 | int ret; | |
1c6fdbd8 | 1499 | |
271a3d3a KO |
1500 | if (iter->flags & BTREE_ITER_IS_EXTENTS) |
1501 | return __bch2_btree_iter_peek_slot_extents(iter); | |
1c6fdbd8 | 1502 | |
271a3d3a KO |
1503 | recheck: |
1504 | while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && | |
1505 | bkey_deleted(k.k) && | |
1506 | bkey_cmp(k.k->p, iter->pos) == 0) | |
a00fd8c5 | 1507 | bch2_btree_node_iter_advance(&l->iter, l->b); |
1c6fdbd8 | 1508 | |
271a3d3a KO |
1509 | /* |
1510 | * If we got to the end of the node, check if we need to traverse to the | |
1511 | * next node: | |
1512 | */ | |
1513 | if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { | |
1514 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); | |
1515 | ret = bch2_btree_iter_traverse(iter); | |
1516 | if (unlikely(ret)) | |
1517 | return bkey_s_c_err(ret); | |
1c6fdbd8 | 1518 | |
271a3d3a | 1519 | goto recheck; |
1c6fdbd8 KO |
1520 | } |
1521 | ||
271a3d3a KO |
1522 | if (k.k && |
1523 | !bkey_deleted(k.k) && | |
1524 | !bkey_cmp(iter->pos, k.k->p)) { | |
1525 | iter->uptodate = BTREE_ITER_UPTODATE; | |
1526 | return k; | |
1527 | } else { | |
1528 | /* hole */ | |
1529 | bkey_init(&iter->k); | |
1530 | iter->k.p = iter->pos; | |
1531 | ||
1532 | iter->uptodate = BTREE_ITER_UPTODATE; | |
1533 | return (struct bkey_s_c) { &iter->k, NULL }; | |
1534 | } | |
1c6fdbd8 KO |
1535 | } |
1536 | ||
1537 | struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) | |
1538 | { | |
1539 | int ret; | |
1540 | ||
1541 | bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS); | |
1542 | ||
1543 | if (iter->uptodate == BTREE_ITER_UPTODATE) | |
1544 | return btree_iter_peek_uptodate(iter); | |
1545 | ||
1546 | ret = bch2_btree_iter_traverse(iter); | |
1547 | if (unlikely(ret)) | |
1548 | return bkey_s_c_err(ret); | |
1549 | ||
1550 | return __bch2_btree_iter_peek_slot(iter); | |
1551 | } | |
1552 | ||
1553 | struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) | |
1554 | { | |
1555 | bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS); | |
1556 | ||
1557 | iter->pos = btree_type_successor(iter->btree_id, iter->k.p); | |
1558 | ||
1559 | if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) { | |
1560 | /* | |
1561 | * XXX: when we just need to relock we should be able to avoid | |
1562 | * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK | |
1563 | * for that to work | |
1564 | */ | |
1565 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); | |
1566 | ||
1567 | return bch2_btree_iter_peek_slot(iter); | |
1568 | } | |
1569 | ||
1570 | if (!bkey_deleted(&iter->k)) | |
a00fd8c5 | 1571 | bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b); |
1c6fdbd8 KO |
1572 | |
1573 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); | |
1574 | ||
1575 | return __bch2_btree_iter_peek_slot(iter); | |
1576 | } | |
1577 | ||
9e5e5b9e KO |
1578 | static inline void bch2_btree_iter_init(struct btree_trans *trans, |
1579 | struct btree_iter *iter, enum btree_id btree_id, | |
424eb881 | 1580 | struct bpos pos, unsigned flags) |
1c6fdbd8 | 1581 | { |
9e5e5b9e | 1582 | struct bch_fs *c = trans->c; |
1c6fdbd8 KO |
1583 | unsigned i; |
1584 | ||
424eb881 KO |
1585 | if (btree_id == BTREE_ID_EXTENTS && |
1586 | !(flags & BTREE_ITER_NODES)) | |
1587 | flags |= BTREE_ITER_IS_EXTENTS; | |
1c6fdbd8 | 1588 | |
9e5e5b9e | 1589 | iter->trans = trans; |
1c6fdbd8 KO |
1590 | iter->pos = pos; |
1591 | bkey_init(&iter->k); | |
1592 | iter->k.p = pos; | |
1593 | iter->flags = flags; | |
1594 | iter->uptodate = BTREE_ITER_NEED_TRAVERSE; | |
1595 | iter->btree_id = btree_id; | |
424eb881 KO |
1596 | iter->level = 0; |
1597 | iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0; | |
1c6fdbd8 KO |
1598 | iter->nodes_locked = 0; |
1599 | iter->nodes_intent_locked = 0; | |
1600 | for (i = 0; i < ARRAY_SIZE(iter->l); i++) | |
1601 | iter->l[i].b = NULL; | |
1602 | iter->l[iter->level].b = BTREE_ITER_NOT_END; | |
1c6fdbd8 KO |
1603 | |
1604 | prefetch(c->btree_roots[btree_id].b); | |
1605 | } | |
1606 | ||
static void bch2_btree_iter_unlink(struct btree_iter *iter)
{
	struct btree_iter *linked;

	__bch2_btree_iter_unlock(iter);

	if (!btree_iter_linked(iter))
		return;

	trans_for_each_iter(iter->trans, linked)
		if (linked->next == iter) {
			linked->next = iter->next;
			iter->next = iter;
			return;
		}

	BUG();
}

static void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
{
	BUG_ON(btree_iter_linked(new));

	new->next = iter->next;
	iter->next = new;
}
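
/*
 * Copying an iterator (below) also copies its locking state: for each
 * btree node the source holds locked, take another reference on that
 * node's lock, so the copy's locks can later be dropped independently.
 */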
static void __bch2_btree_iter_copy(struct btree_iter *dst,
				   struct btree_iter *src)
{
	unsigned i;

	memcpy(dst, src, offsetof(struct btree_iter, next));

	for (i = 0; i < BTREE_MAX_DEPTH; i++)
		if (btree_node_locked(dst, i))
			six_lock_increment(&dst->l[i].b->lock,
					   __btree_lock_want(dst, i));
}

void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
{
	__bch2_btree_iter_unlock(dst);
	__bch2_btree_iter_copy(dst, src);
}

/* new transactional stuff: */
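
/*
 * A rough sketch of how the pieces below fit together (error handling
 * and the actual update path elided; the btree id, position and flags
 * are just for illustration):
 *
 *	struct btree_trans trans;
 *	struct btree_iter *iter;
 *	struct bkey_s_c k;
 *
 *	bch2_trans_init(&trans, c);
 *
 *	iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, pos,
 *				     BTREE_ITER_SLOTS, 0);
 *	k = bch2_btree_iter_peek_slot(iter);
 *	bch2_trans_iter_put(&trans, iter);
 *
 *	bch2_trans_exit(&trans);
 */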

static void btree_trans_verify(struct btree_trans *trans)
{
	unsigned i;

	for (i = 0; i < trans->nr_iters; i++) {
		struct btree_iter *iter = &trans->iters[i];

		BUG_ON(btree_iter_linked(iter) !=
		       ((trans->iters_linked & (1ULL << i)) &&
			!is_power_of_2(trans->iters_linked)));
	}
}

static inline unsigned btree_trans_iter_idx(struct btree_trans *trans,
					    struct btree_iter *iter)
{
	ssize_t idx = iter - trans->iters;

	EBUG_ON(idx < 0 || idx >= trans->nr_iters);
	EBUG_ON(!(trans->iters_linked & (1ULL << idx)));

	return idx;
}
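
/*
 * Iterator lifecycle within a transaction: "putting" an iterator only
 * marks its slot as no longer live, so a later request for the same
 * position can reuse it; "freeing" unlinks the iterator and releases
 * the slot entirely.
 */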
int bch2_trans_iter_put(struct btree_trans *trans,
			struct btree_iter *iter)
{
	ssize_t idx = btree_trans_iter_idx(trans, iter);
	int ret = btree_iter_err(iter);

	trans->iters_live &= ~(1ULL << idx);
	return ret;
}

static inline void __bch2_trans_iter_free(struct btree_trans *trans,
					  unsigned idx)
{
	trans->iters_linked &= ~(1ULL << idx);
	trans->iters_live &= ~(1ULL << idx);
	trans->iters_touched &= ~(1ULL << idx);
	trans->iters_unlink_on_restart &= ~(1ULL << idx);
	trans->iters_unlink_on_commit &= ~(1ULL << idx);
	bch2_btree_iter_unlink(&trans->iters[idx]);
}

int bch2_trans_iter_free(struct btree_trans *trans,
			 struct btree_iter *iter)
{
	int ret = btree_iter_err(iter);

	__bch2_trans_iter_free(trans, btree_trans_iter_idx(trans, iter));
	return ret;
}

int bch2_trans_iter_free_on_commit(struct btree_trans *trans,
				   struct btree_iter *iter)
{
	int ret = btree_iter_err(iter);

	trans->iters_unlink_on_commit |=
		1ULL << btree_trans_iter_idx(trans, iter);
	return ret;
}
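
/*
 * Grow the iterator and update arrays, which live in a single
 * allocation: first try kmalloc(), falling back to the (fixed size)
 * mempool. Existing iterators are copied over and relinked; because any
 * iterator pointers the caller holds are now stale, return -EINTR to
 * force a transaction restart if there were live iterators.
 */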
static int btree_trans_realloc_iters(struct btree_trans *trans,
				     unsigned new_size)
{
	void *new_iters, *new_updates;
	unsigned i;

	BUG_ON(new_size > BTREE_ITER_MAX);

	if (new_size <= trans->size)
		return 0;

	BUG_ON(trans->used_mempool);

	bch2_trans_unlock(trans);

	new_iters = kmalloc(sizeof(struct btree_iter) * new_size +
			    sizeof(struct btree_insert_entry) * (new_size + 4),
			    GFP_NOFS);
	if (new_iters)
		goto success;

	new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
	new_size = BTREE_ITER_MAX;

	trans->used_mempool = true;
success:
	new_updates = new_iters + sizeof(struct btree_iter) * new_size;

	memcpy(new_iters, trans->iters,
	       sizeof(struct btree_iter) * trans->nr_iters);
	memcpy(new_updates, trans->updates,
	       sizeof(struct btree_insert_entry) * trans->nr_updates);

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		memset(trans->iters, POISON_FREE,
		       sizeof(struct btree_iter) * trans->nr_iters +
		       sizeof(struct btree_insert_entry) * trans->nr_updates);

	if (trans->iters != trans->iters_onstack)
		kfree(trans->iters);

	trans->iters = new_iters;
	trans->updates = new_updates;
	trans->size = new_size;

	for (i = 0; i < trans->nr_iters; i++)
		trans->iters[i].next = &trans->iters[i];

	if (trans->iters_linked) {
		unsigned first_linked = __ffs(trans->iters_linked);

		for (i = first_linked + 1; i < trans->nr_iters; i++)
			if (trans->iters_linked & (1ULL << i))
				bch2_btree_iter_link(&trans->iters[first_linked],
						     &trans->iters[i]);
	}

	btree_trans_verify(trans);

	if (trans->iters_live) {
		trans_restart();
		return -EINTR;
	}

	return 0;
}

void bch2_trans_preload_iters(struct btree_trans *trans)
{
	btree_trans_realloc_iters(trans, BTREE_ITER_MAX);
}
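
/*
 * Allocate a slot for a new iterator: reuse the first free (unlinked)
 * slot if there is one, otherwise grow the arrays, doubling their size.
 */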
static int btree_trans_iter_alloc(struct btree_trans *trans)
{
	struct btree_iter *iter;
	unsigned idx = ffz(trans->iters_linked);

	if (idx < trans->nr_iters)
		goto got_slot;

	if (trans->nr_iters == trans->size) {
		int ret = btree_trans_realloc_iters(trans, trans->size * 2);
		if (ret)
			return ret;
	}

	idx = trans->nr_iters++;
	BUG_ON(trans->nr_iters > trans->size);
got_slot:
	iter = &trans->iters[idx];
	iter->next = iter;

	BUG_ON(trans->iters_linked & (1ULL << idx));

	if (trans->iters_linked)
		bch2_btree_iter_link(&trans->iters[__ffs(trans->iters_linked)],
				     &trans->iters[idx]);
	trans->iters_linked |= 1ULL << idx;
	return idx;
}
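
/*
 * Look up an iterator to reuse: match on @iter_id if one was given,
 * otherwise on btree id and position; allocate a fresh iterator only if
 * nothing matches.
 */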
static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
						 unsigned btree_id, struct bpos pos,
						 unsigned flags, u64 iter_id)
{
	struct btree_iter *iter;
	int idx;

	BUG_ON(trans->nr_iters > BTREE_ITER_MAX);

	for (idx = 0; idx < trans->nr_iters; idx++) {
		if (!(trans->iters_linked & (1ULL << idx)))
			continue;

		iter = &trans->iters[idx];
		if (iter_id
		    ? iter->id == iter_id
		    : (iter->btree_id == btree_id &&
		       !bkey_cmp(iter->pos, pos)))
			goto found;
	}
	idx = -1;
found:
	if (idx < 0) {
		idx = btree_trans_iter_alloc(trans);
		if (idx < 0)
			return ERR_PTR(idx);

		iter = &trans->iters[idx];
		iter->id = iter_id;

		bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
	} else {
		iter = &trans->iters[idx];

		iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
		iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
	}

	BUG_ON(iter->btree_id != btree_id);
	BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
	BUG_ON(trans->iters_live & (1ULL << idx));
	trans->iters_live |= 1ULL << idx;
	trans->iters_touched |= 1ULL << idx;

	btree_trans_verify(trans);

	return iter;
}

struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
					 enum btree_id btree_id,
					 struct bpos pos, unsigned flags,
					 u64 iter_id)
{
	struct btree_iter *iter =
		__btree_trans_get_iter(trans, btree_id, pos, flags, iter_id);

	if (!IS_ERR(iter))
		bch2_btree_iter_set_pos(iter, pos);
	return iter;
}

struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bpos pos,
					    unsigned locks_want,
					    unsigned depth,
					    unsigned flags)
{
	struct btree_iter *iter =
		__btree_trans_get_iter(trans, btree_id, pos,
				       flags|BTREE_ITER_NODES, 0);
	unsigned i;

	BUG_ON(IS_ERR(iter));
	BUG_ON(bkey_cmp(iter->pos, pos));

	iter->locks_want = locks_want;
	iter->level = depth;

	for (i = 0; i < ARRAY_SIZE(iter->l); i++)
		iter->l[i].b = NULL;
	iter->l[iter->level].b = BTREE_ITER_NOT_END;

	return iter;
}

struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
					struct btree_iter *src)
{
	int idx;

	idx = btree_trans_iter_alloc(trans);
	if (idx < 0)
		return ERR_PTR(idx);

	trans->iters_live |= 1ULL << idx;
	trans->iters_touched |= 1ULL << idx;
	trans->iters_unlink_on_restart |= 1ULL << idx;

	__bch2_btree_iter_copy(&trans->iters[idx], src);

	return &trans->iters[idx];
}
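
/*
 * Bump allocator for memory that must live as long as the transaction:
 * allocations are satisfied from trans->mem, which is grown (to the
 * next power of two) on demand. Growing uses krealloc(), which can move
 * the buffer and thus invalidate previously returned pointers; in that
 * case return -EINTR to force a transaction restart.
 */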
void *bch2_trans_kmalloc(struct btree_trans *trans,
			 size_t size)
{
	void *ret;

	if (trans->mem_top + size > trans->mem_bytes) {
		size_t old_bytes = trans->mem_bytes;
		size_t new_bytes = roundup_pow_of_two(trans->mem_top + size);
		void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);

		if (!new_mem)
			return ERR_PTR(-ENOMEM);

		trans->mem = new_mem;
		trans->mem_bytes = new_bytes;

		if (old_bytes) {
			trans_restart();
			return ERR_PTR(-EINTR);
		}
	}

	ret = trans->mem + trans->mem_top;
	trans->mem_top += size;
	return ret;
}

int bch2_trans_unlock(struct btree_trans *trans)
{
	u64 iters = trans->iters_linked;
	int ret = 0;

	while (iters) {
		unsigned idx = __ffs64(iters);
		struct btree_iter *iter = &trans->iters[idx];

		ret = ret ?: btree_iter_err(iter);

		__bch2_btree_iter_unlock(iter);
		iters ^= 1ULL << idx;
	}

	return ret;
}

inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters)
{
	iters &= trans->iters_linked;

	while (iters) {
		unsigned idx = __ffs64(iters);

		iters &= ~(1ULL << idx);
		__bch2_trans_iter_free(trans, idx);
	}
}

void __bch2_trans_begin(struct btree_trans *trans)
{
	u64 iters_to_unlink;

	btree_trans_verify(trans);

	/*
	 * On transaction restart, the transaction isn't required to allocate
	 * all the same iterators it allocated on the last iteration:
	 *
	 * Unlink any iterators it didn't use this iteration, provided it got
	 * further (i.e. allocated an iterator with a higher idx) than where
	 * a given iterator was originally allocated:
	 */
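	/*
	 * For example: with iters_live = 0b00101, the highest live idx is 2,
	 * so the mask below is (1 << 3) - 1 = 0b00111, and ~iters_live
	 * restricts that to just idx 1: the one slot allocated on a previous
	 * iteration but not used on this one.
	 */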
	iters_to_unlink = ~trans->iters_live &
		((1ULL << fls64(trans->iters_live)) - 1);

	iters_to_unlink |= trans->iters_unlink_on_restart;
	iters_to_unlink |= trans->iters_unlink_on_commit;

	bch2_trans_unlink_iters(trans, iters_to_unlink);

	trans->iters_live = 0;
	trans->iters_touched = 0;
	trans->iters_unlink_on_restart = 0;
	trans->iters_unlink_on_commit = 0;
	trans->nr_updates = 0;
	trans->mem_top = 0;

	btree_trans_verify(trans);
}

void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c)
{
	memset(trans, 0, offsetof(struct btree_trans, iters_onstack));

	trans->c = c;
	trans->size = ARRAY_SIZE(trans->iters_onstack);
	trans->iters = trans->iters_onstack;
	trans->updates = trans->updates_onstack;
}
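
/*
 * Tearing down a transaction frees everything btree_trans_realloc_iters()
 * and bch2_trans_kmalloc() may have allocated; the pointers are then
 * poisoned with an obviously invalid address so that any use after
 * bch2_trans_exit() faults immediately.
 */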
int bch2_trans_exit(struct btree_trans *trans)
{
	int ret = bch2_trans_unlock(trans);

	kfree(trans->mem);
	if (trans->used_mempool)
		mempool_free(trans->iters, &trans->c->btree_iters_pool);
	else if (trans->iters != trans->iters_onstack)
		kfree(trans->iters);
	trans->mem = (void *) 0x1;
	trans->iters = (void *) 0x1;
	return ret;
}