Commit | Line | Data |
---|---|---|
1c6fdbd8 KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include "bcachefs.h" | |
4 | #include "bkey_methods.h" | |
5 | #include "btree_cache.h" | |
6 | #include "btree_iter.h" | |
7 | #include "btree_locking.h" | |
8 | #include "debug.h" | |
9 | #include "extents.h" | |
10 | #include "trace.h" | |
11 | ||
12 | #include <linux/prefetch.h> | |
13 | ||
14 | static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *, | |
15 | struct btree_iter_level *, | |
16 | struct bkey *); | |
17 | ||
ed8413fd KO |
/*
 * Sentinel values stored in iter->l[level].b in place of a real node pointer,
 * recording *why* the iterator has no node at that level.  is_btree_node()
 * recognizes them by value (all are small integers, well below 128).
 */
#define BTREE_ITER_NO_NODE_GET_LOCKS	((struct btree *) 1)
#define BTREE_ITER_NO_NODE_DROP		((struct btree *) 2)
#define BTREE_ITER_NO_NODE_LOCK_ROOT	((struct btree *) 3)
#define BTREE_ITER_NO_NODE_UP		((struct btree *) 4)
#define BTREE_ITER_NO_NODE_DOWN		((struct btree *) 5)
#define BTREE_ITER_NO_NODE_INIT		((struct btree *) 6)
#define BTREE_ITER_NO_NODE_ERROR	((struct btree *) 7)
1c6fdbd8 KO |
25 | |
/*
 * True if iter->l[l].b points at an actual btree node, false if it holds
 * NULL or one of the BTREE_ITER_NO_NODE_* sentinels above.  The >= 128
 * test cheaply excludes both NULL and every sentinel value.
 */
static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
{
	return l < BTREE_MAX_DEPTH &&
		(unsigned long) iter->l[l].b >= 128;
}
31 | ||
a00fd8c5 KO |
/*
 * Returns < 0 if @k is before iter pos, > 0 if @k is after.
 *
 * NOTE(review): @b may be NULL when @k is already unpacked (see
 * btree_iter_pos_after_node()) — presumably bkey_cmp_left_packed() handles
 * the unpacked case without needing a format; confirm against its definition.
 */
static inline int __btree_iter_pos_cmp(struct btree_iter *iter,
				       const struct btree *b,
				       const struct bkey_packed *k,
				       bool interior_node)
{
	int cmp = bkey_cmp_left_packed(b, k, &iter->pos);

	if (cmp)
		return cmp;
	/* Equal positions: a deleted key sorts before the iterator pos: */
	if (bkey_deleted(k))
		return -1;

	/*
	 * Normally, for extents we want the first key strictly greater than
	 * the iterator position - with the exception that for interior nodes,
	 * we don't want to advance past the last key if the iterator position
	 * is POS_MAX:
	 */
	if (iter->flags & BTREE_ITER_IS_EXTENTS &&
	    (!interior_node ||
	     bkey_cmp_left_packed_byval(b, k, POS_MAX)))
		return -1;
	return 1;
}
57 | ||
58 | static inline int btree_iter_pos_cmp(struct btree_iter *iter, | |
59 | const struct btree *b, | |
60 | const struct bkey_packed *k) | |
61 | { | |
c43a6ef9 | 62 | return __btree_iter_pos_cmp(iter, b, k, b->c.level != 0); |
a00fd8c5 KO |
63 | } |
64 | ||
1c6fdbd8 KO |
65 | /* Btree node locking: */ |
66 | ||
/*
 * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
 * succeed:
 *
 * Caller holds the write lock on @b; every iterator pointing at @b has its
 * saved lock_seq bumped by 2 so it still matches the node's post-unlock
 * sequence number (six lock seq is odd while write-locked, even otherwise —
 * NOTE(review): inferred from the +1 assertion below, confirm against six.h).
 */
void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
{
	struct btree_iter *linked;

	EBUG_ON(iter->l[b->c.level].b != b);
	EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);

	trans_for_each_iter_with_node(iter->trans, b, linked)
		linked->l[b->c.level].lock_seq += 2;

	six_unlock_write(&b->c.lock);
}
83 | ||
/*
 * Take the write lock on @b, which we hold intent-locked.
 *
 * Other iterators in this transaction may hold read locks on @b; those would
 * block the write lock forever, so we temporarily subtract their reader
 * count from the lock state, take the write lock, then restore it.
 */
void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
{
	struct btree_iter *linked;
	unsigned readers = 0;

	EBUG_ON(btree_node_read_locked(iter, b->c.level));

	/* Count read locks held on @b by iterators in this transaction: */
	trans_for_each_iter(iter->trans, linked)
		if (linked->l[b->c.level].b == b &&
		    btree_node_read_locked(linked, b->c.level))
			readers++;

	/*
	 * Must drop our read locks before calling six_lock_write() -
	 * six_unlock() won't do wakeups until the reader count
	 * goes to 0, and it's safe because we have the node intent
	 * locked:
	 */
	atomic64_sub(__SIX_VAL(read_lock, readers),
		     &b->c.lock.state.counter);
	btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
	/* Restore the reader count we artificially dropped above: */
	atomic64_add(__SIX_VAL(read_lock, readers),
		     &b->c.lock.state.counter);
}
108 | ||
1c6fdbd8 KO |
109 | bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) |
110 | { | |
111 | struct btree *b = btree_iter_node(iter, level); | |
112 | int want = __btree_lock_want(iter, level); | |
113 | ||
ed8413fd | 114 | if (!is_btree_node(iter, level)) |
1c6fdbd8 KO |
115 | return false; |
116 | ||
117 | if (race_fault()) | |
118 | return false; | |
119 | ||
ed8413fd KO |
120 | if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || |
121 | (btree_node_lock_seq_matches(iter, b, level) && | |
122 | btree_node_lock_increment(iter, b, level, want))) { | |
123 | mark_btree_node_locked(iter, level, want); | |
124 | return true; | |
125 | } else { | |
1c6fdbd8 | 126 | return false; |
ed8413fd | 127 | } |
1c6fdbd8 KO |
128 | } |
129 | ||
/*
 * Upgrade our lock on the node at @level to an intent lock.  Tries, in
 * order: already intent locked; tryupgrade/relock; borrowing an intent lock
 * held by another iterator in this transaction.  Returns false if none
 * succeed (caller must restart traversal).
 */
static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
{
	struct btree *b = iter->l[level].b;

	EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);

	if (!is_btree_node(iter, level))
		return false;

	if (btree_node_intent_locked(iter, level))
		return true;

	if (race_fault())
		return false;

	/* read locked -> try upgrading in place; unlocked -> relock by seq: */
	if (btree_node_locked(iter, level)
	    ? six_lock_tryupgrade(&b->c.lock)
	    : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
		goto success;

	/* Borrow an intent lock another iterator already holds on @b: */
	if (btree_node_lock_seq_matches(iter, b, level) &&
	    btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
		btree_node_unlock(iter, level);
		goto success;
	}

	return false;
success:
	mark_btree_node_intent_locked(iter, level);
	return true;
}
161 | ||
/*
 * Relock (or upgrade, if @upgrade) every level from iter->level up to
 * iter->locks_want.  On any failure, unwind: drop locks on all levels below
 * the failed one so children can't be relocked before their ancestor, and
 * mark the iterator as needing a full traverse.  @trace controls whether
 * failures emit tracepoints.  Returns true if the iterator ended up fully
 * relocked.
 */
static inline bool btree_iter_get_locks(struct btree_iter *iter,
					bool upgrade, bool trace)
{
	unsigned l = iter->level;
	int fail_idx = -1;	/* highest level we failed to lock, or -1 */

	do {
		if (!btree_iter_node(iter, l))
			break;

		if (!(upgrade
		      ? bch2_btree_node_upgrade(iter, l)
		      : bch2_btree_node_relock(iter, l))) {
			if (trace)
				(upgrade
				 ? trace_node_upgrade_fail
				 : trace_node_relock_fail)(l, iter->l[l].lock_seq,
						is_btree_node(iter, l)
						? 0
						: (unsigned long) iter->l[l].b,
						is_btree_node(iter, l)
						? iter->l[l].b->c.lock.state.seq
						: 0);

			fail_idx = l;
			btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		}

		l++;
	} while (l < iter->locks_want);

	/*
	 * When we fail to get a lock, we have to ensure that any child nodes
	 * can't be relocked so bch2_btree_iter_traverse has to walk back up to
	 * the node that we failed to relock:
	 */
	while (fail_idx >= 0) {
		btree_node_unlock(iter, fail_idx);
		iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
		--fail_idx;
	}

	if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
		iter->uptodate = BTREE_ITER_NEED_PEEK;

	bch2_btree_trans_verify_locks(iter->trans);

	return iter->uptodate < BTREE_ITER_NEED_RELOCK;
}
211 | ||
/* Slowpath: */
/*
 * Take a lock on @b, checking the transaction's other iterators for
 * deadlock hazards first.  Returns false (after emitting a would-deadlock
 * tracepoint) if blocking here could deadlock — caller must restart the
 * transaction with -EINTR.
 */
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
			    unsigned level,
			    struct btree_iter *iter,
			    enum six_lock_type type)
{
	struct btree_iter *linked;
	bool ret = true;

	/* Check if it's safe to block: */
	trans_for_each_iter(iter->trans, linked) {
		if (!linked->nodes_locked)
			continue;

		/* Must lock btree nodes in key order: */
		if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
			ret = false;

		/*
		 * Can't block taking an intent lock if we have _any_ nodes read
		 * locked:
		 *
		 * - Our read lock blocks another thread with an intent lock on
		 *   the same node from getting a write lock, and thus from
		 *   dropping its intent lock
		 *
		 * - And the other thread may have multiple nodes intent locked:
		 *   both the node we want to intent lock, and the node we
		 *   already have read locked - deadlock:
		 */
		if (type == SIX_LOCK_intent &&
		    linked->nodes_locked != linked->nodes_intent_locked) {
			if (!(iter->trans->nounlock)) {
				linked->locks_want = max_t(unsigned,
						linked->locks_want,
						__fls(linked->nodes_locked) + 1);
				btree_iter_get_locks(linked, true, false);
			}
			ret = false;
		}

		/*
		 * Interior nodes must be locked before their descendants: if
		 * another iterator has possible descendants locked of the node
		 * we're about to lock, it must have the ancestors locked too:
		 */
		if (linked->btree_id == iter->btree_id &&
		    level > __fls(linked->nodes_locked)) {
			if (!(iter->trans->nounlock)) {
				linked->locks_want =
					max(level + 1, max_t(unsigned,
					    linked->locks_want,
					    iter->locks_want));
				btree_iter_get_locks(linked, true, false);
			}
			ret = false;
		}
	}

	if (unlikely(!ret)) {
		trace_trans_restart_would_deadlock(iter->trans->ip);
		return false;
	}

	__btree_node_lock_type(iter->trans->c, b, type);
	return true;
}
279 | ||
280 | /* Btree iterator locking: */ | |
281 | ||
282 | #ifdef CONFIG_BCACHEFS_DEBUG | |
0f238367 | 283 | void bch2_btree_iter_verify_locks(struct btree_iter *iter) |
1c6fdbd8 KO |
284 | { |
285 | unsigned l; | |
286 | ||
287 | for (l = 0; btree_iter_node(iter, l); l++) { | |
288 | if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && | |
289 | !btree_node_locked(iter, l)) | |
290 | continue; | |
291 | ||
292 | BUG_ON(btree_lock_want(iter, l) != | |
293 | btree_node_locked_type(iter, l)); | |
294 | } | |
295 | } | |
ad7ae8d6 | 296 | |
0f238367 | 297 | void bch2_btree_trans_verify_locks(struct btree_trans *trans) |
ad7ae8d6 | 298 | { |
0f238367 | 299 | struct btree_iter *iter; |
ad7ae8d6 | 300 | |
0f238367 KO |
301 | trans_for_each_iter(trans, iter) |
302 | bch2_btree_iter_verify_locks(iter); | |
ad7ae8d6 | 303 | } |
1c6fdbd8 KO |
304 | #endif |
305 | ||
306 | __flatten | |
7d825866 | 307 | static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) |
1c6fdbd8 KO |
308 | { |
309 | return iter->uptodate >= BTREE_ITER_NEED_RELOCK | |
7d825866 | 310 | ? btree_iter_get_locks(iter, false, trace) |
1c6fdbd8 KO |
311 | : true; |
312 | } | |
313 | ||
1c6fdbd8 KO |
/*
 * Raise iter->locks_want to @new_locks_want and try to take intent locks on
 * the additional levels.  On failure, also raises locks_want on sibling
 * iterators over the same btree (see comment below) before returning false.
 */
bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
			       unsigned new_locks_want)
{
	struct btree_iter *linked;

	EBUG_ON(iter->locks_want >= new_locks_want);

	iter->locks_want = new_locks_want;

	if (btree_iter_get_locks(iter, true, true))
		return true;

	/*
	 * Ancestor nodes must be locked before child nodes, so set locks_want
	 * on iterators that might lock ancestors before us to avoid getting
	 * -EINTR later:
	 */
	trans_for_each_iter(iter->trans, linked)
		if (linked != iter &&
		    linked->btree_id == iter->btree_id &&
		    linked->locks_want < new_locks_want) {
			linked->locks_want = new_locks_want;
			btree_iter_get_locks(linked, true, false);
		}

	return false;
}
341 | ||
/*
 * Like __bch2_btree_iter_upgrade(), but never drops locks: upgrades level
 * by level and, on the first failure, truncates locks_want to the failed
 * level and returns false, leaving everything below still locked.
 */
bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
					unsigned new_locks_want)
{
	unsigned l = iter->level;

	EBUG_ON(iter->locks_want >= new_locks_want);

	iter->locks_want = new_locks_want;

	do {
		if (!btree_iter_node(iter, l))
			break;

		if (!bch2_btree_node_upgrade(iter, l)) {
			/* keep what we did manage to upgrade: */
			iter->locks_want = l;
			return false;
		}

		l++;
	} while (l < iter->locks_want);

	return true;
}
365 | ||
/*
 * Drop locks_want back down to @downgrade_to (or the iterator's natural
 * minimum — 1 if BTREE_ITER_INTENT, else 0), unlocking levels above the
 * new locks_want and downgrading the boundary level from intent to read.
 */
void __bch2_btree_iter_downgrade(struct btree_iter *iter,
				 unsigned downgrade_to)
{
	struct btree_iter *linked;
	unsigned l;

	/*
	 * We downgrade linked iterators as well because btree_iter_upgrade
	 * might have had to modify locks_want on linked iterators due to lock
	 * ordering:
	 */
	trans_for_each_iter(iter->trans, linked) {
		unsigned new_locks_want = downgrade_to ?:
			(linked->flags & BTREE_ITER_INTENT ? 1 : 0);

		if (linked->locks_want <= new_locks_want)
			continue;

		linked->locks_want = new_locks_want;

		/* walk down from the highest locked level: */
		while (linked->nodes_locked &&
		       (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
			if (l > linked->level) {
				btree_node_unlock(linked, l);
			} else {
				/* at the iterator's own level: keep the lock,
				 * but intent -> read if possible: */
				if (btree_node_intent_locked(linked, l)) {
					six_lock_downgrade(&linked->l[l].b->c.lock);
					linked->nodes_intent_locked ^= 1 << l;
				}
				break;
			}
		}
	}

	bch2_btree_trans_verify_locks(iter->trans);
}
402 | ||
58fbf808 KO |
403 | /* Btree transaction locking: */ |
404 | ||
405 | bool bch2_trans_relock(struct btree_trans *trans) | |
0f238367 KO |
406 | { |
407 | struct btree_iter *iter; | |
408 | bool ret = true; | |
409 | ||
410 | trans_for_each_iter(trans, iter) | |
7d825866 KO |
411 | if (iter->uptodate == BTREE_ITER_NEED_RELOCK) |
412 | ret &= bch2_btree_iter_relock(iter, true); | |
0f238367 KO |
413 | |
414 | return ret; | |
415 | } | |
416 | ||
58fbf808 | 417 | void bch2_trans_unlock(struct btree_trans *trans) |
0f238367 KO |
418 | { |
419 | struct btree_iter *iter; | |
420 | ||
421 | trans_for_each_iter(trans, iter) | |
422 | __bch2_btree_iter_unlock(iter); | |
423 | } | |
424 | ||
1c6fdbd8 KO |
425 | /* Btree iterator: */ |
426 | ||
427 | #ifdef CONFIG_BCACHEFS_DEBUG | |
428 | ||
/*
 * Debug check: the iterator's node iterator for @b must sit between the
 * previous key (strictly before iter->pos) and the current key (at or after
 * iter->pos); panics with the offending key otherwise.
 */
static void __bch2_btree_iter_verify(struct btree_iter *iter,
				     struct btree *b)
{
	struct btree_iter_level *l = &iter->l[b->c.level];
	struct btree_node_iter tmp = l->iter;
	struct bkey_packed *k;

	if (!debug_check_iterators(iter->trans->c))
		return;

	/* position only meaningful when the iterator is up to date: */
	if (iter->uptodate > BTREE_ITER_NEED_PEEK)
		return;

	bch2_btree_node_iter_verify(&l->iter, b);

	/*
	 * For interior nodes, the iterator will have skipped past
	 * deleted keys:
	 *
	 * For extents, the iterator may have skipped past deleted keys (but not
	 * whiteouts)
	 */
	k = b->c.level || iter->flags & BTREE_ITER_IS_EXTENTS
		? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
		: bch2_btree_node_iter_prev_all(&tmp, b);
	if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
		char buf[100];
		struct bkey uk = bkey_unpack_key(b, k);

		bch2_bkey_to_text(&PBUF(buf), &uk);
		panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
		      buf, iter->pos.inode, iter->pos.offset);
	}

	k = bch2_btree_node_iter_peek_all(&l->iter, b);
	if (k && btree_iter_pos_cmp(iter, b, k) < 0) {
		char buf[100];
		struct bkey uk = bkey_unpack_key(b, k);

		bch2_bkey_to_text(&PBUF(buf), &uk);
		panic("iter should be after current key:\n"
		      "iter pos %llu:%llu\n"
		      "cur key %s\n",
		      iter->pos.inode, iter->pos.offset, buf);
	}

	/* an UPTODATE keys iterator holding a real key can't be at the end: */
	BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
	       (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS &&
	       !bkey_whiteout(&iter->k) &&
	       bch2_btree_node_iter_end(&l->iter));
}
480 | ||
/* Verify every iterator in the transaction that points at node @b. */
void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;

	if (!debug_check_iterators(iter->trans->c))
		return;

	trans_for_each_iter_with_node(iter->trans, b, linked)
		__bch2_btree_iter_verify(linked, b);
}
491 | ||
271a3d3a KO |
492 | #else |
493 | ||
/* Non-debug builds: iterator verification compiles away to nothing. */
static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
					    struct btree *b) {}
496 | ||
1c6fdbd8 KO |
497 | #endif |
498 | ||
/*
 * Fix up one node iterator after keys were inserted/overwritten in @b:
 * @where points at the modified position in bset @t, @clobber_u64s u64s were
 * replaced by @new_u64s u64s.  Re-points the iterator's per-bset cursor so
 * it stays consistent with the iterator position, re-adding or dropping the
 * bset from the node iterator as needed.
 */
static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
				      struct btree *b,
				      struct btree_node_iter *node_iter,
				      struct bset_tree *t,
				      struct bkey_packed *where,
				      unsigned clobber_u64s,
				      unsigned new_u64s)
{
	const struct bkey_packed *end = btree_bkey_last(b, t);
	struct btree_node_iter_set *set;
	unsigned offset = __btree_node_key_to_offset(b, where);
	int shift = new_u64s - clobber_u64s;
	/* end offset of bset @t as it was *before* this modification: */
	unsigned old_end = t->end_offset - shift;

	btree_node_iter_for_each(node_iter, set)
		if (set->end == old_end)
			goto found;

	/* didn't find the bset in the iterator - might have to readd it: */
	if (new_u64s &&
	    btree_iter_pos_cmp(iter, b, where) > 0) {
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

		bch2_btree_node_iter_push(node_iter, b, where, end);

		if (!b->c.level &&
		    node_iter == &iter->l[0].iter)
			bkey_disassemble(b,
				bch2_btree_node_iter_peek_all(node_iter, b),
				&iter->k);
	}
	return;
found:
	set->end = t->end_offset;

	/* Iterator hasn't gotten to the key that changed yet: */
	if (set->k < offset)
		return;

	if (new_u64s &&
	    btree_iter_pos_cmp(iter, b, where) > 0) {
		/* new key is at/after iter pos - point directly at it: */
		set->k = offset;
	} else if (set->k < offset + clobber_u64s) {
		/* iterator was inside the clobbered range - skip past it: */
		set->k = offset + new_u64s;
		if (set->k == set->end)
			bch2_btree_node_iter_set_drop(node_iter, set);
	} else {
		/* iterator was past the modification - just shift it: */
		set->k = (int) set->k + shift;
		goto iter_current_key_not_modified;
	}

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

	bch2_btree_node_iter_sort(node_iter, b);
	if (!b->c.level && node_iter == &iter->l[0].iter) {
		/*
		 * not legal to call bkey_debugcheck() here, because we're
		 * called midway through the update path after update has been
		 * marked but before deletes have actually happened:
		 */
#if 0
		__btree_iter_peek_all(iter, &iter->l[0], &iter->k);
#endif
		struct btree_iter_level *l = &iter->l[0];
		struct bkey_packed *k =
			bch2_btree_node_iter_peek_all(&l->iter, l->b);

		if (unlikely(!k))
			iter->k.type = KEY_TYPE_deleted;
		else
			bkey_disassemble(l->b, k, &iter->k);
	}
iter_current_key_not_modified:

	/*
	 * Interior nodes are special because iterators for interior nodes don't
	 * obey the usual invariants regarding the iterator position:
	 *
	 * We may have whiteouts that compare greater than the iterator
	 * position, and logically should be in the iterator, but that we
	 * skipped past to find the first live key greater than the iterator
	 * position. This becomes an issue when we insert a new key that is
	 * greater than the current iterator position, but smaller than the
	 * whiteouts we've already skipped past - this happens in the course of
	 * a btree split.
	 *
	 * We have to rewind the iterator past to before those whiteouts here,
	 * else bkey_node_iter_prev() is not going to work and who knows what
	 * else would happen. And we have to do it manually, because here we've
	 * already done the insert and the iterator is currently inconsistent:
	 *
	 * We've got multiple competing invariants, here - we have to be careful
	 * about rewinding iterators for interior nodes, because they should
	 * always point to the key for the child node the btree iterator points
	 * to.
	 */
	if (b->c.level && new_u64s &&
	    btree_iter_pos_cmp(iter, b, where) > 0) {
		struct bset_tree *t, *where_set = bch2_bkey_to_bset_inlined(b, where);
		struct bkey_packed *k;

		for_each_bset(b, t) {
			if (where_set == t)
				continue;

			k = bch2_bkey_prev_all(b, t,
				bch2_btree_node_iter_bset_pos(node_iter, b, t));
			if (k &&
			    bkey_iter_cmp(b, k, where) > 0) {
				struct btree_node_iter_set *set;
				unsigned offset =
					__btree_node_key_to_offset(b, bkey_next(k));

				btree_node_iter_for_each(node_iter, set)
					if (set->k == offset) {
						set->k = __btree_node_key_to_offset(b, k);
						bch2_btree_node_iter_sort(node_iter, b);
						goto next_bset;
					}

				bch2_btree_node_iter_push(node_iter, b, k,
						btree_bkey_last(b, t));
			}
next_bset:
			t = t;	/* no-op: a label must precede a statement */
		}
	}
}
627 | ||
/*
 * Public entry point: fix up @node_iter plus the node iterators of every
 * other iterator in the transaction that points at @b, after a modification
 * at @where.  @node_iter is fixed separately only when it isn't one of the
 * iterators' own per-level iterators (which are covered by the loop).
 */
void bch2_btree_node_iter_fix(struct btree_iter *iter,
			      struct btree *b,
			      struct btree_node_iter *node_iter,
			      struct bkey_packed *where,
			      unsigned clobber_u64s,
			      unsigned new_u64s)
{
	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
	struct btree_iter *linked;

	if (node_iter != &iter->l[b->c.level].iter)
		__bch2_btree_node_iter_fix(iter, b, node_iter, t,
					   where, clobber_u64s, new_u64s);

	trans_for_each_iter_with_node(iter->trans, b, linked)
		__bch2_btree_node_iter_fix(linked, b,
					   &linked->l[b->c.level].iter, t,
					   where, clobber_u64s, new_u64s);
}
647 | ||
/*
 * Unpack @k (from level @l's node) into *@u and return it as a bkey_s_c.
 * A NULL @k yields bkey_s_c_null with u->type set to KEY_TYPE_deleted.
 */
static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
						  struct btree_iter_level *l,
						  struct bkey *u,
						  struct bkey_packed *k)
{
	struct bkey_s_c ret;

	if (unlikely(!k)) {
		/*
		 * signal to bch2_btree_iter_peek_slot() that we're currently at
		 * a hole
		 */
		u->type = KEY_TYPE_deleted;
		return bkey_s_c_null;
	}

	ret = bkey_disassemble(l->b, k, u);

	if (debug_check_bkeys(iter->trans->c))
		bch2_bkey_debugcheck(iter->trans->c, l->b, ret);

	return ret;
}
671 | ||
/* peek_all() doesn't skip deleted keys */
static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
						    struct btree_iter_level *l,
						    struct bkey *u)
{
	return __btree_iter_unpack(iter, l, u,
			bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
680 | ||
/* Peek the next live (non-deleted) key at level @l, unpacked into iter->k. */
static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
						struct btree_iter_level *l)
{
	return __btree_iter_unpack(iter, l, &iter->k,
			bch2_btree_node_iter_peek(&l->iter, l->b));
}
687 | ||
a00fd8c5 KO |
688 | static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, |
689 | struct btree_iter_level *l, | |
690 | int max_advance) | |
1c6fdbd8 | 691 | { |
a00fd8c5 KO |
692 | struct bkey_packed *k; |
693 | int nr_advanced = 0; | |
694 | ||
695 | while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && | |
696 | btree_iter_pos_cmp(iter, l->b, k) < 0) { | |
697 | if (max_advance > 0 && nr_advanced >= max_advance) | |
698 | return false; | |
699 | ||
700 | bch2_btree_node_iter_advance(&l->iter, l->b); | |
701 | nr_advanced++; | |
702 | } | |
703 | ||
704 | return true; | |
1c6fdbd8 KO |
705 | } |
706 | ||
707 | /* | |
708 | * Verify that iterator for parent node points to child node: | |
709 | */ | |
710 | static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) | |
711 | { | |
712 | struct btree_iter_level *l; | |
713 | unsigned plevel; | |
714 | bool parent_locked; | |
715 | struct bkey_packed *k; | |
716 | ||
717 | if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) | |
718 | return; | |
719 | ||
c43a6ef9 | 720 | plevel = b->c.level + 1; |
1c6fdbd8 KO |
721 | if (!btree_iter_node(iter, plevel)) |
722 | return; | |
723 | ||
724 | parent_locked = btree_node_locked(iter, plevel); | |
725 | ||
726 | if (!bch2_btree_node_relock(iter, plevel)) | |
727 | return; | |
728 | ||
729 | l = &iter->l[plevel]; | |
730 | k = bch2_btree_node_iter_peek_all(&l->iter, l->b); | |
731 | if (!k || | |
732 | bkey_deleted(k) || | |
733 | bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { | |
734 | char buf[100]; | |
735 | struct bkey uk = bkey_unpack_key(b, k); | |
736 | ||
319f9ac3 | 737 | bch2_bkey_to_text(&PBUF(buf), &uk); |
1c6fdbd8 KO |
738 | panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", |
739 | buf, b->key.k.p.inode, b->key.k.p.offset); | |
740 | } | |
741 | ||
742 | if (!parent_locked) | |
c43a6ef9 | 743 | btree_node_unlock(iter, b->c.level + 1); |
1c6fdbd8 KO |
744 | } |
745 | ||
1c6fdbd8 KO |
/*
 * True if the iterator position is past everything in node @b, i.e. past
 * b->key (the node's max key).  NOTE(review): passes NULL for the node
 * argument because &b->key is already unpacked — presumably
 * bkey_cmp_left_packed() never touches the format for unpacked keys;
 * confirm against its definition.
 */
static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
					     struct btree *b)
{
	return __btree_iter_pos_cmp(iter, NULL,
			bkey_to_packed(&b->key), true) < 0;
}
752 | ||
753 | static inline bool btree_iter_pos_in_node(struct btree_iter *iter, | |
754 | struct btree *b) | |
755 | { | |
c43a6ef9 | 756 | return iter->btree_id == b->c.btree_id && |
1c6fdbd8 KO |
757 | bkey_cmp(iter->pos, b->data->min_key) >= 0 && |
758 | !btree_iter_pos_after_node(iter, b); | |
759 | } | |
760 | ||
/*
 * (Re)initialize the node iterator for @level at the current iterator
 * position, applying the extents and interior-node positioning rules.
 */
static inline void __btree_iter_init(struct btree_iter *iter,
				     unsigned level)
{
	struct btree_iter_level *l = &iter->l[level];

	bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos);

	/* extents want the first key strictly after pos: */
	if (iter->flags & BTREE_ITER_IS_EXTENTS)
		btree_iter_advance_to_pos(iter, l, -1);

	/* Skip to first non whiteout: */
	if (level)
		bch2_btree_node_iter_peek(&l->iter, l->b);

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
777 | ||
/*
 * Point the iterator at node @b for b's level: record the lock sequence
 * number (so we can relock later) and initialize the node iterator.
 * Caller must hold a lock on @b (the seq must be even, i.e. not
 * write-locked).
 */
static inline void btree_iter_node_set(struct btree_iter *iter,
				       struct btree *b)
{
	btree_iter_verify_new_node(iter, b);

	EBUG_ON(!btree_iter_pos_in_node(iter, b));
	EBUG_ON(b->c.lock.state.seq & 1);

	iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
	iter->l[b->c.level].b = b;
	__btree_iter_init(iter, b->c.level);
}
790 | ||
/*
 * A btree node is being replaced - update the iterator to point to the new
 * node:
 *
 * Caller holds an intent lock on @b, which is dropped on return.  Each
 * iterator whose position is inside @b re-acquires its wanted lock type by
 * bumping @b's lock counts (six_lock_increment).
 */
void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
{
	enum btree_node_locked_type t;
	struct btree_iter *linked;

	trans_for_each_iter(iter->trans, linked)
		if (btree_iter_pos_in_node(linked, b)) {
			/*
			 * bch2_btree_iter_node_drop() has already been called -
			 * the old node we're replacing has already been
			 * unlocked and the pointer invalidated
			 */
			BUG_ON(btree_node_locked(linked, b->c.level));

			t = btree_lock_want(linked, b->c.level);
			if (t != BTREE_NODE_UNLOCKED) {
				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
				mark_btree_node_locked(linked, b->c.level, (enum six_lock_type) t);
			}

			btree_iter_node_set(linked, b);
		}

	six_unlock_intent(&b->c.lock);
}
820 | ||
/*
 * Node @b is going away: unlock it and replace the pointer with the
 * NO_NODE_DROP sentinel in every iterator that references it.
 */
void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;
	unsigned level = b->c.level;

	trans_for_each_iter(iter->trans, linked)
		if (linked->l[level].b == b) {
			__btree_node_unlock(linked, level);
			linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
		}
}
832 | ||
/*
 * A btree node has been modified in such a way as to invalidate iterators - fix
 * them:
 */
void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;

	trans_for_each_iter_with_node(iter->trans, b, linked)
		__btree_iter_init(linked, b->c.level);
}
844 | ||
/*
 * Lock the btree root, retrying as long as the root changes underneath us.
 * Returns 0 on success, 1 if the tree is shallower than @depth_want (end of
 * btree), or -EINTR if taking the lock would deadlock.
 */
static inline int btree_iter_lock_root(struct btree_iter *iter,
				       unsigned depth_want)
{
	struct bch_fs *c = iter->trans->c;
	struct btree *b;
	enum six_lock_type lock_type;
	unsigned i;

	EBUG_ON(iter->nodes_locked);

	while (1) {
		b = READ_ONCE(c->btree_roots[iter->btree_id].b);
		iter->level = READ_ONCE(b->c.level);

		if (unlikely(iter->level < depth_want)) {
			/*
			 * the root is at a lower depth than the depth we want:
			 * got to the end of the btree, or we're walking nodes
			 * greater than some depth and there are no nodes >=
			 * that depth
			 */
			iter->level = depth_want;
			for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
				iter->l[i].b = NULL;
			return 1;
		}

		lock_type = __btree_lock_want(iter, iter->level);
		if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
					      iter, lock_type)))
			return -EINTR;

		/* still the root, at the level we sampled? then we're done: */
		if (likely(b == c->btree_roots[iter->btree_id].b &&
			   b->c.level == iter->level &&
			   !race_fault())) {
			for (i = 0; i < iter->level; i++)
				iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
			iter->l[iter->level].b = b;
			for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
				iter->l[i].b = NULL;

			mark_btree_node_locked(iter, iter->level, lock_type);
			btree_iter_node_set(iter, b);
			return 0;
		}

		/* root changed while we were taking the lock - retry: */
		six_unlock_type(&b->c.lock, lock_type);
	}
}
894 | ||
895 | noinline | |
896 | static void btree_iter_prefetch(struct btree_iter *iter) | |
897 | { | |
9e5e5b9e | 898 | struct bch_fs *c = iter->trans->c; |
1c6fdbd8 KO |
899 | struct btree_iter_level *l = &iter->l[iter->level]; |
900 | struct btree_node_iter node_iter = l->iter; | |
901 | struct bkey_packed *k; | |
902 | BKEY_PADDED(k) tmp; | |
9e5e5b9e | 903 | unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) |
1c6fdbd8 KO |
904 | ? (iter->level > 1 ? 0 : 2) |
905 | : (iter->level > 1 ? 1 : 16); | |
906 | bool was_locked = btree_node_locked(iter, iter->level); | |
907 | ||
908 | while (nr) { | |
909 | if (!bch2_btree_node_relock(iter, iter->level)) | |
910 | return; | |
911 | ||
912 | bch2_btree_node_iter_advance(&node_iter, l->b); | |
913 | k = bch2_btree_node_iter_peek(&node_iter, l->b); | |
914 | if (!k) | |
915 | break; | |
916 | ||
917 | bch2_bkey_unpack(l->b, &tmp.k, k); | |
9e5e5b9e | 918 | bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); |
1c6fdbd8 KO |
919 | } |
920 | ||
921 | if (!was_locked) | |
922 | btree_node_unlock(iter, iter->level); | |
923 | } | |
924 | ||
/*
 * Descend one level: unpack the child pointer the node iterator points at,
 * look up and lock the child node, and make it the iterator's current level.
 *
 * Returns 0 on success or a negative error from bch2_btree_node_get()
 * (e.g. -EINTR on lock restart).
 */
static inline int btree_iter_down(struct btree_iter *iter)
{
	struct bch_fs *c = iter->trans->c;
	struct btree_iter_level *l = &iter->l[iter->level];
	struct btree *b;
	unsigned level = iter->level - 1;
	enum six_lock_type lock_type = __btree_lock_want(iter, level);
	BKEY_PADDED(k) tmp;

	BUG_ON(!btree_node_locked(iter, iter->level));

	bch2_bkey_unpack(l->b, &tmp.k,
			 bch2_btree_node_iter_peek(&l->iter, l->b));

	b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type);
	if (unlikely(IS_ERR(b)))
		return PTR_ERR(b);

	mark_btree_node_locked(iter, level, lock_type);
	btree_iter_node_set(iter, b);

	if (iter->flags & BTREE_ITER_PREFETCH)
		btree_iter_prefetch(iter);

	iter->level = level;

	return 0;
}
953 | ||
954 | static void btree_iter_up(struct btree_iter *iter) | |
955 | { | |
956 | btree_node_unlock(iter, iter->level++); | |
957 | } | |
958 | ||
959 | int __must_check __bch2_btree_iter_traverse(struct btree_iter *); | |
960 | ||
bf7b87a4 | 961 | static int __btree_iter_traverse_all(struct btree_trans *trans, |
37dd7834 | 962 | struct btree_iter *orig_iter, int ret) |
1c6fdbd8 | 963 | { |
e542029e | 964 | struct bch_fs *c = trans->c; |
37dd7834 | 965 | struct btree_iter *iter; |
e542029e KO |
966 | u8 sorted[BTREE_ITER_MAX]; |
967 | unsigned i, nr_sorted = 0; | |
968 | ||
969 | trans_for_each_iter(trans, iter) | |
970 | sorted[nr_sorted++] = iter - trans->iters; | |
971 | ||
972 | #define btree_iter_cmp_by_idx(_l, _r) \ | |
973 | btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) | |
974 | ||
975 | bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); | |
976 | #undef btree_iter_cmp_by_idx | |
977 | ||
1c6fdbd8 | 978 | retry_all: |
58fbf808 | 979 | bch2_trans_unlock(trans); |
1c6fdbd8 | 980 | |
bf7b87a4 | 981 | if (unlikely(ret == -ENOMEM)) { |
1c6fdbd8 KO |
982 | struct closure cl; |
983 | ||
984 | closure_init_stack(&cl); | |
985 | ||
986 | do { | |
987 | ret = bch2_btree_cache_cannibalize_lock(c, &cl); | |
988 | closure_sync(&cl); | |
989 | } while (ret); | |
990 | } | |
991 | ||
bf7b87a4 | 992 | if (unlikely(ret == -EIO)) { |
ece254b2 | 993 | trans->error = true; |
37dd7834 KO |
994 | orig_iter->flags |= BTREE_ITER_ERROR; |
995 | orig_iter->l[orig_iter->level].b = BTREE_ITER_NO_NODE_ERROR; | |
bf7b87a4 KO |
996 | goto out; |
997 | } | |
998 | ||
999 | BUG_ON(ret && ret != -EINTR); | |
1000 | ||
1c6fdbd8 | 1001 | /* Now, redo traversals in correct order: */ |
e542029e KO |
1002 | for (i = 0; i < nr_sorted; i++) { |
1003 | iter = &trans->iters[sorted[i]]; | |
1c6fdbd8 | 1004 | |
e542029e KO |
1005 | do { |
1006 | ret = __bch2_btree_iter_traverse(iter); | |
1007 | } while (ret == -EINTR); | |
1c6fdbd8 | 1008 | |
e542029e KO |
1009 | if (ret) |
1010 | goto retry_all; | |
1011 | } | |
1c6fdbd8 | 1012 | |
932aa837 | 1013 | ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0; |
1c6fdbd8 KO |
1014 | out: |
1015 | bch2_btree_cache_cannibalize_unlock(c); | |
1016 | return ret; | |
bf7b87a4 | 1017 | } |
1c6fdbd8 | 1018 | |
bf7b87a4 KO |
1019 | int bch2_btree_iter_traverse_all(struct btree_trans *trans) |
1020 | { | |
1021 | return __btree_iter_traverse_all(trans, NULL, 0); | |
1c6fdbd8 KO |
1022 | } |
1023 | ||
/*
 * Walk up the iterator's levels, dropping any level whose node is invalid,
 * can't be relocked, or (if @check_pos) no longer contains iter->pos.
 * Returns the first level that is still validly locked (or the level above
 * the last one examined).
 */
static unsigned btree_iter_up_until_locked(struct btree_iter *iter,
					   bool check_pos)
{
	unsigned l = iter->level;

	while (btree_iter_node(iter, l) &&
	       (!is_btree_node(iter, l) ||
		!bch2_btree_node_relock(iter, l) ||
		(check_pos &&
		 !btree_iter_pos_in_node(iter, iter->l[l].b)))) {
		btree_node_unlock(iter, l);
		iter->l[l].b = BTREE_ITER_NO_NODE_UP;
		l++;
	}

	return l;
}
1041 | ||
/*
 * This is the main state machine for walking down the btree - walks down to a
 * specified depth
 *
 * Returns 0 on success, -EIO on error (error reading in a btree node).
 *
 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
 * stashed in the iterator and returned from bch2_trans_exit().
 */
int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
	unsigned depth_want = iter->level;

	if (unlikely(iter->level >= BTREE_MAX_DEPTH))
		return 0;

	/* Fast path: everything we need is still locked (or relockable): */
	if (bch2_btree_iter_relock(iter, false))
		return 0;

	/*
	 * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
	 * here unnecessary
	 */
	iter->level = btree_iter_up_until_locked(iter, true);

	/*
	 * If we've got a btree node locked (i.e. we aren't about to relock the
	 * root) - advance its node iterator if necessary:
	 *
	 * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
	 */
	if (btree_iter_node(iter, iter->level))
		btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);

	/*
	 * Note: iter->nodes[iter->level] may be temporarily NULL here - that
	 * would indicate to other code that we got to the end of the btree,
	 * here it indicates that relocking the root failed - it's critical that
	 * btree_iter_lock_root() comes next and that it can't fail
	 */
	while (iter->level > depth_want) {
		int ret = btree_iter_node(iter, iter->level)
			? btree_iter_down(iter)
			: btree_iter_lock_root(iter, depth_want);
		if (unlikely(ret)) {
			/* ret == 1: btree shallower than depth_want, done */
			if (ret == 1)
				return 0;

			iter->level = depth_want;
			iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN;
			return ret;
		}
	}

	iter->uptodate = BTREE_ITER_NEED_PEEK;

	bch2_btree_trans_verify_locks(iter->trans);
	__bch2_btree_iter_verify(iter, iter->l[iter->level].b);
	return 0;
}
1102 | ||
/*
 * Traverse @iter to its current position; on failure, fall back to
 * re-traversing every iterator in the transaction in sorted order
 * (which also handles -ENOMEM/-EIO recovery).
 */
int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
{
	int ret;

	ret =   bch2_trans_cond_resched(iter->trans) ?:
		__bch2_btree_iter_traverse(iter);
	if (unlikely(ret))
		ret = __btree_iter_traverse_all(iter->trans, iter, ret);

	return ret;
}
1114 | ||
/*
 * Debug assertions run at the top of the public iterator entry points:
 * sane btree id, extents flag consistent with the iterator type, and
 * transaction lock invariants hold.
 */
static inline void bch2_btree_iter_checks(struct btree_iter *iter,
					  enum btree_iter_type type)
{
	EBUG_ON(iter->btree_id >= BTREE_ID_NR);
	EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
		(btree_node_type_is_extents(iter->btree_id) &&
		 type != BTREE_ITER_NODES));

	bch2_btree_trans_verify_locks(iter->trans);
}
1125 | ||
1126 | /* Iterate across nodes (leaf and interior nodes) */ | |
1127 | ||
/*
 * Return the btree node (not key) at the iterator's current position,
 * traversing first if needed.  Returns NULL at end of btree or on
 * traverse error (error is stashed in the iterator).
 */
struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
{
	struct btree *b;
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_NODES);

	if (iter->uptodate == BTREE_ITER_UPTODATE)
		return iter->l[iter->level].b;

	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		return NULL;

	b = btree_iter_node(iter, iter->level);
	if (!b)
		return NULL;

	BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);

	iter->pos	= b->key.k.p;
	iter->uptodate	= BTREE_ITER_UPTODATE;

	return b;
}
1153 | ||
/*
 * Advance to the next btree node at @depth: pop up one level, traverse, and
 * if the parent isn't exhausted walk back down to the next child.  Returns
 * NULL at end of btree or on traverse error.
 */
struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
{
	struct btree *b;
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_NODES);

	/* already got to end? */
	if (!btree_iter_node(iter, iter->level))
		return NULL;

	bch2_trans_cond_resched(iter->trans);

	btree_iter_up(iter);

	if (!bch2_btree_node_relock(iter, iter->level))
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);

	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		return NULL;

	/* got to end? */
	b = btree_iter_node(iter, iter->level);
	if (!b)
		return NULL;

	if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
		/*
		 * Haven't gotten to the end of the parent node: go back down to
		 * the next child node
		 */

		/*
		 * We don't really want to be unlocking here except we can't
		 * directly tell btree_iter_traverse() "traverse to this level"
		 * except by setting iter->level, so we have to unlock so we
		 * don't screw up our lock invariants:
		 */
		if (btree_node_read_locked(iter, iter->level))
			btree_node_unlock(iter, iter->level);

		/* ick: */
		iter->pos	= iter->btree_id == BTREE_ID_INODES
			? btree_type_successor(iter->btree_id, iter->pos)
			: bkey_successor(iter->pos);
		iter->level	= depth;

		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		ret = bch2_btree_iter_traverse(iter);
		if (ret)
			return NULL;

		b = iter->l[iter->level].b;
	}

	iter->pos	= b->key.k.p;
	iter->uptodate	= BTREE_ITER_UPTODATE;

	return b;
}
1215 | ||
1216 | /* Iterate across keys (in leaf nodes only) */ | |
1217 | ||
/*
 * Fast-path position update when the caller knows @new_pos is still within
 * the currently locked leaf: just advance the node iterator, no re-traverse
 * unless we ran off the end of the node.
 */
void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
{
	struct btree_iter_level *l = &iter->l[0];

	EBUG_ON(iter->level != 0);
	EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
	EBUG_ON(!btree_node_locked(iter, 0));
	EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);

	iter->pos = new_pos;
	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

	btree_iter_advance_to_pos(iter, l, -1);

	if (bch2_btree_node_iter_end(&l->iter) &&
	    btree_iter_pos_after_node(iter, l->b))
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
}
1236 | ||
/*
 * General position update: pop up until we're at a level whose node still
 * contains the new position, then reposition that node's iterator, marking
 * the iterator dirty as appropriate.
 */
void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
	int cmp = bkey_cmp(new_pos, iter->pos);
	unsigned level;

	if (!cmp)
		return;

	iter->pos = new_pos;

	level = btree_iter_up_until_locked(iter, true);

	if (btree_iter_node(iter, level)) {
		/*
		 * We might have to skip over many keys, or just a few: try
		 * advancing the node iterator, and if we have to skip over too
		 * many keys just reinit it (or if we're rewinding, since that
		 * is expensive).
		 */
		if (cmp < 0 ||
		    !btree_iter_advance_to_pos(iter, &iter->l[level], 8))
			__btree_iter_init(iter, level);

		/* Don't leave it locked if we're not supposed to: */
		if (btree_lock_want(iter, level) == BTREE_NODE_UNLOCKED)
			btree_node_unlock(iter, level);
	}

	if (level != iter->level)
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
	else
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
1270 | ||
/*
 * Fast path for peek when the iterator is already UPTODATE: iter->k is the
 * cached key, and (unless it's a synthesized hole, marked deleted) its value
 * is still at the node iterator's current position.
 */
static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c ret = { .k = &iter->k };

	if (!bkey_deleted(&iter->k)) {
		EBUG_ON(bch2_btree_node_iter_end(&l->iter));
		ret.v = bkeyp_val(&l->b->format,
			__bch2_btree_node_iter_peek_all(&l->iter, l->b));
	}

	if (debug_check_bkeys(iter->trans->c) &&
	    !bkey_deleted(ret.k))
		bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
	return ret;
}
1287 | ||
/*
 * Return the first key at or after iter->pos, advancing across leaf
 * boundaries as needed.  Returns bkey_s_c_null at end of btree, or an
 * error key (bkey_s_c_err) on traverse failure.
 */
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c k;
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);

	if (iter->uptodate == BTREE_ITER_UPTODATE)
		return btree_iter_peek_uptodate(iter);

	while (1) {
		if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) {
			ret = bch2_btree_iter_traverse(iter);
			if (unlikely(ret))
				return bkey_s_c_err(ret);
		}

		k = __btree_iter_peek(iter, l);
		if (likely(k.k))
			break;

		/* got to the end of the leaf, iterator needs to be traversed: */
		iter->pos	= l->b->key.k.p;
		iter->uptodate	= BTREE_ITER_NEED_TRAVERSE;

		if (!bkey_cmp(iter->pos, POS_MAX))
			return bkey_s_c_null;

		iter->pos = btree_type_successor(iter->btree_id, iter->pos);
	}

	/*
	 * iter->pos should always be equal to the key we just
	 * returned - except extents can straddle iter->pos:
	 */
	if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
	    bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
		iter->pos = bkey_start_pos(k.k);

	iter->uptodate = BTREE_ITER_UPTODATE;
	return k;
}
1331 | ||
/*
 * Slow path for bch2_btree_iter_next(): the current leaf is exhausted, so
 * step the position past it and peek again (which re-traverses).
 */
static noinline
struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];

	iter->pos	= l->b->key.k.p;
	iter->uptodate	= BTREE_ITER_NEED_TRAVERSE;

	if (!bkey_cmp(iter->pos, POS_MAX))
		return bkey_s_c_null;

	iter->pos = btree_type_successor(iter->btree_id, iter->pos);

	return bch2_btree_iter_peek(iter);
}
1347 | ||
/*
 * Advance past the current key and return the next one, skipping whiteouts.
 * Falls back to peek() (with a full traverse) when the iterator isn't
 * UPTODATE, or to the next-leaf slow path at the end of the node.
 */
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_packed *p;
	struct bkey_s_c k;

	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);

	iter->pos = btree_type_successor(iter->btree_id, iter->k.p);

	if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
		/*
		 * XXX: when we just need to relock we should be able to avoid
		 * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
		 * for that to work
		 */
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);

		return bch2_btree_iter_peek(iter);
	}

	do {
		bch2_btree_node_iter_advance(&l->iter, l->b);
		p = bch2_btree_node_iter_peek_all(&l->iter, l->b);
		if (unlikely(!p))
			return bch2_btree_iter_peek_next_leaf(iter);
	} while (bkey_whiteout(p));

	k = __btree_iter_unpack(iter, l, &iter->k, p);

	EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) < 0);
	iter->pos = bkey_start_pos(k.k);
	return k;
}
1382 | ||
/*
 * Return the previous key before iter->pos, walking back a leaf at a time
 * as needed.  Returns bkey_s_c_null at the start of the btree, or an error
 * key on traverse failure.
 */
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_packed *p;
	struct bkey_s_c k;
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_KEYS);

	if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
		/* Re-establish position (and locks) first: */
		k = bch2_btree_iter_peek(iter);
		if (IS_ERR(k.k))
			return k;
	}

	while (1) {
		p = bch2_btree_node_iter_prev(&l->iter, l->b);
		if (likely(p))
			break;

		/* Hit the start of this leaf: move to the previous one */
		iter->pos = l->b->data->min_key;
		if (!bkey_cmp(iter->pos, POS_MIN))
			return bkey_s_c_null;

		bch2_btree_iter_set_pos(iter,
			btree_type_predecessor(iter->btree_id, iter->pos));

		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);

		p = bch2_btree_node_iter_peek(&l->iter, l->b);
		if (p)
			break;
	}

	k = __btree_iter_unpack(iter, l, &iter->k, p);

	EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);

	iter->pos	= bkey_start_pos(k.k);
	iter->uptodate	= BTREE_ITER_UPTODATE;
	return k;
}
1427 | ||
/*
 * Slot-peek for extent btrees: return the extent covering iter->pos, or a
 * synthesized zero-size-capped hole key (stored in iter->k, value NULL) that
 * spans from iter->pos up to the next real key / end of the inode.
 */
static inline struct bkey_s_c
__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct btree_node_iter node_iter;
	struct bkey_s_c k;
	struct bkey n;
	int ret;

recheck:
	/* skip deleted keys that start exactly at iter->pos: */
	while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
	       bkey_deleted(k.k) &&
	       bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
		bch2_btree_node_iter_advance(&l->iter, l->b);

	/*
	 * iterator is now at the correct position for inserting at iter->pos,
	 * but we need to keep iterating until we find the first non whiteout so
	 * we know how big a hole we have, if any:
	 */

	node_iter = l->iter;
	if (k.k && bkey_whiteout(k.k))
		k = __btree_iter_unpack(iter, l, &iter->k,
			bch2_btree_node_iter_peek(&node_iter, l->b));

	/*
	 * If we got to the end of the node, check if we need to traverse to the
	 * next node:
	 */
	if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);

		goto recheck;
	}

	if (k.k &&
	    !bkey_whiteout(k.k) &&
	    bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
		/*
		 * if we skipped forward to find the first non whiteout and
		 * there _wasn't_ actually a hole, we want the iterator to be
		 * pointed at the key we found:
		 */
		l->iter = node_iter;

		EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
		EBUG_ON(bkey_deleted(k.k));
		iter->uptodate = BTREE_ITER_UPTODATE;
		return k;
	}

	/* hole */

	/* holes can't span inode numbers: */
	if (iter->pos.offset == KEY_OFFSET_MAX) {
		if (iter->pos.inode == KEY_INODE_MAX)
			return bkey_s_c_null;

		iter->pos = bkey_successor(iter->pos);
		goto recheck;
	}

	/* no next key at all: bound the hole by the node's max key */
	if (!k.k)
		k.k = &l->b->key.k;

	bkey_init(&n);
	n.p = iter->pos;
	bch2_key_resize(&n,
			min_t(u64, KEY_SIZE_MAX,
			      (k.k->p.inode == n.p.inode
			       ? bkey_start_offset(k.k)
			       : KEY_OFFSET_MAX) -
			      n.p.offset));

	EBUG_ON(!n.size);

	iter->k	= n;
	iter->uptodate = BTREE_ITER_UPTODATE;
	return (struct bkey_s_c) { &iter->k, NULL };
}
1c6fdbd8 | 1512 | |
271a3d3a KO |
1513 | static inline struct bkey_s_c |
1514 | __bch2_btree_iter_peek_slot(struct btree_iter *iter) | |
1515 | { | |
1516 | struct btree_iter_level *l = &iter->l[0]; | |
1517 | struct bkey_s_c k; | |
1518 | int ret; | |
1c6fdbd8 | 1519 | |
271a3d3a KO |
1520 | if (iter->flags & BTREE_ITER_IS_EXTENTS) |
1521 | return __bch2_btree_iter_peek_slot_extents(iter); | |
1c6fdbd8 | 1522 | |
271a3d3a KO |
1523 | recheck: |
1524 | while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k && | |
1525 | bkey_deleted(k.k) && | |
1526 | bkey_cmp(k.k->p, iter->pos) == 0) | |
a00fd8c5 | 1527 | bch2_btree_node_iter_advance(&l->iter, l->b); |
1c6fdbd8 | 1528 | |
271a3d3a KO |
1529 | /* |
1530 | * If we got to the end of the node, check if we need to traverse to the | |
1531 | * next node: | |
1532 | */ | |
1533 | if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) { | |
1534 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); | |
1535 | ret = bch2_btree_iter_traverse(iter); | |
1536 | if (unlikely(ret)) | |
1537 | return bkey_s_c_err(ret); | |
1c6fdbd8 | 1538 | |
271a3d3a | 1539 | goto recheck; |
1c6fdbd8 KO |
1540 | } |
1541 | ||
271a3d3a KO |
1542 | if (k.k && |
1543 | !bkey_deleted(k.k) && | |
1544 | !bkey_cmp(iter->pos, k.k->p)) { | |
1545 | iter->uptodate = BTREE_ITER_UPTODATE; | |
1546 | return k; | |
1547 | } else { | |
1548 | /* hole */ | |
1549 | bkey_init(&iter->k); | |
1550 | iter->k.p = iter->pos; | |
1551 | ||
1552 | iter->uptodate = BTREE_ITER_UPTODATE; | |
1553 | return (struct bkey_s_c) { &iter->k, NULL }; | |
1554 | } | |
1c6fdbd8 KO |
1555 | } |
1556 | ||
/*
 * Public slot-peek entry point: traverse if needed, then return the key (or
 * synthesized hole) at iter->pos.
 */
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
{
	int ret;

	bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);

	if (iter->uptodate == BTREE_ITER_UPTODATE)
		return btree_iter_peek_uptodate(iter);

	if (iter->uptodate >= BTREE_ITER_NEED_RELOCK) {
		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);
	}

	return __bch2_btree_iter_peek_slot(iter);
}
1574 | ||
/*
 * Advance to the next slot and peek it.  Only advances the node iterator past
 * the current key if iter->k was a real key (not a synthesized hole, which is
 * marked deleted and has no backing entry in the node).
 */
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
	bch2_btree_iter_checks(iter, BTREE_ITER_SLOTS);

	iter->pos = btree_type_successor(iter->btree_id, iter->k.p);

	if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
		/*
		 * XXX: when we just need to relock we should be able to avoid
		 * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
		 * for that to work
		 */
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);

		return bch2_btree_iter_peek_slot(iter);
	}

	if (!bkey_deleted(&iter->k))
		bch2_btree_node_iter_advance(&iter->l[0].iter, iter->l[0].b);

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

	return __bch2_btree_iter_peek_slot(iter);
}
1599 | ||
9e5e5b9e KO |
1600 | static inline void bch2_btree_iter_init(struct btree_trans *trans, |
1601 | struct btree_iter *iter, enum btree_id btree_id, | |
424eb881 | 1602 | struct bpos pos, unsigned flags) |
1c6fdbd8 | 1603 | { |
9e5e5b9e | 1604 | struct bch_fs *c = trans->c; |
1c6fdbd8 KO |
1605 | unsigned i; |
1606 | ||
ab5c63f5 | 1607 | if (btree_node_type_is_extents(btree_id) && |
424eb881 KO |
1608 | !(flags & BTREE_ITER_NODES)) |
1609 | flags |= BTREE_ITER_IS_EXTENTS; | |
1c6fdbd8 | 1610 | |
9e5e5b9e | 1611 | iter->trans = trans; |
1c6fdbd8 KO |
1612 | iter->pos = pos; |
1613 | bkey_init(&iter->k); | |
1614 | iter->k.p = pos; | |
1615 | iter->flags = flags; | |
1616 | iter->uptodate = BTREE_ITER_NEED_TRAVERSE; | |
1617 | iter->btree_id = btree_id; | |
424eb881 KO |
1618 | iter->level = 0; |
1619 | iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0; | |
1c6fdbd8 KO |
1620 | iter->nodes_locked = 0; |
1621 | iter->nodes_intent_locked = 0; | |
1622 | for (i = 0; i < ARRAY_SIZE(iter->l); i++) | |
1623 | iter->l[i].b = NULL; | |
ed8413fd | 1624 | iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; |
1c6fdbd8 KO |
1625 | |
1626 | prefetch(c->btree_roots[btree_id].b); | |
1627 | } | |
1628 | ||
1c6fdbd8 KO |
1629 | /* new transactional stuff: */ |
1630 | ||
424eb881 KO |
1631 | int bch2_trans_iter_put(struct btree_trans *trans, |
1632 | struct btree_iter *iter) | |
08af47df | 1633 | { |
0f238367 | 1634 | int ret = btree_iter_err(iter); |
08af47df | 1635 | |
e1120a4c | 1636 | trans->iters_live &= ~(1ULL << iter->idx); |
424eb881 | 1637 | return ret; |
08af47df KO |
1638 | } |
1639 | ||
5df4be3f KO |
1640 | static inline void __bch2_trans_iter_free(struct btree_trans *trans, |
1641 | unsigned idx) | |
1642 | { | |
ecc892e4 | 1643 | __bch2_btree_iter_unlock(&trans->iters[idx]); |
5df4be3f KO |
1644 | trans->iters_linked &= ~(1ULL << idx); |
1645 | trans->iters_live &= ~(1ULL << idx); | |
1646 | trans->iters_touched &= ~(1ULL << idx); | |
1647 | trans->iters_unlink_on_restart &= ~(1ULL << idx); | |
1648 | trans->iters_unlink_on_commit &= ~(1ULL << idx); | |
5df4be3f KO |
1649 | } |
1650 | ||
424eb881 KO |
1651 | int bch2_trans_iter_free(struct btree_trans *trans, |
1652 | struct btree_iter *iter) | |
1c6fdbd8 | 1653 | { |
0f238367 | 1654 | int ret = btree_iter_err(iter); |
424eb881 | 1655 | |
e1120a4c | 1656 | __bch2_trans_iter_free(trans, iter->idx); |
424eb881 | 1657 | return ret; |
5df4be3f | 1658 | } |
1c6fdbd8 | 1659 | |
424eb881 KO |
1660 | int bch2_trans_iter_free_on_commit(struct btree_trans *trans, |
1661 | struct btree_iter *iter) | |
5df4be3f | 1662 | { |
0f238367 | 1663 | int ret = btree_iter_err(iter); |
424eb881 | 1664 | |
e1120a4c | 1665 | trans->iters_unlink_on_commit |= 1ULL << iter->idx; |
424eb881 | 1666 | return ret; |
1c6fdbd8 KO |
1667 | } |
1668 | ||
20bceecb | 1669 | static int bch2_trans_realloc_iters(struct btree_trans *trans, |
7d825866 | 1670 | unsigned new_size) |
1c6fdbd8 | 1671 | { |
a8e00bd4 | 1672 | void *new_iters, *new_updates; |
1c6fdbd8 | 1673 | |
7d825866 KO |
1674 | new_size = roundup_pow_of_two(new_size); |
1675 | ||
a8e00bd4 KO |
1676 | BUG_ON(new_size > BTREE_ITER_MAX); |
1677 | ||
1678 | if (new_size <= trans->size) | |
1679 | return 0; | |
1680 | ||
1681 | BUG_ON(trans->used_mempool); | |
1682 | ||
1c6fdbd8 KO |
1683 | bch2_trans_unlock(trans); |
1684 | ||
a8e00bd4 KO |
1685 | new_iters = kmalloc(sizeof(struct btree_iter) * new_size + |
1686 | sizeof(struct btree_insert_entry) * (new_size + 4), | |
1687 | GFP_NOFS); | |
1688 | if (new_iters) | |
1689 | goto success; | |
1690 | ||
581edb63 | 1691 | new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); |
a8e00bd4 KO |
1692 | new_size = BTREE_ITER_MAX; |
1693 | ||
1694 | trans->used_mempool = true; | |
1695 | success: | |
1696 | new_updates = new_iters + sizeof(struct btree_iter) * new_size; | |
1c6fdbd8 KO |
1697 | |
1698 | memcpy(new_iters, trans->iters, | |
1699 | sizeof(struct btree_iter) * trans->nr_iters); | |
a8e00bd4 KO |
1700 | memcpy(new_updates, trans->updates, |
1701 | sizeof(struct btree_insert_entry) * trans->nr_updates); | |
1702 | ||
5df4be3f KO |
1703 | if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) |
1704 | memset(trans->iters, POISON_FREE, | |
1705 | sizeof(struct btree_iter) * trans->nr_iters + | |
1706 | sizeof(struct btree_insert_entry) * trans->nr_iters); | |
1707 | ||
a8e00bd4 KO |
1708 | if (trans->iters != trans->iters_onstack) |
1709 | kfree(trans->iters); | |
1710 | ||
1711 | trans->iters = new_iters; | |
1712 | trans->updates = new_updates; | |
1713 | trans->size = new_size; | |
1c6fdbd8 | 1714 | |
1c7a0adf | 1715 | if (trans->iters_live) { |
20bceecb | 1716 | trace_trans_restart_iters_realloced(trans->ip, trans->size); |
1c7a0adf KO |
1717 | return -EINTR; |
1718 | } | |
1719 | ||
1720 | return 0; | |
1c6fdbd8 KO |
1721 | } |
1722 | ||
7c26ecae KO |
1723 | static int btree_trans_iter_alloc(struct btree_trans *trans) |
1724 | { | |
4c1c1e39 | 1725 | unsigned idx = __ffs64(~trans->iters_linked); |
7c26ecae KO |
1726 | |
1727 | if (idx < trans->nr_iters) | |
1728 | goto got_slot; | |
1729 | ||
1730 | if (trans->nr_iters == trans->size) { | |
7d825866 | 1731 | int ret = bch2_trans_realloc_iters(trans, trans->size * 2); |
7c26ecae KO |
1732 | if (ret) |
1733 | return ret; | |
1734 | } | |
1735 | ||
1736 | idx = trans->nr_iters++; | |
1737 | BUG_ON(trans->nr_iters > trans->size); | |
e1120a4c KO |
1738 | |
1739 | trans->iters[idx].idx = idx; | |
7c26ecae | 1740 | got_slot: |
7c26ecae | 1741 | BUG_ON(trans->iters_linked & (1ULL << idx)); |
7c26ecae KO |
1742 | trans->iters_linked |= 1ULL << idx; |
1743 | return idx; | |
1744 | } | |
1745 | ||
1c6fdbd8 | 1746 | static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, |
5df4be3f | 1747 | unsigned btree_id, struct bpos pos, |
1c6fdbd8 KO |
1748 | unsigned flags, u64 iter_id) |
1749 | { | |
1750 | struct btree_iter *iter; | |
1751 | int idx; | |
1752 | ||
1753 | BUG_ON(trans->nr_iters > BTREE_ITER_MAX); | |
1754 | ||
5df4be3f | 1755 | for (idx = 0; idx < trans->nr_iters; idx++) { |
7c26ecae KO |
1756 | if (!(trans->iters_linked & (1ULL << idx))) |
1757 | continue; | |
1758 | ||
5df4be3f KO |
1759 | iter = &trans->iters[idx]; |
1760 | if (iter_id | |
1761 | ? iter->id == iter_id | |
1762 | : (iter->btree_id == btree_id && | |
1763 | !bkey_cmp(iter->pos, pos))) | |
1c6fdbd8 | 1764 | goto found; |
5df4be3f | 1765 | } |
1c6fdbd8 KO |
1766 | idx = -1; |
1767 | found: | |
1768 | if (idx < 0) { | |
7c26ecae KO |
1769 | idx = btree_trans_iter_alloc(trans); |
1770 | if (idx < 0) | |
1771 | return ERR_PTR(idx); | |
1c6fdbd8 | 1772 | |
1c6fdbd8 | 1773 | iter = &trans->iters[idx]; |
a8e00bd4 | 1774 | iter->id = iter_id; |
1c6fdbd8 | 1775 | |
9e5e5b9e | 1776 | bch2_btree_iter_init(trans, iter, btree_id, pos, flags); |
1c6fdbd8 KO |
1777 | } else { |
1778 | iter = &trans->iters[idx]; | |
1779 | ||
1c6fdbd8 KO |
1780 | iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); |
1781 | iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); | |
1782 | } | |
1783 | ||
5df4be3f | 1784 | BUG_ON(iter->btree_id != btree_id); |
a8e00bd4 | 1785 | BUG_ON(trans->iters_live & (1ULL << idx)); |
5df4be3f KO |
1786 | trans->iters_live |= 1ULL << idx; |
1787 | trans->iters_touched |= 1ULL << idx; | |
1c6fdbd8 | 1788 | |
08af47df KO |
1789 | BUG_ON(iter->btree_id != btree_id); |
1790 | BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); | |
1791 | ||
1c6fdbd8 KO |
1792 | return iter; |
1793 | } | |
1794 | ||
1795 | struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, | |
1796 | enum btree_id btree_id, | |
1797 | struct bpos pos, unsigned flags, | |
1798 | u64 iter_id) | |
1799 | { | |
1800 | struct btree_iter *iter = | |
5df4be3f | 1801 | __btree_trans_get_iter(trans, btree_id, pos, flags, iter_id); |
1c6fdbd8 KO |
1802 | |
1803 | if (!IS_ERR(iter)) | |
1804 | bch2_btree_iter_set_pos(iter, pos); | |
1805 | return iter; | |
1806 | } | |
1807 | ||
424eb881 KO |
1808 | struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, |
1809 | enum btree_id btree_id, | |
1810 | struct bpos pos, | |
1811 | unsigned locks_want, | |
1812 | unsigned depth, | |
1813 | unsigned flags) | |
1814 | { | |
1815 | struct btree_iter *iter = | |
1816 | __btree_trans_get_iter(trans, btree_id, pos, | |
1817 | flags|BTREE_ITER_NODES, 0); | |
1818 | unsigned i; | |
1819 | ||
1820 | BUG_ON(IS_ERR(iter)); | |
1821 | BUG_ON(bkey_cmp(iter->pos, pos)); | |
1822 | ||
1823 | iter->locks_want = locks_want; | |
1824 | iter->level = depth; | |
1825 | ||
1826 | for (i = 0; i < ARRAY_SIZE(iter->l); i++) | |
1827 | iter->l[i].b = NULL; | |
ed8413fd | 1828 | iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; |
424eb881 KO |
1829 | |
1830 | return iter; | |
1831 | } | |
1832 | ||
7c26ecae KO |
1833 | struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans, |
1834 | struct btree_iter *src) | |
1c6fdbd8 | 1835 | { |
ecc892e4 | 1836 | struct btree_iter *iter; |
e1120a4c | 1837 | unsigned offset = offsetof(struct btree_iter, trans); |
ecc892e4 | 1838 | int i, idx; |
1c6fdbd8 | 1839 | |
7c26ecae KO |
1840 | idx = btree_trans_iter_alloc(trans); |
1841 | if (idx < 0) | |
1842 | return ERR_PTR(idx); | |
1843 | ||
1844 | trans->iters_live |= 1ULL << idx; | |
1845 | trans->iters_touched |= 1ULL << idx; | |
1846 | trans->iters_unlink_on_restart |= 1ULL << idx; | |
1847 | ||
ecc892e4 | 1848 | iter = &trans->iters[idx]; |
e1120a4c KO |
1849 | |
1850 | memcpy((void *) iter + offset, | |
1851 | (void *) src + offset, | |
1852 | sizeof(*iter) - offset); | |
ecc892e4 KO |
1853 | |
1854 | for (i = 0; i < BTREE_MAX_DEPTH; i++) | |
1855 | if (btree_node_locked(iter, i)) | |
c43a6ef9 | 1856 | six_lock_increment(&iter->l[i].b->c.lock, |
ecc892e4 | 1857 | __btree_lock_want(iter, i)); |
7c26ecae KO |
1858 | |
1859 | return &trans->iters[idx]; | |
1c6fdbd8 KO |
1860 | } |
1861 | ||
20bceecb | 1862 | static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) |
1c6fdbd8 | 1863 | { |
20bceecb | 1864 | if (size > trans->mem_bytes) { |
1c6fdbd8 | 1865 | size_t old_bytes = trans->mem_bytes; |
20bceecb | 1866 | size_t new_bytes = roundup_pow_of_two(size); |
1c6fdbd8 KO |
1867 | void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); |
1868 | ||
1869 | if (!new_mem) | |
20bceecb | 1870 | return -ENOMEM; |
1c6fdbd8 KO |
1871 | |
1872 | trans->mem = new_mem; | |
1873 | trans->mem_bytes = new_bytes; | |
1874 | ||
1c7a0adf | 1875 | if (old_bytes) { |
20bceecb KO |
1876 | trace_trans_restart_mem_realloced(trans->ip, new_bytes); |
1877 | return -EINTR; | |
1c7a0adf | 1878 | } |
1c6fdbd8 KO |
1879 | } |
1880 | ||
20bceecb KO |
1881 | return 0; |
1882 | } | |
1883 | ||
1884 | void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) | |
1885 | { | |
1886 | void *p; | |
1887 | int ret; | |
1888 | ||
1889 | ret = bch2_trans_preload_mem(trans, trans->mem_top + size); | |
1890 | if (ret) | |
1891 | return ERR_PTR(ret); | |
1892 | ||
1893 | p = trans->mem + trans->mem_top; | |
1c6fdbd8 | 1894 | trans->mem_top += size; |
20bceecb | 1895 | return p; |
1c6fdbd8 KO |
1896 | } |
1897 | ||
5df4be3f KO |
1898 | inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters) |
1899 | { | |
1900 | iters &= trans->iters_linked; | |
4afe7000 | 1901 | iters &= ~trans->iters_live; |
5df4be3f KO |
1902 | |
1903 | while (iters) { | |
1904 | unsigned idx = __ffs64(iters); | |
1905 | ||
1906 | iters &= ~(1ULL << idx); | |
1907 | __bch2_trans_iter_free(trans, idx); | |
1908 | } | |
1909 | } | |
1910 | ||
/*
 * Reset the transaction for reuse after a restart or commit: release stale
 * iterators, zero per-iteration state, and re-traverse.
 */
void bch2_trans_begin(struct btree_trans *trans)
{
	u64 iters_to_unlink;

	/*
	 * On transaction restart, the transaction isn't required to allocate
	 * all the same iterators it allocated on the last iteration:
	 *
	 * Unlink any iterators it didn't use this iteration, assuming it got
	 * further (allocated an iter with a higher idx) than where the iter
	 * was originally allocated:
	 */
	iters_to_unlink = ~trans->iters_live &
		((1ULL << fls64(trans->iters_live)) - 1);

	/* Plus anything explicitly flagged for unlinking: */
	iters_to_unlink |= trans->iters_unlink_on_restart;
	iters_to_unlink |= trans->iters_unlink_on_commit;

	/* Clear first: bch2_trans_unlink_iters() refuses to free live iters */
	trans->iters_live = 0;

	bch2_trans_unlink_iters(trans, iters_to_unlink);

	/* Reset per-iteration state: */
	trans->iters_touched = 0;
	trans->iters_unlink_on_restart = 0;
	trans->iters_unlink_on_commit = 0;
	trans->nr_updates = 0;
	trans->mem_top = 0;	/* releases all bch2_trans_kmalloc() memory */

	bch2_btree_iter_traverse_all(trans);
}
1941 | ||
/*
 * Initialize a btree transaction, pointing its iterator and update arrays at
 * the on-stack storage. @expected_nr_iters/@expected_mem_bytes preallocate
 * capacity up front to avoid -EINTR restarts from mid-transaction reallocs.
 */
void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
		     unsigned expected_nr_iters,
		     size_t expected_mem_bytes)
{
	/* Zero everything up to (not including) the on-stack storage: */
	memset(trans, 0, offsetof(struct btree_trans, iters_onstack));

	trans->c = c;
	trans->ip = _RET_IP_;	/* caller's address, for tracepoints */
	trans->size = ARRAY_SIZE(trans->iters_onstack);
	trans->iters = trans->iters_onstack;
	trans->updates = trans->updates_onstack;
	trans->fs_usage_deltas = NULL;

	/*
	 * Preallocation is best-effort: failures here are deliberately
	 * ignored, the transaction will realloc on demand later (possibly
	 * paying a restart then).
	 */
	if (expected_nr_iters > trans->size)
		bch2_trans_realloc_iters(trans, expected_nr_iters);

	if (expected_mem_bytes)
		bch2_trans_preload_mem(trans, expected_mem_bytes);
}
1961 | ||
1962 | int bch2_trans_exit(struct btree_trans *trans) | |
1963 | { | |
ece254b2 | 1964 | bch2_trans_unlock(trans); |
1c6fdbd8 | 1965 | |
3838be78 | 1966 | kfree(trans->fs_usage_deltas); |
1c6fdbd8 | 1967 | kfree(trans->mem); |
a8e00bd4 | 1968 | if (trans->used_mempool) |
581edb63 | 1969 | mempool_free(trans->iters, &trans->c->btree_iters_pool); |
a8e00bd4 KO |
1970 | else if (trans->iters != trans->iters_onstack) |
1971 | kfree(trans->iters); | |
1c6fdbd8 KO |
1972 | trans->mem = (void *) 0x1; |
1973 | trans->iters = (void *) 0x1; | |
ece254b2 KO |
1974 | |
1975 | return trans->error ? -EIO : 0; | |
1c6fdbd8 | 1976 | } |