Commit | Line | Data
--- | --- | ---
1c6fdbd8 KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include "bcachefs.h" | |
4 | #include "bkey_methods.h" | |
5 | #include "btree_cache.h" | |
6 | #include "btree_iter.h" | |
7 | #include "btree_locking.h" | |
57b0b3db | 8 | #include "btree_update.h" |
1c6fdbd8 KO |
9 | #include "debug.h" |
10 | #include "extents.h" | |
11 | #include "trace.h" | |
12 | ||
13 | #include <linux/prefetch.h> | |
14 | ||
ed8413fd KO |
15 | #define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) |
16 | #define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) | |
17 | #define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) | |
18 | #define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) | |
19 | #define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) | |
20 | #define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) | |
21 | #define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) | |
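/*
 * The defines above are not real pointers: iter->l[level].b is set to one of
 * these small sentinel values to record why there is no node at that level.
 * is_btree_node() below treats anything below 128 (the sentinels as well as
 * NULL) as "not a btree node".
 */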
1c6fdbd8 KO |
22 | |
23 | static inline bool is_btree_node(struct btree_iter *iter, unsigned l) | |
24 | { | |
25 | return l < BTREE_MAX_DEPTH && | |
ed8413fd | 26 | (unsigned long) iter->l[l].b >= 128; |
1c6fdbd8 KO |
27 | } |
28 | ||
9626aeb1 | 29 | static inline struct bpos btree_iter_search_key(struct btree_iter *iter) |
a00fd8c5 | 30 | { |
9626aeb1 | 31 | struct bpos pos = iter->pos; |
cbdf24ce | 32 | |
9626aeb1 KO |
33 | if ((iter->flags & BTREE_ITER_IS_EXTENTS) && |
34 | bkey_cmp(pos, POS_MAX)) | |
35 | pos = bkey_successor(pos); | |
36 | return pos; | |
a00fd8c5 KO |
37 | } |
38 | ||
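/*
 * For illustration (a note added here, not from the original source): extents
 * are indexed by their end position, so an extents iterator searches for the
 * successor of iter->pos. With iter->pos = 4 the search key becomes 5, which
 * still finds an extent stored under key 8 (covering [0, 8)) but correctly
 * skips an extent ending exactly at 4, since that one doesn't overlap
 * position 4.
 */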
e3ecf4f5 KO |
39 | static inline bool btree_iter_pos_before_node(struct btree_iter *iter, |
40 | struct btree *b) | |
41 | { | |
39fb2983 | 42 | return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; |
e3ecf4f5 KO |
43 | } |
44 | ||
45 | static inline bool btree_iter_pos_after_node(struct btree_iter *iter, | |
46 | struct btree *b) | |
47 | { | |
48 | return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; | |
49 | } | |
50 | ||
51 | static inline bool btree_iter_pos_in_node(struct btree_iter *iter, | |
52 | struct btree *b) | |
53 | { | |
54 | return iter->btree_id == b->c.btree_id && | |
55 | !btree_iter_pos_before_node(iter, b) && | |
56 | !btree_iter_pos_after_node(iter, b); | |
57 | } | |
58 | ||
1c6fdbd8 KO |
59 | /* Btree node locking: */ |
60 | ||
1c6fdbd8 KO |
61 | void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) |
62 | { | |
b7ba66c8 | 63 | bch2_btree_node_unlock_write_inlined(b, iter); |
1c6fdbd8 KO |
64 | } |
65 | ||
66 | void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) | |
67 | { | |
1c6fdbd8 KO |
68 | struct btree_iter *linked; |
69 | unsigned readers = 0; | |
70 | ||
1904a65a | 71 | EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); |
1c6fdbd8 | 72 | |
0f238367 | 73 | trans_for_each_iter(iter->trans, linked) |
c43a6ef9 KO |
74 | if (linked->l[b->c.level].b == b && |
75 | btree_node_read_locked(linked, b->c.level)) | |
1c6fdbd8 KO |
76 | readers++; |
77 | ||
78 | /* | |
79 | * Must drop our read locks before calling six_lock_write() - | |
80 | * six_unlock() won't do wakeups until the reader count | |
81 | * goes to 0, and it's safe because we have the node intent | |
82 | * locked: | |
83 | */ | |
84 | atomic64_sub(__SIX_VAL(read_lock, readers), | |
c43a6ef9 | 85 | &b->c.lock.state.counter); |
9e5e5b9e | 86 | btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); |
1c6fdbd8 | 87 | atomic64_add(__SIX_VAL(read_lock, readers), |
c43a6ef9 | 88 | &b->c.lock.state.counter); |
1c6fdbd8 KO |
89 | } |
90 | ||
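/*
 * A note on the atomic64_sub()/atomic64_add() pair above: the read locks
 * being subtracted are our own, held by other iterators in this transaction
 * on the same node. Because we also hold the intent lock, nobody else can
 * take the write lock during the window in which the reader count is
 * artificially lowered, so restoring the count afterwards is safe.
 */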
1c6fdbd8 KO |
91 | bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) |
92 | { | |
93 | struct btree *b = btree_iter_node(iter, level); | |
94 | int want = __btree_lock_want(iter, level); | |
95 | ||
ed8413fd | 96 | if (!is_btree_node(iter, level)) |
1c6fdbd8 KO |
97 | return false; |
98 | ||
99 | if (race_fault()) | |
100 | return false; | |
101 | ||
ed8413fd KO |
102 | if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || |
103 | (btree_node_lock_seq_matches(iter, b, level) && | |
515282ac | 104 | btree_node_lock_increment(iter->trans, b, level, want))) { |
ed8413fd KO |
105 | mark_btree_node_locked(iter, level, want); |
106 | return true; | |
107 | } else { | |
1c6fdbd8 | 108 | return false; |
ed8413fd | 109 | } |
1c6fdbd8 KO |
110 | } |
111 | ||
112 | static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) | |
113 | { | |
114 | struct btree *b = iter->l[level].b; | |
115 | ||
116 | EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); | |
117 | ||
118 | if (!is_btree_node(iter, level)) | |
119 | return false; | |
120 | ||
121 | if (btree_node_intent_locked(iter, level)) | |
122 | return true; | |
123 | ||
124 | if (race_fault()) | |
125 | return false; | |
126 | ||
127 | if (btree_node_locked(iter, level) | |
c43a6ef9 KO |
128 | ? six_lock_tryupgrade(&b->c.lock) |
129 | : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) | |
1c6fdbd8 KO |
130 | goto success; |
131 | ||
ed8413fd | 132 | if (btree_node_lock_seq_matches(iter, b, level) && |
515282ac | 133 | btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { |
1c6fdbd8 KO |
134 | btree_node_unlock(iter, level); |
135 | goto success; | |
136 | } | |
137 | ||
138 | return false; | |
139 | success: | |
140 | mark_btree_node_intent_locked(iter, level); | |
141 | return true; | |
142 | } | |
143 | ||
144 | static inline bool btree_iter_get_locks(struct btree_iter *iter, | |
7d825866 | 145 | bool upgrade, bool trace) |
1c6fdbd8 KO |
146 | { |
147 | unsigned l = iter->level; | |
148 | int fail_idx = -1; | |
149 | ||
150 | do { | |
151 | if (!btree_iter_node(iter, l)) | |
152 | break; | |
153 | ||
154 | if (!(upgrade | |
155 | ? bch2_btree_node_upgrade(iter, l) | |
156 | : bch2_btree_node_relock(iter, l))) { | |
7d825866 KO |
157 | if (trace) |
158 | (upgrade | |
159 | ? trace_node_upgrade_fail | |
160 | : trace_node_relock_fail)(l, iter->l[l].lock_seq, | |
ed8413fd KO |
161 | is_btree_node(iter, l) |
162 | ? 0 | |
163 | : (unsigned long) iter->l[l].b, | |
164 | is_btree_node(iter, l) | |
165 | ? iter->l[l].b->c.lock.state.seq | |
166 | : 0); | |
167 | ||
1c6fdbd8 KO |
168 | fail_idx = l; |
169 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); | |
170 | } | |
171 | ||
172 | l++; | |
173 | } while (l < iter->locks_want); | |
174 | ||
175 | /* | |
176 | * When we fail to get a lock, we have to ensure that any child nodes | |
177 | * can't be relocked, so bch2_btree_iter_traverse() has to walk back | |
178 | * up to the node that we failed to relock: | |
179 | */ | |
180 | while (fail_idx >= 0) { | |
181 | btree_node_unlock(iter, fail_idx); | |
ed8413fd | 182 | iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; |
1c6fdbd8 KO |
183 | --fail_idx; |
184 | } | |
185 | ||
186 | if (iter->uptodate == BTREE_ITER_NEED_RELOCK) | |
187 | iter->uptodate = BTREE_ITER_NEED_PEEK; | |
188 | ||
0f238367 KO |
189 | bch2_btree_trans_verify_locks(iter->trans); |
190 | ||
1c6fdbd8 KO |
191 | return iter->uptodate < BTREE_ITER_NEED_RELOCK; |
192 | } | |
193 | ||
194 | /* Slowpath: */ | |
195 | bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, | |
515282ac | 196 | unsigned level, struct btree_iter *iter, |
bd2bb273 KO |
197 | enum six_lock_type type, |
198 | six_lock_should_sleep_fn should_sleep_fn, | |
199 | void *p) | |
1c6fdbd8 | 200 | { |
515282ac | 201 | struct btree_trans *trans = iter->trans; |
1c6fdbd8 | 202 | struct btree_iter *linked; |
bd2bb273 | 203 | u64 start_time = local_clock(); |
1c6fdbd8 KO |
204 | bool ret = true; |
205 | ||
647d7b60 | 206 | /* Check if it's safe to block: */ |
515282ac | 207 | trans_for_each_iter(trans, linked) { |
1c6fdbd8 KO |
208 | if (!linked->nodes_locked) |
209 | continue; | |
210 | ||
1c6fdbd8 KO |
211 | /* |
212 | * Can't block taking an intent lock if we have _any_ nodes read | |
213 | * locked: | |
214 | * | |
215 | * - Our read lock blocks another thread with an intent lock on | |
216 | * the same node from getting a write lock, and thus from | |
217 | * dropping its intent lock | |
218 | * | |
219 | * - And the other thread may have multiple nodes intent locked: | |
220 | * both the node we want to intent lock, and the node we | |
221 | * already have read locked - deadlock: | |
222 | */ | |
223 | if (type == SIX_LOCK_intent && | |
224 | linked->nodes_locked != linked->nodes_intent_locked) { | |
515282ac | 225 | if (!(trans->nounlock)) { |
1c6fdbd8 KO |
226 | linked->locks_want = max_t(unsigned, |
227 | linked->locks_want, | |
228 | __fls(linked->nodes_locked) + 1); | |
515282ac KO |
229 | if (!btree_iter_get_locks(linked, true, false)) |
230 | ret = false; | |
231 | } else { | |
232 | ret = false; | |
1c6fdbd8 | 233 | } |
1c6fdbd8 KO |
234 | } |
235 | ||
236 | /* | |
237 | * Interior nodes must be locked before their descendants: if | |
238 | * another iterator holds a lock on a possible descendant of the | |
239 | * node we're about to lock, it must hold the ancestor locks too: | |
240 | */ | |
241 | if (linked->btree_id == iter->btree_id && | |
242 | level > __fls(linked->nodes_locked)) { | |
515282ac | 243 | if (!(trans->nounlock)) { |
647d7b60 KO |
244 | linked->locks_want = |
245 | max(level + 1, max_t(unsigned, | |
246 | linked->locks_want, | |
247 | iter->locks_want)); | |
515282ac KO |
248 | if (!btree_iter_get_locks(linked, true, false)) |
249 | ret = false; | |
250 | } else { | |
251 | ret = false; | |
1c6fdbd8 | 252 | } |
515282ac KO |
253 | } |
254 | ||
255 | /* Must lock btree nodes in key order: */ | |
256 | if (iter->btree_id < linked->btree_id) | |
257 | ret = false; | |
258 | ||
259 | if (iter->btree_id == linked->btree_id && | |
260 | btree_node_locked(linked, level) && | |
261 | bkey_cmp(pos, linked->l[level].b->key.k.p) <= 0) | |
1c6fdbd8 | 262 | ret = false; |
515282ac KO |
263 | |
264 | /* | |
265 | * Recheck if this is a node we already have locked - since one | |
266 | * of the get_locks() calls might've successfully | |
267 | * upgraded/relocked it: | |
268 | */ | |
269 | if (linked->l[level].b == b && | |
270 | btree_node_locked_type(linked, level) >= type) { | |
271 | six_lock_increment(&b->c.lock, type); | |
272 | return true; | |
1c6fdbd8 KO |
273 | } |
274 | } | |
275 | ||
ba5c6557 | 276 | if (unlikely(!ret)) { |
20bceecb | 277 | trace_trans_restart_would_deadlock(iter->trans->ip); |
ba5c6557 KO |
278 | return false; |
279 | } | |
1c7a0adf | 280 | |
bd2bb273 KO |
281 | if (six_trylock_type(&b->c.lock, type)) |
282 | return true; | |
283 | ||
284 | if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) | |
285 | return false; | |
286 | ||
287 | bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], | |
288 | start_time); | |
ba5c6557 | 289 | return true; |
1c6fdbd8 KO |
290 | } |
291 | ||
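/*
 * For illustration, a sketch of the deadlock the checks above prevent: if
 * every transaction only ever blocks waiting for nodes in increasing
 * (btree id, key) order, two transactions can never each hold a node the
 * other is waiting on. So if this transaction already holds, say, the node
 * containing key 20 at this level and now asks to block on the node
 * containing key 10, ret is forced to false instead - the caller sees a
 * would-deadlock restart (trace_trans_restart_would_deadlock()) rather than
 * sleeping on the lock.
 */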
292 | /* Btree iterator locking: */ | |
293 | ||
294 | #ifdef CONFIG_BCACHEFS_DEBUG | |
7d6f9b64 | 295 | static void bch2_btree_iter_verify_locks(struct btree_iter *iter) |
1c6fdbd8 KO |
296 | { |
297 | unsigned l; | |
298 | ||
bd2bb273 KO |
299 | if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { |
300 | BUG_ON(iter->nodes_locked); | |
301 | return; | |
302 | } | |
303 | ||
1c6fdbd8 KO |
304 | for (l = 0; btree_iter_node(iter, l); l++) { |
305 | if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && | |
306 | !btree_node_locked(iter, l)) | |
307 | continue; | |
308 | ||
309 | BUG_ON(btree_lock_want(iter, l) != | |
310 | btree_node_locked_type(iter, l)); | |
311 | } | |
312 | } | |
ad7ae8d6 | 313 | |
0f238367 | 314 | void bch2_btree_trans_verify_locks(struct btree_trans *trans) |
ad7ae8d6 | 315 | { |
0f238367 | 316 | struct btree_iter *iter; |
ad7ae8d6 | 317 | |
bd2bb273 | 318 | trans_for_each_iter_all(trans, iter) |
0f238367 | 319 | bch2_btree_iter_verify_locks(iter); |
ad7ae8d6 | 320 | } |
7d6f9b64 KO |
321 | #else |
322 | static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} | |
1c6fdbd8 KO |
323 | #endif |
324 | ||
325 | __flatten | |
7d825866 | 326 | static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) |
1c6fdbd8 | 327 | { |
f58c22e7 | 328 | return btree_iter_get_locks(iter, false, trace); |
1c6fdbd8 KO |
329 | } |
330 | ||
1c6fdbd8 KO |
331 | bool __bch2_btree_iter_upgrade(struct btree_iter *iter, |
332 | unsigned new_locks_want) | |
333 | { | |
334 | struct btree_iter *linked; | |
335 | ||
336 | EBUG_ON(iter->locks_want >= new_locks_want); | |
337 | ||
338 | iter->locks_want = new_locks_want; | |
339 | ||
7d825866 | 340 | if (btree_iter_get_locks(iter, true, true)) |
1c6fdbd8 KO |
341 | return true; |
342 | ||
343 | /* | |
344 | * Ancestor nodes must be locked before child nodes, so set locks_want | |
345 | * on iterators that might lock ancestors before us to avoid getting | |
346 | * -EINTR later: | |
347 | */ | |
0f238367 KO |
348 | trans_for_each_iter(iter->trans, linked) |
349 | if (linked != iter && | |
350 | linked->btree_id == iter->btree_id && | |
1c6fdbd8 KO |
351 | linked->locks_want < new_locks_want) { |
352 | linked->locks_want = new_locks_want; | |
7d825866 | 353 | btree_iter_get_locks(linked, true, false); |
1c6fdbd8 KO |
354 | } |
355 | ||
356 | return false; | |
357 | } | |
358 | ||
359 | bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, | |
360 | unsigned new_locks_want) | |
361 | { | |
362 | unsigned l = iter->level; | |
363 | ||
364 | EBUG_ON(iter->locks_want >= new_locks_want); | |
365 | ||
366 | iter->locks_want = new_locks_want; | |
367 | ||
368 | do { | |
369 | if (!btree_iter_node(iter, l)) | |
370 | break; | |
371 | ||
372 | if (!bch2_btree_node_upgrade(iter, l)) { | |
373 | iter->locks_want = l; | |
374 | return false; | |
375 | } | |
376 | ||
377 | l++; | |
378 | } while (l < iter->locks_want); | |
379 | ||
380 | return true; | |
381 | } | |
382 | ||
383 | void __bch2_btree_iter_downgrade(struct btree_iter *iter, | |
384 | unsigned downgrade_to) | |
385 | { | |
72545b5e KO |
386 | unsigned l, new_locks_want = downgrade_to ?: |
387 | (iter->flags & BTREE_ITER_INTENT ? 1 : 0); | |
1c6fdbd8 | 388 | |
72545b5e KO |
389 | if (iter->locks_want < downgrade_to) { |
390 | iter->locks_want = new_locks_want; | |
1c6fdbd8 | 391 | |
72545b5e KO |
392 | while (iter->nodes_locked && |
393 | (l = __fls(iter->nodes_locked)) >= iter->locks_want) { | |
394 | if (l > iter->level) { | |
395 | btree_node_unlock(iter, l); | |
1c6fdbd8 | 396 | } else { |
72545b5e KO |
397 | if (btree_node_intent_locked(iter, l)) { |
398 | six_lock_downgrade(&iter->l[l].b->c.lock); | |
399 | iter->nodes_intent_locked ^= 1 << l; | |
1c6fdbd8 KO |
400 | } |
401 | break; | |
402 | } | |
403 | } | |
1c6fdbd8 | 404 | } |
ad7ae8d6 | 405 | |
0f238367 | 406 | bch2_btree_trans_verify_locks(iter->trans); |
1c6fdbd8 KO |
407 | } |
408 | ||
72545b5e KO |
409 | void bch2_trans_downgrade(struct btree_trans *trans) |
410 | { | |
411 | struct btree_iter *iter; | |
412 | ||
413 | trans_for_each_iter(trans, iter) | |
414 | bch2_btree_iter_downgrade(iter); | |
415 | } | |
416 | ||
58fbf808 KO |
417 | /* Btree transaction locking: */ |
418 | ||
419 | bool bch2_trans_relock(struct btree_trans *trans) | |
0f238367 KO |
420 | { |
421 | struct btree_iter *iter; | |
422 | bool ret = true; | |
423 | ||
424 | trans_for_each_iter(trans, iter) | |
7d825866 KO |
425 | if (iter->uptodate == BTREE_ITER_NEED_RELOCK) |
426 | ret &= bch2_btree_iter_relock(iter, true); | |
0f238367 KO |
427 | |
428 | return ret; | |
429 | } | |
430 | ||
58fbf808 | 431 | void bch2_trans_unlock(struct btree_trans *trans) |
0f238367 KO |
432 | { |
433 | struct btree_iter *iter; | |
434 | ||
435 | trans_for_each_iter(trans, iter) | |
436 | __bch2_btree_iter_unlock(iter); | |
437 | } | |
438 | ||
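/*
 * Typical usage of the two helpers above, as a minimal sketch (the blocking
 * work in the middle is a placeholder, not something defined in this file):
 *
 *	bch2_trans_unlock(trans);
 *
 *	do_something_that_may_sleep();	// hypothetical helper
 *
 *	if (!bch2_trans_relock(trans)) {
 *		// a node we had locked was modified in the meantime (the
 *		// saved lock sequence numbers no longer match), so the
 *		// transaction has to be restarted, typically via -EINTR
 *	}
 */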
1c6fdbd8 KO |
439 | /* Btree iterator: */ |
440 | ||
441 | #ifdef CONFIG_BCACHEFS_DEBUG | |
442 | ||
2dac0eae KO |
443 | static void bch2_btree_iter_verify_level(struct btree_iter *iter, |
444 | unsigned level) | |
1c6fdbd8 | 445 | { |
9626aeb1 | 446 | struct bpos pos = btree_iter_search_key(iter); |
2dac0eae | 447 | struct btree_iter_level *l = &iter->l[level]; |
1c6fdbd8 | 448 | struct btree_node_iter tmp = l->iter; |
2dac0eae KO |
449 | bool locked = btree_node_locked(iter, level); |
450 | struct bkey_packed *p, *k; | |
451 | char buf1[100], buf2[100]; | |
452 | const char *msg; | |
1c6fdbd8 | 453 | |
f13f5a8c KO |
454 | if (!debug_check_iterators(iter->trans->c)) |
455 | return; | |
456 | ||
2dac0eae KO |
457 | BUG_ON(iter->level < iter->min_depth); |
458 | ||
459 | if (!btree_iter_node(iter, level)) | |
460 | return; | |
461 | ||
462 | if (!bch2_btree_node_relock(iter, level)) | |
271a3d3a KO |
463 | return; |
464 | ||
2dac0eae KO |
465 | /* |
466 | * Ideally this invariant would always be true, and hopefully in the | |
467 | * future it will be, but for now set_pos_same_leaf() breaks it: | |
468 | */ | |
469 | BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && | |
470 | !btree_iter_pos_in_node(iter, l->b)); | |
471 | ||
472 | /* | |
473 | * node iterators don't use leaf node iterator: | |
474 | */ | |
475 | if (btree_iter_type(iter) == BTREE_ITER_NODES && | |
476 | level <= iter->min_depth) | |
477 | goto unlock; | |
e3ecf4f5 | 478 | |
2dac0eae | 479 | bch2_btree_node_iter_verify(&l->iter, l->b); |
1c6fdbd8 KO |
480 | |
481 | /* | |
482 | * For interior nodes, the iterator will have skipped past | |
483 | * deleted keys: | |
271a3d3a KO |
484 | * |
485 | * For extents, the iterator may have skipped past deleted keys (but not | |
486 | * whiteouts) | |
1c6fdbd8 | 487 | */ |
2dac0eae KO |
488 | p = level || btree_node_type_is_extents(iter->btree_id) |
489 | ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) | |
490 | : bch2_btree_node_iter_prev_all(&tmp, l->b); | |
491 | k = bch2_btree_node_iter_peek_all(&l->iter, l->b); | |
1c6fdbd8 | 492 | |
2dac0eae KO |
493 | if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { |
494 | msg = "before"; | |
495 | goto err; | |
1c6fdbd8 KO |
496 | } |
497 | ||
2dac0eae KO |
498 | if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { |
499 | msg = "after"; | |
500 | goto err; | |
501 | } | |
502 | unlock: | |
503 | if (!locked) | |
504 | btree_node_unlock(iter, level); | |
505 | return; | |
506 | err: | |
507 | strcpy(buf1, "(none)"); | |
508 | strcpy(buf2, "(none)"); | |
509 | ||
510 | if (p) { | |
511 | struct bkey uk = bkey_unpack_key(l->b, p); | |
512 | bch2_bkey_to_text(&PBUF(buf1), &uk); | |
513 | } | |
1c6fdbd8 | 514 | |
2dac0eae KO |
515 | if (k) { |
516 | struct bkey uk = bkey_unpack_key(l->b, k); | |
517 | bch2_bkey_to_text(&PBUF(buf2), &uk); | |
1c6fdbd8 | 518 | } |
2dac0eae KO |
519 | |
520 | panic("iterator should be %s key at level %u:\n" | |
521 | "iter pos %s %llu:%llu\n" | |
522 | "prev key %s\n" | |
523 | "cur key %s\n", | |
524 | msg, level, | |
525 | iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>", | |
526 | iter->pos.inode, iter->pos.offset, | |
527 | buf1, buf2); | |
1c6fdbd8 KO |
528 | } |
529 | ||
2dac0eae | 530 | static void bch2_btree_iter_verify(struct btree_iter *iter) |
1c6fdbd8 | 531 | { |
2dac0eae | 532 | unsigned i; |
1c6fdbd8 | 533 | |
2dac0eae KO |
534 | bch2_btree_trans_verify_locks(iter->trans); |
535 | ||
536 | for (i = 0; i < BTREE_MAX_DEPTH; i++) | |
537 | bch2_btree_iter_verify_level(iter, i); | |
538 | } | |
539 | ||
540 | void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) | |
541 | { | |
542 | struct btree_iter *iter; | |
543 | ||
544 | if (!debug_check_iterators(trans->c)) | |
f13f5a8c KO |
545 | return; |
546 | ||
2dac0eae KO |
547 | trans_for_each_iter_with_node(trans, b, iter) |
548 | bch2_btree_iter_verify_level(iter, b->c.level); | |
1c6fdbd8 KO |
549 | } |
550 | ||
271a3d3a KO |
551 | #else |
552 | ||
7d6f9b64 KO |
553 | static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} |
554 | static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} | |
271a3d3a | 555 | |
1c6fdbd8 KO |
556 | #endif |
557 | ||
23bbd2bb KO |
558 | static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, |
559 | struct btree *b, | |
560 | struct bset_tree *t, | |
561 | struct bkey_packed *k) | |
562 | { | |
563 | struct btree_node_iter_set *set; | |
564 | ||
565 | btree_node_iter_for_each(iter, set) | |
566 | if (set->end == t->end_offset) { | |
567 | set->k = __btree_node_key_to_offset(b, k); | |
568 | bch2_btree_node_iter_sort(iter, b); | |
569 | return; | |
570 | } | |
571 | ||
572 | bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); | |
573 | } | |
574 | ||
887c2a4e | 575 | static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, |
9626aeb1 KO |
576 | struct btree *b, |
577 | struct bkey_packed *where) | |
887c2a4e | 578 | { |
9626aeb1 KO |
579 | struct btree_iter_level *l = &iter->l[b->c.level]; |
580 | struct bpos pos = btree_iter_search_key(iter); | |
887c2a4e | 581 | |
9626aeb1 KO |
582 | if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) |
583 | return; | |
584 | ||
585 | if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) | |
586 | bch2_btree_node_iter_advance(&l->iter, l->b); | |
587 | ||
588 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); | |
887c2a4e KO |
589 | } |
590 | ||
591 | void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, | |
592 | struct btree *b, | |
593 | struct bkey_packed *where) | |
594 | { | |
595 | struct btree_iter *linked; | |
596 | ||
597 | trans_for_each_iter_with_node(iter->trans, b, linked) { | |
598 | __bch2_btree_iter_fix_key_modified(linked, b, where); | |
2dac0eae | 599 | bch2_btree_iter_verify_level(linked, b->c.level); |
887c2a4e KO |
600 | } |
601 | } | |
602 | ||
1c6fdbd8 KO |
603 | static void __bch2_btree_node_iter_fix(struct btree_iter *iter, |
604 | struct btree *b, | |
605 | struct btree_node_iter *node_iter, | |
606 | struct bset_tree *t, | |
607 | struct bkey_packed *where, | |
608 | unsigned clobber_u64s, | |
609 | unsigned new_u64s) | |
610 | { | |
611 | const struct bkey_packed *end = btree_bkey_last(b, t); | |
612 | struct btree_node_iter_set *set; | |
613 | unsigned offset = __btree_node_key_to_offset(b, where); | |
614 | int shift = new_u64s - clobber_u64s; | |
271a3d3a | 615 | unsigned old_end = t->end_offset - shift; |
c0fc30da KO |
616 | unsigned orig_iter_pos = node_iter->data[0].k; |
617 | bool iter_current_key_modified = | |
618 | orig_iter_pos >= offset && | |
619 | orig_iter_pos <= offset + clobber_u64s; | |
9626aeb1 | 620 | struct bpos iter_pos = btree_iter_search_key(iter); |
1c6fdbd8 KO |
621 | |
622 | btree_node_iter_for_each(node_iter, set) | |
623 | if (set->end == old_end) | |
624 | goto found; | |
625 | ||
626 | /* didn't find the bset in the iterator - might have to re-add it: */ | |
627 | if (new_u64s && | |
9626aeb1 | 628 | bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { |
1c6fdbd8 | 629 | bch2_btree_node_iter_push(node_iter, b, where, end); |
c0fc30da KO |
630 | goto fixup_done; |
631 | } else { | |
632 | /* Iterator is after key that changed */ | |
059e4134 | 633 | return; |
1c6fdbd8 | 634 | } |
1c6fdbd8 | 635 | found: |
271a3d3a | 636 | set->end = t->end_offset; |
1c6fdbd8 KO |
637 | |
638 | /* Iterator hasn't gotten to the key that changed yet: */ | |
639 | if (set->k < offset) | |
059e4134 | 640 | return; |
1c6fdbd8 KO |
641 | |
642 | if (new_u64s && | |
9626aeb1 | 643 | bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { |
1c6fdbd8 KO |
644 | set->k = offset; |
645 | } else if (set->k < offset + clobber_u64s) { | |
646 | set->k = offset + new_u64s; | |
647 | if (set->k == set->end) | |
648 | bch2_btree_node_iter_set_drop(node_iter, set); | |
649 | } else { | |
c0fc30da | 650 | /* Iterator is after key that changed */ |
1c6fdbd8 | 651 | set->k = (int) set->k + shift; |
059e4134 | 652 | return; |
1c6fdbd8 KO |
653 | } |
654 | ||
1c6fdbd8 | 655 | bch2_btree_node_iter_sort(node_iter, b); |
c0fc30da KO |
656 | fixup_done: |
657 | if (node_iter->data[0].k != orig_iter_pos) | |
658 | iter_current_key_modified = true; | |
56e0e7c7 | 659 | |
1c6fdbd8 | 660 | /* |
23bbd2bb KO |
661 | * When a new key is added, and the node iterator now points to that |
662 | * key, the iterator might have skipped past deleted keys that should | |
663 | * come after the key the iterator now points to. We have to rewind to | |
c0fc30da KO |
664 | * before those deleted keys - otherwise |
665 | * bch2_btree_node_iter_prev_all() breaks: | |
1c6fdbd8 | 666 | */ |
23bbd2bb | 667 | if (!bch2_btree_node_iter_end(node_iter) && |
c0fc30da | 668 | iter_current_key_modified && |
23bbd2bb | 669 | (b->c.level || |
c4a94ae3 | 670 | btree_node_type_is_extents(iter->btree_id))) { |
23bbd2bb KO |
671 | struct bset_tree *t; |
672 | struct bkey_packed *k, *k2, *p; | |
673 | ||
674 | k = bch2_btree_node_iter_peek_all(node_iter, b); | |
1c6fdbd8 KO |
675 | |
676 | for_each_bset(b, t) { | |
23bbd2bb KO |
677 | bool set_pos = false; |
678 | ||
679 | if (node_iter->data[0].end == t->end_offset) | |
1c6fdbd8 KO |
680 | continue; |
681 | ||
23bbd2bb KO |
682 | k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); |
683 | ||
684 | while ((p = bch2_bkey_prev_all(b, t, k2)) && | |
685 | bkey_iter_cmp(b, k, p) < 0) { | |
686 | k2 = p; | |
687 | set_pos = true; | |
1c6fdbd8 | 688 | } |
23bbd2bb KO |
689 | |
690 | if (set_pos) | |
691 | btree_node_iter_set_set_pos(node_iter, | |
692 | b, t, k2); | |
1c6fdbd8 KO |
693 | } |
694 | } | |
23bbd2bb | 695 | |
c0fc30da KO |
696 | if (!b->c.level && |
697 | node_iter == &iter->l[0].iter && | |
2e70ce56 | 698 | iter_current_key_modified) |
c0fc30da | 699 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); |
1c6fdbd8 KO |
700 | } |
701 | ||
702 | void bch2_btree_node_iter_fix(struct btree_iter *iter, | |
216c9fac KO |
703 | struct btree *b, |
704 | struct btree_node_iter *node_iter, | |
705 | struct bkey_packed *where, | |
706 | unsigned clobber_u64s, | |
707 | unsigned new_u64s) | |
1c6fdbd8 | 708 | { |
216c9fac | 709 | struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); |
1c6fdbd8 KO |
710 | struct btree_iter *linked; |
711 | ||
059e4134 | 712 | if (node_iter != &iter->l[b->c.level].iter) { |
1c6fdbd8 | 713 | __bch2_btree_node_iter_fix(iter, b, node_iter, t, |
059e4134 | 714 | where, clobber_u64s, new_u64s); |
2dac0eae KO |
715 | |
716 | if (debug_check_iterators(iter->trans->c)) | |
717 | bch2_btree_node_iter_verify(node_iter, b); | |
059e4134 | 718 | } |
1c6fdbd8 | 719 | |
059e4134 | 720 | trans_for_each_iter_with_node(iter->trans, b, linked) { |
1c6fdbd8 | 721 | __bch2_btree_node_iter_fix(linked, b, |
059e4134 KO |
722 | &linked->l[b->c.level].iter, t, |
723 | where, clobber_u64s, new_u64s); | |
2dac0eae | 724 | bch2_btree_iter_verify_level(linked, b->c.level); |
059e4134 | 725 | } |
1c6fdbd8 KO |
726 | } |
727 | ||
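/*
 * For context: bch2_btree_node_iter_fix() is what keeps every linked
 * iterator pointing into this node consistent when keys change inside a
 * bset - clobber_u64s old u64s at 'where' are replaced by new_u64s new ones,
 * and the offsets stored in each iterator's node iterator are shifted (or the
 * bset re-added/dropped) to match.
 */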
728 | static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, | |
729 | struct btree_iter_level *l, | |
730 | struct bkey *u, | |
731 | struct bkey_packed *k) | |
732 | { | |
733 | struct bkey_s_c ret; | |
734 | ||
735 | if (unlikely(!k)) { | |
736 | /* | |
737 | * signal to bch2_btree_iter_peek_slot() that we're currently at | |
738 | * a hole | |
739 | */ | |
26609b61 | 740 | u->type = KEY_TYPE_deleted; |
1c6fdbd8 KO |
741 | return bkey_s_c_null; |
742 | } | |
743 | ||
744 | ret = bkey_disassemble(l->b, k, u); | |
745 | ||
9e5e5b9e KO |
746 | if (debug_check_bkeys(iter->trans->c)) |
747 | bch2_bkey_debugcheck(iter->trans->c, l->b, ret); | |
1c6fdbd8 KO |
748 | |
749 | return ret; | |
750 | } | |
751 | ||
752 | /* peek_all() doesn't skip deleted keys */ | |
753 | static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter, | |
754 | struct btree_iter_level *l, | |
755 | struct bkey *u) | |
756 | { | |
757 | return __btree_iter_unpack(iter, l, u, | |
758 | bch2_btree_node_iter_peek_all(&l->iter, l->b)); | |
759 | } | |
760 | ||
761 | static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, | |
762 | struct btree_iter_level *l) | |
763 | { | |
764 | return __btree_iter_unpack(iter, l, &iter->k, | |
765 | bch2_btree_node_iter_peek(&l->iter, l->b)); | |
766 | } | |
767 | ||
ccf5a109 KO |
768 | static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, |
769 | struct btree_iter_level *l) | |
770 | { | |
771 | return __btree_iter_unpack(iter, l, &iter->k, | |
772 | bch2_btree_node_iter_prev(&l->iter, l->b)); | |
773 | } | |
774 | ||
a00fd8c5 KO |
775 | static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, |
776 | struct btree_iter_level *l, | |
777 | int max_advance) | |
1c6fdbd8 | 778 | { |
9626aeb1 | 779 | struct bpos pos = btree_iter_search_key(iter); |
a00fd8c5 KO |
780 | struct bkey_packed *k; |
781 | int nr_advanced = 0; | |
782 | ||
783 | while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && | |
9626aeb1 | 784 | bkey_iter_pos_cmp(l->b, k, &pos) < 0) { |
a00fd8c5 KO |
785 | if (max_advance > 0 && nr_advanced >= max_advance) |
786 | return false; | |
787 | ||
788 | bch2_btree_node_iter_advance(&l->iter, l->b); | |
789 | nr_advanced++; | |
790 | } | |
791 | ||
792 | return true; | |
1c6fdbd8 KO |
793 | } |
794 | ||
795 | /* | |
796 | * Verify that iterator for parent node points to child node: | |
797 | */ | |
798 | static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) | |
799 | { | |
800 | struct btree_iter_level *l; | |
801 | unsigned plevel; | |
802 | bool parent_locked; | |
803 | struct bkey_packed *k; | |
804 | ||
805 | if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) | |
806 | return; | |
807 | ||
c43a6ef9 | 808 | plevel = b->c.level + 1; |
1c6fdbd8 KO |
809 | if (!btree_iter_node(iter, plevel)) |
810 | return; | |
811 | ||
812 | parent_locked = btree_node_locked(iter, plevel); | |
813 | ||
814 | if (!bch2_btree_node_relock(iter, plevel)) | |
815 | return; | |
816 | ||
817 | l = &iter->l[plevel]; | |
818 | k = bch2_btree_node_iter_peek_all(&l->iter, l->b); | |
819 | if (!k || | |
820 | bkey_deleted(k) || | |
821 | bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { | |
822 | char buf[100]; | |
823 | struct bkey uk = bkey_unpack_key(b, k); | |
824 | ||
319f9ac3 | 825 | bch2_bkey_to_text(&PBUF(buf), &uk); |
1c6fdbd8 KO |
826 | panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", |
827 | buf, b->key.k.p.inode, b->key.k.p.offset); | |
828 | } | |
829 | ||
830 | if (!parent_locked) | |
c43a6ef9 | 831 | btree_node_unlock(iter, b->c.level + 1); |
1c6fdbd8 KO |
832 | } |
833 | ||
1c6fdbd8 | 834 | static inline void __btree_iter_init(struct btree_iter *iter, |
a00fd8c5 | 835 | unsigned level) |
1c6fdbd8 | 836 | { |
9626aeb1 | 837 | struct bpos pos = btree_iter_search_key(iter); |
a00fd8c5 KO |
838 | struct btree_iter_level *l = &iter->l[level]; |
839 | ||
9626aeb1 | 840 | bch2_btree_node_iter_init(&l->iter, l->b, &pos); |
1c6fdbd8 KO |
841 | |
842 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); | |
843 | } | |
844 | ||
845 | static inline void btree_iter_node_set(struct btree_iter *iter, | |
846 | struct btree *b) | |
847 | { | |
848 | btree_iter_verify_new_node(iter, b); | |
849 | ||
850 | EBUG_ON(!btree_iter_pos_in_node(iter, b)); | |
c43a6ef9 | 851 | EBUG_ON(b->c.lock.state.seq & 1); |
1c6fdbd8 | 852 | |
c43a6ef9 KO |
853 | iter->l[b->c.level].lock_seq = b->c.lock.state.seq; |
854 | iter->l[b->c.level].b = b; | |
855 | __btree_iter_init(iter, b->c.level); | |
1c6fdbd8 KO |
856 | } |
857 | ||
858 | /* | |
859 | * A btree node is being replaced - update the iterator to point to the new | |
860 | * node: | |
861 | */ | |
862 | void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) | |
863 | { | |
864 | enum btree_node_locked_type t; | |
865 | struct btree_iter *linked; | |
866 | ||
0f238367 | 867 | trans_for_each_iter(iter->trans, linked) |
1c6fdbd8 KO |
868 | if (btree_iter_pos_in_node(linked, b)) { |
869 | /* | |
870 | * bch2_btree_iter_node_drop() has already been called - | |
871 | * the old node we're replacing has already been | |
872 | * unlocked and the pointer invalidated | |
873 | */ | |
c43a6ef9 | 874 | BUG_ON(btree_node_locked(linked, b->c.level)); |
1c6fdbd8 | 875 | |
c43a6ef9 | 876 | t = btree_lock_want(linked, b->c.level); |
1c6fdbd8 | 877 | if (t != BTREE_NODE_UNLOCKED) { |
c43a6ef9 KO |
878 | six_lock_increment(&b->c.lock, (enum six_lock_type) t); |
879 | mark_btree_node_locked(linked, b->c.level, (enum six_lock_type) t); | |
1c6fdbd8 KO |
880 | } |
881 | ||
882 | btree_iter_node_set(linked, b); | |
883 | } | |
1c6fdbd8 KO |
884 | } |
885 | ||
886 | void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) | |
887 | { | |
888 | struct btree_iter *linked; | |
c43a6ef9 | 889 | unsigned level = b->c.level; |
1c6fdbd8 | 890 | |
0f238367 | 891 | trans_for_each_iter(iter->trans, linked) |
1c6fdbd8 | 892 | if (linked->l[level].b == b) { |
ad7ae8d6 | 893 | __btree_node_unlock(linked, level); |
ed8413fd | 894 | linked->l[level].b = BTREE_ITER_NO_NODE_DROP; |
1c6fdbd8 KO |
895 | } |
896 | } | |
897 | ||
898 | /* | |
899 | * A btree node has been modified in such a way as to invalidate iterators - fix | |
900 | * them: | |
901 | */ | |
902 | void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) | |
903 | { | |
904 | struct btree_iter *linked; | |
905 | ||
0f238367 | 906 | trans_for_each_iter_with_node(iter->trans, b, linked) |
c43a6ef9 | 907 | __btree_iter_init(linked, b->c.level); |
1c6fdbd8 KO |
908 | } |
909 | ||
bd2bb273 KO |
910 | static int lock_root_check_fn(struct six_lock *lock, void *p) |
911 | { | |
912 | struct btree *b = container_of(lock, struct btree, c.lock); | |
913 | struct btree **rootp = p; | |
914 | ||
915 | return b == *rootp ? 0 : -1; | |
916 | } | |
917 | ||
1c6fdbd8 KO |
918 | static inline int btree_iter_lock_root(struct btree_iter *iter, |
919 | unsigned depth_want) | |
920 | { | |
9e5e5b9e | 921 | struct bch_fs *c = iter->trans->c; |
bd2bb273 | 922 | struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; |
1c6fdbd8 KO |
923 | enum six_lock_type lock_type; |
924 | unsigned i; | |
925 | ||
926 | EBUG_ON(iter->nodes_locked); | |
927 | ||
928 | while (1) { | |
bd2bb273 | 929 | b = READ_ONCE(*rootp); |
c43a6ef9 | 930 | iter->level = READ_ONCE(b->c.level); |
1c6fdbd8 KO |
931 | |
932 | if (unlikely(iter->level < depth_want)) { | |
933 | /* | |
934 | * the root is at a lower depth than the depth we want: | |
935 | * got to the end of the btree, or we're walking nodes | |
936 | * greater than some depth and there are no nodes >= | |
937 | * that depth | |
938 | */ | |
939 | iter->level = depth_want; | |
ed8413fd KO |
940 | for (i = iter->level; i < BTREE_MAX_DEPTH; i++) |
941 | iter->l[i].b = NULL; | |
8812600c | 942 | return 1; |
1c6fdbd8 KO |
943 | } |
944 | ||
945 | lock_type = __btree_lock_want(iter, iter->level); | |
946 | if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, | |
bd2bb273 KO |
947 | iter, lock_type, |
948 | lock_root_check_fn, rootp))) | |
1c6fdbd8 KO |
949 | return -EINTR; |
950 | ||
bd2bb273 | 951 | if (likely(b == READ_ONCE(*rootp) && |
c43a6ef9 | 952 | b->c.level == iter->level && |
1c6fdbd8 KO |
953 | !race_fault())) { |
954 | for (i = 0; i < iter->level; i++) | |
ed8413fd | 955 | iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; |
1c6fdbd8 | 956 | iter->l[iter->level].b = b; |
ed8413fd KO |
957 | for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) |
958 | iter->l[i].b = NULL; | |
1c6fdbd8 KO |
959 | |
960 | mark_btree_node_locked(iter, iter->level, lock_type); | |
961 | btree_iter_node_set(iter, b); | |
962 | return 0; | |
1c6fdbd8 KO |
963 | } |
964 | ||
c43a6ef9 | 965 | six_unlock_type(&b->c.lock, lock_type); |
1c6fdbd8 KO |
966 | } |
967 | } | |
968 | ||
969 | noinline | |
970 | static void btree_iter_prefetch(struct btree_iter *iter) | |
971 | { | |
9e5e5b9e | 972 | struct bch_fs *c = iter->trans->c; |
1c6fdbd8 KO |
973 | struct btree_iter_level *l = &iter->l[iter->level]; |
974 | struct btree_node_iter node_iter = l->iter; | |
975 | struct bkey_packed *k; | |
976 | BKEY_PADDED(k) tmp; | |
9e5e5b9e | 977 | unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) |
1c6fdbd8 KO |
978 | ? (iter->level > 1 ? 0 : 2) |
979 | : (iter->level > 1 ? 1 : 16); | |
980 | bool was_locked = btree_node_locked(iter, iter->level); | |
981 | ||
982 | while (nr) { | |
983 | if (!bch2_btree_node_relock(iter, iter->level)) | |
984 | return; | |
985 | ||
986 | bch2_btree_node_iter_advance(&node_iter, l->b); | |
987 | k = bch2_btree_node_iter_peek(&node_iter, l->b); | |
988 | if (!k) | |
989 | break; | |
990 | ||
991 | bch2_bkey_unpack(l->b, &tmp.k, k); | |
9e5e5b9e | 992 | bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); |
1c6fdbd8 KO |
993 | } |
994 | ||
995 | if (!was_locked) | |
996 | btree_node_unlock(iter, iter->level); | |
997 | } | |
998 | ||
72141e1f KO |
999 | static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, |
1000 | unsigned plevel, struct btree *b) | |
1001 | { | |
1002 | struct btree_iter_level *l = &iter->l[plevel]; | |
1003 | bool locked = btree_node_locked(iter, plevel); | |
1004 | struct bkey_packed *k; | |
1005 | struct bch_btree_ptr_v2 *bp; | |
1006 | ||
1007 | if (!bch2_btree_node_relock(iter, plevel)) | |
1008 | return; | |
1009 | ||
1010 | k = bch2_btree_node_iter_peek_all(&l->iter, l->b); | |
1011 | BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); | |
1012 | ||
1013 | bp = (void *) bkeyp_val(&l->b->format, k); | |
1014 | bp->mem_ptr = (unsigned long)b; | |
1015 | ||
1016 | if (!locked) | |
1017 | btree_node_unlock(iter, plevel); | |
1018 | } | |
1019 | ||
c4e065c2 | 1020 | static __always_inline int btree_iter_down(struct btree_iter *iter) |
1c6fdbd8 | 1021 | { |
9e5e5b9e | 1022 | struct bch_fs *c = iter->trans->c; |
1c6fdbd8 KO |
1023 | struct btree_iter_level *l = &iter->l[iter->level]; |
1024 | struct btree *b; | |
1025 | unsigned level = iter->level - 1; | |
1026 | enum six_lock_type lock_type = __btree_lock_want(iter, level); | |
1027 | BKEY_PADDED(k) tmp; | |
1028 | ||
c4e065c2 | 1029 | EBUG_ON(!btree_node_locked(iter, iter->level)); |
1c6fdbd8 KO |
1030 | |
1031 | bch2_bkey_unpack(l->b, &tmp.k, | |
1032 | bch2_btree_node_iter_peek(&l->iter, l->b)); | |
1033 | ||
b03b81df | 1034 | b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); |
1c6fdbd8 KO |
1035 | if (unlikely(IS_ERR(b))) |
1036 | return PTR_ERR(b); | |
1037 | ||
1038 | mark_btree_node_locked(iter, level, lock_type); | |
1039 | btree_iter_node_set(iter, b); | |
1040 | ||
72141e1f KO |
1041 | if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && |
1042 | unlikely(b != btree_node_mem_ptr(&tmp.k))) | |
1043 | btree_node_mem_ptr_set(iter, level + 1, b); | |
1044 | ||
1c6fdbd8 KO |
1045 | if (iter->flags & BTREE_ITER_PREFETCH) |
1046 | btree_iter_prefetch(iter); | |
1047 | ||
1048 | iter->level = level; | |
1049 | ||
1050 | return 0; | |
1051 | } | |
1052 | ||
1053 | static void btree_iter_up(struct btree_iter *iter) | |
1054 | { | |
1055 | btree_node_unlock(iter, iter->level++); | |
1056 | } | |
1057 | ||
9b02d1c4 | 1058 | static int btree_iter_traverse_one(struct btree_iter *); |
1c6fdbd8 | 1059 | |
bf7b87a4 | 1060 | static int __btree_iter_traverse_all(struct btree_trans *trans, |
9b02d1c4 | 1061 | struct btree_iter *orig_iter, int ret) |
1c6fdbd8 | 1062 | { |
e542029e | 1063 | struct bch_fs *c = trans->c; |
37dd7834 | 1064 | struct btree_iter *iter; |
e542029e KO |
1065 | u8 sorted[BTREE_ITER_MAX]; |
1066 | unsigned i, nr_sorted = 0; | |
1067 | ||
1068 | trans_for_each_iter(trans, iter) | |
1069 | sorted[nr_sorted++] = iter - trans->iters; | |
1070 | ||
1071 | #define btree_iter_cmp_by_idx(_l, _r) \ | |
1072 | btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) | |
1073 | ||
1074 | bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); | |
1075 | #undef btree_iter_cmp_by_idx | |
1076 | ||
1c6fdbd8 | 1077 | retry_all: |
58fbf808 | 1078 | bch2_trans_unlock(trans); |
1c6fdbd8 | 1079 | |
bf7b87a4 | 1080 | if (unlikely(ret == -ENOMEM)) { |
1c6fdbd8 KO |
1081 | struct closure cl; |
1082 | ||
1083 | closure_init_stack(&cl); | |
1084 | ||
1085 | do { | |
1086 | ret = bch2_btree_cache_cannibalize_lock(c, &cl); | |
1087 | closure_sync(&cl); | |
1088 | } while (ret); | |
1089 | } | |
1090 | ||
bf7b87a4 | 1091 | if (unlikely(ret == -EIO)) { |
ece254b2 | 1092 | trans->error = true; |
ab9ff733 KO |
1093 | if (orig_iter) { |
1094 | orig_iter->flags |= BTREE_ITER_ERROR; | |
1095 | orig_iter->l[orig_iter->level].b = | |
1096 | BTREE_ITER_NO_NODE_ERROR; | |
1097 | } | |
bf7b87a4 KO |
1098 | goto out; |
1099 | } | |
1100 | ||
1101 | BUG_ON(ret && ret != -EINTR); | |
1102 | ||
1c6fdbd8 | 1103 | /* Now, redo traversals in correct order: */ |
e542029e KO |
1104 | for (i = 0; i < nr_sorted; i++) { |
1105 | iter = &trans->iters[sorted[i]]; | |
1c6fdbd8 | 1106 | |
d5cdf033 | 1107 | ret = btree_iter_traverse_one(iter); |
e542029e KO |
1108 | if (ret) |
1109 | goto retry_all; | |
1110 | } | |
1c6fdbd8 | 1111 | |
8666a9ad KO |
1112 | if (hweight64(trans->iters_live) > 1) |
1113 | ret = -EINTR; | |
1114 | else | |
1115 | trans_for_each_iter(trans, iter) | |
1116 | if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { | |
1117 | ret = -EINTR; | |
1118 | break; | |
1119 | } | |
1c6fdbd8 KO |
1120 | out: |
1121 | bch2_btree_cache_cannibalize_unlock(c); | |
1122 | return ret; | |
bf7b87a4 | 1123 | } |
1c6fdbd8 | 1124 | |
bf7b87a4 KO |
1125 | int bch2_btree_iter_traverse_all(struct btree_trans *trans) |
1126 | { | |
1127 | return __btree_iter_traverse_all(trans, NULL, 0); | |
1c6fdbd8 KO |
1128 | } |
1129 | ||
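/*
 * Note on the ordering above: traverse_all drops every lock and then redoes
 * the traversals in the order produced by the bubble_sort() of iterators, so
 * that after a restart the transaction reacquires its node locks in a
 * consistent order and can't immediately run into the same would-deadlock
 * condition again.
 */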
f4b61341 KO |
1130 | static inline bool btree_iter_good_node(struct btree_iter *iter, |
1131 | unsigned l, int check_pos) | |
1132 | { | |
1133 | if (!is_btree_node(iter, l) || | |
1134 | !bch2_btree_node_relock(iter, l)) | |
1135 | return false; | |
1136 | ||
1137 | if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) | |
1138 | return false; | |
1139 | if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) | |
1140 | return false; | |
1141 | return true; | |
1142 | } | |
1143 | ||
1144 | static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, | |
1145 | int check_pos) | |
1c6fdbd8 KO |
1146 | { |
1147 | unsigned l = iter->level; | |
1148 | ||
1149 | while (btree_iter_node(iter, l) && | |
f4b61341 | 1150 | !btree_iter_good_node(iter, l, check_pos)) { |
1c6fdbd8 | 1151 | btree_node_unlock(iter, l); |
ed8413fd | 1152 | iter->l[l].b = BTREE_ITER_NO_NODE_UP; |
1c6fdbd8 KO |
1153 | l++; |
1154 | } | |
1155 | ||
1156 | return l; | |
1157 | } | |
1158 | ||
1159 | /* | |
1160 | * This is the main state machine for walking down the btree - walks down to a | |
1161 | * specified depth | |
1162 | * | |
1163 | * Returns 0 on success, -EIO on error (error reading in a btree node). | |
1164 | * | |
1165 | * On error, caller (peek_node()/peek_key()) must return NULL; the error is | |
b7607ce9 | 1166 | * stashed in the iterator and returned from bch2_trans_exit(). |
1c6fdbd8 | 1167 | */ |
9b02d1c4 | 1168 | static int btree_iter_traverse_one(struct btree_iter *iter) |
1c6fdbd8 KO |
1169 | { |
1170 | unsigned depth_want = iter->level; | |
1171 | ||
1172 | if (unlikely(iter->level >= BTREE_MAX_DEPTH)) | |
1173 | return 0; | |
1174 | ||
f58c22e7 KO |
1175 | /* |
1176 | * if we need interior nodes locked, call btree_iter_relock() to make | |
1177 | * sure we walk back up enough that we lock them: | |
1178 | */ | |
1179 | if (iter->uptodate == BTREE_ITER_NEED_RELOCK || | |
1180 | iter->locks_want > 1) | |
1181 | bch2_btree_iter_relock(iter, false); | |
1182 | ||
1183 | if (iter->uptodate < BTREE_ITER_NEED_RELOCK) | |
1c6fdbd8 KO |
1184 | return 0; |
1185 | ||
1c6fdbd8 KO |
1186 | /* |
1187 | * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos | |
1188 | * here unnecessary | |
1189 | */ | |
f4b61341 | 1190 | iter->level = btree_iter_up_until_good_node(iter, 0); |
1c6fdbd8 KO |
1191 | |
1192 | /* | |
1193 | * If we've got a btree node locked (i.e. we aren't about to relock the | |
1194 | * root) - advance its node iterator if necessary: | |
1195 | * | |
1196 | * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary | |
1197 | */ | |
f4b61341 KO |
1198 | if (btree_iter_node(iter, iter->level)) { |
1199 | BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); | |
1200 | ||
a00fd8c5 | 1201 | btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); |
f4b61341 | 1202 | } |
1c6fdbd8 KO |
1203 | |
1204 | /* | |
1205 | * Note: iter->l[iter->level].b may be temporarily NULL here - that | |
1206 | * would indicate to other code that we got to the end of the btree, | |
1207 | * here it indicates that relocking the root failed - it's critical that | |
1208 | * btree_iter_lock_root() comes next and that it can't fail | |
1209 | */ | |
1210 | while (iter->level > depth_want) { | |
1211 | int ret = btree_iter_node(iter, iter->level) | |
1212 | ? btree_iter_down(iter) | |
1213 | : btree_iter_lock_root(iter, depth_want); | |
1214 | if (unlikely(ret)) { | |
8812600c KO |
1215 | if (ret == 1) |
1216 | return 0; | |
1217 | ||
1c6fdbd8 | 1218 | iter->level = depth_want; |
ed8413fd | 1219 | iter->l[iter->level].b = BTREE_ITER_NO_NODE_DOWN; |
1c6fdbd8 KO |
1220 | return ret; |
1221 | } | |
1222 | } | |
1223 | ||
1224 | iter->uptodate = BTREE_ITER_NEED_PEEK; | |
271a3d3a | 1225 | |
2dac0eae | 1226 | bch2_btree_iter_verify(iter); |
1c6fdbd8 KO |
1227 | return 0; |
1228 | } | |
1229 | ||
9b02d1c4 | 1230 | int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) |
1c6fdbd8 KO |
1231 | { |
1232 | int ret; | |
1233 | ||
0e6dd8fb | 1234 | ret = bch2_trans_cond_resched(iter->trans) ?: |
9b02d1c4 | 1235 | btree_iter_traverse_one(iter); |
1c6fdbd8 | 1236 | if (unlikely(ret)) |
bf7b87a4 | 1237 | ret = __btree_iter_traverse_all(iter->trans, iter, ret); |
1c6fdbd8 | 1238 | |
1c6fdbd8 KO |
1239 | return ret; |
1240 | } | |
1241 | ||
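/*
 * Return value convention for the traversal path above, as a sketch (the
 * callers and the restart machinery live outside this file): 0 means the
 * iterator is now locked and positioned at the requested level; -EINTR means
 * locks had to be dropped and the whole btree_trans should be restarted from
 * the top; other errors (e.g. -EIO from reading a btree node) are fatal for
 * this operation.
 */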
1242 | static inline void bch2_btree_iter_checks(struct btree_iter *iter, | |
1243 | enum btree_iter_type type) | |
1244 | { | |
1245 | EBUG_ON(iter->btree_id >= BTREE_ID_NR); | |
bbd8d203 | 1246 | EBUG_ON(btree_iter_type(iter) != type); |
1c6fdbd8 | 1247 | |
2e70ce56 KO |
1248 | BUG_ON(type == BTREE_ITER_KEYS && |
1249 | (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || | |
1250 | bkey_cmp(iter->pos, iter->k.p) > 0)); | |
1251 | ||
2dac0eae KO |
1252 | bch2_btree_iter_verify_locks(iter); |
1253 | bch2_btree_iter_verify_level(iter, iter->level); | |
1c6fdbd8 KO |
1254 | } |
1255 | ||
1256 | /* Iterate across nodes (leaf and interior nodes) */ | |
1257 | ||
1258 | struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) | |
1259 | { | |
1260 | struct btree *b; | |
1261 | int ret; | |
1262 | ||
1263 | bch2_btree_iter_checks(iter, BTREE_ITER_NODES); | |
1264 | ||
1265 | if (iter->uptodate == BTREE_ITER_UPTODATE) | |
1266 | return iter->l[iter->level].b; | |
1267 | ||
1268 | ret = bch2_btree_iter_traverse(iter); | |
1269 | if (ret) | |
1270 | return NULL; | |
1271 | ||
1272 | b = btree_iter_node(iter, iter->level); | |
1273 | if (!b) | |
1274 | return NULL; | |
1275 | ||
1276 | BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); | |
1277 | ||
1278 | iter->pos = b->key.k.p; | |
1279 | iter->uptodate = BTREE_ITER_UPTODATE; | |
1280 | ||
2dac0eae KO |
1281 | bch2_btree_iter_verify(iter); |
1282 | ||
1c6fdbd8 KO |
1283 | return b; |
1284 | } | |
1285 | ||
2dac0eae | 1286 | struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) |
1c6fdbd8 KO |
1287 | { |
1288 | struct btree *b; | |
1289 | int ret; | |
1290 | ||
1291 | bch2_btree_iter_checks(iter, BTREE_ITER_NODES); | |
1292 | ||
1293 | /* already got to end? */ | |
1294 | if (!btree_iter_node(iter, iter->level)) | |
1295 | return NULL; | |
1296 | ||
1dd7f9d9 KO |
1297 | bch2_trans_cond_resched(iter->trans); |
1298 | ||
1c6fdbd8 KO |
1299 | btree_iter_up(iter); |
1300 | ||
1301 | if (!bch2_btree_node_relock(iter, iter->level)) | |
1302 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); | |
1303 | ||
1304 | ret = bch2_btree_iter_traverse(iter); | |
1305 | if (ret) | |
1306 | return NULL; | |
1307 | ||
1308 | /* got to end? */ | |
1309 | b = btree_iter_node(iter, iter->level); | |
1310 | if (!b) | |
1311 | return NULL; | |
1312 | ||
1313 | if (bkey_cmp(iter->pos, b->key.k.p) < 0) { | |
1314 | /* | |
1315 | * Haven't gotten to the end of the parent node: go back down to | |
1316 | * the next child node | |
1317 | */ | |
1318 | ||
1319 | /* | |
1320 | * We don't really want to unlock here, but we can't tell | |
1321 | * btree_iter_traverse() "traverse to this level" except by | |
1322 | * setting iter->level, so we have to unlock to avoid breaking | |
1323 | * our lock invariants: | |
1324 | */ | |
1325 | if (btree_node_read_locked(iter, iter->level)) | |
1326 | btree_node_unlock(iter, iter->level); | |
1327 | ||
39fb2983 | 1328 | iter->pos = bkey_successor(iter->pos); |
2dac0eae | 1329 | iter->level = iter->min_depth; |
1c6fdbd8 KO |
1330 | |
1331 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); | |
1332 | ret = bch2_btree_iter_traverse(iter); | |
1333 | if (ret) | |
1334 | return NULL; | |
1335 | ||
1336 | b = iter->l[iter->level].b; | |
1337 | } | |
1338 | ||
1339 | iter->pos = b->key.k.p; | |
1340 | iter->uptodate = BTREE_ITER_UPTODATE; | |
1341 | ||
2dac0eae KO |
1342 | bch2_btree_iter_verify(iter); |
1343 | ||
1c6fdbd8 KO |
1344 | return b; |
1345 | } | |
1346 | ||
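/*
 * A minimal sketch of walking every node in a btree with the two functions
 * above (iterator setup and teardown helpers are assumed to exist elsewhere
 * and are not part of this file):
 *
 *	for (b = bch2_btree_iter_peek_node(iter);
 *	     b;
 *	     b = bch2_btree_iter_next_node(iter)) {
 *		// b is locked according to the iterator's locks_want;
 *		// process the node here
 *	}
 */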
1347 | /* Iterate across keys (in leaf nodes only) */ | |
1348 | ||
1349 | void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) | |
1350 | { | |
1351 | struct btree_iter_level *l = &iter->l[0]; | |
1c6fdbd8 KO |
1352 | |
1353 | EBUG_ON(iter->level != 0); | |
1354 | EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); | |
1355 | EBUG_ON(!btree_node_locked(iter, 0)); | |
1356 | EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); | |
1357 | ||
2e70ce56 KO |
1358 | bkey_init(&iter->k); |
1359 | iter->k.p = iter->pos = new_pos; | |
1c6fdbd8 KO |
1360 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); |
1361 | ||
a00fd8c5 | 1362 | btree_iter_advance_to_pos(iter, l, -1); |
1c6fdbd8 | 1363 | |
f96c0df4 KO |
1364 | /* |
1365 | * XXX: | |
1366 | * keeping a node locked that's outside (even just outside) iter->pos | |
1367 | * breaks __bch2_btree_node_lock(). This seems to only affect | |
1368 | * bch2_btree_node_get_sibling so for now it's fixed there, but we | |
1369 | * should try to get rid of this corner case. | |
1370 | * | |
1371 | * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) | |
1372 | */ | |
1373 | ||
a00fd8c5 KO |
1374 | if (bch2_btree_node_iter_end(&l->iter) && |
1375 | btree_iter_pos_after_node(iter, l->b)) | |
1c6fdbd8 | 1376 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); |
1c6fdbd8 KO |
1377 | } |
1378 | ||
2e70ce56 | 1379 | static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) |
1c6fdbd8 | 1380 | { |
2e70ce56 KO |
1381 | unsigned l = iter->level; |
1382 | ||
1383 | if (!cmp) | |
1384 | goto out; | |
1385 | ||
1386 | l = btree_iter_up_until_good_node(iter, cmp); | |
1c6fdbd8 | 1387 | |
f4b61341 | 1388 | if (btree_iter_node(iter, l)) { |
1c6fdbd8 KO |
1389 | /* |
1390 | * We might have to skip over many keys, or just a few: try | |
1391 | * advancing the node iterator, and if we would have to skip over | |
1392 | * too many keys, just reinit it. We also reinit when rewinding, | |
1393 | * since rewinding a node iterator is expensive. | |
1394 | */ | |
a00fd8c5 | 1395 | if (cmp < 0 || |
f4b61341 KO |
1396 | !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) |
1397 | __btree_iter_init(iter, l); | |
1c6fdbd8 KO |
1398 | |
1399 | /* Don't leave it locked if we're not supposed to: */ | |
f4b61341 KO |
1400 | if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) |
1401 | btree_node_unlock(iter, l); | |
1c6fdbd8 | 1402 | } |
2e70ce56 KO |
1403 | out: |
1404 | if (l != iter->level) | |
1405 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); | |
1406 | else | |
1407 | btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); | |
f4b61341 KO |
1408 | } |
1409 | ||
6a9ec828 KO |
1410 | void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, |
1411 | bool strictly_greater) | |
1412 | { | |
1413 | struct bpos old = btree_iter_search_key(iter); | |
6a9ec828 KO |
1414 | int cmp; |
1415 | ||
1416 | iter->flags &= ~BTREE_ITER_IS_EXTENTS; | |
1417 | iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0; | |
6a9ec828 | 1418 | |
2e70ce56 KO |
1419 | bkey_init(&iter->k); |
1420 | iter->k.p = iter->pos = new_pos; | |
6a9ec828 | 1421 | |
2e70ce56 | 1422 | cmp = bkey_cmp(btree_iter_search_key(iter), old); |
6a9ec828 | 1423 | |
2e70ce56 | 1424 | btree_iter_pos_changed(iter, cmp); |
6a9ec828 KO |
1425 | } |
1426 | ||
f4b61341 KO |
1427 | void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) |
1428 | { | |
1429 | int cmp = bkey_cmp(new_pos, iter->pos); | |
f4b61341 | 1430 | |
2e70ce56 KO |
1431 | bkey_init(&iter->k); |
1432 | iter->k.p = iter->pos = new_pos; | |
f4b61341 | 1433 | |
2e70ce56 | 1434 | btree_iter_pos_changed(iter, cmp); |
1c6fdbd8 KO |
1435 | } |
1436 | ||
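/*
 * For illustration: repositioning an iterator with bch2_btree_iter_set_pos()
 * is cheap when the new position is still covered by nodes the iterator
 * already has locked - btree_iter_pos_changed() only walks up as far as
 * needed and first tries to advance the node iterator in place (a few keys
 * at most) before falling back to reinitializing it or re-traversing.
 */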
f4b61341 KO |
1437 | static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) |
1438 | { | |
1439 | struct btree_iter_level *l = &iter->l[0]; | |
2e70ce56 | 1440 | bool ret; |
f4b61341 | 1441 | |
2e70ce56 KO |
1442 | bkey_init(&iter->k); |
1443 | iter->k.p = iter->pos = l->b->key.k.p; | |
f4b61341 | 1444 | |
2e70ce56 | 1445 | ret = bkey_cmp(iter->pos, POS_MAX) != 0; |
39fb2983 KO |
1446 | if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) |
1447 | iter->k.p = iter->pos = bkey_successor(iter->pos); | |
f4b61341 | 1448 | |
f4b61341 | 1449 | btree_iter_pos_changed(iter, 1); |
2e70ce56 | 1450 | return ret; |
f4b61341 KO |
1451 | } |
1452 | ||
1453 | static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) | |
1454 | { | |
1455 | struct btree_iter_level *l = &iter->l[0]; | |
2e70ce56 | 1456 | bool ret; |
f4b61341 | 1457 | |
2e70ce56 KO |
1458 | bkey_init(&iter->k); |
1459 | iter->k.p = iter->pos = l->b->data->min_key; | |
f4b61341 KO |
1460 | iter->uptodate = BTREE_ITER_NEED_TRAVERSE; |
1461 | ||
2e70ce56 | 1462 | ret = bkey_cmp(iter->pos, POS_MIN) != 0; |
39fb2983 KO |
1463 | if (ret) { |
1464 | iter->k.p = iter->pos = bkey_predecessor(iter->pos); | |
1465 | ||
1466 | if (iter->flags & BTREE_ITER_IS_EXTENTS) | |
1467 | iter->k.p = iter->pos = bkey_predecessor(iter->pos); | |
1468 | } | |
f4b61341 | 1469 | |
f4b61341 | 1470 | btree_iter_pos_changed(iter, -1); |
2e70ce56 | 1471 | return ret; |
f4b61341 KO |
1472 | } |
1473 | ||
e3ecf4f5 KO |
1474 | /** |
1475 | * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key | |
1476 | * it currently points to | |
1477 | */ | |
1c6fdbd8 KO |
1478 | static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) |
1479 | { | |
1480 | struct btree_iter_level *l = &iter->l[0]; | |
1481 | struct bkey_s_c ret = { .k = &iter->k }; | |
1482 | ||
1483 | if (!bkey_deleted(&iter->k)) { | |
059e4134 KO |
1484 | struct bkey_packed *_k = |
1485 | __bch2_btree_node_iter_peek_all(&l->iter, l->b); | |
1486 | ||
1487 | ret.v = bkeyp_val(&l->b->format, _k); | |
1488 | ||
1489 | if (debug_check_iterators(iter->trans->c)) { | |
1490 | struct bkey k = bkey_unpack_key(l->b, _k); | |
538abcb8 | 1491 | |
059e4134 KO |
1492 | BUG_ON(memcmp(&k, &iter->k, sizeof(k))); |
1493 | } | |
1494 | ||
1495 | if (debug_check_bkeys(iter->trans->c)) | |
1496 | bch2_bkey_debugcheck(iter->trans->c, l->b, ret); | |
1c6fdbd8 KO |
1497 | } |
1498 | ||
1c6fdbd8 KO |
1499 | return ret; |
1500 | } | |
1501 | ||
f4b61341 KO |
1502 | /** |
1503 | * bch2_btree_iter_peek: returns first key greater than or equal to iterator's | |
1504 | * current position | |
1505 | */ | |
1c6fdbd8 KO |
1506 | struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) |
1507 | { | |
1508 | struct btree_iter_level *l = &iter->l[0]; | |
1509 | struct bkey_s_c k; | |
1510 | int ret; | |
1511 | ||
1512 | bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); | |
1513 | ||
e3ecf4f5 KO |
1514 | if (iter->uptodate == BTREE_ITER_UPTODATE && |
1515 | !bkey_deleted(&iter->k)) | |
1c6fdbd8 KO |
1516 | return btree_iter_peek_uptodate(iter); |
1517 | ||
1518 | while (1) { | |
9b02d1c4 KO |
1519 | ret = bch2_btree_iter_traverse(iter); |
1520 | if (unlikely(ret)) | |
1521 | return bkey_s_c_err(ret); | |
1c6fdbd8 KO |
1522 | |
1523 | k = __btree_iter_peek(iter, l); | |
1524 | if (likely(k.k)) | |
1525 | break; | |
1526 | ||
f4b61341 | 1527 | if (!btree_iter_set_pos_to_next_leaf(iter)) |
1c6fdbd8 | 1528 | return bkey_s_c_null; |
1c6fdbd8 KO |
1529 | } |
1530 | ||
1531 | /* | |
1532 | * iter->pos should always be equal to the key we just | |
1533 | * returned - except extents can straddle iter->pos: | |
1534 | */ | |
1535 | if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || | |
1536 | bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) | |
1537 | iter->pos = bkey_start_pos(k.k); | |
1538 | ||
1539 | iter->uptodate = BTREE_ITER_UPTODATE; | |
2dac0eae KO |
1540 | |
1541 | bch2_btree_iter_verify_level(iter, 0); | |
1c6fdbd8 KO |
1542 | return k; |
1543 | } | |
1544 | ||
f4b61341 KO |
1545 | /** |
1546 | * bch2_btree_iter_next: returns first key greater than iterator's current | |
1547 | * position | |
1548 | */ | |
1c6fdbd8 KO |
1549 | struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) |
1550 | { | |
2e70ce56 KO |
1551 | if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) |
1552 | return bkey_s_c_null; | |
f4b61341 | 1553 | |
2e70ce56 | 1554 | bch2_btree_iter_set_pos(iter, |
39fb2983 KO |
1555 | (iter->flags & BTREE_ITER_IS_EXTENTS) |
1556 | ? iter->k.p | |
1557 | : bkey_successor(iter->k.p)); | |
0e6dd8fb | 1558 | |
2dac0eae | 1559 | return bch2_btree_iter_peek(iter); |
1c6fdbd8 KO |
1560 | } |
1561 | ||
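
/*
 * Editor's example, not part of the original file: a minimal sketch of how
 * the peek/next interface above is typically driven from a transaction.
 * bch2_trans_get_iter(), bkey_err() and BTREE_ID_EXTENTS are assumed from
 * the bcachefs headers; a real caller would also handle -EINTR restarts.
 */
static void example_forward_iteration(struct bch_fs *c)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c k;

	bch2_trans_init(&trans, c, 0, 0);

	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
				   BTREE_ITER_PREFETCH);
	if (IS_ERR(iter))
		goto out;

	for (k = bch2_btree_iter_peek(iter);		/* first key >= pos */
	     k.k && !bkey_err(k);
	     k = bch2_btree_iter_next(iter)) {		/* first key > iter->k.p */
		/* inspect k.k / k.v here */
	}

	bch2_trans_iter_put(&trans, iter);
out:
	bch2_trans_exit(&trans);
}
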
57b0b3db KO |
1562 | static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) |
1563 | { | |
1564 | struct bpos pos = btree_iter_search_key(iter); | |
1565 | struct btree_trans *trans = iter->trans; | |
1566 | struct btree_insert_entry *i; | |
1567 | ||
e3e464ac | 1568 | trans_for_each_update2(trans, i) |
57b0b3db KO |
1569 | if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: |
1570 | bkey_cmp(pos, i->k->k.p)) <= 0) | |
1571 | break; | |
1572 | ||
e3e464ac | 1573 | return i < trans->updates2 + trans->nr_updates2 && |
57b0b3db KO |
1574 | iter->btree_id == i->iter->btree_id |
1575 | ? bkey_i_to_s_c(i->k) | |
1576 | : bkey_s_c_null; | |
1577 | } | |
1578 | ||
1579 | static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) | |
1580 | { | |
1581 | struct btree_iter_level *l = &iter->l[0]; | |
1582 | struct bkey_s_c k = __btree_iter_peek(iter, l); | |
1583 | struct bkey_s_c u = __btree_trans_updates_peek(iter); | |
1584 | ||
1585 | if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) | |
1586 | return k; | |
1587 | if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { | |
1588 | iter->k = *u.k; | |
1589 | return u; | |
1590 | } | |
1591 | return bkey_s_c_null; | |
1592 | } | |
1593 | ||
1594 | struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) | |
1595 | { | |
1596 | struct bkey_s_c k; | |
1597 | int ret; | |
1598 | ||
1599 | bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); | |
1600 | ||
1601 | while (1) { | |
1602 | ret = bch2_btree_iter_traverse(iter); | |
1603 | if (unlikely(ret)) | |
1604 | return bkey_s_c_err(ret); | |
1605 | ||
1606 | k = __bch2_btree_iter_peek_with_updates(iter); | |
1607 | ||
1608 | if (k.k && bkey_deleted(k.k)) { | |
1609 | bch2_btree_iter_set_pos(iter, | |
39fb2983 KO |
1610 | (iter->flags & BTREE_ITER_IS_EXTENTS) |
1611 | ? iter->k.p | |
1612 | : bkey_successor(iter->k.p)); | |
57b0b3db KO |
1613 | continue; |
1614 | } | |
1615 | ||
1616 | if (likely(k.k)) | |
1617 | break; | |
1618 | ||
1619 | if (!btree_iter_set_pos_to_next_leaf(iter)) | |
1620 | return bkey_s_c_null; | |
1621 | } | |
1622 | ||
1623 | /* | |
1624 | * iter->pos should always be equal to the key we just | |
1625 | * returned - except extents can straddle iter->pos: | |
1626 | */ | |
1627 | if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || | |
1628 | bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) | |
1629 | iter->pos = bkey_start_pos(k.k); | |
1630 | ||
1631 | iter->uptodate = BTREE_ITER_UPTODATE; | |
1632 | return k; | |
1633 | } | |
1634 | ||
1635 | struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) | |
1636 | { | |
1637 | if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) | |
1638 | return bkey_s_c_null; | |
1639 | ||
1640 | bch2_btree_iter_set_pos(iter, | |
39fb2983 KO |
1641 | (iter->flags & BTREE_ITER_IS_EXTENTS) |
1642 | ? iter->k.p | |
1643 | : bkey_successor(iter->k.p)); | |
57b0b3db KO |
1644 | |
1645 | return bch2_btree_iter_peek_with_updates(iter); | |
1646 | } | |
1647 | ||
ccf5a109 KO |
1648 | /** |
1649 | * bch2_btree_iter_peek_prev: returns first key less than or equal to | |
1650 | * iterator's current position | |
1651 | */ | |
1652 | struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) | |
1c6fdbd8 | 1653 | { |
2e70ce56 | 1654 | struct bpos pos = iter->pos; |
1c6fdbd8 | 1655 | struct btree_iter_level *l = &iter->l[0]; |
1c6fdbd8 KO |
1656 | struct bkey_s_c k; |
1657 | int ret; | |
1658 | ||
1659 | bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); | |
1660 | ||
e3ecf4f5 KO |
1661 | if (iter->uptodate == BTREE_ITER_UPTODATE && |
1662 | !bkey_deleted(&iter->k)) | |
ccf5a109 | 1663 | return btree_iter_peek_uptodate(iter); |
1c6fdbd8 KO |
1664 | |
1665 | while (1) { | |
1c6fdbd8 KO |
1666 | ret = bch2_btree_iter_traverse(iter); |
1667 | if (unlikely(ret)) | |
1668 | return bkey_s_c_err(ret); | |
1669 | ||
ccf5a109 | 1670 | k = __btree_iter_peek(iter, l); |
2e70ce56 | 1671 | if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) |
ccf5a109 KO |
1672 | k = __btree_iter_prev(iter, l); |
1673 | ||
1674 | if (likely(k.k)) | |
1c6fdbd8 | 1675 | break; |
1c6fdbd8 | 1676 | |
ccf5a109 KO |
1677 | if (!btree_iter_set_pos_to_prev_leaf(iter)) |
1678 | return bkey_s_c_null; | |
1679 | } | |
1c6fdbd8 | 1680 | |
2e70ce56 | 1681 | EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); |
1c6fdbd8 KO |
1682 | iter->pos = bkey_start_pos(k.k); |
1683 | iter->uptodate = BTREE_ITER_UPTODATE; | |
1684 | return k; | |
1685 | } | |
1686 | ||
ccf5a109 KO |
1687 | /** |
1688 | * bch2_btree_iter_prev: returns first key less than iterator's current | |
1689 | * position | |
1690 | */ | |
1691 | struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) | |
1692 | { | |
2e70ce56 | 1693 | struct bpos pos = bkey_start_pos(&iter->k); |
ccf5a109 KO |
1694 | |
1695 | bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); | |
1696 | ||
2e70ce56 KO |
1697 | if (unlikely(!bkey_cmp(pos, POS_MIN))) |
1698 | return bkey_s_c_null; | |
ccf5a109 | 1699 | |
2e70ce56 | 1700 | bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); |
ccf5a109 | 1701 | |
2e70ce56 | 1702 | return bch2_btree_iter_peek_prev(iter); |
ccf5a109 KO |
1703 | } |
1704 | ||
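
/*
 * Editor's example, not part of the original file: walking keys in
 * descending order with the peek_prev/prev interface above.  Assumes a
 * BTREE_ITER_KEYS iterator and the bkey_err() helper from the headers;
 * -EINTR restarts are not handled in this sketch.
 */
static void example_backward_iteration(struct btree_iter *iter)
{
	struct bkey_s_c k;

	bch2_btree_iter_set_pos(iter, POS_MAX);

	for (k = bch2_btree_iter_peek_prev(iter);	/* first key <= POS_MAX */
	     k.k && !bkey_err(k);
	     k = bch2_btree_iter_prev(iter)) {		/* next key, descending */
		/* keys are visited from the end of the btree backwards */
	}
}
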
1c6fdbd8 | 1705 | static inline struct bkey_s_c |
271a3d3a | 1706 | __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) |
1c6fdbd8 KO |
1707 | { |
1708 | struct btree_iter_level *l = &iter->l[0]; | |
271a3d3a | 1709 | struct btree_node_iter node_iter; |
1c6fdbd8 KO |
1710 | struct bkey_s_c k; |
1711 | struct bkey n; | |
1712 | int ret; | |
1713 | ||
c3801239 KO |
1714 | /* keys & holes can't span inode numbers: */ |
1715 | if (iter->pos.offset == KEY_OFFSET_MAX) { | |
1716 | if (iter->pos.inode == KEY_INODE_MAX) | |
1717 | return bkey_s_c_null; | |
1718 | ||
1719 | bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); | |
1720 | ||
1721 | ret = bch2_btree_iter_traverse(iter); | |
1722 | if (unlikely(ret)) | |
1723 | return bkey_s_c_err(ret); | |
1724 | } | |
1c6fdbd8 | 1725 | |
271a3d3a KO |
1726 | /* |
1727 | * iterator is now at the correct position for inserting at iter->pos, | |
1728 | * but we need to keep iterating until we find the first non-whiteout so |
1729 | * we know how big a hole we have, if any: | |
1730 | */ | |
1731 | ||
1732 | node_iter = l->iter; | |
9626aeb1 KO |
1733 | k = __btree_iter_unpack(iter, l, &iter->k, |
1734 | bch2_btree_node_iter_peek(&node_iter, l->b)); | |
1735 | ||
1736 | if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { | |
1737 | /* | |
c3801239 KO |
1738 | * We're not setting iter->uptodate because the node iterator |
1739 | * doesn't necessarily point at the key we're returning: | |
9626aeb1 | 1740 | */ |
9626aeb1 KO |
1741 | |
1742 | EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); | |
2dac0eae | 1743 | bch2_btree_iter_verify_level(iter, 0); |
9626aeb1 KO |
1744 | return k; |
1745 | } | |
271a3d3a | 1746 | |
1c6fdbd8 | 1747 | /* hole */ |
271a3d3a | 1748 | |
271a3d3a KO |
1749 | if (!k.k) |
1750 | k.k = &l->b->key.k; | |
1751 | ||
1c6fdbd8 KO |
1752 | bkey_init(&n); |
1753 | n.p = iter->pos; | |
271a3d3a KO |
1754 | bch2_key_resize(&n, |
1755 | min_t(u64, KEY_SIZE_MAX, | |
1756 | (k.k->p.inode == n.p.inode | |
1757 | ? bkey_start_offset(k.k) | |
1758 | : KEY_OFFSET_MAX) - | |
1759 | n.p.offset)); | |
1760 | ||
319f9ac3 | 1761 | EBUG_ON(!n.size); |
1c6fdbd8 | 1762 | |
271a3d3a KO |
1763 | iter->k = n; |
1764 | iter->uptodate = BTREE_ITER_UPTODATE; | |
63f1a598 | 1765 | |
2dac0eae | 1766 | bch2_btree_iter_verify_level(iter, 0); |
271a3d3a KO |
1767 | return (struct bkey_s_c) { &iter->k, NULL }; |
1768 | } | |
1c6fdbd8 | 1769 | |
c3801239 | 1770 | struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) |
271a3d3a KO |
1771 | { |
1772 | struct btree_iter_level *l = &iter->l[0]; | |
1773 | struct bkey_s_c k; | |
c3801239 KO |
1774 | int ret; |
1775 | ||
1776 | bch2_btree_iter_checks(iter, BTREE_ITER_KEYS); | |
1777 | ||
1778 | if (iter->uptodate == BTREE_ITER_UPTODATE) | |
1779 | return btree_iter_peek_uptodate(iter); | |
1780 | ||
1781 | ret = bch2_btree_iter_traverse(iter); | |
1782 | if (unlikely(ret)) | |
1783 | return bkey_s_c_err(ret); | |
1c6fdbd8 | 1784 | |
271a3d3a KO |
1785 | if (iter->flags & BTREE_ITER_IS_EXTENTS) |
1786 | return __bch2_btree_iter_peek_slot_extents(iter); | |
1c6fdbd8 | 1787 | |
e3ecf4f5 | 1788 | k = __btree_iter_peek_all(iter, l, &iter->k); |
1c6fdbd8 | 1789 | |
e3ecf4f5 | 1790 | EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); |
1c6fdbd8 | 1791 | |
e3ecf4f5 | 1792 | if (!k.k || bkey_cmp(iter->pos, k.k->p)) { |
271a3d3a KO |
1793 | /* hole */ |
1794 | bkey_init(&iter->k); | |
1795 | iter->k.p = iter->pos; | |
63f1a598 | 1796 | k = (struct bkey_s_c) { &iter->k, NULL }; |
271a3d3a | 1797 | } |
63f1a598 KO |
1798 | |
1799 | iter->uptodate = BTREE_ITER_UPTODATE; | |
2dac0eae | 1800 | bch2_btree_iter_verify_level(iter, 0); |
63f1a598 | 1801 | return k; |
1c6fdbd8 KO |
1802 | } |
1803 | ||
1c6fdbd8 KO |
1804 | struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) |
1805 | { | |
2e70ce56 KO |
1806 | if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) |
1807 | return bkey_s_c_null; | |
1c6fdbd8 | 1808 | |
2dac0eae | 1809 | bch2_btree_iter_set_pos(iter, |
39fb2983 KO |
1810 | (iter->flags & BTREE_ITER_IS_EXTENTS) |
1811 | ? iter->k.p | |
1812 | : bkey_successor(iter->k.p)); | |
1c6fdbd8 | 1813 | |
2dac0eae | 1814 | return bch2_btree_iter_peek_slot(iter); |
1c6fdbd8 KO |
1815 | } |
1816 | ||
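
/*
 * Editor's example, not part of the original file: with BTREE_ITER_SLOTS,
 * peek_slot()/next_slot() return a key for every position; holes come back
 * as deleted keys whose size (for extents) spans the gap.  Identifier names
 * such as BTREE_ID_EXTENTS and bkey_err() are assumed from the headers;
 * sketch only, no -EINTR handling.
 */
static void example_slot_iteration(struct btree_trans *trans)
{
	struct btree_iter *iter;
	struct bkey_s_c k;

	iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, POS_MIN,
				   BTREE_ITER_SLOTS);
	if (IS_ERR(iter))
		return;

	for (k = bch2_btree_iter_peek_slot(iter);
	     k.k && !bkey_err(k);
	     k = bch2_btree_iter_next_slot(iter)) {
		if (bkey_deleted(k.k)) {
			/* a hole: k.k->p and k.k->size describe the empty range */
		}
	}

	bch2_trans_iter_put(trans, iter);
}
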
9e5e5b9e KO |
1817 | static inline void bch2_btree_iter_init(struct btree_trans *trans, |
1818 | struct btree_iter *iter, enum btree_id btree_id, | |
424eb881 | 1819 | struct bpos pos, unsigned flags) |
1c6fdbd8 | 1820 | { |
9e5e5b9e | 1821 | struct bch_fs *c = trans->c; |
1c6fdbd8 KO |
1822 | unsigned i; |
1823 | ||
ab5c63f5 | 1824 | if (btree_node_type_is_extents(btree_id) && |
424eb881 KO |
1825 | !(flags & BTREE_ITER_NODES)) |
1826 | flags |= BTREE_ITER_IS_EXTENTS; | |
1c6fdbd8 | 1827 | |
9e5e5b9e | 1828 | iter->trans = trans; |
1c6fdbd8 KO |
1829 | iter->pos = pos; |
1830 | bkey_init(&iter->k); | |
1831 | iter->k.p = pos; | |
1832 | iter->flags = flags; | |
1833 | iter->uptodate = BTREE_ITER_NEED_TRAVERSE; | |
1834 | iter->btree_id = btree_id; | |
424eb881 | 1835 | iter->level = 0; |
2dac0eae | 1836 | iter->min_depth = 0; |
424eb881 | 1837 | iter->locks_want = flags & BTREE_ITER_INTENT ? 1 : 0; |
1c6fdbd8 KO |
1838 | iter->nodes_locked = 0; |
1839 | iter->nodes_intent_locked = 0; | |
1840 | for (i = 0; i < ARRAY_SIZE(iter->l); i++) | |
b606c8aa | 1841 | iter->l[i].b = BTREE_ITER_NO_NODE_INIT; |
1c6fdbd8 KO |
1842 | |
1843 | prefetch(c->btree_roots[btree_id].b); | |
1844 | } | |
1845 | ||
1c6fdbd8 KO |
1846 | /* new transactional stuff: */ |
1847 | ||
5df4be3f KO |
1848 | static inline void __bch2_trans_iter_free(struct btree_trans *trans, |
1849 | unsigned idx) | |
1850 | { | |
ecc892e4 | 1851 | __bch2_btree_iter_unlock(&trans->iters[idx]); |
5df4be3f KO |
1852 | trans->iters_linked &= ~(1ULL << idx); |
1853 | trans->iters_live &= ~(1ULL << idx); | |
1854 | trans->iters_touched &= ~(1ULL << idx); | |
5df4be3f KO |
1855 | } |
1856 | ||
64bc0011 KO |
1857 | int bch2_trans_iter_put(struct btree_trans *trans, |
1858 | struct btree_iter *iter) | |
1c6fdbd8 | 1859 | { |
8b53852d KO |
1860 | int ret; |
1861 | ||
1862 | if (IS_ERR_OR_NULL(iter)) | |
1863 | return 0; | |
1864 | ||
163e885a KO |
1865 | BUG_ON(trans->iters + iter->idx != iter); |
1866 | ||
8b53852d | 1867 | ret = btree_iter_err(iter); |
424eb881 | 1868 | |
64bc0011 KO |
1869 | if (!(trans->iters_touched & (1ULL << iter->idx)) && |
1870 | !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) | |
1871 | __bch2_trans_iter_free(trans, iter->idx); | |
1872 | ||
1873 | trans->iters_live &= ~(1ULL << iter->idx); | |
424eb881 | 1874 | return ret; |
5df4be3f | 1875 | } |
1c6fdbd8 | 1876 | |
64bc0011 KO |
1877 | int bch2_trans_iter_free(struct btree_trans *trans, |
1878 | struct btree_iter *iter) | |
5df4be3f | 1879 | { |
8b53852d KO |
1880 | if (IS_ERR_OR_NULL(iter)) |
1881 | return 0; | |
1882 | ||
64bc0011 | 1883 | trans->iters_touched &= ~(1ULL << iter->idx); |
424eb881 | 1884 | |
64bc0011 | 1885 | return bch2_trans_iter_put(trans, iter); |
1c6fdbd8 KO |
1886 | } |
1887 | ||
20bceecb | 1888 | static int bch2_trans_realloc_iters(struct btree_trans *trans, |
7d825866 | 1889 | unsigned new_size) |
1c6fdbd8 | 1890 | { |
e3e464ac | 1891 | void *p, *new_iters, *new_updates, *new_updates2; |
36e9d698 KO |
1892 | size_t iters_bytes; |
1893 | size_t updates_bytes; | |
1c6fdbd8 | 1894 | |
7d825866 KO |
1895 | new_size = roundup_pow_of_two(new_size); |
1896 | ||
a8e00bd4 KO |
1897 | BUG_ON(new_size > BTREE_ITER_MAX); |
1898 | ||
1899 | if (new_size <= trans->size) | |
1900 | return 0; | |
1901 | ||
1902 | BUG_ON(trans->used_mempool); | |
1903 | ||
1c6fdbd8 KO |
1904 | bch2_trans_unlock(trans); |
1905 | ||
36e9d698 | 1906 | iters_bytes = sizeof(struct btree_iter) * new_size; |
54e86b58 | 1907 | updates_bytes = sizeof(struct btree_insert_entry) * new_size; |
36e9d698 | 1908 | |
e3e464ac KO |
1909 | p = kmalloc(iters_bytes + |
1910 | updates_bytes + | |
1911 | updates_bytes, GFP_NOFS); /* iters + updates + updates2 */ |
1912 | if (p) | |
a8e00bd4 KO |
1913 | goto success; |
1914 | ||
e3e464ac | 1915 | p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); |
a8e00bd4 KO |
1916 | new_size = BTREE_ITER_MAX; |
1917 | ||
1918 | trans->used_mempool = true; | |
1919 | success: | |
e3e464ac KO |
1920 | new_iters = p; p += iters_bytes; |
1921 | new_updates = p; p += updates_bytes; | |
1922 | new_updates2 = p; p += updates_bytes; | |
1c6fdbd8 KO |
1923 | |
1924 | memcpy(new_iters, trans->iters, | |
1925 | sizeof(struct btree_iter) * trans->nr_iters); | |
a8e00bd4 KO |
1926 | memcpy(new_updates, trans->updates, |
1927 | sizeof(struct btree_insert_entry) * trans->nr_updates); | |
e3e464ac KO |
1928 | memcpy(new_updates2, trans->updates2, |
1929 | sizeof(struct btree_insert_entry) * trans->nr_updates2); | |
a8e00bd4 | 1930 | |
5df4be3f KO |
1931 | if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) |
1932 | memset(trans->iters, POISON_FREE, | |
1933 | sizeof(struct btree_iter) * trans->nr_iters + | |
1934 | sizeof(struct btree_insert_entry) * trans->nr_iters); | |
1935 | ||
a8e00bd4 KO |
1936 | if (trans->iters != trans->iters_onstack) |
1937 | kfree(trans->iters); | |
1938 | ||
36e9d698 KO |
1939 | trans->iters = new_iters; |
1940 | trans->updates = new_updates; | |
e3e464ac | 1941 | trans->updates2 = new_updates2; |
36e9d698 | 1942 | trans->size = new_size; |
1c6fdbd8 | 1943 | |
1c7a0adf | 1944 | if (trans->iters_live) { |
20bceecb | 1945 | trace_trans_restart_iters_realloced(trans->ip, trans->size); |
1c7a0adf KO |
1946 | return -EINTR; |
1947 | } | |
1948 | ||
1949 | return 0; | |
1c6fdbd8 KO |
1950 | } |
1951 | ||
bbd8d203 | 1952 | static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) |
7c26ecae | 1953 | { |
4c1c1e39 | 1954 | unsigned idx = __ffs64(~trans->iters_linked); |
7c26ecae KO |
1955 | |
1956 | if (idx < trans->nr_iters) | |
1957 | goto got_slot; | |
1958 | ||
1959 | if (trans->nr_iters == trans->size) { | |
64bc0011 KO |
1960 | int ret; |
1961 | ||
1962 | if (trans->nr_iters >= BTREE_ITER_MAX) { | |
1963 | struct btree_iter *iter; | |
1964 | ||
1965 | trans_for_each_iter(trans, iter) { | |
495aabed | 1966 | pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", |
64bc0011 KO |
1967 | bch2_btree_ids[iter->btree_id], |
1968 | iter->pos.inode, | |
1969 | iter->pos.offset, | |
1970 | (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", | |
1971 | (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", | |
0329b150 KO |
1972 | iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", |
1973 | (void *) iter->ip_allocated); | |
64bc0011 KO |
1974 | } |
1975 | ||
1976 | panic("trans iter overflow\n"); |
1977 | } | |
1978 | ||
1979 | ret = bch2_trans_realloc_iters(trans, trans->size * 2); | |
7c26ecae | 1980 | if (ret) |
bbd8d203 | 1981 | return ERR_PTR(ret); |
7c26ecae KO |
1982 | } |
1983 | ||
1984 | idx = trans->nr_iters++; | |
1985 | BUG_ON(trans->nr_iters > trans->size); | |
e1120a4c KO |
1986 | |
1987 | trans->iters[idx].idx = idx; | |
7c26ecae | 1988 | got_slot: |
7c26ecae | 1989 | BUG_ON(trans->iters_linked & (1ULL << idx)); |
7c26ecae | 1990 | trans->iters_linked |= 1ULL << idx; |
24326cd1 | 1991 | trans->iters[idx].flags = 0; |
bbd8d203 | 1992 | return &trans->iters[idx]; |
7c26ecae KO |
1993 | } |
1994 | ||
64bc0011 KO |
1995 | static inline void btree_iter_copy(struct btree_iter *dst, |
1996 | struct btree_iter *src) | |
1997 | { | |
1998 | unsigned i, idx = dst->idx; | |
1999 | ||
2000 | *dst = *src; | |
2001 | dst->idx = idx; | |
2002 | ||
2003 | for (i = 0; i < BTREE_MAX_DEPTH; i++) | |
2004 | if (btree_node_locked(dst, i)) | |
2005 | six_lock_increment(&dst->l[i].b->c.lock, | |
2006 | __btree_lock_want(dst, i)); | |
24326cd1 KO |
2007 | |
2008 | dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; | |
2009 | dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; | |
64bc0011 KO |
2010 | } |
2011 | ||
2012 | static inline struct bpos bpos_diff(struct bpos l, struct bpos r) | |
2013 | { | |
2014 | if (bkey_cmp(l, r) > 0) | |
2015 | swap(l, r); | |
2016 | ||
2017 | return POS(r.inode - l.inode, r.offset - l.offset); | |
2018 | } | |
2019 | ||
1c6fdbd8 | 2020 | static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, |
5df4be3f | 2021 | unsigned btree_id, struct bpos pos, |
64bc0011 | 2022 | unsigned flags) |
1c6fdbd8 | 2023 | { |
64bc0011 | 2024 | struct btree_iter *iter, *best = NULL; |
1c6fdbd8 KO |
2025 | |
2026 | BUG_ON(trans->nr_iters > BTREE_ITER_MAX); | |
2027 | ||
64bc0011 KO |
2028 | trans_for_each_iter(trans, iter) { |
2029 | if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) | |
2030 | continue; | |
bbd8d203 | 2031 | |
64bc0011 KO |
2032 | if (iter->btree_id != btree_id) |
2033 | continue; | |
2034 | ||
2035 | if (best && | |
2036 | bkey_cmp(bpos_diff(best->pos, pos), | |
2037 | bpos_diff(iter->pos, pos)) < 0) | |
2038 | continue; | |
2039 | ||
2040 | best = iter; | |
2041 | } | |
2042 | ||
2043 | if (!best) { | |
bbd8d203 KO |
2044 | iter = btree_trans_iter_alloc(trans); |
2045 | if (IS_ERR(iter)) | |
2046 | return iter; | |
1c6fdbd8 | 2047 | |
9e5e5b9e | 2048 | bch2_btree_iter_init(trans, iter, btree_id, pos, flags); |
64bc0011 KO |
2049 | } else if ((trans->iters_live & (1ULL << best->idx)) || |
2050 | (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { | |
2051 | iter = btree_trans_iter_alloc(trans); | |
2052 | if (IS_ERR(iter)) | |
2053 | return iter; | |
1904a65a | 2054 | |
64bc0011 KO |
2055 | btree_iter_copy(iter, best); |
2056 | } else { | |
2057 | iter = best; | |
1c6fdbd8 KO |
2058 | } |
2059 | ||
64bc0011 KO |
2060 | iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); |
2061 | iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH); | |
2062 | ||
2063 | if (iter->flags & BTREE_ITER_INTENT) | |
2064 | bch2_btree_iter_upgrade(iter, 1); | |
2065 | else | |
2066 | bch2_btree_iter_downgrade(iter); | |
2067 | ||
5df4be3f | 2068 | BUG_ON(iter->btree_id != btree_id); |
64bc0011 KO |
2069 | BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); |
2070 | BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); | |
24326cd1 | 2071 | BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); |
bbd8d203 | 2072 | BUG_ON(trans->iters_live & (1ULL << iter->idx)); |
64bc0011 | 2073 | |
bbd8d203 KO |
2074 | trans->iters_live |= 1ULL << iter->idx; |
2075 | trans->iters_touched |= 1ULL << iter->idx; | |
1c6fdbd8 | 2076 | |
1c6fdbd8 KO |
2077 | return iter; |
2078 | } | |
2079 | ||
0329b150 KO |
2080 | struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, |
2081 | enum btree_id btree_id, | |
2082 | struct bpos pos, unsigned flags) | |
1c6fdbd8 KO |
2083 | { |
2084 | struct btree_iter *iter = | |
64bc0011 | 2085 | __btree_trans_get_iter(trans, btree_id, pos, flags); |
1c6fdbd8 KO |
2086 | |
2087 | if (!IS_ERR(iter)) | |
6a9ec828 KO |
2088 | __bch2_btree_iter_set_pos(iter, pos, |
2089 | btree_node_type_is_extents(btree_id)); | |
1c6fdbd8 KO |
2090 | return iter; |
2091 | } | |
2092 | ||
424eb881 KO |
2093 | struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, |
2094 | enum btree_id btree_id, | |
2095 | struct bpos pos, | |
2096 | unsigned locks_want, | |
2097 | unsigned depth, | |
2098 | unsigned flags) | |
2099 | { | |
2100 | struct btree_iter *iter = | |
2101 | __btree_trans_get_iter(trans, btree_id, pos, | |
64bc0011 | 2102 | flags|BTREE_ITER_NODES); |
424eb881 KO |
2103 | unsigned i; |
2104 | ||
2105 | BUG_ON(IS_ERR(iter)); | |
2106 | BUG_ON(bkey_cmp(iter->pos, pos)); | |
2107 | ||
2108 | iter->locks_want = locks_want; | |
2109 | iter->level = depth; | |
2dac0eae | 2110 | iter->min_depth = depth; |
424eb881 KO |
2111 | |
2112 | for (i = 0; i < ARRAY_SIZE(iter->l); i++) | |
2113 | iter->l[i].b = NULL; | |
ed8413fd | 2114 | iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; |
424eb881 KO |
2115 | |
2116 | return iter; | |
2117 | } | |
2118 | ||
0329b150 | 2119 | struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, |
7c26ecae | 2120 | struct btree_iter *src) |
1c6fdbd8 | 2121 | { |
ecc892e4 | 2122 | struct btree_iter *iter; |
1c6fdbd8 | 2123 | |
bbd8d203 KO |
2124 | iter = btree_trans_iter_alloc(trans); |
2125 | if (IS_ERR(iter)) | |
2126 | return iter; | |
2127 | ||
64bc0011 | 2128 | btree_iter_copy(iter, src); |
7c26ecae | 2129 | |
64bc0011 KO |
2130 | trans->iters_live |= 1ULL << iter->idx; |
2131 | /* | |
8b53852d KO |
2132 | * We don't need to preserve this iter since it's cheap to copy it |
2133 | * again - this will cause trans_iter_put() to free it right away: | |
64bc0011 KO |
2134 | */ |
2135 | trans->iters_touched &= ~(1ULL << iter->idx); | |
7c26ecae | 2136 | |
bbd8d203 | 2137 | return iter; |
1c6fdbd8 KO |
2138 | } |
2139 | ||
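
/*
 * Editor's example, not part of the original file: copying an iterator lets
 * a caller look ahead without moving the original.  The
 * bch2_trans_copy_iter() wrapper is assumed from btree_iter.h.
 */
static void example_peek_ahead(struct btree_trans *trans,
			       struct btree_iter *iter)
{
	struct btree_iter *copy = bch2_trans_copy_iter(trans, iter);
	struct bkey_s_c k;

	if (IS_ERR(copy))
		return;

	k = bch2_btree_iter_next(copy);		/* advances only the copy */
	(void) k;

	/* the copy was not marked touched, so this frees it immediately: */
	bch2_trans_iter_put(trans, copy);
}
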
20bceecb | 2140 | static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) |
1c6fdbd8 | 2141 | { |
20bceecb | 2142 | if (size > trans->mem_bytes) { |
1c6fdbd8 | 2143 | size_t old_bytes = trans->mem_bytes; |
20bceecb | 2144 | size_t new_bytes = roundup_pow_of_two(size); |
1c6fdbd8 KO |
2145 | void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); |
2146 | ||
2147 | if (!new_mem) | |
20bceecb | 2148 | return -ENOMEM; |
1c6fdbd8 KO |
2149 | |
2150 | trans->mem = new_mem; | |
2151 | trans->mem_bytes = new_bytes; | |
2152 | ||
1c7a0adf | 2153 | if (old_bytes) { |
20bceecb KO |
2154 | trace_trans_restart_mem_realloced(trans->ip, new_bytes); |
2155 | return -EINTR; | |
1c7a0adf | 2156 | } |
1c6fdbd8 KO |
2157 | } |
2158 | ||
20bceecb KO |
2159 | return 0; |
2160 | } | |
2161 | ||
2162 | void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) | |
2163 | { | |
2164 | void *p; | |
2165 | int ret; | |
2166 | ||
2167 | ret = bch2_trans_preload_mem(trans, trans->mem_top + size); | |
2168 | if (ret) | |
2169 | return ERR_PTR(ret); | |
2170 | ||
2171 | p = trans->mem + trans->mem_top; | |
1c6fdbd8 | 2172 | trans->mem_top += size; |
20bceecb | 2173 | return p; |
1c6fdbd8 KO |
2174 | } |
2175 | ||
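
/*
 * Editor's example, not part of the original file: bch2_trans_kmalloc()
 * returns memory that lives until the transaction is reset, which is how
 * callers typically build a bkey_i that must survive the current stack
 * frame.  struct bkey_i and bkey_init() come from the bcachefs headers.
 */
static struct bkey_i *example_alloc_key(struct btree_trans *trans,
					struct bpos pos)
{
	struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));

	if (IS_ERR(k))
		return k;	/* may be -EINTR if trans->mem was just grown */

	bkey_init(&k->k);
	k->k.p = pos;		/* caller fills in type/value as needed */
	return k;
}
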
64bc0011 | 2176 | inline void bch2_trans_unlink_iters(struct btree_trans *trans) |
5df4be3f | 2177 | { |
64bc0011 KO |
2178 | u64 iters = trans->iters_linked & |
2179 | ~trans->iters_touched & | |
2180 | ~trans->iters_live; | |
5df4be3f KO |
2181 | |
2182 | while (iters) { | |
2183 | unsigned idx = __ffs64(iters); | |
2184 | ||
2185 | iters &= ~(1ULL << idx); | |
2186 | __bch2_trans_iter_free(trans, idx); | |
2187 | } | |
2188 | } | |
2189 | ||
64bc0011 | 2190 | void bch2_trans_reset(struct btree_trans *trans, unsigned flags) |
1c6fdbd8 | 2191 | { |
64bc0011 | 2192 | struct btree_iter *iter; |
1c6fdbd8 | 2193 | |
64bc0011 | 2194 | trans_for_each_iter(trans, iter) |
24326cd1 KO |
2195 | iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| |
2196 | BTREE_ITER_SET_POS_AFTER_COMMIT); | |
a8e00bd4 | 2197 | |
64bc0011 | 2198 | bch2_trans_unlink_iters(trans); |
a8e00bd4 | 2199 | |
64bc0011 | 2200 | trans->iters_touched &= trans->iters_live; |
1c6fdbd8 | 2201 | |
24326cd1 | 2202 | trans->need_reset = 0; |
5df4be3f | 2203 | trans->nr_updates = 0; |
e3e464ac | 2204 | trans->nr_updates2 = 0; |
163e885a | 2205 | trans->mem_top = 0; |
bf7b87a4 | 2206 | |
96e2aa1b KO |
2207 | trans->extra_journal_entries = NULL; |
2208 | trans->extra_journal_entry_u64s = 0; | |
2209 | ||
f21539a5 KO |
2210 | if (trans->fs_usage_deltas) { |
2211 | trans->fs_usage_deltas->used = 0; | |
2212 | memset((void *) trans->fs_usage_deltas + | |
2213 | offsetof(struct replicas_delta_list, memset_start), 0, | |
2214 | (void *) &trans->fs_usage_deltas->memset_end - | |
2215 | (void *) &trans->fs_usage_deltas->memset_start); | |
2216 | } | |
2217 | ||
2218 | if (!(flags & TRANS_RESET_NOTRAVERSE)) | |
2219 | bch2_btree_iter_traverse_all(trans); | |
1c6fdbd8 KO |
2220 | } |
2221 | ||
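
/*
 * Editor's example, not part of the original file: the usual retry pattern
 * around a transaction.  bch2_trans_commit()'s signature and the -EINTR
 * convention are assumed from btree_update.h; do_updates() is a purely
 * hypothetical stand-in for the caller's real work.
 */
static int example_retry_loop(struct btree_trans *trans,
			      int (*do_updates)(struct btree_trans *))
{
	int ret;

	do {
		/* drop pending updates, retraverse iterators: */
		bch2_trans_reset(trans, 0);

		ret = do_updates(trans) ?:
		      bch2_trans_commit(trans, NULL, NULL, 0);
	} while (ret == -EINTR);

	return ret;
}
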
20bceecb KO |
2222 | void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, |
2223 | unsigned expected_nr_iters, | |
2224 | size_t expected_mem_bytes) | |
1c6fdbd8 | 2225 | { |
a8e00bd4 KO |
2226 | memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); |
2227 | ||
163e885a KO |
2228 | /* |
2229 | * reallocating iterators currently completely breaks | |
2230 | * bch2_trans_iter_put(): | |
2231 | */ | |
2232 | expected_nr_iters = BTREE_ITER_MAX; | |
2233 | ||
1c6fdbd8 | 2234 | trans->c = c; |
ba5c6557 | 2235 | trans->ip = _RET_IP_; |
a8e00bd4 | 2236 | trans->size = ARRAY_SIZE(trans->iters_onstack); |
1c6fdbd8 | 2237 | trans->iters = trans->iters_onstack; |
a8e00bd4 | 2238 | trans->updates = trans->updates_onstack; |
e3e464ac | 2239 | trans->updates2 = trans->updates2_onstack; |
3838be78 | 2240 | trans->fs_usage_deltas = NULL; |
20bceecb KO |
2241 | |
2242 | if (expected_nr_iters > trans->size) | |
2243 | bch2_trans_realloc_iters(trans, expected_nr_iters); | |
2244 | ||
2245 | if (expected_mem_bytes) | |
2246 | bch2_trans_preload_mem(trans, expected_mem_bytes); | |
495aabed KO |
2247 | |
2248 | #ifdef CONFIG_BCACHEFS_DEBUG | |
f96c0df4 | 2249 | trans->pid = current->pid; |
495aabed KO |
2250 | mutex_lock(&c->btree_trans_lock); |
2251 | list_add(&trans->list, &c->btree_trans_list); | |
2252 | mutex_unlock(&c->btree_trans_lock); | |
2253 | #endif | |
1c6fdbd8 KO |
2254 | } |
2255 | ||
2256 | int bch2_trans_exit(struct btree_trans *trans) | |
2257 | { | |
ece254b2 | 2258 | bch2_trans_unlock(trans); |
1c6fdbd8 | 2259 | |
495aabed KO |
2260 | #ifdef CONFIG_BCACHEFS_DEBUG |
2261 | mutex_lock(&trans->c->btree_trans_lock); | |
2262 | list_del(&trans->list); | |
2263 | mutex_unlock(&trans->c->btree_trans_lock); | |
2264 | #endif | |
2265 | ||
3838be78 | 2266 | kfree(trans->fs_usage_deltas); |
1c6fdbd8 | 2267 | kfree(trans->mem); |
a8e00bd4 | 2268 | if (trans->used_mempool) |
581edb63 | 2269 | mempool_free(trans->iters, &trans->c->btree_iters_pool); |
a8e00bd4 KO |
2270 | else if (trans->iters != trans->iters_onstack) |
2271 | kfree(trans->iters); | |
1c6fdbd8 KO |
2272 | trans->mem = (void *) 0x1; |
2273 | trans->iters = (void *) 0x1; | |
ece254b2 KO |
2274 | |
2275 | return trans->error ? -EIO : 0; | |
1c6fdbd8 | 2276 | } |
36e9d698 | 2277 | |
495aabed KO |
2278 | void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) |
2279 | { | |
2280 | #ifdef CONFIG_BCACHEFS_DEBUG | |
2281 | struct btree_trans *trans; | |
2282 | struct btree_iter *iter; | |
2283 | struct btree *b; | |
2284 | unsigned l; | |
2285 | ||
2286 | mutex_lock(&c->btree_trans_lock); | |
2287 | list_for_each_entry(trans, &c->btree_trans_list, list) { | |
515282ac | 2288 | pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); |
495aabed KO |
2289 | |
2290 | trans_for_each_iter(trans, iter) { | |
2291 | if (!iter->nodes_locked) | |
2292 | continue; | |
2293 | ||
515282ac KO |
2294 | pr_buf(out, " iter %u %s:", |
2295 | iter->idx, | |
2296 | bch2_btree_ids[iter->btree_id]); | |
495aabed KO |
2297 | bch2_bpos_to_text(out, iter->pos); |
2298 | pr_buf(out, "\n"); | |
2299 | ||
2300 | for (l = 0; l < BTREE_MAX_DEPTH; l++) { | |
2301 | if (btree_node_locked(iter, l)) { | |
2302 | b = iter->l[l].b; | |
2303 | ||
515282ac KO |
2304 | pr_buf(out, " %px %s l=%u ", |
2305 | b, btree_node_intent_locked(iter, l) ? "i" : "r", l); | |
495aabed KO |
2306 | bch2_bpos_to_text(out, b->key.k.p); |
2307 | pr_buf(out, "\n"); | |
2308 | } | |
2309 | } | |
2310 | } | |
2311 | ||
2312 | b = READ_ONCE(trans->locking); | |
2313 | if (b) { | |
515282ac KO |
2314 | pr_buf(out, " locking iter %u l=%u %s:", |
2315 | trans->locking_iter_idx, | |
2316 | trans->locking_level, | |
2317 | bch2_btree_ids[trans->locking_btree_id]); | |
2318 | bch2_bpos_to_text(out, trans->locking_pos); | |
2319 | ||
2320 | pr_buf(out, " node %px l=%u %s:", | |
495aabed KO |
2321 | b, b->c.level, |
2322 | bch2_btree_ids[b->c.btree_id]); | |
2323 | bch2_bpos_to_text(out, b->key.k.p); | |
2324 | pr_buf(out, "\n"); | |
2325 | } | |
2326 | } | |
2327 | mutex_unlock(&c->btree_trans_lock); | |
2328 | #endif | |
2329 | } | |
2330 | ||
36e9d698 KO |
2331 | void bch2_fs_btree_iter_exit(struct bch_fs *c) |
2332 | { | |
2333 | mempool_exit(&c->btree_iters_pool); | |
2334 | } | |
2335 | ||
2336 | int bch2_fs_btree_iter_init(struct bch_fs *c) | |
2337 | { | |
2338 | unsigned nr = BTREE_ITER_MAX; | |
2339 | ||
495aabed KO |
2340 | INIT_LIST_HEAD(&c->btree_trans_list); |
2341 | mutex_init(&c->btree_trans_lock); | |
2342 | ||
36e9d698 KO |
2343 | return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, |
2344 | sizeof(struct btree_iter) * nr + | |
54e86b58 | 2345 | sizeof(struct btree_insert_entry) * nr + |
e3e464ac | 2346 | sizeof(struct btree_insert_entry) * nr); |
36e9d698 | 2347 | } |