// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "recovery.h"
#include "replicas.h"
#include "subvolume.h"
#include "trace.h"

#include <linux/prefetch.h>

static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
				       struct btree_path *);

static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);

/*
 * Unlocks before scheduling
 * Note: does not revalidate iterator
 */
static inline int bch2_trans_cond_resched(struct btree_trans *trans)
{
	if (need_resched() || race_fault()) {
		bch2_trans_unlock(trans);
		schedule();
		return bch2_trans_relock(trans) ? 0 : -EINTR;
	} else {
		return 0;
	}
}

static inline int __btree_path_cmp(const struct btree_path *l,
				   enum btree_id r_btree_id,
				   bool r_cached,
				   struct bpos r_pos,
				   unsigned r_level)
{
	/*
	 * Must match lock ordering as defined by __bch2_btree_node_lock:
	 */
	return   cmp_int(l->btree_id,	r_btree_id) ?:
		 cmp_int((int) l->cached,	(int) r_cached) ?:
		 bpos_cmp(l->pos,	r_pos) ?:
		-cmp_int(l->level,	r_level);
}

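/*
 * Paths thus sort by btree id, then non-cached before cached, then by
 * position, then by descending level - so at equal positions, a path to an
 * interior node sorts before a path to a leaf, matching the rule that
 * interior nodes must be locked before their descendants:
 */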
static inline int btree_path_cmp(const struct btree_path *l,
				 const struct btree_path *r)
{
	return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
}

static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
{
	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
		p = bpos_successor(p);
	} else {
		p = bpos_nosnap_successor(p);
		p.snapshot = iter->snapshot;
	}

	return p;
}

static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
{
	/* Are we iterating over keys in all snapshots? */
	if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
		p = bpos_predecessor(p);
	} else {
		p = bpos_nosnap_predecessor(p);
		p.snapshot = iter->snapshot;
	}

	return p;
}

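/*
 * path->l[].b also stores small sentinel values (the BTREE_ITER_NO_NODE_*
 * constants) when a level doesn't point at a real node, hence the check
 * against 128 here:
 */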
static inline bool is_btree_node(struct btree_path *path, unsigned l)
{
	return l < BTREE_MAX_DEPTH &&
		(unsigned long) path->l[l].b >= 128;
}

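/*
 * Extents are indexed by their end position, so an extent ending exactly at
 * iter->pos doesn't contain it - search from the successor instead:
 */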
static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
	struct bpos pos = iter->pos;

	if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	    bkey_cmp(pos, POS_MAX))
		pos = bkey_successor(iter, pos);
	return pos;
}

static inline bool btree_path_pos_before_node(struct btree_path *path,
					      struct btree *b)
{
	return bpos_cmp(path->pos, b->data->min_key) < 0;
}

static inline bool btree_path_pos_after_node(struct btree_path *path,
					     struct btree *b)
{
	return bpos_cmp(b->key.k.p, path->pos) < 0;
}

static inline bool btree_path_pos_in_node(struct btree_path *path,
					  struct btree *b)
{
	return path->btree_id == b->c.btree_id &&
		!btree_path_pos_before_node(path, b) &&
		!btree_path_pos_after_node(path, b);
}

/* Btree node locking: */

void bch2_btree_node_unlock_write(struct btree_trans *trans,
			struct btree_path *path, struct btree *b)
{
	bch2_btree_node_unlock_write_inlined(trans, path, b);
}

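/*
 * Take a write lock on a node the caller already has intent locked. Read
 * locks held by this transaction's other paths on the same node would block
 * six_lock_write(), so drop their reader counts first and restore them once
 * the write lock is taken:
 */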
void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
{
	struct btree_path *linked;
	unsigned readers = 0;

	trans_for_each_path(trans, linked)
		if (linked->l[b->c.level].b == b &&
		    btree_node_read_locked(linked, b->c.level))
			readers++;

	/*
	 * Must drop our read locks before calling six_lock_write() -
	 * six_unlock() won't do wakeups until the reader count
	 * goes to 0, and it's safe because we have the node intent
	 * locked:
	 */
	if (!b->c.lock.readers)
		atomic64_sub(__SIX_VAL(read_lock, readers),
			     &b->c.lock.state.counter);
	else
		this_cpu_sub(*b->c.lock.readers, readers);

	six_lock_write(&b->c.lock, NULL, NULL);

	if (!b->c.lock.readers)
		atomic64_add(__SIX_VAL(read_lock, readers),
			     &b->c.lock.state.counter);
	else
		this_cpu_add(*b->c.lock.readers, readers);
}

bool __bch2_btree_node_relock(struct btree_trans *trans,
			      struct btree_path *path, unsigned level)
{
	struct btree *b = btree_path_node(path, level);
	int want = __btree_lock_want(path, level);

	if (!is_btree_node(path, level))
		goto fail;

	if (race_fault())
		goto fail;

	if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
	    (btree_node_lock_seq_matches(path, b, level) &&
	     btree_node_lock_increment(trans, b, level, want))) {
		mark_btree_node_locked(path, level, want);
		return true;
	}
fail:
	trace_btree_node_relock_fail(trans->fn, _RET_IP_,
				     path->btree_id,
				     &path->pos,
				     (unsigned long) b,
				     path->l[level].lock_seq,
				     is_btree_node(path, level) ? b->c.lock.state.seq : 0);
	return false;
}

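/*
 * Upgrade the lock at @level to an intent lock - via tryupgrade, relocking,
 * or taking a reference to a lock another path in this transaction holds:
 */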
bool bch2_btree_node_upgrade(struct btree_trans *trans,
			     struct btree_path *path, unsigned level)
{
	struct btree *b = path->l[level].b;

	if (!is_btree_node(path, level))
		return false;

	switch (btree_lock_want(path, level)) {
	case BTREE_NODE_UNLOCKED:
		BUG_ON(btree_node_locked(path, level));
		return true;
	case BTREE_NODE_READ_LOCKED:
		BUG_ON(btree_node_intent_locked(path, level));
		return bch2_btree_node_relock(trans, path, level);
	case BTREE_NODE_INTENT_LOCKED:
		break;
	}

	if (btree_node_intent_locked(path, level))
		return true;

	if (race_fault())
		return false;

	if (btree_node_locked(path, level)
	    ? six_lock_tryupgrade(&b->c.lock)
	    : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
		goto success;

	if (btree_node_lock_seq_matches(path, b, level) &&
	    btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
		btree_node_unlock(path, level);
		goto success;
	}

	return false;
success:
	mark_btree_node_intent_locked(path, level);
	return true;
}

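/*
 * Relock - or, if @upgrade, upgrade to intent locks - every level from
 * path->level up to path->locks_want:
 */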
static inline bool btree_path_get_locks(struct btree_trans *trans,
					struct btree_path *path,
					bool upgrade)
{
	unsigned l = path->level;
	int fail_idx = -1;

	do {
		if (!btree_path_node(path, l))
			break;

		if (!(upgrade
		      ? bch2_btree_node_upgrade(trans, path, l)
		      : bch2_btree_node_relock(trans, path, l)))
			fail_idx = l;

		l++;
	} while (l < path->locks_want);

	/*
	 * When we fail to get a lock, we have to ensure that any child nodes
	 * can't be relocked so bch2_btree_path_traverse has to walk back up to
	 * the node that we failed to relock:
	 */
	if (fail_idx >= 0) {
		__bch2_btree_path_unlock(path);
		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);

		do {
			path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
			--fail_idx;
		} while (fail_idx >= 0);
	}

	if (path->uptodate == BTREE_ITER_NEED_RELOCK)
		path->uptodate = BTREE_ITER_UPTODATE;

	bch2_trans_verify_locks(trans);

	return path->uptodate < BTREE_ITER_NEED_RELOCK;
}

static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
				  bool cached)
{
	return !cached
		? container_of(_b, struct btree, c)->key.k.p
		: container_of(_b, struct bkey_cached, c)->key.pos;
}

/* Slowpath: */
bool __bch2_btree_node_lock(struct btree_trans *trans,
			    struct btree_path *path,
			    struct btree *b,
			    struct bpos pos, unsigned level,
			    enum six_lock_type type,
			    six_lock_should_sleep_fn should_sleep_fn, void *p,
			    unsigned long ip)
{
	struct btree_path *linked;
	unsigned reason;

	/* Check if it's safe to block: */
	trans_for_each_path(trans, linked) {
		if (!linked->nodes_locked)
			continue;

		/*
		 * Can't block taking an intent lock if we have _any_ nodes read
		 * locked:
		 *
		 * - Our read lock blocks another thread with an intent lock on
		 *   the same node from getting a write lock, and thus from
		 *   dropping its intent lock
		 *
		 * - And the other thread may have multiple nodes intent locked:
		 *   both the node we want to intent lock, and the node we
		 *   already have read locked - deadlock:
		 */
		if (type == SIX_LOCK_intent &&
		    linked->nodes_locked != linked->nodes_intent_locked) {
			reason = 1;
			goto deadlock;
		}

		if (linked->btree_id != path->btree_id) {
			if (linked->btree_id < path->btree_id)
				continue;

			reason = 3;
			goto deadlock;
		}

		/*
		 * Within the same btree, non-cached paths come before cached
		 * paths:
		 */
		if (linked->cached != path->cached) {
			if (!linked->cached)
				continue;

			reason = 4;
			goto deadlock;
		}

		/*
		 * Interior nodes must be locked before their descendants: if
		 * another path has possible descendants locked of the node
		 * we're about to lock, it must have the ancestors locked too:
		 */
		if (level > __fls(linked->nodes_locked)) {
			reason = 5;
			goto deadlock;
		}

		/* Must lock btree nodes in key order: */
		if (btree_node_locked(linked, level) &&
		    bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
						 linked->cached)) <= 0) {
			reason = 7;
			goto deadlock;
		}
	}

	return btree_node_lock_type(trans, path, b, pos, level,
				    type, should_sleep_fn, p);
deadlock:
	trace_trans_restart_would_deadlock(trans->fn, ip,
			trans->in_traverse_all, reason,
			linked->btree_id,
			linked->cached,
			&linked->pos,
			path->btree_id,
			path->cached,
			&pos);
	btree_trans_restart(trans);
	return false;
}

/* Btree iterator locking: */

#ifdef CONFIG_BCACHEFS_DEBUG

static void bch2_btree_path_verify_locks(struct btree_path *path)
{
	unsigned l;

	if (!path->nodes_locked) {
		BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
		       btree_path_node(path, path->level));
		return;
	}

	for (l = 0; btree_path_node(path, l); l++)
		BUG_ON(btree_lock_want(path, l) !=
		       btree_node_locked_type(path, l));
}

void bch2_trans_verify_locks(struct btree_trans *trans)
{
	struct btree_path *path;

	trans_for_each_path(trans, path)
		bch2_btree_path_verify_locks(path);
}
#else
static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
#endif

/* Btree path locking: */

/*
 * Only for btree_cache.c - only relocks intent locks
 */
bool bch2_btree_path_relock_intent(struct btree_trans *trans,
				   struct btree_path *path)
{
	unsigned l;

	for (l = path->level;
	     l < path->locks_want && btree_path_node(path, l);
	     l++) {
		if (!bch2_btree_node_relock(trans, path, l)) {
			__bch2_btree_path_unlock(path);
			btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
			trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_,
						   path->btree_id, &path->pos);
			btree_trans_restart(trans);
			return false;
		}
	}

	return true;
}

__flatten
static bool bch2_btree_path_relock(struct btree_trans *trans,
			struct btree_path *path, unsigned long trace_ip)
{
	bool ret = btree_path_get_locks(trans, path, false);

	if (!ret) {
		trace_trans_restart_relock_path(trans->fn, trace_ip,
						path->btree_id, &path->pos);
		btree_trans_restart(trans);
	}
	return ret;
}

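/*
 * Raise locks_want and take the missing locks; on failure, also raise
 * locks_want on related paths so a transaction restart will take the locks
 * we want (see the comment below):
 */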
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
			       struct btree_path *path,
			       unsigned new_locks_want)
{
	struct btree_path *linked;

	EBUG_ON(path->locks_want >= new_locks_want);

	path->locks_want = new_locks_want;

	if (btree_path_get_locks(trans, path, true))
		return true;

	/*
	 * XXX: this is ugly - we'd prefer to not be mucking with other
	 * iterators in the btree_trans here.
	 *
	 * On failure to upgrade the iterator, setting iter->locks_want and
	 * calling get_locks() is sufficient to make bch2_btree_path_traverse()
	 * get the locks we want on transaction restart.
	 *
	 * But if this iterator was a clone, on transaction restart what we did
	 * to this iterator isn't going to be preserved.
	 *
	 * Possibly we could add an iterator field for the parent iterator when
	 * an iterator is a copy - for now, we'll just upgrade any other
	 * iterators with the same btree id.
	 *
	 * The code below used to be needed to ensure ancestor nodes get locked
	 * before interior nodes - now that's handled by
	 * bch2_btree_path_traverse_all().
	 */
	trans_for_each_path(trans, linked)
		if (linked != path &&
		    linked->cached == path->cached &&
		    linked->btree_id == path->btree_id &&
		    linked->locks_want < new_locks_want) {
			linked->locks_want = new_locks_want;
			btree_path_get_locks(trans, linked, true);
		}

	return false;
}

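/*
 * Drop locks above @new_locks_want; if the lowest lock we keep is an intent
 * lock at path->level, downgrade it to a read lock:
 */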
void __bch2_btree_path_downgrade(struct btree_path *path,
				 unsigned new_locks_want)
{
	unsigned l;

	EBUG_ON(path->locks_want < new_locks_want);

	path->locks_want = new_locks_want;

	while (path->nodes_locked &&
	       (l = __fls(path->nodes_locked)) >= path->locks_want) {
		if (l > path->level) {
			btree_node_unlock(path, l);
		} else {
			if (btree_node_intent_locked(path, l)) {
				six_lock_downgrade(&path->l[l].b->c.lock);
				path->nodes_intent_locked ^= 1 << l;
			}
			break;
		}
	}

	bch2_btree_path_verify_locks(path);
}

void bch2_trans_downgrade(struct btree_trans *trans)
{
	struct btree_path *path;

	trans_for_each_path(trans, path)
		bch2_btree_path_downgrade(path);
}

/* Btree transaction locking: */

bool bch2_trans_relock(struct btree_trans *trans)
{
	struct btree_path *path;

	if (unlikely(trans->restarted))
		return false;

	trans_for_each_path(trans, path)
		if (path->should_be_locked &&
		    !bch2_btree_path_relock(trans, path, _RET_IP_)) {
			trace_trans_restart_relock(trans->fn, _RET_IP_,
					path->btree_id, &path->pos);
			BUG_ON(!trans->restarted);
			return false;
		}
	return true;
}

void bch2_trans_unlock(struct btree_trans *trans)
{
	struct btree_path *path;

	trans_for_each_path(trans, path)
		__bch2_btree_path_unlock(path);
}

/* Btree iterator: */

#ifdef CONFIG_BCACHEFS_DEBUG

static void bch2_btree_path_verify_cached(struct btree_trans *trans,
					  struct btree_path *path)
{
	struct bkey_cached *ck;
	bool locked = btree_node_locked(path, 0);

	if (!bch2_btree_node_relock(trans, path, 0))
		return;

	ck = (void *) path->l[0].b;
	BUG_ON(ck->key.btree_id != path->btree_id ||
	       bkey_cmp(ck->key.pos, path->pos));

	if (!locked)
		btree_node_unlock(path, 0);
}

static void bch2_btree_path_verify_level(struct btree_trans *trans,
				struct btree_path *path, unsigned level)
{
	struct btree_path_level *l;
	struct btree_node_iter tmp;
	bool locked;
	struct bkey_packed *p, *k;
	char buf1[100], buf2[100], buf3[100];
	const char *msg;

	if (!bch2_debug_check_iterators)
		return;

	l	= &path->l[level];
	tmp	= l->iter;
	locked	= btree_node_locked(path, level);

	if (path->cached) {
		if (!level)
			bch2_btree_path_verify_cached(trans, path);
		return;
	}

	if (!btree_path_node(path, level))
		return;

	if (!bch2_btree_node_relock(trans, path, level))
		return;

	BUG_ON(!btree_path_pos_in_node(path, l->b));

	bch2_btree_node_iter_verify(&l->iter, l->b);

	/*
	 * For interior nodes, the iterator will have skipped past deleted keys:
	 */
	p = level
		? bch2_btree_node_iter_prev(&tmp, l->b)
		: bch2_btree_node_iter_prev_all(&tmp, l->b);
	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);

	if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
		msg = "before";
		goto err;
	}

	if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
		msg = "after";
		goto err;
	}

	if (!locked)
		btree_node_unlock(path, level);
	return;
err:
	strcpy(buf2, "(none)");
	strcpy(buf3, "(none)");

	bch2_bpos_to_text(&PBUF(buf1), path->pos);

	if (p) {
		struct bkey uk = bkey_unpack_key(l->b, p);
		bch2_bkey_to_text(&PBUF(buf2), &uk);
	}

	if (k) {
		struct bkey uk = bkey_unpack_key(l->b, k);
		bch2_bkey_to_text(&PBUF(buf3), &uk);
	}

	panic("path should be %s key at level %u:\n"
	      "path pos %s\n"
	      "prev key %s\n"
	      "cur  key %s\n",
	      msg, level, buf1, buf2, buf3);
}

static void bch2_btree_path_verify(struct btree_trans *trans,
				   struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	unsigned i;

	EBUG_ON(path->btree_id >= BTREE_ID_NR);

	for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
		if (!path->l[i].b) {
			BUG_ON(!path->cached &&
			       c->btree_roots[path->btree_id].b->c.level > i);
			break;
		}

		bch2_btree_path_verify_level(trans, path, i);
	}

	bch2_btree_path_verify_locks(path);
}

void bch2_trans_verify_paths(struct btree_trans *trans)
{
	struct btree_path *path;

	trans_for_each_path(trans, path)
		bch2_btree_path_verify(trans, path);
}

static void bch2_btree_iter_verify(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;

	BUG_ON(iter->btree_id >= BTREE_ID_NR);

	BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);

	BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));

	BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
	       (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
	       !btree_type_has_snapshots(iter->btree_id));

	if (iter->update_path)
		bch2_btree_path_verify(trans, iter->update_path);
	bch2_btree_path_verify(trans, iter->path);
}

static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
	BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
	       !iter->pos.snapshot);

	BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
	       iter->pos.snapshot != iter->snapshot);

	BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
	       bkey_cmp(iter->pos, iter->k.p) > 0);
}

static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
{
	struct btree_trans *trans = iter->trans;
	struct btree_iter copy;
	struct bkey_s_c prev;
	int ret = 0;

	if (!bch2_debug_check_iterators)
		return 0;

	if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
		return 0;

	if (bkey_err(k) || !k.k)
		return 0;

	BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
					  iter->snapshot,
					  k.k->p.snapshot));

	bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
			     BTREE_ITER_NOPRESERVE|
			     BTREE_ITER_ALL_SNAPSHOTS);
	prev = bch2_btree_iter_prev(&copy);
	if (!prev.k)
		goto out;

	ret = bkey_err(prev);
	if (ret)
		goto out;

	if (!bkey_cmp(prev.k->p, k.k->p) &&
	    bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
				      prev.k->p.snapshot) > 0) {
		char buf1[100], buf2[200];

		bch2_bkey_to_text(&PBUF(buf1), k.k);
		bch2_bkey_to_text(&PBUF(buf2), prev.k);

		panic("iter snap %u\n"
		      "k    %s\n"
		      "prev %s\n",
		      iter->snapshot,
		      buf1, buf2);
	}
out:
	bch2_trans_iter_exit(trans, &copy);
	return ret;
}

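/*
 * Assert that some path in @trans holds a lock covering @pos in btree @id -
 * a btree node lock, or a key cache lock if @key_cache:
 */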
void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
			    struct bpos pos, bool key_cache)
{
	struct btree_path *path;
	unsigned idx;
	char buf[100];

	trans_for_each_path_inorder(trans, path, idx) {
		int cmp = cmp_int(path->btree_id, id) ?:
			cmp_int(path->cached, key_cache);

		if (cmp > 0)
			break;
		if (cmp < 0)
			continue;

		if (!(path->nodes_locked & 1) ||
		    !path->should_be_locked)
			continue;

		if (!key_cache) {
			if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 &&
			    bkey_cmp(pos, path->l[0].b->key.k.p) <= 0)
				return;
		} else {
			if (!bkey_cmp(pos, path->pos))
				return;
		}
	}

	bch2_dump_trans_paths_updates(trans);
	panic("not locked: %s %s%s\n",
	      bch2_btree_ids[id],
	      (bch2_bpos_to_text(&PBUF(buf), pos), buf),
	      key_cache ? " cached" : "");
}

#else

static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
						struct btree_path *path, unsigned l) {}
static inline void bch2_btree_path_verify(struct btree_trans *trans,
					  struct btree_path *path) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }

#endif

/* Btree path: fixups after btree updates */

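/*
 * Repoint the node iterator's set for bset @t at @k, re-sorting the
 * iterator; if the iterator has no set for @t, push a new one:
 */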
static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
					struct btree *b,
					struct bset_tree *t,
					struct bkey_packed *k)
{
	struct btree_node_iter_set *set;

	btree_node_iter_for_each(iter, set)
		if (set->end == t->end_offset) {
			set->k = __btree_node_key_to_offset(b, k);
			bch2_btree_node_iter_sort(iter, b);
			return;
		}

	bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
}

static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
					       struct btree *b,
					       struct bkey_packed *where)
{
	struct btree_path_level *l = &path->l[b->c.level];

	if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
		return;

	if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
		bch2_btree_node_iter_advance(&l->iter, l->b);
}

void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
				      struct btree *b,
				      struct bkey_packed *where)
{
	struct btree_path *path;

	trans_for_each_path_with_node(trans, b, path) {
		__bch2_btree_path_fix_key_modified(path, b, where);
		bch2_btree_path_verify_level(trans, path, b->c.level);
	}
}

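/*
 * A key in bset @t was overwritten at @where, replacing @clobber_u64s worth
 * of key with @new_u64s: fix up @node_iter so it still points at the same
 * position:
 */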
static void __bch2_btree_node_iter_fix(struct btree_path *path,
				       struct btree *b,
				       struct btree_node_iter *node_iter,
				       struct bset_tree *t,
				       struct bkey_packed *where,
				       unsigned clobber_u64s,
				       unsigned new_u64s)
{
	const struct bkey_packed *end = btree_bkey_last(b, t);
	struct btree_node_iter_set *set;
	unsigned offset = __btree_node_key_to_offset(b, where);
	int shift = new_u64s - clobber_u64s;
	unsigned old_end = t->end_offset - shift;
	unsigned orig_iter_pos = node_iter->data[0].k;
	bool iter_current_key_modified =
		orig_iter_pos >= offset &&
		orig_iter_pos <= offset + clobber_u64s;

	btree_node_iter_for_each(node_iter, set)
		if (set->end == old_end)
			goto found;

	/* didn't find the bset in the iterator - might have to readd it: */
	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
		bch2_btree_node_iter_push(node_iter, b, where, end);
		goto fixup_done;
	} else {
		/* Iterator is after key that changed */
		return;
	}
found:
	set->end = t->end_offset;

	/* Iterator hasn't gotten to the key that changed yet: */
	if (set->k < offset)
		return;

	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
		set->k = offset;
	} else if (set->k < offset + clobber_u64s) {
		set->k = offset + new_u64s;
		if (set->k == set->end)
			bch2_btree_node_iter_set_drop(node_iter, set);
	} else {
		/* Iterator is after key that changed */
		set->k = (int) set->k + shift;
		return;
	}

	bch2_btree_node_iter_sort(node_iter, b);
fixup_done:
	if (node_iter->data[0].k != orig_iter_pos)
		iter_current_key_modified = true;

	/*
	 * When a new key is added, and the node iterator now points to that
	 * key, the iterator might have skipped past deleted keys that should
	 * come after the key the iterator now points to. We have to rewind to
	 * before those deleted keys - otherwise
	 * bch2_btree_node_iter_prev_all() breaks:
	 */
	if (!bch2_btree_node_iter_end(node_iter) &&
	    iter_current_key_modified &&
	    b->c.level) {
		struct bset_tree *t;
		struct bkey_packed *k, *k2, *p;

		k = bch2_btree_node_iter_peek_all(node_iter, b);

		for_each_bset(b, t) {
			bool set_pos = false;

			if (node_iter->data[0].end == t->end_offset)
				continue;

			k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);

			while ((p = bch2_bkey_prev_all(b, t, k2)) &&
			       bkey_iter_cmp(b, k, p) < 0) {
				k2 = p;
				set_pos = true;
			}

			if (set_pos)
				btree_node_iter_set_set_pos(node_iter,
							    b, t, k2);
		}
	}
}

void bch2_btree_node_iter_fix(struct btree_trans *trans,
			      struct btree_path *path,
			      struct btree *b,
			      struct btree_node_iter *node_iter,
			      struct bkey_packed *where,
			      unsigned clobber_u64s,
			      unsigned new_u64s)
{
	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
	struct btree_path *linked;

	if (node_iter != &path->l[b->c.level].iter) {
		__bch2_btree_node_iter_fix(path, b, node_iter, t,
					   where, clobber_u64s, new_u64s);

		if (bch2_debug_check_iterators)
			bch2_btree_node_iter_verify(node_iter, b);
	}

	trans_for_each_path_with_node(trans, b, linked) {
		__bch2_btree_node_iter_fix(linked, b,
					   &linked->l[b->c.level].iter, t,
					   where, clobber_u64s, new_u64s);
		bch2_btree_path_verify_level(trans, linked, b->c.level);
	}
}

/* Btree path level: pointer to a particular btree node and node iter */

static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
						  struct btree_path_level *l,
						  struct bkey *u,
						  struct bkey_packed *k)
{
	if (unlikely(!k)) {
		/*
		 * signal to bch2_btree_iter_peek_slot() that we're currently at
		 * a hole
		 */
		u->type = KEY_TYPE_deleted;
		return bkey_s_c_null;
	}

	return bkey_disassemble(l->b, k, u);
}

static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
							struct btree_path_level *l,
							struct bkey *u)
{
	return __btree_iter_unpack(c, l, u,
			bch2_btree_node_iter_peek_all(&l->iter, l->b));
}

static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
						    struct btree_path *path,
						    struct btree_path_level *l,
						    struct bkey *u)
{
	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
			bch2_btree_node_iter_peek(&l->iter, l->b));

	path->pos = k.k ? k.k->p : l->b->key.k.p;
	trans->paths_sorted = false;
	return k;
}

static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
						    struct btree_path *path,
						    struct btree_path_level *l,
						    struct bkey *u)
{
	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
			bch2_btree_node_iter_prev(&l->iter, l->b));

	path->pos = k.k ? k.k->p : l->b->data->min_key;
	trans->paths_sorted = false;
	return k;
}

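/*
 * Advance the node iterator past keys that compare strictly before
 * path->pos, giving up after @max_advance steps (<= 0 means no limit):
 */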
static inline bool btree_path_advance_to_pos(struct btree_path *path,
					     struct btree_path_level *l,
					     int max_advance)
{
	struct bkey_packed *k;
	int nr_advanced = 0;

	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
	       bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return false;

		bch2_btree_node_iter_advance(&l->iter, l->b);
		nr_advanced++;
	}

	return true;
}

/*
 * Verify that iterator for parent node points to child node:
 */
static void btree_path_verify_new_node(struct btree_trans *trans,
				       struct btree_path *path, struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l;
	unsigned plevel;
	bool parent_locked;
	struct bkey_packed *k;

	if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		return;

	if (trans->journal_replay_not_finished)
		return;

	plevel = b->c.level + 1;
	if (!btree_path_node(path, plevel))
		return;

	parent_locked = btree_node_locked(path, plevel);

	if (!bch2_btree_node_relock(trans, path, plevel))
		return;

	l = &path->l[plevel];
	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
	if (!k ||
	    bkey_deleted(k) ||
	    bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
		char buf1[100];
		char buf2[100];
		char buf3[100];
		char buf4[100];
		struct bkey uk = bkey_unpack_key(b, k);

		bch2_dump_btree_node(c, l->b);
		bch2_bpos_to_text(&PBUF(buf1), path->pos);
		bch2_bkey_to_text(&PBUF(buf2), &uk);
		bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
		bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
		panic("parent iter doesn't point to new node:\n"
		      "iter pos %s %s\n"
		      "iter key %s\n"
		      "new node %s-%s\n",
		      bch2_btree_ids[path->btree_id], buf1,
		      buf2, buf3, buf4);
	}

	if (!parent_locked)
		btree_node_unlock(path, plevel);
}

static inline void __btree_path_level_init(struct btree_path *path,
					   unsigned level)
{
	struct btree_path_level *l = &path->l[level];

	bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);

	/*
	 * Iterators to interior nodes should always be pointed at the first non
	 * whiteout:
	 */
	if (level)
		bch2_btree_node_iter_peek(&l->iter, l->b);
}

static inline void btree_path_level_init(struct btree_trans *trans,
					 struct btree_path *path,
					 struct btree *b)
{
	BUG_ON(path->cached);

	btree_path_verify_new_node(trans, path, b);

	EBUG_ON(!btree_path_pos_in_node(path, b));
	EBUG_ON(b->c.lock.state.seq & 1);

	path->l[b->c.level].lock_seq = b->c.lock.state.seq;
	path->l[b->c.level].b = b;
	__btree_path_level_init(path, b->c.level);
}

/* Btree path: fixups after btree node updates: */

/*
 * A btree node is being replaced - update the iterator to point to the new
 * node:
 */
void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
{
	struct btree_path *path;

	trans_for_each_path(trans, path)
		if (!path->cached &&
		    btree_path_pos_in_node(path, b)) {
			enum btree_node_locked_type t =
				btree_lock_want(path, b->c.level);

			if (path->nodes_locked &&
			    t != BTREE_NODE_UNLOCKED) {
				btree_node_unlock(path, b->c.level);
				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
				mark_btree_node_locked(path, b->c.level, (enum six_lock_type) t);
			}

			btree_path_level_init(trans, path, b);
		}
}

/*
 * A btree node has been modified in such a way as to invalidate iterators - fix
 * them:
 */
void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
{
	struct btree_path *path;

	trans_for_each_path_with_node(trans, b, path)
		__btree_path_level_init(path, b->c.level);
}

/* Btree path: traverse, set_pos: */

static int lock_root_check_fn(struct six_lock *lock, void *p)
{
	struct btree *b = container_of(lock, struct btree, c.lock);
	struct btree **rootp = p;

	return b == *rootp ? 0 : -1;
}

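/*
 * Lock the btree root, retrying if it changes before we get the lock -
 * hence re-checking against *rootp after locking:
 */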
static inline int btree_path_lock_root(struct btree_trans *trans,
				       struct btree_path *path,
				       unsigned depth_want,
				       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
	enum six_lock_type lock_type;
	unsigned i;

	EBUG_ON(path->nodes_locked);

	while (1) {
		b = READ_ONCE(*rootp);
		path->level = READ_ONCE(b->c.level);

		if (unlikely(path->level < depth_want)) {
			/*
			 * the root is at a lower depth than the depth we want:
			 * got to the end of the btree, or we're walking nodes
			 * greater than some depth and there are no nodes >=
			 * that depth
			 */
			path->level = depth_want;
			for (i = path->level; i < BTREE_MAX_DEPTH; i++)
				path->l[i].b = NULL;
			return 1;
		}

		lock_type = __btree_lock_want(path, path->level);
		if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX,
					      path->level, lock_type,
					      lock_root_check_fn, rootp,
					      trace_ip))) {
			if (trans->restarted)
				return -EINTR;
			continue;
		}

		if (likely(b == READ_ONCE(*rootp) &&
			   b->c.level == path->level &&
			   !race_fault())) {
			for (i = 0; i < path->level; i++)
				path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
			path->l[path->level].b = b;
			for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
				path->l[i].b = NULL;

			mark_btree_node_locked(path, path->level, lock_type);
			btree_path_level_init(trans, path, b);
			return 0;
		}

		six_unlock_type(&b->c.lock, lock_type);
	}
}

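/*
 * Prefetch the next few children of the node at path->level, to hide read
 * latency when walking down into them - more aggressively before the
 * filesystem has finished starting up:
 */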
noinline
static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree_node_iter node_iter = l->iter;
	struct bkey_packed *k;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
		? (path->level > 1 ? 0 :  2)
		: (path->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(path, path->level);
	int ret = 0;

	bch2_bkey_buf_init(&tmp);

	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(trans, path, path->level))
			break;

		bch2_btree_node_iter_advance(&node_iter, l->b);
		k = bch2_btree_node_iter_peek(&node_iter, l->b);
		if (!k)
			break;

		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
		ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
					       path->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(path, path->level);

	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
				 struct btree_and_journal_iter *jiter)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	struct bkey_buf tmp;
	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
		? (path->level > 1 ? 0 :  2)
		: (path->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(path, path->level);
	int ret = 0;

	bch2_bkey_buf_init(&tmp);

	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(trans, path, path->level))
			break;

		bch2_btree_and_journal_iter_advance(jiter);
		k = bch2_btree_and_journal_iter_peek(jiter);
		if (!k.k)
			break;

		bch2_bkey_buf_reassemble(&tmp, c, k);
		ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
					       path->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(path, path->level);

	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

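/*
 * Stash a pointer to the in-memory node in the btree_ptr_v2 key's mem_ptr
 * field, as a hint that lets the next lookup of this child skip the btree
 * node cache hash table:
 */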
static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
					    struct btree_path *path,
					    unsigned plevel, struct btree *b)
{
	struct btree_path_level *l = &path->l[plevel];
	bool locked = btree_node_locked(path, plevel);
	struct bkey_packed *k;
	struct bch_btree_ptr_v2 *bp;

	if (!bch2_btree_node_relock(trans, path, plevel))
		return;

	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
	BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);

	bp = (void *) bkeyp_val(&l->b->format, k);
	bp->mem_ptr = (unsigned long)b;

	if (!locked)
		btree_node_unlock(path, plevel);
}

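/*
 * Until journal replay is finished some keys exist only in the journal:
 * look up the child node pointer by iterating over the btree node and the
 * journal keys together:
 */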
static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
						     struct btree_path *path,
						     unsigned flags,
						     struct bkey_buf *out)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree_and_journal_iter jiter;
	struct bkey_s_c k;
	int ret = 0;

	__bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);

	k = bch2_btree_and_journal_iter_peek(&jiter);

	bch2_bkey_buf_reassemble(out, c, k);

	if (flags & BTREE_ITER_PREFETCH)
		ret = btree_path_prefetch_j(trans, path, &jiter);

	bch2_btree_and_journal_iter_exit(&jiter);
	return ret;
}

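/*
 * Walk down one level: look up the child node pointer at the current
 * position in the parent, lock the child and make it the new path->level:
 */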
static __always_inline int btree_path_down(struct btree_trans *trans,
					   struct btree_path *path,
					   unsigned flags,
					   unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree_path_level *l = path_l(path);
	struct btree *b;
	unsigned level = path->level - 1;
	enum six_lock_type lock_type = __btree_lock_want(path, level);
	struct bkey_buf tmp;
	int ret;

	EBUG_ON(!btree_node_locked(path, path->level));

	bch2_bkey_buf_init(&tmp);

	if (unlikely(trans->journal_replay_not_finished)) {
		ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
		if (ret)
			goto err;
	} else {
		bch2_bkey_buf_unpack(&tmp, c, l->b,
				 bch2_btree_node_iter_peek(&l->iter, l->b));

		if (flags & BTREE_ITER_PREFETCH) {
			ret = btree_path_prefetch(trans, path);
			if (ret)
				goto err;
		}
	}

	b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
	ret = PTR_ERR_OR_ZERO(b);
	if (unlikely(ret))
		goto err;

	mark_btree_node_locked(path, level, lock_type);
	btree_path_level_init(trans, path, b);

	if (likely(!trans->journal_replay_not_finished &&
		   tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
	    unlikely(b != btree_node_mem_ptr(tmp.k)))
		btree_node_mem_ptr_set(trans, path, level + 1, b);

	if (btree_node_read_locked(path, level + 1))
		btree_node_unlock(path, level + 1);
	path->level = level;

	bch2_btree_path_verify_locks(path);
err:
	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}

static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
				   unsigned, unsigned long);

static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_path *path, *prev;
	unsigned long trace_ip = _RET_IP_;
	int i, ret = 0;

	if (trans->in_traverse_all)
		return -EINTR;

	trans->in_traverse_all = true;
retry_all:
	prev = NULL;
	trans->restarted = false;

	trans_for_each_path(trans, path)
		path->should_be_locked = false;

	btree_trans_sort_paths(trans);

	trans_for_each_path_inorder_reverse(trans, path, i) {
		if (prev) {
			if (path->btree_id == prev->btree_id &&
			    path->locks_want < prev->locks_want)
				__bch2_btree_path_upgrade(trans, path, prev->locks_want);
			else if (!path->locks_want && prev->locks_want)
				__bch2_btree_path_upgrade(trans, path, 1);
		}

		prev = path;
	}

	bch2_trans_unlock(trans);
	cond_resched();

	if (unlikely(trans->memory_allocation_failure)) {
		struct closure cl;

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
			closure_sync(&cl);
		} while (ret);
	}

	/* Now, redo traversals in correct order: */
	i = 0;
	while (i < trans->nr_sorted) {
		path = trans->paths + trans->sorted[i];

		/*
		 * Traversing a path can cause another path to be added at about
		 * the same position:
		 */
		if (path->uptodate) {
			ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
			if (ret)
				goto retry_all;
		} else {
			i++;
		}
	}

	/*
	 * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
	 * and relock(), relock() won't relock since path->should_be_locked
	 * isn't set yet, which is all fine
	 */
	trans_for_each_path(trans, path)
		BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);

	bch2_btree_cache_cannibalize_unlock(c);

	trans->in_traverse_all = false;

	trace_trans_traverse_all(trans->fn, trace_ip);
	return ret;
}

static inline bool btree_path_good_node(struct btree_trans *trans,
					struct btree_path *path,
					unsigned l, int check_pos)
{
	if (!is_btree_node(path, l) ||
	    !bch2_btree_node_relock(trans, path, l))
		return false;

	if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
		return false;
	if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
		return false;
	return true;
}

67e0dd8f KO |
1505 | static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, |
1506 | struct btree_path *path, | |
f4b61341 | 1507 | int check_pos) |
1c6fdbd8 | 1508 | { |
8ee0134e | 1509 | unsigned i, l = path->level; |
1d3ecd7e | 1510 | |
67e0dd8f KO |
1511 | while (btree_path_node(path, l) && |
1512 | !btree_path_good_node(trans, path, l, check_pos)) { | |
1513 | btree_node_unlock(path, l); | |
1514 | path->l[l].b = BTREE_ITER_NO_NODE_UP; | |
1c6fdbd8 KO |
1515 | l++; |
1516 | } | |
1517 | ||
8ee0134e KO |
1518 | /* If we need intent locks, take them too: */ |
1519 | for (i = l + 1; | |
1520 | i < path->locks_want && btree_path_node(path, i); | |
1521 | i++) | |
1522 | if (!bch2_btree_node_relock(trans, path, i)) | |
1523 | while (l <= i) { | |
1524 | btree_node_unlock(path, l); | |
1525 | path->l[l].b = BTREE_ITER_NO_NODE_UP; | |
1526 | l++; | |
1527 | } | |
1528 | ||
1c6fdbd8 KO |
1529 | return l; |
1530 | } | |
1531 | ||
1532 | /* | |
1533 | * This is the main state machine for walking down the btree - walks down to a | |
1534 | * specified depth | |
1535 | * | |
1536 | * Returns 0 on success, -EIO on error (error reading in a btree node). | |
1537 | * | |
1538 | * On error, caller (peek_node()/peek_key()) must return NULL; the error is | |
b7607ce9 | 1539 | * stashed in the iterator and returned from bch2_trans_exit(). |
1c6fdbd8 | 1540 | */ |
67e0dd8f KO |
1541 | static int btree_path_traverse_one(struct btree_trans *trans, |
1542 | struct btree_path *path, | |
1543 | unsigned flags, | |
a301dc38 | 1544 | unsigned long trace_ip) |
1c6fdbd8 | 1545 | { |
8ee0134e | 1546 | unsigned depth_want = path->level; |
531a0095 | 1547 | int ret = 0; |
1c6fdbd8 | 1548 | |
979735df KO |
1549 | if (unlikely(trans->restarted)) { |
1550 | ret = -EINTR; | |
1551 | goto out; | |
1552 | } | |
1553 | ||
e829b717 | 1554 | /* |
67e0dd8f KO |
1555 | * Ensure we obey path->should_be_locked: if it's set, we can't unlock |
1556 | * and re-traverse the path without a transaction restart: | |
e829b717 | 1557 | */ |
67e0dd8f KO |
1558 | if (path->should_be_locked) { |
1559 | ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR; | |
e829b717 KO |
1560 | goto out; |
1561 | } | |
1562 | ||
67e0dd8f KO |
1563 | if (path->cached) { |
1564 | ret = bch2_btree_path_traverse_cached(trans, path, flags); | |
531a0095 KO |
1565 | goto out; |
1566 | } | |
2ca88e5a | 1567 | |
67e0dd8f | 1568 | if (unlikely(path->level >= BTREE_MAX_DEPTH)) |
531a0095 | 1569 | goto out; |
2ca88e5a | 1570 | |
67e0dd8f | 1571 | path->level = btree_path_up_until_good_node(trans, path, 0); |
1c6fdbd8 | 1572 | |
1c6fdbd8 | 1573 | /* |
67e0dd8f | 1574 | * Note: path->l[path->level].b may be temporarily NULL here - that |
1c6fdbd8 KO |
1575 | * would indicate to other code that we got to the end of the btree, |
1576 | * here it indicates that relocking the root failed - it's critical that | |
67e0dd8f | 1577 | * btree_path_lock_root() comes next and that it can't fail |
1c6fdbd8 | 1578 | */ |
67e0dd8f KO |
1579 | while (path->level > depth_want) { |
1580 | ret = btree_path_node(path, path->level) | |
1581 | ? btree_path_down(trans, path, flags, trace_ip) | |
1582 | : btree_path_lock_root(trans, path, depth_want, trace_ip); | |
1c6fdbd8 | 1583 | if (unlikely(ret)) { |
531a0095 KO |
1584 | if (ret == 1) { |
1585 | /* | |
f21566f1 KO |
1586 | * No nodes at this level - got to the end of |
1587 | * the btree: | |
531a0095 KO |
1588 | */ |
1589 | ret = 0; | |
1590 | goto out; | |
1591 | } | |
8812600c | 1592 | |
67e0dd8f KO |
1593 | __bch2_btree_path_unlock(path); |
1594 | path->level = depth_want; | |
2ca88e5a | 1595 | |
67e0dd8f KO |
1596 | if (ret == -EIO) |
1597 | path->l[path->level].b = | |
2ca88e5a | 1598 | BTREE_ITER_NO_NODE_ERROR; |
67e0dd8f KO |
1599 | else |
1600 | path->l[path->level].b = | |
2ca88e5a | 1601 | BTREE_ITER_NO_NODE_DOWN; |
531a0095 | 1602 | goto out; |
1c6fdbd8 KO |
1603 | } |
1604 | } | |
1605 | ||
67e0dd8f | 1606 | path->uptodate = BTREE_ITER_UPTODATE; |
531a0095 | 1607 | out: |
e5af273f | 1608 | BUG_ON((ret == -EINTR) != !!trans->restarted); |
67e0dd8f | 1609 | bch2_btree_path_verify(trans, path); |
531a0095 | 1610 | return ret; |
1c6fdbd8 KO |
1611 | } |
1612 | ||
67e0dd8f KO |
1613 | int __must_check bch2_btree_path_traverse(struct btree_trans *trans, |
1614 | struct btree_path *path, unsigned flags) | |
1c6fdbd8 | 1615 | { |
67e0dd8f KO |
1616 | if (path->uptodate < BTREE_ITER_NEED_RELOCK) |
1617 | return 0; | |
1618 | ||
e5fa91d7 | 1619 | return bch2_trans_cond_resched(trans) ?: |
67e0dd8f | 1620 | btree_path_traverse_one(trans, path, flags, _RET_IP_); |
1c6fdbd8 KO |
1621 | } |
1622 | ||
67e0dd8f KO |
1623 | static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, |
1624 | struct btree_path *src) | |
1625 | { | |
1626 | unsigned i, offset = offsetof(struct btree_path, pos); | |
1627 | ||
1628 | memcpy((void *) dst + offset, | |
1629 | (void *) src + offset, | |
1630 | sizeof(struct btree_path) - offset); | |
1631 | ||
1632 | for (i = 0; i < BTREE_MAX_DEPTH; i++) | |
1633 | if (btree_node_locked(dst, i)) | |
1634 | six_lock_increment(&dst->l[i].b->c.lock, | |
1635 | __btree_lock_want(dst, i)); | |
1636 | ||
1637 | trans->paths_sorted = false; | |
1638 | } | |
1639 | ||
c404f203 KO |
1640 | static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, |
1641 | bool intent) | |
67e0dd8f | 1642 | { |
c404f203 | 1643 | struct btree_path *new = btree_path_alloc(trans, src); |
67e0dd8f | 1644 | |
c404f203 | 1645 | btree_path_copy(trans, new, src); |
67e0dd8f | 1646 | __btree_path_get(new, intent); |
c404f203 KO |
1647 | return new; |
1648 | } | |
1649 | ||
1650 | struct btree_path * __must_check | |
1651 | __bch2_btree_path_make_mut(struct btree_trans *trans, | |
1652 | struct btree_path *path, bool intent) | |
1653 | { | |
67e0dd8f | 1654 | __btree_path_put(path, intent); |
c404f203 | 1655 | path = btree_path_clone(trans, path, intent); |
67e0dd8f KO |
1656 | path->preserve = false; |
1657 | #ifdef CONFIG_BCACHEFS_DEBUG | |
1658 | path->ip_allocated = _RET_IP_; | |
1659 | #endif | |
1660 | return path; | |
1661 | } | |
1662 | ||
ce91abd6 | 1663 | struct btree_path * __must_check |
67e0dd8f KO |
1664 | __bch2_btree_path_set_pos(struct btree_trans *trans, |
1665 | struct btree_path *path, struct bpos new_pos, | |
1666 | bool intent, int cmp) | |
1667 | { | |
67e0dd8f KO |
1668 | unsigned l = path->level; |
1669 | ||
1670 | EBUG_ON(trans->restarted); | |
1671 | EBUG_ON(!path->ref); | |
1672 | ||
1673 | path = bch2_btree_path_make_mut(trans, path, intent); | |
1674 | ||
1675 | path->pos = new_pos; | |
1676 | path->should_be_locked = false; | |
1677 | trans->paths_sorted = false; | |
1678 | ||
1679 | if (unlikely(path->cached)) { | |
1680 | btree_node_unlock(path, 0); | |
1681 | path->l[0].b = BTREE_ITER_NO_NODE_CACHED; | |
1682 | btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); | |
1683 | goto out; | |
1684 | } | |
1685 | ||
1686 | l = btree_path_up_until_good_node(trans, path, cmp); | |
1687 | ||
1688 | if (btree_path_node(path, l)) { | |
1689 | /* | |
1690 | * We might have to skip over many keys, or just a few: try | |
1691 | * advancing the node iterator; if we'd have to skip over too | |
1692 | * many keys, or we're rewinding (which is expensive), just | |
1693 | * reinit it. | |
1694 | */ | |
1695 | if (cmp < 0 || | |
1696 | !btree_path_advance_to_pos(path, &path->l[l], 8)) | |
1697 | __btree_path_level_init(path, l); | |
67e0dd8f KO |
1698 | } |
1699 | ||
1d3ecd7e | 1700 | if (l != path->level) { |
67e0dd8f | 1701 | btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); |
1d3ecd7e KO |
1702 | __bch2_btree_path_unlock(path); |
1703 | } | |
67e0dd8f KO |
1704 | out: |
1705 | bch2_btree_path_verify(trans, path); | |
67e0dd8f KO |
1706 | return path; |
1707 | } | |
1708 | ||
67e0dd8f KO |
1709 | /* Btree path: main interface: */ |
1710 | ||
1711 | static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) | |
1712 | { | |
1713 | struct btree_path *next; | |
1714 | ||
1715 | next = prev_btree_path(trans, path); | |
1716 | if (next && !btree_path_cmp(next, path)) | |
1717 | return next; | |
1718 | ||
1719 | next = next_btree_path(trans, path); | |
1720 | if (next && !btree_path_cmp(next, path)) | |
1721 | return next; | |
1722 | ||
1723 | return NULL; | |
1724 | } | |
1725 | ||
9a74f63c | 1726 | static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) |
67e0dd8f KO |
1727 | { |
1728 | struct btree_path *next; | |
1729 | ||
1730 | next = prev_btree_path(trans, path); | |
9a74f63c KO |
1731 | if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) |
1732 | return next; | |
67e0dd8f KO |
1733 | |
1734 | next = next_btree_path(trans, path); | |
9a74f63c KO |
1735 | if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) |
1736 | return next; | |
67e0dd8f | 1737 | |
9a74f63c | 1738 | return NULL; |
67e0dd8f KO |
1739 | } |
1740 | ||
1741 | static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) | |
08070cba | 1742 | { |
67e0dd8f KO |
1743 | __bch2_btree_path_unlock(path); |
1744 | btree_path_list_remove(trans, path); | |
1745 | trans->paths_allocated &= ~(1ULL << path->idx); | |
08070cba KO |
1746 | } |
1747 | ||
67e0dd8f KO |
1748 | void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) |
1749 | { | |
1750 | struct btree_path *dup; | |
1751 | ||
1752 | EBUG_ON(trans->paths + path->idx != path); | |
1753 | EBUG_ON(!path->ref); | |
1754 | ||
1755 | if (!__btree_path_put(path, intent)) | |
1756 | return; | |
1757 | ||
1758 | /* | |
1759 | * Perhaps instead we should check for duplicate paths in traverse_all: | |
1760 | */ | |
1761 | if (path->preserve && | |
1762 | (dup = have_path_at_pos(trans, path))) { | |
1763 | dup->preserve = true; | |
1764 | path->preserve = false; | |
9a74f63c | 1765 | goto free; |
67e0dd8f KO |
1766 | } |
1767 | ||
1768 | if (!path->preserve && | |
9a74f63c KO |
1769 | (dup = have_node_at_pos(trans, path))) |
1770 | goto free; | |
1771 | return; | |
1772 | free: | |
1773 | if (path->should_be_locked && | |
1774 | !btree_node_locked(dup, path->level)) | |
1775 | return; | |
1776 | ||
1777 | dup->should_be_locked |= path->should_be_locked; | |
1778 | __bch2_path_free(trans, path); | |
67e0dd8f KO |
1779 | } |
1780 | ||
1781 | noinline __cold | |
1782 | void bch2_dump_trans_paths_updates(struct btree_trans *trans) | |
1783 | { | |
1784 | struct btree_path *path; | |
1785 | struct btree_insert_entry *i; | |
1786 | unsigned idx; | |
60816d9b | 1787 | char buf1[300], buf2[300]; |
67e0dd8f KO |
1788 | |
1789 | btree_trans_sort_paths(trans); | |
1790 | ||
1791 | trans_for_each_path_inorder(trans, path, idx) | |
32b26e8c | 1792 | printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", |
67e0dd8f | 1793 | path->idx, path->ref, path->intent_ref, |
32b26e8c KO |
1794 | path->should_be_locked ? " S" : "", |
1795 | path->preserve ? " P" : "", | |
67e0dd8f | 1796 | bch2_btree_ids[path->btree_id], |
60816d9b | 1797 | (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), |
32b26e8c | 1798 | path->nodes_locked, |
67e0dd8f KO |
1799 | #ifdef CONFIG_BCACHEFS_DEBUG |
1800 | (void *) path->ip_allocated | |
1801 | #else | |
1802 | NULL | |
1803 | #endif | |
1804 | ); | |
1805 | ||
60816d9b KO |
1806 | trans_for_each_update(trans, i) { |
1807 | struct bkey u; | |
1808 | struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); | |
1809 | ||
1810 | printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", | |
67e0dd8f | 1811 | bch2_btree_ids[i->btree_id], |
60816d9b KO |
1812 | (void *) i->ip_allocated, |
1813 | (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1), | |
1814 | (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2)); | |
1815 | } | |
67e0dd8f KO |
1816 | } |
1817 | ||
1818 | static struct btree_path *btree_path_alloc(struct btree_trans *trans, | |
1819 | struct btree_path *pos) | |
1820 | { | |
1821 | struct btree_path *path; | |
1822 | unsigned idx; | |
1823 | ||
1824 | if (unlikely(trans->paths_allocated == | |
1825 | ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { | |
1826 | bch2_dump_trans_paths_updates(trans); | |
1827 | panic("trans path overflow\n"); | |
1828 | } | |
1829 | ||
1830 | idx = __ffs64(~trans->paths_allocated); | |
1831 | trans->paths_allocated |= 1ULL << idx; | |
1832 | ||
1833 | path = &trans->paths[idx]; | |
1834 | ||
1835 | path->idx = idx; | |
1836 | path->ref = 0; | |
1837 | path->intent_ref = 0; | |
1838 | path->nodes_locked = 0; | |
1839 | path->nodes_intent_locked = 0; | |
1840 | ||
1841 | btree_path_list_add(trans, pos, path); | |
1842 | return path; | |
1843 | } | |
1844 | ||
f3e1f444 | 1845 | struct btree_path *bch2_path_get(struct btree_trans *trans, |
67e0dd8f KO |
1846 | enum btree_id btree_id, struct bpos pos, |
1847 | unsigned locks_want, unsigned level, | |
f3e1f444 | 1848 | unsigned flags) |
67e0dd8f | 1849 | { |
807dda8c | 1850 | struct btree_path *path, *path_pos = NULL; |
f3e1f444 KO |
1851 | bool cached = flags & BTREE_ITER_CACHED; |
1852 | bool intent = flags & BTREE_ITER_INTENT; | |
67e0dd8f KO |
1853 | int i; |
1854 | ||
1855 | BUG_ON(trans->restarted); | |
eb7bd15f | 1856 | btree_trans_sort_paths(trans); |
67e0dd8f | 1857 |
807dda8c KO |
1860 | trans_for_each_path_inorder(trans, path, i) { |
1861 | if (__btree_path_cmp(path, | |
1862 | btree_id, | |
1863 | cached, | |
1864 | pos, | |
1865 | level) > 0) | |
1866 | break; | |
67e0dd8f | 1867 | |
807dda8c | 1868 | path_pos = path; |
67e0dd8f KO |
1869 | } |
1870 | ||
807dda8c KO |
1871 | if (path_pos && |
1872 | path_pos->cached == cached && | |
1873 | path_pos->btree_id == btree_id && | |
1874 | path_pos->level == level) { | |
1875 | __btree_path_get(path_pos, intent); | |
ce91abd6 | 1876 | path = bch2_btree_path_set_pos(trans, path_pos, pos, intent); |
67e0dd8f | 1877 | } else { |
807dda8c KO |
1878 | path = btree_path_alloc(trans, path_pos); |
1879 | path_pos = NULL; | |
67e0dd8f KO |
1880 | |
1881 | __btree_path_get(path, intent); | |
1882 | path->pos = pos; | |
1883 | path->btree_id = btree_id; | |
1884 | path->cached = cached; | |
67e0dd8f KO |
1885 | path->uptodate = BTREE_ITER_NEED_TRAVERSE; |
1886 | path->should_be_locked = false; | |
1887 | path->level = level; | |
1888 | path->locks_want = locks_want; | |
1889 | path->nodes_locked = 0; | |
1890 | path->nodes_intent_locked = 0; | |
1891 | for (i = 0; i < ARRAY_SIZE(path->l); i++) | |
1892 | path->l[i].b = BTREE_ITER_NO_NODE_INIT; | |
1893 | #ifdef CONFIG_BCACHEFS_DEBUG | |
1894 | path->ip_allocated = _RET_IP_; | |
1895 | #endif | |
1896 | trans->paths_sorted = false; | |
1897 | } | |
1898 | ||
f3e1f444 KO |
1899 | if (!(flags & BTREE_ITER_NOPRESERVE)) |
1900 | path->preserve = true; | |
1901 | ||
67e0dd8f KO |
1902 | if (path->intent_ref) |
1903 | locks_want = max(locks_want, level + 1); | |
1904 | ||
1905 | /* | |
1906 | * If the path has locks_want greater than requested, we don't downgrade | |
1907 | * it here - on transaction restart because btree node split needs to | |
1908 | * upgrade locks, we might be putting/getting the iterator again. | |
1909 | * Downgrading iterators only happens via bch2_trans_downgrade(), after | |
1910 | * a successful transaction commit. | |
1911 | */ | |
1912 | ||
1913 | locks_want = min(locks_want, BTREE_MAX_DEPTH); | |
1914 | if (locks_want > path->locks_want) { | |
1915 | path->locks_want = locks_want; | |
bc82d08b | 1916 | btree_path_get_locks(trans, path, true); |
67e0dd8f KO |
1917 | } |
1918 | ||
67e0dd8f KO |
1919 | return path; |
1920 | } | |
1921 | ||
1922 | inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) | |
1923 | { | |
1924 | ||
1925 | struct bkey_s_c k; | |
1926 | ||
67e0dd8f KO |
1927 | if (!path->cached) { |
1928 | struct btree_path_level *l = path_l(path); | |
f7b6ca23 KO |
1929 | struct bkey_packed *_k; |
1930 | ||
1931 | EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); | |
67e0dd8f | 1932 | |
f7b6ca23 | 1933 | _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); |
67e0dd8f KO |
1934 | k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; |
1935 | ||
1936 | EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); | |
1937 | ||
1938 | if (!k.k || bpos_cmp(path->pos, k.k->p)) | |
1939 | goto hole; | |
1940 | } else { | |
1941 | struct bkey_cached *ck = (void *) path->l[0].b; | |
1942 | ||
f7b6ca23 KO |
1943 | EBUG_ON(ck && |
1944 | (path->btree_id != ck->key.btree_id || | |
1945 | bkey_cmp(path->pos, ck->key.pos))); | |
67e0dd8f | 1946 | |
f7b6ca23 KO |
1947 | /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */ |
1948 | if (unlikely(!ck || !ck->valid)) | |
1949 | return bkey_s_c_null; | |
1950 | ||
1951 | EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); | |
67e0dd8f | 1952 | |
2e63e180 | 1953 | *u = ck->k->k; |
67e0dd8f KO |
1954 | k = bkey_i_to_s_c(ck->k); |
1955 | } | |
1956 | ||
1957 | return k; | |
1958 | hole: | |
1959 | bkey_init(u); | |
1960 | u->p = path->pos; | |
1961 | return (struct bkey_s_c) { u, NULL }; | |
1962 | } | |
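
/*
 * Minimal usage sketch, for illustration only: the typical
 * bch2_path_get()/bch2_path_put() pairing. The function name, btree id and
 * position here are hypothetical placeholders; a path must be traversed
 * successfully before bch2_btree_path_peek_slot() may be called on it:
 */
static int peek_and_put_sketch(struct btree_trans *trans)
{
	struct btree_path *path;
	struct bkey u;
	struct bkey_s_c k;
	int ret;

	path = bch2_path_get(trans, BTREE_ID_extents, POS(0, 0),
			     0, 0, BTREE_ITER_INTENT);

	ret = bch2_btree_path_traverse(trans, path, 0);
	if (!ret) {
		k = bch2_btree_path_peek_slot(path, &u);
		/* ... use k here ... */
	}

	/* drop our ref; a path with ->preserve set may be kept for reuse */
	bch2_path_put(trans, path, true);
	return ret;
}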
1963 | ||
1964 | /* Btree iterators: */ | |
1965 | ||
db92f2ea KO |
1966 | int __must_check |
1967 | __bch2_btree_iter_traverse(struct btree_iter *iter) | |
1968 | { | |
1969 | return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); | |
1970 | } | |
1971 | ||
08070cba KO |
1972 | int __must_check |
1973 | bch2_btree_iter_traverse(struct btree_iter *iter) | |
1974 | { | |
66a0a497 KO |
1975 | int ret; |
1976 | ||
ce91abd6 | 1977 | iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, |
67e0dd8f KO |
1978 | btree_iter_search_key(iter), |
1979 | iter->flags & BTREE_ITER_INTENT); | |
08070cba | 1980 | |
67e0dd8f | 1981 | ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); |
66a0a497 KO |
1982 | if (ret) |
1983 | return ret; | |
1984 | ||
67e0dd8f | 1985 | iter->path->should_be_locked = true; |
66a0a497 | 1986 | return 0; |
08070cba KO |
1987 | } |
1988 | ||
1c6fdbd8 KO |
1989 | /* Iterate across nodes (leaf and interior nodes) */ |
1990 | ||
1991 | struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) | |
1992 | { | |
502027a8 | 1993 | struct btree_trans *trans = iter->trans; |
f21566f1 | 1994 | struct btree *b = NULL; |
1c6fdbd8 KO |
1995 | int ret; |
1996 | ||
67e0dd8f | 1997 | EBUG_ON(iter->path->cached); |
7e1a3aa9 | 1998 | bch2_btree_iter_verify(iter); |
1c6fdbd8 | 1999 | |
502027a8 | 2000 | ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); |
1c6fdbd8 | 2001 | if (ret) |
d355c6f4 | 2002 | goto err; |
1c6fdbd8 | 2003 | |
67e0dd8f | 2004 | b = btree_path_node(iter->path, iter->path->level); |
1c6fdbd8 | 2005 | if (!b) |
f21566f1 | 2006 | goto out; |
1c6fdbd8 | 2007 | |
4cf91b02 | 2008 | BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); |
1c6fdbd8 | 2009 | |
f21566f1 | 2010 | bkey_init(&iter->k); |
67e0dd8f | 2011 | iter->k.p = iter->pos = b->key.k.p; |
502027a8 | 2012 | |
ce91abd6 | 2013 | iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, |
502027a8 | 2014 | iter->flags & BTREE_ITER_INTENT); |
67e0dd8f | 2015 | iter->path->should_be_locked = true; |
502027a8 | 2016 | BUG_ON(iter->path->uptodate); |
f21566f1 KO |
2017 | out: |
2018 | bch2_btree_iter_verify_entry_exit(iter); | |
2019 | bch2_btree_iter_verify(iter); | |
2dac0eae | 2020 | |
1c6fdbd8 | 2021 | return b; |
d355c6f4 KO |
2022 | err: |
2023 | b = ERR_PTR(ret); | |
2024 | goto out; | |
1c6fdbd8 KO |
2025 | } |
2026 | ||
2dac0eae | 2027 | struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) |
1c6fdbd8 | 2028 | { |
67e0dd8f KO |
2029 | struct btree_trans *trans = iter->trans; |
2030 | struct btree_path *path = iter->path; | |
f21566f1 | 2031 | struct btree *b = NULL; |
979735df | 2032 | unsigned l; |
1c6fdbd8 KO |
2033 | int ret; |
2034 | ||
979735df | 2035 | BUG_ON(trans->restarted); |
67e0dd8f | 2036 | EBUG_ON(iter->path->cached); |
7e1a3aa9 | 2037 | bch2_btree_iter_verify(iter); |
1c6fdbd8 | 2038 | |
979735df | 2039 | /* already at end? */ |
67e0dd8f | 2040 | if (!btree_path_node(path, path->level)) |
979735df | 2041 | return NULL; |
1c6fdbd8 | 2042 | |
979735df KO |
2043 | /* got to end? */ |
2044 | if (!btree_path_node(path, path->level + 1)) { | |
2045 | btree_node_unlock(path, path->level); | |
2046 | path->l[path->level].b = BTREE_ITER_NO_NODE_UP; | |
2047 | path->level++; | |
2048 | return NULL; | |
2049 | } | |
1c6fdbd8 | 2050 | |
979735df KO |
2051 | if (!bch2_btree_node_relock(trans, path, path->level + 1)) { |
2052 | __bch2_btree_path_unlock(path); | |
2053 | path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; | |
2054 | path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; | |
bc82d08b KO |
2055 | trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, |
2056 | path->btree_id, &path->pos); | |
979735df KO |
2057 | btree_trans_restart(trans); |
2058 | ret = -EINTR; | |
d355c6f4 | 2059 | goto err; |
979735df | 2060 | } |
1c6fdbd8 | 2061 | |
979735df | 2062 | b = btree_path_node(path, path->level + 1); |
1c6fdbd8 | 2063 | |
979735df KO |
2064 | if (!bpos_cmp(iter->pos, b->key.k.p)) { |
2065 | btree_node_unlock(path, path->level); | |
2066 | path->l[path->level].b = BTREE_ITER_NO_NODE_UP; | |
2067 | path->level++; | |
2068 | } else { | |
1c6fdbd8 KO |
2069 | /* |
2070 | * Haven't gotten to the end of the parent node: go back down to | |
2071 | * the next child node | |
2072 | */ | |
67e0dd8f | 2073 | path = iter->path = |
ce91abd6 | 2074 | bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), |
67e0dd8f | 2075 | iter->flags & BTREE_ITER_INTENT); |
1c6fdbd8 | 2076 | |
67e0dd8f | 2077 | path->level = iter->min_depth; |
979735df KO |
2078 | |
2079 | for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) | |
2080 | if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) | |
2081 | btree_node_unlock(path, l); | |
2082 | ||
67e0dd8f | 2083 | btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); |
345ca825 KO |
2084 | bch2_btree_iter_verify(iter); |
2085 | ||
67e0dd8f | 2086 | ret = bch2_btree_path_traverse(trans, path, iter->flags); |
d355c6f4 KO |
2087 | if (ret) |
2088 | goto err; | |
1c6fdbd8 | 2089 | |
67e0dd8f | 2090 | b = path->l[path->level].b; |
1c6fdbd8 KO |
2091 | } |
2092 | ||
f21566f1 | 2093 | bkey_init(&iter->k); |
67e0dd8f | 2094 | iter->k.p = iter->pos = b->key.k.p; |
502027a8 | 2095 | |
ce91abd6 | 2096 | iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, |
502027a8 | 2097 | iter->flags & BTREE_ITER_INTENT); |
67e0dd8f | 2098 | iter->path->should_be_locked = true; |
502027a8 | 2099 | BUG_ON(iter->path->uptodate); |
f21566f1 KO |
2100 | out: |
2101 | bch2_btree_iter_verify_entry_exit(iter); | |
2102 | bch2_btree_iter_verify(iter); | |
2dac0eae | 2103 | |
1c6fdbd8 | 2104 | return b; |
d355c6f4 KO |
2105 | err: |
2106 | b = ERR_PTR(ret); | |
2107 | goto out; | |
1c6fdbd8 KO |
2108 | } |
2109 | ||
2110 | /* Iterate across keys (in leaf nodes only) */ | |
2111 | ||
e0ba3b64 | 2112 | inline bool bch2_btree_iter_advance(struct btree_iter *iter) |
434094be | 2113 | { |
3d495595 | 2114 | struct bpos pos = iter->k.p; |
6caf0578 KO |
2115 | bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS |
2116 | ? bpos_cmp(pos, SPOS_MAX) | |
2117 | : bkey_cmp(pos, SPOS_MAX)) != 0; | |
3d495595 | 2118 | |
7e1a3aa9 | 2119 | if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) |
e751c01a | 2120 | pos = bkey_successor(iter, pos); |
3d495595 | 2121 | bch2_btree_iter_set_pos(iter, pos); |
7e1a3aa9 | 2122 | return ret; |
3d495595 KO |
2123 | } |
2124 | ||
e0ba3b64 | 2125 | inline bool bch2_btree_iter_rewind(struct btree_iter *iter) |
3d495595 KO |
2126 | { |
2127 | struct bpos pos = bkey_start_pos(&iter->k); | |
71f892a4 KO |
2128 | bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS |
2129 | ? bpos_cmp(pos, POS_MIN) | |
2130 | : bkey_cmp(pos, POS_MIN)) != 0; | |
3d495595 | 2131 | |
7e1a3aa9 | 2132 | if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) |
e751c01a | 2133 | pos = bkey_predecessor(iter, pos); |
3d495595 | 2134 | bch2_btree_iter_set_pos(iter, pos); |
7e1a3aa9 | 2135 | return ret; |
434094be KO |
2136 | } |
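
/*
 * Minimal usage sketch, for illustration only: the forward iteration loop
 * that advance()/rewind() support. The function name and btree id are
 * hypothetical placeholders. peek() returns the first key at or after the
 * iterator's position; advance() steps past the key just returned:
 */
static int walk_keys_sketch(struct btree_trans *trans)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS_MIN, 0);

	while ((k = bch2_btree_iter_peek(&iter)).k &&
	       !(ret = bkey_err(k))) {
		/* ... use k here ... */

		if (!bch2_btree_iter_advance(&iter))
			break;	/* reached SPOS_MAX: end of btree */
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}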
2137 | ||
3763cb95 | 2138 | static noinline |
67e0dd8f | 2139 | struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) |
818664f5 KO |
2140 | { |
2141 | struct btree_insert_entry *i; | |
5288e66a | 2142 | struct bkey_i *ret = NULL; |
818664f5 | 2143 | |
8e6bbc41 | 2144 | trans_for_each_update(iter->trans, i) { |
5288e66a KO |
2145 | if (i->btree_id < iter->btree_id) |
2146 | continue; | |
2147 | if (i->btree_id > iter->btree_id) | |
818664f5 | 2148 | break; |
67e0dd8f | 2149 | if (bpos_cmp(i->k->k.p, iter->path->pos) < 0) |
5288e66a | 2150 | continue; |
12ce5b7d KO |
2151 | if (i->key_cache_already_flushed) |
2152 | continue; | |
5288e66a KO |
2153 | if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0) |
2154 | ret = i->k; | |
2155 | } | |
818664f5 | 2156 | |
5288e66a KO |
2157 | return ret; |
2158 | } | |
2159 | ||
3763cb95 KO |
2160 | static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) |
2161 | { | |
2162 | return iter->flags & BTREE_ITER_WITH_UPDATES | |
2163 | ? __bch2_btree_trans_peek_updates(iter) | |
2164 | : NULL; | |
2165 | } | |
2166 | ||
5222a460 KO |
2167 | static noinline |
2168 | struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, | |
2169 | struct btree_iter *iter) | |
2170 | { | |
45e4cd9e KO |
2171 | struct bkey_i *k = bch2_journal_keys_peek(trans->c, iter->btree_id, 0, |
2172 | iter->path->pos); | |
5222a460 | 2173 | |
45e4cd9e | 2174 | if (k && !bpos_cmp(k->k.p, iter->path->pos)) { |
5222a460 KO |
2175 | iter->k = k->k; |
2176 | return bkey_i_to_s_c(k); | |
2177 | } else { | |
2178 | return bkey_s_c_null; | |
2179 | } | |
2180 | } | |
2181 | ||
2182 | static noinline | |
2183 | struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, | |
2184 | struct btree_iter *iter, | |
2185 | struct bkey_s_c k) | |
2186 | { | |
2187 | struct bkey_i *next_journal = | |
45e4cd9e KO |
2188 | bch2_journal_keys_peek(trans->c, iter->btree_id, 0, |
2189 | iter->path->pos); | |
5222a460 KO |
2190 | |
2191 | if (next_journal && | |
2192 | bpos_cmp(next_journal->k.p, | |
2193 | k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { | |
2194 | iter->k = next_journal->k; | |
2195 | k = bkey_i_to_s_c(next_journal); | |
2196 | } | |
2197 | ||
2198 | return k; | |
2199 | } | |
2200 | ||
f7b6ca23 KO |
2201 | /* |
2202 | * Checks btree key cache for key at iter->pos and returns it if present, or | |
2203 | * bkey_s_c_null: | |
2204 | */ | |
2205 | static noinline | |
2206 | struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) | |
2207 | { | |
2208 | struct btree_trans *trans = iter->trans; | |
2209 | struct bch_fs *c = trans->c; | |
2210 | struct bkey u; | |
2211 | int ret; | |
2212 | ||
2213 | if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) | |
2214 | return bkey_s_c_null; | |
2215 | ||
2216 | if (!iter->key_cache_path) | |
2217 | iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, | |
2218 | iter->flags & BTREE_ITER_INTENT, 0, | |
2219 | iter->flags|BTREE_ITER_CACHED); | |
2220 | ||
2221 | iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, | |
2222 | iter->flags & BTREE_ITER_INTENT); | |
2223 | ||
2224 | ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); | |
2225 | if (unlikely(ret)) | |
2226 | return bkey_s_c_err(ret); | |
2227 | ||
2228 | iter->key_cache_path->should_be_locked = true; | |
2229 | ||
2230 | return bch2_btree_path_peek_slot(iter->key_cache_path, &u); | |
2231 | } | |
2232 | ||
a1e82d35 | 2233 | static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) |
1c6fdbd8 | 2234 | { |
67e0dd8f | 2235 | struct btree_trans *trans = iter->trans; |
360746bf | 2236 | struct bkey_i *next_update; |
f7b6ca23 | 2237 | struct bkey_s_c k, k2; |
8e432d98 | 2238 | int ret; |
1c6fdbd8 | 2239 | |
67e0dd8f | 2240 | EBUG_ON(iter->path->cached || iter->path->level); |
7e1a3aa9 | 2241 | bch2_btree_iter_verify(iter); |
1c6fdbd8 | 2242 | |
1c6fdbd8 | 2243 | while (1) { |
ce91abd6 KO |
2244 | iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, |
2245 | iter->flags & BTREE_ITER_INTENT); | |
c8476a4e | 2246 | |
67e0dd8f | 2247 | ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); |
e6e024e9 KO |
2248 | if (unlikely(ret)) { |
2249 | /* ensure that iter->k is consistent with iter->pos: */ | |
2250 | bch2_btree_iter_set_pos(iter, iter->pos); | |
2251 | k = bkey_s_c_err(ret); | |
2252 | goto out; | |
2253 | } | |
1c6fdbd8 | 2254 | |
f7b6ca23 KO |
2255 | iter->path->should_be_locked = true; |
2256 | ||
67e0dd8f | 2257 | k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); |
e6e024e9 | 2258 | |
f7b6ca23 KO |
2259 | if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && |
2260 | k.k && | |
2261 | (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { | |
2262 | ret = bkey_err(k2); | |
2263 | if (ret) { | |
2264 | k = k2; | |
2265 | bch2_btree_iter_set_pos(iter, iter->pos); | |
2266 | goto out; | |
2267 | } | |
2268 | ||
2269 | k = k2; | |
2270 | iter->k = *k.k; | |
2271 | } | |
2272 | ||
5222a460 KO |
2273 | if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) |
2274 | k = btree_trans_peek_journal(trans, iter, k); | |
2275 | ||
2276 | next_update = btree_trans_peek_updates(iter); | |
e6e024e9 | 2277 | |
818664f5 | 2278 | if (next_update && |
e6e024e9 | 2279 | bpos_cmp(next_update->k.p, |
67e0dd8f | 2280 | k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { |
5288e66a | 2281 | iter->k = next_update->k; |
818664f5 | 2282 | k = bkey_i_to_s_c(next_update); |
5288e66a | 2283 | } |
818664f5 | 2284 | |
5222a460 KO |
2285 | if (k.k && bkey_deleted(k.k)) { |
2286 | /* | |
2287 | * If we've got a whiteout, and it's after the search | |
2288 | * key, advance the search key to the whiteout instead | |
2289 | * of just after the whiteout - it might be a btree | |
2290 | * whiteout, with a real key at the same position, since | |
2291 | * in the btree deleted keys sort before non deleted. | |
2292 | */ | |
2293 | search_key = bpos_cmp(search_key, k.k->p) | |
2294 | ? k.k->p | |
2295 | : bpos_successor(k.k->p); | |
2296 | continue; | |
2297 | } | |
2298 | ||
818664f5 | 2299 | if (likely(k.k)) { |
c075ff70 | 2300 | break; |
67e0dd8f | 2301 | } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { |
e6e024e9 | 2302 | /* Advance to next leaf node: */ |
67e0dd8f | 2303 | search_key = bpos_successor(iter->path->l[0].b->key.k.p); |
e6e024e9 KO |
2304 | } else { |
2305 | /* End of btree: */ | |
c8476a4e KO |
2306 | bch2_btree_iter_set_pos(iter, SPOS_MAX); |
2307 | k = bkey_s_c_null; | |
2308 | goto out; | |
2309 | } | |
1c6fdbd8 | 2310 | } |
a1e82d35 | 2311 | out: |
a1e82d35 KO |
2312 | bch2_btree_iter_verify(iter); |
2313 | ||
2314 | return k; | |
2315 | } | |
2316 | ||
2317 | /** | |
2318 | * bch2_btree_iter_peek: returns first key greater than or equal to iterator's | |
2319 | * current position | |
2320 | */ | |
2321 | struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) | |
2322 | { | |
2323 | struct btree_trans *trans = iter->trans; | |
2324 | struct bpos search_key = btree_iter_search_key(iter); | |
2325 | struct bkey_s_c k; | |
2326 | int ret; | |
2327 | ||
1f2d9192 KO |
2328 | if (iter->update_path) { |
2329 | bch2_path_put(trans, iter->update_path, | |
2330 | iter->flags & BTREE_ITER_INTENT); | |
2331 | iter->update_path = NULL; | |
2332 | } | |
2333 | ||
a1e82d35 KO |
2334 | bch2_btree_iter_verify_entry_exit(iter); |
2335 | ||
2336 | while (1) { | |
2337 | k = __bch2_btree_iter_peek(iter, search_key); | |
2338 | if (!k.k || bkey_err(k)) | |
2339 | goto out; | |
2340 | ||
1f2d9192 KO |
2341 | if (iter->update_path && |
2342 | bkey_cmp(iter->update_path->pos, k.k->p)) { | |
2343 | bch2_path_put(trans, iter->update_path, | |
2344 | iter->flags & BTREE_ITER_INTENT); | |
2345 | iter->update_path = NULL; | |
2346 | } | |
2347 | ||
2348 | if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && | |
2349 | (iter->flags & BTREE_ITER_INTENT) && | |
2350 | !(iter->flags & BTREE_ITER_IS_EXTENTS) && | |
2351 | !iter->update_path) { | |
2352 | struct bpos pos = k.k->p; | |
2353 | ||
2354 | if (pos.snapshot < iter->snapshot) { | |
2355 | search_key = bpos_successor(k.k->p); | |
2356 | continue; | |
2357 | } | |
2358 | ||
2359 | pos.snapshot = iter->snapshot; | |
2360 | ||
2361 | /* | |
2362 | * advance, same as on exit for iter->path, but only up | |
2363 | * to snapshot | |
2364 | */ | |
2365 | __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); | |
2366 | iter->update_path = iter->path; | |
2367 | ||
ce91abd6 | 2368 | iter->update_path = bch2_btree_path_set_pos(trans, |
1f2d9192 KO |
2369 | iter->update_path, pos, |
2370 | iter->flags & BTREE_ITER_INTENT); | |
2371 | ||
2372 | BUG_ON(!(iter->update_path->nodes_locked & 1)); | |
2373 | iter->update_path->should_be_locked = true; | |
2374 | } | |
2375 | ||
a1e82d35 KO |
2376 | /* |
2377 | * We can never have a key in a leaf node at POS_MAX, so | |
2378 | * we don't have to check these successor() calls: | |
2379 | */ | |
2380 | if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && | |
2381 | !bch2_snapshot_is_ancestor(trans->c, | |
2382 | iter->snapshot, | |
2383 | k.k->p.snapshot)) { | |
2384 | search_key = bpos_successor(k.k->p); | |
2385 | continue; | |
2386 | } | |
2387 | ||
2388 | if (bkey_whiteout(k.k) && | |
2389 | !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { | |
2390 | search_key = bkey_successor(iter, k.k->p); | |
2391 | continue; | |
2392 | } | |
2393 | ||
2394 | break; | |
2395 | } | |
2396 | ||
1c6fdbd8 | 2397 | /* |
818664f5 KO |
2398 | * iter->pos should be monotonically increasing, and always be equal to |
2399 | * the key we just returned - except extents can straddle iter->pos: | |
1c6fdbd8 | 2400 | */ |
d7f35163 KO |
2401 | if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) |
2402 | iter->pos = k.k->p; | |
2403 | else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) | |
1c6fdbd8 | 2404 | iter->pos = bkey_start_pos(k.k); |
1f2d9192 | 2405 | |
ce91abd6 | 2406 | iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, |
1f2d9192 KO |
2407 | iter->flags & BTREE_ITER_INTENT); |
2408 | BUG_ON(!iter->path->nodes_locked); | |
a1e82d35 | 2409 | out: |
1f2d9192 KO |
2410 | if (iter->update_path) { |
2411 | BUG_ON(!(iter->update_path->nodes_locked & 1)); | |
2412 | iter->update_path->should_be_locked = true; | |
2413 | } | |
2414 | iter->path->should_be_locked = true; | |
2415 | ||
a1e82d35 | 2416 | if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) |
c075ff70 KO |
2417 | iter->pos.snapshot = iter->snapshot; |
2418 | ||
a1e82d35 KO |
2419 | ret = bch2_btree_iter_verify_ret(iter, k); |
2420 | if (unlikely(ret)) { | |
2421 | bch2_btree_iter_set_pos(iter, iter->pos); | |
2422 | k = bkey_s_c_err(ret); | |
2423 | } | |
67e0dd8f | 2424 | |
7e1a3aa9 | 2425 | bch2_btree_iter_verify_entry_exit(iter); |
c075ff70 | 2426 | |
1c6fdbd8 KO |
2427 | return k; |
2428 | } | |
2429 | ||
f4b61341 KO |
2430 | /** |
2431 | * bch2_btree_iter_next: returns first key greater than iterator's current | |
2432 | * position | |
2433 | */ | |
1c6fdbd8 KO |
2434 | struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) |
2435 | { | |
e0ba3b64 | 2436 | if (!bch2_btree_iter_advance(iter)) |
2e70ce56 | 2437 | return bkey_s_c_null; |
f4b61341 | 2438 | |
2dac0eae | 2439 | return bch2_btree_iter_peek(iter); |
1c6fdbd8 KO |
2440 | } |
2441 | ||
ccf5a109 KO |
2442 | /** |
2443 | * bch2_btree_iter_peek_prev: returns first key less than or equal to | |
2444 | * iterator's current position | |
2445 | */ | |
2446 | struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) | |
1c6fdbd8 | 2447 | { |
67e0dd8f | 2448 | struct btree_trans *trans = iter->trans; |
c8476a4e | 2449 | struct bpos search_key = iter->pos; |
c075ff70 | 2450 | struct btree_path *saved_path = NULL; |
1c6fdbd8 | 2451 | struct bkey_s_c k; |
c075ff70 KO |
2452 | struct bkey saved_k; |
2453 | const struct bch_val *saved_v; | |
1c6fdbd8 KO |
2454 | int ret; |
2455 | ||
67e0dd8f | 2456 | EBUG_ON(iter->path->cached || iter->path->level); |
5288e66a | 2457 | EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); |
5222a460 KO |
2458 | |
2459 | if (iter->flags & BTREE_ITER_WITH_JOURNAL) | |
2460 | return bkey_s_c_err(-EIO); | |
2461 | ||
7e1a3aa9 KO |
2462 | bch2_btree_iter_verify(iter); |
2463 | bch2_btree_iter_verify_entry_exit(iter); | |
2464 | ||
c075ff70 KO |
2465 | if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) |
2466 | search_key.snapshot = U32_MAX; | |
2467 | ||
1c6fdbd8 | 2468 | while (1) { |
ce91abd6 | 2469 | iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, |
67e0dd8f | 2470 | iter->flags & BTREE_ITER_INTENT); |
c8476a4e | 2471 | |
67e0dd8f | 2472 | ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); |
7e1a3aa9 | 2473 | if (unlikely(ret)) { |
e6e024e9 KO |
2474 | /* ensure that iter->k is consistent with iter->pos: */ |
2475 | bch2_btree_iter_set_pos(iter, iter->pos); | |
7e1a3aa9 | 2476 | k = bkey_s_c_err(ret); |
e6e024e9 | 2477 | goto out; |
7e1a3aa9 | 2478 | } |
1c6fdbd8 | 2479 | |
67e0dd8f KO |
2480 | k = btree_path_level_peek(trans, iter->path, |
2481 | &iter->path->l[0], &iter->k); | |
3d495595 KO |
2482 | if (!k.k || |
2483 | ((iter->flags & BTREE_ITER_IS_EXTENTS) | |
c075ff70 KO |
2484 | ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 |
2485 | : bpos_cmp(k.k->p, search_key) > 0)) | |
67e0dd8f KO |
2486 | k = btree_path_level_prev(trans, iter->path, |
2487 | &iter->path->l[0], &iter->k); | |
ccf5a109 | 2488 | |
e6e024e9 | 2489 | if (likely(k.k)) { |
c075ff70 KO |
2490 | if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { |
2491 | if (k.k->p.snapshot == iter->snapshot) | |
2492 | goto got_key; | |
2493 | ||
2494 | /* | |
2495 | * If we have a saved candidate, and we're no | |
2496 | * longer at the same _key_ (not pos), return | |
2497 | * that candidate | |
2498 | */ | |
2499 | if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { | |
2500 | bch2_path_put(trans, iter->path, | |
2501 | iter->flags & BTREE_ITER_INTENT); | |
2502 | iter->path = saved_path; | |
2503 | saved_path = NULL; | |
2504 | iter->k = saved_k; | |
2505 | k.v = saved_v; | |
2506 | goto got_key; | |
2507 | } | |
2508 | ||
2509 | if (bch2_snapshot_is_ancestor(iter->trans->c, | |
2510 | iter->snapshot, | |
2511 | k.k->p.snapshot)) { | |
2512 | if (saved_path) | |
2513 | bch2_path_put(trans, saved_path, | |
2514 | iter->flags & BTREE_ITER_INTENT); | |
2515 | saved_path = btree_path_clone(trans, iter->path, | |
2516 | iter->flags & BTREE_ITER_INTENT); | |
2517 | saved_k = *k.k; | |
2518 | saved_v = k.v; | |
2519 | } | |
2520 | ||
2521 | search_key = bpos_predecessor(k.k->p); | |
2522 | continue; | |
2523 | } | |
2524 | got_key: | |
2525 | if (bkey_whiteout(k.k) && | |
2526 | !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { | |
2527 | search_key = bkey_predecessor(iter, k.k->p); | |
2528 | if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) | |
2529 | search_key.snapshot = U32_MAX; | |
2530 | continue; | |
2531 | } | |
2532 | ||
1c6fdbd8 | 2533 | break; |
67e0dd8f | 2534 | } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { |
e6e024e9 | 2535 | /* Advance to previous leaf node: */ |
67e0dd8f | 2536 | search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); |
e6e024e9 KO |
2537 | } else { |
2538 | /* Start of btree: */ | |
c8476a4e | 2539 | bch2_btree_iter_set_pos(iter, POS_MIN); |
7e1a3aa9 | 2540 | k = bkey_s_c_null; |
e6e024e9 | 2541 | goto out; |
7e1a3aa9 | 2542 | } |
ccf5a109 | 2543 | } |
1c6fdbd8 | 2544 | |
a045be5a | 2545 | EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); |
3d495595 KO |
2546 | |
2547 | /* Extents can straddle iter->pos: */ | |
a045be5a | 2548 | if (bkey_cmp(k.k->p, iter->pos) < 0) |
3d495595 | 2549 | iter->pos = k.k->p; |
c075ff70 KO |
2550 | |
2551 | if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) | |
2552 | iter->pos.snapshot = iter->snapshot; | |
7e1a3aa9 | 2553 | out: |
c075ff70 KO |
2554 | if (saved_path) |
2555 | bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT); | |
67e0dd8f KO |
2556 | iter->path->should_be_locked = true; |
2557 | ||
7e1a3aa9 KO |
2558 | bch2_btree_iter_verify_entry_exit(iter); |
2559 | bch2_btree_iter_verify(iter); | |
67e0dd8f | 2560 | |
1c6fdbd8 KO |
2561 | return k; |
2562 | } | |
2563 | ||
ccf5a109 KO |
2564 | /** |
2565 | * bch2_btree_iter_prev: returns first key less than iterator's current | |
2566 | * position | |
2567 | */ | |
2568 | struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) | |
2569 | { | |
e0ba3b64 | 2570 | if (!bch2_btree_iter_rewind(iter)) |
2e70ce56 | 2571 | return bkey_s_c_null; |
ccf5a109 | 2572 | |
2e70ce56 | 2573 | return bch2_btree_iter_peek_prev(iter); |
ccf5a109 KO |
2574 | } |
2575 | ||
c3801239 | 2576 | struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) |
271a3d3a | 2577 | { |
9f6bd307 | 2578 | struct btree_trans *trans = iter->trans; |
953ee28a | 2579 | struct bpos search_key; |
271a3d3a | 2580 | struct bkey_s_c k; |
c3801239 KO |
2581 | int ret; |
2582 | ||
67e0dd8f | 2583 | EBUG_ON(iter->path->level); |
7e1a3aa9 KO |
2584 | bch2_btree_iter_verify(iter); |
2585 | bch2_btree_iter_verify_entry_exit(iter); | |
2586 | ||
1d214eb1 KO |
2587 | /* extents can't span inode numbers: */ |
2588 | if ((iter->flags & BTREE_ITER_IS_EXTENTS) && | |
953ee28a | 2589 | unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { |
1d214eb1 KO |
2590 | if (iter->pos.inode == KEY_INODE_MAX) |
2591 | return bkey_s_c_null; | |
c3801239 | 2592 | |
1d214eb1 KO |
2593 | bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); |
2594 | } | |
8042b5b7 | 2595 | |
953ee28a | 2596 | search_key = btree_iter_search_key(iter); |
ce91abd6 | 2597 | iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, |
67e0dd8f | 2598 | iter->flags & BTREE_ITER_INTENT); |
953ee28a | 2599 | |
67e0dd8f | 2600 | ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); |
c3801239 KO |
2601 | if (unlikely(ret)) |
2602 | return bkey_s_c_err(ret); | |
1c6fdbd8 | 2603 | |
c075ff70 KO |
2604 | if ((iter->flags & BTREE_ITER_CACHED) || |
2605 | !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { | |
953ee28a | 2606 | struct bkey_i *next_update; |
1c6fdbd8 | 2607 | |
5222a460 | 2608 | if ((next_update = btree_trans_peek_updates(iter)) && |
67e0dd8f | 2609 | !bpos_cmp(next_update->k.p, iter->pos)) { |
1d214eb1 KO |
2610 | iter->k = next_update->k; |
2611 | k = bkey_i_to_s_c(next_update); | |
5222a460 | 2612 | goto out; |
1d214eb1 KO |
2613 | } |
2614 | ||
5222a460 KO |
2615 | if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && |
2616 | (k = btree_trans_peek_slot_journal(trans, iter)).k) | |
2617 | goto out; | |
2618 | ||
f7b6ca23 KO |
2619 | if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && |
2620 | (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { | |
2621 | if (!bkey_err(k)) | |
2622 | iter->k = *k.k; | |
2623 | goto out; | |
2624 | } | |
2625 | ||
5222a460 | 2626 | k = bch2_btree_path_peek_slot(iter->path, &iter->k); |
1d214eb1 KO |
2627 | } else { |
2628 | struct bpos next; | |
1d214eb1 | 2629 | |
b1d87f52 | 2630 | if (iter->flags & BTREE_ITER_INTENT) { |
67e0dd8f | 2631 | struct btree_iter iter2; |
b1d87f52 | 2632 | |
67e0dd8f KO |
2633 | bch2_trans_copy_iter(&iter2, iter); |
2634 | k = bch2_btree_iter_peek(&iter2); | |
b1d87f52 | 2635 | |
67e0dd8f KO |
2636 | if (k.k && !bkey_err(k)) { |
2637 | iter->k = iter2.k; | |
2638 | k.k = &iter->k; | |
2639 | } | |
2640 | bch2_trans_iter_exit(trans, &iter2); | |
b1d87f52 KO |
2641 | } else { |
2642 | struct bpos pos = iter->pos; | |
2643 | ||
2644 | k = bch2_btree_iter_peek(iter); | |
2645 | iter->pos = pos; | |
2646 | } | |
1d214eb1 KO |
2647 | |
2648 | if (unlikely(bkey_err(k))) | |
2649 | return k; | |
2650 | ||
2651 | next = k.k ? bkey_start_pos(k.k) : POS_MAX; | |
2652 | ||
2653 | if (bkey_cmp(iter->pos, next) < 0) { | |
2654 | bkey_init(&iter->k); | |
2655 | iter->k.p = iter->pos; | |
c075ff70 KO |
2656 | |
2657 | if (iter->flags & BTREE_ITER_IS_EXTENTS) { | |
2658 | bch2_key_resize(&iter->k, | |
2659 | min_t(u64, KEY_SIZE_MAX, | |
2660 | (next.inode == iter->pos.inode | |
2661 | ? next.offset | |
2662 | : KEY_OFFSET_MAX) - | |
2663 | iter->pos.offset)); | |
2664 | EBUG_ON(!iter->k.size); | |
2665 | } | |
1d214eb1 KO |
2666 | |
2667 | k = (struct bkey_s_c) { &iter->k, NULL }; | |
1d214eb1 | 2668 | } |
271a3d3a | 2669 | } |
5222a460 | 2670 | out: |
67e0dd8f KO |
2671 | iter->path->should_be_locked = true; |
2672 | ||
7e1a3aa9 KO |
2673 | bch2_btree_iter_verify_entry_exit(iter); |
2674 | bch2_btree_iter_verify(iter); | |
c075ff70 KO |
2675 | ret = bch2_btree_iter_verify_ret(iter, k); |
2676 | if (unlikely(ret)) | |
2677 | return bkey_s_c_err(ret); | |
66a0a497 | 2678 | |
63f1a598 | 2679 | return k; |
1c6fdbd8 KO |
2680 | } |
2681 | ||
1c6fdbd8 KO |
2682 | struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) |
2683 | { | |
e0ba3b64 | 2684 | if (!bch2_btree_iter_advance(iter)) |
2e70ce56 | 2685 | return bkey_s_c_null; |
1c6fdbd8 | 2686 | |
2dac0eae | 2687 | return bch2_btree_iter_peek_slot(iter); |
1c6fdbd8 KO |
2688 | } |
2689 | ||
18fc6ae5 KO |
2690 | struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) |
2691 | { | |
e0ba3b64 | 2692 | if (!bch2_btree_iter_rewind(iter)) |
18fc6ae5 KO |
2693 | return bkey_s_c_null; |
2694 | ||
2695 | return bch2_btree_iter_peek_slot(iter); | |
2696 | } | |
2697 | ||
1c6fdbd8 KO |
2698 | /* new transactional stuff: */ |
2699 | ||
0423fb71 KO |
2700 | #ifdef CONFIG_BCACHEFS_DEBUG |
2701 | static void btree_trans_verify_sorted_refs(struct btree_trans *trans) | |
2702 | { | |
67e0dd8f | 2703 | struct btree_path *path; |
0423fb71 KO |
2704 | unsigned i; |
2705 | ||
67e0dd8f | 2706 | BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); |
0423fb71 | 2707 | |
67e0dd8f KO |
2708 | trans_for_each_path(trans, path) { |
2709 | BUG_ON(path->sorted_idx >= trans->nr_sorted); | |
2710 | BUG_ON(trans->sorted[path->sorted_idx] != path->idx); | |
0423fb71 KO |
2711 | } |
2712 | ||
2713 | for (i = 0; i < trans->nr_sorted; i++) { | |
2714 | unsigned idx = trans->sorted[i]; | |
2715 | ||
67e0dd8f KO |
2716 | EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); |
2717 | BUG_ON(trans->paths[idx].sorted_idx != i); | |
0423fb71 KO |
2718 | } |
2719 | } | |
0423fb71 KO |
2720 | |
2721 | static void btree_trans_verify_sorted(struct btree_trans *trans) | |
2722 | { | |
67e0dd8f | 2723 | struct btree_path *path, *prev = NULL; |
0423fb71 KO |
2724 | unsigned i; |
2725 | ||
67e0dd8f | 2726 | trans_for_each_path_inorder(trans, path, i) { |
eb7bd15f KO |
2727 | if (prev && btree_path_cmp(prev, path) > 0) { |
2728 | bch2_dump_trans_paths_updates(trans); | |
2729 | panic("trans paths out of order!\n"); | |
2730 | } | |
67e0dd8f | 2731 | prev = path; |
0423fb71 | 2732 | } |
0423fb71 | 2733 | } |
068bcaa5 KO |
2734 | #else |
2735 | static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} | |
2736 | static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} | |
2737 | #endif | |
0423fb71 | 2738 | |
068bcaa5 | 2739 | void __bch2_btree_trans_sort_paths(struct btree_trans *trans) |
0423fb71 KO |
2740 | { |
2741 | int i, l = 0, r = trans->nr_sorted, inc = 1; | |
2742 | bool swapped; | |
2743 | ||
068bcaa5 KO |
2744 | btree_trans_verify_sorted_refs(trans); |
2745 | ||
2746 | if (trans->paths_sorted) | |
2747 | goto out; | |
2748 | ||
0423fb71 KO |
2749 | /* |
2750 | * Cocktail shaker sort: this is efficient because iterators will be | |
2751 | * mostly sorted. | |
2752 | */ | |
2753 | do { | |
2754 | swapped = false; | |
2755 | ||
2756 | for (i = inc > 0 ? l : r - 2; | |
2757 | i + 1 < r && i >= l; | |
2758 | i += inc) { | |
67e0dd8f KO |
2759 | if (btree_path_cmp(trans->paths + trans->sorted[i], |
2760 | trans->paths + trans->sorted[i + 1]) > 0) { | |
0423fb71 | 2761 | swap(trans->sorted[i], trans->sorted[i + 1]); |
67e0dd8f KO |
2762 | trans->paths[trans->sorted[i]].sorted_idx = i; |
2763 | trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; | |
0423fb71 KO |
2764 | swapped = true; |
2765 | } | |
2766 | } | |
2767 | ||
2768 | if (inc > 0) | |
2769 | --r; | |
2770 | else | |
2771 | l++; | |
2772 | inc = -inc; | |
2773 | } while (swapped); | |
2774 | ||
67e0dd8f | 2775 | trans->paths_sorted = true; |
068bcaa5 | 2776 | out: |
0423fb71 KO |
2777 | btree_trans_verify_sorted(trans); |
2778 | } | |
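
/*
 * The same cocktail shaker sort on a plain array, as an illustrative
 * sketch: each pass alternates direction and shrinks the unsorted window
 * from the end it just swept, so mostly-sorted input - the common case for
 * trans->sorted - finishes in very few passes:
 */
static void cocktail_sort_sketch(int *a, int n)
{
	int i, l = 0, r = n, inc = 1;
	bool swapped;

	do {
		swapped = false;

		for (i = inc > 0 ? l : r - 2;
		     i + 1 < r && i >= l;
		     i += inc)
			if (a[i] > a[i + 1]) {
				swap(a[i], a[i + 1]);
				swapped = true;
			}

		/* the element that bubbled to the swept end is now in place */
		if (inc > 0)
			--r;
		else
			l++;
		inc = -inc;
	} while (swapped);
}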
2779 | ||
67e0dd8f KO |
2780 | static inline void btree_path_list_remove(struct btree_trans *trans, |
2781 | struct btree_path *path) | |
0423fb71 KO |
2782 | { |
2783 | unsigned i; | |
2784 | ||
67e0dd8f | 2785 | EBUG_ON(path->sorted_idx >= trans->nr_sorted); |
0423fb71 KO |
2786 | #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS |
2787 | trans->nr_sorted--; | |
67e0dd8f KO |
2788 | memmove_u64s_down_small(trans->sorted + path->sorted_idx, |
2789 | trans->sorted + path->sorted_idx + 1, | |
2790 | DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); | |
0423fb71 | 2791 | #else |
67e0dd8f | 2792 | array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); |
0423fb71 | 2793 | #endif |
67e0dd8f KO |
2794 | for (i = path->sorted_idx; i < trans->nr_sorted; i++) |
2795 | trans->paths[trans->sorted[i]].sorted_idx = i; | |
0423fb71 | 2796 | |
67e0dd8f | 2797 | path->sorted_idx = U8_MAX; |
0423fb71 KO |
2798 | } |
2799 | ||
67e0dd8f KO |
2800 | static inline void btree_path_list_add(struct btree_trans *trans, |
2801 | struct btree_path *pos, | |
2802 | struct btree_path *path) | |
0423fb71 KO |
2803 | { |
2804 | unsigned i; | |
2805 | ||
068bcaa5 | 2806 | path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted; |
0423fb71 KO |
2807 | |
2808 | #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS | |
67e0dd8f KO |
2809 | memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, |
2810 | trans->sorted + path->sorted_idx, | |
2811 | DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); | |
0423fb71 | 2812 | trans->nr_sorted++; |
67e0dd8f | 2813 | trans->sorted[path->sorted_idx] = path->idx; |
0423fb71 | 2814 | #else |
67e0dd8f | 2815 | array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); |
0423fb71 KO |
2816 | #endif |
2817 | ||
67e0dd8f KO |
2818 | for (i = path->sorted_idx; i < trans->nr_sorted; i++) |
2819 | trans->paths[trans->sorted[i]].sorted_idx = i; | |
64bc0011 | 2820 | |
0423fb71 | 2821 | btree_trans_verify_sorted_refs(trans); |
7c26ecae KO |
2822 | } |
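
/*
 * Illustrative sketch of the invariant the two helpers above maintain:
 * sorted[] holds path indices in btree order and every path records its
 * own slot, so inserting (or removing) one entry renumbers everything
 * after it. Names here are hypothetical placeholders:
 */
static void sorted_insert_sketch(u8 *sorted, u8 *sorted_idx_of,
				 unsigned *nr, unsigned where, u8 path_idx)
{
	unsigned i;

	memmove(&sorted[where + 1], &sorted[where],
		(*nr - where) * sizeof(sorted[0]));
	sorted[where] = path_idx;
	(*nr)++;

	/* restore the back pointers for the shifted entries */
	for (i = where; i < *nr; i++)
		sorted_idx_of[sorted[i]] = i;
}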
2823 | ||
67e0dd8f | 2824 | void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) |
64bc0011 | 2825 | { |
67e0dd8f KO |
2826 | if (iter->path) |
2827 | bch2_path_put(trans, iter->path, | |
2828 | iter->flags & BTREE_ITER_INTENT); | |
1f2d9192 KO |
2829 | if (iter->update_path) |
2830 | bch2_path_put(trans, iter->update_path, | |
2831 | iter->flags & BTREE_ITER_INTENT); | |
f7b6ca23 KO |
2832 | if (iter->key_cache_path) |
2833 | bch2_path_put(trans, iter->key_cache_path, | |
2834 | iter->flags & BTREE_ITER_INTENT); | |
67e0dd8f | 2835 | iter->path = NULL; |
1f2d9192 | 2836 | iter->update_path = NULL; |
f7b6ca23 | 2837 | iter->key_cache_path = NULL; |
64bc0011 KO |
2838 | } |
2839 | ||
67e0dd8f KO |
2840 | static void __bch2_trans_iter_init(struct btree_trans *trans, |
2841 | struct btree_iter *iter, | |
2842 | enum btree_id btree_id, struct bpos pos, | |
2843 | unsigned locks_want, | |
2844 | unsigned depth, | |
2845 | unsigned flags) | |
1c6fdbd8 | 2846 | { |
e5af273f KO |
2847 | EBUG_ON(trans->restarted); |
2848 | ||
f21566f1 KO |
2849 | if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && |
2850 | btree_node_type_is_extents(btree_id)) | |
3dea728c | 2851 | flags |= BTREE_ITER_IS_EXTENTS; |
1c6fdbd8 | 2852 | |
a861c722 KO |
2853 | if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && |
2854 | !btree_type_has_snapshots(btree_id)) | |
e751c01a KO |
2855 | flags &= ~BTREE_ITER_ALL_SNAPSHOTS; |
2856 | ||
a861c722 KO |
2857 | if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && |
2858 | btree_type_has_snapshots(btree_id)) | |
2859 | flags |= BTREE_ITER_FILTER_SNAPSHOTS; | |
e751c01a | 2860 | |
5222a460 KO |
2861 | if (trans->journal_replay_not_finished) |
2862 | flags |= BTREE_ITER_WITH_JOURNAL; | |
2863 | ||
f7b6ca23 | 2864 | if (!btree_id_cached(trans->c, btree_id)) { |
7c8f6f98 | 2865 | flags &= ~BTREE_ITER_CACHED; |
f7b6ca23 KO |
2866 | flags &= ~BTREE_ITER_WITH_KEY_CACHE; |
2867 | } else if (!(flags & BTREE_ITER_CACHED)) | |
2868 | flags |= BTREE_ITER_WITH_KEY_CACHE; | |
7c8f6f98 | 2869 | |
67e0dd8f KO |
2870 | iter->trans = trans; |
2871 | iter->path = NULL; | |
1f2d9192 | 2872 | iter->update_path = NULL; |
f7b6ca23 | 2873 | iter->key_cache_path = NULL; |
67e0dd8f KO |
2874 | iter->btree_id = btree_id; |
2875 | iter->min_depth = depth; | |
f21566f1 KO |
2876 | iter->flags = flags; |
2877 | iter->snapshot = pos.snapshot; | |
67e0dd8f KO |
2878 | iter->pos = pos; |
2879 | iter->k.type = KEY_TYPE_deleted; | |
2880 | iter->k.p = pos; | |
2881 | iter->k.size = 0; | |
e751c01a | 2882 | |
f3e1f444 KO |
2883 | iter->path = bch2_path_get(trans, btree_id, iter->pos, |
2884 | locks_want, depth, flags); | |
1c6fdbd8 KO |
2885 | } |
2886 | ||
67e0dd8f KO |
2887 | void bch2_trans_iter_init(struct btree_trans *trans, |
2888 | struct btree_iter *iter, | |
2889 | unsigned btree_id, struct bpos pos, | |
2890 | unsigned flags) | |
424eb881 | 2891 | { |
67e0dd8f KO |
2892 | __bch2_trans_iter_init(trans, iter, btree_id, pos, |
2893 | 0, 0, flags); | |
424eb881 KO |
2894 | } |
2895 | ||
67e0dd8f KO |
2896 | void bch2_trans_node_iter_init(struct btree_trans *trans, |
2897 | struct btree_iter *iter, | |
2898 | enum btree_id btree_id, | |
2899 | struct bpos pos, | |
2900 | unsigned locks_want, | |
2901 | unsigned depth, | |
2902 | unsigned flags) | |
1c6fdbd8 | 2903 | { |
67e0dd8f KO |
2904 | __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth, |
2905 | BTREE_ITER_NOT_EXTENTS| | |
2906 | __BTREE_ITER_ALL_SNAPSHOTS| | |
2907 | BTREE_ITER_ALL_SNAPSHOTS| | |
2908 | flags); | |
2909 | BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); | |
2910 | BUG_ON(iter->path->level != depth); | |
2911 | BUG_ON(iter->min_depth != depth); | |
2912 | } | |
7c26ecae | 2913 | |
67e0dd8f KO |
2914 | void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) |
2915 | { | |
2916 | *dst = *src; | |
2917 | if (src->path) | |
2918 | __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); | |
1f2d9192 KO |
2919 | if (src->update_path) |
2920 | __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); | |
f7b6ca23 | 2921 | dst->key_cache_path = NULL; |
1c6fdbd8 KO |
2922 | } |
2923 | ||
73a117d2 | 2924 | void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) |
1c6fdbd8 | 2925 | { |
73a117d2 KO |
2926 | size_t new_top = trans->mem_top + size; |
2927 | void *p; | |
2928 | ||
2929 | if (new_top > trans->mem_bytes) { | |
1c6fdbd8 | 2930 | size_t old_bytes = trans->mem_bytes; |
73a117d2 | 2931 | size_t new_bytes = roundup_pow_of_two(new_top); |
e131b6aa KO |
2932 | void *new_mem; |
2933 | ||
2934 | WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); | |
2935 | ||
2936 | new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); | |
2937 | if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { | |
2938 | new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); | |
2939 | new_bytes = BTREE_TRANS_MEM_MAX; | |
2940 | kfree(trans->mem); | |
2941 | } | |
1c6fdbd8 KO |
2942 | |
2943 | if (!new_mem) | |
73a117d2 | 2944 | return ERR_PTR(-ENOMEM); |
1c6fdbd8 KO |
2945 | |
2946 | trans->mem = new_mem; | |
2947 | trans->mem_bytes = new_bytes; | |
2948 | ||
1c7a0adf | 2949 | if (old_bytes) { |
669f87a5 | 2950 | trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); |
e5af273f | 2951 | btree_trans_restart(trans); |
73a117d2 | 2952 | return ERR_PTR(-EINTR); |
1c7a0adf | 2953 | } |
1c6fdbd8 KO |
2954 | } |
2955 | ||
20bceecb | 2956 | p = trans->mem + trans->mem_top; |
1c6fdbd8 | 2957 | trans->mem_top += size; |
7ed158f2 | 2958 | memset(p, 0, size); |
20bceecb | 2959 | return p; |
1c6fdbd8 KO |
2960 | } |
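
/*
 * Minimal usage sketch, for illustration only: because growing the buffer
 * mid-transaction forces a restart, bch2_trans_kmalloc() fails with
 * ERR_PTR(-EINTR) rather than NULL, and callers must propagate that like
 * any other transaction restart. Copying a key into transaction memory is
 * a hypothetical but typical use:
 */
static struct bkey_i *copy_key_sketch(struct btree_trans *trans,
				      struct bkey_s_c k)
{
	struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));

	if (IS_ERR(n))
		return n;	/* often ERR_PTR(-EINTR): restart and retry */

	bkey_reassemble(n, k);
	return n;
}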
2961 | ||
d38494c4 | 2962 | /** |
955af634 | 2963 | * bch2_trans_begin() - reset a transaction after an interrupted attempt |
d38494c4 | 2964 | * @trans: transaction to reset |
d38494c4 DR |
2965 | * |
2966 | * While iterating over nodes or updating nodes, an attempt to lock a btree | |
2967 | * node may return EINTR when the trylock fails. When this occurs | |
955af634 | 2968 | * bch2_trans_begin() should be called and the transaction retried. |
d38494c4 | 2969 | */ |
955af634 | 2970 | void bch2_trans_begin(struct btree_trans *trans) |
1c6fdbd8 | 2971 | { |
67e0dd8f KO |
2972 | struct btree_insert_entry *i; |
2973 | struct btree_path *path; | |
a8e00bd4 | 2974 | |
67e0dd8f KO |
2975 | trans_for_each_update(trans, i) |
2976 | __btree_path_put(i->path, true); | |
1c6fdbd8 | 2977 | |
904823de | 2978 | memset(&trans->journal_res, 0, sizeof(trans->journal_res)); |
a49e9a05 | 2979 | trans->extra_journal_res = 0; |
5df4be3f | 2980 | trans->nr_updates = 0; |
163e885a | 2981 | trans->mem_top = 0; |
bf7b87a4 | 2982 | |
43d00243 | 2983 | trans->hooks = NULL; |
96e2aa1b KO |
2984 | trans->extra_journal_entries = NULL; |
2985 | trans->extra_journal_entry_u64s = 0; | |
2986 | ||
f21539a5 KO |
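	/*
	 * fs_usage_deltas is kept across restarts: only the span between the
	 * memset_start/memset_end markers is zeroed, so the allocation and
	 * its size field survive:
	 */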
2987 | if (trans->fs_usage_deltas) { |
2988 | trans->fs_usage_deltas->used = 0; | |
2989 | memset((void *) trans->fs_usage_deltas + | |
2990 | offsetof(struct replicas_delta_list, memset_start), 0, | |
2991 | (void *) &trans->fs_usage_deltas->memset_end - | |
2992 | (void *) &trans->fs_usage_deltas->memset_start); | |
2993 | } | |
2994 | ||
67e0dd8f | 2995 | trans_for_each_path(trans, path) { |
25a77231 KO |
2996 | path->should_be_locked = false; |
2997 | ||
67e0dd8f KO |
2998 | /* |
2999 | * XXX: we probably shouldn't be doing this if the transaction | |
3000 | * was restarted, but currently we still overflow transaction | |
3001 | * iterators if we do that | |
3002 | */ | |
3003 | if (!path->ref && !path->preserve) | |
3004 | __bch2_path_free(trans, path); | |
3005 | else | |
25a77231 | 3006 | path->preserve = false; |
67e0dd8f KO |
3007 | } |
3008 | ||
955af634 | 3009 | bch2_trans_cond_resched(trans); |
50dc0f69 | 3010 | |
955af634 | 3011 | if (trans->restarted) |
67e0dd8f | 3012 | bch2_btree_path_traverse_all(trans); |
e5af273f KO |
3013 | |
3014 | trans->restarted = false; | |
1c6fdbd8 KO |
3015 | } |
3016 | ||
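/*
 * Example of the retry pattern described above (a sketch; in-tree callers
 * normally use the bch2_trans_do()/lockrestart_do() helpers rather than open
 * coding this). example_one_attempt() is a hypothetical helper standing in
 * for the actual btree operation:
 */
static int __maybe_unused example_retry_loop(struct btree_trans *trans)
{
	int ret;

	do {
		bch2_trans_begin(trans);

		ret = example_one_attempt(trans);	/* hypothetical */
	} while (ret == -EINTR);

	return ret;
}
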
67e0dd8f | 3017 | static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) |
1a21bf98 | 3018 | { |
67e0dd8f | 3019 | size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; |
3eb26d01 | 3020 | size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; |
dbd1e825 | 3021 | void *p = NULL; |
1a21bf98 KO |
3022 | |
3023 | BUG_ON(trans->used_mempool); | |
3024 | ||
dbd1e825 | 3025 | #ifdef __KERNEL__ |
67e0dd8f | 3026 | p = this_cpu_xchg(c->btree_paths_bufs->path, NULL); |
dbd1e825 KO |
3027 | #endif |
3028 | if (!p) | |
67e0dd8f | 3029 | p = mempool_alloc(&c->btree_paths_pool, GFP_NOFS); |
1a21bf98 | 3030 | |
67e0dd8f | 3031 | trans->paths = p; p += paths_bytes; |
1a21bf98 | 3032 | trans->updates = p; p += updates_bytes; |
1a21bf98 KO |
3033 | } |
3034 | ||
669f87a5 KO |
3035 | void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, |
3036 | unsigned expected_nr_iters, | |
3037 | size_t expected_mem_bytes, | |
3038 | const char *fn) | |
c0ebe3e4 | 3039 | __acquires(&c->btree_trans_barrier) |
1c6fdbd8 | 3040 | { |
ae1ede58 | 3041 | memset(trans, 0, sizeof(*trans)); |
1c6fdbd8 | 3042 | trans->c = c; |
669f87a5 | 3043 | trans->fn = fn; |
5222a460 KO |
3044 | trans->journal_replay_not_finished = |
3045 | !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); | |
20bceecb | 3046 | |
67e0dd8f | 3047 | bch2_trans_alloc_paths(trans, c); |
20bceecb | 3048 | |
0b5c9f59 KO |
3049 | if (expected_mem_bytes) { |
3050 | expected_mem_bytes = roundup_pow_of_two(expected_mem_bytes); | |
3051 | trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); | |
e131b6aa KO |
3052 | |
3053 | if (unlikely(!trans->mem)) { |
3054 | trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); | |
3055 | trans->mem_bytes = BTREE_TRANS_MEM_MAX; | |
3056 | } else { | |
0b5c9f59 | 3057 | trans->mem_bytes = expected_mem_bytes; |
e131b6aa | 3058 | } |
0b5c9f59 | 3059 | } |
495aabed | 3060 | |
876c7af3 KO |
3061 | trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); |
3062 | ||
b84d42c3 KO |
3063 | if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { |
3064 | trans->pid = current->pid; | |
3065 | mutex_lock(&c->btree_trans_lock); | |
3066 | list_add(&trans->list, &c->btree_trans_list); | |
3067 | mutex_unlock(&c->btree_trans_lock); | |
3068 | } | |
1c6fdbd8 KO |
3069 | } |
3070 | ||
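/*
 * Example usage (a minimal sketch, assuming the bch2_trans_init() wrapper
 * macro that supplies __func__ for trans->fn): __bch2_trans_init() is not
 * called directly, and a transaction brackets all btree work:
 */
static void __maybe_unused example_trans_lifecycle(struct bch_fs *c)
{
	struct btree_trans trans;

	bch2_trans_init(&trans, c, 0, 0);

	/* ... init iterators, do updates, commit (retrying on -EINTR) ... */

	bch2_trans_exit(&trans);
}
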
67e0dd8f KO |
3071 | static void check_btree_paths_leaked(struct btree_trans *trans) |
3072 | { | |
3073 | #ifdef CONFIG_BCACHEFS_DEBUG | |
3074 | struct bch_fs *c = trans->c; | |
3075 | struct btree_path *path; | |
3076 | ||
3077 | trans_for_each_path(trans, path) | |
3078 | if (path->ref) | |
3079 | goto leaked; | |
3080 | return; | |
3081 | leaked: | |
669f87a5 | 3082 | bch_err(c, "btree paths leaked from %s!", trans->fn); |
67e0dd8f KO |
3083 | trans_for_each_path(trans, path) |
3084 | if (path->ref) | |
3085 | printk(KERN_ERR " btree %s %pS\n", | |
3086 | bch2_btree_ids[path->btree_id], | |
3087 | (void *) path->ip_allocated); | |
3088 | /* Be noisy about this: */ | |
3089 | bch2_fatal_error(c); | |
3090 | #endif | |
3091 | } | |
3092 | ||
9a796fdb | 3093 | void bch2_trans_exit(struct btree_trans *trans) |
c0ebe3e4 | 3094 | __releases(&c->btree_trans_barrier) |
1c6fdbd8 | 3095 | { |
67e0dd8f | 3096 | struct btree_insert_entry *i; |
1a21bf98 KO |
3097 | struct bch_fs *c = trans->c; |
3098 | ||
ece254b2 | 3099 | bch2_trans_unlock(trans); |
1c6fdbd8 | 3100 | |
67e0dd8f KO |
3101 | trans_for_each_update(trans, i) |
3102 | __btree_path_put(i->path, true); | |
3103 | trans->nr_updates = 0; | |
509d3e0a | 3104 | |
67e0dd8f | 3105 | check_btree_paths_leaked(trans); |
50dc0f69 | 3106 | |
b84d42c3 KO |
3107 | if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { |
3108 | mutex_lock(&c->btree_trans_lock); | |
3109 | list_del(&trans->list); | |
3110 | mutex_unlock(&c->btree_trans_lock); | |
3111 | } | |
495aabed | 3112 | |
876c7af3 KO |
3113 | srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); |
3114 | ||
67e0dd8f | 3115 | bch2_journal_preres_put(&c->journal, &trans->journal_preres); |
2ca88e5a | 3116 | |
9620c3ec KO |
3117 | if (trans->fs_usage_deltas) { |
3118 | if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == | |
3119 | REPLICAS_DELTA_LIST_MAX) | |
3120 | mempool_free(trans->fs_usage_deltas, | |
67e0dd8f | 3121 | &c->replicas_delta_pool); |
9620c3ec KO |
3122 | else |
3123 | kfree(trans->fs_usage_deltas); | |
3124 | } | |
e131b6aa KO |
3125 | |
3126 | if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) | |
67e0dd8f | 3127 | mempool_free(trans->mem, &c->btree_trans_mem_pool); |
e131b6aa KO |
3128 | else |
3129 | kfree(trans->mem); | |
1a21bf98 | 3130 | |
dbd1e825 KO |
3131 | #ifdef __KERNEL__ |
3132 | /* | |
3133 | * Userspace doesn't have a real percpu implementation: | |
3134 | */ | |
67e0dd8f | 3135 | trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); |
dbd1e825 | 3136 | #endif |
e131b6aa | 3137 | |
67e0dd8f KO |
3138 | if (trans->paths) |
3139 | mempool_free(trans->paths, &c->btree_paths_pool); | |
1a21bf98 | 3140 | |
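	/* poison these pointers so any use after exit fails fast: */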
1c6fdbd8 | 3141 | trans->mem = (void *) 0x1; |
67e0dd8f | 3142 | trans->paths = (void *) 0x1; |
1c6fdbd8 | 3143 | } |
36e9d698 | 3144 | |
b3d1e6ca | 3145 | static void __maybe_unused |
67e0dd8f | 3146 | bch2_btree_path_node_to_text(struct printbuf *out, |
b3d1e6ca | 3147 | struct btree_bkey_cached_common *_b, |
f21566f1 | 3148 | bool cached) |
d211b408 | 3149 | { |
5c1d808a KO |
3150 | pr_buf(out, " l=%u %s:", |
3151 | _b->level, bch2_btree_ids[_b->btree_id]); | |
f21566f1 | 3152 | bch2_bpos_to_text(out, btree_node_pos(_b, cached)); |
d211b408 KO |
3153 | } |
3154 | ||
b84d42c3 | 3155 | #ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS |
f21566f1 | 3156 | static bool trans_has_locks(struct btree_trans *trans) |
5c1d808a | 3157 | { |
67e0dd8f | 3158 | struct btree_path *path; |
5c1d808a | 3159 | |
67e0dd8f KO |
3160 | trans_for_each_path(trans, path) |
3161 | if (path->nodes_locked) | |
5c1d808a KO |
3162 | return true; |
3163 | return false; | |
3164 | } | |
2d587674 | 3165 | #endif |
5c1d808a | 3166 | |
495aabed KO |
3167 | void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) |
3168 | { | |
b84d42c3 | 3169 | #ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS |
495aabed | 3170 | struct btree_trans *trans; |
67e0dd8f | 3171 | struct btree_path *path; |
495aabed | 3172 | struct btree *b; |
c7ce2732 | 3173 | static const char lock_types[] = { 'r', 'i', 'w' }; |
495aabed KO |
3174 | unsigned l; |
3175 | ||
3176 | mutex_lock(&c->btree_trans_lock); | |
3177 | list_for_each_entry(trans, &c->btree_trans_list, list) { | |
f21566f1 | 3178 | if (!trans_has_locks(trans)) |
5c1d808a KO |
3179 | continue; |
3180 | ||
669f87a5 | 3181 | pr_buf(out, "%i %s\n", trans->pid, trans->fn); |
495aabed | 3182 | |
67e0dd8f KO |
3183 | trans_for_each_path(trans, path) { |
3184 | if (!path->nodes_locked) | |
495aabed KO |
3185 | continue; |
3186 | ||
1d3ecd7e | 3187 | pr_buf(out, " path %u %c l=%u %s:", |
67e0dd8f KO |
3188 | path->idx, |
3189 | path->cached ? 'c' : 'b', | |
1d3ecd7e | 3190 | path->level, |
67e0dd8f KO |
3191 | bch2_btree_ids[path->btree_id]); |
3192 | bch2_bpos_to_text(out, path->pos); | |
495aabed KO |
3193 | pr_buf(out, "\n"); |
3194 | ||
3195 | for (l = 0; l < BTREE_MAX_DEPTH; l++) { | |
67e0dd8f | 3196 | if (btree_node_locked(path, l)) { |
d211b408 | 3197 | pr_buf(out, " %s l=%u ", |
67e0dd8f KO |
3198 | btree_node_intent_locked(path, l) ? "i" : "r", l); |
3199 | bch2_btree_path_node_to_text(out, | |
3200 | (void *) path->l[l].b, | |
3201 | path->cached); | |
495aabed KO |
3202 | pr_buf(out, "\n"); |
3203 | } | |
3204 | } | |
3205 | } | |
3206 | ||
3207 | b = READ_ONCE(trans->locking); | |
3208 | if (b) { | |
67e0dd8f | 3209 | path = &trans->paths[trans->locking_path_idx]; |
c7ce2732 | 3210 | pr_buf(out, " locking path %u %c l=%u %c %s:", |
67e0dd8f KO |
3211 | trans->locking_path_idx, |
3212 | path->cached ? 'c' : 'b', | |
515282ac | 3213 | trans->locking_level, |
c7ce2732 | 3214 | lock_types[trans->locking_lock_type], |
515282ac KO |
3215 | bch2_btree_ids[trans->locking_btree_id]); |
3216 | bch2_bpos_to_text(out, trans->locking_pos); | |
3217 | ||
d211b408 | 3218 | pr_buf(out, " node "); |
67e0dd8f KO |
3219 | bch2_btree_path_node_to_text(out, |
3220 | (void *) b, path->cached); | |
495aabed KO |
3221 | pr_buf(out, "\n"); |
3222 | } | |
3223 | } | |
3224 | mutex_unlock(&c->btree_trans_lock); | |
3225 | #endif | |
3226 | } | |
3227 | ||
36e9d698 KO |
3228 | void bch2_fs_btree_iter_exit(struct bch_fs *c) |
3229 | { | |
99fafb04 KO |
3230 | if (c->btree_trans_barrier_initialized) |
3231 | cleanup_srcu_struct(&c->btree_trans_barrier); | |
e131b6aa | 3232 | mempool_exit(&c->btree_trans_mem_pool); |
67e0dd8f | 3233 | mempool_exit(&c->btree_paths_pool); |
36e9d698 KO |
3234 | } |
3235 | ||
3236 | int bch2_fs_btree_iter_init(struct bch_fs *c) | |
3237 | { | |
3238 | unsigned nr = BTREE_ITER_MAX; | |
99fafb04 | 3239 | int ret; |
36e9d698 | 3240 | |
495aabed KO |
3241 | INIT_LIST_HEAD(&c->btree_trans_list); |
3242 | mutex_init(&c->btree_trans_lock); | |
3243 | ||
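	/*
	 * the ?: chain short circuits: each init runs only if the previous
	 * one returned 0, and ret holds the first error:
	 */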
99fafb04 | 3244 | ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, |
67e0dd8f | 3245 | sizeof(struct btree_path) * nr + |
e131b6aa KO |
3246 | sizeof(struct btree_insert_entry) * nr) ?: |
3247 | mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, | |
99fafb04 KO |
3248 | BTREE_TRANS_MEM_MAX) ?: |
3249 | init_srcu_struct(&c->btree_trans_barrier); | |
3250 | if (!ret) | |
3251 | c->btree_trans_barrier_initialized = true; | |
3252 | return ret; | |
36e9d698 | 3253 | } |