Commit | Line | Data |
---|---|---|
1c6fdbd8 KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include "bcachefs.h" | |
4 | #include "bkey_methods.h" | |
07a1006a | 5 | #include "bkey_buf.h" |
1c6fdbd8 KO |
6 | #include "btree_cache.h" |
7 | #include "btree_iter.h" | |
2ca88e5a | 8 | #include "btree_key_cache.h" |
1c6fdbd8 | 9 | #include "btree_locking.h" |
57b0b3db | 10 | #include "btree_update.h" |
1c6fdbd8 | 11 | #include "debug.h" |
50dc0f69 | 12 | #include "error.h" |
1c6fdbd8 | 13 | #include "extents.h" |
2ca88e5a | 14 | #include "journal.h" |
5222a460 | 15 | #include "recovery.h" |
35d5aff2 | 16 | #include "replicas.h" |
c075ff70 | 17 | #include "subvolume.h" |
1c6fdbd8 KO |
18 | #include "trace.h" |
19 | ||
a0cb8d78 | 20 | #include <linux/random.h> |
1c6fdbd8 KO |
21 | #include <linux/prefetch.h> |
22 | ||
67e0dd8f KO |
23 | static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); |
24 | static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, | |
25 | struct btree_path *); | |
26 | ||
27 | static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); | |
28 | ||
b0d1b70a KO |
29 | /* |
30 | * Unlocks before scheduling | |
31 | * Note: does not revalidate iterator | |
32 | */ | |
33 | static inline int bch2_trans_cond_resched(struct btree_trans *trans) | |
34 | { | |
35 | if (need_resched() || race_fault()) { | |
36 | bch2_trans_unlock(trans); | |
37 | schedule(); | |
549d173c | 38 | return bch2_trans_relock(trans); |
b0d1b70a KO |
39 | } else { |
40 | return 0; | |
41 | } | |
42 | } | |
43 | ||
67e0dd8f KO |
44 | static inline int __btree_path_cmp(const struct btree_path *l, |
45 | enum btree_id r_btree_id, | |
46 | bool r_cached, | |
47 | struct bpos r_pos, | |
48 | unsigned r_level) | |
0423fb71 | 49 | { |
7abda8c1 KO |
50 | /* |
51 | * Must match lock ordering as defined by __bch2_btree_node_lock: | |
52 | */ | |
67e0dd8f | 53 | return cmp_int(l->btree_id, r_btree_id) ?: |
32b26e8c | 54 | cmp_int((int) l->cached, (int) r_cached) ?: |
67e0dd8f KO |
55 | bpos_cmp(l->pos, r_pos) ?: |
56 | -cmp_int(l->level, r_level); | |
57 | } | |
58 | ||
59 | static inline int btree_path_cmp(const struct btree_path *l, | |
60 | const struct btree_path *r) | |
61 | { | |
62 | return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); | |
0423fb71 KO |
63 | } |
64 | ||
e751c01a KO |
65 | static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) |
66 | { | |
e751c01a KO |
67 | /* Are we iterating over keys in all snapshots? */ |
68 | if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { | |
69 | p = bpos_successor(p); | |
70 | } else { | |
71 | p = bpos_nosnap_successor(p); | |
72 | p.snapshot = iter->snapshot; | |
73 | } | |
74 | ||
75 | return p; | |
76 | } | |
77 | ||
78 | static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) | |
79 | { | |
e751c01a KO |
80 | /* Are we iterating over keys in all snapshots? */ |
81 | if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { | |
82 | p = bpos_predecessor(p); | |
83 | } else { | |
84 | p = bpos_nosnap_predecessor(p); | |
85 | p.snapshot = iter->snapshot; | |
86 | } | |
87 | ||
88 | return p; | |
89 | } | |
90 | ||
9626aeb1 | 91 | static inline struct bpos btree_iter_search_key(struct btree_iter *iter) |
a00fd8c5 | 92 | { |
9626aeb1 | 93 | struct bpos pos = iter->pos; |
cbdf24ce | 94 | |
9626aeb1 KO |
95 | if ((iter->flags & BTREE_ITER_IS_EXTENTS) && |
96 | bkey_cmp(pos, POS_MAX)) | |
e751c01a | 97 | pos = bkey_successor(iter, pos); |
9626aeb1 | 98 | return pos; |
a00fd8c5 KO |
99 | } |
100 | ||
67e0dd8f | 101 | static inline bool btree_path_pos_before_node(struct btree_path *path, |
e3ecf4f5 KO |
102 | struct btree *b) |
103 | { | |
67e0dd8f | 104 | return bpos_cmp(path->pos, b->data->min_key) < 0; |
e3ecf4f5 KO |
105 | } |
106 | ||
67e0dd8f | 107 | static inline bool btree_path_pos_after_node(struct btree_path *path, |
e3ecf4f5 KO |
108 | struct btree *b) |
109 | { | |
67e0dd8f | 110 | return bpos_cmp(b->key.k.p, path->pos) < 0; |
e3ecf4f5 KO |
111 | } |
112 | ||
67e0dd8f | 113 | static inline bool btree_path_pos_in_node(struct btree_path *path, |
e3ecf4f5 KO |
114 | struct btree *b) |
115 | { | |
67e0dd8f KO |
116 | return path->btree_id == b->c.btree_id && |
117 | !btree_path_pos_before_node(path, b) && | |
118 | !btree_path_pos_after_node(path, b); | |
e3ecf4f5 KO |
119 | } |
120 | ||
1c6fdbd8 KO |
121 | /* Btree iterator: */ |
122 | ||
123 | #ifdef CONFIG_BCACHEFS_DEBUG | |
124 | ||
67e0dd8f KO |
125 | static void bch2_btree_path_verify_cached(struct btree_trans *trans, |
126 | struct btree_path *path) | |
d211b408 KO |
127 | { |
128 | struct bkey_cached *ck; | |
67e0dd8f | 129 | bool locked = btree_node_locked(path, 0); |
d211b408 | 130 | |
67e0dd8f | 131 | if (!bch2_btree_node_relock(trans, path, 0)) |
d211b408 KO |
132 | return; |
133 | ||
67e0dd8f KO |
134 | ck = (void *) path->l[0].b; |
135 | BUG_ON(ck->key.btree_id != path->btree_id || | |
136 | bkey_cmp(ck->key.pos, path->pos)); | |
d211b408 KO |
137 | |
138 | if (!locked) | |
8bfe14e8 | 139 | btree_node_unlock(trans, path, 0); |
d211b408 KO |
140 | } |
141 | ||
67e0dd8f KO |
142 | static void bch2_btree_path_verify_level(struct btree_trans *trans, |
143 | struct btree_path *path, unsigned level) | |
1c6fdbd8 | 144 | { |
67e0dd8f | 145 | struct btree_path_level *l; |
4ce41957 KO |
146 | struct btree_node_iter tmp; |
147 | bool locked; | |
2dac0eae | 148 | struct bkey_packed *p, *k; |
fa8e94fa KO |
149 | struct printbuf buf1 = PRINTBUF; |
150 | struct printbuf buf2 = PRINTBUF; | |
151 | struct printbuf buf3 = PRINTBUF; | |
2dac0eae | 152 | const char *msg; |
1c6fdbd8 | 153 | |
29364f34 | 154 | if (!bch2_debug_check_iterators) |
f13f5a8c KO |
155 | return; |
156 | ||
67e0dd8f | 157 | l = &path->l[level]; |
4ce41957 | 158 | tmp = l->iter; |
67e0dd8f | 159 | locked = btree_node_locked(path, level); |
4ce41957 | 160 | |
67e0dd8f | 161 | if (path->cached) { |
d211b408 | 162 | if (!level) |
67e0dd8f | 163 | bch2_btree_path_verify_cached(trans, path); |
d211b408 KO |
164 | return; |
165 | } | |
166 | ||
67e0dd8f | 167 | if (!btree_path_node(path, level)) |
2dac0eae KO |
168 | return; |
169 | ||
e9174370 | 170 | if (!bch2_btree_node_relock_notrace(trans, path, level)) |
271a3d3a KO |
171 | return; |
172 | ||
67e0dd8f | 173 | BUG_ON(!btree_path_pos_in_node(path, l->b)); |
2dac0eae | 174 | |
2dac0eae | 175 | bch2_btree_node_iter_verify(&l->iter, l->b); |
1c6fdbd8 KO |
176 | |
177 | /* | |
1ae29c1f | 178 | * For interior nodes, the iterator will have skipped past deleted keys: |
1c6fdbd8 | 179 | */ |
1ae29c1f | 180 | p = level |
c052cf82 | 181 | ? bch2_btree_node_iter_prev(&tmp, l->b) |
2dac0eae KO |
182 | : bch2_btree_node_iter_prev_all(&tmp, l->b); |
183 | k = bch2_btree_node_iter_peek_all(&l->iter, l->b); | |
1c6fdbd8 | 184 | |
67e0dd8f | 185 | if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { |
2dac0eae KO |
186 | msg = "before"; |
187 | goto err; | |
1c6fdbd8 KO |
188 | } |
189 | ||
67e0dd8f | 190 | if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { |
2dac0eae KO |
191 | msg = "after"; |
192 | goto err; | |
193 | } | |
f21566f1 | 194 | |
2dac0eae | 195 | if (!locked) |
8bfe14e8 | 196 | btree_node_unlock(trans, path, level); |
2dac0eae KO |
197 | return; |
198 | err: | |
fa8e94fa | 199 | bch2_bpos_to_text(&buf1, path->pos); |
2dac0eae KO |
200 | |
201 | if (p) { | |
202 | struct bkey uk = bkey_unpack_key(l->b, p); | |
fa8e94fa KO |
203 | bch2_bkey_to_text(&buf2, &uk); |
204 | } else { | |
401ec4db | 205 | prt_printf(&buf2, "(none)"); |
2dac0eae | 206 | } |
1c6fdbd8 | 207 | |
2dac0eae KO |
208 | if (k) { |
209 | struct bkey uk = bkey_unpack_key(l->b, k); | |
fa8e94fa KO |
210 | bch2_bkey_to_text(&buf3, &uk); |
211 | } else { | |
401ec4db | 212 | prt_printf(&buf3, "(none)"); |
1c6fdbd8 | 213 | } |
2dac0eae | 214 | |
67e0dd8f KO |
215 | panic("path should be %s key at level %u:\n" |
216 | "path pos %s\n" | |
2dac0eae KO |
217 | "prev key %s\n" |
218 | "cur key %s\n", | |
fa8e94fa | 219 | msg, level, buf1.buf, buf2.buf, buf3.buf); |
1c6fdbd8 KO |
220 | } |
221 | ||
67e0dd8f KO |
222 | static void bch2_btree_path_verify(struct btree_trans *trans, |
223 | struct btree_path *path) | |
1c6fdbd8 | 224 | { |
5aab6635 | 225 | struct bch_fs *c = trans->c; |
2dac0eae | 226 | unsigned i; |
1c6fdbd8 | 227 | |
67e0dd8f KO |
228 | EBUG_ON(path->btree_id >= BTREE_ID_NR); |
229 | ||
230 | for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { | |
231 | if (!path->l[i].b) { | |
d7407292 KO |
232 | BUG_ON(!path->cached && |
233 | c->btree_roots[path->btree_id].b->c.level > i); | |
67e0dd8f KO |
234 | break; |
235 | } | |
236 | ||
237 | bch2_btree_path_verify_level(trans, path, i); | |
238 | } | |
239 | ||
240 | bch2_btree_path_verify_locks(path); | |
241 | } | |
242 | ||
243 | void bch2_trans_verify_paths(struct btree_trans *trans) | |
244 | { | |
245 | struct btree_path *path; | |
246 | ||
67e0dd8f KO |
247 | trans_for_each_path(trans, path) |
248 | bch2_btree_path_verify(trans, path); | |
249 | } | |
250 | ||
251 | static void bch2_btree_iter_verify(struct btree_iter *iter) | |
252 | { | |
253 | struct btree_trans *trans = iter->trans; | |
254 | ||
255 | BUG_ON(iter->btree_id >= BTREE_ID_NR); | |
256 | ||
257 | BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); | |
7e1a3aa9 | 258 | |
e751c01a KO |
259 | BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && |
260 | (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); | |
261 | ||
f21566f1 | 262 | BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && |
e751c01a KO |
263 | (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && |
264 | !btree_type_has_snapshots(iter->btree_id)); | |
265 | ||
1f2d9192 KO |
266 | if (iter->update_path) |
267 | bch2_btree_path_verify(trans, iter->update_path); | |
67e0dd8f | 268 | bch2_btree_path_verify(trans, iter->path); |
2dac0eae KO |
269 | } |
270 | ||
7e1a3aa9 KO |
271 | static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) |
272 | { | |
a861c722 KO |
273 | BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && |
274 | !iter->pos.snapshot); | |
275 | ||
e751c01a KO |
276 | BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && |
277 | iter->pos.snapshot != iter->snapshot); | |
278 | ||
f21566f1 KO |
279 | BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || |
280 | bkey_cmp(iter->pos, iter->k.p) > 0); | |
7e1a3aa9 KO |
281 | } |
282 | ||
c075ff70 KO |
283 | static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) |
284 | { | |
285 | struct btree_trans *trans = iter->trans; | |
286 | struct btree_iter copy; | |
287 | struct bkey_s_c prev; | |
288 | int ret = 0; | |
289 | ||
290 | if (!bch2_debug_check_iterators) | |
291 | return 0; | |
292 | ||
293 | if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) | |
294 | return 0; | |
295 | ||
296 | if (bkey_err(k) || !k.k) | |
297 | return 0; | |
298 | ||
299 | BUG_ON(!bch2_snapshot_is_ancestor(trans->c, | |
300 | iter->snapshot, | |
301 | k.k->p.snapshot)); | |
302 | ||
303 | bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, | |
ffa7d262 | 304 | BTREE_ITER_NOPRESERVE| |
c075ff70 KO |
305 | BTREE_ITER_ALL_SNAPSHOTS); |
306 | prev = bch2_btree_iter_prev(©); | |
307 | if (!prev.k) | |
308 | goto out; | |
309 | ||
310 | ret = bkey_err(prev); | |
311 | if (ret) | |
312 | goto out; | |
313 | ||
314 | if (!bkey_cmp(prev.k->p, k.k->p) && | |
315 | bch2_snapshot_is_ancestor(trans->c, iter->snapshot, | |
316 | prev.k->p.snapshot) > 0) { | |
fa8e94fa | 317 | struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; |
c075ff70 | 318 | |
fa8e94fa KO |
319 | bch2_bkey_to_text(&buf1, k.k); |
320 | bch2_bkey_to_text(&buf2, prev.k); | |
c075ff70 KO |
321 | |
322 | panic("iter snap %u\n" | |
323 | "k %s\n" | |
324 | "prev %s\n", | |
325 | iter->snapshot, | |
fa8e94fa | 326 | buf1.buf, buf2.buf); |
c075ff70 KO |
327 | } |
328 | out: | |
329 | bch2_trans_iter_exit(trans, ©); | |
330 | return ret; | |
331 | } | |
332 | ||
32b26e8c KO |
333 | void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, |
334 | struct bpos pos, bool key_cache) | |
335 | { | |
336 | struct btree_path *path; | |
337 | unsigned idx; | |
fa8e94fa | 338 | struct printbuf buf = PRINTBUF; |
32b26e8c | 339 | |
5c0bb66a KO |
340 | btree_trans_sort_paths(trans); |
341 | ||
32b26e8c KO |
342 | trans_for_each_path_inorder(trans, path, idx) { |
343 | int cmp = cmp_int(path->btree_id, id) ?: | |
344 | cmp_int(path->cached, key_cache); | |
345 | ||
346 | if (cmp > 0) | |
347 | break; | |
348 | if (cmp < 0) | |
349 | continue; | |
350 | ||
d4263e56 | 351 | if (!btree_node_locked(path, 0) || |
32b26e8c KO |
352 | !path->should_be_locked) |
353 | continue; | |
354 | ||
355 | if (!key_cache) { | |
356 | if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 && | |
357 | bkey_cmp(pos, path->l[0].b->key.k.p) <= 0) | |
358 | return; | |
359 | } else { | |
360 | if (!bkey_cmp(pos, path->pos)) | |
361 | return; | |
362 | } | |
363 | } | |
364 | ||
365 | bch2_dump_trans_paths_updates(trans); | |
fa8e94fa KO |
366 | bch2_bpos_to_text(&buf, pos); |
367 | ||
32b26e8c | 368 | panic("not locked: %s %s%s\n", |
fa8e94fa | 369 | bch2_btree_ids[id], buf.buf, |
32b26e8c KO |
370 | key_cache ? " cached" : ""); |
371 | } | |
372 | ||
271a3d3a KO |
373 | #else |
374 | ||
67e0dd8f KO |
375 | static inline void bch2_btree_path_verify_level(struct btree_trans *trans, |
376 | struct btree_path *path, unsigned l) {} | |
377 | static inline void bch2_btree_path_verify(struct btree_trans *trans, | |
378 | struct btree_path *path) {} | |
7d6f9b64 | 379 | static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} |
7e1a3aa9 | 380 | static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} |
c075ff70 | 381 | static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } |
271a3d3a | 382 | |
1c6fdbd8 KO |
383 | #endif |
384 | ||
67e0dd8f KO |
385 | /* Btree path: fixups after btree updates */ |
386 | ||
23bbd2bb KO |
387 | static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, |
388 | struct btree *b, | |
389 | struct bset_tree *t, | |
390 | struct bkey_packed *k) | |
391 | { | |
392 | struct btree_node_iter_set *set; | |
393 | ||
394 | btree_node_iter_for_each(iter, set) | |
395 | if (set->end == t->end_offset) { | |
396 | set->k = __btree_node_key_to_offset(b, k); | |
397 | bch2_btree_node_iter_sort(iter, b); | |
398 | return; | |
399 | } | |
400 | ||
401 | bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); | |
402 | } | |
403 | ||
67e0dd8f | 404 | static void __bch2_btree_path_fix_key_modified(struct btree_path *path, |
9626aeb1 KO |
405 | struct btree *b, |
406 | struct bkey_packed *where) | |
887c2a4e | 407 | { |
67e0dd8f | 408 | struct btree_path_level *l = &path->l[b->c.level]; |
887c2a4e | 409 | |
9626aeb1 KO |
410 | if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) |
411 | return; | |
412 | ||
67e0dd8f | 413 | if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) |
9626aeb1 | 414 | bch2_btree_node_iter_advance(&l->iter, l->b); |
887c2a4e KO |
415 | } |
416 | ||
67e0dd8f | 417 | void bch2_btree_path_fix_key_modified(struct btree_trans *trans, |
887c2a4e KO |
418 | struct btree *b, |
419 | struct bkey_packed *where) | |
420 | { | |
67e0dd8f | 421 | struct btree_path *path; |
887c2a4e | 422 | |
67e0dd8f KO |
423 | trans_for_each_path_with_node(trans, b, path) { |
424 | __bch2_btree_path_fix_key_modified(path, b, where); | |
425 | bch2_btree_path_verify_level(trans, path, b->c.level); | |
887c2a4e KO |
426 | } |
427 | } | |
428 | ||
67e0dd8f KO |
429 | static void __bch2_btree_node_iter_fix(struct btree_path *path, |
430 | struct btree *b, | |
431 | struct btree_node_iter *node_iter, | |
432 | struct bset_tree *t, | |
433 | struct bkey_packed *where, | |
434 | unsigned clobber_u64s, | |
435 | unsigned new_u64s) | |
1c6fdbd8 KO |
436 | { |
437 | const struct bkey_packed *end = btree_bkey_last(b, t); | |
438 | struct btree_node_iter_set *set; | |
439 | unsigned offset = __btree_node_key_to_offset(b, where); | |
440 | int shift = new_u64s - clobber_u64s; | |
271a3d3a | 441 | unsigned old_end = t->end_offset - shift; |
c0fc30da KO |
442 | unsigned orig_iter_pos = node_iter->data[0].k; |
443 | bool iter_current_key_modified = | |
444 | orig_iter_pos >= offset && | |
445 | orig_iter_pos <= offset + clobber_u64s; | |
1c6fdbd8 KO |
446 | |
447 | btree_node_iter_for_each(node_iter, set) | |
448 | if (set->end == old_end) | |
449 | goto found; | |
450 | ||
451 | /* didn't find the bset in the iterator - might have to readd it: */ | |
452 | if (new_u64s && | |
67e0dd8f | 453 | bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { |
1c6fdbd8 | 454 | bch2_btree_node_iter_push(node_iter, b, where, end); |
c0fc30da KO |
455 | goto fixup_done; |
456 | } else { | |
457 | /* Iterator is after key that changed */ | |
059e4134 | 458 | return; |
1c6fdbd8 | 459 | } |
1c6fdbd8 | 460 | found: |
271a3d3a | 461 | set->end = t->end_offset; |
1c6fdbd8 KO |
462 | |
463 | /* Iterator hasn't gotten to the key that changed yet: */ | |
464 | if (set->k < offset) | |
059e4134 | 465 | return; |
1c6fdbd8 KO |
466 | |
467 | if (new_u64s && | |
67e0dd8f | 468 | bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { |
1c6fdbd8 KO |
469 | set->k = offset; |
470 | } else if (set->k < offset + clobber_u64s) { | |
471 | set->k = offset + new_u64s; | |
472 | if (set->k == set->end) | |
473 | bch2_btree_node_iter_set_drop(node_iter, set); | |
474 | } else { | |
c0fc30da | 475 | /* Iterator is after key that changed */ |
1c6fdbd8 | 476 | set->k = (int) set->k + shift; |
059e4134 | 477 | return; |
1c6fdbd8 KO |
478 | } |
479 | ||
1c6fdbd8 | 480 | bch2_btree_node_iter_sort(node_iter, b); |
c0fc30da KO |
481 | fixup_done: |
482 | if (node_iter->data[0].k != orig_iter_pos) | |
483 | iter_current_key_modified = true; | |
56e0e7c7 | 484 | |
1c6fdbd8 | 485 | /* |
23bbd2bb KO |
486 | * When a new key is added, and the node iterator now points to that |
487 | * key, the iterator might have skipped past deleted keys that should | |
488 | * come after the key the iterator now points to. We have to rewind to | |
c0fc30da KO |
489 | * before those deleted keys - otherwise |
490 | * bch2_btree_node_iter_prev_all() breaks: | |
1c6fdbd8 | 491 | */ |
23bbd2bb | 492 | if (!bch2_btree_node_iter_end(node_iter) && |
c0fc30da | 493 | iter_current_key_modified && |
1ae29c1f | 494 | b->c.level) { |
23bbd2bb KO |
495 | struct bset_tree *t; |
496 | struct bkey_packed *k, *k2, *p; | |
497 | ||
498 | k = bch2_btree_node_iter_peek_all(node_iter, b); | |
1c6fdbd8 KO |
499 | |
500 | for_each_bset(b, t) { | |
23bbd2bb KO |
501 | bool set_pos = false; |
502 | ||
503 | if (node_iter->data[0].end == t->end_offset) | |
1c6fdbd8 KO |
504 | continue; |
505 | ||
23bbd2bb KO |
506 | k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); |
507 | ||
508 | while ((p = bch2_bkey_prev_all(b, t, k2)) && | |
509 | bkey_iter_cmp(b, k, p) < 0) { | |
510 | k2 = p; | |
511 | set_pos = true; | |
1c6fdbd8 | 512 | } |
23bbd2bb KO |
513 | |
514 | if (set_pos) | |
515 | btree_node_iter_set_set_pos(node_iter, | |
516 | b, t, k2); | |
1c6fdbd8 KO |
517 | } |
518 | } | |
519 | } | |
520 | ||
9f6bd307 | 521 | void bch2_btree_node_iter_fix(struct btree_trans *trans, |
67e0dd8f | 522 | struct btree_path *path, |
216c9fac KO |
523 | struct btree *b, |
524 | struct btree_node_iter *node_iter, | |
525 | struct bkey_packed *where, | |
526 | unsigned clobber_u64s, | |
527 | unsigned new_u64s) | |
1c6fdbd8 | 528 | { |
216c9fac | 529 | struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); |
67e0dd8f | 530 | struct btree_path *linked; |
1c6fdbd8 | 531 | |
67e0dd8f KO |
532 | if (node_iter != &path->l[b->c.level].iter) { |
533 | __bch2_btree_node_iter_fix(path, b, node_iter, t, | |
059e4134 | 534 | where, clobber_u64s, new_u64s); |
2dac0eae | 535 | |
29364f34 | 536 | if (bch2_debug_check_iterators) |
2dac0eae | 537 | bch2_btree_node_iter_verify(node_iter, b); |
059e4134 | 538 | } |
1c6fdbd8 | 539 | |
67e0dd8f | 540 | trans_for_each_path_with_node(trans, b, linked) { |
1c6fdbd8 | 541 | __bch2_btree_node_iter_fix(linked, b, |
059e4134 KO |
542 | &linked->l[b->c.level].iter, t, |
543 | where, clobber_u64s, new_u64s); | |
67e0dd8f | 544 | bch2_btree_path_verify_level(trans, linked, b->c.level); |
059e4134 | 545 | } |
1c6fdbd8 KO |
546 | } |
547 | ||
67e0dd8f KO |
548 | /* Btree path level: pointer to a particular btree node and node iter */ |
549 | ||
550 | static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, | |
551 | struct btree_path_level *l, | |
1c6fdbd8 KO |
552 | struct bkey *u, |
553 | struct bkey_packed *k) | |
554 | { | |
1c6fdbd8 KO |
555 | if (unlikely(!k)) { |
556 | /* | |
557 | * signal to bch2_btree_iter_peek_slot() that we're currently at | |
558 | * a hole | |
559 | */ | |
26609b61 | 560 | u->type = KEY_TYPE_deleted; |
1c6fdbd8 KO |
561 | return bkey_s_c_null; |
562 | } | |
563 | ||
2ce8fbd9 | 564 | return bkey_disassemble(l->b, k, u); |
1c6fdbd8 KO |
565 | } |
566 | ||
67e0dd8f KO |
567 | static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, |
568 | struct btree_path_level *l, | |
569 | struct bkey *u) | |
1c6fdbd8 | 570 | { |
67e0dd8f | 571 | return __btree_iter_unpack(c, l, u, |
1c6fdbd8 KO |
572 | bch2_btree_node_iter_peek_all(&l->iter, l->b)); |
573 | } | |
574 | ||
67e0dd8f KO |
575 | static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, |
576 | struct btree_path *path, | |
577 | struct btree_path_level *l, | |
578 | struct bkey *u) | |
1c6fdbd8 | 579 | { |
67e0dd8f | 580 | struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, |
1c6fdbd8 | 581 | bch2_btree_node_iter_peek(&l->iter, l->b)); |
ca58cbd4 | 582 | |
67e0dd8f KO |
583 | path->pos = k.k ? k.k->p : l->b->key.k.p; |
584 | trans->paths_sorted = false; | |
01eed771 | 585 | bch2_btree_path_verify_level(trans, path, l - path->l); |
ca58cbd4 | 586 | return k; |
1c6fdbd8 KO |
587 | } |
588 | ||
67e0dd8f KO |
589 | static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, |
590 | struct btree_path *path, | |
591 | struct btree_path_level *l, | |
592 | struct bkey *u) | |
ccf5a109 | 593 | { |
67e0dd8f | 594 | struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, |
ccf5a109 | 595 | bch2_btree_node_iter_prev(&l->iter, l->b)); |
ca58cbd4 | 596 | |
67e0dd8f KO |
597 | path->pos = k.k ? k.k->p : l->b->data->min_key; |
598 | trans->paths_sorted = false; | |
01eed771 | 599 | bch2_btree_path_verify_level(trans, path, l - path->l); |
ca58cbd4 | 600 | return k; |
ccf5a109 KO |
601 | } |
602 | ||
67e0dd8f KO |
603 | static inline bool btree_path_advance_to_pos(struct btree_path *path, |
604 | struct btree_path_level *l, | |
a00fd8c5 | 605 | int max_advance) |
1c6fdbd8 | 606 | { |
a00fd8c5 KO |
607 | struct bkey_packed *k; |
608 | int nr_advanced = 0; | |
609 | ||
610 | while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && | |
67e0dd8f | 611 | bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { |
a00fd8c5 KO |
612 | if (max_advance > 0 && nr_advanced >= max_advance) |
613 | return false; | |
614 | ||
615 | bch2_btree_node_iter_advance(&l->iter, l->b); | |
616 | nr_advanced++; | |
617 | } | |
618 | ||
619 | return true; | |
1c6fdbd8 KO |
620 | } |
621 | ||
67e0dd8f | 622 | static inline void __btree_path_level_init(struct btree_path *path, |
78cf784e | 623 | unsigned level) |
1c6fdbd8 | 624 | { |
67e0dd8f | 625 | struct btree_path_level *l = &path->l[level]; |
a00fd8c5 | 626 | |
67e0dd8f | 627 | bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); |
1c6fdbd8 | 628 | |
537c49d6 KO |
629 | /* |
630 | * Iterators to interior nodes should always be pointed at the first non | |
631 | * whiteout: | |
632 | */ | |
633 | if (level) | |
634 | bch2_btree_node_iter_peek(&l->iter, l->b); | |
1c6fdbd8 KO |
635 | } |
636 | ||
e4215d0f KO |
637 | inline void bch2_btree_path_level_init(struct btree_trans *trans, |
638 | struct btree_path *path, | |
639 | struct btree *b) | |
1c6fdbd8 | 640 | { |
67e0dd8f | 641 | BUG_ON(path->cached); |
2ca88e5a | 642 | |
67e0dd8f | 643 | EBUG_ON(!btree_path_pos_in_node(path, b)); |
c43a6ef9 | 644 | EBUG_ON(b->c.lock.state.seq & 1); |
1c6fdbd8 | 645 | |
67e0dd8f KO |
646 | path->l[b->c.level].lock_seq = b->c.lock.state.seq; |
647 | path->l[b->c.level].b = b; | |
648 | __btree_path_level_init(path, b->c.level); | |
1c6fdbd8 KO |
649 | } |
650 | ||
67e0dd8f KO |
651 | /* Btree path: fixups after btree node updates: */ |
652 | ||
1c6fdbd8 KO |
653 | /* |
654 | * A btree node is being replaced - update the iterator to point to the new | |
655 | * node: | |
656 | */ | |
f7a966a3 | 657 | void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) |
1c6fdbd8 | 658 | { |
67e0dd8f | 659 | struct btree_path *path; |
1c6fdbd8 | 660 | |
67e0dd8f | 661 | trans_for_each_path(trans, path) |
38474c26 KO |
662 | if (path->uptodate == BTREE_ITER_UPTODATE && |
663 | !path->cached && | |
67e0dd8f | 664 | btree_path_pos_in_node(path, b)) { |
1d3ecd7e KO |
665 | enum btree_node_locked_type t = |
666 | btree_lock_want(path, b->c.level); | |
1c6fdbd8 | 667 | |
38474c26 | 668 | if (t != BTREE_NODE_UNLOCKED) { |
8bfe14e8 | 669 | btree_node_unlock(trans, path, b->c.level); |
c43a6ef9 | 670 | six_lock_increment(&b->c.lock, (enum six_lock_type) t); |
c807ca95 | 671 | mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); |
1c6fdbd8 KO |
672 | } |
673 | ||
e4215d0f | 674 | bch2_btree_path_level_init(trans, path, b); |
1c6fdbd8 | 675 | } |
1c6fdbd8 KO |
676 | } |
677 | ||
1c6fdbd8 KO |
678 | /* |
679 | * A btree node has been modified in such a way as to invalidate iterators - fix | |
680 | * them: | |
681 | */ | |
f7a966a3 | 682 | void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) |
1c6fdbd8 | 683 | { |
67e0dd8f | 684 | struct btree_path *path; |
1c6fdbd8 | 685 | |
67e0dd8f KO |
686 | trans_for_each_path_with_node(trans, b, path) |
687 | __btree_path_level_init(path, b->c.level); | |
1c6fdbd8 KO |
688 | } |
689 | ||
67e0dd8f KO |
690 | /* Btree path: traverse, set_pos: */ |
691 | ||
67e0dd8f KO |
692 | static inline int btree_path_lock_root(struct btree_trans *trans, |
693 | struct btree_path *path, | |
a301dc38 KO |
694 | unsigned depth_want, |
695 | unsigned long trace_ip) | |
1c6fdbd8 | 696 | { |
e5af273f | 697 | struct bch_fs *c = trans->c; |
67e0dd8f | 698 | struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; |
1c6fdbd8 KO |
699 | enum six_lock_type lock_type; |
700 | unsigned i; | |
549d173c | 701 | int ret; |
1c6fdbd8 | 702 | |
67e0dd8f | 703 | EBUG_ON(path->nodes_locked); |
1c6fdbd8 KO |
704 | |
705 | while (1) { | |
bd2bb273 | 706 | b = READ_ONCE(*rootp); |
67e0dd8f | 707 | path->level = READ_ONCE(b->c.level); |
1c6fdbd8 | 708 | |
67e0dd8f | 709 | if (unlikely(path->level < depth_want)) { |
1c6fdbd8 KO |
710 | /* |
711 | * the root is at a lower depth than the depth we want: | |
712 | * got to the end of the btree, or we're walking nodes | |
713 | * greater than some depth and there are no nodes >= | |
714 | * that depth | |
715 | */ | |
67e0dd8f KO |
716 | path->level = depth_want; |
717 | for (i = path->level; i < BTREE_MAX_DEPTH; i++) | |
718 | path->l[i].b = NULL; | |
8812600c | 719 | return 1; |
1c6fdbd8 KO |
720 | } |
721 | ||
67e0dd8f | 722 | lock_type = __btree_lock_want(path, path->level); |
0d7009d7 KO |
723 | ret = btree_node_lock(trans, path, &b->c, |
724 | path->level, lock_type, trace_ip); | |
549d173c KO |
725 | if (unlikely(ret)) { |
726 | if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) | |
727 | continue; | |
728 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) | |
729 | return ret; | |
730 | BUG(); | |
e5af273f | 731 | } |
1c6fdbd8 | 732 | |
bd2bb273 | 733 | if (likely(b == READ_ONCE(*rootp) && |
67e0dd8f | 734 | b->c.level == path->level && |
1c6fdbd8 | 735 | !race_fault())) { |
67e0dd8f | 736 | for (i = 0; i < path->level; i++) |
315c9ba6 | 737 | path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root); |
67e0dd8f KO |
738 | path->l[path->level].b = b; |
739 | for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) | |
740 | path->l[i].b = NULL; | |
741 | ||
c807ca95 | 742 | mark_btree_node_locked(trans, path, path->level, lock_type); |
e4215d0f | 743 | bch2_btree_path_level_init(trans, path, b); |
1c6fdbd8 | 744 | return 0; |
1c6fdbd8 KO |
745 | } |
746 | ||
c43a6ef9 | 747 | six_unlock_type(&b->c.lock, lock_type); |
1c6fdbd8 KO |
748 | } |
749 | } | |
750 | ||
751 | noinline | |
67e0dd8f | 752 | static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) |
1c6fdbd8 | 753 | { |
9f6bd307 | 754 | struct bch_fs *c = trans->c; |
67e0dd8f | 755 | struct btree_path_level *l = path_l(path); |
1c6fdbd8 KO |
756 | struct btree_node_iter node_iter = l->iter; |
757 | struct bkey_packed *k; | |
07a1006a | 758 | struct bkey_buf tmp; |
9e5e5b9e | 759 | unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) |
67e0dd8f KO |
760 | ? (path->level > 1 ? 0 : 2) |
761 | : (path->level > 1 ? 1 : 16); | |
762 | bool was_locked = btree_node_locked(path, path->level); | |
8b3e9bd6 | 763 | int ret = 0; |
1c6fdbd8 | 764 | |
07a1006a KO |
765 | bch2_bkey_buf_init(&tmp); |
766 | ||
8b3e9bd6 | 767 | while (nr && !ret) { |
67e0dd8f | 768 | if (!bch2_btree_node_relock(trans, path, path->level)) |
07a1006a | 769 | break; |
1c6fdbd8 KO |
770 | |
771 | bch2_btree_node_iter_advance(&node_iter, l->b); | |
772 | k = bch2_btree_node_iter_peek(&node_iter, l->b); | |
773 | if (!k) | |
774 | break; | |
775 | ||
07a1006a | 776 | bch2_bkey_buf_unpack(&tmp, c, l->b, k); |
67e0dd8f KO |
777 | ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, |
778 | path->level - 1); | |
1c6fdbd8 KO |
779 | } |
780 | ||
781 | if (!was_locked) | |
8bfe14e8 | 782 | btree_node_unlock(trans, path, path->level); |
07a1006a KO |
783 | |
784 | bch2_bkey_buf_exit(&tmp, c); | |
8b3e9bd6 | 785 | return ret; |
1c6fdbd8 KO |
786 | } |
787 | ||
5222a460 KO |
788 | static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, |
789 | struct btree_and_journal_iter *jiter) | |
790 | { | |
791 | struct bch_fs *c = trans->c; | |
792 | struct bkey_s_c k; | |
793 | struct bkey_buf tmp; | |
794 | unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) | |
795 | ? (path->level > 1 ? 0 : 2) | |
796 | : (path->level > 1 ? 1 : 16); | |
797 | bool was_locked = btree_node_locked(path, path->level); | |
798 | int ret = 0; | |
799 | ||
800 | bch2_bkey_buf_init(&tmp); | |
801 | ||
802 | while (nr && !ret) { | |
803 | if (!bch2_btree_node_relock(trans, path, path->level)) | |
804 | break; | |
805 | ||
806 | bch2_btree_and_journal_iter_advance(jiter); | |
807 | k = bch2_btree_and_journal_iter_peek(jiter); | |
808 | if (!k.k) | |
809 | break; | |
810 | ||
811 | bch2_bkey_buf_reassemble(&tmp, c, k); | |
812 | ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, | |
813 | path->level - 1); | |
814 | } | |
815 | ||
816 | if (!was_locked) | |
8bfe14e8 | 817 | btree_node_unlock(trans, path, path->level); |
5222a460 KO |
818 | |
819 | bch2_bkey_buf_exit(&tmp, c); | |
820 | return ret; | |
821 | } | |
822 | ||
78cf784e | 823 | static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, |
67e0dd8f | 824 | struct btree_path *path, |
72141e1f KO |
825 | unsigned plevel, struct btree *b) |
826 | { | |
67e0dd8f KO |
827 | struct btree_path_level *l = &path->l[plevel]; |
828 | bool locked = btree_node_locked(path, plevel); | |
72141e1f KO |
829 | struct bkey_packed *k; |
830 | struct bch_btree_ptr_v2 *bp; | |
831 | ||
67e0dd8f | 832 | if (!bch2_btree_node_relock(trans, path, plevel)) |
72141e1f KO |
833 | return; |
834 | ||
835 | k = bch2_btree_node_iter_peek_all(&l->iter, l->b); | |
836 | BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); | |
837 | ||
838 | bp = (void *) bkeyp_val(&l->b->format, k); | |
839 | bp->mem_ptr = (unsigned long)b; | |
840 | ||
841 | if (!locked) | |
8bfe14e8 | 842 | btree_node_unlock(trans, path, plevel); |
72141e1f KO |
843 | } |
844 | ||
5222a460 KO |
845 | static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, |
846 | struct btree_path *path, | |
847 | unsigned flags, | |
848 | struct bkey_buf *out) | |
849 | { | |
850 | struct bch_fs *c = trans->c; | |
851 | struct btree_path_level *l = path_l(path); | |
852 | struct btree_and_journal_iter jiter; | |
853 | struct bkey_s_c k; | |
854 | int ret = 0; | |
855 | ||
856 | __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); | |
857 | ||
858 | k = bch2_btree_and_journal_iter_peek(&jiter); | |
859 | ||
860 | bch2_bkey_buf_reassemble(out, c, k); | |
861 | ||
862 | if (flags & BTREE_ITER_PREFETCH) | |
863 | ret = btree_path_prefetch_j(trans, path, &jiter); | |
864 | ||
865 | bch2_btree_and_journal_iter_exit(&jiter); | |
866 | return ret; | |
867 | } | |
868 | ||
67e0dd8f KO |
869 | static __always_inline int btree_path_down(struct btree_trans *trans, |
870 | struct btree_path *path, | |
871 | unsigned flags, | |
a301dc38 | 872 | unsigned long trace_ip) |
1c6fdbd8 | 873 | { |
6e075b54 | 874 | struct bch_fs *c = trans->c; |
67e0dd8f | 875 | struct btree_path_level *l = path_l(path); |
1c6fdbd8 | 876 | struct btree *b; |
67e0dd8f KO |
877 | unsigned level = path->level - 1; |
878 | enum six_lock_type lock_type = __btree_lock_want(path, level); | |
07a1006a KO |
879 | struct bkey_buf tmp; |
880 | int ret; | |
1c6fdbd8 | 881 | |
67e0dd8f | 882 | EBUG_ON(!btree_node_locked(path, path->level)); |
1c6fdbd8 | 883 | |
07a1006a | 884 | bch2_bkey_buf_init(&tmp); |
5222a460 KO |
885 | |
886 | if (unlikely(trans->journal_replay_not_finished)) { | |
887 | ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); | |
888 | if (ret) | |
889 | goto err; | |
890 | } else { | |
891 | bch2_bkey_buf_unpack(&tmp, c, l->b, | |
892 | bch2_btree_node_iter_peek(&l->iter, l->b)); | |
893 | ||
894 | if (flags & BTREE_ITER_PREFETCH) { | |
895 | ret = btree_path_prefetch(trans, path); | |
896 | if (ret) | |
897 | goto err; | |
898 | } | |
899 | } | |
1c6fdbd8 | 900 | |
67e0dd8f | 901 | b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); |
07a1006a KO |
902 | ret = PTR_ERR_OR_ZERO(b); |
903 | if (unlikely(ret)) | |
904 | goto err; | |
1c6fdbd8 | 905 | |
5222a460 KO |
906 | if (likely(!trans->journal_replay_not_finished && |
907 | tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && | |
07a1006a | 908 | unlikely(b != btree_node_mem_ptr(tmp.k))) |
67e0dd8f | 909 | btree_node_mem_ptr_set(trans, path, level + 1, b); |
72141e1f | 910 | |
67e0dd8f | 911 | if (btree_node_read_locked(path, level + 1)) |
8bfe14e8 | 912 | btree_node_unlock(trans, path, level + 1); |
223b560e KO |
913 | |
914 | mark_btree_node_locked(trans, path, level, lock_type); | |
67e0dd8f | 915 | path->level = level; |
e4215d0f | 916 | bch2_btree_path_level_init(trans, path, b); |
c205321b | 917 | |
67e0dd8f | 918 | bch2_btree_path_verify_locks(path); |
07a1006a KO |
919 | err: |
920 | bch2_bkey_buf_exit(&tmp, c); | |
921 | return ret; | |
1c6fdbd8 KO |
922 | } |
923 | ||
67e0dd8f KO |
924 | static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, |
925 | unsigned, unsigned long); | |
1c6fdbd8 | 926 | |
8f9ad91a | 927 | static int bch2_btree_path_traverse_all(struct btree_trans *trans) |
1c6fdbd8 | 928 | { |
e542029e | 929 | struct bch_fs *c = trans->c; |
0d7009d7 | 930 | struct btree_path *path; |
8f9ad91a KO |
931 | unsigned long trace_ip = _RET_IP_; |
932 | int i, ret = 0; | |
e542029e | 933 | |
2ca88e5a | 934 | if (trans->in_traverse_all) |
549d173c | 935 | return -BCH_ERR_transaction_restart_in_traverse_all; |
2ca88e5a KO |
936 | |
937 | trans->in_traverse_all = true; | |
938 | retry_all: | |
549d173c | 939 | trans->restarted = 0; |
e5af273f | 940 | |
67e0dd8f KO |
941 | trans_for_each_path(trans, path) |
942 | path->should_be_locked = false; | |
e542029e | 943 | |
67e0dd8f | 944 | btree_trans_sort_paths(trans); |
2527dd91 | 945 | |
58fbf808 | 946 | bch2_trans_unlock(trans); |
a301dc38 | 947 | cond_resched(); |
1c6fdbd8 | 948 | |
8f9ad91a | 949 | if (unlikely(trans->memory_allocation_failure)) { |
1c6fdbd8 KO |
950 | struct closure cl; |
951 | ||
952 | closure_init_stack(&cl); | |
953 | ||
954 | do { | |
955 | ret = bch2_btree_cache_cannibalize_lock(c, &cl); | |
956 | closure_sync(&cl); | |
957 | } while (ret); | |
958 | } | |
959 | ||
1c6fdbd8 | 960 | /* Now, redo traversals in correct order: */ |
0423fb71 KO |
961 | i = 0; |
962 | while (i < trans->nr_sorted) { | |
67e0dd8f | 963 | path = trans->paths + trans->sorted[i]; |
2ca88e5a | 964 | |
33aa419d KO |
965 | /* |
966 | * Traversing a path can cause another path to be added at about | |
967 | * the same position: | |
968 | */ | |
969 | if (path->uptodate) { | |
970 | ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); | |
549d173c KO |
971 | if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || |
972 | ret == -ENOMEM) | |
33aa419d | 973 | goto retry_all; |
a897ef68 KO |
974 | if (ret) |
975 | goto err; | |
33aa419d | 976 | } else { |
0423fb71 | 977 | i++; |
33aa419d | 978 | } |
e542029e | 979 | } |
0423fb71 KO |
980 | |
981 | /* | |
982 | * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock() | |
67e0dd8f | 983 | * and relock(), relock() won't relock since path->should_be_locked |
0423fb71 KO |
984 | * isn't set yet, which is all fine |
985 | */ | |
67e0dd8f KO |
986 | trans_for_each_path(trans, path) |
987 | BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); | |
a897ef68 | 988 | err: |
1c6fdbd8 | 989 | bch2_btree_cache_cannibalize_unlock(c); |
2ca88e5a | 990 | |
669f87a5 KO |
991 | trans->in_traverse_all = false; |
992 | ||
674cfc26 | 993 | trace_and_count(c, trans_traverse_all, trans, trace_ip); |
1c6fdbd8 | 994 | return ret; |
bf7b87a4 | 995 | } |
1c6fdbd8 | 996 | |
67e0dd8f KO |
997 | static inline bool btree_path_good_node(struct btree_trans *trans, |
998 | struct btree_path *path, | |
f4b61341 KO |
999 | unsigned l, int check_pos) |
1000 | { | |
67e0dd8f KO |
1001 | if (!is_btree_node(path, l) || |
1002 | !bch2_btree_node_relock(trans, path, l)) | |
f4b61341 KO |
1003 | return false; |
1004 | ||
67e0dd8f | 1005 | if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) |
f4b61341 | 1006 | return false; |
67e0dd8f | 1007 | if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) |
f4b61341 KO |
1008 | return false; |
1009 | return true; | |
1010 | } | |
1011 | ||
c4bce586 KO |
1012 | static void btree_path_set_level_down(struct btree_trans *trans, |
1013 | struct btree_path *path, | |
1014 | unsigned new_level) | |
1015 | { | |
1016 | unsigned l; | |
1017 | ||
1018 | path->level = new_level; | |
1019 | ||
1020 | for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) | |
1021 | if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) | |
8bfe14e8 | 1022 | btree_node_unlock(trans, path, l); |
c4bce586 KO |
1023 | |
1024 | btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); | |
1025 | bch2_btree_path_verify(trans, path); | |
1026 | } | |
1027 | ||
67e0dd8f KO |
1028 | static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, |
1029 | struct btree_path *path, | |
f4b61341 | 1030 | int check_pos) |
1c6fdbd8 | 1031 | { |
8ee0134e | 1032 | unsigned i, l = path->level; |
29cea6f4 | 1033 | again: |
67e0dd8f | 1034 | while (btree_path_node(path, l) && |
315c9ba6 KO |
1035 | !btree_path_good_node(trans, path, l, check_pos)) |
1036 | __btree_path_set_level_up(trans, path, l++); | |
1c6fdbd8 | 1037 | |
8ee0134e KO |
1038 | /* If we need intent locks, take them too: */ |
1039 | for (i = l + 1; | |
1040 | i < path->locks_want && btree_path_node(path, i); | |
1041 | i++) | |
29cea6f4 | 1042 | if (!bch2_btree_node_relock(trans, path, i)) { |
315c9ba6 KO |
1043 | while (l <= i) |
1044 | __btree_path_set_level_up(trans, path, l++); | |
29cea6f4 KO |
1045 | goto again; |
1046 | } | |
8ee0134e | 1047 | |
1c6fdbd8 KO |
1048 | return l; |
1049 | } | |
1050 | ||
1051 | /* | |
1052 | * This is the main state machine for walking down the btree - walks down to a | |
1053 | * specified depth | |
1054 | * | |
1055 | * Returns 0 on success, -EIO on error (error reading in a btree node). | |
1056 | * | |
1057 | * On error, caller (peek_node()/peek_key()) must return NULL; the error is | |
b7607ce9 | 1058 | * stashed in the iterator and returned from bch2_trans_exit(). |
1c6fdbd8 | 1059 | */ |
67e0dd8f KO |
1060 | static int btree_path_traverse_one(struct btree_trans *trans, |
1061 | struct btree_path *path, | |
1062 | unsigned flags, | |
a301dc38 | 1063 | unsigned long trace_ip) |
1c6fdbd8 | 1064 | { |
8ee0134e | 1065 | unsigned depth_want = path->level; |
549d173c | 1066 | int ret = trans->restarted; |
1c6fdbd8 | 1067 | |
549d173c | 1068 | if (unlikely(ret)) |
979735df | 1069 | goto out; |
979735df | 1070 | |
e829b717 | 1071 | /* |
67e0dd8f KO |
1072 | * Ensure we obey path->should_be_locked: if it's set, we can't unlock |
1073 | * and re-traverse the path without a transaction restart: | |
e829b717 | 1074 | */ |
67e0dd8f | 1075 | if (path->should_be_locked) { |
549d173c | 1076 | ret = bch2_btree_path_relock(trans, path, trace_ip); |
e829b717 KO |
1077 | goto out; |
1078 | } | |
1079 | ||
67e0dd8f KO |
1080 | if (path->cached) { |
1081 | ret = bch2_btree_path_traverse_cached(trans, path, flags); | |
531a0095 KO |
1082 | goto out; |
1083 | } | |
2ca88e5a | 1084 | |
67e0dd8f | 1085 | if (unlikely(path->level >= BTREE_MAX_DEPTH)) |
531a0095 | 1086 | goto out; |
2ca88e5a | 1087 | |
67e0dd8f | 1088 | path->level = btree_path_up_until_good_node(trans, path, 0); |
1c6fdbd8 | 1089 | |
845cffed KO |
1090 | EBUG_ON(btree_path_node(path, path->level) && |
1091 | !btree_node_locked(path, path->level)); | |
1092 | ||
1c6fdbd8 | 1093 | /* |
67e0dd8f | 1094 | * Note: path->nodes[path->level] may be temporarily NULL here - that |
1c6fdbd8 KO |
1095 | * would indicate to other code that we got to the end of the btree, |
1096 | * here it indicates that relocking the root failed - it's critical that | |
67e0dd8f | 1097 | * btree_path_lock_root() comes next and that it can't fail |
1c6fdbd8 | 1098 | */ |
67e0dd8f KO |
1099 | while (path->level > depth_want) { |
1100 | ret = btree_path_node(path, path->level) | |
1101 | ? btree_path_down(trans, path, flags, trace_ip) | |
1102 | : btree_path_lock_root(trans, path, depth_want, trace_ip); | |
1c6fdbd8 | 1103 | if (unlikely(ret)) { |
531a0095 KO |
1104 | if (ret == 1) { |
1105 | /* | |
f21566f1 KO |
1106 | * No nodes at this level - got to the end of |
1107 | * the btree: | |
531a0095 KO |
1108 | */ |
1109 | ret = 0; | |
1110 | goto out; | |
1111 | } | |
8812600c | 1112 | |
8bfe14e8 | 1113 | __bch2_btree_path_unlock(trans, path); |
67e0dd8f | 1114 | path->level = depth_want; |
315c9ba6 | 1115 | path->l[path->level].b = ERR_PTR(ret); |
531a0095 | 1116 | goto out; |
1c6fdbd8 KO |
1117 | } |
1118 | } | |
1119 | ||
67e0dd8f | 1120 | path->uptodate = BTREE_ITER_UPTODATE; |
531a0095 | 1121 | out: |
549d173c | 1122 | BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); |
67e0dd8f | 1123 | bch2_btree_path_verify(trans, path); |
531a0095 | 1124 | return ret; |
1c6fdbd8 KO |
1125 | } |
1126 | ||
67e0dd8f KO |
1127 | int __must_check bch2_btree_path_traverse(struct btree_trans *trans, |
1128 | struct btree_path *path, unsigned flags) | |
1c6fdbd8 | 1129 | { |
01eed771 | 1130 | if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { |
a0cb8d78 KO |
1131 | unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U); |
1132 | u64 max = ~(~0ULL << restart_probability_bits); | |
1133 | ||
1134 | if (!get_random_u32_below(max)) { | |
674cfc26 | 1135 | trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_); |
a0cb8d78 KO |
1136 | return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); |
1137 | } | |
1138 | } | |
1139 | ||
67e0dd8f KO |
1140 | if (path->uptodate < BTREE_ITER_NEED_RELOCK) |
1141 | return 0; | |
1142 | ||
e5fa91d7 | 1143 | return bch2_trans_cond_resched(trans) ?: |
67e0dd8f | 1144 | btree_path_traverse_one(trans, path, flags, _RET_IP_); |
1c6fdbd8 KO |
1145 | } |
1146 | ||
67e0dd8f KO |
1147 | static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, |
1148 | struct btree_path *src) | |
1149 | { | |
1150 | unsigned i, offset = offsetof(struct btree_path, pos); | |
1151 | ||
1152 | memcpy((void *) dst + offset, | |
1153 | (void *) src + offset, | |
1154 | sizeof(struct btree_path) - offset); | |
1155 | ||
1156 | for (i = 0; i < BTREE_MAX_DEPTH; i++) | |
1157 | if (btree_node_locked(dst, i)) | |
1158 | six_lock_increment(&dst->l[i].b->c.lock, | |
1159 | __btree_lock_want(dst, i)); | |
1160 | ||
1161 | trans->paths_sorted = false; | |
1162 | } | |
1163 | ||
c404f203 KO |
1164 | static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, |
1165 | bool intent) | |
67e0dd8f | 1166 | { |
c404f203 | 1167 | struct btree_path *new = btree_path_alloc(trans, src); |
67e0dd8f | 1168 | |
c404f203 | 1169 | btree_path_copy(trans, new, src); |
67e0dd8f | 1170 | __btree_path_get(new, intent); |
c404f203 KO |
1171 | return new; |
1172 | } | |
1173 | ||
d8648425 KO |
1174 | struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, |
1175 | struct btree_path *path, bool intent) | |
c404f203 | 1176 | { |
67e0dd8f | 1177 | __btree_path_put(path, intent); |
c404f203 | 1178 | path = btree_path_clone(trans, path, intent); |
67e0dd8f KO |
1179 | path->preserve = false; |
1180 | #ifdef CONFIG_BCACHEFS_DEBUG | |
1181 | path->ip_allocated = _RET_IP_; | |
1182 | #endif | |
d8648425 | 1183 | path->should_be_locked = false; |
67e0dd8f KO |
1184 | return path; |
1185 | } | |
1186 | ||
ce91abd6 | 1187 | struct btree_path * __must_check |
67e0dd8f KO |
1188 | __bch2_btree_path_set_pos(struct btree_trans *trans, |
1189 | struct btree_path *path, struct bpos new_pos, | |
1190 | bool intent, int cmp) | |
1191 | { | |
67e0dd8f KO |
1192 | unsigned l = path->level; |
1193 | ||
1194 | EBUG_ON(trans->restarted); | |
1195 | EBUG_ON(!path->ref); | |
1196 | ||
1197 | path = bch2_btree_path_make_mut(trans, path, intent); | |
1198 | ||
1199 | path->pos = new_pos; | |
67e0dd8f KO |
1200 | trans->paths_sorted = false; |
1201 | ||
1202 | if (unlikely(path->cached)) { | |
8bfe14e8 | 1203 | btree_node_unlock(trans, path, 0); |
315c9ba6 | 1204 | path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); |
67e0dd8f KO |
1205 | btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); |
1206 | goto out; | |
1207 | } | |
1208 | ||
1209 | l = btree_path_up_until_good_node(trans, path, cmp); | |
1210 | ||
1211 | if (btree_path_node(path, l)) { | |
7071878b | 1212 | BUG_ON(!btree_node_locked(path, l)); |
67e0dd8f KO |
1213 | /* |
1214 | * We might have to skip over many keys, or just a few: try | |
1215 | * advancing the node iterator, and if we have to skip over too | |
1216 | * many keys just reinit it (or if we're rewinding, since that | |
1217 | * is expensive). | |
1218 | */ | |
1219 | if (cmp < 0 || | |
1220 | !btree_path_advance_to_pos(path, &path->l[l], 8)) | |
1221 | __btree_path_level_init(path, l); | |
67e0dd8f KO |
1222 | } |
1223 | ||
1d3ecd7e | 1224 | if (l != path->level) { |
67e0dd8f | 1225 | btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); |
8bfe14e8 | 1226 | __bch2_btree_path_unlock(trans, path); |
1d3ecd7e | 1227 | } |
67e0dd8f KO |
1228 | out: |
1229 | bch2_btree_path_verify(trans, path); | |
67e0dd8f KO |
1230 | return path; |
1231 | } | |
1232 | ||
67e0dd8f KO |
1233 | /* Btree path: main interface: */ |
1234 | ||
1235 | static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) | |
1236 | { | |
a300261a | 1237 | struct btree_path *sib; |
67e0dd8f | 1238 | |
a300261a KO |
1239 | sib = prev_btree_path(trans, path); |
1240 | if (sib && !btree_path_cmp(sib, path)) | |
1241 | return sib; | |
67e0dd8f | 1242 | |
a300261a KO |
1243 | sib = next_btree_path(trans, path); |
1244 | if (sib && !btree_path_cmp(sib, path)) | |
1245 | return sib; | |
67e0dd8f KO |
1246 | |
1247 | return NULL; | |
1248 | } | |
1249 | ||
9a74f63c | 1250 | static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) |
67e0dd8f | 1251 | { |
a300261a | 1252 | struct btree_path *sib; |
67e0dd8f | 1253 | |
a300261a KO |
1254 | sib = prev_btree_path(trans, path); |
1255 | if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) | |
1256 | return sib; | |
67e0dd8f | 1257 | |
a300261a KO |
1258 | sib = next_btree_path(trans, path); |
1259 | if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) | |
1260 | return sib; | |
67e0dd8f | 1261 | |
9a74f63c | 1262 | return NULL; |
67e0dd8f KO |
1263 | } |
1264 | ||
1265 | static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) | |
08070cba | 1266 | { |
8bfe14e8 | 1267 | __bch2_btree_path_unlock(trans, path); |
67e0dd8f KO |
1268 | btree_path_list_remove(trans, path); |
1269 | trans->paths_allocated &= ~(1ULL << path->idx); | |
08070cba KO |
1270 | } |
1271 | ||
67e0dd8f KO |
1272 | void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) |
1273 | { | |
1274 | struct btree_path *dup; | |
1275 | ||
1276 | EBUG_ON(trans->paths + path->idx != path); | |
1277 | EBUG_ON(!path->ref); | |
1278 | ||
1279 | if (!__btree_path_put(path, intent)) | |
1280 | return; | |
1281 | ||
a300261a KO |
1282 | dup = path->preserve |
1283 | ? have_path_at_pos(trans, path) | |
1284 | : have_node_at_pos(trans, path); | |
1285 | ||
1286 | if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) | |
1287 | return; | |
67e0dd8f | 1288 | |
9a74f63c | 1289 | if (path->should_be_locked && |
a300261a KO |
1290 | !trans->restarted && |
1291 | (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) | |
9a74f63c KO |
1292 | return; |
1293 | ||
a300261a KO |
1294 | if (dup) { |
1295 | dup->preserve |= path->preserve; | |
1296 | dup->should_be_locked |= path->should_be_locked; | |
1297 | } | |
1298 | ||
9a74f63c | 1299 | __bch2_path_free(trans, path); |
67e0dd8f KO |
1300 | } |
1301 | ||
7dcbdbd8 KO |
1302 | static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path, |
1303 | bool intent) | |
1304 | { | |
1305 | EBUG_ON(trans->paths + path->idx != path); | |
1306 | EBUG_ON(!path->ref); | |
1307 | ||
1308 | if (!__btree_path_put(path, intent)) | |
1309 | return; | |
1310 | ||
1311 | __bch2_path_free(trans, path); | |
1312 | } | |
1313 | ||
5c0bb66a | 1314 | noinline __cold |
8570d775 | 1315 | void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) |
67e0dd8f | 1316 | { |
67e0dd8f | 1317 | struct btree_insert_entry *i; |
2158fe46 | 1318 | |
401ec4db | 1319 | prt_printf(buf, "transaction updates for %s journal seq %llu", |
8570d775 | 1320 | trans->fn, trans->journal_res.seq); |
401ec4db KO |
1321 | prt_newline(buf); |
1322 | printbuf_indent_add(buf, 2); | |
2158fe46 KO |
1323 | |
1324 | trans_for_each_update(trans, i) { | |
1325 | struct bkey_s_c old = { &i->old_k, i->old_v }; | |
1326 | ||
401ec4db | 1327 | prt_printf(buf, "update: btree=%s cached=%u %pS", |
2158fe46 | 1328 | bch2_btree_ids[i->btree_id], |
8570d775 KO |
1329 | i->cached, |
1330 | (void *) i->ip_allocated); | |
401ec4db | 1331 | prt_newline(buf); |
8570d775 | 1332 | |
401ec4db | 1333 | prt_printf(buf, " old "); |
8570d775 | 1334 | bch2_bkey_val_to_text(buf, trans->c, old); |
401ec4db | 1335 | prt_newline(buf); |
8570d775 | 1336 | |
401ec4db | 1337 | prt_printf(buf, " new "); |
8570d775 | 1338 | bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); |
401ec4db | 1339 | prt_newline(buf); |
2158fe46 KO |
1340 | } |
1341 | ||
401ec4db | 1342 | printbuf_indent_sub(buf, 2); |
8570d775 KO |
1343 | } |
1344 | ||
1345 | noinline __cold | |
1346 | void bch2_dump_trans_updates(struct btree_trans *trans) | |
1347 | { | |
1348 | struct printbuf buf = PRINTBUF; | |
1349 | ||
1350 | bch2_trans_updates_to_text(&buf, trans); | |
a8f35428 | 1351 | bch2_print_string_as_lines(KERN_ERR, buf.buf); |
8570d775 | 1352 | printbuf_exit(&buf); |
2158fe46 KO |
1353 | } |
1354 | ||
1355 | noinline __cold | |
5c0bb66a KO |
1356 | void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) |
1357 | { | |
1358 | prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", | |
1359 | path->idx, path->ref, path->intent_ref, | |
1360 | path->preserve ? 'P' : ' ', | |
1361 | path->should_be_locked ? 'S' : ' ', | |
1362 | bch2_btree_ids[path->btree_id], | |
1363 | path->level); | |
1364 | bch2_bpos_to_text(out, path->pos); | |
1365 | ||
1366 | prt_printf(out, " locks %u", path->nodes_locked); | |
1367 | #ifdef CONFIG_BCACHEFS_DEBUG | |
1368 | prt_printf(out, " %pS", (void *) path->ip_allocated); | |
1369 | #endif | |
1370 | prt_newline(out); | |
1371 | } | |
1372 | ||
1373 | noinline __cold | |
1374 | void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, | |
1375 | bool nosort) | |
2158fe46 KO |
1376 | { |
1377 | struct btree_path *path; | |
67e0dd8f | 1378 | unsigned idx; |
67e0dd8f | 1379 | |
5c0bb66a KO |
1380 | if (!nosort) |
1381 | btree_trans_sort_paths(trans); | |
67e0dd8f | 1382 | |
5c0bb66a KO |
1383 | trans_for_each_path_inorder(trans, path, idx) |
1384 | bch2_btree_path_to_text(out, path); | |
1385 | } | |
fa8e94fa | 1386 | |
5c0bb66a KO |
1387 | noinline __cold |
1388 | void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) | |
1389 | { | |
1390 | __bch2_trans_paths_to_text(out, trans, false); | |
1391 | } | |
fa8e94fa | 1392 | |
5c0bb66a KO |
1393 | noinline __cold |
1394 | void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) | |
1395 | { | |
1396 | struct printbuf buf = PRINTBUF; | |
1397 | ||
1398 | __bch2_trans_paths_to_text(&buf, trans, nosort); | |
a8f35428 | 1399 | bch2_trans_updates_to_text(&buf, trans); |
67e0dd8f | 1400 | |
a8f35428 | 1401 | bch2_print_string_as_lines(KERN_ERR, buf.buf); |
2158fe46 | 1402 | printbuf_exit(&buf); |
67e0dd8f KO |
1403 | } |
1404 | ||
5c0bb66a KO |
1405 | noinline __cold |
1406 | void bch2_dump_trans_paths_updates(struct btree_trans *trans) | |
1407 | { | |
1408 | __bch2_dump_trans_paths_updates(trans, false); | |
1409 | } | |
1410 | ||
1411 | noinline __cold | |
1412 | static void bch2_trans_update_max_paths(struct btree_trans *trans) | |
1413 | { | |
1414 | struct btree_transaction_stats *s = btree_trans_stats(trans); | |
1415 | struct printbuf buf = PRINTBUF; | |
1416 | ||
1417 | if (!s) | |
1418 | return; | |
1419 | ||
1420 | bch2_trans_paths_to_text(&buf, trans); | |
1421 | ||
1422 | if (!buf.allocation_failure) { | |
1423 | mutex_lock(&s->lock); | |
1424 | if (s->nr_max_paths < hweight64(trans->paths_allocated)) { | |
3f3bc66e KO |
1425 | s->nr_max_paths = trans->nr_max_paths = |
1426 | hweight64(trans->paths_allocated); | |
5c0bb66a KO |
1427 | swap(s->max_paths_text, buf.buf); |
1428 | } | |
1429 | mutex_unlock(&s->lock); | |
1430 | } | |
1431 | ||
1432 | printbuf_exit(&buf); | |
1433 | ||
1434 | trans->nr_max_paths = hweight64(trans->paths_allocated); | |
1435 | } | |
1436 | ||
3f3bc66e KO |
1437 | static noinline void btree_path_overflow(struct btree_trans *trans) |
1438 | { | |
1439 | bch2_dump_trans_paths_updates(trans); | |
1440 | panic("trans path oveflow\n"); | |
1441 | } | |
1442 | ||
1443 | static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, | |
1444 | struct btree_path *pos) | |
67e0dd8f KO |
1445 | { |
1446 | struct btree_path *path; | |
1447 | unsigned idx; | |
1448 | ||
1449 | if (unlikely(trans->paths_allocated == | |
3f3bc66e KO |
1450 | ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) |
1451 | btree_path_overflow(trans); | |
67e0dd8f KO |
1452 | |
1453 | idx = __ffs64(~trans->paths_allocated); | |
1454 | trans->paths_allocated |= 1ULL << idx; | |
1455 | ||
1456 | path = &trans->paths[idx]; | |
67e0dd8f KO |
1457 | path->idx = idx; |
1458 | path->ref = 0; | |
1459 | path->intent_ref = 0; | |
1460 | path->nodes_locked = 0; | |
67e0dd8f KO |
1461 | |
1462 | btree_path_list_add(trans, pos, path); | |
5c0bb66a KO |
1463 | trans->paths_sorted = false; |
1464 | ||
1465 | if (unlikely(idx > trans->nr_max_paths)) | |
1466 | bch2_trans_update_max_paths(trans); | |
67e0dd8f KO |
1467 | return path; |
1468 | } | |
1469 | ||
f3e1f444 | 1470 | struct btree_path *bch2_path_get(struct btree_trans *trans, |
67e0dd8f KO |
1471 | enum btree_id btree_id, struct bpos pos, |
1472 | unsigned locks_want, unsigned level, | |
f3e1f444 | 1473 | unsigned flags) |
67e0dd8f | 1474 | { |
807dda8c | 1475 | struct btree_path *path, *path_pos = NULL; |
f3e1f444 KO |
1476 | bool cached = flags & BTREE_ITER_CACHED; |
1477 | bool intent = flags & BTREE_ITER_INTENT; | |
67e0dd8f KO |
1478 | int i; |
1479 | ||
1480 | BUG_ON(trans->restarted); | |
7071878b KO |
1481 | bch2_trans_verify_locks(trans); |
1482 | ||
eb7bd15f | 1483 | btree_trans_sort_paths(trans); |
67e0dd8f | 1484 | |
807dda8c KO |
1485 | trans_for_each_path_inorder(trans, path, i) { |
1486 | if (__btree_path_cmp(path, | |
1487 | btree_id, | |
1488 | cached, | |
1489 | pos, | |
1490 | level) > 0) | |
1491 | break; | |
67e0dd8f | 1492 | |
807dda8c | 1493 | path_pos = path; |
67e0dd8f KO |
1494 | } |
1495 | ||
807dda8c KO |
1496 | if (path_pos && |
1497 | path_pos->cached == cached && | |
1498 | path_pos->btree_id == btree_id && | |
1499 | path_pos->level == level) { | |
1500 | __btree_path_get(path_pos, intent); | |
ce91abd6 | 1501 | path = bch2_btree_path_set_pos(trans, path_pos, pos, intent); |
67e0dd8f | 1502 | } else { |
807dda8c KO |
1503 | path = btree_path_alloc(trans, path_pos); |
1504 | path_pos = NULL; | |
67e0dd8f KO |
1505 | |
1506 | __btree_path_get(path, intent); | |
1507 | path->pos = pos; | |
1508 | path->btree_id = btree_id; | |
1509 | path->cached = cached; | |
67e0dd8f KO |
1510 | path->uptodate = BTREE_ITER_NEED_TRAVERSE; |
1511 | path->should_be_locked = false; | |
1512 | path->level = level; | |
1513 | path->locks_want = locks_want; | |
1514 | path->nodes_locked = 0; | |
67e0dd8f | 1515 | for (i = 0; i < ARRAY_SIZE(path->l); i++) |
315c9ba6 | 1516 | path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); |
67e0dd8f KO |
1517 | #ifdef CONFIG_BCACHEFS_DEBUG |
1518 | path->ip_allocated = _RET_IP_; | |
1519 | #endif | |
1520 | trans->paths_sorted = false; | |
1521 | } | |
1522 | ||
f3e1f444 KO |
1523 | if (!(flags & BTREE_ITER_NOPRESERVE)) |
1524 | path->preserve = true; | |
1525 | ||
67e0dd8f KO |
1526 | if (path->intent_ref) |
1527 | locks_want = max(locks_want, level + 1); | |
1528 | ||
1529 | /* | |
1530 | * If the path has locks_want greater than requested, we don't downgrade | |
1531 | * it here - on transaction restart because btree node split needs to | |
1532 | * upgrade locks, we might be putting/getting the iterator again. | |
1533 | * Downgrading iterators only happens via bch2_trans_downgrade(), after | |
1534 | * a successful transaction commit. | |
1535 | */ | |
1536 | ||
1537 | locks_want = min(locks_want, BTREE_MAX_DEPTH); | |
cd5afabe KO |
1538 | if (locks_want > path->locks_want) |
1539 | bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want); | |
67e0dd8f | 1540 | |
67e0dd8f KO |
1541 | return path; |
1542 | } | |
1543 | ||
1544 | inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) | |
1545 | { | |
1546 | ||
1547 | struct bkey_s_c k; | |
1548 | ||
67e0dd8f KO |
1549 | if (!path->cached) { |
1550 | struct btree_path_level *l = path_l(path); | |
f7b6ca23 KO |
1551 | struct bkey_packed *_k; |
1552 | ||
1553 | EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); | |
67e0dd8f | 1554 | |
f7b6ca23 | 1555 | _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); |
67e0dd8f KO |
1556 | k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; |
1557 | ||
1558 | EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); | |
1559 | ||
1560 | if (!k.k || bpos_cmp(path->pos, k.k->p)) | |
1561 | goto hole; | |
1562 | } else { | |
1563 | struct bkey_cached *ck = (void *) path->l[0].b; | |
1564 | ||
f7b6ca23 KO |
1565 | EBUG_ON(ck && |
1566 | (path->btree_id != ck->key.btree_id || | |
1567 | bkey_cmp(path->pos, ck->key.pos))); | |
6fae65c1 | 1568 | EBUG_ON(!ck || !ck->valid); |
f7b6ca23 | 1569 | EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); |
67e0dd8f | 1570 | |
2e63e180 | 1571 | *u = ck->k->k; |
67e0dd8f KO |
1572 | k = bkey_i_to_s_c(ck->k); |
1573 | } | |
1574 | ||
1575 | return k; | |
1576 | hole: | |
1577 | bkey_init(u); | |
1578 | u->p = path->pos; | |
1579 | return (struct bkey_s_c) { u, NULL }; | |
1580 | } | |
1581 | ||
1582 | /* Btree iterators: */ | |
1583 | ||
db92f2ea KO |
1584 | int __must_check |
1585 | __bch2_btree_iter_traverse(struct btree_iter *iter) | |
1586 | { | |
1587 | return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); | |
1588 | } | |
1589 | ||
08070cba KO |
1590 | int __must_check |
1591 | bch2_btree_iter_traverse(struct btree_iter *iter) | |
1592 | { | |
66a0a497 KO |
1593 | int ret; |
1594 | ||
ce91abd6 | 1595 | iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, |
67e0dd8f KO |
1596 | btree_iter_search_key(iter), |
1597 | iter->flags & BTREE_ITER_INTENT); | |
08070cba | 1598 | |
67e0dd8f | 1599 | ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); |
66a0a497 KO |
1600 | if (ret) |
1601 | return ret; | |
1602 | ||
fd211bc7 | 1603 | btree_path_set_should_be_locked(iter->path); |
66a0a497 | 1604 | return 0; |
08070cba KO |
1605 | } |
1606 | ||
1c6fdbd8 KO |
1607 | /* Iterate across nodes (leaf and interior nodes) */ |
1608 | ||
1609 | struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) | |
1610 | { | |
502027a8 | 1611 | struct btree_trans *trans = iter->trans; |
f21566f1 | 1612 | struct btree *b = NULL; |
1c6fdbd8 KO |
1613 | int ret; |
1614 | ||
67e0dd8f | 1615 | EBUG_ON(iter->path->cached); |
7e1a3aa9 | 1616 | bch2_btree_iter_verify(iter); |
1c6fdbd8 | 1617 | |
502027a8 | 1618 | ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); |
1c6fdbd8 | 1619 | if (ret) |
d355c6f4 | 1620 | goto err; |
1c6fdbd8 | 1621 | |
67e0dd8f | 1622 | b = btree_path_node(iter->path, iter->path->level); |
1c6fdbd8 | 1623 | if (!b) |
f21566f1 | 1624 | goto out; |
1c6fdbd8 | 1625 | |
4cf91b02 | 1626 | BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); |
1c6fdbd8 | 1627 | |
f21566f1 | 1628 | bkey_init(&iter->k); |
67e0dd8f | 1629 | iter->k.p = iter->pos = b->key.k.p; |
502027a8 | 1630 | |
ce91abd6 | 1631 | iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, |
502027a8 | 1632 | iter->flags & BTREE_ITER_INTENT); |
fd211bc7 | 1633 | btree_path_set_should_be_locked(iter->path); |
f21566f1 KO |
1634 | out: |
1635 | bch2_btree_iter_verify_entry_exit(iter); | |
1636 | bch2_btree_iter_verify(iter); | |
2dac0eae | 1637 | |
1c6fdbd8 | 1638 | return b; |
d355c6f4 KO |
1639 | err: |
1640 | b = ERR_PTR(ret); | |
1641 | goto out; | |
1c6fdbd8 KO |
1642 | } |
1643 | ||
2dac0eae | 1644 | struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) |
1c6fdbd8 | 1645 | { |
67e0dd8f KO |
1646 | struct btree_trans *trans = iter->trans; |
1647 | struct btree_path *path = iter->path; | |
f21566f1 | 1648 | struct btree *b = NULL; |
1c6fdbd8 KO |
1649 | int ret; |
1650 | ||
979735df | 1651 | BUG_ON(trans->restarted); |
67e0dd8f | 1652 | EBUG_ON(iter->path->cached); |
7e1a3aa9 | 1653 | bch2_btree_iter_verify(iter); |
1c6fdbd8 | 1654 | |
979735df | 1655 | /* already at end? */ |
67e0dd8f | 1656 | if (!btree_path_node(path, path->level)) |
979735df | 1657 | return NULL; |
1c6fdbd8 | 1658 | |
979735df KO |
1659 | /* got to end? */ |
1660 | if (!btree_path_node(path, path->level + 1)) { | |
8bfe14e8 | 1661 | btree_path_set_level_up(trans, path); |
979735df KO |
1662 | return NULL; |
1663 | } | |
1c6fdbd8 | 1664 | |
979735df | 1665 | if (!bch2_btree_node_relock(trans, path, path->level + 1)) { |
8bfe14e8 | 1666 | __bch2_btree_path_unlock(trans, path); |
315c9ba6 KO |
1667 | path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); |
1668 | path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); | |
7071878b | 1669 | btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); |
674cfc26 | 1670 | trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); |
549d173c | 1671 | ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); |
d355c6f4 | 1672 | goto err; |
979735df | 1673 | } |
1c6fdbd8 | 1674 | |
979735df | 1675 | b = btree_path_node(path, path->level + 1); |
1c6fdbd8 | 1676 | |
979735df | 1677 | if (!bpos_cmp(iter->pos, b->key.k.p)) { |
315c9ba6 | 1678 | __btree_path_set_level_up(trans, path, path->level++); |
979735df | 1679 | } else { |
1c6fdbd8 KO |
1680 | /* |
1681 | * Haven't gotten to the end of the parent node: go back down to | |
1682 | * the next child node | |
1683 | */ | |
67e0dd8f | 1684 | path = iter->path = |
ce91abd6 | 1685 | bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), |
67e0dd8f | 1686 | iter->flags & BTREE_ITER_INTENT); |
1c6fdbd8 | 1687 | |
c4bce586 | 1688 | btree_path_set_level_down(trans, path, iter->min_depth); |
345ca825 | 1689 | |
67e0dd8f | 1690 | ret = bch2_btree_path_traverse(trans, path, iter->flags); |
d355c6f4 KO |
1691 | if (ret) |
1692 | goto err; | |
1c6fdbd8 | 1693 | |
67e0dd8f | 1694 | b = path->l[path->level].b; |
1c6fdbd8 KO |
1695 | } |
1696 | ||
f21566f1 | 1697 | bkey_init(&iter->k); |
67e0dd8f | 1698 | iter->k.p = iter->pos = b->key.k.p; |
502027a8 | 1699 | |
ce91abd6 | 1700 | iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, |
502027a8 | 1701 | iter->flags & BTREE_ITER_INTENT); |
fd211bc7 | 1702 | btree_path_set_should_be_locked(iter->path); |
502027a8 | 1703 | BUG_ON(iter->path->uptodate); |
f21566f1 KO |
1704 | out: |
1705 | bch2_btree_iter_verify_entry_exit(iter); | |
1706 | bch2_btree_iter_verify(iter); | |
2dac0eae | 1707 | |
1c6fdbd8 | 1708 | return b; |
d355c6f4 KO |
1709 | err: |
1710 | b = ERR_PTR(ret); | |
1711 | goto out; | |
1c6fdbd8 KO |
1712 | } |
1713 | ||
1714 | /* Iterate across keys (in leaf nodes only) */ | |
1715 | ||
e0ba3b64 | 1716 | inline bool bch2_btree_iter_advance(struct btree_iter *iter) |
434094be | 1717 | { |
b0babf2a KO |
1718 | if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { |
1719 | struct bpos pos = iter->k.p; | |
1720 | bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS | |
1721 | ? bpos_cmp(pos, SPOS_MAX) | |
1722 | : bkey_cmp(pos, SPOS_MAX)) != 0; | |
1723 | ||
1724 | if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) | |
1725 | pos = bkey_successor(iter, pos); | |
1726 | bch2_btree_iter_set_pos(iter, pos); | |
1727 | return ret; | |
1728 | } else { | |
1729 | if (!btree_path_node(iter->path, iter->path->level)) | |
1730 | return true; | |
3d495595 | 1731 | |
b0babf2a KO |
1732 | iter->advanced = true; |
1733 | return false; | |
1734 | } | |
3d495595 KO |
1735 | } |
1736 | ||
e0ba3b64 | 1737 | inline bool bch2_btree_iter_rewind(struct btree_iter *iter) |
3d495595 KO |
1738 | { |
1739 | struct bpos pos = bkey_start_pos(&iter->k); | |
71f892a4 KO |
1740 | bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS |
1741 | ? bpos_cmp(pos, POS_MIN) | |
1742 | : bkey_cmp(pos, POS_MIN)) != 0; | |
3d495595 | 1743 | |
7e1a3aa9 | 1744 | if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) |
e751c01a | 1745 | pos = bkey_predecessor(iter, pos); |
3d495595 | 1746 | bch2_btree_iter_set_pos(iter, pos); |
7e1a3aa9 | 1747 | return ret; |
434094be KO |
1748 | } |
1749 | ||
3763cb95 | 1750 | static noinline |
67e0dd8f | 1751 | struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) |
818664f5 KO |
1752 | { |
1753 | struct btree_insert_entry *i; | |
5288e66a | 1754 | struct bkey_i *ret = NULL; |
818664f5 | 1755 | |
8e6bbc41 | 1756 | trans_for_each_update(iter->trans, i) { |
5288e66a KO |
1757 | if (i->btree_id < iter->btree_id) |
1758 | continue; | |
1759 | if (i->btree_id > iter->btree_id) | |
818664f5 | 1760 | break; |
67e0dd8f | 1761 | if (bpos_cmp(i->k->k.p, iter->path->pos) < 0) |
5288e66a | 1762 | continue; |
12ce5b7d KO |
1763 | if (i->key_cache_already_flushed) |
1764 | continue; | |
5288e66a KO |
1765 | if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0) |
1766 | ret = i->k; | |
1767 | } | |
818664f5 | 1768 | |
5288e66a KO |
1769 | return ret; |
1770 | } | |
1771 | ||
3763cb95 KO |
1772 | static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) |
1773 | { | |
1774 | return iter->flags & BTREE_ITER_WITH_UPDATES | |
1775 | ? __bch2_btree_trans_peek_updates(iter) | |
1776 | : NULL; | |
1777 | } | |
1778 | ||
30525f68 KO |
1779 | struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, |
1780 | struct btree_iter *iter, | |
1781 | struct bpos end_pos) | |
1782 | { | |
1783 | struct bkey_i *k; | |
1784 | ||
1785 | if (bpos_cmp(iter->path->pos, iter->journal_pos) < 0) | |
1786 | iter->journal_idx = 0; | |
1787 | ||
1788 | k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, | |
1789 | iter->path->level, | |
1790 | iter->path->pos, | |
1791 | end_pos, | |
1792 | &iter->journal_idx); | |
1793 | ||
1794 | iter->journal_pos = k ? k->k.p : end_pos; | |
1795 | return k; | |
1796 | } | |
1797 | ||
5222a460 KO |
1798 | static noinline |
1799 | struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, | |
1800 | struct btree_iter *iter) | |
1801 | { | |
30525f68 | 1802 | struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos); |
5222a460 | 1803 | |
5650bb46 | 1804 | if (k) { |
5222a460 KO |
1805 | iter->k = k->k; |
1806 | return bkey_i_to_s_c(k); | |
1807 | } else { | |
1808 | return bkey_s_c_null; | |
1809 | } | |
1810 | } | |
1811 | ||
1812 | static noinline | |
1813 | struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, | |
1814 | struct btree_iter *iter, | |
1815 | struct bkey_s_c k) | |
1816 | { | |
1817 | struct bkey_i *next_journal = | |
30525f68 | 1818 | bch2_btree_journal_peek(trans, iter, |
5650bb46 | 1819 | k.k ? k.k->p : iter->path->l[0].b->key.k.p); |
5222a460 | 1820 | |
5650bb46 | 1821 | if (next_journal) { |
5222a460 KO |
1822 | iter->k = next_journal->k; |
1823 | k = bkey_i_to_s_c(next_journal); | |
1824 | } | |
1825 | ||
1826 | return k; | |
1827 | } | |
1828 | ||
f7b6ca23 KO |
1829 | /* |
1830 | * Checks btree key cache for key at iter->pos and returns it if present, or | |
1831 | * bkey_s_c_null: | |
1832 | */ | |
1833 | static noinline | |
8192f8a5 | 1834 | struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) |
f7b6ca23 KO |
1835 | { |
1836 | struct btree_trans *trans = iter->trans; | |
1837 | struct bch_fs *c = trans->c; | |
1838 | struct bkey u; | |
1839 | int ret; | |
1840 | ||
1841 | if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) | |
1842 | return bkey_s_c_null; | |
1843 | ||
1844 | if (!iter->key_cache_path) | |
1845 | iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, | |
1846 | iter->flags & BTREE_ITER_INTENT, 0, | |
1847 | iter->flags|BTREE_ITER_CACHED); | |
1848 | ||
1849 | iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, | |
1850 | iter->flags & BTREE_ITER_INTENT); | |
1851 | ||
1852 | ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); | |
1853 | if (unlikely(ret)) | |
1854 | return bkey_s_c_err(ret); | |
1855 | ||
fd211bc7 | 1856 | btree_path_set_should_be_locked(iter->key_cache_path); |
f7b6ca23 KO |
1857 | |
1858 | return bch2_btree_path_peek_slot(iter->key_cache_path, &u); | |
1859 | } | |
1860 | ||
8192f8a5 KO |
1861 | static noinline |
1862 | struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) | |
1863 | { | |
1864 | struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos); | |
1865 | int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_); | |
1866 | ||
1867 | return err ? bkey_s_c_err(err) : ret; | |
1868 | } | |
1869 | ||
a1e82d35 | 1870 | static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) |
1c6fdbd8 | 1871 | { |
67e0dd8f | 1872 | struct btree_trans *trans = iter->trans; |
360746bf | 1873 | struct bkey_i *next_update; |
f7b6ca23 | 1874 | struct bkey_s_c k, k2; |
8e432d98 | 1875 | int ret; |
1c6fdbd8 | 1876 | |
969576ec | 1877 | EBUG_ON(iter->path->cached); |
7e1a3aa9 | 1878 | bch2_btree_iter_verify(iter); |
1c6fdbd8 | 1879 | |
1c6fdbd8 | 1880 | while (1) { |
969576ec KO |
1881 | struct btree_path_level *l; |
1882 | ||
ce91abd6 KO |
1883 | iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, |
1884 | iter->flags & BTREE_ITER_INTENT); | |
c8476a4e | 1885 | |
67e0dd8f | 1886 | ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); |
e6e024e9 KO |
1887 | if (unlikely(ret)) { |
1888 | /* ensure that iter->k is consistent with iter->pos: */ | |
1889 | bch2_btree_iter_set_pos(iter, iter->pos); | |
1890 | k = bkey_s_c_err(ret); | |
1891 | goto out; | |
1892 | } | |
1c6fdbd8 | 1893 | |
969576ec KO |
1894 | l = path_l(iter->path); |
1895 | ||
1896 | if (unlikely(!l->b)) { | |
1897 | /* No btree nodes at requested level: */ | |
1898 | bch2_btree_iter_set_pos(iter, SPOS_MAX); | |
1899 | k = bkey_s_c_null; | |
1900 | goto out; | |
1901 | } | |
1902 | ||
fd211bc7 | 1903 | btree_path_set_should_be_locked(iter->path); |
f7b6ca23 | 1904 | |
969576ec | 1905 | k = btree_path_level_peek_all(trans->c, l, &iter->k); |
e6e024e9 | 1906 | |
f7b6ca23 KO |
1907 | if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && |
1908 | k.k && | |
1909 | (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { | |
8192f8a5 KO |
1910 | k = k2; |
1911 | ret = bkey_err(k); | |
f7b6ca23 | 1912 | if (ret) { |
f7b6ca23 KO |
1913 | bch2_btree_iter_set_pos(iter, iter->pos); |
1914 | goto out; | |
1915 | } | |
f7b6ca23 KO |
1916 | } |
1917 | ||
5222a460 KO |
1918 | if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) |
1919 | k = btree_trans_peek_journal(trans, iter, k); | |
1920 | ||
1921 | next_update = btree_trans_peek_updates(iter); | |
e6e024e9 | 1922 | |
818664f5 | 1923 | if (next_update && |
e6e024e9 | 1924 | bpos_cmp(next_update->k.p, |
969576ec | 1925 | k.k ? k.k->p : l->b->key.k.p) <= 0) { |
5288e66a | 1926 | iter->k = next_update->k; |
818664f5 | 1927 | k = bkey_i_to_s_c(next_update); |
5288e66a | 1928 | } |
818664f5 | 1929 | |
5222a460 KO |
1930 | if (k.k && bkey_deleted(k.k)) { |
1931 | /* | |
1932 | * If we've got a whiteout, and it's after the search | |
1933 | * key, advance the search key to the whiteout instead | |
1934 | * of just after the whiteout - it might be a btree | |
1935 | * whiteout, with a real key at the same position, since | |
1936 | * in the btree deleted keys sort before non deleted. | |
1937 | */ | |
1938 | search_key = bpos_cmp(search_key, k.k->p) | |
1939 | ? k.k->p | |
1940 | : bpos_successor(k.k->p); | |
1941 | continue; | |
1942 | } | |
1943 | ||
818664f5 | 1944 | if (likely(k.k)) { |
c075ff70 | 1945 | break; |
969576ec | 1946 | } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) { |
e6e024e9 | 1947 | /* Advance to next leaf node: */ |
969576ec | 1948 | search_key = bpos_successor(l->b->key.k.p); |
e6e024e9 KO |
1949 | } else { |
1950 | /* End of btree: */ | |
c8476a4e KO |
1951 | bch2_btree_iter_set_pos(iter, SPOS_MAX); |
1952 | k = bkey_s_c_null; | |
1953 | goto out; | |
1954 | } | |
1c6fdbd8 | 1955 | } |
a1e82d35 | 1956 | out: |
a1e82d35 KO |
1957 | bch2_btree_iter_verify(iter); |
1958 | ||
1959 | return k; | |
1960 | } | |
1961 | ||
1962 | /** | |
1963 | * bch2_btree_iter_peek: returns first key greater than or equal to iterator's | |
1964 | * current position | |
1965 | */ | |
85d8cf16 | 1966 | struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) |
a1e82d35 KO |
1967 | { |
1968 | struct btree_trans *trans = iter->trans; | |
1969 | struct bpos search_key = btree_iter_search_key(iter); | |
1970 | struct bkey_s_c k; | |
85d8cf16 | 1971 | struct bpos iter_pos; |
a1e82d35 KO |
1972 | int ret; |
1973 | ||
b0babf2a KO |
1974 | EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); |
1975 | ||
1f2d9192 | 1976 | if (iter->update_path) { |
7dcbdbd8 KO |
1977 | bch2_path_put_nokeep(trans, iter->update_path, |
1978 | iter->flags & BTREE_ITER_INTENT); | |
1f2d9192 KO |
1979 | iter->update_path = NULL; |
1980 | } | |
1981 | ||
a1e82d35 KO |
1982 | bch2_btree_iter_verify_entry_exit(iter); |
1983 | ||
1984 | while (1) { | |
1985 | k = __bch2_btree_iter_peek(iter, search_key); | |
1986 | if (!k.k || bkey_err(k)) | |
fd211bc7 | 1987 | goto out_no_locked; |
a1e82d35 | 1988 | |
85d8cf16 KO |
1989 | /* |
1990 | * iter->pos should be mononotically increasing, and always be | |
1991 | * equal to the key we just returned - except extents can | |
1992 | * straddle iter->pos: | |
1993 | */ | |
1994 | if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) | |
1995 | iter_pos = k.k->p; | |
1996 | else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) | |
1997 | iter_pos = bkey_start_pos(k.k); | |
1998 | else | |
1999 | iter_pos = iter->pos; | |
2000 | ||
2001 | if (bkey_cmp(iter_pos, end) > 0) { | |
2002 | bch2_btree_iter_set_pos(iter, end); | |
2003 | k = bkey_s_c_null; | |
fd211bc7 | 2004 | goto out_no_locked; |
85d8cf16 KO |
2005 | } |
2006 | ||
1f2d9192 KO |
2007 | if (iter->update_path && |
2008 | bkey_cmp(iter->update_path->pos, k.k->p)) { | |
7dcbdbd8 KO |
2009 | bch2_path_put_nokeep(trans, iter->update_path, |
2010 | iter->flags & BTREE_ITER_INTENT); | |
1f2d9192 KO |
2011 | iter->update_path = NULL; |
2012 | } | |
2013 | ||
2014 | if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && | |
2015 | (iter->flags & BTREE_ITER_INTENT) && | |
2016 | !(iter->flags & BTREE_ITER_IS_EXTENTS) && | |
2017 | !iter->update_path) { | |
2018 | struct bpos pos = k.k->p; | |
2019 | ||
2020 | if (pos.snapshot < iter->snapshot) { | |
2021 | search_key = bpos_successor(k.k->p); | |
2022 | continue; | |
2023 | } | |
2024 | ||
2025 | pos.snapshot = iter->snapshot; | |
2026 | ||
2027 | /* | |
2028 | * advance, same as on exit for iter->path, but only up | |
2029 | * to snapshot | |
2030 | */ | |
2031 | __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); | |
2032 | iter->update_path = iter->path; | |
2033 | ||
ce91abd6 | 2034 | iter->update_path = bch2_btree_path_set_pos(trans, |
1f2d9192 KO |
2035 | iter->update_path, pos, |
2036 | iter->flags & BTREE_ITER_INTENT); | |
1f2d9192 KO |
2037 | } |
2038 | ||
a1e82d35 KO |
2039 | /* |
2040 | * We can never have a key in a leaf node at POS_MAX, so | |
2041 | * we don't have to check these successor() calls: | |
2042 | */ | |
2043 | if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && | |
2044 | !bch2_snapshot_is_ancestor(trans->c, | |
2045 | iter->snapshot, | |
2046 | k.k->p.snapshot)) { | |
2047 | search_key = bpos_successor(k.k->p); | |
2048 | continue; | |
2049 | } | |
2050 | ||
2051 | if (bkey_whiteout(k.k) && | |
2052 | !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { | |
2053 | search_key = bkey_successor(iter, k.k->p); | |
2054 | continue; | |
2055 | } | |
2056 | ||
2057 | break; | |
2058 | } | |
2059 | ||
85d8cf16 | 2060 | iter->pos = iter_pos; |
1f2d9192 | 2061 | |
ce91abd6 | 2062 | iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, |
1f2d9192 | 2063 | iter->flags & BTREE_ITER_INTENT); |
fd211bc7 KO |
2064 | |
2065 | btree_path_set_should_be_locked(iter->path); | |
2066 | out_no_locked: | |
1f2d9192 | 2067 | if (iter->update_path) { |
549d173c | 2068 | ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); |
fd211bc7 | 2069 | if (unlikely(ret)) |
549d173c | 2070 | k = bkey_s_c_err(ret); |
fd211bc7 KO |
2071 | else |
2072 | btree_path_set_should_be_locked(iter->update_path); | |
1f2d9192 | 2073 | } |
1f2d9192 | 2074 | |
a1e82d35 | 2075 | if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) |
c075ff70 KO |
2076 | iter->pos.snapshot = iter->snapshot; |
2077 | ||
a1e82d35 KO |
2078 | ret = bch2_btree_iter_verify_ret(iter, k); |
2079 | if (unlikely(ret)) { | |
2080 | bch2_btree_iter_set_pos(iter, iter->pos); | |
2081 | k = bkey_s_c_err(ret); | |
2082 | } | |
67e0dd8f | 2083 | |
7e1a3aa9 | 2084 | bch2_btree_iter_verify_entry_exit(iter); |
c075ff70 | 2085 | |
1c6fdbd8 KO |
2086 | return k; |
2087 | } | |
2088 | ||
b0babf2a KO |
2089 | /** |
2090 | * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal | |
2091 | * to iterator's current position, returning keys from every level of the btree. | |
2092 | * For keys at different levels of the btree that compare equal, the key from | |
2093 | * the lower level (leaf) is returned first. | |
2094 | */ | |
2095 | struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) | |
2096 | { | |
2097 | struct btree_trans *trans = iter->trans; | |
2098 | struct bkey_s_c k; | |
2099 | int ret; | |
2100 | ||
2101 | EBUG_ON(iter->path->cached); | |
2102 | bch2_btree_iter_verify(iter); | |
2103 | BUG_ON(iter->path->level < iter->min_depth); | |
2104 | BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); | |
2105 | EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); | |
2106 | ||
2107 | while (1) { | |
2108 | iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, | |
2109 | iter->flags & BTREE_ITER_INTENT); | |
2110 | ||
2111 | ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); | |
2112 | if (unlikely(ret)) { | |
2113 | /* ensure that iter->k is consistent with iter->pos: */ | |
2114 | bch2_btree_iter_set_pos(iter, iter->pos); | |
2115 | k = bkey_s_c_err(ret); | |
fd211bc7 | 2116 | goto out_no_locked; |
b0babf2a KO |
2117 | } |
2118 | ||
2119 | /* Already at end? */ | |
2120 | if (!btree_path_node(iter->path, iter->path->level)) { | |
2121 | k = bkey_s_c_null; | |
fd211bc7 | 2122 | goto out_no_locked; |
b0babf2a KO |
2123 | } |
2124 | ||
2125 | k = btree_path_level_peek_all(trans->c, | |
2126 | &iter->path->l[iter->path->level], &iter->k); | |
2127 | ||
2128 | /* Check if we should go up to the parent node: */ | |
2129 | if (!k.k || | |
2130 | (iter->advanced && | |
2131 | !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) { | |
2132 | iter->pos = path_l(iter->path)->b->key.k.p; | |
8bfe14e8 | 2133 | btree_path_set_level_up(trans, iter->path); |
b0babf2a KO |
2134 | iter->advanced = false; |
2135 | continue; | |
2136 | } | |
2137 | ||
2138 | /* | |
2139 | * Check if we should go back down to a leaf: | |
2140 | * If we're not in a leaf node, we only return the current key | |
2141 | * if it exactly matches iter->pos - otherwise we first have to | |
2142 | * go back to the leaf: | |
2143 | */ | |
2144 | if (iter->path->level != iter->min_depth && | |
2145 | (iter->advanced || | |
2146 | !k.k || | |
2147 | bpos_cmp(iter->pos, k.k->p))) { | |
2148 | btree_path_set_level_down(trans, iter->path, iter->min_depth); | |
2149 | iter->pos = bpos_successor(iter->pos); | |
2150 | iter->advanced = false; | |
2151 | continue; | |
2152 | } | |
2153 | ||
2154 | /* Check if we should go to the next key: */ | |
2155 | if (iter->path->level == iter->min_depth && | |
2156 | iter->advanced && | |
2157 | k.k && | |
2158 | !bpos_cmp(iter->pos, k.k->p)) { | |
2159 | iter->pos = bpos_successor(iter->pos); | |
2160 | iter->advanced = false; | |
2161 | continue; | |
2162 | } | |
2163 | ||
2164 | if (iter->advanced && | |
2165 | iter->path->level == iter->min_depth && | |
2166 | bpos_cmp(k.k->p, iter->pos)) | |
2167 | iter->advanced = false; | |
2168 | ||
2169 | BUG_ON(iter->advanced); | |
2170 | BUG_ON(!k.k); | |
2171 | break; | |
2172 | } | |
2173 | ||
2174 | iter->pos = k.k->p; | |
fd211bc7 KO |
2175 | btree_path_set_should_be_locked(iter->path); |
2176 | out_no_locked: | |
b0babf2a KO |
2177 | bch2_btree_iter_verify(iter); |
2178 | ||
2179 | return k; | |
2180 | } | |
2181 | ||
f4b61341 KO |
2182 | /** |
2183 | * bch2_btree_iter_next: returns first key greater than iterator's current | |
2184 | * position | |
2185 | */ | |
1c6fdbd8 KO |
2186 | struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) |
2187 | { | |
e0ba3b64 | 2188 | if (!bch2_btree_iter_advance(iter)) |
2e70ce56 | 2189 | return bkey_s_c_null; |
f4b61341 | 2190 | |
2dac0eae | 2191 | return bch2_btree_iter_peek(iter); |
1c6fdbd8 KO |
2192 | } |
2193 | ||
ccf5a109 KO |
2194 | /** |
2195 | * bch2_btree_iter_peek_prev: returns first key less than or equal to | |
2196 | * iterator's current position | |
2197 | */ | |
2198 | struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) | |
1c6fdbd8 | 2199 | { |
67e0dd8f | 2200 | struct btree_trans *trans = iter->trans; |
c8476a4e | 2201 | struct bpos search_key = iter->pos; |
c075ff70 | 2202 | struct btree_path *saved_path = NULL; |
1c6fdbd8 | 2203 | struct bkey_s_c k; |
c075ff70 KO |
2204 | struct bkey saved_k; |
2205 | const struct bch_val *saved_v; | |
1c6fdbd8 KO |
2206 | int ret; |
2207 | ||
67e0dd8f | 2208 | EBUG_ON(iter->path->cached || iter->path->level); |
5288e66a | 2209 | EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); |
5222a460 KO |
2210 | |
2211 | if (iter->flags & BTREE_ITER_WITH_JOURNAL) | |
2212 | return bkey_s_c_err(-EIO); | |
2213 | ||
7e1a3aa9 KO |
2214 | bch2_btree_iter_verify(iter); |
2215 | bch2_btree_iter_verify_entry_exit(iter); | |
2216 | ||
c075ff70 KO |
2217 | if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) |
2218 | search_key.snapshot = U32_MAX; | |
2219 | ||
1c6fdbd8 | 2220 | while (1) { |
ce91abd6 | 2221 | iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, |
67e0dd8f | 2222 | iter->flags & BTREE_ITER_INTENT); |
c8476a4e | 2223 | |
67e0dd8f | 2224 | ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); |
7e1a3aa9 | 2225 | if (unlikely(ret)) { |
e6e024e9 KO |
2226 | /* ensure that iter->k is consistent with iter->pos: */ |
2227 | bch2_btree_iter_set_pos(iter, iter->pos); | |
7e1a3aa9 | 2228 | k = bkey_s_c_err(ret); |
fd211bc7 | 2229 | goto out_no_locked; |
7e1a3aa9 | 2230 | } |
1c6fdbd8 | 2231 | |
67e0dd8f KO |
2232 | k = btree_path_level_peek(trans, iter->path, |
2233 | &iter->path->l[0], &iter->k); | |
3d495595 KO |
2234 | if (!k.k || |
2235 | ((iter->flags & BTREE_ITER_IS_EXTENTS) | |
c075ff70 KO |
2236 | ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 |
2237 | : bpos_cmp(k.k->p, search_key) > 0)) | |
67e0dd8f KO |
2238 | k = btree_path_level_prev(trans, iter->path, |
2239 | &iter->path->l[0], &iter->k); | |
ccf5a109 | 2240 | |
e6e024e9 | 2241 | if (likely(k.k)) { |
c075ff70 KO |
2242 | if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { |
2243 | if (k.k->p.snapshot == iter->snapshot) | |
2244 | goto got_key; | |
2245 | ||
2246 | /* | |
2247 | * If we have a saved candidate, and we're no | |
2248 | * longer at the same _key_ (not pos), return | |
2249 | * that candidate | |
2250 | */ | |
2251 | if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { | |
7dcbdbd8 | 2252 | bch2_path_put_nokeep(trans, iter->path, |
c075ff70 KO |
2253 | iter->flags & BTREE_ITER_INTENT); |
2254 | iter->path = saved_path; | |
2255 | saved_path = NULL; | |
2256 | iter->k = saved_k; | |
2257 | k.v = saved_v; | |
2258 | goto got_key; | |
2259 | } | |
2260 | ||
2261 | if (bch2_snapshot_is_ancestor(iter->trans->c, | |
2262 | iter->snapshot, | |
2263 | k.k->p.snapshot)) { | |
2264 | if (saved_path) | |
7dcbdbd8 | 2265 | bch2_path_put_nokeep(trans, saved_path, |
c075ff70 KO |
2266 | iter->flags & BTREE_ITER_INTENT); |
2267 | saved_path = btree_path_clone(trans, iter->path, | |
2268 | iter->flags & BTREE_ITER_INTENT); | |
2269 | saved_k = *k.k; | |
2270 | saved_v = k.v; | |
2271 | } | |
2272 | ||
2273 | search_key = bpos_predecessor(k.k->p); | |
2274 | continue; | |
2275 | } | |
2276 | got_key: | |
2277 | if (bkey_whiteout(k.k) && | |
2278 | !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { | |
2279 | search_key = bkey_predecessor(iter, k.k->p); | |
2280 | if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) | |
2281 | search_key.snapshot = U32_MAX; | |
2282 | continue; | |
2283 | } | |
2284 | ||
1c6fdbd8 | 2285 | break; |
67e0dd8f | 2286 | } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { |
e6e024e9 | 2287 | /* Advance to previous leaf node: */ |
67e0dd8f | 2288 | search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); |
e6e024e9 KO |
2289 | } else { |
2290 | /* Start of btree: */ | |
c8476a4e | 2291 | bch2_btree_iter_set_pos(iter, POS_MIN); |
7e1a3aa9 | 2292 | k = bkey_s_c_null; |
fd211bc7 | 2293 | goto out_no_locked; |
7e1a3aa9 | 2294 | } |
ccf5a109 | 2295 | } |
1c6fdbd8 | 2296 | |
a045be5a | 2297 | EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); |
3d495595 KO |
2298 | |
2299 | /* Extents can straddle iter->pos: */ | |
a045be5a | 2300 | if (bkey_cmp(k.k->p, iter->pos) < 0) |
3d495595 | 2301 | iter->pos = k.k->p; |
c075ff70 KO |
2302 | |
2303 | if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) | |
2304 | iter->pos.snapshot = iter->snapshot; | |
fd211bc7 KO |
2305 | |
2306 | btree_path_set_should_be_locked(iter->path); | |
2307 | out_no_locked: | |
c075ff70 | 2308 | if (saved_path) |
7dcbdbd8 | 2309 | bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); |
67e0dd8f | 2310 | |
7e1a3aa9 KO |
2311 | bch2_btree_iter_verify_entry_exit(iter); |
2312 | bch2_btree_iter_verify(iter); | |
67e0dd8f | 2313 | |
1c6fdbd8 KO |
2314 | return k; |
2315 | } | |
2316 | ||
ccf5a109 KO |
2317 | /** |
2318 | * bch2_btree_iter_prev: returns first key less than iterator's current | |
2319 | * position | |
2320 | */ | |
2321 | struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) | |
2322 | { | |
e0ba3b64 | 2323 | if (!bch2_btree_iter_rewind(iter)) |
2e70ce56 | 2324 | return bkey_s_c_null; |
ccf5a109 | 2325 | |
2e70ce56 | 2326 | return bch2_btree_iter_peek_prev(iter); |
ccf5a109 KO |
2327 | } |
2328 | ||
c3801239 | 2329 | struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) |
271a3d3a | 2330 | { |
9f6bd307 | 2331 | struct btree_trans *trans = iter->trans; |
953ee28a | 2332 | struct bpos search_key; |
271a3d3a | 2333 | struct bkey_s_c k; |
c3801239 KO |
2334 | int ret; |
2335 | ||
7e1a3aa9 KO |
2336 | bch2_btree_iter_verify(iter); |
2337 | bch2_btree_iter_verify_entry_exit(iter); | |
b0babf2a | 2338 | EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); |
2ae4573e | 2339 | EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); |
7e1a3aa9 | 2340 | |
1d214eb1 KO |
2341 | /* extents can't span inode numbers: */ |
2342 | if ((iter->flags & BTREE_ITER_IS_EXTENTS) && | |
953ee28a | 2343 | unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { |
1d214eb1 KO |
2344 | if (iter->pos.inode == KEY_INODE_MAX) |
2345 | return bkey_s_c_null; | |
c3801239 | 2346 | |
1d214eb1 KO |
2347 | bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); |
2348 | } | |
8042b5b7 | 2349 | |
953ee28a | 2350 | search_key = btree_iter_search_key(iter); |
ce91abd6 | 2351 | iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, |
67e0dd8f | 2352 | iter->flags & BTREE_ITER_INTENT); |
953ee28a | 2353 | |
67e0dd8f | 2354 | ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); |
23dfb3a2 KO |
2355 | if (unlikely(ret)) { |
2356 | k = bkey_s_c_err(ret); | |
2357 | goto out_no_locked; | |
2358 | } | |
1c6fdbd8 | 2359 | |
c075ff70 KO |
2360 | if ((iter->flags & BTREE_ITER_CACHED) || |
2361 | !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { | |
953ee28a | 2362 | struct bkey_i *next_update; |
1c6fdbd8 | 2363 | |
5222a460 | 2364 | if ((next_update = btree_trans_peek_updates(iter)) && |
67e0dd8f | 2365 | !bpos_cmp(next_update->k.p, iter->pos)) { |
1d214eb1 KO |
2366 | iter->k = next_update->k; |
2367 | k = bkey_i_to_s_c(next_update); | |
5222a460 | 2368 | goto out; |
1d214eb1 KO |
2369 | } |
2370 | ||
5222a460 KO |
2371 | if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && |
2372 | (k = btree_trans_peek_slot_journal(trans, iter)).k) | |
2373 | goto out; | |
2374 | ||
f7b6ca23 | 2375 | if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && |
8192f8a5 KO |
2376 | (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) { |
2377 | if (!bkey_err(k)) | |
f7b6ca23 | 2378 | iter->k = *k.k; |
8192f8a5 KO |
2379 | /* We're not returning a key from iter->path: */ |
2380 | goto out_no_locked; | |
f7b6ca23 KO |
2381 | } |
2382 | ||
5222a460 | 2383 | k = bch2_btree_path_peek_slot(iter->path, &iter->k); |
1d214eb1 KO |
2384 | } else { |
2385 | struct bpos next; | |
1d214eb1 | 2386 | |
2ae4573e KO |
2387 | EBUG_ON(iter->path->level); |
2388 | ||
b1d87f52 | 2389 | if (iter->flags & BTREE_ITER_INTENT) { |
67e0dd8f | 2390 | struct btree_iter iter2; |
85d8cf16 KO |
2391 | struct bpos end = iter->pos; |
2392 | ||
2393 | if (iter->flags & BTREE_ITER_IS_EXTENTS) | |
2394 | end.offset = U64_MAX; | |
b1d87f52 | 2395 | |
67e0dd8f | 2396 | bch2_trans_copy_iter(&iter2, iter); |
85d8cf16 | 2397 | k = bch2_btree_iter_peek_upto(&iter2, end); |
b1d87f52 | 2398 | |
67e0dd8f KO |
2399 | if (k.k && !bkey_err(k)) { |
2400 | iter->k = iter2.k; | |
2401 | k.k = &iter->k; | |
2402 | } | |
2403 | bch2_trans_iter_exit(trans, &iter2); | |
b1d87f52 KO |
2404 | } else { |
2405 | struct bpos pos = iter->pos; | |
2406 | ||
2407 | k = bch2_btree_iter_peek(iter); | |
23dfb3a2 KO |
2408 | if (unlikely(bkey_err(k))) |
2409 | bch2_btree_iter_set_pos(iter, pos); | |
2410 | else | |
2411 | iter->pos = pos; | |
b1d87f52 | 2412 | } |
1d214eb1 KO |
2413 | |
2414 | if (unlikely(bkey_err(k))) | |
2415 | return k; | |
2416 | ||
2417 | next = k.k ? bkey_start_pos(k.k) : POS_MAX; | |
2418 | ||
2419 | if (bkey_cmp(iter->pos, next) < 0) { | |
2420 | bkey_init(&iter->k); | |
2421 | iter->k.p = iter->pos; | |
c075ff70 KO |
2422 | |
2423 | if (iter->flags & BTREE_ITER_IS_EXTENTS) { | |
2424 | bch2_key_resize(&iter->k, | |
2425 | min_t(u64, KEY_SIZE_MAX, | |
2426 | (next.inode == iter->pos.inode | |
2427 | ? next.offset | |
2428 | : KEY_OFFSET_MAX) - | |
2429 | iter->pos.offset)); | |
2430 | EBUG_ON(!iter->k.size); | |
2431 | } | |
1d214eb1 KO |
2432 | |
2433 | k = (struct bkey_s_c) { &iter->k, NULL }; | |
1d214eb1 | 2434 | } |
271a3d3a | 2435 | } |
5222a460 | 2436 | out: |
fd211bc7 KO |
2437 | btree_path_set_should_be_locked(iter->path); |
2438 | out_no_locked: | |
7e1a3aa9 KO |
2439 | bch2_btree_iter_verify_entry_exit(iter); |
2440 | bch2_btree_iter_verify(iter); | |
c075ff70 KO |
2441 | ret = bch2_btree_iter_verify_ret(iter, k); |
2442 | if (unlikely(ret)) | |
2443 | return bkey_s_c_err(ret); | |
66a0a497 | 2444 | |
63f1a598 | 2445 | return k; |
1c6fdbd8 KO |
2446 | } |
2447 | ||
1c6fdbd8 KO |
2448 | struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) |
2449 | { | |
e0ba3b64 | 2450 | if (!bch2_btree_iter_advance(iter)) |
2e70ce56 | 2451 | return bkey_s_c_null; |
1c6fdbd8 | 2452 | |
2dac0eae | 2453 | return bch2_btree_iter_peek_slot(iter); |
1c6fdbd8 KO |
2454 | } |
2455 | ||
18fc6ae5 KO |
2456 | struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) |
2457 | { | |
e0ba3b64 | 2458 | if (!bch2_btree_iter_rewind(iter)) |
18fc6ae5 KO |
2459 | return bkey_s_c_null; |
2460 | ||
2461 | return bch2_btree_iter_peek_slot(iter); | |
2462 | } | |
2463 | ||
1c6fdbd8 KO |
2464 | /* new transactional stuff: */ |
2465 | ||
0423fb71 KO |
2466 | #ifdef CONFIG_BCACHEFS_DEBUG |
2467 | static void btree_trans_verify_sorted_refs(struct btree_trans *trans) | |
2468 | { | |
67e0dd8f | 2469 | struct btree_path *path; |
0423fb71 KO |
2470 | unsigned i; |
2471 | ||
67e0dd8f | 2472 | BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); |
0423fb71 | 2473 | |
67e0dd8f KO |
2474 | trans_for_each_path(trans, path) { |
2475 | BUG_ON(path->sorted_idx >= trans->nr_sorted); | |
2476 | BUG_ON(trans->sorted[path->sorted_idx] != path->idx); | |
0423fb71 KO |
2477 | } |
2478 | ||
2479 | for (i = 0; i < trans->nr_sorted; i++) { | |
2480 | unsigned idx = trans->sorted[i]; | |
2481 | ||
67e0dd8f KO |
2482 | EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); |
2483 | BUG_ON(trans->paths[idx].sorted_idx != i); | |
0423fb71 KO |
2484 | } |
2485 | } | |
0423fb71 KO |
2486 | |
2487 | static void btree_trans_verify_sorted(struct btree_trans *trans) | |
2488 | { | |
67e0dd8f | 2489 | struct btree_path *path, *prev = NULL; |
0423fb71 KO |
2490 | unsigned i; |
2491 | ||
ee4d17d0 KO |
2492 | if (!bch2_debug_check_iterators) |
2493 | return; | |
2494 | ||
67e0dd8f | 2495 | trans_for_each_path_inorder(trans, path, i) { |
eb7bd15f | 2496 | if (prev && btree_path_cmp(prev, path) > 0) { |
5c0bb66a | 2497 | __bch2_dump_trans_paths_updates(trans, true); |
eb7bd15f KO |
2498 | panic("trans paths out of order!\n"); |
2499 | } | |
67e0dd8f | 2500 | prev = path; |
0423fb71 | 2501 | } |
0423fb71 | 2502 | } |
068bcaa5 KO |
2503 | #else |
2504 | static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} | |
2505 | static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} | |
2506 | #endif | |
0423fb71 | 2507 | |
068bcaa5 | 2508 | void __bch2_btree_trans_sort_paths(struct btree_trans *trans) |
0423fb71 KO |
2509 | { |
2510 | int i, l = 0, r = trans->nr_sorted, inc = 1; | |
2511 | bool swapped; | |
2512 | ||
068bcaa5 KO |
2513 | btree_trans_verify_sorted_refs(trans); |
2514 | ||
2515 | if (trans->paths_sorted) | |
2516 | goto out; | |
2517 | ||
0423fb71 KO |
2518 | /* |
2519 | * Cocktail shaker sort: this is efficient because iterators will be | |
5c0bb66a | 2520 | * mostly sorted. |
0423fb71 KO |
2521 | */ |
2522 | do { | |
2523 | swapped = false; | |
2524 | ||
2525 | for (i = inc > 0 ? l : r - 2; | |
2526 | i + 1 < r && i >= l; | |
2527 | i += inc) { | |
67e0dd8f KO |
2528 | if (btree_path_cmp(trans->paths + trans->sorted[i], |
2529 | trans->paths + trans->sorted[i + 1]) > 0) { | |
0423fb71 | 2530 | swap(trans->sorted[i], trans->sorted[i + 1]); |
67e0dd8f KO |
2531 | trans->paths[trans->sorted[i]].sorted_idx = i; |
2532 | trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; | |
0423fb71 KO |
2533 | swapped = true; |
2534 | } | |
2535 | } | |
2536 | ||
2537 | if (inc > 0) | |
2538 | --r; | |
2539 | else | |
2540 | l++; | |
2541 | inc = -inc; | |
2542 | } while (swapped); | |
2543 | ||
67e0dd8f | 2544 | trans->paths_sorted = true; |
068bcaa5 | 2545 | out: |
0423fb71 KO |
2546 | btree_trans_verify_sorted(trans); |
2547 | } | |
2548 | ||
67e0dd8f KO |
2549 | static inline void btree_path_list_remove(struct btree_trans *trans, |
2550 | struct btree_path *path) | |
0423fb71 KO |
2551 | { |
2552 | unsigned i; | |
2553 | ||
67e0dd8f | 2554 | EBUG_ON(path->sorted_idx >= trans->nr_sorted); |
0423fb71 KO |
2555 | #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS |
2556 | trans->nr_sorted--; | |
67e0dd8f KO |
2557 | memmove_u64s_down_small(trans->sorted + path->sorted_idx, |
2558 | trans->sorted + path->sorted_idx + 1, | |
2559 | DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); | |
0423fb71 | 2560 | #else |
67e0dd8f | 2561 | array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); |
0423fb71 | 2562 | #endif |
67e0dd8f KO |
2563 | for (i = path->sorted_idx; i < trans->nr_sorted; i++) |
2564 | trans->paths[trans->sorted[i]].sorted_idx = i; | |
0423fb71 | 2565 | |
67e0dd8f | 2566 | path->sorted_idx = U8_MAX; |
0423fb71 KO |
2567 | } |
2568 | ||
67e0dd8f KO |
2569 | static inline void btree_path_list_add(struct btree_trans *trans, |
2570 | struct btree_path *pos, | |
2571 | struct btree_path *path) | |
0423fb71 KO |
2572 | { |
2573 | unsigned i; | |
2574 | ||
068bcaa5 | 2575 | path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted; |
0423fb71 KO |
2576 | |
2577 | #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS | |
67e0dd8f KO |
2578 | memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, |
2579 | trans->sorted + path->sorted_idx, | |
2580 | DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); | |
0423fb71 | 2581 | trans->nr_sorted++; |
67e0dd8f | 2582 | trans->sorted[path->sorted_idx] = path->idx; |
0423fb71 | 2583 | #else |
67e0dd8f | 2584 | array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); |
0423fb71 KO |
2585 | #endif |
2586 | ||
67e0dd8f KO |
2587 | for (i = path->sorted_idx; i < trans->nr_sorted; i++) |
2588 | trans->paths[trans->sorted[i]].sorted_idx = i; | |
64bc0011 | 2589 | |
0423fb71 | 2590 | btree_trans_verify_sorted_refs(trans); |
7c26ecae KO |
2591 | } |
2592 | ||
67e0dd8f | 2593 | void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) |
64bc0011 | 2594 | { |
67e0dd8f KO |
2595 | if (iter->path) |
2596 | bch2_path_put(trans, iter->path, | |
2597 | iter->flags & BTREE_ITER_INTENT); | |
1f2d9192 | 2598 | if (iter->update_path) |
7dcbdbd8 | 2599 | bch2_path_put_nokeep(trans, iter->update_path, |
1f2d9192 | 2600 | iter->flags & BTREE_ITER_INTENT); |
f7b6ca23 KO |
2601 | if (iter->key_cache_path) |
2602 | bch2_path_put(trans, iter->key_cache_path, | |
2603 | iter->flags & BTREE_ITER_INTENT); | |
67e0dd8f | 2604 | iter->path = NULL; |
1f2d9192 | 2605 | iter->update_path = NULL; |
f7b6ca23 | 2606 | iter->key_cache_path = NULL; |
64bc0011 KO |
2607 | } |
2608 | ||
c298fd7d KO |
2609 | static inline void __bch2_trans_iter_init(struct btree_trans *trans, |
2610 | struct btree_iter *iter, | |
2611 | enum btree_id btree_id, struct bpos pos, | |
2612 | unsigned locks_want, | |
2613 | unsigned depth, | |
2614 | unsigned flags) | |
1c6fdbd8 | 2615 | { |
57ce8274 KO |
2616 | if (trans->restarted) |
2617 | panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n", | |
2618 | bch2_err_str(trans->restarted), | |
2619 | (void *) trans->last_restarted_ip); | |
e5af273f | 2620 | |
b0babf2a KO |
2621 | if (flags & BTREE_ITER_ALL_LEVELS) |
2622 | flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; | |
2623 | ||
f21566f1 KO |
2624 | if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && |
2625 | btree_node_type_is_extents(btree_id)) | |
3dea728c | 2626 | flags |= BTREE_ITER_IS_EXTENTS; |
1c6fdbd8 | 2627 | |
a861c722 KO |
2628 | if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && |
2629 | !btree_type_has_snapshots(btree_id)) | |
e751c01a KO |
2630 | flags &= ~BTREE_ITER_ALL_SNAPSHOTS; |
2631 | ||
a861c722 KO |
2632 | if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && |
2633 | btree_type_has_snapshots(btree_id)) | |
2634 | flags |= BTREE_ITER_FILTER_SNAPSHOTS; | |
e751c01a | 2635 | |
5222a460 KO |
2636 | if (trans->journal_replay_not_finished) |
2637 | flags |= BTREE_ITER_WITH_JOURNAL; | |
2638 | ||
67e0dd8f KO |
2639 | iter->trans = trans; |
2640 | iter->path = NULL; | |
1f2d9192 | 2641 | iter->update_path = NULL; |
f7b6ca23 | 2642 | iter->key_cache_path = NULL; |
67e0dd8f KO |
2643 | iter->btree_id = btree_id; |
2644 | iter->min_depth = depth; | |
f21566f1 KO |
2645 | iter->flags = flags; |
2646 | iter->snapshot = pos.snapshot; | |
67e0dd8f KO |
2647 | iter->pos = pos; |
2648 | iter->k.type = KEY_TYPE_deleted; | |
2649 | iter->k.p = pos; | |
2650 | iter->k.size = 0; | |
30525f68 KO |
2651 | iter->journal_idx = 0; |
2652 | iter->journal_pos = POS_MIN; | |
e751c01a | 2653 | |
f3e1f444 KO |
2654 | iter->path = bch2_path_get(trans, btree_id, iter->pos, |
2655 | locks_want, depth, flags); | |
1c6fdbd8 KO |
2656 | } |
2657 | ||
67e0dd8f KO |
2658 | void bch2_trans_iter_init(struct btree_trans *trans, |
2659 | struct btree_iter *iter, | |
2660 | unsigned btree_id, struct bpos pos, | |
2661 | unsigned flags) | |
424eb881 | 2662 | { |
2ae4573e KO |
2663 | if (!btree_id_cached(trans->c, btree_id)) { |
2664 | flags &= ~BTREE_ITER_CACHED; | |
2665 | flags &= ~BTREE_ITER_WITH_KEY_CACHE; | |
2666 | } else if (!(flags & BTREE_ITER_CACHED)) | |
2667 | flags |= BTREE_ITER_WITH_KEY_CACHE; | |
2668 | ||
67e0dd8f KO |
2669 | __bch2_trans_iter_init(trans, iter, btree_id, pos, |
2670 | 0, 0, flags); | |
424eb881 KO |
2671 | } |
2672 | ||
67e0dd8f KO |
2673 | void bch2_trans_node_iter_init(struct btree_trans *trans, |
2674 | struct btree_iter *iter, | |
2675 | enum btree_id btree_id, | |
2676 | struct bpos pos, | |
2677 | unsigned locks_want, | |
2678 | unsigned depth, | |
2679 | unsigned flags) | |
1c6fdbd8 | 2680 | { |
67e0dd8f KO |
2681 | __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth, |
2682 | BTREE_ITER_NOT_EXTENTS| | |
2683 | __BTREE_ITER_ALL_SNAPSHOTS| | |
2684 | BTREE_ITER_ALL_SNAPSHOTS| | |
2685 | flags); | |
2686 | BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); | |
2687 | BUG_ON(iter->path->level != depth); | |
2688 | BUG_ON(iter->min_depth != depth); | |
2689 | } | |
7c26ecae | 2690 | |
67e0dd8f KO |
2691 | void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) |
2692 | { | |
2693 | *dst = *src; | |
2694 | if (src->path) | |
2695 | __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); | |
1f2d9192 KO |
2696 | if (src->update_path) |
2697 | __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); | |
f7b6ca23 | 2698 | dst->key_cache_path = NULL; |
1c6fdbd8 KO |
2699 | } |
2700 | ||
14d8f26a | 2701 | void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) |
1c6fdbd8 | 2702 | { |
616928c3 | 2703 | unsigned new_top = trans->mem_top + size; |
14d8f26a KO |
2704 | size_t old_bytes = trans->mem_bytes; |
2705 | size_t new_bytes = roundup_pow_of_two(new_top); | |
2706 | void *new_mem; | |
73a117d2 KO |
2707 | void *p; |
2708 | ||
616928c3 KO |
2709 | trans->mem_max = max(trans->mem_max, new_top); |
2710 | ||
14d8f26a | 2711 | WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); |
e131b6aa | 2712 | |
14d8f26a KO |
2713 | new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); |
2714 | if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { | |
2715 | new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); | |
2716 | new_bytes = BTREE_TRANS_MEM_MAX; | |
2717 | kfree(trans->mem); | |
2718 | } | |
1c6fdbd8 | 2719 | |
14d8f26a KO |
2720 | if (!new_mem) |
2721 | return ERR_PTR(-ENOMEM); | |
1c6fdbd8 | 2722 | |
14d8f26a KO |
2723 | trans->mem = new_mem; |
2724 | trans->mem_bytes = new_bytes; | |
1c6fdbd8 | 2725 | |
14d8f26a KO |
2726 | if (old_bytes) { |
2727 | trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); | |
2728 | return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); | |
1c6fdbd8 KO |
2729 | } |
2730 | ||
20bceecb | 2731 | p = trans->mem + trans->mem_top; |
1c6fdbd8 | 2732 | trans->mem_top += size; |
7ed158f2 | 2733 | memset(p, 0, size); |
20bceecb | 2734 | return p; |
1c6fdbd8 KO |
2735 | } |
2736 | ||
d38494c4 | 2737 | /** |
955af634 | 2738 | * bch2_trans_begin() - reset a transaction after a interrupted attempt |
d38494c4 | 2739 | * @trans: transaction to reset |
d38494c4 | 2740 | * |
549d173c KO |
2741 | * While iterating over nodes or updating nodes a attempt to lock a btree node |
2742 | * may return BCH_ERR_transaction_restart when the trylock fails. When this | |
2743 | * occurs bch2_trans_begin() should be called and the transaction retried. | |
d38494c4 | 2744 | */ |
e941ae7d | 2745 | u32 bch2_trans_begin(struct btree_trans *trans) |
1c6fdbd8 | 2746 | { |
67e0dd8f | 2747 | struct btree_path *path; |
a8e00bd4 | 2748 | |
0fbf71f8 | 2749 | bch2_trans_reset_updates(trans); |
1c6fdbd8 | 2750 | |
c497df8b | 2751 | trans->restart_count++; |
163e885a | 2752 | trans->mem_top = 0; |
bf7b87a4 | 2753 | |
f21539a5 KO |
2754 | if (trans->fs_usage_deltas) { |
2755 | trans->fs_usage_deltas->used = 0; | |
2756 | memset((void *) trans->fs_usage_deltas + | |
2757 | offsetof(struct replicas_delta_list, memset_start), 0, | |
2758 | (void *) &trans->fs_usage_deltas->memset_end - | |
2759 | (void *) &trans->fs_usage_deltas->memset_start); | |
2760 | } | |
2761 | ||
67e0dd8f | 2762 | trans_for_each_path(trans, path) { |
25a77231 KO |
2763 | path->should_be_locked = false; |
2764 | ||
e1f7fa06 KO |
2765 | /* |
2766 | * If the transaction wasn't restarted, we're presuming to be | |
2767 | * doing something new: dont keep iterators excpt the ones that | |
2768 | * are in use - except for the subvolumes btree: | |
2769 | */ | |
2770 | if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) | |
2771 | path->preserve = false; | |
2772 | ||
67e0dd8f KO |
2773 | /* |
2774 | * XXX: we probably shouldn't be doing this if the transaction | |
2775 | * was restarted, but currently we still overflow transaction | |
2776 | * iterators if we do that | |
2777 | */ | |
2778 | if (!path->ref && !path->preserve) | |
2779 | __bch2_path_free(trans, path); | |
2780 | else | |
25a77231 | 2781 | path->preserve = false; |
67e0dd8f KO |
2782 | } |
2783 | ||
43de721a KO |
2784 | if (!trans->restarted && |
2785 | (need_resched() || | |
2786 | ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { | |
2787 | bch2_trans_unlock(trans); | |
2788 | cond_resched(); | |
2789 | bch2_trans_relock(trans); | |
2790 | } | |
50dc0f69 | 2791 | |
e941ae7d | 2792 | trans->last_restarted_ip = _RET_IP_; |
955af634 | 2793 | if (trans->restarted) |
67e0dd8f | 2794 | bch2_btree_path_traverse_all(trans); |
e5af273f | 2795 | |
43de721a | 2796 | trans->last_begin_time = ktime_get_ns(); |
e941ae7d KO |
2797 | return trans->restart_count; |
2798 | } | |
2799 | ||
2800 | void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) | |
2801 | { | |
efd0d038 KO |
2802 | if (trans_was_restarted(trans, restart_count)) |
2803 | panic("trans->restart_count %u, should be %u, last restarted by %pS\n", | |
2804 | trans->restart_count, restart_count, | |
2805 | (void *) trans->last_restarted_ip); | |
1c6fdbd8 KO |
2806 | } |
2807 | ||
67e0dd8f | 2808 | static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) |
1a21bf98 | 2809 | { |
67e0dd8f | 2810 | size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; |
3eb26d01 | 2811 | size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; |
dbd1e825 | 2812 | void *p = NULL; |
1a21bf98 KO |
2813 | |
2814 | BUG_ON(trans->used_mempool); | |
2815 | ||
dbd1e825 | 2816 | #ifdef __KERNEL__ |
67e0dd8f | 2817 | p = this_cpu_xchg(c->btree_paths_bufs->path , NULL); |
dbd1e825 KO |
2818 | #endif |
2819 | if (!p) | |
67e0dd8f | 2820 | p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); |
1a21bf98 | 2821 | |
67e0dd8f | 2822 | trans->paths = p; p += paths_bytes; |
1a21bf98 | 2823 | trans->updates = p; p += updates_bytes; |
1a21bf98 KO |
2824 | } |
2825 | ||
4aba7d45 KO |
2826 | static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct bch_fs *c, |
2827 | const char *fn) | |
2828 | { | |
2829 | unsigned i; | |
2830 | ||
2831 | for (i = 0; i < ARRAY_SIZE(c->btree_transaction_fns); i++) | |
2832 | if (!c->btree_transaction_fns[i] || | |
2833 | c->btree_transaction_fns[i] == fn) { | |
2834 | c->btree_transaction_fns[i] = fn; | |
2835 | return i; | |
2836 | } | |
2837 | ||
2838 | pr_warn_once("BCH_TRANSACTIONS_NR not big enough!"); | |
2839 | return i; | |
2840 | } | |
2841 | ||
616928c3 | 2842 | void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn) |
c0ebe3e4 | 2843 | __acquires(&c->btree_trans_barrier) |
1c6fdbd8 | 2844 | { |
5c0bb66a KO |
2845 | struct btree_transaction_stats *s; |
2846 | ||
ae1ede58 | 2847 | memset(trans, 0, sizeof(*trans)); |
1c6fdbd8 | 2848 | trans->c = c; |
669f87a5 | 2849 | trans->fn = fn; |
43de721a | 2850 | trans->last_begin_time = ktime_get_ns(); |
4aba7d45 | 2851 | trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn); |
33bd5d06 | 2852 | trans->locking_wait.task = current; |
5222a460 KO |
2853 | trans->journal_replay_not_finished = |
2854 | !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); | |
33bd5d06 | 2855 | closure_init_stack(&trans->ref); |
20bceecb | 2856 | |
67e0dd8f | 2857 | bch2_trans_alloc_paths(trans, c); |
20bceecb | 2858 | |
616928c3 KO |
2859 | s = btree_trans_stats(trans); |
2860 | if (s) { | |
2861 | unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); | |
2862 | ||
0b5c9f59 | 2863 | trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); |
e131b6aa KO |
2864 | |
2865 | if (!unlikely(trans->mem)) { | |
2866 | trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); | |
2867 | trans->mem_bytes = BTREE_TRANS_MEM_MAX; | |
2868 | } else { | |
0b5c9f59 | 2869 | trans->mem_bytes = expected_mem_bytes; |
e131b6aa | 2870 | } |
495aabed | 2871 | |
5c0bb66a | 2872 | trans->nr_max_paths = s->nr_max_paths; |
616928c3 | 2873 | } |
5c0bb66a | 2874 | |
876c7af3 KO |
2875 | trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); |
2876 | ||
b84d42c3 | 2877 | if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { |
50b13bee KO |
2878 | struct btree_trans *pos; |
2879 | ||
b84d42c3 | 2880 | mutex_lock(&c->btree_trans_lock); |
50b13bee | 2881 | list_for_each_entry(pos, &c->btree_trans_list, list) { |
33bd5d06 | 2882 | if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { |
50b13bee KO |
2883 | list_add_tail(&trans->list, &pos->list); |
2884 | goto list_add_done; | |
2885 | } | |
2886 | } | |
2887 | list_add_tail(&trans->list, &c->btree_trans_list); | |
2888 | list_add_done: | |
b84d42c3 KO |
2889 | mutex_unlock(&c->btree_trans_lock); |
2890 | } | |
1c6fdbd8 KO |
2891 | } |
2892 | ||
67e0dd8f KO |
2893 | static void check_btree_paths_leaked(struct btree_trans *trans) |
2894 | { | |
2895 | #ifdef CONFIG_BCACHEFS_DEBUG | |
2896 | struct bch_fs *c = trans->c; | |
2897 | struct btree_path *path; | |
2898 | ||
2899 | trans_for_each_path(trans, path) | |
2900 | if (path->ref) | |
2901 | goto leaked; | |
2902 | return; | |
2903 | leaked: | |
669f87a5 | 2904 | bch_err(c, "btree paths leaked from %s!", trans->fn); |
67e0dd8f KO |
2905 | trans_for_each_path(trans, path) |
2906 | if (path->ref) | |
2907 | printk(KERN_ERR " btree %s %pS\n", | |
2908 | bch2_btree_ids[path->btree_id], | |
2909 | (void *) path->ip_allocated); | |
2910 | /* Be noisy about this: */ | |
2911 | bch2_fatal_error(c); | |
2912 | #endif | |
2913 | } | |
2914 | ||
9a796fdb | 2915 | void bch2_trans_exit(struct btree_trans *trans) |
c0ebe3e4 | 2916 | __releases(&c->btree_trans_barrier) |
1c6fdbd8 | 2917 | { |
67e0dd8f | 2918 | struct btree_insert_entry *i; |
1a21bf98 | 2919 | struct bch_fs *c = trans->c; |
616928c3 | 2920 | struct btree_transaction_stats *s = btree_trans_stats(trans); |
1a21bf98 | 2921 | |
ece254b2 | 2922 | bch2_trans_unlock(trans); |
1c6fdbd8 | 2923 | |
33bd5d06 KO |
2924 | closure_sync(&trans->ref); |
2925 | ||
616928c3 KO |
2926 | if (s) |
2927 | s->max_mem = max(s->max_mem, trans->mem_max); | |
2928 | ||
67e0dd8f KO |
2929 | trans_for_each_update(trans, i) |
2930 | __btree_path_put(i->path, true); | |
2931 | trans->nr_updates = 0; | |
509d3e0a | 2932 | |
67e0dd8f | 2933 | check_btree_paths_leaked(trans); |
50dc0f69 | 2934 | |
b84d42c3 KO |
2935 | if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { |
2936 | mutex_lock(&c->btree_trans_lock); | |
2937 | list_del(&trans->list); | |
2938 | mutex_unlock(&c->btree_trans_lock); | |
2939 | } | |
495aabed | 2940 | |
876c7af3 KO |
2941 | srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); |
2942 | ||
67e0dd8f | 2943 | bch2_journal_preres_put(&c->journal, &trans->journal_preres); |
2ca88e5a | 2944 | |
2a6870ad KO |
2945 | kfree(trans->extra_journal_entries.data); |
2946 | ||
9620c3ec KO |
2947 | if (trans->fs_usage_deltas) { |
2948 | if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == | |
2949 | REPLICAS_DELTA_LIST_MAX) | |
2950 | mempool_free(trans->fs_usage_deltas, | |
67e0dd8f | 2951 | &c->replicas_delta_pool); |
9620c3ec KO |
2952 | else |
2953 | kfree(trans->fs_usage_deltas); | |
2954 | } | |
e131b6aa KO |
2955 | |
2956 | if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) | |
67e0dd8f | 2957 | mempool_free(trans->mem, &c->btree_trans_mem_pool); |
e131b6aa KO |
2958 | else |
2959 | kfree(trans->mem); | |
1a21bf98 | 2960 | |
dbd1e825 KO |
2961 | #ifdef __KERNEL__ |
2962 | /* | |
2963 | * Userspace doesn't have a real percpu implementation: | |
2964 | */ | |
67e0dd8f | 2965 | trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); |
dbd1e825 | 2966 | #endif |
e131b6aa | 2967 | |
67e0dd8f KO |
2968 | if (trans->paths) |
2969 | mempool_free(trans->paths, &c->btree_paths_pool); | |
1a21bf98 | 2970 | |
1c6fdbd8 | 2971 | trans->mem = (void *) 0x1; |
67e0dd8f | 2972 | trans->paths = (void *) 0x1; |
1c6fdbd8 | 2973 | } |
36e9d698 | 2974 | |
b3d1e6ca | 2975 | static void __maybe_unused |
afbc7194 KO |
2976 | bch2_btree_bkey_cached_common_to_text(struct printbuf *out, |
2977 | struct btree_bkey_cached_common *b) | |
d211b408 | 2978 | { |
c240c3a9 KO |
2979 | struct six_lock_count c = six_lock_counts(&b->lock); |
2980 | struct task_struct *owner; | |
2981 | pid_t pid; | |
2982 | ||
2983 | rcu_read_lock(); | |
2984 | owner = READ_ONCE(b->lock.owner); | |
2985 | pid = owner ? owner->pid : 0;; | |
2986 | rcu_read_unlock(); | |
2987 | ||
afbc7194 KO |
2988 | prt_tab(out); |
2989 | prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', | |
2990 | b->level, bch2_btree_ids[b->btree_id]); | |
4e6defd1 | 2991 | bch2_bpos_to_text(out, btree_node_pos(b)); |
c240c3a9 | 2992 | |
afbc7194 KO |
2993 | prt_tab(out); |
2994 | prt_printf(out, " locks %u:%u:%u held by pid %u", | |
c240c3a9 | 2995 | c.n[0], c.n[1], c.n[2], pid); |
d211b408 KO |
2996 | } |
2997 | ||
50b13bee | 2998 | void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) |
5c1d808a | 2999 | { |
67e0dd8f | 3000 | struct btree_path *path; |
86b74451 | 3001 | struct btree_bkey_cached_common *b; |
c7ce2732 | 3002 | static char lock_types[] = { 'r', 'i', 'w' }; |
495aabed KO |
3003 | unsigned l; |
3004 | ||
afbc7194 KO |
3005 | if (!out->nr_tabstops) { |
3006 | printbuf_tabstop_push(out, 16); | |
3007 | printbuf_tabstop_push(out, 32); | |
3008 | } | |
3009 | ||
33bd5d06 | 3010 | prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); |
495aabed | 3011 | |
50b13bee KO |
3012 | trans_for_each_path(trans, path) { |
3013 | if (!path->nodes_locked) | |
3014 | continue; | |
495aabed | 3015 | |
50b13bee KO |
3016 | prt_printf(out, " path %u %c l=%u %s:", |
3017 | path->idx, | |
3018 | path->cached ? 'c' : 'b', | |
3019 | path->level, | |
3020 | bch2_btree_ids[path->btree_id]); | |
3021 | bch2_bpos_to_text(out, path->pos); | |
afbc7194 | 3022 | prt_newline(out); |
50b13bee KO |
3023 | |
3024 | for (l = 0; l < BTREE_MAX_DEPTH; l++) { | |
86b74451 | 3025 | if (btree_node_locked(path, l) && |
315c9ba6 | 3026 | !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { |
d4263e56 KO |
3027 | prt_printf(out, " %c l=%u ", |
3028 | lock_types[btree_node_locked_type(path, l)], l); | |
afbc7194 KO |
3029 | bch2_btree_bkey_cached_common_to_text(out, b); |
3030 | prt_newline(out); | |
495aabed KO |
3031 | } |
3032 | } | |
50b13bee | 3033 | } |
495aabed | 3034 | |
50b13bee KO |
3035 | b = READ_ONCE(trans->locking); |
3036 | if (b) { | |
afbc7194 KO |
3037 | prt_str(out, " want"); |
3038 | prt_newline(out); | |
3039 | prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); | |
3040 | bch2_btree_bkey_cached_common_to_text(out, b); | |
3041 | prt_newline(out); | |
495aabed | 3042 | } |
495aabed KO |
3043 | } |
3044 | ||
36e9d698 KO |
3045 | void bch2_fs_btree_iter_exit(struct bch_fs *c) |
3046 | { | |
4aba7d45 KO |
3047 | struct btree_transaction_stats *s; |
3048 | ||
3049 | for (s = c->btree_transaction_stats; | |
3050 | s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); | |
5c0bb66a KO |
3051 | s++) { |
3052 | kfree(s->max_paths_text); | |
4aba7d45 | 3053 | bch2_time_stats_exit(&s->lock_hold_times); |
5c0bb66a | 3054 | } |
4aba7d45 | 3055 | |
99fafb04 KO |
3056 | if (c->btree_trans_barrier_initialized) |
3057 | cleanup_srcu_struct(&c->btree_trans_barrier); | |
e131b6aa | 3058 | mempool_exit(&c->btree_trans_mem_pool); |
67e0dd8f | 3059 | mempool_exit(&c->btree_paths_pool); |
36e9d698 KO |
3060 | } |
3061 | ||
3062 | int bch2_fs_btree_iter_init(struct bch_fs *c) | |
3063 | { | |
4aba7d45 | 3064 | struct btree_transaction_stats *s; |
36e9d698 | 3065 | unsigned nr = BTREE_ITER_MAX; |
99fafb04 | 3066 | int ret; |
36e9d698 | 3067 | |
4aba7d45 KO |
3068 | for (s = c->btree_transaction_stats; |
3069 | s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); | |
5c0bb66a | 3070 | s++) { |
4aba7d45 | 3071 | bch2_time_stats_init(&s->lock_hold_times); |
5c0bb66a KO |
3072 | mutex_init(&s->lock); |
3073 | } | |
4aba7d45 | 3074 | |
495aabed KO |
3075 | INIT_LIST_HEAD(&c->btree_trans_list); |
3076 | mutex_init(&c->btree_trans_lock); | |
3077 | ||
99fafb04 | 3078 | ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, |
67e0dd8f | 3079 | sizeof(struct btree_path) * nr + |
e131b6aa KO |
3080 | sizeof(struct btree_insert_entry) * nr) ?: |
3081 | mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, | |
99fafb04 KO |
3082 | BTREE_TRANS_MEM_MAX) ?: |
3083 | init_srcu_struct(&c->btree_trans_barrier); | |
3084 | if (!ret) | |
3085 | c->btree_trans_barrier_initialized = true; | |
3086 | return ret; | |
36e9d698 | 3087 | } |