// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"
#include "trace.h"

static void verify_no_dups(struct btree *b,
			   struct bkey_packed *start,
			   struct bkey_packed *end)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bkey_packed *k;

	for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
		struct bkey l = bkey_unpack_key(b, k);
		struct bkey r = bkey_unpack_key(b, bkey_next(k));

		BUG_ON(btree_node_is_extents(b)
		       ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
		       : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
		//BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
	}
#endif
}

static void clear_needs_whiteout(struct bset *i)
{
	struct bkey_packed *k;

	for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
		k->needs_whiteout = false;
}

static void set_needs_whiteout(struct bset *i)
{
	struct bkey_packed *k;

	for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
		k->needs_whiteout = true;
}

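/*
 * Bounce buffers for sorting and compacting: first try a cheap non-blocking
 * page allocation; if that fails, fall back to the mempool, which is
 * guaranteed to make forward progress under GFP_NOIO.
 */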
static void btree_bounce_free(struct bch_fs *c, unsigned order,
			      bool used_mempool, void *p)
{
	if (used_mempool)
		mempool_free(p, &c->btree_bounce_pool);
	else
		vpfree(p, PAGE_SIZE << order);
}

static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
				bool *used_mempool)
{
	void *p;

	BUG_ON(order > btree_page_order(c));

	*used_mempool = false;
	p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
	if (p)
		return p;

	*used_mempool = true;
	return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
}

static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
				    bool compacting,
				    enum compact_mode mode)
{
	unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
	unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];

	if (mode == COMPACT_LAZY) {
		if (should_compact_bset_lazy(b, t) ||
		    (compacting && !bset_written(b, bset(b, t))))
			return dead_u64s;
	} else {
		if (bset_written(b, bset(b, t)))
			return dead_u64s;
	}

	return 0;
}

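/*
 * Compact a btree node: drop dead keys, and move whiteouts (deletion markers
 * that still have to make it to disk) out of the bsets and into the node's
 * unwritten whiteout area, where they're kept sorted; unwritten bsets are
 * also slid down to close any gaps this opens up.
 */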
bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
			      enum compact_mode mode)
{
	const struct bkey_format *f = &b->format;
	struct bset_tree *t;
	struct bkey_packed *whiteouts = NULL;
	struct bkey_packed *u_start, *u_pos;
	struct sort_iter sort_iter;
	unsigned order, whiteout_u64s = 0, u64s;
	bool used_mempool, compacting = false;

	for_each_bset(b, t)
		whiteout_u64s += should_compact_bset(b, t,
					whiteout_u64s != 0, mode);

	if (!whiteout_u64s)
		return false;

	sort_iter_init(&sort_iter, b);

	whiteout_u64s += b->whiteout_u64s;
	order = get_order(whiteout_u64s * sizeof(u64));

	whiteouts = btree_bounce_alloc(c, order, &used_mempool);
	u_start = u_pos = whiteouts;

	memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
		    b->whiteout_u64s);
	u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);

	sort_iter_add(&sort_iter, u_start, u_pos);

	for_each_bset(b, t) {
		struct bset *i = bset(b, t);
		struct bkey_packed *k, *n, *out, *start, *end;
		struct btree_node_entry *src = NULL, *dst = NULL;

		if (t != b->set && !bset_written(b, i)) {
			src = container_of(i, struct btree_node_entry, keys);
			dst = max(write_block(b),
				  (void *) btree_bkey_last(b, t - 1));
		}

		if (!should_compact_bset(b, t, compacting, mode)) {
			if (src != dst) {
				memmove(dst, src, sizeof(*src) +
					le16_to_cpu(src->keys.u64s) *
					sizeof(u64));
				i = &dst->keys;
				set_btree_bset(b, t, i);
			}
			continue;
		}

		compacting = true;
		u_start = u_pos;
		start = i->start;
		end = vstruct_last(i);

		if (src != dst) {
			memmove(dst, src, sizeof(*src));
			i = &dst->keys;
			set_btree_bset(b, t, i);
		}

		out = i->start;

		for (k = start; k != end; k = n) {
			n = bkey_next(k);

			if (bkey_deleted(k) && btree_node_is_extents(b))
				continue;

			if (bkey_whiteout(k) && !k->needs_whiteout)
				continue;

			if (bkey_whiteout(k)) {
				unreserve_whiteout(b, k);
				memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
				set_bkeyp_val_u64s(f, u_pos, 0);
				u_pos = bkey_next(u_pos);
			} else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
				bkey_copy(out, k);
				out = bkey_next(out);
			}
		}

		sort_iter_add(&sort_iter, u_start, u_pos);

		if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
			i->u64s = cpu_to_le16((u64 *) out - i->_data);
			set_btree_bset_end(b, t);
			bch2_bset_set_no_aux_tree(b, t);
		}
	}

	b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;

	BUG_ON((void *) unwritten_whiteouts_start(c, b) <
	       (void *) btree_bkey_last(b, bset_tree_last(b)));

	u64s = (btree_node_is_extents(b)
		? bch2_sort_extent_whiteouts
		: bch2_sort_key_whiteouts)(unwritten_whiteouts_start(c, b),
					   &sort_iter);

	BUG_ON(u64s > b->whiteout_u64s);
	BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
	BUG_ON(u_pos != whiteouts && !u64s);

	if (u64s != b->whiteout_u64s) {
		void *src = unwritten_whiteouts_start(c, b);

		b->whiteout_u64s = u64s;
		memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
	}

	verify_no_dups(b,
		       unwritten_whiteouts_start(c, b),
		       unwritten_whiteouts_end(c, b));

	btree_bounce_free(c, order, used_mempool, whiteouts);

	if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
		bch2_btree_build_aux_trees(b);

	bch_btree_keys_u64s_remaining(c, b);
	bch2_verify_btree_nr_keys(b);

	return true;
}

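/*
 * Strip whiteouts back out of a node's bsets: done after a write, once the
 * whiteouts have made it to disk and no longer need to be kept around to be
 * written out again.
 */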
static bool bch2_drop_whiteouts(struct btree *b)
{
	struct bset_tree *t;
	bool ret = false;

	for_each_bset(b, t) {
		struct bset *i = bset(b, t);
		struct bkey_packed *k, *n, *out, *start, *end;

		if (!should_compact_bset(b, t, true, COMPACT_WRITTEN))
			continue;

		start = btree_bkey_first(b, t);
		end = btree_bkey_last(b, t);

		if (!bset_written(b, i) &&
		    t != b->set) {
			struct bset *dst =
			       max_t(struct bset *, write_block(b),
				     (void *) btree_bkey_last(b, t - 1));

			memmove(dst, i, sizeof(struct bset));
			i = dst;
			set_btree_bset(b, t, i);
		}

		out = i->start;

		for (k = start; k != end; k = n) {
			n = bkey_next(k);

			if (!bkey_whiteout(k)) {
				bkey_copy(out, k);
				out = bkey_next(out);
			}
		}

		i->u64s = cpu_to_le16((u64 *) out - i->_data);
		bch2_bset_set_no_aux_tree(b, t);
		ret = true;
	}

	bch2_verify_btree_nr_keys(b);

	return ret;
}

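/*
 * Merge sort bsets [start_idx, end_idx) into a single bset, via a bounce
 * buffer: when sorting the entire node we allocate a full-size buffer and
 * just swap it with the node's buffer afterwards, instead of copying the
 * result back.
 */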
static void btree_node_sort(struct bch_fs *c, struct btree *b,
			    struct btree_iter *iter,
			    unsigned start_idx,
			    unsigned end_idx,
			    bool filter_whiteouts)
{
	struct btree_node *out;
	struct sort_iter sort_iter;
	struct bset_tree *t;
	struct bset *start_bset = bset(b, &b->set[start_idx]);
	bool used_mempool = false;
	u64 start_time, seq = 0;
	unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
	bool sorting_entire_node = start_idx == 0 &&
		end_idx == b->nsets;

	sort_iter_init(&sort_iter, b);

	for (t = b->set + start_idx;
	     t < b->set + end_idx;
	     t++) {
		u64s += le16_to_cpu(bset(b, t)->u64s);
		sort_iter_add(&sort_iter,
			      btree_bkey_first(b, t),
			      btree_bkey_last(b, t));
	}

	order = sorting_entire_node
		? btree_page_order(c)
		: get_order(__vstruct_bytes(struct btree_node, u64s));

	out = btree_bounce_alloc(c, order, &used_mempool);

	start_time = local_clock();

	if (btree_node_is_extents(b))
		filter_whiteouts = bset_written(b, start_bset);

	u64s = (btree_node_is_extents(b)
		? bch2_sort_extents
		: bch2_sort_keys)(out->keys.start,
				  &sort_iter,
				  filter_whiteouts);

	out->keys.u64s = cpu_to_le16(u64s);

	BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));

	if (sorting_entire_node)
		bch2_time_stats_update(&c->times[BCH_TIME_btree_sort],
				       start_time);

	/* Make sure we preserve bset journal_seq: */
	for (t = b->set + start_idx; t < b->set + end_idx; t++)
		seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
	start_bset->journal_seq = cpu_to_le64(seq);

	if (sorting_entire_node) {
		unsigned u64s = le16_to_cpu(out->keys.u64s);

		BUG_ON(order != btree_page_order(c));

		/*
		 * Our temporary buffer is the same size as the btree node's
		 * buffer, we can just swap buffers instead of doing a big
		 * memcpy()
		 */
		*out = *b->data;
		out->keys.u64s = cpu_to_le16(u64s);
		swap(out, b->data);
		set_btree_bset(b, b->set, &b->data->keys);
	} else {
		start_bset->u64s = out->keys.u64s;
		memcpy_u64s(start_bset->start,
			    out->keys.start,
			    le16_to_cpu(out->keys.u64s));
	}

	for (i = start_idx + 1; i < end_idx; i++)
		b->nr.bset_u64s[start_idx] +=
			b->nr.bset_u64s[i];

	b->nsets -= shift;

	for (i = start_idx + 1; i < b->nsets; i++) {
		b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift];
		b->set[i] = b->set[i + shift];
	}

	for (i = b->nsets; i < MAX_BSETS; i++)
		b->nr.bset_u64s[i] = 0;

	set_btree_bset_end(b, &b->set[start_idx]);
	bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);

	btree_bounce_free(c, order, used_mempool, out);

	bch2_verify_btree_nr_keys(b);
}

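/*
 * Sort (and repack) all of @src's keys into @dst, in @dst's format, merging
 * adjacent extents where possible:
 */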
void bch2_btree_sort_into(struct bch_fs *c,
			  struct btree *dst,
			  struct btree *src)
{
	struct btree_nr_keys nr;
	struct btree_node_iter src_iter;
	u64 start_time = local_clock();

	BUG_ON(dst->nsets != 1);

	bch2_bset_set_no_aux_tree(dst, dst->set);

	bch2_btree_node_iter_init_from_start(&src_iter, src);

	if (btree_node_is_extents(src))
		nr = bch2_sort_repack_merge(c, btree_bset_first(dst),
					    src, &src_iter,
					    &dst->format,
					    true);
	else
		nr = bch2_sort_repack(btree_bset_first(dst),
				      src, &src_iter,
				      &dst->format,
				      true);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time);

	set_btree_bset_end(dst, dst->set);

	dst->nr.live_u64s	+= nr.live_u64s;
	dst->nr.bset_u64s[0]	+= nr.bset_u64s[0];
	dst->nr.packed_keys	+= nr.packed_keys;
	dst->nr.unpacked_keys	+= nr.unpacked_keys;

	bch2_verify_btree_nr_keys(dst);
}

#define SORT_CRIT	(4096 / sizeof(u64))

/*
 * We're about to add another bset to the btree node, so if there's currently
 * too many bsets - sort some of them together:
 */
static bool btree_node_compact(struct bch_fs *c, struct btree *b,
			       struct btree_iter *iter)
{
	unsigned unwritten_idx;
	bool ret = false;

	for (unwritten_idx = 0;
	     unwritten_idx < b->nsets;
	     unwritten_idx++)
		if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
			break;

	if (b->nsets - unwritten_idx > 1) {
		btree_node_sort(c, b, iter, unwritten_idx,
				b->nsets, false);
		ret = true;
	}

	if (unwritten_idx > 1) {
		btree_node_sort(c, b, iter, 0, unwritten_idx, false);
		ret = true;
	}

	return ret;
}

void bch2_btree_build_aux_trees(struct btree *b)
{
	struct bset_tree *t;

	for_each_bset(b, t)
		bch2_bset_build_aux_tree(b, t,
				!bset_written(b, bset(b, t)) &&
				t == bset_tree_last(b));
}

/*
 * @bch2_btree_init_next - initialize a new (unwritten) bset that can then be
 * inserted into
 *
 * Safe to call if there already is an unwritten bset - will only add a new
 * bset if @b doesn't already have one.
 *
 * If this sorts (i.e. invalidates iterators), @iter is reinitialized.
 */
void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
			  struct btree_iter *iter)
{
	struct btree_node_entry *bne;
	bool did_sort;

	EBUG_ON(!(b->lock.state.seq & 1));
	EBUG_ON(iter && iter->l[b->level].b != b);

	did_sort = btree_node_compact(c, b, iter);

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(c, b, bne);

	bch2_btree_build_aux_trees(b);

	if (iter && did_sort)
		bch2_btree_iter_reinit_node(iter, b);
}

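/*
 * Nonces for btree node encryption: derived from the bset's offset within
 * the node plus the bset's seq and journal_seq fields, so that every bset
 * in every node is encrypted with a distinct nonce:
 */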
static struct nonce btree_nonce(struct bset *i, unsigned offset)
{
	return (struct nonce) {{
		[0] = cpu_to_le32(offset),
		[1] = ((__le32 *) &i->seq)[0],
		[2] = ((__le32 *) &i->seq)[1],
		[3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
	}};
}

static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
{
	struct nonce nonce = btree_nonce(i, offset);

	if (!offset) {
		struct btree_node *bn = container_of(i, struct btree_node, keys);
		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;

		bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
			     bytes);

		nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
	}

	bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
		     vstruct_end(i) - (void *) i->_data);
}

static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
			  struct btree *b, struct bset *i,
			  unsigned offset, int write)
{
	pr_buf(out, "error validating btree node %s"
	       "at btree %u level %u/%u\n"
	       "pos %llu:%llu node offset %u",
	       write ? "before write " : "",
	       b->btree_id, b->level,
	       c->btree_roots[b->btree_id].level,
	       b->key.k.p.inode, b->key.k.p.offset,
	       b->written);
	if (i)
		pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
}

enum btree_err_type {
	BTREE_ERR_FIXABLE,
	BTREE_ERR_WANT_RETRY,
	BTREE_ERR_MUST_RETRY,
	BTREE_ERR_FATAL,
};

enum btree_validate_ret {
	BTREE_RETRY_READ = 64,
};

#define btree_err(type, c, b, i, msg, ...)				\
({									\
	__label__ out;							\
	char _buf[300];							\
	struct printbuf out = PBUF(_buf);				\
									\
	btree_err_msg(&out, c, b, i, b->written, write);		\
	pr_buf(&out, ": " msg, ##__VA_ARGS__);				\
									\
	if (type == BTREE_ERR_FIXABLE &&				\
	    write == READ &&						\
	    !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {		\
		mustfix_fsck_err(c, "%s", _buf);			\
		goto out;						\
	}								\
									\
	switch (write) {						\
	case READ:							\
		bch_err(c, "%s", _buf);					\
									\
		switch (type) {						\
		case BTREE_ERR_FIXABLE:					\
			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
			goto fsck_err;					\
		case BTREE_ERR_WANT_RETRY:				\
			if (have_retry) {				\
				ret = BTREE_RETRY_READ;			\
				goto fsck_err;				\
			}						\
			break;						\
		case BTREE_ERR_MUST_RETRY:				\
			ret = BTREE_RETRY_READ;				\
			goto fsck_err;					\
		case BTREE_ERR_FATAL:					\
			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
			goto fsck_err;					\
		}							\
		break;							\
	case WRITE:							\
		bch_err(c, "corrupt metadata before write: %s", _buf);	\
									\
		if (bch2_fs_inconsistent(c)) {				\
			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
out:									\
	true;								\
})

#define btree_err_on(cond, ...)	((cond) ? btree_err(__VA_ARGS__) : false)

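/*
 * Validate a single bset: check that it belongs to this node, that the
 * on-disk version is supported, then walk the keys validating each one,
 * dropping (or truncating the bset at) any that are invalid:
 */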
static int validate_bset(struct bch_fs *c, struct btree *b,
			 struct bset *i, unsigned sectors,
			 unsigned *whiteout_u64s, int write,
			 bool have_retry)
{
	struct bkey_packed *k, *prev = NULL;
	struct bpos prev_pos = POS_MIN;
	bool seen_non_whiteout = false;
	unsigned version;
	const char *err;
	int ret = 0;

	if (i == &b->data->keys) {
		/* These indicate that we read the wrong btree node: */
		btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id,
			     BTREE_ERR_MUST_RETRY, c, b, i,
			     "incorrect btree id");

		btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level,
			     BTREE_ERR_MUST_RETRY, c, b, i,
			     "incorrect level");

		if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
			u64 *p = (u64 *) &b->data->ptr;

			*p = swab64(*p);
			bch2_bpos_swab(&b->data->min_key);
			bch2_bpos_swab(&b->data->max_key);
		}

		btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p),
			     BTREE_ERR_MUST_RETRY, c, b, i,
			     "incorrect max key");

		/* XXX: ideally we would be validating min_key too */
#if 0
		/*
		 * not correct anymore, due to btree node write error
		 * handling
		 *
		 * need to add b->data->seq to btree keys and verify
		 * against that
		 */
		btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
						  b->data->ptr),
			     BTREE_ERR_FATAL, c, b, i,
			     "incorrect backpointer");
#endif
		err = bch2_bkey_format_validate(&b->data->format);
		btree_err_on(err,
			     BTREE_ERR_FATAL, c, b, i,
			     "invalid bkey format: %s", err);
	}

	version = le16_to_cpu(i->version);
	btree_err_on((version != BCH_BSET_VERSION_OLD &&
		      version < bcachefs_metadata_version_min) ||
		     version >= bcachefs_metadata_version_max,
		     BTREE_ERR_FATAL, c, b, i,
		     "unsupported bset version");

	if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
			 BTREE_ERR_FIXABLE, c, b, i,
			 "bset past end of btree node")) {
		i->u64s = 0;
		return 0;
	}

	btree_err_on(b->written && !i->u64s,
		     BTREE_ERR_FIXABLE, c, b, i,
		     "empty bset");

	if (!BSET_SEPARATE_WHITEOUTS(i)) {
		seen_non_whiteout = true;
		*whiteout_u64s = 0;
	}

	for (k = i->start;
	     k != vstruct_last(i);) {
		struct bkey_s_c u;
		struct bkey tmp;
		const char *invalid;

		if (btree_err_on(!k->u64s,
				 BTREE_ERR_FIXABLE, c, b, i,
				 "KEY_U64s 0: %zu bytes of metadata lost",
				 vstruct_end(i) - (void *) k)) {
			i->u64s = cpu_to_le16((u64 *) k - i->_data);
			break;
		}

		if (btree_err_on(bkey_next(k) > vstruct_last(i),
				 BTREE_ERR_FIXABLE, c, b, i,
				 "key extends past end of bset")) {
			i->u64s = cpu_to_le16((u64 *) k - i->_data);
			break;
		}

		if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
				 BTREE_ERR_FIXABLE, c, b, i,
				 "invalid bkey format %u", k->format)) {
			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
			memmove_u64s_down(k, bkey_next(k),
					  (u64 *) vstruct_end(i) - (u64 *) k);
			continue;
		}

		if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
			bch2_bkey_swab(&b->format, k);

		if (!write &&
		    version < bcachefs_metadata_version_bkey_renumber)
			bch2_bkey_renumber(btree_node_type(b), k, write);

		u = bkey_disassemble(b, k, &tmp);

		invalid = __bch2_bkey_invalid(c, u, btree_node_type(b)) ?:
			bch2_bkey_in_btree_node(b, u) ?:
			(write ? bch2_bkey_val_invalid(c, u) : NULL);
		if (invalid) {
			char buf[160];

			bch2_bkey_val_to_text(&PBUF(buf), c, u);
			btree_err(BTREE_ERR_FIXABLE, c, b, i,
				  "invalid bkey:\n%s\n%s", invalid, buf);

			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
			memmove_u64s_down(k, bkey_next(k),
					  (u64 *) vstruct_end(i) - (u64 *) k);
			continue;
		}

		if (write &&
		    version < bcachefs_metadata_version_bkey_renumber)
			bch2_bkey_renumber(btree_node_type(b), k, write);

		/*
		 * with the separate whiteouts thing (used for extents), the
		 * second set of keys actually can have whiteouts too, so we
		 * can't solely go off bkey_whiteout()...
		 */

		if (!seen_non_whiteout &&
		    (!bkey_whiteout(k) ||
		     (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
			*whiteout_u64s = k->_data - i->_data;
			seen_non_whiteout = true;
		} else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
			btree_err(BTREE_ERR_FATAL, c, b, i,
				  "keys out of order: %llu:%llu > %llu:%llu",
				  prev_pos.inode,
				  prev_pos.offset,
				  u.k->p.inode,
				  bkey_start_offset(u.k));
			/* XXX: repair this */
		}

		prev_pos = u.k->p;
		prev = k;
		k = bkey_next(k);
	}

	SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
fsck_err:
	return ret;
}

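/*
 * Read path for an entire btree node: validate each bset in turn (verify the
 * checksum, decrypt, then validate_bset()), skipping bsets whose journal
 * sequence numbers have been blacklisted, then merge sort everything into a
 * single bset, resolving any overlaps between bsets left by unclean shutdown:
 */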
int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
{
	struct btree_node_entry *bne;
	struct btree_node_iter_large *iter;
	struct btree_node *sorted;
	struct bkey_packed *k;
	struct bset *i;
	bool used_mempool;
	unsigned u64s;
	int ret, retry_read = 0, write = READ;

	iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
	iter->used = 0;

	if (bch2_meta_read_fault("btree"))
		btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
			  "dynamic fault");

	btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
		     BTREE_ERR_MUST_RETRY, c, b, NULL,
		     "bad magic");

	btree_err_on(!b->data->keys.seq,
		     BTREE_ERR_MUST_RETRY, c, b, NULL,
		     "bad btree header");

	while (b->written < c->opts.btree_node_size) {
		unsigned sectors, whiteout_u64s = 0;
		struct nonce nonce;
		struct bch_csum csum;
		bool first = !b->written;

		if (!b->written) {
			i = &b->data->keys;

			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
				     BTREE_ERR_WANT_RETRY, c, b, i,
				     "unknown checksum type");

			nonce = btree_nonce(i, b->written << 9);
			csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);

			btree_err_on(bch2_crc_cmp(csum, b->data->csum),
				     BTREE_ERR_WANT_RETRY, c, b, i,
				     "invalid checksum");

			bset_encrypt(c, i, b->written << 9);

			sectors = vstruct_sectors(b->data, c->block_bits);

			btree_node_set_format(b, b->data->format);
		} else {
			bne = write_block(b);
			i = &bne->keys;

			if (i->seq != b->data->keys.seq)
				break;

			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
				     BTREE_ERR_WANT_RETRY, c, b, i,
				     "unknown checksum type");

			nonce = btree_nonce(i, b->written << 9);
			csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);

			btree_err_on(bch2_crc_cmp(csum, bne->csum),
				     BTREE_ERR_WANT_RETRY, c, b, i,
				     "invalid checksum");

			bset_encrypt(c, i, b->written << 9);

			sectors = vstruct_sectors(bne, c->block_bits);
		}

		ret = validate_bset(c, b, i, sectors, &whiteout_u64s,
				    READ, have_retry);
		if (ret)
			goto fsck_err;

		b->written += sectors;

		ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
		if (ret < 0) {
			btree_err(BTREE_ERR_FATAL, c, b, i,
				  "insufficient memory");
			goto err;
		}

		if (ret) {
			btree_err_on(first,
				     BTREE_ERR_FIXABLE, c, b, i,
				     "first btree node bset has blacklisted journal seq");
			if (!first)
				continue;
		}

		bch2_btree_node_iter_large_push(iter, b,
					i->start,
					vstruct_idx(i, whiteout_u64s));

		bch2_btree_node_iter_large_push(iter, b,
					vstruct_idx(i, whiteout_u64s),
					vstruct_last(i));
	}

	for (bne = write_block(b);
	     bset_byte_offset(b, bne) < btree_bytes(c);
	     bne = (void *) bne + block_bytes(c))
		btree_err_on(bne->keys.seq == b->data->keys.seq,
			     BTREE_ERR_WANT_RETRY, c, b, NULL,
			     "found bset signature after last bset");

	sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
	sorted->keys.u64s = 0;

	set_btree_bset(b, b->set, &b->data->keys);

	b->nr = btree_node_is_extents(b)
		? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
		: bch2_key_sort_fix_overlapping(&sorted->keys, b, iter);

	u64s = le16_to_cpu(sorted->keys.u64s);
	*sorted = *b->data;
	sorted->keys.u64s = cpu_to_le16(u64s);
	swap(sorted, b->data);
	set_btree_bset(b, b->set, &b->data->keys);
	b->nsets = 1;

	BUG_ON(b->nr.live_u64s != u64s);

	btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);

	i = &b->data->keys;
	for (k = i->start; k != vstruct_last(i);) {
		struct bkey tmp;
		struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
		const char *invalid = bch2_bkey_val_invalid(c, u);

		if (invalid ||
		    (inject_invalid_keys(c) &&
		     !bversion_cmp(u.k->version, MAX_VERSION))) {
			char buf[160];

			bch2_bkey_val_to_text(&PBUF(buf), c, u);
			btree_err(BTREE_ERR_FIXABLE, c, b, i,
				  "invalid bkey %s: %s", buf, invalid);

			btree_keys_account_key_drop(&b->nr, 0, k);

			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
			memmove_u64s_down(k, bkey_next(k),
					  (u64 *) vstruct_end(i) - (u64 *) k);
			set_btree_bset_end(b, b->set);
			continue;
		}

		k = bkey_next(k);
	}

	bch2_bset_build_aux_tree(b, b->set, false);

	set_needs_whiteout(btree_bset_first(b));

	btree_node_reset_sib_u64s(b);
out:
	mempool_free(iter, &c->fill_iter);
	return retry_read;
err:
fsck_err:
	if (ret == BTREE_RETRY_READ) {
		retry_read = 1;
	} else {
		bch2_inconsistent_error(c);
		set_btree_node_read_error(b);
	}
	goto out;
}

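/*
 * If a read fails (IO error, bad checksum, or a validation failure that
 * warrants a retry), mark the device we read from as failed and keep
 * retrying from the remaining replicas until we run out:
 */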
static void btree_node_read_work(struct work_struct *work)
{
	struct btree_read_bio *rb =
		container_of(work, struct btree_read_bio, work);
	struct bch_fs *c	= rb->c;
	struct bch_dev *ca	= bch_dev_bkey_exists(c, rb->pick.ptr.dev);
	struct btree *b		= rb->bio.bi_private;
	struct bio *bio		= &rb->bio;
	struct bch_io_failures failed = { .nr = 0 };
	bool can_retry;

	goto start;
	while (1) {
		bch_info(c, "retrying read");
		ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
		rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
		bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
		bio->bi_iter.bi_sector	= rb->pick.ptr.offset;
		bio->bi_iter.bi_size	= btree_bytes(c);

		if (rb->have_ioref) {
			bio_set_dev(bio, ca->disk_sb.bdev);
			submit_bio_wait(bio);
		} else {
			bio->bi_status = BLK_STS_REMOVED;
		}
start:
		bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
		if (rb->have_ioref)
			percpu_ref_put(&ca->io_ref);
		rb->have_ioref = false;

		bch2_mark_io_failure(&failed, &rb->pick);

		can_retry = bch2_bkey_pick_read_device(c,
					bkey_i_to_s_c(&b->key),
					&failed, &rb->pick) > 0;

		if (!bio->bi_status &&
		    !bch2_btree_node_read_done(c, b, can_retry))
			break;

		if (!can_retry) {
			set_btree_node_read_error(b);
			break;
		}
	}

	bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time);
	bio_put(&rb->bio);
	clear_btree_node_read_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}

static void btree_node_read_endio(struct bio *bio)
{
	struct btree_read_bio *rb =
		container_of(bio, struct btree_read_bio, bio);
	struct bch_fs *c = rb->c;

	if (rb->have_ioref) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
		bch2_latency_acct(ca, rb->start_time, READ);
	}

	queue_work(system_unbound_wq, &rb->work);
}

void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
			  bool sync)
{
	struct extent_ptr_decoded pick;
	struct btree_read_bio *rb;
	struct bch_dev *ca;
	struct bio *bio;
	int ret;

	trace_btree_read(c, b);

	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
					 NULL, &pick);
	if (bch2_fs_fatal_err_on(ret <= 0, c,
			"btree node read error: no device to read from")) {
		set_btree_node_read_error(b);
		return;
	}

	ca = bch_dev_bkey_exists(c, pick.ptr.dev);

	bio = bio_alloc_bioset(NULL,
			       buf_pages(b->data, btree_bytes(c)),
			       REQ_OP_READ|REQ_SYNC|REQ_META,
			       GFP_NOIO,
			       &c->btree_bio);
	rb = container_of(bio, struct btree_read_bio, bio);
	rb->c			= c;
	rb->start_time		= local_clock();
	rb->have_ioref		= bch2_dev_get_ioref(ca, READ);
	rb->pick		= pick;
	INIT_WORK(&rb->work, btree_node_read_work);
	bio->bi_iter.bi_sector	= pick.ptr.offset;
	bio->bi_iter.bi_size	= btree_bytes(c);
	bio->bi_end_io		= btree_node_read_endio;
	bio->bi_private		= b;
	bch2_bio_map(bio, b->data);

	set_btree_node_read_in_flight(b);

	if (rb->have_ioref) {
		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
			     bio_sectors(bio));
		bio_set_dev(bio, ca->disk_sb.bdev);

		if (sync) {
			submit_bio_wait(bio);

			bio->bi_private = b;
			btree_node_read_work(&rb->work);
		} else {
			submit_bio(bio);
		}
	} else {
		bio->bi_status = BLK_STS_REMOVED;

		if (sync)
			btree_node_read_work(&rb->work);
		else
			queue_work(system_unbound_wq, &rb->work);
	}
}

int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
			 const struct bkey_i *k, unsigned level)
{
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(c);
	bch2_btree_cache_cannibalize_unlock(c);

	BUG_ON(IS_ERR(b));

	bkey_copy(&b->key, k);
	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));

	bch2_btree_node_read(c, b, true);

	if (btree_node_read_error(b)) {
		bch2_btree_node_hash_remove(&c->btree_cache, b);

		mutex_lock(&c->btree_cache.lock);
		list_move(&b->list, &c->btree_cache.freeable);
		mutex_unlock(&c->btree_cache.lock);

		ret = -EIO;
		goto err;
	}

	bch2_btree_set_root_for_read(c, b);
err:
	six_unlock_write(&b->lock);
	six_unlock_intent(&b->lock);

	return ret;
}

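/*
 * b->will_make_reachable packs a pointer to the btree_update that will make
 * this node reachable, with bit 0 set while that update still holds a
 * closure ref; on write completion we clear the bit and drop the ref:
 */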
void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
			       struct btree_write *w)
{
	unsigned long old, new, v = READ_ONCE(b->will_make_reachable);

	do {
		old = new = v;
		if (!(old & 1))
			break;

		new &= ~1UL;
	} while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);

	if (old & 1)
		closure_put(&((struct btree_update *) new)->cl);

	bch2_journal_pin_drop(&c->journal, &w->journal);
	closure_wake_up(&w->wait);
}

static void btree_node_write_done(struct bch_fs *c, struct btree *b)
{
	struct btree_write *w = btree_prev_write(b);

	bch2_btree_complete_write(c, b, w);
	btree_node_io_unlock(b);
}

static void bch2_btree_node_write_error(struct bch_fs *c,
					struct btree_write_bio *wbio)
{
	struct btree *b		= wbio->wbio.bio.bi_private;
	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
	struct bkey_i_btree_ptr *new_key;
	struct bkey_s_btree_ptr bp;
	struct bch_extent_ptr *ptr;
	struct btree_iter iter;
	int ret;

	__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
			       BTREE_MAX_DEPTH,
			       b->level, BTREE_ITER_NODES);
retry:
	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto err;

	/* has the node been freed? */
	if (iter.l[b->level].b != b) {
		BUG_ON(!btree_node_dying(b));
		goto out;
	}

	BUG_ON(!btree_node_hashed(b));

	bkey_copy(&tmp.k, &b->key);

	new_key = bkey_i_to_btree_ptr(&tmp.k);
	bp = btree_ptr_i_to_s(new_key);

	bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr,
		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));

	if (!bch2_bkey_nr_ptrs(bp.s_c))
		goto err;

	ret = bch2_btree_node_update_key(c, &iter, b, new_key);
	if (ret == -EINTR)
		goto retry;
	if (ret)
		goto err;
out:
	bch2_btree_iter_unlock(&iter);
	bio_put(&wbio->wbio.bio);
	btree_node_write_done(c, b);
	return;
err:
	set_btree_node_noevict(b);
	bch2_fs_fatal_error(c, "fatal error writing btree node");
	goto out;
}

void bch2_btree_write_error_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs,
					btree_write_error_work);
	struct bio *bio;

	while (1) {
		spin_lock_irq(&c->btree_write_error_lock);
		bio = bio_list_pop(&c->btree_write_error_list);
		spin_unlock_irq(&c->btree_write_error_lock);

		if (!bio)
			break;

		bch2_btree_node_write_error(c,
			container_of(bio, struct btree_write_bio, wbio.bio));
	}
}

static void btree_node_write_work(struct work_struct *work)
{
	struct btree_write_bio *wbio =
		container_of(work, struct btree_write_bio, work);
	struct bch_fs *c	= wbio->wbio.c;
	struct btree *b		= wbio->wbio.bio.bi_private;

	btree_bounce_free(c,
		wbio->wbio.order,
		wbio->wbio.used_mempool,
		wbio->data);

	if (wbio->wbio.failed.nr) {
		unsigned long flags;

		spin_lock_irqsave(&c->btree_write_error_lock, flags);
		bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);

		queue_work(c->wq, &c->btree_write_error_work);
		return;
	}

	bio_put(&wbio->wbio.bio);
	btree_node_write_done(c, b);
}

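/*
 * Write completion: for replicated (split) writes, failed devices are
 * recorded in the original bio's failed list, so the error path runs once
 * per btree node write, not once per replica:
 */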
static void btree_node_write_endio(struct bio *bio)
{
	struct bch_write_bio *wbio	= to_wbio(bio);
	struct bch_write_bio *parent	= wbio->split ? wbio->parent : NULL;
	struct bch_write_bio *orig	= parent ?: wbio;
	struct bch_fs *c		= wbio->c;
	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);
	unsigned long flags;

	if (wbio->have_ioref)
		bch2_latency_acct(ca, wbio->submit_time, WRITE);

	if (bio->bi_status == BLK_STS_REMOVED ||
	    bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
	    bch2_meta_write_fault("btree")) {
		spin_lock_irqsave(&c->btree_write_error_lock, flags);
		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
		spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
	}

	if (wbio->have_ioref)
		percpu_ref_put(&ca->io_ref);

	if (parent) {
		bio_put(bio);
		bio_endio(&parent->bio);
	} else {
		struct btree_write_bio *wb =
			container_of(orig, struct btree_write_bio, wbio);

		INIT_WORK(&wb->work, btree_node_write_work);
		queue_work(system_unbound_wq, &wb->work);
	}
}

static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
				   struct bset *i, unsigned sectors)
{
	unsigned whiteout_u64s = 0;
	int ret;

	if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE))
		return -1;

	ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false);
	if (ret)
		bch2_inconsistent_error(c);

	return ret;
}

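/*
 * Write out a node's dirty (unwritten) bsets: they're sorted and merged into
 * a single contiguous bset, checksummed, optionally encrypted, and appended
 * after what's already been written:
 */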
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
			     enum six_lock_type lock_type_held)
{
	struct btree_write_bio *wbio;
	struct bset_tree *t;
	struct bset *i;
	struct btree_node *bn = NULL;
	struct btree_node_entry *bne = NULL;
	BKEY_PADDED(key) k;
	struct bch_extent_ptr *ptr;
	struct sort_iter sort_iter;
	struct nonce nonce;
	unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
	u64 seq = 0;
	bool used_mempool;
	unsigned long old, new;
	bool validate_before_checksum = false;
	void *data;

	if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
		return;

	/*
	 * We may only have a read lock on the btree node - the dirty bit is
	 * our "lock" against racing with other threads that may be trying to
	 * start a write; we do a write iff we clear the dirty bit. Since
	 * setting the dirty bit requires a write lock, we can't race with
	 * other threads redirtying it:
	 */
	do {
		old = new = READ_ONCE(b->flags);

		if (!(old & (1 << BTREE_NODE_dirty)))
			return;

		if (b->written &&
		    !btree_node_may_write(b))
			return;

		if (old & (1 << BTREE_NODE_write_in_flight)) {
			btree_node_wait_on_io(b);
			continue;
		}

		new &= ~(1 << BTREE_NODE_dirty);
		new &= ~(1 << BTREE_NODE_need_write);
		new |=  (1 << BTREE_NODE_write_in_flight);
		new |=  (1 << BTREE_NODE_just_written);
		new ^=  (1 << BTREE_NODE_write_idx);
	} while (cmpxchg_acquire(&b->flags, old, new) != old);

	BUG_ON(btree_node_fake(b));
	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON((b->will_make_reachable != 0) != !b->written);

	BUG_ON(b->written >= c->opts.btree_node_size);
	BUG_ON(b->written & (c->opts.block_size - 1));
	BUG_ON(bset_written(b, btree_bset_last(b)));
	BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
	BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));

	/*
	 * We can't block on six_lock_write() here; another thread might be
	 * trying to get a journal reservation with read locks held, and
	 * getting a journal reservation might be blocked on flushing the
	 * journal and doing btree writes:
	 */
	if (lock_type_held == SIX_LOCK_intent &&
	    six_trylock_write(&b->lock)) {
		__bch2_compact_whiteouts(c, b, COMPACT_WRITTEN);
		six_unlock_write(&b->lock);
	} else {
		__bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
	}

	BUG_ON(b->uncompacted_whiteout_u64s);

	sort_iter_init(&sort_iter, b);

	bytes = !b->written
		? sizeof(struct btree_node)
		: sizeof(struct btree_node_entry);

	bytes += b->whiteout_u64s * sizeof(u64);

	for_each_bset(b, t) {
		i = bset(b, t);

		if (bset_written(b, i))
			continue;

		bytes += le16_to_cpu(i->u64s) * sizeof(u64);
		sort_iter_add(&sort_iter,
			      btree_bkey_first(b, t),
			      btree_bkey_last(b, t));
		seq = max(seq, le64_to_cpu(i->journal_seq));
	}

	order = get_order(bytes);
	data = btree_bounce_alloc(c, order, &used_mempool);

	if (!b->written) {
		bn = data;
		*bn = *b->data;
		i = &bn->keys;
	} else {
		bne = data;
		bne->keys = b->data->keys;
		i = &bne->keys;
	}

	i->journal_seq	= cpu_to_le64(seq);
	i->u64s		= 0;

	if (!btree_node_is_extents(b)) {
		sort_iter_add(&sort_iter,
			      unwritten_whiteouts_start(c, b),
			      unwritten_whiteouts_end(c, b));
		SET_BSET_SEPARATE_WHITEOUTS(i, false);
	} else {
		memcpy_u64s(i->start,
			    unwritten_whiteouts_start(c, b),
			    b->whiteout_u64s);
		i->u64s = cpu_to_le16(b->whiteout_u64s);
		SET_BSET_SEPARATE_WHITEOUTS(i, true);
	}

	b->whiteout_u64s = 0;

	u64s = btree_node_is_extents(b)
		? bch2_sort_extents(vstruct_last(i), &sort_iter, false)
		: bch2_sort_keys(i->start, &sort_iter, false);
	le16_add_cpu(&i->u64s, u64s);

	clear_needs_whiteout(i);

	/* do we have data to write? */
	if (b->written && !i->u64s)
		goto nowrite;

	bytes_to_write = vstruct_end(i) - data;
	sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;

	memset(data + bytes_to_write, 0,
	       (sectors_to_write << 9) - bytes_to_write);

	BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size);
	BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
	BUG_ON(i->seq != b->data->keys.seq);

	i->version = c->sb.version < bcachefs_metadata_version_new_versioning
		? cpu_to_le16(BCH_BSET_VERSION_OLD)
		: cpu_to_le16(c->sb.version);
	SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
		validate_before_checksum = true;

	/* validate_bset will be modifying: */
	if (le16_to_cpu(i->version) <
	    bcachefs_metadata_version_bkey_renumber)
		validate_before_checksum = true;

	/* if we're going to be encrypting, check metadata validity first: */
	if (validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	bset_encrypt(c, i, b->written << 9);

	nonce = btree_nonce(i, b->written << 9);

	if (bn)
		bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
	else
		bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);

	/* if we're not encrypting, check metadata after checksumming: */
	if (!validate_before_checksum &&
	    validate_bset_for_write(c, b, i, sectors_to_write))
		goto err;

	/*
	 * We handle btree write errors by immediately halting the journal -
	 * after we've done that, we can't issue any subsequent btree writes
	 * because they might have pointers to new nodes that failed to write.
	 *
	 * Furthermore, there's no point in doing any more btree writes because
	 * with the journal stopped, we're never going to update the journal to
	 * reflect that those writes were done and the data flushed from the
	 * journal:
	 *
	 * Make sure to update b->written so bch2_btree_init_next() doesn't
	 * break:
	 */
	if (bch2_journal_error(&c->journal) ||
	    c->opts.nochanges)
		goto err;

	trace_btree_write(b, bytes_to_write, sectors_to_write);

	wbio = container_of(bio_alloc_bioset(NULL,
				buf_pages(data, sectors_to_write << 9),
				REQ_OP_WRITE|REQ_META|REQ_FUA,
				GFP_NOIO,
				&c->btree_bio),
			    struct btree_write_bio, wbio.bio);
	wbio_init(&wbio->wbio.bio);
	wbio->data			= data;
	wbio->wbio.order		= order;
	wbio->wbio.used_mempool		= used_mempool;
	wbio->wbio.bio.bi_iter.bi_size	= sectors_to_write << 9;
	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
	wbio->wbio.bio.bi_private	= b;

	bch2_bio_map(&wbio->wbio.bio, data);

	/*
	 * If we're appending to a leaf node, we don't technically need FUA -
	 * this write just needs to be persisted before the next journal write,
	 * which will be marked FLUSH|FUA.
	 *
	 * Similarly if we're writing a new btree root - the pointer is going
	 * to be in the next journal entry.
	 *
	 * But if we're writing a new btree node (that isn't a root) or
	 * appending to a non leaf btree node, we need either FUA or a flush
	 * when we write the parent with the new pointer. FUA is cheaper than
	 * a flush, and writes appending to leaf nodes aren't blocking anything
	 * so just make all btree node writes FUA to keep things sane.
	 */

	bkey_copy(&k.key, &b->key);

	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr)
		ptr->offset += b->written;

	b->written += sectors_to_write;

	bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key);
	return;
err:
	set_btree_node_noevict(b);
	b->written += sectors_to_write;
nowrite:
	btree_bounce_free(c, order, used_mempool, data);
	btree_node_write_done(c, b);
}

/*
 * Work that must be done with write lock held:
 */
bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
{
	bool invalidated_iter = false;
	struct btree_node_entry *bne;
	struct bset_tree *t;

	if (!btree_node_just_written(b))
		return false;

	BUG_ON(b->whiteout_u64s);
	BUG_ON(b->uncompacted_whiteout_u64s);

	clear_btree_node_just_written(b);

	/*
	 * Note: immediately after write, bset_written() doesn't work - the
	 * amount of data we had to write after compaction might have been
	 * smaller than the offset of the last bset.
	 *
	 * However, we know that all bsets have been written here, as long as
	 * we're still holding the write lock:
	 */

	/*
	 * XXX: decide if we really want to unconditionally sort down to a
	 * single bset:
	 */
	if (b->nsets > 1) {
		btree_node_sort(c, b, NULL, 0, b->nsets, true);
		invalidated_iter = true;
	} else {
		invalidated_iter = bch2_drop_whiteouts(b);
	}

	for_each_bset(b, t)
		set_needs_whiteout(bset(b, t));

	bch2_btree_verify(c, b);

	/*
	 * If later we don't unconditionally sort down to a single bset, we
	 * have to ensure this is still true:
	 */
	BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));

	bne = want_new_bset(c, b);
	if (bne)
		bch2_bset_init_next(c, b, bne);

	bch2_btree_build_aux_trees(b);

	return invalidated_iter;
}

/*
 * Use this one if the node is intent locked:
 */
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
			   enum six_lock_type lock_type_held)
{
	BUG_ON(lock_type_held == SIX_LOCK_write);

	if (lock_type_held == SIX_LOCK_intent ||
	    six_lock_tryupgrade(&b->lock)) {
		__bch2_btree_node_write(c, b, SIX_LOCK_intent);

		/* don't cycle lock unnecessarily: */
		if (btree_node_just_written(b) &&
		    six_trylock_write(&b->lock)) {
			bch2_btree_post_write_cleanup(c, b);
			six_unlock_write(&b->lock);
		}

		if (lock_type_held == SIX_LOCK_read)
			six_lock_downgrade(&b->lock);
	} else {
		__bch2_btree_node_write(c, b, SIX_LOCK_read);
	}
}

static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct btree *b;
	unsigned i;
restart:
	rcu_read_lock();
	for_each_cached_btree(b, c, tbl, i, pos)
		if (test_bit(flag, &b->flags)) {
			rcu_read_unlock();
			wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
			goto restart;
		}
	rcu_read_unlock();
}

void bch2_btree_flush_all_reads(struct bch_fs *c)
{
	__bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
}

void bch2_btree_flush_all_writes(struct bch_fs *c)
{
	__bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}

void bch2_btree_verify_flushed(struct bch_fs *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct btree *b;
	unsigned i;

	rcu_read_lock();
	for_each_cached_btree(b, c, tbl, i, pos) {
		unsigned long flags = READ_ONCE(b->flags);

		BUG_ON((flags & (1 << BTREE_NODE_dirty)) ||
		       (flags & (1 << BTREE_NODE_write_in_flight)));
	}
	rcu_read_unlock();
}

ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
{
	struct printbuf out = _PBUF(buf, PAGE_SIZE);
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct btree *b;
	unsigned i;

	rcu_read_lock();
	for_each_cached_btree(b, c, tbl, i, pos) {
		unsigned long flags = READ_ONCE(b->flags);
		unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;

		if (//!(flags & (1 << BTREE_NODE_dirty)) &&
		    !b->writes[0].wait.list.first &&
		    !b->writes[1].wait.list.first &&
		    !(b->will_make_reachable & 1))
			continue;

		pr_buf(&out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n",
		       b,
		       (flags & (1 << BTREE_NODE_dirty)) != 0,
		       b->level,
		       b->written,
		       !list_empty_careful(&b->write_blocked),
		       b->will_make_reachable != 0,
		       b->will_make_reachable & 1,
		       b->writes[ idx].wait.list.first != NULL,
		       b->writes[!idx].wait.list.first != NULL);
	}
	rcu_read_unlock();

	return out.pos - buf;
}