// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_journal_iter.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "error.h"
#include "journal_io.h"
#include "recovery_passes.h"

#include <linux/kthread.h>
#include <linux/sort.h>

struct find_btree_nodes_worker {
	struct closure *cl;
	struct find_btree_nodes *f;
	struct bch_dev *ca;
};

static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
{
	prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
	bch2_bpos_to_text(out, n->min_key);
	prt_str(out, "-");
	bch2_bpos_to_text(out, n->max_key);

	if (n->range_updated)
		prt_str(out, " range updated");
	if (n->overwritten)
		prt_str(out, " overwritten");

	for (unsigned i = 0; i < n->nr_ptrs; i++) {
		prt_char(out, ' ');
		bch2_extent_ptr_to_text(out, c, n->ptrs + i);
	}
}

static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
{
	printbuf_indent_add(out, 2);
	darray_for_each(nodes, i) {
		found_btree_node_to_text(out, c, i);
		prt_newline(out);
	}
	printbuf_indent_sub(out, 2);
}
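
/*
 * Pack a found_btree_node into a btree_ptr_v2 key, so it can be handed to the
 * normal btree node read path and, during recovery, reinserted as a journal
 * key.
 */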
static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
{
	struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);

	set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
	bp->k.p = f->max_key;
	bp->v.seq = cpu_to_le64(f->cookie);
	bp->v.sectors_written = cpu_to_le16(f->sectors_written);
	bp->v.flags = 0;
	bp->v.min_key = f->min_key;
	SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
	memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
}
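
/*
 * Verify a candidate node by actually reading it through the btree node
 * cache; on success this also records how many sectors of the node were
 * written.
 */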
static bool found_btree_node_is_readable(struct btree_trans *trans,
					 struct found_btree_node *f)
{
	struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;

	found_btree_node_to_key(&k.k, f);

	struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
	bool ret = !IS_ERR_OR_NULL(b);
	if (ret) {
		f->sectors_written = b->written;
		six_unlock_read(&b->c.lock);
	}

	/*
	 * We might update this node's range; if that happens, we need the node
	 * to be re-read so the read path can trim keys that are no longer in
	 * this node
	 */
	if (b != btree_node_root(trans->c, b))
		bch2_btree_node_evict(trans, &k.k);
	return ret;
}

static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
{
	const struct found_btree_node *l = _l;
	const struct found_btree_node *r = _r;

	return cmp_int(l->btree_id, r->btree_id) ?:
	       cmp_int(l->level, r->level) ?:
	       cmp_int(l->cookie, r->cookie);
}

/*
 * Given two found btree nodes, if their sequence numbers are equal, take the
 * one that's readable:
 */
static int found_btree_node_cmp_time(const struct found_btree_node *l,
				     const struct found_btree_node *r)
{
	return cmp_int(l->seq, r->seq);
}

static int found_btree_node_cmp_pos(const void *_l, const void *_r)
{
	const struct found_btree_node *l = _l;
	const struct found_btree_node *r = _r;

	return cmp_int(l->btree_id, r->btree_id) ?:
	       -cmp_int(l->level, r->level) ?:
	       bpos_cmp(l->min_key, r->min_key) ?:
	       -found_btree_node_cmp_time(l, r);
}
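
/*
 * Read a single candidate location: check the btree node magic, decrypt the
 * header if necessary, then verify the node is actually readable before
 * adding it to the list of found nodes.
 */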
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
				struct bio *bio, struct btree_node *bn, u64 offset)
{
	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);

	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
	bio->bi_iter.bi_sector = offset;
	bch2_bio_map(bio, bn, PAGE_SIZE);

	submit_bio_wait(bio);
	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
			       "IO error in try_read_btree_node() at %llu: %s",
			       offset, bch2_blk_status_to_str(bio->bi_status)))
		return;

	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return;

	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
		struct nonce nonce = btree_nonce(&bn->keys, 0);
		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;

		bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
	}

	if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
		return;

	if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
		return;

	rcu_read_lock();
	struct found_btree_node n = {
		.btree_id	= BTREE_NODE_ID(bn),
		.level		= BTREE_NODE_LEVEL(bn),
		.seq		= BTREE_NODE_SEQ(bn),
		.cookie		= le64_to_cpu(bn->keys.seq),
		.min_key	= bn->min_key,
		.max_key	= bn->max_key,
		.nr_ptrs	= 1,
		.ptrs[0].type	= 1 << BCH_EXTENT_ENTRY_ptr,
		.ptrs[0].offset	= offset,
		.ptrs[0].dev	= ca->dev_idx,
		.ptrs[0].gen	= *bucket_gen(ca, sector_to_bucket(ca, offset)),
	};
	rcu_read_unlock();

	if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
		mutex_lock(&f->lock);
		if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
			bch_err(c, "try_read_btree_node() can't handle endian conversion");
			f->ret = -EINVAL;
			goto unlock;
		}

		if (darray_push(&f->nodes, n))
			f->ret = -ENOMEM;
unlock:
		mutex_unlock(&f->lock);
	}
}
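
/*
 * Per-device worker: scan every bucket on the device at btree node
 * granularity, skipping sectors that the superblock's btree bitmap (when
 * available) says were never used for btree nodes.
 */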
static int read_btree_nodes_worker(void *p)
{
	struct find_btree_nodes_worker *w = p;
	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
	struct bch_dev *ca = w->ca;
	void *buf = (void *) __get_free_page(GFP_KERNEL);
	struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
	unsigned long last_print = jiffies;

	if (!buf || !bio) {
		bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
		w->f->ret = -ENOMEM;
		goto err;
	}

	for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
		for (unsigned bucket_offset = 0;
		     bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
		     bucket_offset += btree_sectors(c)) {
			if (time_after(jiffies, last_print + HZ * 30)) {
				u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
				u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;

				bch_info(ca, "%s: %2u%% done", __func__,
					 (unsigned) div64_u64(cur_sector * 100, end_sector));
				last_print = jiffies;
			}

			u64 sector = bucket * ca->mi.bucket_size + bucket_offset;

			if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
			    !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
				continue;

			try_read_btree_node(w->f, ca, bio, buf, sector);
		}
err:
	bio_put(bio);
	free_page((unsigned long) buf);
	/* Drop the io_ref taken for us by read_btree_nodes(): */
	percpu_ref_put(&ca->io_ref);
	closure_put(w->cl);
	kfree(w);
	return 0;
}

static int read_btree_nodes(struct find_btree_nodes *f)
{
	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
	struct closure cl;
	int ret = 0;

	closure_init_stack(&cl);

	for_each_online_member(c, ca) {
		if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
			continue;

		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
		struct task_struct *t;

		if (!w) {
			percpu_ref_put(&ca->io_ref);
			ret = -ENOMEM;
			goto err;
		}

		percpu_ref_get(&ca->io_ref);
		closure_get(&cl);
		w->cl = &cl;
		w->f = f;
		w->ca = ca;

		t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
		ret = PTR_ERR_OR_ZERO(t);
		if (ret) {
			percpu_ref_put(&ca->io_ref);
			closure_put(&cl);
			f->ret = ret;
			kfree(w);
			bch_err(c, "error starting kthread: %i", ret);
			break;
		}
	}
err:
	closure_sync(&cl);
	return f->ret ?: ret;
}

static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
{
	while (n + 1 < end &&
	       found_btree_node_cmp_pos(n, n + 1) > 0) {
		swap(n[0], n[1]);
		n++;
	}
}
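
/*
 * Newer nodes (higher seq) win: trim or drop the ranges of nodes that overlap
 * with @start. Overlapping leaf nodes with the same seq can't be repaired
 * automatically.
 */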
static int handle_overwrites(struct bch_fs *c,
			     struct found_btree_node *start,
			     struct found_btree_node *end)
{
	struct found_btree_node *n;
again:
	for (n = start + 1;
	     n < end &&
	     n->btree_id == start->btree_id &&
	     n->level == start->level &&
	     bpos_lt(n->min_key, start->max_key);
	     n++) {
		int cmp = found_btree_node_cmp_time(start, n);

		if (cmp > 0) {
			if (bpos_cmp(start->max_key, n->max_key) >= 0)
				n->overwritten = true;
			else {
				n->range_updated = true;
				n->min_key = bpos_successor(start->max_key);
				bubble_up(n, end);
				goto again;
			}
		} else if (cmp < 0) {
			BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);

			start->max_key = bpos_predecessor(n->min_key);
			start->range_updated = true;
		} else if (n->level) {
			n->overwritten = true;
		} else {
			struct printbuf buf = PRINTBUF;

			prt_str(&buf, "overlapping btree nodes with same seq! halting\n ");
			found_btree_node_to_text(&buf, c, start);
			prt_str(&buf, "\n ");
			found_btree_node_to_text(&buf, c, n);
			bch_err(c, "%s", buf.buf);
			printbuf_exit(&buf);
			return -BCH_ERR_fsck_repair_unimplemented;
		}
	}

	return 0;
}
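
/*
 * Main entry point: scan all devices for btree nodes, merge pointers to
 * replicas of the same node, resolve overwritten/overlapping ranges, then
 * sort the result as an eytzinger array for range lookups.
 */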
int bch2_scan_for_btree_nodes(struct bch_fs *c)
{
	struct find_btree_nodes *f = &c->found_btree_nodes;
	struct printbuf buf = PRINTBUF;
	size_t dst;
	int ret = 0;

	if (f->nodes.nr)
		return 0;

	mutex_init(&f->lock);

	ret = read_btree_nodes(f);
	if (ret)
		return ret;

	if (!f->nodes.nr) {
		bch_err(c, "%s: no btree nodes found", __func__);
		ret = -EINVAL;
		goto err;
	}

	if (0 && c->opts.verbose) {
		printbuf_reset(&buf);
		prt_printf(&buf, "%s: nodes found:\n", __func__);
		found_btree_nodes_to_text(&buf, c, f->nodes);
		bch2_print_string_as_lines(KERN_INFO, buf.buf);
	}

	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);

	dst = 0;
	darray_for_each(f->nodes, i) {
		struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;

		if (prev &&
		    prev->cookie == i->cookie) {
			if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
				bch_err(c, "%s: found too many replicas for btree node", __func__);
				ret = -EINVAL;
				goto err;
			}
			prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
		} else {
			f->nodes.data[dst++] = *i;
		}
	}
	f->nodes.nr = dst;

	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);

	if (0 && c->opts.verbose) {
		printbuf_reset(&buf);
		prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
		found_btree_nodes_to_text(&buf, c, f->nodes);
		bch2_print_string_as_lines(KERN_INFO, buf.buf);
	}

	dst = 0;
	darray_for_each(f->nodes, i) {
		if (i->overwritten)
			continue;

		ret = handle_overwrites(c, i, &darray_top(f->nodes));
		if (ret)
			goto err;

		BUG_ON(i->overwritten);
		f->nodes.data[dst++] = *i;
	}
	f->nodes.nr = dst;

	if (c->opts.verbose) {
		printbuf_reset(&buf);
		prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
		found_btree_nodes_to_text(&buf, c, f->nodes);
		bch2_print_string_as_lines(KERN_INFO, buf.buf);
	}

	eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
err:
	printbuf_exit(&buf);
	return ret;
}

static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
{
	const struct found_btree_node *l = _l;
	const struct found_btree_node *r = _r;

	return cmp_int(l->btree_id, r->btree_id) ?:
	       -cmp_int(l->level, r->level) ?:
	       bpos_cmp(l->max_key, r->min_key);
}
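
/*
 * Iterate over found nodes for a given btree and level whose range overlaps
 * [_search.min_key, _search.max_key], walking the eytzinger-sorted nodes
 * array.
 */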
#define for_each_found_btree_node_in_range(_f, _search, _idx)				\
	for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,	\
					      sizeof((_f)->nodes.data[0]),		\
					      found_btree_node_range_start_cmp,		\
					      &(_search));				\
	     _idx < (_f)->nodes.nr &&							\
	     (_f)->nodes.data[_idx].btree_id == (_search).btree_id &&			\
	     (_f)->nodes.data[_idx].level == (_search).level &&			\
	     bpos_lt((_f)->nodes.data[_idx].min_key, (_search).max_key);		\
	     _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
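
/*
 * Returns true if the scan found a newer version (higher seq) of this node's
 * range, i.e. the node we have in hand is stale.
 */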
bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
{
	struct find_btree_nodes *f = &c->found_btree_nodes;

	struct found_btree_node search = {
		.btree_id	= b->c.btree_id,
		.level		= b->c.level,
		.min_key	= b->data->min_key,
		.max_key	= b->key.k.p,
	};

	for_each_found_btree_node_in_range(f, search, idx)
		if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
			return true;
	return false;
}

bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
{
	struct found_btree_node search = {
		.btree_id	= btree,
		.level		= 0,
		.min_key	= POS_MIN,
		.max_key	= SPOS_MAX,
	};

	for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
		return true;
	return false;
}
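
/*
 * Reinsert scanned nodes for the given btree/level/range as journal keys at
 * the parent level, so recovery can hook them back into the tree.
 */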
int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
			   unsigned level, struct bpos node_min, struct bpos node_max)
{
	if (btree_id_is_alloc(btree))
		return 0;

	struct find_btree_nodes *f = &c->found_btree_nodes;

	int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
	if (ret)
		return ret;

	if (c->opts.verbose) {
		struct printbuf buf = PRINTBUF;

		prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
		bch2_bpos_to_text(&buf, node_min);
		prt_str(&buf, " - ");
		bch2_bpos_to_text(&buf, node_max);

		bch_info(c, "%s(): %s", __func__, buf.buf);
		printbuf_exit(&buf);
	}

	struct found_btree_node search = {
		.btree_id	= btree,
		.level		= level,
		.min_key	= node_min,
		.max_key	= node_max,
	};

	for_each_found_btree_node_in_range(f, search, idx) {
		struct found_btree_node n = f->nodes.data[idx];

		n.range_updated |= bpos_lt(n.min_key, node_min);
		n.min_key = bpos_max(n.min_key, node_min);

		n.range_updated |= bpos_gt(n.max_key, node_max);
		n.max_key = bpos_min(n.max_key, node_max);

		struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;

		found_btree_node_to_key(&tmp.k, &n);

		struct printbuf buf = PRINTBUF;
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
		bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
		printbuf_exit(&buf);

		BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));

		ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
		if (ret)
			return ret;
	}

	return 0;
}

void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
{
	darray_exit(&f->nodes);
}