Merge branches 'pm-cpuidle', 'pm-sleep' and 'pm-powercap'
[linux-block.git] / fs / bcachefs / btree_node_scan.c
CommitLineData
4409b808
KO
1// SPDX-License-Identifier: GPL-2.0
2
3#include "bcachefs.h"
4#include "btree_cache.h"
5#include "btree_io.h"
6#include "btree_journal_iter.h"
7#include "btree_node_scan.h"
8#include "btree_update_interior.h"
9#include "buckets.h"
10#include "error.h"
11#include "journal_io.h"
12#include "recovery_passes.h"
13
14#include <linux/kthread.h>
15#include <linux/sort.h>
16
17struct find_btree_nodes_worker {
18 struct closure *cl;
19 struct find_btree_nodes *f;
20 struct bch_dev *ca;
21};
22
23static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
24{
25 prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
26 bch2_bpos_to_text(out, n->min_key);
27 prt_str(out, "-");
28 bch2_bpos_to_text(out, n->max_key);
29
30 if (n->range_updated)
31 prt_str(out, " range updated");
32 if (n->overwritten)
33 prt_str(out, " overwritten");
34
35 for (unsigned i = 0; i < n->nr_ptrs; i++) {
36 prt_char(out, ' ');
37 bch2_extent_ptr_to_text(out, c, n->ptrs + i);
38 }
39}
40
41static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
42{
43 printbuf_indent_add(out, 2);
44 darray_for_each(nodes, i) {
45 found_btree_node_to_text(out, c, i);
46 prt_newline(out);
47 }
48 printbuf_indent_sub(out, 2);
49}
50
51static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
52{
53 struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
54
55 set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
56 bp->k.p = f->max_key;
57 bp->v.seq = cpu_to_le64(f->cookie);
58 bp->v.sectors_written = 0;
59 bp->v.flags = 0;
f7c3dc26 60 bp->v.sectors_written = cpu_to_le16(f->sectors_written);
4409b808
KO
61 bp->v.min_key = f->min_key;
62 SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
63 memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
64}
65
66static bool found_btree_node_is_readable(struct btree_trans *trans,
f7c3dc26 67 struct found_btree_node *f)
4409b808
KO
68{
69 struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
70
71 found_btree_node_to_key(&k.k, f);
72
73 struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
74 bool ret = !IS_ERR_OR_NULL(b);
f7c3dc26
KO
75 if (ret) {
76 f->sectors_written = b->written;
4409b808 77 six_unlock_read(&b->c.lock);
f7c3dc26 78 }
4409b808
KO
79
80 /*
81 * We might update this node's range; if that happens, we need the node
82 * to be re-read so the read path can trim keys that are no longer in
83 * this node
84 */
85 if (b != btree_node_root(trans->c, b))
86 bch2_btree_node_evict(trans, &k.k);
87 return ret;
88}
89
90static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
91{
92 const struct found_btree_node *l = _l;
93 const struct found_btree_node *r = _r;
94
95 return cmp_int(l->btree_id, r->btree_id) ?:
96 cmp_int(l->level, r->level) ?:
97 cmp_int(l->cookie, r->cookie);
98}
99
100/*
101 * Given two found btree nodes, if their sequence numbers are equal, take the
102 * one that's readable:
103 */
104static int found_btree_node_cmp_time(const struct found_btree_node *l,
105 const struct found_btree_node *r)
106{
107 return cmp_int(l->seq, r->seq);
108}
109
110static int found_btree_node_cmp_pos(const void *_l, const void *_r)
111{
112 const struct found_btree_node *l = _l;
113 const struct found_btree_node *r = _r;
114
115 return cmp_int(l->btree_id, r->btree_id) ?:
116 -cmp_int(l->level, r->level) ?:
117 bpos_cmp(l->min_key, r->min_key) ?:
118 -found_btree_node_cmp_time(l, r);
119}
120
121static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
122 struct bio *bio, struct btree_node *bn, u64 offset)
123{
124 struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
125
126 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
127 bio->bi_iter.bi_sector = offset;
128 bch2_bio_map(bio, bn, PAGE_SIZE);
129
130 submit_bio_wait(bio);
131 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
132 "IO error in try_read_btree_node() at %llu: %s",
133 offset, bch2_blk_status_to_str(bio->bi_status)))
134 return;
135
136 if (le64_to_cpu(bn->magic) != bset_magic(c))
137 return;
138
87cb0239
KO
139 if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
140 struct nonce nonce = btree_nonce(&bn->keys, 0);
141 unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
142
143 bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
144 }
145
5ab4beb7
KO
146 if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
147 return;
148
87cb0239
KO
149 if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
150 return;
151
4409b808
KO
152 rcu_read_lock();
153 struct found_btree_node n = {
154 .btree_id = BTREE_NODE_ID(bn),
155 .level = BTREE_NODE_LEVEL(bn),
156 .seq = BTREE_NODE_SEQ(bn),
157 .cookie = le64_to_cpu(bn->keys.seq),
158 .min_key = bn->min_key,
159 .max_key = bn->max_key,
160 .nr_ptrs = 1,
161 .ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
162 .ptrs[0].offset = offset,
163 .ptrs[0].dev = ca->dev_idx,
164 .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)),
165 };
166 rcu_read_unlock();
167
168 if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
169 mutex_lock(&f->lock);
170 if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
171 bch_err(c, "try_read_btree_node() can't handle endian conversion");
172 f->ret = -EINVAL;
173 goto unlock;
174 }
175
176 if (darray_push(&f->nodes, n))
177 f->ret = -ENOMEM;
178unlock:
179 mutex_unlock(&f->lock);
180 }
181}
182
183static int read_btree_nodes_worker(void *p)
184{
185 struct find_btree_nodes_worker *w = p;
186 struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
187 struct bch_dev *ca = w->ca;
188 void *buf = (void *) __get_free_page(GFP_KERNEL);
189 struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
190 unsigned long last_print = jiffies;
191
192 if (!buf || !bio) {
193 bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
194 w->f->ret = -ENOMEM;
195 goto err;
196 }
197
198 for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
199 for (unsigned bucket_offset = 0;
200 bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
201 bucket_offset += btree_sectors(c)) {
202 if (time_after(jiffies, last_print + HZ * 30)) {
203 u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
204 u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
205
206 bch_info(ca, "%s: %2u%% done", __func__,
207 (unsigned) div64_u64(cur_sector * 100, end_sector));
208 last_print = jiffies;
209 }
210
27c15ed2
KO
211 u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
212
213 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
214 !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
215 continue;
216
217 try_read_btree_node(w->f, ca, bio, buf, sector);
4409b808
KO
218 }
219err:
220 bio_put(bio);
221 free_page((unsigned long) buf);
222 percpu_ref_get(&ca->io_ref);
223 closure_put(w->cl);
224 kfree(w);
225 return 0;
226}
227
228static int read_btree_nodes(struct find_btree_nodes *f)
229{
230 struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
231 struct closure cl;
232 int ret = 0;
233
234 closure_init_stack(&cl);
235
236 for_each_online_member(c, ca) {
9b31152f
KO
237 if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
238 continue;
239
4409b808
KO
240 struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
241 struct task_struct *t;
242
243 if (!w) {
244 percpu_ref_put(&ca->io_ref);
245 ret = -ENOMEM;
246 goto err;
247 }
248
249 percpu_ref_get(&ca->io_ref);
250 closure_get(&cl);
251 w->cl = &cl;
252 w->f = f;
253 w->ca = ca;
254
255 t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
256 ret = IS_ERR_OR_NULL(t);
257 if (ret) {
258 percpu_ref_put(&ca->io_ref);
259 closure_put(&cl);
260 f->ret = ret;
261 bch_err(c, "error starting kthread: %i", ret);
262 break;
263 }
264 }
265err:
266 closure_sync(&cl);
267 return f->ret ?: ret;
268}
269
270static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
271{
272 while (n + 1 < end &&
273 found_btree_node_cmp_pos(n, n + 1) > 0) {
274 swap(n[0], n[1]);
275 n++;
276 }
277}
278
279static int handle_overwrites(struct bch_fs *c,
280 struct found_btree_node *start,
281 struct found_btree_node *end)
282{
283 struct found_btree_node *n;
284again:
285 for (n = start + 1;
286 n < end &&
287 n->btree_id == start->btree_id &&
288 n->level == start->level &&
289 bpos_lt(n->min_key, start->max_key);
290 n++) {
291 int cmp = found_btree_node_cmp_time(start, n);
292
293 if (cmp > 0) {
294 if (bpos_cmp(start->max_key, n->max_key) >= 0)
295 n->overwritten = true;
296 else {
297 n->range_updated = true;
298 n->min_key = bpos_successor(start->max_key);
299 n->range_updated = true;
300 bubble_up(n, end);
301 goto again;
302 }
303 } else if (cmp < 0) {
304 BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
305
306 start->max_key = bpos_predecessor(n->min_key);
307 start->range_updated = true;
fabb4d49
KO
308 } else if (n->level) {
309 n->overwritten = true;
4409b808
KO
310 } else {
311 struct printbuf buf = PRINTBUF;
312
313 prt_str(&buf, "overlapping btree nodes with same seq! halting\n ");
314 found_btree_node_to_text(&buf, c, start);
315 prt_str(&buf, "\n ");
316 found_btree_node_to_text(&buf, c, n);
317 bch_err(c, "%s", buf.buf);
318 printbuf_exit(&buf);
5ab4beb7 319 return -BCH_ERR_fsck_repair_unimplemented;
4409b808
KO
320 }
321 }
322
323 return 0;
324}
325
326int bch2_scan_for_btree_nodes(struct bch_fs *c)
327{
328 struct find_btree_nodes *f = &c->found_btree_nodes;
329 struct printbuf buf = PRINTBUF;
330 size_t dst;
331 int ret = 0;
332
333 if (f->nodes.nr)
334 return 0;
335
336 mutex_init(&f->lock);
337
338 ret = read_btree_nodes(f);
339 if (ret)
340 return ret;
341
342 if (!f->nodes.nr) {
343 bch_err(c, "%s: no btree nodes found", __func__);
344 ret = -EINVAL;
345 goto err;
346 }
347
348 if (0 && c->opts.verbose) {
349 printbuf_reset(&buf);
350 prt_printf(&buf, "%s: nodes found:\n", __func__);
351 found_btree_nodes_to_text(&buf, c, f->nodes);
352 bch2_print_string_as_lines(KERN_INFO, buf.buf);
353 }
354
355 sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
356
357 dst = 0;
358 darray_for_each(f->nodes, i) {
359 struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
360
361 if (prev &&
362 prev->cookie == i->cookie) {
363 if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
364 bch_err(c, "%s: found too many replicas for btree node", __func__);
365 ret = -EINVAL;
366 goto err;
367 }
368 prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
369 } else {
370 f->nodes.data[dst++] = *i;
371 }
372 }
373 f->nodes.nr = dst;
374
375 sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
376
377 if (0 && c->opts.verbose) {
378 printbuf_reset(&buf);
379 prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
380 found_btree_nodes_to_text(&buf, c, f->nodes);
381 bch2_print_string_as_lines(KERN_INFO, buf.buf);
382 }
383
384 dst = 0;
385 darray_for_each(f->nodes, i) {
386 if (i->overwritten)
387 continue;
388
389 ret = handle_overwrites(c, i, &darray_top(f->nodes));
390 if (ret)
391 goto err;
392
393 BUG_ON(i->overwritten);
394 f->nodes.data[dst++] = *i;
395 }
396 f->nodes.nr = dst;
397
398 if (c->opts.verbose) {
399 printbuf_reset(&buf);
400 prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
401 found_btree_nodes_to_text(&buf, c, f->nodes);
402 bch2_print_string_as_lines(KERN_INFO, buf.buf);
403 }
404
405 eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
406err:
407 printbuf_exit(&buf);
408 return ret;
409}
410
411static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
412{
413 const struct found_btree_node *l = _l;
414 const struct found_btree_node *r = _r;
415
416 return cmp_int(l->btree_id, r->btree_id) ?:
417 -cmp_int(l->level, r->level) ?:
418 bpos_cmp(l->max_key, r->min_key);
419}
420
421#define for_each_found_btree_node_in_range(_f, _search, _idx) \
422 for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr, \
423 sizeof((_f)->nodes.data[0]), \
424 found_btree_node_range_start_cmp, &search); \
425 _idx < (_f)->nodes.nr && \
426 (_f)->nodes.data[_idx].btree_id == _search.btree_id && \
427 (_f)->nodes.data[_idx].level == _search.level && \
428 bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key); \
429 _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
430
431bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
432{
433 struct find_btree_nodes *f = &c->found_btree_nodes;
434
435 struct found_btree_node search = {
436 .btree_id = b->c.btree_id,
437 .level = b->c.level,
438 .min_key = b->data->min_key,
439 .max_key = b->key.k.p,
440 };
441
442 for_each_found_btree_node_in_range(f, search, idx)
443 if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
444 return true;
445 return false;
446}
447
448bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
449{
450 struct found_btree_node search = {
451 .btree_id = btree,
452 .level = 0,
453 .min_key = POS_MIN,
454 .max_key = SPOS_MAX,
455 };
456
457 for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
458 return true;
459 return false;
460}
461
462int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
463 unsigned level, struct bpos node_min, struct bpos node_max)
464{
5ab4beb7
KO
465 if (btree_id_is_alloc(btree))
466 return 0;
467
4409b808
KO
468 struct find_btree_nodes *f = &c->found_btree_nodes;
469
470 int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
471 if (ret)
472 return ret;
473
474 if (c->opts.verbose) {
475 struct printbuf buf = PRINTBUF;
476
477 prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
478 bch2_bpos_to_text(&buf, node_min);
479 prt_str(&buf, " - ");
480 bch2_bpos_to_text(&buf, node_max);
481
482 bch_info(c, "%s(): %s", __func__, buf.buf);
483 printbuf_exit(&buf);
484 }
485
486 struct found_btree_node search = {
487 .btree_id = btree,
488 .level = level,
489 .min_key = node_min,
490 .max_key = node_max,
491 };
492
493 for_each_found_btree_node_in_range(f, search, idx) {
494 struct found_btree_node n = f->nodes.data[idx];
495
496 n.range_updated |= bpos_lt(n.min_key, node_min);
497 n.min_key = bpos_max(n.min_key, node_min);
498
499 n.range_updated |= bpos_gt(n.max_key, node_max);
500 n.max_key = bpos_min(n.max_key, node_max);
501
502 struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
503
504 found_btree_node_to_key(&tmp.k, &n);
505
506 struct printbuf buf = PRINTBUF;
507 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
508 bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
509 printbuf_exit(&buf);
510
511 BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
512
513 ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
514 if (ret)
515 return ret;
516 }
517
518 return 0;
519}
520
521void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
522{
523 darray_exit(&f->nodes);
524}