// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright (C) 2014 Datera Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_methods.h"
#include "bkey_buf.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "recovery_passes.h"
#include "reflink.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>

#define DROP_THIS_NODE		10
#define DROP_PREV_NODE		11
#define DID_FILL_FROM_SCAN	12

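/*
 * Cast away the const from a bkey_s_c so it can be passed to trigger helpers
 * that take struct bkey_s; the "unsafe_" prefix flags that callers must not
 * actually modify the key through the result.
 */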
static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
{
	return (struct bkey_s) {{{
		(struct bkey *) k.k,
		(struct bch_val *) k.v
	}}};
}

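/*
 * c->gc_pos tracks how far the mark-and-sweep pass has progressed; it's
 * updated under gc_pos_lock (a seqcount) so readers can sample it without
 * blocking, and gc_pos_set() asserts that it only ever moves forward.
 */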
static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	preempt_disable();
	write_seqcount_begin(&c->gc_pos_lock);
	c->gc_pos = new_pos;
	write_seqcount_end(&c->gc_pos_lock);
	preempt_enable();
}

static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
{
	BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) < 0);
	__gc_pos_set(c, new_pos);
}

static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
	switch (b->key.k.type) {
	case KEY_TYPE_btree_ptr: {
		struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);

		dst->k.p = src->k.p;
		dst->v.mem_ptr = 0;
		dst->v.seq = b->data->keys.seq;
		dst->v.sectors_written = 0;
		dst->v.flags = 0;
		dst->v.min_key = b->data->min_key;
		set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
		memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
		break;
	}
	case KEY_TYPE_btree_ptr_v2:
		bkey_copy(&dst->k_i, &b->key);
		break;
	default:
		BUG();
	}
}

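/*
 * Topology repair: rewrite a node's key, via the journal, so that its
 * min_key matches what the parent expects.
 */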
static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	if (c->opts.verbose) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		prt_str(&buf, " -> ");
		bch2_bpos_to_text(&buf, new_min);

		bch_info(c, "%s(): %s", __func__, buf.buf);
		printbuf_exit(&buf);
	}

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->min_key = new_min;
	new->v.min_key = new_min;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);
	bkey_copy(&b->key, &new->k_i);
	return 0;
}

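/*
 * Topology repair: rewrite a node's key so its max_key becomes @new_max.
 * Since max_key is the key's position, the node also has to be rehashed in
 * the btree node cache under the new key.
 */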
static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
{
	struct bkey_i_btree_ptr_v2 *new;
	int ret;

	if (c->opts.verbose) {
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		prt_str(&buf, " -> ");
		bch2_bpos_to_text(&buf, new_max);

		bch_info(c, "%s(): %s", __func__, buf.buf);
		printbuf_exit(&buf);
	}

	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
	if (ret)
		return ret;

	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
	if (!new)
		return -BCH_ERR_ENOMEM_gc_repair_key;

	btree_ptr_to_v2(b, new);
	b->data->max_key = new_max;
	new->k.p = new_max;
	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);

	ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
	if (ret) {
		kfree(new);
		return ret;
	}

	bch2_btree_node_drop_keys_outside_node(b);

	mutex_lock(&c->btree_cache.lock);
	bch2_btree_node_hash_remove(&c->btree_cache, b);

	bkey_copy(&b->key, &new->k_i);
	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
	BUG_ON(ret);
	mutex_unlock(&c->btree_cache.lock);
	return 0;
}

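/*
 * Check that @cur starts exactly where @prev ended (or at the parent's
 * min_key for the first child): gaps may be filled from nodes found by the
 * btree node scan, overlaps are resolved by clamping or dropping a node.
 */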
static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
				       struct btree *prev, struct btree *cur,
				       struct bpos *pulled_from_scan)
{
	struct bpos expected_start = !prev
		? b->data->min_key
		: bpos_successor(prev->key.k.p);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
			b->data->min_key));

	if (bpos_eq(expected_start, cur->data->min_key))
		return 0;

	prt_printf(&buf, " at btree %s level %u:\n parent: ",
		   bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	if (prev) {
		prt_printf(&buf, "\n prev: ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
	}

	prt_str(&buf, "\n next: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));

	if (bpos_lt(expected_start, cur->data->min_key)) {	/* gap */
		if (b->c.level == 1 &&
		    bpos_lt(*pulled_from_scan, cur->data->min_key)) {
			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
						     expected_start,
						     bpos_predecessor(cur->data->min_key));
			if (ret)
				goto err;

			*pulled_from_scan = cur->data->min_key;
			ret = DID_FILL_FROM_SCAN;
		} else {
			if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
					     "btree node with incorrect min_key%s", buf.buf))
				ret = set_node_min(c, cur, expected_start);
		}
	} else {						/* overlap */
		if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev */
			if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */
				if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node,
						     "btree node overwritten by next node%s", buf.buf))
					ret = DROP_PREV_NODE;
			} else {
				if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
						     "btree node with incorrect max_key%s", buf.buf))
					ret = set_node_max(c, prev,
							   bpos_predecessor(cur->data->min_key));
			}
		} else {
			if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? */
				if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node,
						     "btree node overwritten by prev node%s", buf.buf))
					ret = DROP_THIS_NODE;
			} else {
				if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
						     "btree node with incorrect min_key%s", buf.buf))
					ret = set_node_min(c, cur, expected_start);
			}
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

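/*
 * Check that the last child ends at the parent's max_key, either pulling in
 * scanned nodes or extending the child to cover the missing range.
 */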
static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
				 struct btree *child, struct bpos *pulled_from_scan)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (bpos_eq(child->key.k.p, b->key.k.p))
		return 0;

	prt_printf(&buf, "at btree %s level %u:\n parent: ",
		   bch2_btree_id_str(b->c.btree_id), b->c.level);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	prt_str(&buf, "\n child: ");
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));

	if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
			     "btree node with incorrect max_key%s", buf.buf)) {
		if (b->c.level == 1 &&
		    bpos_lt(*pulled_from_scan, b->key.k.p)) {
			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
						     bpos_successor(child->key.k.p), b->key.k.p);
			if (ret)
				goto err;

			*pulled_from_scan = b->key.k.p;
			ret = DID_FILL_FROM_SCAN;
		} else {
			ret = set_node_max(c, child, b->key.k.p);
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

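/*
 * Walk an interior node's children, repairing child boundaries and recursing
 * into each child; returns DROP_THIS_NODE if the node ends up with no live
 * children and should itself be removed by the caller.
 */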
static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
					      struct bpos *pulled_from_scan)
{
	struct bch_fs *c = trans->c;
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bkey_buf prev_k, cur_k;
	struct btree *prev = NULL, *cur = NULL;
	bool have_child, new_pass = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (!b->c.level)
		return 0;

	bch2_bkey_buf_init(&prev_k);
	bch2_bkey_buf_init(&cur_k);
again:
	cur = prev = NULL;
	have_child = new_pass = false;
	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
	iter.prefetch = true;

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
		BUG_ON(bpos_gt(k.k->p, b->data->max_key));

		bch2_btree_and_journal_iter_advance(&iter);
		bch2_bkey_buf_reassemble(&cur_k, c, k);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
						 b->c.btree_id, b->c.level - 1,
						 false);
		ret = PTR_ERR_OR_ZERO(cur);

		printbuf_reset(&buf);
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));

		if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c,
				btree_node_unreadable,
				"Topology repair: unreadable btree node at btree %s level %u:\n"
				" %s",
				bch2_btree_id_str(b->c.btree_id),
				b->c.level - 1,
				buf.buf)) {
			bch2_btree_node_evict(trans, cur_k.k);
			cur = NULL;
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			if (ret)
				break;

			if (!btree_id_is_alloc(b->c.btree_id)) {
				ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
				if (ret)
					break;
			}
			continue;
		}

		bch_err_msg(c, ret, "getting btree node");
		if (ret)
			break;

		if (bch2_btree_node_is_stale(c, cur)) {
			bch_info(c, "btree node %s older than nodes found by scanning", buf.buf);
			six_unlock_read(&cur->c.lock);
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan);
		if (ret == DID_FILL_FROM_SCAN) {
			new_pass = true;
			ret = 0;
		}

		if (ret == DROP_THIS_NODE) {
			six_unlock_read(&cur->c.lock);
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			cur = NULL;
			if (ret)
				break;
			continue;
		}

		if (prev)
			six_unlock_read(&prev->c.lock);
		prev = NULL;

		if (ret == DROP_PREV_NODE) {
			bch_info(c, "dropped prev node");
			bch2_btree_node_evict(trans, prev_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, prev_k.k->k.p);
			if (ret)
				break;

			bch2_btree_and_journal_iter_exit(&iter);
			goto again;
		} else if (ret)
			break;

		prev = cur;
		cur = NULL;
		bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
	}

	if (!ret && !IS_ERR_OR_NULL(prev)) {
		BUG_ON(cur);
		ret = btree_repair_node_end(c, b, prev, pulled_from_scan);
		if (ret == DID_FILL_FROM_SCAN) {
			new_pass = true;
			ret = 0;
		}
	}

	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	prev = NULL;
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);
	cur = NULL;

	if (ret)
		goto err;

	bch2_btree_and_journal_iter_exit(&iter);

	if (new_pass)
		goto again;

	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
	iter.prefetch = true;

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		bch2_bkey_buf_reassemble(&cur_k, c, k);
		bch2_btree_and_journal_iter_advance(&iter);

		cur = bch2_btree_node_get_noiter(trans, cur_k.k,
						 b->c.btree_id, b->c.level - 1,
						 false);
		ret = PTR_ERR_OR_ZERO(cur);

		bch_err_msg(c, ret, "getting btree node");
		if (ret)
			goto err;

		ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
		six_unlock_read(&cur->c.lock);
		cur = NULL;

		if (ret == DROP_THIS_NODE) {
			bch2_btree_node_evict(trans, cur_k.k);
			ret = bch2_journal_key_delete(c, b->c.btree_id,
						      b->c.level, cur_k.k->k.p);
			new_pass = true;
		}

		if (ret)
			goto err;

		have_child = true;
	}

	printbuf_reset(&buf);
	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

	if (mustfix_fsck_err_on(!have_child, c,
			btree_node_topology_interior_node_empty,
			"empty interior btree node at btree %s level %u\n"
			" %s",
			bch2_btree_id_str(b->c.btree_id),
			b->c.level, buf.buf))
		ret = DROP_THIS_NODE;
err:
fsck_err:
	if (!IS_ERR_OR_NULL(prev))
		six_unlock_read(&prev->c.lock);
	if (!IS_ERR_OR_NULL(cur))
		six_unlock_read(&cur->c.lock);

	bch2_btree_and_journal_iter_exit(&iter);

	if (!ret && new_pass)
		goto again;

	BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));

	bch2_bkey_buf_exit(&prev_k, c);
	bch2_bkey_buf_exit(&cur_k, c);
	printbuf_exit(&buf);
	return ret;
}

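/*
 * Entry point for the topology repair pass: for each btree, reconstruct an
 * unreadable root from scanned nodes if necessary, then repair the tree
 * recursively from the root down.
 */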
int bch2_check_topology(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bpos pulled_from_scan = POS_MIN;
	int ret = 0;

	for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
		struct btree_root *r = bch2_btree_id_root(c, i);
		bool reconstructed_root = false;

		if (r->error) {
			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
			if (ret)
				break;
reconstruct_root:
			bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i));

			r->alive = false;
			r->error = 0;

			if (!bch2_btree_has_scanned_nodes(c, i)) {
				mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
						 "no nodes found for btree %s, continue?", bch2_btree_id_str(i));
				bch2_btree_root_alloc_fake_trans(trans, i, 0);
			} else {
				bch2_btree_root_alloc_fake_trans(trans, i, 1);
				bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
				ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
				if (ret)
					break;
			}

			reconstructed_root = true;
		}

		struct btree *b = r->b;

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
		six_unlock_read(&b->c.lock);

		if (ret == DROP_THIS_NODE) {
			bch2_btree_node_hash_remove(&c->btree_cache, b);
			mutex_lock(&c->btree_cache.lock);
			list_move(&b->list, &c->btree_cache.freeable);
			mutex_unlock(&c->btree_cache.lock);

			r->b = NULL;

			if (!reconstructed_root)
				goto reconstruct_root;

			bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
			bch2_btree_root_alloc_fake_trans(trans, i, 0);
			r->alive = false;
			ret = 0;
		}
	}
fsck_err:
	bch2_trans_put(trans);
	return ret;
}

/* marking of btree keys/nodes: */

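/*
 * Mark a single key for GC: run the check_repair trigger (committing and
 * restarting if it queued any repairs), then the GC trigger that recomputes
 * bucket and usage counters for this key.
 */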
static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
			    unsigned level, struct btree **prev,
			    struct btree_iter *iter, struct bkey_s_c k,
			    bool initial)
{
	struct bch_fs *c = trans->c;

	if (iter) {
		struct btree_path *path = btree_iter_path(trans, iter);
		struct btree *b = path_l(path)->b;

		if (*prev != b) {
			int ret = bch2_btree_node_check_topology(trans, b);
			if (ret)
				return ret;
		}
		*prev = b;
	}

	struct bkey deleted = KEY(0, 0, 0);
	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	deleted.p = k.k->p;

	if (initial) {
		BUG_ON(bch2_journal_seq_verify &&
		       k.k->version.lo > atomic64_read(&c->journal.seq));

		if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
				bkey_version_in_future,
				"key version number higher than recorded: %llu > %llu",
				k.k->version.lo,
				atomic64_read(&c->key_version)))
			atomic64_set(&c->key_version, k.k->version.lo);
	}

	if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
			c, btree_bitmap_not_marked,
			"btree ptr not marked in member info btree allocated bitmap\n %s",
			(bch2_bkey_val_to_text(&buf, c, k),
			 buf.buf))) {
		mutex_lock(&c->sb_lock);
		bch2_dev_btree_bitmap_mark(c, k);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
	}

	/*
	 * We require a commit before key_trigger() because
	 * key_trigger(BTREE_TRIGGER_GC) is not idempotent; we'll calculate the
	 * wrong result if we run it multiple times.
	 */
	unsigned flags = !iter ? BTREE_TRIGGER_is_root : 0;

	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
			       BTREE_TRIGGER_check_repair|flags);
	if (ret)
		goto out;

	if (trans->nr_updates) {
		ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
			-BCH_ERR_transaction_restart_nested;
		goto out;
	}

	ret = bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
			       BTREE_TRIGGER_gc|flags);
out:
fsck_err:
	printbuf_exit(&buf);
	bch_err_fn(c, ret);
	return ret;
}

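/*
 * GC one btree: mark the root key, then walk each level from the root down
 * to @target_depth, marking every key via bch2_gc_mark_key().
 */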
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
{
	struct bch_fs *c = trans->c;
	int level = 0, target_depth = btree_node_type_needs_gc(__btree_node_type(0, btree)) ? 0 : 1;
	int ret = 0;

	/* We need to make sure every leaf node is readable before going RW */
	if (initial)
		target_depth = 0;

	/* root */
	mutex_lock(&c->btree_root_lock);
	struct btree *b = bch2_btree_id_root(c, btree)->b;
	if (!btree_node_fake(b)) {
		gc_pos_set(c, gc_pos_btree(btree, b->c.level + 1, SPOS_MAX));
		ret = lockrestart_do(trans,
			bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1,
					 NULL, NULL, bkey_i_to_s_c(&b->key), initial));
		level = b->c.level;
	}
	mutex_unlock(&c->btree_root_lock);

	if (ret)
		return ret;

	for (; level >= target_depth; --level) {
		struct btree *prev = NULL;
		struct btree_iter iter;
		bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, level,
					  BTREE_ITER_prefetch);

		ret = for_each_btree_key_continue(trans, iter, 0, k, ({
			gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
			bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
		}));
		if (ret)
			break;
	}

	return ret;
}

static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
	return (int) btree_id_to_gc_phase(l) -
		(int) btree_id_to_gc_phase(r);
}

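/*
 * GC all btrees, sorted into GC phase order so they're walked in the same
 * total ordering that gc_pos tracks.
 */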
static int bch2_gc_btrees(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	enum btree_id ids[BTREE_ID_NR];
	unsigned i;
	int ret = 0;

	for (i = 0; i < BTREE_ID_NR; i++)
		ids[i] = i;
	bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);

	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
		unsigned btree = i < BTREE_ID_NR ? ids[i] : i;

		if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
			continue;

		ret = bch2_gc_btree(trans, btree, true);

		if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
					c, btree_node_read_error,
					"btree node read error for %s",
					bch2_btree_id_str(btree)))
			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
	}
fsck_err:
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}

static int bch2_mark_superblocks(struct bch_fs *c)
{
	mutex_lock(&c->sb_lock);
	gc_pos_set(c, gc_phase(GC_PHASE_SB));

	int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
	mutex_unlock(&c->sb_lock);
	return ret;
}

static void bch2_gc_free(struct bch_fs *c)
{
	genradix_free(&c->reflink_gc_table);
	genradix_free(&c->gc_stripes);

	for_each_member_device(c, ca) {
		kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
		ca->buckets_gc = NULL;

		free_percpu(ca->usage_gc);
		ca->usage_gc = NULL;
	}

	free_percpu(c->usage_gc);
	c->usage_gc = NULL;
}

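/*
 * Compare the usage counters recomputed by GC against the live counters,
 * report mismatches as fsck errors, and correct the live counters in place.
 */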
static int bch2_gc_done(struct bch_fs *c)
{
	struct bch_dev *ca = NULL;
	struct printbuf buf = PRINTBUF;
	unsigned i;
	int ret = 0;

	percpu_down_write(&c->mark_lock);

#define copy_field(_err, _f, _msg, ...)					\
	if (fsck_err_on(dst->_f != src->_f, c, _err,			\
			_msg ": got %llu, should be %llu" , ##__VA_ARGS__, \
			dst->_f, src->_f))				\
		dst->_f = src->_f
#define copy_dev_field(_err, _f, _msg, ...)				\
	copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__)
#define copy_fs_field(_err, _f, _msg, ...)				\
	copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)

	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		bch2_fs_usage_acc_to_base(c, i);

	__for_each_member_device(c, ca) {
		struct bch_dev_usage *dst = ca->usage_base;
		struct bch_dev_usage *src = (void *)
			bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
					     dev_usage_u64s());

		for (i = 0; i < BCH_DATA_NR; i++) {
			copy_dev_field(dev_usage_buckets_wrong,
				       d[i].buckets, "%s buckets", bch2_data_type_str(i));
			copy_dev_field(dev_usage_sectors_wrong,
				       d[i].sectors, "%s sectors", bch2_data_type_str(i));
			copy_dev_field(dev_usage_fragmented_wrong,
				       d[i].fragmented, "%s fragmented", bch2_data_type_str(i));
		}
	}

	{
		unsigned nr = fs_usage_u64s(c);
		struct bch_fs_usage *dst = c->usage_base;
		struct bch_fs_usage *src = (void *)
			bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);

		copy_fs_field(fs_usage_hidden_wrong,
			      b.hidden, "hidden");
		copy_fs_field(fs_usage_btree_wrong,
			      b.btree, "btree");

		copy_fs_field(fs_usage_data_wrong,
			      b.data, "data");
		copy_fs_field(fs_usage_cached_wrong,
			      b.cached, "cached");
		copy_fs_field(fs_usage_reserved_wrong,
			      b.reserved, "reserved");
		copy_fs_field(fs_usage_nr_inodes_wrong,
			      b.nr_inodes, "nr_inodes");

		for (i = 0; i < BCH_REPLICAS_MAX; i++)
			copy_fs_field(fs_usage_persistent_reserved_wrong,
				      persistent_reserved[i],
				      "persistent_reserved[%i]", i);

		for (i = 0; i < c->replicas.nr; i++) {
			struct bch_replicas_entry_v1 *e =
				cpu_replicas_entry(&c->replicas, i);

			printbuf_reset(&buf);
			bch2_replicas_entry_to_text(&buf, e);

			copy_fs_field(fs_usage_replicas_wrong,
				      replicas[i], "%s", buf.buf);
		}
	}

#undef copy_fs_field
#undef copy_dev_field
#undef copy_stripe_field
#undef copy_field
fsck_err:
	bch2_dev_put(ca);
	bch_err_fn(c, ret);
	percpu_up_write(&c->mark_lock);
	printbuf_exit(&buf);
	return ret;
}

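/*
 * Allocate the per-filesystem and per-device usage counters that the
 * mark-and-sweep pass accumulates into.
 */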
static int bch2_gc_start(struct bch_fs *c)
{
	BUG_ON(c->usage_gc);

	c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
					 sizeof(u64), GFP_KERNEL);
	if (!c->usage_gc) {
		bch_err(c, "error allocating c->usage_gc");
		return -BCH_ERR_ENOMEM_gc_start;
	}

	for_each_member_device(c, ca) {
		BUG_ON(ca->usage_gc);

		ca->usage_gc = alloc_percpu(struct bch_dev_usage);
		if (!ca->usage_gc) {
			bch_err(c, "error allocating ca->usage_gc");
			bch2_dev_put(ca);
			return -BCH_ERR_ENOMEM_gc_start;
		}

		this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
			       ca->mi.nbuckets - ca->mi.first_bucket);
	}

	return 0;
}

/* returns true if not equal */
static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
				     struct bch_alloc_v4 r)
{
	return l.gen != r.gen ||
		l.oldest_gen != r.oldest_gen ||
		l.data_type != r.data_type ||
		l.dirty_sectors != r.dirty_sectors ||
		l.cached_sectors != r.cached_sectors ||
		l.stripe_redundancy != r.stripe_redundancy ||
		l.stripe != r.stripe;
}

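/*
 * Compare one alloc key against the in-memory bucket state rebuilt by GC,
 * rewriting the key if any of the checked fields disagree.
 */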
static int bch2_alloc_write_key(struct btree_trans *trans,
				struct btree_iter *iter,
				struct bch_dev *ca,
				struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_alloc_v4 *a;
	struct bch_alloc_v4 old_gc, gc, old_convert, new;
	const struct bch_alloc_v4 *old;
	int ret;

	old = bch2_alloc_to_v4(k, &old_convert);
	gc = new = *old;

	percpu_down_read(&c->mark_lock);
	__bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));

	old_gc = gc;

	if ((old->data_type == BCH_DATA_sb ||
	     old->data_type == BCH_DATA_journal) &&
	    !bch2_dev_is_online(ca)) {
		gc.data_type = old->data_type;
		gc.dirty_sectors = old->dirty_sectors;
	}

	/*
	 * gc.data_type doesn't yet include need_discard & need_gc_gen states -
	 * fix that here:
	 */
	alloc_data_type_set(&gc, gc.data_type);

	if (gc.data_type != old_gc.data_type ||
	    gc.dirty_sectors != old_gc.dirty_sectors)
		bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true);
	percpu_up_read(&c->mark_lock);

	if (fsck_err_on(new.data_type != gc.data_type, c,
			alloc_key_data_type_wrong,
			"bucket %llu:%llu gen %u has wrong data_type"
			": got %s, should be %s",
			iter->pos.inode, iter->pos.offset,
			gc.gen,
			bch2_data_type_str(new.data_type),
			bch2_data_type_str(gc.data_type)))
		new.data_type = gc.data_type;

#define copy_bucket_field(_errtype, _f)					\
	if (fsck_err_on(new._f != gc._f, c, _errtype,			\
			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
			": got %u, should be %u",			\
			iter->pos.inode, iter->pos.offset,		\
			gc.gen,						\
			bch2_data_type_str(gc.data_type),		\
			new._f, gc._f))					\
		new._f = gc._f;						\

	copy_bucket_field(alloc_key_gen_wrong,
			  gen);
	copy_bucket_field(alloc_key_dirty_sectors_wrong,
			  dirty_sectors);
	copy_bucket_field(alloc_key_cached_sectors_wrong,
			  cached_sectors);
	copy_bucket_field(alloc_key_stripe_wrong,
			  stripe);
	copy_bucket_field(alloc_key_stripe_redundancy_wrong,
			  stripe_redundancy);
#undef copy_bucket_field

	if (!bch2_alloc_v4_cmp(*old, new))
		return 0;

	a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	a->v = new;

	/*
	 * The trigger normally makes sure this is set, but we're not running
	 * triggers:
	 */
	if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
		a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));

	ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_norun);
fsck_err:
	return ret;
}

static int bch2_gc_alloc_done(struct bch_fs *c)
{
	int ret = 0;

	for_each_member_device(c, ca) {
		ret = bch2_trans_run(c,
			for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc,
					POS(ca->dev_idx, ca->mi.first_bucket),
					POS(ca->dev_idx, ca->mi.nbuckets - 1),
					BTREE_ITER_slots|BTREE_ITER_prefetch, k,
					NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
				bch2_alloc_write_key(trans, &iter, ca, k)));
		if (ret) {
			bch2_dev_put(ca);
			break;
		}
	}

	bch_err_fn(c, ret);
	return ret;
}

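/*
 * Allocate the shadow bucket arrays (ca->buckets_gc) that GC marks into,
 * seeded with the current bucket generations from the alloc btree.
 */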
static int bch2_gc_alloc_start(struct bch_fs *c)
{
	for_each_member_device(c, ca) {
		struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
					ca->mi.nbuckets * sizeof(struct bucket),
					GFP_KERNEL|__GFP_ZERO);
		if (!buckets) {
			bch2_dev_put(ca);
			bch_err(c, "error allocating ca->buckets[gc]");
			return -BCH_ERR_ENOMEM_gc_alloc_start;
		}

		buckets->first_bucket = ca->mi.first_bucket;
		buckets->nbuckets = ca->mi.nbuckets;
		rcu_assign_pointer(ca->buckets_gc, buckets);
	}

	struct bch_dev *ca = NULL;
	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
				   BTREE_ITER_prefetch, k, ({
			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				continue;
			}

			struct bch_alloc_v4 a_convert;
			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);

			struct bucket *g = gc_bucket(ca, k.k->p.offset);
			g->gen_valid = 1;
			g->gen = a->gen;
			0;
		})));
	bch2_dev_put(ca);
	bch_err_fn(c, ret);
	return ret;
}

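/*
 * Compare a reflink key's refcount against the count GC accumulated in
 * reflink_gc_table, fixing it (or deleting the key, if unreferenced) on
 * mismatch.
 */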
static int bch2_gc_write_reflink_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k,
				     size_t *idx)
{
	struct bch_fs *c = trans->c;
	const __le64 *refcount = bkey_refcount_c(k);
	struct printbuf buf = PRINTBUF;
	struct reflink_gc *r;
	int ret = 0;

	if (!refcount)
		return 0;

	while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
	       r->offset < k.k->p.offset)
		++*idx;

	if (!r ||
	    r->offset != k.k->p.offset ||
	    r->size != k.k->size) {
		bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
		return -EINVAL;
	}

	if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
			reflink_v_refcount_wrong,
			"reflink key has wrong refcount:\n"
			" %s\n"
			" should be %u",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
			r->refcount)) {
		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			goto out;

		if (!r->refcount)
			new->k.type = KEY_TYPE_deleted;
		else
			*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
		ret = bch2_trans_update(trans, iter, new, 0);
	}
out:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_reflink_done(struct bch_fs *c)
{
	size_t idx = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_reflink, POS_MIN,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
	c->reflink_gc_nr = 0;
	return ret;
}

static int bch2_gc_reflink_start(struct bch_fs *c)
{
	c->reflink_gc_nr = 0;

	int ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
				   BTREE_ITER_prefetch, k, ({
			const __le64 *refcount = bkey_refcount_c(k);

			if (!refcount)
				continue;

			struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
							c->reflink_gc_nr++, GFP_KERNEL);
			if (!r) {
				ret = -BCH_ERR_ENOMEM_gc_reflink_start;
				break;
			}

			r->offset = k.k->p.offset;
			r->size = k.k->size;
			r->refcount = 0;
			0;
		})));

	bch_err_fn(c, ret);
	return ret;
}

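/*
 * Compare a stripe's per-block sector counts against those recomputed by GC
 * and rewrite the stripe key if they differ.
 */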
static int bch2_gc_write_stripes_key(struct btree_trans *trans,
				     struct btree_iter *iter,
				     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	const struct bch_stripe *s;
	struct gc_stripe *m;
	bool bad = false;
	unsigned i;
	int ret = 0;

	if (k.k->type != KEY_TYPE_stripe)
		return 0;

	s = bkey_s_c_to_stripe(k).v;
	m = genradix_ptr(&c->gc_stripes, k.k->p.offset);

	for (i = 0; i < s->nr_blocks; i++) {
		u32 old = stripe_blockcount_get(s, i);
		u32 new = (m ? m->block_sectors[i] : 0);

		if (old != new) {
			prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n",
				   i, old, new);
			bad = true;
		}
	}

	if (bad)
		bch2_bkey_val_to_text(&buf, c, k);

	if (fsck_err_on(bad, c, stripe_sector_count_wrong,
			"%s", buf.buf)) {
		struct bkey_i_stripe *new;

		new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
		ret = PTR_ERR_OR_ZERO(new);
		if (ret)
			return ret;

		bkey_reassemble(&new->k_i, k);

		for (i = 0; i < new->v.nr_blocks; i++)
			stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);

		ret = bch2_trans_update(trans, iter, &new->k_i, 0);
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int bch2_gc_stripes_done(struct bch_fs *c)
{
	return bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
				BTREE_ID_stripes, POS_MIN,
				BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_gc_write_stripes_key(trans, &iter, k)));
}

/**
 * bch2_check_allocations - walk all references to buckets, and recompute them:
 *
 * @c: filesystem object
 *
 * Returns: 0 on success, or standard errcode on failure
 *
 * Order matters here:
 *  - Concurrent GC relies on the fact that we have a total ordering for
 *    everything that GC walks - see gc_will_visit_node(),
 *    gc_will_visit_root()
 *
 *  - also, references move around in the course of index updates and
 *    various other crap: everything needs to agree on the ordering
 *    references are allowed to move around in - e.g., we're allowed to
 *    start with a reference owned by an open_bucket (the allocator) and
 *    move it to the btree, but not the reverse.
 *
 *    This is necessary to ensure that gc doesn't miss references that
 *    move around - if references move backwards in the ordering GC
 *    uses, GC could skip past them
 */
int bch2_check_allocations(struct bch_fs *c)
{
	int ret;

	lockdep_assert_held(&c->state_lock);

	down_write(&c->gc_lock);

	bch2_btree_interior_updates_flush(c);

	ret = bch2_gc_start(c) ?:
		bch2_gc_alloc_start(c) ?:
		bch2_gc_reflink_start(c);
	if (ret)
		goto out;

	gc_pos_set(c, gc_phase(GC_PHASE_START));

	ret = bch2_mark_superblocks(c);
	BUG_ON(ret);

	ret = bch2_gc_btrees(c);
	if (ret)
		goto out;

	c->gc_count++;

	bch2_journal_block(&c->journal);
out:
	ret = bch2_gc_alloc_done(c) ?:
		bch2_gc_done(c) ?:
		bch2_gc_stripes_done(c) ?:
		bch2_gc_reflink_done(c);

	bch2_journal_unblock(&c->journal);

	percpu_down_write(&c->mark_lock);
	/* Indicates that gc is no longer in progress: */
	__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));

	bch2_gc_free(c);
	percpu_up_write(&c->mark_lock);

	up_write(&c->gc_lock);

	/*
	 * At startup, allocations can happen directly instead of via the
	 * allocator thread - issue wakeup in case they blocked on gc_lock:
	 */
	closure_wake_up(&c->freelist_wait);
	bch_err_fn(c, ret);
	return ret;
}

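/*
 * Helper for bch2_gc_gens(): if any pointer in @k is stale by more than 16
 * generations, rewrite the key so stale pointers get dropped; otherwise note
 * the oldest generation each bucket still has a pointer to.
 */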
static int gc_btree_gens_key(struct btree_trans *trans,
			     struct btree_iter *iter,
			     struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	struct bkey_i *u;
	int ret;

	if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
		return -EROFS;

	percpu_down_read(&c->mark_lock);
	rcu_read_lock();
	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
		if (!ca)
			continue;

		if (dev_ptr_stale(ca, ptr) > 16) {
			rcu_read_unlock();
			percpu_up_read(&c->mark_lock);
			goto update;
		}
	}

	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
		if (!ca)
			continue;

		u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
		if (gen_after(*gen, ptr->gen))
			*gen = ptr->gen;
	}
	rcu_read_unlock();
	percpu_up_read(&c->mark_lock);
	return 0;
update:
	u = bch2_bkey_make_mut(trans, iter, &k, 0);
	ret = PTR_ERR_OR_ZERO(u);
	if (ret)
		return ret;

	bch2_extent_normalize(c, bkey_i_to_s(u));
	return 0;
}

static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev *ca,
				       struct btree_iter *iter, struct bkey_s_c k)
{
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
	struct bkey_i_alloc_v4 *a_mut;
	int ret;

	if (a->oldest_gen == ca->oldest_gen[iter->pos.offset])
		return 0;

	a_mut = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a_mut);
	if (ret)
		return ret;

	a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
	alloc_data_type_set(&a_mut->v, a_mut->v.data_type);

	return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
}

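/*
 * Generation GC: walk all pointers to find the oldest generation still
 * referenced in each bucket, then write that back to the alloc btree so
 * bucket gens can keep advancing without aliasing live pointers.
 */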
int bch2_gc_gens(struct bch_fs *c)
{
	u64 b, start_time = local_clock();
	int ret;

	/*
	 * Ideally we would be using state_lock and not gc_lock here, but that
	 * introduces a deadlock in the RO path - we currently take the state
	 * lock at the start of going RO, thus the gc thread may get stuck:
	 */
	if (!mutex_trylock(&c->gc_gens_lock))
		return 0;

	trace_and_count(c, gc_gens_start, c);
	down_read(&c->gc_lock);

	for_each_member_device(c, ca) {
		struct bucket_gens *gens = bucket_gens(ca);

		BUG_ON(ca->oldest_gen);

		ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
		if (!ca->oldest_gen) {
			bch2_dev_put(ca);
			ret = -BCH_ERR_ENOMEM_gc_gens;
			goto err;
		}

		for (b = gens->first_bucket;
		     b < gens->nbuckets; b++)
			ca->oldest_gen[b] = gens->b[b];
	}

	for (unsigned i = 0; i < BTREE_ID_NR; i++)
		if (btree_type_has_ptrs(i)) {
			c->gc_gens_btree = i;
			c->gc_gens_pos = POS_MIN;

			ret = bch2_trans_run(c,
				for_each_btree_key_commit(trans, iter, i,
						POS_MIN,
						BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
						k,
						NULL, NULL,
						BCH_TRANS_COMMIT_no_enospc,
					gc_btree_gens_key(trans, &iter, k)));
			if (ret)
				goto err;
		}

	struct bch_dev *ca = NULL;
	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
				POS_MIN,
				BTREE_ITER_prefetch,
				k,
				NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc, ({
			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				continue;
			}
			bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
		})));
	bch2_dev_put(ca);

	if (ret)
		goto err;

	c->gc_gens_btree = 0;
	c->gc_gens_pos = POS_MIN;

	c->gc_count++;

	bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
	trace_and_count(c, gc_gens_end, c);
err:
	for_each_member_device(c, ca) {
		kvfree(ca->oldest_gen);
		ca->oldest_gen = NULL;
	}

	up_read(&c->gc_lock);
	mutex_unlock(&c->gc_gens_lock);
	if (!bch2_err_matches(ret, EROFS))
		bch_err_fn(c, ret);
	return ret;
}

static void bch2_gc_gens_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
	bch2_gc_gens(c);
	bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}

void bch2_gc_gens_async(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
	    !queue_work(c->write_ref_wq, &c->gc_gens_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
}

void bch2_fs_gc_init(struct bch_fs *c)
{
	seqcount_init(&c->gc_pos_lock);

	INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
}