bcachefs: Bring back metadata only gc
[linux-block.git] / fs / bcachefs / recovery.c
CommitLineData
1c6fdbd8
KO
1// SPDX-License-Identifier: GPL-2.0
2
3#include "bcachefs.h"
07a1006a 4#include "bkey_buf.h"
7b3f84ea 5#include "alloc_background.h"
1c6fdbd8
KO
6#include "btree_gc.h"
7#include "btree_update.h"
8#include "btree_update_interior.h"
9#include "btree_io.h"
3e0745e2 10#include "buckets.h"
1c6fdbd8 11#include "dirent.h"
cd575ddf 12#include "ec.h"
1c6fdbd8 13#include "error.h"
96385742 14#include "fs-common.h"
1c6fdbd8
KO
15#include "fsck.h"
16#include "journal_io.h"
644d180b 17#include "journal_reclaim.h"
1dd7f9d9 18#include "journal_seq_blacklist.h"
a4805d66 19#include "move.h"
1c6fdbd8
KO
20#include "quota.h"
21#include "recovery.h"
42b72e0b 22#include "replicas.h"
1c6fdbd8
KO
23#include "super-io.h"
24
644d180b 25#include <linux/sort.h>
1c6fdbd8
KO
26#include <linux/stat.h>
27
28#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
29
33114c2d
KO
30/* for -o reconstruct_alloc: */
31static void drop_alloc_keys(struct journal_keys *keys)
32{
33 size_t src, dst;
34
35 for (src = 0, dst = 0; src < keys->nr; src++)
36 if (keys->d[src].btree_id != BTREE_ID_ALLOC)
37 keys->d[dst++] = keys->d[src];
38
39 keys->nr = dst;
40}
41
e222d206
KO
42/* iterate over keys read from the journal: */
43
5b593ee1
KO
44static int __journal_key_cmp(enum btree_id l_btree_id,
45 unsigned l_level,
46 struct bpos l_pos,
47 struct journal_key *r)
48{
49 return (cmp_int(l_btree_id, r->btree_id) ?:
50 cmp_int(l_level, r->level) ?:
51 bkey_cmp(l_pos, r->k->k.p));
52}
53
54static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
55{
56 return (cmp_int(l->btree_id, r->btree_id) ?:
57 cmp_int(l->level, r->level) ?:
58 bkey_cmp(l->k->k.p, r->k->k.p));
59}
60
61static size_t journal_key_search(struct journal_keys *journal_keys,
62 enum btree_id id, unsigned level,
63 struct bpos pos)
e222d206 64{
e62d65f2 65 size_t l = 0, r = journal_keys->nr, m;
e222d206 66
e62d65f2
KO
67 while (l < r) {
68 m = l + ((r - l) >> 1);
5b593ee1 69 if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0)
e62d65f2
KO
70 l = m + 1;
71 else
72 r = m;
e222d206
KO
73 }
74
e62d65f2 75 BUG_ON(l < journal_keys->nr &&
5b593ee1 76 __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0);
e62d65f2
KO
77
78 BUG_ON(l &&
5b593ee1
KO
79 __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0);
80
81 return l;
82}
83
/*
 * A key was just inserted at index @idx of @iter->keys: adjust @iter->idx
 * so the iterator neither skips nor re-returns a key because of the
 * insertion.
 */
static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx)
{
	struct bkey_i *n = iter->keys->d[idx].k;
	struct btree_and_journal_iter *biter =
		container_of(iter, struct btree_and_journal_iter, journal);

	/*
	 * If the insertion point is before the iterator's position - or at the
	 * iterator's position, but the new key sorts <= the key most recently
	 * returned (biter->unpacked) - step past it:
	 */
	if (iter->idx > idx ||
	    (iter->idx == idx &&
	     biter->last &&
	     bkey_cmp(n->k.p, biter->unpacked.p) <= 0))
		iter->idx++;
}
96
97int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
98 unsigned level, struct bkey_i *k)
99{
100 struct journal_key n = {
101 .btree_id = id,
102 .level = level,
103 .k = k,
104 .allocated = true
105 };
106 struct journal_keys *keys = &c->journal_keys;
107 struct journal_iter *iter;
108 unsigned idx = journal_key_search(keys, id, level, k->k.p);
109
110 if (idx < keys->nr &&
111 journal_key_cmp(&n, &keys->d[idx]) == 0) {
112 if (keys->d[idx].allocated)
113 kfree(keys->d[idx].k);
114 keys->d[idx] = n;
115 return 0;
116 }
117
118 if (keys->nr == keys->size) {
119 struct journal_keys new_keys = {
120 .nr = keys->nr,
121 .size = keys->size * 2,
122 .journal_seq_base = keys->journal_seq_base,
123 };
124
125 new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL);
dab9ef0d
KO
126 if (!new_keys.d) {
127 bch_err(c, "%s: error allocating new key array (size %zu)",
128 __func__, new_keys.size);
5b593ee1 129 return -ENOMEM;
dab9ef0d 130 }
5b593ee1
KO
131
132 memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
133 kvfree(keys->d);
134 *keys = new_keys;
135 }
136
137 array_insert_item(keys->d, keys->nr, idx, n);
138
139 list_for_each_entry(iter, &c->journal_iters, list)
140 journal_iter_fix(c, iter, idx);
141
142 return 0;
143}
144
/*
 * Insert a whiteout (deleted key) at @pos for (btree @id, @level),
 * shadowing any key at that position that was read from the journal.
 *
 * Only sizeof(struct bkey) is allocated: a whiteout is a bare key header
 * with no value.
 */
int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
			    unsigned level, struct bpos pos)
{
	struct bkey_i *whiteout =
		kmalloc(sizeof(struct bkey), GFP_KERNEL);
	int ret;

	if (!whiteout) {
		bch_err(c, "%s: error allocating new key", __func__);
		return -ENOMEM;
	}

	bkey_init(&whiteout->k);
	whiteout->k.p = pos;

	/* On success, bch2_journal_key_insert() takes ownership of @whiteout: */
	ret = bch2_journal_key_insert(c, id, level, whiteout);
	if (ret)
		kfree(whiteout);
	return ret;
}
165
166static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
167{
5b593ee1
KO
168 struct journal_key *k = iter->idx - iter->keys->nr
169 ? iter->keys->d + iter->idx : NULL;
170
171 if (k &&
172 k->btree_id == iter->btree_id &&
173 k->level == iter->level)
174 return k->k;
e62d65f2 175
5b593ee1 176 iter->idx = iter->keys->nr;
e62d65f2
KO
177 return NULL;
178}
179
180static void bch2_journal_iter_advance(struct journal_iter *iter)
181{
5b593ee1
KO
182 if (iter->idx < iter->keys->nr)
183 iter->idx++;
184}
185
/* Deregister the iterator from c->journal_iters (see bch2_journal_iter_init()). */
static void bch2_journal_iter_exit(struct journal_iter *iter)
{
	list_del(&iter->list);
}
190
5b593ee1
KO
191static void bch2_journal_iter_init(struct bch_fs *c,
192 struct journal_iter *iter,
e62d65f2
KO
193 enum btree_id id, unsigned level,
194 struct bpos pos)
e222d206 195{
e62d65f2
KO
196 iter->btree_id = id;
197 iter->level = level;
5b593ee1
KO
198 iter->keys = &c->journal_keys;
199 iter->idx = journal_key_search(&c->journal_keys, id, level, pos);
200 list_add(&iter->list, &c->journal_iters);
e62d65f2 201}
e222d206 202
e62d65f2
KO
/* Peek the current key from the btree node side of the combined iterator. */
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
{
	return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
						iter->b, &iter->unpacked);
}
5c4a5cd5 208
e62d65f2
KO
/* Advance the btree node side of the combined iterator. */
static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
{
	bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}
213
5c4a5cd5
KO
214void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
215{
216 switch (iter->last) {
217 case none:
218 break;
219 case btree:
e62d65f2 220 bch2_journal_iter_advance_btree(iter);
5c4a5cd5
KO
221 break;
222 case journal:
e62d65f2 223 bch2_journal_iter_advance(&iter->journal);
5c4a5cd5
KO
224 break;
225 }
226
227 iter->last = none;
228}
229
/*
 * Peek the next live key from the merged view of a btree node and the
 * journal keys that overlay it.  At equal positions the journal key wins;
 * deleted keys (whiteouts) are skipped.  Returns bkey_s_c_null when
 * exhausted.
 */
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
	struct bkey_s_c ret;

	while (1) {
		struct bkey_s_c btree_k		=
			bch2_journal_iter_peek_btree(iter);
		struct bkey_s_c journal_k	=
			bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));

		if (btree_k.k && journal_k.k) {
			int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);

			/* Same position: the journal key shadows the btree key: */
			if (!cmp)
				bch2_journal_iter_advance_btree(iter);

			iter->last = cmp < 0 ? btree : journal;
		} else if (btree_k.k) {
			iter->last = btree;
		} else if (journal_k.k) {
			iter->last = journal;
		} else {
			iter->last = none;
			return bkey_s_c_null;
		}

		ret = iter->last == journal ? journal_k : btree_k;

		/* Don't return journal keys past the end of this node: */
		if (iter->b &&
		    bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
			iter->journal.idx = iter->journal.keys->nr;
			iter->last = none;
			return bkey_s_c_null;
		}

		if (!bkey_deleted(ret.k))
			break;

		/* Whiteout: skip it and try again: */
		bch2_btree_and_journal_iter_advance(iter);
	}

	return ret;
}
273
/* Advance past the current key and return the next live key. */
struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter)
{
	bch2_btree_and_journal_iter_advance(iter);

	return bch2_btree_and_journal_iter_peek(iter);
}
280
5b593ee1 281void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
5c4a5cd5 282{
5b593ee1 283 bch2_journal_iter_exit(&iter->journal);
e62d65f2
KO
284}
285
/*
 * Initialize a combined btree node + journal iterator over node @b,
 * positioned at the node's minimum key.  Pair with
 * bch2_btree_and_journal_iter_exit().
 */
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
						struct bch_fs *c,
						struct btree *b)
{
	memset(iter, 0, sizeof(*iter));

	iter->b = b;
	bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
	bch2_journal_iter_init(c, &iter->journal,
			       b->c.btree_id, b->c.level, b->data->min_key);
}
297
b2930396
KO
298/* Walk btree, overlaying keys from the journal: */
299
edfbba58
KO
/*
 * Prefetch the next few children of interior node @b.
 *
 * @iter is deliberately passed by value: we advance a private copy, so the
 * caller's iterator position is unchanged.
 */
static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
					    struct btree_and_journal_iter iter)
{
	/* How many child nodes to prefetch: 2 above level 1, 16 at level 1: */
	unsigned i = 0, nr = b->c.level > 1 ? 2 : 16;
	struct bkey_s_c k;
	struct bkey_buf tmp;

	BUG_ON(!b->c.level);

	bch2_bkey_buf_init(&tmp);

	while (i < nr &&
	       (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		/* Copy the key into a stable buffer before prefetching from it: */
		bch2_bkey_buf_reassemble(&tmp, c, k);

		bch2_btree_node_prefetch(c, NULL, tmp.k,
					 b->c.btree_id, b->c.level - 1);

		bch2_btree_and_journal_iter_advance(&iter);
		i++;
	}

	bch2_bkey_buf_exit(&tmp, c);
}
324
b2930396
KO
/*
 * Recursively walk the subtree rooted at @b (which the caller holds a read
 * lock on), calling @key_fn on every key from the combined btree + journal
 * view.  Children are locked, descended into, then unlocked in turn.
 *
 * NOTE(review): node_fn is invoked here as node_fn(c, b) - i.e. on the
 * *parent*, once per child visited - confirm this is intended rather than
 * node_fn(c, child).
 */
static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b,
				struct journal_keys *journal_keys,
				enum btree_id btree_id,
				btree_walk_node_fn node_fn,
				btree_walk_key_fn key_fn)
{
	struct btree_and_journal_iter iter;
	struct bkey_s_c k;
	struct bkey_buf tmp;
	struct btree *child;
	int ret = 0;

	bch2_bkey_buf_init(&tmp);
	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		ret = key_fn(c, btree_id, b->c.level, k);
		if (ret)
			break;

		if (b->c.level) {
			/* Interior node: copy the key to a stable buffer
			 * before advancing and fetching the child it points to: */
			bch2_bkey_buf_reassemble(&tmp, c, k);

			bch2_btree_and_journal_iter_advance(&iter);

			child = bch2_btree_node_get_noiter(c, tmp.k,
						b->c.btree_id, b->c.level - 1,
						false);

			ret = PTR_ERR_OR_ZERO(child);
			if (ret)
				break;

			/* Prefetch upcoming siblings while we descend: */
			btree_and_journal_iter_prefetch(c, b, iter);

			ret = (node_fn ? node_fn(c, b) : 0) ?:
				bch2_btree_and_journal_walk_recurse(c, child,
					journal_keys, btree_id, node_fn, key_fn);
			six_unlock_read(&child->c.lock);

			if (ret)
				break;
		} else {
			bch2_btree_and_journal_iter_advance(&iter);
		}
	}

	bch2_btree_and_journal_iter_exit(&iter);
	bch2_bkey_buf_exit(&tmp, c);
	return ret;
}
376
/*
 * Walk btree @btree_id with journal keys overlaid, calling @key_fn on every
 * key - including, last, the root node's own key at b->c.level + 1 - and
 * @node_fn (if non-NULL) on nodes as they're visited.
 */
int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_keys,
				enum btree_id btree_id,
				btree_walk_node_fn node_fn,
				btree_walk_key_fn key_fn)
{
	struct btree *b = c->btree_roots[btree_id].b;
	int ret = 0;

	/* Fake (placeholder) root: nothing to walk: */
	if (btree_node_fake(b))
		return 0;

	six_lock_read(&b->c.lock, NULL, NULL);
	ret = (node_fn ? node_fn(c, b) : 0) ?:
		bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id,
						    node_fn, key_fn) ?:
		key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key));
	six_unlock_read(&b->c.lock);

	return ret;
}
397
d0734356 398/* sort and dedup all keys in the journal: */
644d180b 399
f1d786a0 400void bch2_journal_entries_free(struct list_head *list)
1c6fdbd8 401{
1c6fdbd8 402
644d180b
KO
403 while (!list_empty(list)) {
404 struct journal_replay *i =
405 list_first_entry(list, struct journal_replay, list);
406 list_del(&i->list);
407 kvpfree(i, offsetof(struct journal_replay, j) +
408 vstruct_bytes(&i->j));
1c6fdbd8 409 }
644d180b 410}
1c6fdbd8 411
e3e464ac
KO
412/*
413 * When keys compare equal, oldest compares first:
414 */
d0734356
KO
415static int journal_sort_key_cmp(const void *_l, const void *_r)
416{
417 const struct journal_key *l = _l;
418 const struct journal_key *r = _r;
419
e62d65f2
KO
420 return cmp_int(l->btree_id, r->btree_id) ?:
421 cmp_int(l->level, r->level) ?:
e3e464ac 422 bkey_cmp(l->k->k.p, r->k->k.p) ?:
d0734356
KO
423 cmp_int(l->journal_seq, r->journal_seq) ?:
424 cmp_int(l->journal_offset, r->journal_offset);
425}
426
f1d786a0 427void bch2_journal_keys_free(struct journal_keys *keys)
d0734356 428{
5b593ee1
KO
429 struct journal_key *i;
430
431 for (i = keys->d; i < keys->d + keys->nr; i++)
432 if (i->allocated)
433 kfree(i->k);
434
d0734356
KO
435 kvfree(keys->d);
436 keys->d = NULL;
437 keys->nr = 0;
438}
439
440static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
441{
adbcada4 442 struct journal_replay *i;
d0734356
KO
443 struct jset_entry *entry;
444 struct bkey_i *k, *_n;
e3e464ac
KO
445 struct journal_keys keys = { NULL };
446 struct journal_key *src, *dst;
d0734356
KO
447 size_t nr_keys = 0;
448
7fffc85b
KO
449 if (list_empty(journal_entries))
450 return keys;
451
adbcada4
KO
452 list_for_each_entry(i, journal_entries, list) {
453 if (i->ignore)
7fffc85b
KO
454 continue;
455
adbcada4
KO
456 if (!keys.journal_seq_base)
457 keys.journal_seq_base = le64_to_cpu(i->j.seq);
458
459 for_each_jset_key(k, _n, entry, &i->j)
d0734356 460 nr_keys++;
7fffc85b 461 }
d0734356 462
5b593ee1
KO
463 keys.size = roundup_pow_of_two(nr_keys);
464
465 keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL);
d0734356
KO
466 if (!keys.d)
467 goto err;
468
adbcada4
KO
469 list_for_each_entry(i, journal_entries, list) {
470 if (i->ignore)
7fffc85b
KO
471 continue;
472
adbcada4
KO
473 BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
474
475 for_each_jset_key(k, _n, entry, &i->j)
d0734356
KO
476 keys.d[keys.nr++] = (struct journal_key) {
477 .btree_id = entry->btree_id,
e62d65f2 478 .level = entry->level,
d0734356 479 .k = k,
adbcada4 480 .journal_seq = le64_to_cpu(i->j.seq) -
d0734356 481 keys.journal_seq_base,
adbcada4 482 .journal_offset = k->_data - i->j._data,
d0734356 483 };
7fffc85b 484 }
d0734356 485
3186c80f 486 sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
d0734356 487
e3e464ac
KO
488 src = dst = keys.d;
489 while (src < keys.d + keys.nr) {
490 while (src + 1 < keys.d + keys.nr &&
e62d65f2
KO
491 src[0].btree_id == src[1].btree_id &&
492 src[0].level == src[1].level &&
e3e464ac
KO
493 !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
494 src++;
d0734356 495
e3e464ac 496 *dst++ = *src++;
d0734356
KO
497 }
498
e3e464ac 499 keys.nr = dst - keys.d;
d0734356 500err:
e3e464ac 501 return keys;
d0734356
KO
502}
503
504/* journal replay: */
505
/*
 * Advance the journal's replay cursor to @seq, dropping the replay pin on
 * every sequence number passed over.  @seq must lie within the replayable
 * range.
 */
static void replay_now_at(struct journal *j, u64 seq)
{
	BUG_ON(seq < j->replay_journal_seq);
	BUG_ON(seq > j->replay_journal_seq_end);

	/* Post-increment: replay_journal_seq is bumped before the pin is put: */
	while (j->replay_journal_seq < seq)
		bch2_journal_pin_put(j, j->replay_journal_seq++);
}
514
2d594dfb 515static int __bch2_journal_replay_key(struct btree_trans *trans,
f44a6a71
KO
516 enum btree_id id, unsigned level,
517 struct bkey_i *k)
2d594dfb
KO
518{
519 struct btree_iter *iter;
f6d0368e 520 int ret;
2d594dfb 521
f44a6a71
KO
522 iter = bch2_trans_get_node_iter(trans, id, k->k.p,
523 BTREE_MAX_DEPTH, level,
524 BTREE_ITER_INTENT);
2d594dfb 525
e3e464ac
KO
526 /*
527 * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run
528 * extent_handle_overwrites() and extent_update_to_keys() - but we don't
529 * want that here, journal replay is supposed to treat extents like
530 * regular keys:
531 */
532 __bch2_btree_iter_set_pos(iter, k->k.p, false);
533
f6d0368e
KO
534 ret = bch2_btree_iter_traverse(iter) ?:
535 bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
27beb810 536 bch2_trans_iter_put(trans, iter);
f6d0368e 537 return ret;
2d594dfb
KO
538}
539
5b593ee1 540static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
2d594dfb 541{
5b593ee1
KO
542 unsigned commit_flags = BTREE_INSERT_NOFAIL|
543 BTREE_INSERT_LAZY_RW;
544
545 if (!k->allocated)
546 commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
547
548 return bch2_trans_do(c, NULL, NULL, commit_flags,
549 __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k));
2d594dfb
KO
550}
551
5d20ba48
KO
552static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
553{
554 struct btree_iter *iter;
555 int ret;
556
557 iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p,
558 BTREE_ITER_CACHED|
559 BTREE_ITER_CACHED_NOFILL|
560 BTREE_ITER_INTENT);
3eb26d01 561 ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
5d20ba48
KO
562 bch2_trans_iter_put(trans, iter);
563 return ret;
564}
565
/* Replay one alloc btree key in its own transaction. */
static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
{
	return bch2_trans_do(c, NULL, NULL,
			     BTREE_INSERT_NOFAIL|
			     BTREE_INSERT_USE_RESERVE|
			     BTREE_INSERT_LAZY_RW|
			     BTREE_INSERT_JOURNAL_REPLAY,
			     __bch2_alloc_replay_key(&trans, k));
}
575
/*
 * Replay order: higher (interior) levels first - note l/r are swapped in
 * the level comparison, giving descending order - then by journal
 * sequence, btree id and position.
 */
static int journal_sort_seq_cmp(const void *_l, const void *_r)
{
	const struct journal_key *l = _l;
	const struct journal_key *r = _r;

	return  cmp_int(r->level,	l->level) ?:
		cmp_int(l->journal_seq, r->journal_seq) ?:
		cmp_int(l->btree_id,	r->btree_id) ?:
		bkey_cmp(l->k->k.p, r->k->k.p);
}
586
d0734356
KO
/*
 * Replay all journal keys into the btree, in three phases: leaf alloc keys
 * (key cache only), then interior node updates, then - after starting
 * journal reclaim - the remaining leaf keys, in journal order.
 */
static int bch2_journal_replay(struct bch_fs *c,
			       struct journal_keys keys)
{
	struct journal *j = &c->journal;
	struct journal_key *i;
	u64 seq;
	int ret;

	sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);

	if (keys.nr)
		replay_now_at(j, keys.journal_seq_base);

	/* Save the cursor; the first two phases move it out of order: */
	seq = j->replay_journal_seq;

	/*
	 * First replay updates to the alloc btree - these will only update the
	 * btree key cache:
	 */
	for_each_journal_key(keys, i) {
		cond_resched();

		if (!i->level && i->btree_id == BTREE_ID_ALLOC) {
			j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
			ret = bch2_alloc_replay_key(c, i->k);
			if (ret)
				goto err;
		}
	}

	/*
	 * Next replay updates to interior btree nodes:
	 */
	for_each_journal_key(keys, i) {
		cond_resched();

		if (i->level) {
			j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
			ret = bch2_journal_replay_key(c, i);
			if (ret)
				goto err;
		}
	}

	/*
	 * Now that the btree is in a consistent state, we can start journal
	 * reclaim (which will be flushing entries from the btree key cache back
	 * to the btree:
	 */
	set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
	set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
	journal_reclaim_kick(j);

	/* Restore the cursor for the in-order leaf phase: */
	j->replay_journal_seq = seq;

	/*
	 * Now replay leaf node updates:
	 */
	for_each_journal_key(keys, i) {
		cond_resched();

		if (i->level || i->btree_id == BTREE_ID_ALLOC)
			continue;

		replay_now_at(j, keys.journal_seq_base + i->journal_seq);

		ret = bch2_journal_replay_key(c, i);
		if (ret)
			goto err;
	}

	replay_now_at(j, j->replay_journal_seq_end);
	j->replay_journal_seq = 0;

	bch2_journal_set_replay_done(j);
	bch2_journal_flush_all_pins(j);
	return bch2_journal_error(j);
err:
	bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
		ret, bch2_btree_ids[i->btree_id], i->level);
	return ret;
}
669
644d180b 670/* journal replay early: */
7b512638 671
42b72e0b
KO
672static int journal_replay_entry_early(struct bch_fs *c,
673 struct jset_entry *entry)
674{
675 int ret = 0;
676
677 switch (entry->type) {
678 case BCH_JSET_ENTRY_btree_root: {
2ded276b
KO
679 struct btree_root *r;
680
681 if (entry->btree_id >= BTREE_ID_NR) {
682 bch_err(c, "filesystem has unknown btree type %u",
683 entry->btree_id);
684 return -EINVAL;
685 }
686
687 r = &c->btree_roots[entry->btree_id];
42b72e0b
KO
688
689 if (entry->u64s) {
690 r->level = entry->level;
691 bkey_copy(&r->key, &entry->start[0]);
692 r->error = 0;
693 } else {
694 r->error = -EIO;
695 }
696 r->alive = true;
697 break;
698 }
699 case BCH_JSET_ENTRY_usage: {
700 struct jset_entry_usage *u =
701 container_of(entry, struct jset_entry_usage, entry);
702
3577df5f
KO
703 switch (entry->btree_id) {
704 case FS_USAGE_RESERVED:
705 if (entry->level < BCH_REPLICAS_MAX)
5e82a9a1
KO
706 c->usage_base->persistent_reserved[entry->level] =
707 le64_to_cpu(u->v);
42b72e0b
KO
708 break;
709 case FS_USAGE_INODES:
5e82a9a1 710 c->usage_base->nr_inodes = le64_to_cpu(u->v);
42b72e0b
KO
711 break;
712 case FS_USAGE_KEY_VERSION:
713 atomic64_set(&c->key_version,
3577df5f 714 le64_to_cpu(u->v));
42b72e0b
KO
715 break;
716 }
717
718 break;
719 }
3577df5f
KO
720 case BCH_JSET_ENTRY_data_usage: {
721 struct jset_entry_data_usage *u =
722 container_of(entry, struct jset_entry_data_usage, entry);
180fb49d 723
3577df5f
KO
724 ret = bch2_replicas_set_usage(c, &u->r,
725 le64_to_cpu(u->v));
726 break;
727 }
180fb49d
KO
728 case BCH_JSET_ENTRY_dev_usage: {
729 struct jset_entry_dev_usage *u =
730 container_of(entry, struct jset_entry_dev_usage, entry);
731 struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev);
732 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
733 unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
734 sizeof(struct jset_entry_dev_usage_type);
735 unsigned i;
736
737 ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
738 ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
739
740 for (i = 0; i < nr_types; i++) {
741 ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
742 ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
743 ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
744 }
745
746 break;
747 }
1dd7f9d9
KO
748 case BCH_JSET_ENTRY_blacklist: {
749 struct jset_entry_blacklist *bl_entry =
750 container_of(entry, struct jset_entry_blacklist, entry);
751
752 ret = bch2_journal_seq_blacklist_add(c,
753 le64_to_cpu(bl_entry->seq),
754 le64_to_cpu(bl_entry->seq) + 1);
755 break;
756 }
757 case BCH_JSET_ENTRY_blacklist_v2: {
758 struct jset_entry_blacklist_v2 *bl_entry =
759 container_of(entry, struct jset_entry_blacklist_v2, entry);
760
761 ret = bch2_journal_seq_blacklist_add(c,
762 le64_to_cpu(bl_entry->start),
763 le64_to_cpu(bl_entry->end) + 1);
764 break;
765 }
2abe5420
KO
766 case BCH_JSET_ENTRY_clock: {
767 struct jset_entry_clock *clock =
768 container_of(entry, struct jset_entry_clock, entry);
769
770 atomic64_set(&c->io_clock[clock->rw].now, clock->time);
771 }
42b72e0b
KO
772 }
773
774 return ret;
775}
776
1dd7f9d9
KO
/*
 * Run journal_replay_entry_early() over either the superblock clean
 * section (clean shutdown) or every non-ignored journal entry (unclean
 * shutdown), then initialize the in-memory usage totals.
 */
static int journal_replay_early(struct bch_fs *c,
				struct bch_sb_field_clean *clean,
				struct list_head *journal)
{
	struct journal_replay *i;
	struct jset_entry *entry;
	int ret;

	if (clean) {
		for (entry = clean->start;
		     entry != vstruct_end(&clean->field);
		     entry = vstruct_next(entry)) {
			ret = journal_replay_entry_early(c, entry);
			if (ret)
				return ret;
		}
	} else {
		list_for_each_entry(i, journal, list) {
			if (i->ignore)
				continue;

			vstruct_for_each(&i->j, entry) {
				ret = journal_replay_entry_early(c, entry);
				if (ret)
					return ret;
			}
		}
	}

	bch2_fs_usage_initialize(c);

	return 0;
}
810
644d180b
KO
811/* sb clean section: */
812
813static struct bkey_i *btree_root_find(struct bch_fs *c,
814 struct bch_sb_field_clean *clean,
815 struct jset *j,
816 enum btree_id id, unsigned *level)
817{
818 struct bkey_i *k;
819 struct jset_entry *entry, *start, *end;
820
821 if (clean) {
822 start = clean->start;
823 end = vstruct_end(&clean->field);
824 } else {
825 start = j->start;
826 end = vstruct_last(j);
827 }
828
829 for (entry = start; entry < end; entry = vstruct_next(entry))
830 if (entry->type == BCH_JSET_ENTRY_btree_root &&
831 entry->btree_id == id)
832 goto found;
833
834 return NULL;
835found:
836 if (!entry->u64s)
837 return ERR_PTR(-EINVAL);
838
839 k = entry->start;
840 *level = entry->level;
841 return k;
842}
843
/*
 * After a clean shutdown, cross-check the superblock clean section against
 * the last journal entry: sequence numbers and every btree root must match.
 * On a sequence mismatch the clean section is discarded (*cleanp freed and
 * NULLed) so recovery proceeds from the journal instead.
 */
static int verify_superblock_clean(struct bch_fs *c,
				   struct bch_sb_field_clean **cleanp,
				   struct jset *j)
{
	unsigned i;
	struct bch_sb_field_clean *clean = *cleanp;
	int ret = 0;

	/* Equality of __le64 values is endian-invariant, no conversion needed: */
	if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
			"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
			le64_to_cpu(clean->journal_seq),
			le64_to_cpu(j->seq))) {
		kfree(clean);
		*cleanp = NULL;
		return 0;
	}

	for (i = 0; i < BTREE_ID_NR; i++) {
		char buf1[200], buf2[200];
		struct bkey_i *k1, *k2;
		unsigned l1 = 0, l2 = 0;

		k1 = btree_root_find(c, clean, NULL, i, &l1);
		k2 = btree_root_find(c, NULL, j, i, &l2);

		if (!k1 && !k2)
			continue;

		/* Roots must exist in both and be bitwise identical: */
		mustfix_fsck_err_on(!k1 || !k2 ||
				    IS_ERR(k1) ||
				    IS_ERR(k2) ||
				    k1->k.u64s != k2->k.u64s ||
				    memcmp(k1, k2, bkey_bytes(k1)) ||
				    l1 != l2, c,
			"superblock btree root %u doesn't match journal after clean shutdown\n"
			"sb: l=%u %s\n"
			"journal: l=%u %s\n", i,
			l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1),
			l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2));
	}
fsck_err:
	return ret;
}
887
/*
 * Read the superblock clean section, returning a kmemdup'd copy (caller
 * frees), NULL if the section is missing (in which case the superblock is
 * re-marked unclean), or an ERR_PTR on failure.
 */
static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
{
	struct bch_sb_field_clean *clean, *sb_clean;
	int ret;

	mutex_lock(&c->sb_lock);
	sb_clean = bch2_sb_get_clean(c->disk_sb.sb);

	if (fsck_err_on(!sb_clean, c,
			"superblock marked clean but clean section not present")) {
		SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
		c->sb.clean = false;
		mutex_unlock(&c->sb_lock);
		return NULL;
	}

	clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
			GFP_KERNEL);
	if (!clean) {
		mutex_unlock(&c->sb_lock);
		return ERR_PTR(-ENOMEM);
	}

	/* Superblocks before bkey_renumber use the old bkey type numbering: */
	if (le16_to_cpu(c->disk_sb.sb->version) <
	    bcachefs_metadata_version_bkey_renumber)
		bch2_sb_clean_renumber(clean, READ);

	mutex_unlock(&c->sb_lock);

	return clean;
fsck_err:
	mutex_unlock(&c->sb_lock);
	return ERR_PTR(ret);
}
922
7b512638
KO
/*
 * Read every btree root recorded by journal_replay_entry_early().  Problems
 * with the alloc btree root are downgraded to FSCK_CAN_IGNORE (and the
 * alloc_info compat bit is cleared so the info gets rebuilt); any other bad
 * root is a hard fsck error.  Btrees with no root get a fresh empty one.
 */
static int read_btree_roots(struct bch_fs *c)
{
	unsigned i;
	int ret = 0;

	for (i = 0; i < BTREE_ID_NR; i++) {
		struct btree_root *r = &c->btree_roots[i];

		if (!r->alive)
			continue;

		/* -o reconstruct_alloc: deliberately skip the alloc root: */
		if (i == BTREE_ID_ALLOC &&
		    c->opts.reconstruct_alloc) {
			c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
			continue;
		}

		if (r->error) {
			__fsck_err(c, i == BTREE_ID_ALLOC
				   ? FSCK_CAN_IGNORE : 0,
				   "invalid btree root %s",
				   bch2_btree_ids[i]);
			if (i == BTREE_ID_ALLOC)
				c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
		}

		ret = bch2_btree_root_read(c, i, &r->key, r->level);
		if (ret) {
			__fsck_err(c, i == BTREE_ID_ALLOC
				   ? FSCK_CAN_IGNORE : 0,
				   "error reading btree root %s",
				   bch2_btree_ids[i]);
			if (i == BTREE_ID_ALLOC)
				c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
		}
	}

	for (i = 0; i < BTREE_ID_NR; i++)
		if (!c->btree_roots[i].b)
			bch2_btree_root_alloc(c, i);
fsck_err:
	return ret;
}
966
1c6fdbd8
KO
967int bch2_fs_recovery(struct bch_fs *c)
968{
969 const char *err = "cannot allocate memory";
1dd7f9d9 970 struct bch_sb_field_clean *clean = NULL;
adbcada4
KO
971 struct jset *last_journal_entry = NULL;
972 u64 blacklist_seq, journal_seq;
4291a331 973 bool write_sb = false;
1c6fdbd8
KO
974 int ret;
975
1dd7f9d9
KO
976 if (c->sb.clean)
977 clean = read_superblock_clean(c);
978 ret = PTR_ERR_OR_ZERO(clean);
979 if (ret)
980 goto err;
981
982 if (c->sb.clean)
1c6fdbd8
KO
983 bch_info(c, "recovering from clean shutdown, journal seq %llu",
984 le64_to_cpu(clean->journal_seq));
985
5d428c7c
KO
986 if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
987 bch_info(c, "alloc_v2 feature bit not set, fsck required");
988 c->opts.fsck = true;
989 c->opts.fix_errors = FSCK_OPT_YES;
990 c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_alloc_v2;
991 }
992
f621e152
KO
993 if (!c->replicas.entries ||
994 c->opts.rebuild_replicas) {
1dd7f9d9
KO
995 bch_info(c, "building replicas info");
996 set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
997 }
998
adbcada4
KO
999 ret = bch2_blacklist_table_initialize(c);
1000 if (ret) {
1001 bch_err(c, "error initializing blacklist table");
1002 goto err;
1003 }
1004
5a655f06 1005 if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
adbcada4 1006 struct journal_replay *i;
1dd7f9d9 1007
adbcada4
KO
1008 ret = bch2_journal_read(c, &c->journal_entries,
1009 &blacklist_seq, &journal_seq);
1c6fdbd8
KO
1010 if (ret)
1011 goto err;
1012
adbcada4
KO
1013 list_for_each_entry_reverse(i, &c->journal_entries, list)
1014 if (!i->ignore) {
1015 last_journal_entry = &i->j;
1016 break;
1017 }
1018
1019 if (mustfix_fsck_err_on(c->sb.clean &&
1020 last_journal_entry &&
1021 !journal_entry_empty(last_journal_entry), c,
932aa837 1022 "filesystem marked clean but journal not empty")) {
19dd3172 1023 c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
932aa837
KO
1024 SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
1025 c->sb.clean = false;
1026 }
1dd7f9d9 1027
adbcada4
KO
1028 if (!last_journal_entry) {
1029 fsck_err_on(!c->sb.clean, c, "no journal entries found");
1030 goto use_clean;
1dd7f9d9
KO
1031 }
1032
f1d786a0
KO
1033 c->journal_keys = journal_keys_sort(&c->journal_entries);
1034 if (!c->journal_keys.d) {
d0734356
KO
1035 ret = -ENOMEM;
1036 goto err;
1037 }
1038
adbcada4
KO
1039 if (c->sb.clean && last_journal_entry) {
1040 ret = verify_superblock_clean(c, &clean,
1041 last_journal_entry);
1042 if (ret)
1043 goto err;
1044 }
1045 } else {
1046use_clean:
1047 if (!clean) {
1048 bch_err(c, "no superblock clean section found");
1049 ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
7b512638 1050 goto err;
1dd7f9d9 1051
adbcada4
KO
1052 }
1053 blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
1dd7f9d9
KO
1054 }
1055
e3e464ac
KO
1056 if (!c->sb.clean &&
1057 !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
1058 bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
1059 ret = -EINVAL;
1060 goto err;
1061 }
1062
33114c2d 1063 if (c->opts.reconstruct_alloc) {
19dd3172 1064 c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
33114c2d
KO
1065 drop_alloc_keys(&c->journal_keys);
1066 }
1067
f1d786a0 1068 ret = journal_replay_early(c, clean, &c->journal_entries);
1dd7f9d9
KO
1069 if (ret)
1070 goto err;
1071
adbcada4
KO
1072 /*
1073 * After an unclean shutdown, skip then next few journal sequence
1074 * numbers as they may have been referenced by btree writes that
1075 * happened before their corresponding journal writes - those btree
1076 * writes need to be ignored, by skipping and blacklisting the next few
1077 * journal sequence numbers:
1078 */
1079 if (!c->sb.clean)
1080 journal_seq += 8;
1081
1082 if (blacklist_seq != journal_seq) {
1dd7f9d9 1083 ret = bch2_journal_seq_blacklist_add(c,
adbcada4 1084 blacklist_seq, journal_seq);
1dd7f9d9
KO
1085 if (ret) {
1086 bch_err(c, "error creating new journal seq blacklist entry");
7b512638 1087 goto err;
1dd7f9d9 1088 }
f707e3d8 1089 }
1c6fdbd8 1090
d0734356 1091 ret = bch2_fs_journal_start(&c->journal, journal_seq,
f1d786a0 1092 &c->journal_entries);
7b512638
KO
1093 if (ret)
1094 goto err;
1c6fdbd8 1095
7b512638
KO
1096 ret = read_btree_roots(c);
1097 if (ret)
1098 goto err;
1c6fdbd8 1099
932aa837 1100 bch_verbose(c, "starting alloc read");
1c6fdbd8 1101 err = "error reading allocation information";
f1d786a0 1102 ret = bch2_alloc_read(c, &c->journal_keys);
1c6fdbd8
KO
1103 if (ret)
1104 goto err;
932aa837 1105 bch_verbose(c, "alloc read done");
1c6fdbd8 1106
94cd106f 1107 bch_verbose(c, "starting stripes_read");
932aa837 1108 err = "error reading stripes";
f1d786a0 1109 ret = bch2_stripes_read(c, &c->journal_keys);
4e65431c
KO
1110 if (ret)
1111 goto err;
94cd106f 1112 bch_verbose(c, "stripes_read done");
61c8d7c8
KO
1113
1114 set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
4e65431c 1115
7b512638 1116 if (c->opts.fsck ||
19dd3172
KO
1117 !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
1118 !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
7b512638 1119 test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
41e37786
KO
1120 bool metadata_only = c->opts.norecovery;
1121
619f5bee 1122 bch_info(c, "starting mark and sweep");
932aa837 1123 err = "error in mark and sweep";
41e37786 1124 ret = bch2_gc(c, true, metadata_only);
8d6b6222 1125 if (ret)
4291a331 1126 goto err;
1df42b57
KO
1127 bch_verbose(c, "mark and sweep done");
1128 }
1c6fdbd8 1129
ac958006
KO
1130 bch2_stripes_heap_start(c);
1131
72644db1 1132 clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
1df42b57 1133 set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
72644db1 1134
42b72e0b
KO
1135 /*
1136 * Skip past versions that might have possibly been used (as nonces),
1137 * but hadn't had their pointers written:
1138 */
1139 if (c->sb.encryption_type && !c->sb.clean)
1140 atomic64_add(1 << 16, &c->key_version);
1141
619f5bee 1142 if (c->opts.norecovery)
7b512638
KO
1143 goto out;
1144
619f5bee 1145 bch_verbose(c, "starting journal replay");
1c6fdbd8 1146 err = "journal replay failed";
f1d786a0 1147 ret = bch2_journal_replay(c, c->journal_keys);
1c6fdbd8
KO
1148 if (ret)
1149 goto err;
1150 bch_verbose(c, "journal replay done");
1151
4291a331
KO
1152 if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
1153 !c->opts.nochanges) {
619f5bee
KO
1154 /*
1155 * note that even when filesystem was clean there might be work
1156 * to do here, if we ran gc (because of fsck) which recalculated
1157 * oldest_gen:
1158 */
1159 bch_verbose(c, "writing allocation info");
1160 err = "error writing out alloc info";
8d6b6222
KO
1161 ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?:
1162 bch2_alloc_write(c, BTREE_INSERT_LAZY_RW);
619f5bee
KO
1163 if (ret) {
1164 bch_err(c, "error writing alloc info");
1165 goto err;
1166 }
1167 bch_verbose(c, "alloc write done");
932aa837 1168 }
932aa837 1169
619f5bee 1170 if (!c->sb.clean) {
1c3ff72c 1171 if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) {
619f5bee
KO
1172 bch_info(c, "checking inode link counts");
1173 err = "error in recovery";
1174 ret = bch2_fsck_inode_nlink(c);
1175 if (ret)
1176 goto err;
1177 bch_verbose(c, "check inodes done");
1c6fdbd8 1178
619f5bee
KO
1179 } else {
1180 bch_verbose(c, "checking for deleted inodes");
1181 err = "error in recovery";
1182 ret = bch2_fsck_walk_inodes_only(c);
1183 if (ret)
1184 goto err;
1185 bch_verbose(c, "check inodes done");
1186 }
1187 }
1188
1189 if (c->opts.fsck) {
1190 bch_info(c, "starting fsck");
1191 err = "error in fsck";
1192 ret = bch2_fsck_full(c);
1193 if (ret)
1194 goto err;
1195 bch_verbose(c, "fsck done");
1196 }
1c6fdbd8 1197
7b512638 1198 if (enabled_qtypes(c)) {
619f5bee 1199 bch_verbose(c, "reading quotas");
7b512638
KO
1200 ret = bch2_fs_quota_read(c);
1201 if (ret)
1202 goto err;
1203 bch_verbose(c, "quotas done");
1204 }
1205
19dd3172
KO
1206 if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
1207 !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
a4805d66
KO
1208 struct bch_move_stats stats = { 0 };
1209
19dd3172 1210 bch_info(c, "scanning for old btree nodes");
a4805d66
KO
1211 ret = bch2_fs_read_write(c);
1212 if (ret)
1213 goto err;
1214
1215 ret = bch2_scan_old_btree_nodes(c, &stats);
1216 if (ret)
1217 goto err;
19dd3172 1218 bch_info(c, "scanning for old btree nodes done");
a4805d66
KO
1219 }
1220
26609b61
KO
1221 mutex_lock(&c->sb_lock);
1222 if (c->opts.version_upgrade) {
1223 if (c->sb.version < bcachefs_metadata_version_new_versioning)
1224 c->disk_sb.sb->version_min =
1225 le16_to_cpu(bcachefs_metadata_version_min);
1226 c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
b807a0c8 1227 c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
932aa837
KO
1228 write_sb = true;
1229 }
1230
1231 if (!test_bit(BCH_FS_ERROR, &c->flags)) {
19dd3172 1232 c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
932aa837 1233 write_sb = true;
88c07f73
KO
1234 }
1235
0bc166ff
KO
1236 if (c->opts.fsck &&
1237 !test_bit(BCH_FS_ERROR, &c->flags)) {
1c3ff72c 1238 c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink;
0bc166ff 1239 SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
932aa837 1240 write_sb = true;
0bc166ff 1241 }
932aa837
KO
1242
1243 if (write_sb)
1244 bch2_write_super(c);
26609b61 1245 mutex_unlock(&c->sb_lock);
1dd7f9d9
KO
1246
1247 if (c->journal_seq_blacklist_table &&
1248 c->journal_seq_blacklist_table->nr > 128)
1249 queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
1c6fdbd8 1250out:
619f5bee
KO
1251 ret = 0;
1252err:
1253fsck_err:
89b05118 1254 set_bit(BCH_FS_FSCK_DONE, &c->flags);
619f5bee 1255 bch2_flush_fsck_errs(c);
89b05118 1256
f1d786a0
KO
1257 if (!c->opts.keep_journal) {
1258 bch2_journal_keys_free(&c->journal_keys);
1259 bch2_journal_entries_free(&c->journal_entries);
1260 }
1c6fdbd8 1261 kfree(clean);
619f5bee
KO
1262 if (ret)
1263 bch_err(c, "Error in recovery: %s (%i)", err, ret);
1264 else
1265 bch_verbose(c, "ret %i", ret);
1c6fdbd8 1266 return ret;
1c6fdbd8
KO
1267}
1268
1269int bch2_fs_initialize(struct bch_fs *c)
1270{
1271 struct bch_inode_unpacked root_inode, lostfound_inode;
1272 struct bkey_inode_buf packed_inode;
1c6fdbd8
KO
1273 struct qstr lostfound = QSTR("lost+found");
1274 const char *err = "cannot allocate memory";
1275 struct bch_dev *ca;
1276 LIST_HEAD(journal);
1277 unsigned i;
1278 int ret;
1279
1280 bch_notice(c, "initializing new filesystem");
1281
3e0745e2
KO
1282 mutex_lock(&c->sb_lock);
1283 for_each_online_member(ca, c, i)
1284 bch2_mark_dev_superblock(c, ca, 0);
1285 mutex_unlock(&c->sb_lock);
1286
61fc3c96
KO
1287 mutex_lock(&c->sb_lock);
1288 c->disk_sb.sb->version = c->disk_sb.sb->version_min =
1289 le16_to_cpu(bcachefs_metadata_version_current);
1290 c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink;
1291 c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
19dd3172
KO
1292 c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
1293 c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
61fc3c96
KO
1294
1295 bch2_write_super(c);
1296 mutex_unlock(&c->sb_lock);
1297
1c6fdbd8 1298 set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
f7e76361 1299 set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
1c6fdbd8 1300
dfe9bfb3
KO
1301 for (i = 0; i < BTREE_ID_NR; i++)
1302 bch2_btree_root_alloc(c, i);
1303
5d20ba48
KO
1304 set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
1305 set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
1306
1c6fdbd8 1307 err = "unable to allocate journal buckets";
1633e492
KO
1308 for_each_online_member(ca, c, i) {
1309 ret = bch2_dev_journal_alloc(ca);
1310 if (ret) {
1c6fdbd8
KO
1311 percpu_ref_put(&ca->io_ref);
1312 goto err;
1313 }
1633e492 1314 }
1c6fdbd8 1315
1c6fdbd8
KO
1316 /*
1317 * journal_res_get() will crash if called before this has
1318 * set up the journal.pin FIFO and journal.cur pointer:
1319 */
1dd7f9d9 1320 bch2_fs_journal_start(&c->journal, 1, &journal);
1c6fdbd8
KO
1321 bch2_journal_set_replay_done(&c->journal);
1322
8d6b6222
KO
1323 err = "error going read-write";
1324 ret = bch2_fs_read_write_early(c);
1325 if (ret)
1326 goto err;
1327
1328 /*
1329 * Write out the superblock and journal buckets, now that we can do
1330 * btree updates
1331 */
1332 err = "error writing alloc info";
1333 ret = bch2_alloc_write(c, 0);
1334 if (ret)
1335 goto err;
1336
1c6fdbd8
KO
1337 bch2_inode_init(c, &root_inode, 0, 0,
1338 S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
1339 root_inode.bi_inum = BCACHEFS_ROOT_INO;
a3e72262 1340 bch2_inode_pack(c, &packed_inode, &root_inode);
1c6fdbd8
KO
1341
1342 err = "error creating root directory";
1343 ret = bch2_btree_insert(c, BTREE_ID_INODES,
1344 &packed_inode.inode.k_i,
8d6b6222 1345 NULL, NULL, 0);
1c6fdbd8
KO
1346 if (ret)
1347 goto err;
1348
96385742 1349 bch2_inode_init_early(c, &lostfound_inode);
1c6fdbd8
KO
1350
1351 err = "error creating lost+found";
58e2388f 1352 ret = bch2_trans_do(c, NULL, NULL, 0,
96385742
KO
1353 bch2_create_trans(&trans, BCACHEFS_ROOT_INO,
1354 &root_inode, &lostfound_inode,
1355 &lostfound,
b627c7d8 1356 0, 0, S_IFDIR|0700, 0,
96385742 1357 NULL, NULL));
dab9ef0d
KO
1358 if (ret) {
1359 bch_err(c, "error creating lost+found");
1c6fdbd8 1360 goto err;
dab9ef0d 1361 }
1c6fdbd8 1362
1c6fdbd8
KO
1363 if (enabled_qtypes(c)) {
1364 ret = bch2_fs_quota_read(c);
1365 if (ret)
1366 goto err;
1367 }
1368
1369 err = "error writing first journal entry";
1370 ret = bch2_journal_meta(&c->journal);
1371 if (ret)
1372 goto err;
1373
1374 mutex_lock(&c->sb_lock);
1375 SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
1376 SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
1377
1378 bch2_write_super(c);
1379 mutex_unlock(&c->sb_lock);
1380
1381 return 0;
1382err:
8b335bae 1383 pr_err("Error initializing new filesystem: %s (%i)", err, ret);
1c6fdbd8
KO
1384 return ret;
1385}