bcachefs: Assorted fixes for running on very small devices
[linux-block.git] / fs / bcachefs / recovery.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "alloc_background.h"
5 #include "btree_gc.h"
6 #include "btree_update.h"
7 #include "btree_update_interior.h"
8 #include "btree_io.h"
9 #include "dirent.h"
10 #include "error.h"
11 #include "fsck.h"
12 #include "journal_io.h"
13 #include "quota.h"
14 #include "recovery.h"
15 #include "super-io.h"
16
17 #include <linux/stat.h>
18
/*
 * QSTR(n) - brace initializer built from the string @n: the innermost member
 * .len is set to strlen(n) and .name to @n; members not named here get the
 * usual zero initialization.  Used in this file to initialize a struct qstr.
 * Note that @n appears twice in the expansion, so it should be free of side
 * effects.
 */
19 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
20
21 struct bkey_i *btree_root_find(struct bch_fs *c,
22                                struct bch_sb_field_clean *clean,
23                                struct jset *j,
24                                enum btree_id id, unsigned *level)
25 {
26         struct bkey_i *k;
27         struct jset_entry *entry, *start, *end;
28
29         if (clean) {
30                 start = clean->start;
31                 end = vstruct_end(&clean->field);
32         } else {
33                 start = j->start;
34                 end = vstruct_last(j);
35         }
36
37         for (entry = start; entry < end; entry = vstruct_next(entry))
38                 if (entry->type == BCH_JSET_ENTRY_btree_root &&
39                     entry->btree_id == id)
40                         goto found;
41
42         return NULL;
43 found:
44         if (!entry->u64s)
45                 return ERR_PTR(-EINVAL);
46
47         k = entry->start;
48         *level = entry->level;
49         return k;
50 }
51
52 static int verify_superblock_clean(struct bch_fs *c,
53                                    struct bch_sb_field_clean *clean,
54                                    struct jset *j)
55 {
56         unsigned i;
57         int ret = 0;
58
59         if (!clean || !j)
60                 return 0;
61
62         if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
63                         "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
64                         le64_to_cpu(clean->journal_seq),
65                         le64_to_cpu(j->seq)))
66                 bch2_fs_mark_clean(c, false);
67
68         mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
69                         "superblock read clock doesn't match journal after clean shutdown");
70         mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
71                         "superblock read clock doesn't match journal after clean shutdown");
72
73         for (i = 0; i < BTREE_ID_NR; i++) {
74                 struct bkey_i *k1, *k2;
75                 unsigned l1 = 0, l2 = 0;
76
77                 k1 = btree_root_find(c, clean, NULL, i, &l1);
78                 k2 = btree_root_find(c, NULL, j, i, &l2);
79
80                 if (!k1 && !k2)
81                         continue;
82
83                 mustfix_fsck_err_on(!k1 || !k2 ||
84                                     IS_ERR(k1) ||
85                                     IS_ERR(k2) ||
86                                     k1->k.u64s != k2->k.u64s ||
87                                     memcmp(k1, k2, bkey_bytes(k1)) ||
88                                     l1 != l2, c,
89                         "superblock btree root doesn't match journal after clean shutdown");
90         }
91 fsck_err:
92         return ret;
93 }
94
95 static bool journal_empty(struct list_head *journal)
96 {
97         struct journal_replay *i;
98         struct jset_entry *entry;
99
100         if (list_empty(journal))
101                 return true;
102
103         i = list_last_entry(journal, struct journal_replay, list);
104
105         if (i->j.last_seq != i->j.seq)
106                 return false;
107
108         list_for_each_entry(i, journal, list) {
109                 vstruct_for_each(&i->j, entry) {
110                         if (entry->type == BCH_JSET_ENTRY_btree_root)
111                                 continue;
112
113                         if (entry->type == BCH_JSET_ENTRY_btree_keys &&
114                             !entry->u64s)
115                                 continue;
116                         return false;
117                 }
118         }
119
120         return true;
121 }
122
123 int bch2_fs_recovery(struct bch_fs *c)
124 {
125         const char *err = "cannot allocate memory";
126         struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
127         LIST_HEAD(journal);
128         struct jset *j = NULL;
129         unsigned i;
130         int ret;
131
132         mutex_lock(&c->sb_lock);
133         if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
134                 bch_info(c, "building replicas info");
135                 set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
136         }
137
138         if (c->sb.clean)
139                 sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
140         if (sb_clean) {
141                 clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
142                                 GFP_KERNEL);
143                 if (!clean) {
144                         ret = -ENOMEM;
145                         mutex_unlock(&c->sb_lock);
146                         goto err;
147                 }
148         }
149         mutex_unlock(&c->sb_lock);
150
151         if (clean)
152                 bch_info(c, "recovering from clean shutdown, journal seq %llu",
153                          le64_to_cpu(clean->journal_seq));
154
155         if (!clean || c->opts.fsck) {
156                 ret = bch2_journal_read(c, &journal);
157                 if (ret)
158                         goto err;
159
160                 j = &list_entry(journal.prev, struct journal_replay, list)->j;
161         } else {
162                 ret = bch2_journal_set_seq(c,
163                                            le64_to_cpu(clean->journal_seq),
164                                            le64_to_cpu(clean->journal_seq));
165                 BUG_ON(ret);
166         }
167
168         ret = verify_superblock_clean(c, clean, j);
169         if (ret)
170                 goto err;
171
172         fsck_err_on(clean && !journal_empty(&journal), c,
173                     "filesystem marked clean but journal not empty");
174
175         if (clean) {
176                 c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
177                 c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
178         } else {
179                 c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
180                 c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
181         }
182
183         for (i = 0; i < BTREE_ID_NR; i++) {
184                 unsigned level;
185                 struct bkey_i *k;
186
187                 k = btree_root_find(c, clean, j, i, &level);
188                 if (!k)
189                         continue;
190
191                 err = "invalid btree root pointer";
192                 if (IS_ERR(k))
193                         goto err;
194
195                 err = "error reading btree root";
196                 if (bch2_btree_root_read(c, i, k, level)) {
197                         if (i != BTREE_ID_ALLOC)
198                                 goto err;
199
200                         mustfix_fsck_err(c, "error reading btree root");
201                 }
202         }
203
204         for (i = 0; i < BTREE_ID_NR; i++)
205                 if (!c->btree_roots[i].b)
206                         bch2_btree_root_alloc(c, i);
207
208         err = "error reading allocation information";
209         ret = bch2_alloc_read(c, &journal);
210         if (ret)
211                 goto err;
212
213         set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
214
215         bch_verbose(c, "starting mark and sweep:");
216         err = "error in recovery";
217         ret = bch2_initial_gc(c, &journal);
218         if (ret)
219                 goto err;
220         bch_verbose(c, "mark and sweep done");
221
222         clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
223
224         if (c->opts.noreplay)
225                 goto out;
226
227         /*
228          * Mark dirty before journal replay, fsck:
229          * XXX: after a clean shutdown, this could be done lazily only when fsck
230          * finds an error
231          */
232         bch2_fs_mark_clean(c, false);
233
234         /*
235          * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
236          * will give spurious errors about oldest_gen > bucket_gen -
237          * this is a hack but oh well.
238          */
239         bch2_fs_journal_start(&c->journal);
240
241         err = "error starting allocator";
242         ret = bch2_fs_allocator_start(c);
243         if (ret)
244                 goto err;
245
246         bch_verbose(c, "starting journal replay:");
247         err = "journal replay failed";
248         ret = bch2_journal_replay(c, &journal);
249         if (ret)
250                 goto err;
251         bch_verbose(c, "journal replay done");
252
253         if (c->opts.norecovery)
254                 goto out;
255
256         err = "error in fsck";
257         ret = bch2_fsck(c);
258         if (ret)
259                 goto err;
260
261         if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags)) {
262                 mutex_lock(&c->sb_lock);
263                 c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
264                 mutex_unlock(&c->sb_lock);
265         }
266
267         if (enabled_qtypes(c)) {
268                 bch_verbose(c, "reading quotas:");
269                 ret = bch2_fs_quota_read(c);
270                 if (ret)
271                         goto err;
272                 bch_verbose(c, "quotas done");
273         }
274
275 out:
276         bch2_journal_entries_free(&journal);
277         kfree(clean);
278         return ret;
279 err:
280 fsck_err:
281         pr_err("Error in recovery: %s (%i)", err, ret);
282         goto out;
283 }
284
285 int bch2_fs_initialize(struct bch_fs *c)
286 {
287         struct bch_inode_unpacked root_inode, lostfound_inode;
288         struct bkey_inode_buf packed_inode;
289         struct bch_hash_info root_hash_info;
290         struct qstr lostfound = QSTR("lost+found");
291         const char *err = "cannot allocate memory";
292         struct bch_dev *ca;
293         LIST_HEAD(journal);
294         unsigned i;
295         int ret;
296
297         bch_notice(c, "initializing new filesystem");
298
299         set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
300
301         ret = bch2_initial_gc(c, &journal);
302         if (ret)
303                 goto err;
304
305         err = "unable to allocate journal buckets";
306         for_each_online_member(ca, c, i)
307                 if (bch2_dev_journal_alloc(ca)) {
308                         percpu_ref_put(&ca->io_ref);
309                         goto err;
310                 }
311
312         for (i = 0; i < BTREE_ID_NR; i++)
313                 bch2_btree_root_alloc(c, i);
314
315         /*
316          * journal_res_get() will crash if called before this has
317          * set up the journal.pin FIFO and journal.cur pointer:
318          */
319         bch2_fs_journal_start(&c->journal);
320         bch2_journal_set_replay_done(&c->journal);
321
322         err = "error starting allocator";
323         ret = bch2_fs_allocator_start(c);
324         if (ret)
325                 goto err;
326
327         bch2_inode_init(c, &root_inode, 0, 0,
328                         S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
329         root_inode.bi_inum = BCACHEFS_ROOT_INO;
330         root_inode.bi_nlink++; /* lost+found */
331         bch2_inode_pack(&packed_inode, &root_inode);
332
333         err = "error creating root directory";
334         ret = bch2_btree_insert(c, BTREE_ID_INODES,
335                                 &packed_inode.inode.k_i,
336                                 NULL, NULL, 0);
337         if (ret)
338                 goto err;
339
340         bch2_inode_init(c, &lostfound_inode, 0, 0,
341                         S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0,
342                         &root_inode);
343         lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1;
344         bch2_inode_pack(&packed_inode, &lostfound_inode);
345
346         err = "error creating lost+found";
347         ret = bch2_btree_insert(c, BTREE_ID_INODES,
348                                 &packed_inode.inode.k_i,
349                                 NULL, NULL, 0);
350         if (ret)
351                 goto err;
352
353         root_hash_info = bch2_hash_info_init(c, &root_inode);
354
355         ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
356                                  &lostfound, lostfound_inode.bi_inum, NULL,
357                                  BTREE_INSERT_NOFAIL);
358         if (ret)
359                 goto err;
360
361         atomic_long_set(&c->nr_inodes, 2);
362
363         if (enabled_qtypes(c)) {
364                 ret = bch2_fs_quota_read(c);
365                 if (ret)
366                         goto err;
367         }
368
369         err = "error writing first journal entry";
370         ret = bch2_journal_meta(&c->journal);
371         if (ret)
372                 goto err;
373
374         mutex_lock(&c->sb_lock);
375         SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
376         SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
377         c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
378
379         bch2_write_super(c);
380         mutex_unlock(&c->sb_lock);
381
382         return 0;
383 err:
384         pr_err("Error initializing new filesystem: %s (%i)", err, ret);
385         return ret;
386 }