Commit | Line | Data |
---|---|---|
1c6fdbd8 KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include "bcachefs.h" | |
7b3f84ea | 4 | #include "alloc_background.h" |
1c6fdbd8 KO |
5 | #include "btree_gc.h" |
6 | #include "btree_update.h" | |
7 | #include "btree_update_interior.h" | |
8 | #include "btree_io.h" | |
3e0745e2 | 9 | #include "buckets.h" |
1c6fdbd8 | 10 | #include "dirent.h" |
cd575ddf | 11 | #include "ec.h" |
1c6fdbd8 | 12 | #include "error.h" |
96385742 | 13 | #include "fs-common.h" |
1c6fdbd8 KO |
14 | #include "fsck.h" |
15 | #include "journal_io.h" | |
644d180b | 16 | #include "journal_reclaim.h" |
1dd7f9d9 | 17 | #include "journal_seq_blacklist.h" |
1c6fdbd8 KO |
18 | #include "quota.h" |
19 | #include "recovery.h" | |
42b72e0b | 20 | #include "replicas.h" |
1c6fdbd8 KO |
21 | #include "super-io.h" |
22 | ||
644d180b | 23 | #include <linux/sort.h> |
1c6fdbd8 KO |
24 | #include <linux/stat.h> |
25 | ||
26 | #define QSTR(n) { { { .len = strlen(n) } }, .name = n } | |
27 | ||
e222d206 KO |
28 | /* iterate over keys read from the journal: */ |
29 | ||
e62d65f2 KO |
30 | static struct journal_key *journal_key_search(struct journal_keys *journal_keys, |
31 | enum btree_id id, unsigned level, | |
32 | struct bpos pos) | |
e222d206 | 33 | { |
e62d65f2 | 34 | size_t l = 0, r = journal_keys->nr, m; |
e222d206 | 35 | |
e62d65f2 KO |
36 | while (l < r) { |
37 | m = l + ((r - l) >> 1); | |
38 | if ((cmp_int(id, journal_keys->d[m].btree_id) ?: | |
39 | cmp_int(level, journal_keys->d[m].level) ?: | |
40 | bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) | |
41 | l = m + 1; | |
42 | else | |
43 | r = m; | |
e222d206 KO |
44 | } |
45 | ||
e62d65f2 KO |
46 | BUG_ON(l < journal_keys->nr && |
47 | (cmp_int(id, journal_keys->d[l].btree_id) ?: | |
48 | cmp_int(level, journal_keys->d[l].level) ?: | |
49 | bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); | |
50 | ||
51 | BUG_ON(l && | |
52 | (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: | |
53 | cmp_int(level, journal_keys->d[l - 1].level) ?: | |
54 | bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); | |
55 | ||
56 | return l < journal_keys->nr ? journal_keys->d + l : NULL; | |
57 | } | |
58 | ||
59 | static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) | |
60 | { | |
61 | if (iter->k && | |
62 | iter->k < iter->keys->d + iter->keys->nr && | |
63 | iter->k->btree_id == iter->btree_id && | |
64 | iter->k->level == iter->level) | |
65 | return iter->k->k; | |
66 | ||
67 | iter->k = NULL; | |
68 | return NULL; | |
69 | } | |
70 | ||
71 | static void bch2_journal_iter_advance(struct journal_iter *iter) | |
72 | { | |
73 | if (iter->k) | |
74 | iter->k++; | |
e222d206 KO |
75 | } |
76 | ||
e62d65f2 KO |
77 | static void bch2_journal_iter_init(struct journal_iter *iter, |
78 | struct journal_keys *journal_keys, | |
79 | enum btree_id id, unsigned level, | |
80 | struct bpos pos) | |
e222d206 | 81 | { |
e62d65f2 KO |
82 | iter->btree_id = id; |
83 | iter->level = level; | |
84 | iter->keys = journal_keys; | |
85 | iter->k = journal_key_search(journal_keys, id, level, pos); | |
86 | } | |
e222d206 | 87 | |
e62d65f2 KO |
88 | static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) |
89 | { | |
90 | return iter->btree | |
91 | ? bch2_btree_iter_peek(iter->btree) | |
92 | : bch2_btree_node_iter_peek_unpack(&iter->node_iter, | |
93 | iter->b, &iter->unpacked); | |
94 | } | |
5c4a5cd5 | 95 | |
e62d65f2 KO |
96 | static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) |
97 | { | |
98 | if (iter->btree) | |
99 | bch2_btree_iter_next(iter->btree); | |
100 | else | |
101 | bch2_btree_node_iter_advance(&iter->node_iter, iter->b); | |
e222d206 KO |
102 | } |
103 | ||
5c4a5cd5 KO |
104 | void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) |
105 | { | |
106 | switch (iter->last) { | |
107 | case none: | |
108 | break; | |
109 | case btree: | |
e62d65f2 | 110 | bch2_journal_iter_advance_btree(iter); |
5c4a5cd5 KO |
111 | break; |
112 | case journal: | |
e62d65f2 | 113 | bch2_journal_iter_advance(&iter->journal); |
5c4a5cd5 KO |
114 | break; |
115 | } | |
116 | ||
117 | iter->last = none; | |
118 | } | |
119 | ||
120 | struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) | |
121 | { | |
122 | struct bkey_s_c ret; | |
123 | ||
124 | while (1) { | |
e62d65f2 KO |
125 | struct bkey_s_c btree_k = |
126 | bch2_journal_iter_peek_btree(iter); | |
127 | struct bkey_s_c journal_k = | |
128 | bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); | |
5c4a5cd5 KO |
129 | |
130 | if (btree_k.k && journal_k.k) { | |
131 | int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); | |
132 | ||
133 | if (!cmp) | |
e62d65f2 | 134 | bch2_journal_iter_advance_btree(iter); |
5c4a5cd5 KO |
135 | |
136 | iter->last = cmp < 0 ? btree : journal; | |
137 | } else if (btree_k.k) { | |
138 | iter->last = btree; | |
139 | } else if (journal_k.k) { | |
140 | iter->last = journal; | |
141 | } else { | |
142 | iter->last = none; | |
143 | return bkey_s_c_null; | |
144 | } | |
145 | ||
146 | ret = iter->last == journal ? journal_k : btree_k; | |
e62d65f2 KO |
147 | |
148 | if (iter->b && | |
149 | bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { | |
150 | iter->journal.k = NULL; | |
151 | iter->last = none; | |
152 | return bkey_s_c_null; | |
153 | } | |
154 | ||
5c4a5cd5 KO |
155 | if (!bkey_deleted(ret.k)) |
156 | break; | |
157 | ||
158 | bch2_btree_and_journal_iter_advance(iter); | |
159 | } | |
160 | ||
161 | return ret; | |
162 | } | |
163 | ||
164 | struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) | |
165 | { | |
166 | bch2_btree_and_journal_iter_advance(iter); | |
167 | ||
168 | return bch2_btree_and_journal_iter_peek(iter); | |
169 | } | |
170 | ||
5c4a5cd5 KO |
171 | void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, |
172 | struct btree_trans *trans, | |
173 | struct journal_keys *journal_keys, | |
174 | enum btree_id id, struct bpos pos) | |
175 | { | |
e62d65f2 | 176 | memset(iter, 0, sizeof(*iter)); |
5c4a5cd5 KO |
177 | |
178 | iter->btree = bch2_trans_get_iter(trans, id, pos, 0); | |
e62d65f2 KO |
179 | bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); |
180 | } | |
181 | ||
182 | void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, | |
183 | struct journal_keys *journal_keys, | |
184 | struct btree *b) | |
185 | { | |
186 | struct bpos start = b->data->min_key; | |
187 | ||
188 | if (btree_node_type_is_extents(b->c.btree_id)) | |
189 | start = bkey_successor(start); | |
190 | ||
191 | memset(iter, 0, sizeof(*iter)); | |
192 | ||
193 | iter->b = b; | |
194 | bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); | |
195 | bch2_journal_iter_init(&iter->journal, journal_keys, | |
196 | b->c.btree_id, b->c.level, start); | |
5c4a5cd5 KO |
197 | } |
198 | ||
d0734356 | 199 | /* sort and dedup all keys in the journal: */ |
644d180b | 200 | |
f1d786a0 | 201 | void bch2_journal_entries_free(struct list_head *list) |
1c6fdbd8 | 202 | { |
1c6fdbd8 | 203 | |
644d180b KO |
204 | while (!list_empty(list)) { |
205 | struct journal_replay *i = | |
206 | list_first_entry(list, struct journal_replay, list); | |
207 | list_del(&i->list); | |
208 | kvpfree(i, offsetof(struct journal_replay, j) + | |
209 | vstruct_bytes(&i->j)); | |
1c6fdbd8 | 210 | } |
644d180b | 211 | } |
1c6fdbd8 | 212 | |
e3e464ac KO |
213 | /* |
214 | * When keys compare equal, oldest compares first: | |
215 | */ | |
d0734356 KO |
216 | static int journal_sort_key_cmp(const void *_l, const void *_r) |
217 | { | |
218 | const struct journal_key *l = _l; | |
219 | const struct journal_key *r = _r; | |
220 | ||
e62d65f2 KO |
221 | return cmp_int(l->btree_id, r->btree_id) ?: |
222 | cmp_int(l->level, r->level) ?: | |
e3e464ac | 223 | bkey_cmp(l->k->k.p, r->k->k.p) ?: |
d0734356 KO |
224 | cmp_int(l->journal_seq, r->journal_seq) ?: |
225 | cmp_int(l->journal_offset, r->journal_offset); | |
226 | } | |
227 | ||
228 | static int journal_sort_seq_cmp(const void *_l, const void *_r) | |
229 | { | |
230 | const struct journal_key *l = _l; | |
231 | const struct journal_key *r = _r; | |
232 | ||
f44a6a71 KO |
233 | return cmp_int(r->level, l->level) ?: |
234 | cmp_int(l->journal_seq, r->journal_seq) ?: | |
e62d65f2 | 235 | cmp_int(l->btree_id, r->btree_id) ?: |
e62d65f2 | 236 | bkey_cmp(l->k->k.p, r->k->k.p); |
d0734356 KO |
237 | } |
238 | ||
f1d786a0 | 239 | void bch2_journal_keys_free(struct journal_keys *keys) |
d0734356 | 240 | { |
d0734356 KO |
241 | kvfree(keys->d); |
242 | keys->d = NULL; | |
243 | keys->nr = 0; | |
244 | } | |
245 | ||
246 | static struct journal_keys journal_keys_sort(struct list_head *journal_entries) | |
247 | { | |
248 | struct journal_replay *p; | |
249 | struct jset_entry *entry; | |
250 | struct bkey_i *k, *_n; | |
e3e464ac KO |
251 | struct journal_keys keys = { NULL }; |
252 | struct journal_key *src, *dst; | |
d0734356 KO |
253 | size_t nr_keys = 0; |
254 | ||
255 | list_for_each_entry(p, journal_entries, list) | |
256 | for_each_jset_key(k, _n, entry, &p->j) | |
257 | nr_keys++; | |
258 | ||
e3e464ac | 259 | keys.journal_seq_base = |
d0734356 KO |
260 | le64_to_cpu(list_first_entry(journal_entries, |
261 | struct journal_replay, | |
262 | list)->j.seq); | |
263 | ||
264 | keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); | |
265 | if (!keys.d) | |
266 | goto err; | |
267 | ||
d0734356 | 268 | list_for_each_entry(p, journal_entries, list) |
e3e464ac | 269 | for_each_jset_key(k, _n, entry, &p->j) |
d0734356 KO |
270 | keys.d[keys.nr++] = (struct journal_key) { |
271 | .btree_id = entry->btree_id, | |
e62d65f2 | 272 | .level = entry->level, |
d0734356 KO |
273 | .k = k, |
274 | .journal_seq = le64_to_cpu(p->j.seq) - | |
275 | keys.journal_seq_base, | |
276 | .journal_offset = k->_data - p->j._data, | |
277 | }; | |
278 | ||
3186c80f | 279 | sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); |
d0734356 | 280 | |
e3e464ac KO |
281 | src = dst = keys.d; |
282 | while (src < keys.d + keys.nr) { | |
283 | while (src + 1 < keys.d + keys.nr && | |
e62d65f2 KO |
284 | src[0].btree_id == src[1].btree_id && |
285 | src[0].level == src[1].level && | |
e3e464ac KO |
286 | !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) |
287 | src++; | |
d0734356 | 288 | |
e3e464ac | 289 | *dst++ = *src++; |
d0734356 KO |
290 | } |
291 | ||
e3e464ac | 292 | keys.nr = dst - keys.d; |
d0734356 | 293 | err: |
e3e464ac | 294 | return keys; |
d0734356 KO |
295 | } |
296 | ||
297 | /* journal replay: */ | |
298 | ||
299 | static void replay_now_at(struct journal *j, u64 seq) | |
300 | { | |
301 | BUG_ON(seq < j->replay_journal_seq); | |
302 | BUG_ON(seq > j->replay_journal_seq_end); | |
303 | ||
304 | while (j->replay_journal_seq < seq) | |
305 | bch2_journal_pin_put(j, j->replay_journal_seq++); | |
306 | } | |
307 | ||
76426098 KO |
308 | static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, |
309 | struct bkey_i *k) | |
644d180b KO |
310 | { |
311 | struct btree_trans trans; | |
c6dd04f8 | 312 | struct btree_iter *iter, *split_iter; |
644d180b | 313 | /* |
c6dd04f8 KO |
314 | * We might cause compressed extents to be split, so we need to pass in |
315 | * a disk_reservation: | |
644d180b KO |
316 | */ |
317 | struct disk_reservation disk_res = | |
318 | bch2_disk_reservation_init(c, 0); | |
c6dd04f8 | 319 | struct bkey_i *split; |
3c7f3b7a | 320 | struct bpos atomic_end; |
06f6c3ec KO |
321 | /* |
322 | * Some extents aren't equivalent - w.r.t. what the triggers do | |
323 | * - if they're split: | |
324 | */ | |
4de77495 | 325 | bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || |
06f6c3ec KO |
326 | k->k.type == KEY_TYPE_reflink_p; |
327 | bool remark = false; | |
644d180b | 328 | int ret; |
1c6fdbd8 | 329 | |
20bceecb | 330 | bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); |
c6dd04f8 KO |
331 | retry: |
332 | bch2_trans_begin(&trans); | |
1c6fdbd8 | 333 | |
76426098 | 334 | iter = bch2_trans_get_iter(&trans, btree_id, |
644d180b KO |
335 | bkey_start_pos(&k->k), |
336 | BTREE_ITER_INTENT); | |
c6dd04f8 | 337 | |
644d180b | 338 | do { |
a40d97a7 | 339 | ret = bch2_btree_iter_traverse(iter); |
644d180b | 340 | if (ret) |
c6dd04f8 | 341 | goto err; |
1c6fdbd8 | 342 | |
a40d97a7 KO |
343 | atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); |
344 | ||
c6dd04f8 KO |
345 | split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); |
346 | ret = PTR_ERR_OR_ZERO(split); | |
347 | if (ret) | |
348 | goto err; | |
349 | ||
06f6c3ec KO |
350 | if (!remark && |
351 | remark_if_split && | |
3c7f3b7a | 352 | bkey_cmp(atomic_end, k->k.p) < 0) { |
c6dd04f8 KO |
353 | ret = bch2_disk_reservation_add(c, &disk_res, |
354 | k->k.size * | |
4de77495 | 355 | bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), |
c6dd04f8 KO |
356 | BCH_DISK_RESERVATION_NOFAIL); |
357 | BUG_ON(ret); | |
358 | ||
06f6c3ec | 359 | remark = true; |
c6dd04f8 | 360 | } |
644d180b | 361 | |
c6dd04f8 | 362 | bkey_copy(split, k); |
e3e464ac | 363 | bch2_cut_front(iter->pos, split); |
085ab693 | 364 | bch2_cut_back(atomic_end, split); |
644d180b | 365 | |
e3e464ac KO |
366 | split_iter = bch2_trans_copy_iter(&trans, iter); |
367 | ret = PTR_ERR_OR_ZERO(split_iter); | |
368 | if (ret) | |
369 | goto err; | |
370 | ||
371 | /* | |
372 | * It's important that we don't go through the | |
373 | * extent_handle_overwrites() and extent_update_to_keys() path | |
374 | * here: journal replay is supposed to treat extents like | |
375 | * regular keys | |
376 | */ | |
377 | __bch2_btree_iter_set_pos(split_iter, split->k.p, false); | |
2d594dfb KO |
378 | bch2_trans_update(&trans, split_iter, split, !remark |
379 | ? BTREE_TRIGGER_NORUN | |
380 | : BTREE_TRIGGER_NOOVERWRITES); | |
e3e464ac | 381 | |
c6dd04f8 KO |
382 | bch2_btree_iter_set_pos(iter, split->k.p); |
383 | } while (bkey_cmp(iter->pos, k->k.p) < 0); | |
384 | ||
06f6c3ec | 385 | if (remark) { |
6e738539 | 386 | ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), |
2cbe5cfe | 387 | 0, -((s64) k->k.size), |
2d594dfb KO |
388 | BTREE_TRIGGER_OVERWRITE); |
389 | if (ret) | |
390 | goto err; | |
c6dd04f8 | 391 | } |
932aa837 | 392 | |
2d594dfb KO |
393 | ret = bch2_trans_commit(&trans, &disk_res, NULL, |
394 | BTREE_INSERT_NOFAIL| | |
395 | BTREE_INSERT_LAZY_RW| | |
396 | BTREE_INSERT_JOURNAL_REPLAY); | |
c6dd04f8 KO |
397 | err: |
398 | if (ret == -EINTR) | |
399 | goto retry; | |
400 | ||
401 | bch2_disk_reservation_put(c, &disk_res); | |
7b512638 | 402 | |
c6dd04f8 | 403 | return bch2_trans_exit(&trans) ?: ret; |
644d180b | 404 | } |
7b512638 | 405 | |
2d594dfb | 406 | static int __bch2_journal_replay_key(struct btree_trans *trans, |
f44a6a71 KO |
407 | enum btree_id id, unsigned level, |
408 | struct bkey_i *k) | |
2d594dfb KO |
409 | { |
410 | struct btree_iter *iter; | |
f6d0368e | 411 | int ret; |
2d594dfb | 412 | |
f44a6a71 KO |
413 | iter = bch2_trans_get_node_iter(trans, id, k->k.p, |
414 | BTREE_MAX_DEPTH, level, | |
415 | BTREE_ITER_INTENT); | |
2d594dfb KO |
416 | if (IS_ERR(iter)) |
417 | return PTR_ERR(iter); | |
418 | ||
e3e464ac KO |
419 | /* |
420 | * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run | |
421 | * extent_handle_overwrites() and extent_update_to_keys() - but we don't | |
422 | * want that here, journal replay is supposed to treat extents like | |
423 | * regular keys: | |
424 | */ | |
425 | __bch2_btree_iter_set_pos(iter, k->k.p, false); | |
426 | ||
f6d0368e KO |
427 | ret = bch2_btree_iter_traverse(iter) ?: |
428 | bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); | |
27beb810 | 429 | bch2_trans_iter_put(trans, iter); |
f6d0368e | 430 | return ret; |
2d594dfb KO |
431 | } |
432 | ||
433 | static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, | |
f44a6a71 | 434 | unsigned level, struct bkey_i *k) |
2d594dfb KO |
435 | { |
436 | return bch2_trans_do(c, NULL, NULL, | |
437 | BTREE_INSERT_NOFAIL| | |
438 | BTREE_INSERT_LAZY_RW| | |
439 | BTREE_INSERT_JOURNAL_REPLAY, | |
f44a6a71 | 440 | __bch2_journal_replay_key(&trans, id, level, k)); |
2d594dfb KO |
441 | } |
442 | ||
d0734356 KO |
443 | static int bch2_journal_replay(struct bch_fs *c, |
444 | struct journal_keys keys) | |
644d180b KO |
445 | { |
446 | struct journal *j = &c->journal; | |
d0734356 KO |
447 | struct journal_key *i; |
448 | int ret; | |
7b512638 | 449 | |
d0734356 | 450 | sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); |
7b512638 | 451 | |
2f194e16 KO |
452 | if (keys.nr) |
453 | replay_now_at(j, keys.journal_seq_base); | |
f44a6a71 | 454 | |
d0734356 | 455 | for_each_journal_key(keys, i) { |
f44a6a71 KO |
456 | if (!i->level) |
457 | replay_now_at(j, keys.journal_seq_base + i->journal_seq); | |
d0734356 | 458 | |
f44a6a71 KO |
459 | if (i->level) |
460 | ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); | |
76426098 | 461 | if (i->btree_id == BTREE_ID_ALLOC) |
d0734356 | 462 | ret = bch2_alloc_replay_key(c, i->k); |
e3e464ac | 463 | else if (i->k->k.size) |
76426098 KO |
464 | ret = bch2_extent_replay_key(c, i->btree_id, i->k); |
465 | else | |
f44a6a71 | 466 | ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); |
644d180b | 467 | |
d0734356 KO |
468 | if (ret) { |
469 | bch_err(c, "journal replay: error %d while replaying key", | |
470 | ret); | |
471 | return ret; | |
644d180b | 472 | } |
d0734356 KO |
473 | |
474 | cond_resched(); | |
7b512638 | 475 | } |
644d180b KO |
476 | |
477 | replay_now_at(j, j->replay_journal_seq_end); | |
478 | j->replay_journal_seq = 0; | |
479 | ||
480 | bch2_journal_set_replay_done(j); | |
481 | bch2_journal_flush_all_pins(j); | |
d0734356 | 482 | return bch2_journal_error(j); |
7b512638 KO |
483 | } |
484 | ||
644d180b KO |
485 | static bool journal_empty(struct list_head *journal) |
486 | { | |
487 | return list_empty(journal) || | |
488 | journal_entry_empty(&list_last_entry(journal, | |
489 | struct journal_replay, list)->j); | |
490 | } | |
491 | ||
1dd7f9d9 KO |
492 | static int |
493 | verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, | |
494 | struct list_head *journal) | |
495 | { | |
496 | struct journal_replay *i = | |
497 | list_last_entry(journal, struct journal_replay, list); | |
498 | u64 start_seq = le64_to_cpu(i->j.last_seq); | |
499 | u64 end_seq = le64_to_cpu(i->j.seq); | |
500 | u64 seq = start_seq; | |
501 | int ret = 0; | |
502 | ||
503 | list_for_each_entry(i, journal, list) { | |
504 | fsck_err_on(seq != le64_to_cpu(i->j.seq), c, | |
505 | "journal entries %llu-%llu missing! (replaying %llu-%llu)", | |
506 | seq, le64_to_cpu(i->j.seq) - 1, | |
507 | start_seq, end_seq); | |
508 | ||
509 | seq = le64_to_cpu(i->j.seq); | |
510 | ||
511 | fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, | |
512 | "found blacklisted journal entry %llu", seq); | |
513 | ||
514 | do { | |
515 | seq++; | |
516 | } while (bch2_journal_seq_is_blacklisted(c, seq, false)); | |
517 | } | |
518 | fsck_err: | |
519 | return ret; | |
520 | } | |
521 | ||
644d180b | 522 | /* journal replay early: */ |
7b512638 | 523 | |
42b72e0b KO |
524 | static int journal_replay_entry_early(struct bch_fs *c, |
525 | struct jset_entry *entry) | |
526 | { | |
527 | int ret = 0; | |
528 | ||
529 | switch (entry->type) { | |
530 | case BCH_JSET_ENTRY_btree_root: { | |
2ded276b KO |
531 | struct btree_root *r; |
532 | ||
533 | if (entry->btree_id >= BTREE_ID_NR) { | |
534 | bch_err(c, "filesystem has unknown btree type %u", | |
535 | entry->btree_id); | |
536 | return -EINVAL; | |
537 | } | |
538 | ||
539 | r = &c->btree_roots[entry->btree_id]; | |
42b72e0b KO |
540 | |
541 | if (entry->u64s) { | |
542 | r->level = entry->level; | |
543 | bkey_copy(&r->key, &entry->start[0]); | |
544 | r->error = 0; | |
545 | } else { | |
546 | r->error = -EIO; | |
547 | } | |
548 | r->alive = true; | |
549 | break; | |
550 | } | |
551 | case BCH_JSET_ENTRY_usage: { | |
552 | struct jset_entry_usage *u = | |
553 | container_of(entry, struct jset_entry_usage, entry); | |
554 | ||
3577df5f KO |
555 | switch (entry->btree_id) { |
556 | case FS_USAGE_RESERVED: | |
557 | if (entry->level < BCH_REPLICAS_MAX) | |
5e82a9a1 KO |
558 | c->usage_base->persistent_reserved[entry->level] = |
559 | le64_to_cpu(u->v); | |
42b72e0b KO |
560 | break; |
561 | case FS_USAGE_INODES: | |
5e82a9a1 | 562 | c->usage_base->nr_inodes = le64_to_cpu(u->v); |
42b72e0b KO |
563 | break; |
564 | case FS_USAGE_KEY_VERSION: | |
565 | atomic64_set(&c->key_version, | |
3577df5f | 566 | le64_to_cpu(u->v)); |
42b72e0b KO |
567 | break; |
568 | } | |
569 | ||
570 | break; | |
571 | } | |
3577df5f KO |
572 | case BCH_JSET_ENTRY_data_usage: { |
573 | struct jset_entry_data_usage *u = | |
574 | container_of(entry, struct jset_entry_data_usage, entry); | |
575 | ret = bch2_replicas_set_usage(c, &u->r, | |
576 | le64_to_cpu(u->v)); | |
577 | break; | |
578 | } | |
1dd7f9d9 KO |
579 | case BCH_JSET_ENTRY_blacklist: { |
580 | struct jset_entry_blacklist *bl_entry = | |
581 | container_of(entry, struct jset_entry_blacklist, entry); | |
582 | ||
583 | ret = bch2_journal_seq_blacklist_add(c, | |
584 | le64_to_cpu(bl_entry->seq), | |
585 | le64_to_cpu(bl_entry->seq) + 1); | |
586 | break; | |
587 | } | |
588 | case BCH_JSET_ENTRY_blacklist_v2: { | |
589 | struct jset_entry_blacklist_v2 *bl_entry = | |
590 | container_of(entry, struct jset_entry_blacklist_v2, entry); | |
591 | ||
592 | ret = bch2_journal_seq_blacklist_add(c, | |
593 | le64_to_cpu(bl_entry->start), | |
594 | le64_to_cpu(bl_entry->end) + 1); | |
595 | break; | |
596 | } | |
42b72e0b KO |
597 | } |
598 | ||
599 | return ret; | |
600 | } | |
601 | ||
1dd7f9d9 KO |
602 | static int journal_replay_early(struct bch_fs *c, |
603 | struct bch_sb_field_clean *clean, | |
604 | struct list_head *journal) | |
1c6fdbd8 | 605 | { |
7b512638 KO |
606 | struct jset_entry *entry; |
607 | int ret; | |
1c6fdbd8 | 608 | |
7b512638 KO |
609 | if (clean) { |
610 | c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); | |
611 | c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); | |
1c6fdbd8 | 612 | |
7b512638 KO |
613 | for (entry = clean->start; |
614 | entry != vstruct_end(&clean->field); | |
615 | entry = vstruct_next(entry)) { | |
616 | ret = journal_replay_entry_early(c, entry); | |
617 | if (ret) | |
618 | return ret; | |
619 | } | |
620 | } else { | |
621 | struct journal_replay *i = | |
622 | list_last_entry(journal, struct journal_replay, list); | |
623 | ||
624 | c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); | |
625 | c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); | |
626 | ||
627 | list_for_each_entry(i, journal, list) | |
628 | vstruct_for_each(&i->j, entry) { | |
629 | ret = journal_replay_entry_early(c, entry); | |
630 | if (ret) | |
631 | return ret; | |
632 | } | |
134915f3 | 633 | } |
1c6fdbd8 | 634 | |
7b512638 KO |
635 | bch2_fs_usage_initialize(c); |
636 | ||
637 | return 0; | |
638 | } | |
639 | ||
644d180b KO |
640 | /* sb clean section: */ |
641 | ||
642 | static struct bkey_i *btree_root_find(struct bch_fs *c, | |
643 | struct bch_sb_field_clean *clean, | |
644 | struct jset *j, | |
645 | enum btree_id id, unsigned *level) | |
646 | { | |
647 | struct bkey_i *k; | |
648 | struct jset_entry *entry, *start, *end; | |
649 | ||
650 | if (clean) { | |
651 | start = clean->start; | |
652 | end = vstruct_end(&clean->field); | |
653 | } else { | |
654 | start = j->start; | |
655 | end = vstruct_last(j); | |
656 | } | |
657 | ||
658 | for (entry = start; entry < end; entry = vstruct_next(entry)) | |
659 | if (entry->type == BCH_JSET_ENTRY_btree_root && | |
660 | entry->btree_id == id) | |
661 | goto found; | |
662 | ||
663 | return NULL; | |
664 | found: | |
665 | if (!entry->u64s) | |
666 | return ERR_PTR(-EINVAL); | |
667 | ||
668 | k = entry->start; | |
669 | *level = entry->level; | |
670 | return k; | |
671 | } | |
672 | ||
673 | static int verify_superblock_clean(struct bch_fs *c, | |
674 | struct bch_sb_field_clean **cleanp, | |
675 | struct jset *j) | |
676 | { | |
677 | unsigned i; | |
678 | struct bch_sb_field_clean *clean = *cleanp; | |
679 | int ret = 0; | |
680 | ||
932aa837 | 681 | if (!c->sb.clean || !j) |
644d180b KO |
682 | return 0; |
683 | ||
684 | if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, | |
685 | "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", | |
686 | le64_to_cpu(clean->journal_seq), | |
687 | le64_to_cpu(j->seq))) { | |
688 | kfree(clean); | |
689 | *cleanp = NULL; | |
690 | return 0; | |
691 | } | |
692 | ||
693 | mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, | |
694 | "superblock read clock doesn't match journal after clean shutdown"); | |
695 | mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, | |
696 | "superblock read clock doesn't match journal after clean shutdown"); | |
697 | ||
698 | for (i = 0; i < BTREE_ID_NR; i++) { | |
699 | struct bkey_i *k1, *k2; | |
700 | unsigned l1 = 0, l2 = 0; | |
701 | ||
702 | k1 = btree_root_find(c, clean, NULL, i, &l1); | |
703 | k2 = btree_root_find(c, NULL, j, i, &l2); | |
704 | ||
705 | if (!k1 && !k2) | |
706 | continue; | |
707 | ||
708 | mustfix_fsck_err_on(!k1 || !k2 || | |
709 | IS_ERR(k1) || | |
710 | IS_ERR(k2) || | |
711 | k1->k.u64s != k2->k.u64s || | |
712 | memcmp(k1, k2, bkey_bytes(k1)) || | |
713 | l1 != l2, c, | |
714 | "superblock btree root doesn't match journal after clean shutdown"); | |
715 | } | |
716 | fsck_err: | |
717 | return ret; | |
718 | } | |
719 | ||
720 | static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) | |
721 | { | |
722 | struct bch_sb_field_clean *clean, *sb_clean; | |
723 | int ret; | |
724 | ||
725 | mutex_lock(&c->sb_lock); | |
726 | sb_clean = bch2_sb_get_clean(c->disk_sb.sb); | |
727 | ||
728 | if (fsck_err_on(!sb_clean, c, | |
729 | "superblock marked clean but clean section not present")) { | |
730 | SET_BCH_SB_CLEAN(c->disk_sb.sb, false); | |
731 | c->sb.clean = false; | |
732 | mutex_unlock(&c->sb_lock); | |
733 | return NULL; | |
734 | } | |
735 | ||
736 | clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), | |
737 | GFP_KERNEL); | |
738 | if (!clean) { | |
739 | mutex_unlock(&c->sb_lock); | |
740 | return ERR_PTR(-ENOMEM); | |
741 | } | |
742 | ||
743 | if (le16_to_cpu(c->disk_sb.sb->version) < | |
744 | bcachefs_metadata_version_bkey_renumber) | |
745 | bch2_sb_clean_renumber(clean, READ); | |
746 | ||
747 | mutex_unlock(&c->sb_lock); | |
748 | ||
749 | return clean; | |
750 | fsck_err: | |
751 | mutex_unlock(&c->sb_lock); | |
752 | return ERR_PTR(ret); | |
753 | } | |
754 | ||
7b512638 KO |
755 | static int read_btree_roots(struct bch_fs *c) |
756 | { | |
757 | unsigned i; | |
758 | int ret = 0; | |
1c6fdbd8 KO |
759 | |
760 | for (i = 0; i < BTREE_ID_NR; i++) { | |
7b512638 | 761 | struct btree_root *r = &c->btree_roots[i]; |
1c6fdbd8 | 762 | |
7b512638 KO |
763 | if (!r->alive) |
764 | continue; | |
1c6fdbd8 | 765 | |
7b512638 | 766 | if (i == BTREE_ID_ALLOC && |
ad7e137e | 767 | c->opts.reconstruct_alloc) { |
7b512638 | 768 | c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); |
1c6fdbd8 | 769 | continue; |
7b512638 | 770 | } |
1c6fdbd8 | 771 | |
7b512638 KO |
772 | |
773 | if (r->error) { | |
774 | __fsck_err(c, i == BTREE_ID_ALLOC | |
775 | ? FSCK_CAN_IGNORE : 0, | |
776 | "invalid btree root %s", | |
777 | bch2_btree_ids[i]); | |
778 | if (i == BTREE_ID_ALLOC) | |
779 | c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); | |
780 | } | |
781 | ||
782 | ret = bch2_btree_root_read(c, i, &r->key, r->level); | |
783 | if (ret) { | |
784 | __fsck_err(c, i == BTREE_ID_ALLOC | |
785 | ? FSCK_CAN_IGNORE : 0, | |
786 | "error reading btree root %s", | |
787 | bch2_btree_ids[i]); | |
788 | if (i == BTREE_ID_ALLOC) | |
789 | c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); | |
790 | } | |
1c6fdbd8 | 791 | } |
7b512638 KO |
792 | |
793 | for (i = 0; i < BTREE_ID_NR; i++) | |
794 | if (!c->btree_roots[i].b) | |
795 | bch2_btree_root_alloc(c, i); | |
1c6fdbd8 KO |
796 | fsck_err: |
797 | return ret; | |
798 | } | |
799 | ||
1c6fdbd8 KO |
800 | int bch2_fs_recovery(struct bch_fs *c) |
801 | { | |
802 | const char *err = "cannot allocate memory"; | |
1dd7f9d9 KO |
803 | struct bch_sb_field_clean *clean = NULL; |
804 | u64 journal_seq; | |
932aa837 | 805 | bool wrote = false, write_sb = false; |
1c6fdbd8 KO |
806 | int ret; |
807 | ||
1dd7f9d9 KO |
808 | if (c->sb.clean) |
809 | clean = read_superblock_clean(c); | |
810 | ret = PTR_ERR_OR_ZERO(clean); | |
811 | if (ret) | |
812 | goto err; | |
813 | ||
814 | if (c->sb.clean) | |
1c6fdbd8 KO |
815 | bch_info(c, "recovering from clean shutdown, journal seq %llu", |
816 | le64_to_cpu(clean->journal_seq)); | |
817 | ||
1dd7f9d9 KO |
818 | if (!c->replicas.entries) { |
819 | bch_info(c, "building replicas info"); | |
820 | set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); | |
821 | } | |
822 | ||
823 | if (!c->sb.clean || c->opts.fsck) { | |
824 | struct jset *j; | |
825 | ||
f1d786a0 | 826 | ret = bch2_journal_read(c, &c->journal_entries); |
1c6fdbd8 KO |
827 | if (ret) |
828 | goto err; | |
829 | ||
f1d786a0 | 830 | if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, |
932aa837 KO |
831 | "filesystem marked clean but journal not empty")) { |
832 | c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); | |
833 | SET_BCH_SB_CLEAN(c->disk_sb.sb, false); | |
834 | c->sb.clean = false; | |
835 | } | |
1dd7f9d9 | 836 | |
f1d786a0 | 837 | if (!c->sb.clean && list_empty(&c->journal_entries)) { |
1dd7f9d9 KO |
838 | bch_err(c, "no journal entries found"); |
839 | ret = BCH_FSCK_REPAIR_IMPOSSIBLE; | |
840 | goto err; | |
841 | } | |
842 | ||
f1d786a0 KO |
843 | c->journal_keys = journal_keys_sort(&c->journal_entries); |
844 | if (!c->journal_keys.d) { | |
d0734356 KO |
845 | ret = -ENOMEM; |
846 | goto err; | |
847 | } | |
848 | ||
f1d786a0 | 849 | j = &list_last_entry(&c->journal_entries, |
d0734356 | 850 | struct journal_replay, list)->j; |
1dd7f9d9 KO |
851 | |
852 | ret = verify_superblock_clean(c, &clean, j); | |
7b512638 KO |
853 | if (ret) |
854 | goto err; | |
1dd7f9d9 KO |
855 | |
856 | journal_seq = le64_to_cpu(j->seq) + 1; | |
1c6fdbd8 | 857 | } else { |
1dd7f9d9 KO |
858 | journal_seq = le64_to_cpu(clean->journal_seq) + 1; |
859 | } | |
860 | ||
e3e464ac KO |
861 | if (!c->sb.clean && |
862 | !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { | |
863 | bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); | |
864 | ret = -EINVAL; | |
865 | goto err; | |
866 | } | |
867 | ||
f1d786a0 | 868 | ret = journal_replay_early(c, clean, &c->journal_entries); |
1dd7f9d9 KO |
869 | if (ret) |
870 | goto err; | |
871 | ||
872 | if (!c->sb.clean) { | |
873 | ret = bch2_journal_seq_blacklist_add(c, | |
619f5bee KO |
874 | journal_seq, |
875 | journal_seq + 4); | |
1dd7f9d9 KO |
876 | if (ret) { |
877 | bch_err(c, "error creating new journal seq blacklist entry"); | |
7b512638 | 878 | goto err; |
1dd7f9d9 KO |
879 | } |
880 | ||
881 | journal_seq += 4; | |
1c6fdbd8 KO |
882 | } |
883 | ||
1dd7f9d9 KO |
884 | ret = bch2_blacklist_table_initialize(c); |
885 | ||
f1d786a0 | 886 | if (!list_empty(&c->journal_entries)) { |
f707e3d8 | 887 | ret = verify_journal_entries_not_blacklisted_or_missing(c, |
f1d786a0 | 888 | &c->journal_entries); |
f707e3d8 KO |
889 | if (ret) |
890 | goto err; | |
891 | } | |
1c6fdbd8 | 892 | |
d0734356 | 893 | ret = bch2_fs_journal_start(&c->journal, journal_seq, |
f1d786a0 | 894 | &c->journal_entries); |
7b512638 KO |
895 | if (ret) |
896 | goto err; | |
1c6fdbd8 | 897 | |
7b512638 KO |
898 | ret = read_btree_roots(c); |
899 | if (ret) | |
900 | goto err; | |
1c6fdbd8 | 901 | |
932aa837 | 902 | bch_verbose(c, "starting alloc read"); |
1c6fdbd8 | 903 | err = "error reading allocation information"; |
f1d786a0 | 904 | ret = bch2_alloc_read(c, &c->journal_keys); |
1c6fdbd8 KO |
905 | if (ret) |
906 | goto err; | |
932aa837 | 907 | bch_verbose(c, "alloc read done"); |
1c6fdbd8 | 908 | |
94cd106f | 909 | bch_verbose(c, "starting stripes_read"); |
932aa837 | 910 | err = "error reading stripes"; |
f1d786a0 | 911 | ret = bch2_stripes_read(c, &c->journal_keys); |
4e65431c KO |
912 | if (ret) |
913 | goto err; | |
94cd106f | 914 | bch_verbose(c, "stripes_read done"); |
61c8d7c8 KO |
915 | |
916 | set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); | |
4e65431c | 917 | |
932aa837 KO |
918 | if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && |
919 | !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { | |
920 | /* | |
921 | * interior btree node updates aren't consistent with the | |
922 | * journal; after an unclean shutdown we have to walk all | |
923 | * pointers to metadata: | |
924 | */ | |
619f5bee | 925 | bch_info(c, "starting metadata mark and sweep"); |
932aa837 | 926 | err = "error in mark and sweep"; |
f1d786a0 | 927 | ret = bch2_gc(c, &c->journal_keys, true, true); |
932aa837 KO |
928 | if (ret) |
929 | goto err; | |
930 | bch_verbose(c, "mark and sweep done"); | |
931 | } | |
932 | ||
7b512638 KO |
933 | if (c->opts.fsck || |
934 | !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || | |
935 | test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { | |
619f5bee | 936 | bch_info(c, "starting mark and sweep"); |
932aa837 | 937 | err = "error in mark and sweep"; |
f1d786a0 | 938 | ret = bch2_gc(c, &c->journal_keys, true, false); |
1df42b57 KO |
939 | if (ret) |
940 | goto err; | |
941 | bch_verbose(c, "mark and sweep done"); | |
942 | } | |
1c6fdbd8 | 943 | |
72644db1 | 944 | clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); |
1df42b57 | 945 | set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); |
72644db1 | 946 | |
42b72e0b KO |
947 | /* |
948 | * Skip past versions that might have possibly been used (as nonces), | |
949 | * but hadn't had their pointers written: | |
950 | */ | |
951 | if (c->sb.encryption_type && !c->sb.clean) | |
952 | atomic64_add(1 << 16, &c->key_version); | |
953 | ||
619f5bee | 954 | if (c->opts.norecovery) |
7b512638 KO |
955 | goto out; |
956 | ||
619f5bee | 957 | bch_verbose(c, "starting journal replay"); |
1c6fdbd8 | 958 | err = "journal replay failed"; |
f1d786a0 | 959 | ret = bch2_journal_replay(c, c->journal_keys); |
1c6fdbd8 KO |
960 | if (ret) |
961 | goto err; | |
962 | bch_verbose(c, "journal replay done"); | |
963 | ||
619f5bee KO |
964 | if (!c->opts.nochanges) { |
965 | /* | |
966 | * note that even when filesystem was clean there might be work | |
967 | * to do here, if we ran gc (because of fsck) which recalculated | |
968 | * oldest_gen: | |
969 | */ | |
970 | bch_verbose(c, "writing allocation info"); | |
971 | err = "error writing out alloc info"; | |
972 | ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: | |
973 | bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); | |
974 | if (ret) { | |
975 | bch_err(c, "error writing alloc info"); | |
976 | goto err; | |
977 | } | |
978 | bch_verbose(c, "alloc write done"); | |
ff929515 KO |
979 | |
980 | set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); | |
932aa837 | 981 | } |
932aa837 | 982 | |
619f5bee | 983 | if (!c->sb.clean) { |
1c3ff72c | 984 | if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { |
619f5bee KO |
985 | bch_info(c, "checking inode link counts"); |
986 | err = "error in recovery"; | |
987 | ret = bch2_fsck_inode_nlink(c); | |
988 | if (ret) | |
989 | goto err; | |
990 | bch_verbose(c, "check inodes done"); | |
1c6fdbd8 | 991 | |
619f5bee KO |
992 | } else { |
993 | bch_verbose(c, "checking for deleted inodes"); | |
994 | err = "error in recovery"; | |
995 | ret = bch2_fsck_walk_inodes_only(c); | |
996 | if (ret) | |
997 | goto err; | |
998 | bch_verbose(c, "check inodes done"); | |
999 | } | |
1000 | } | |
1001 | ||
1002 | if (c->opts.fsck) { | |
1003 | bch_info(c, "starting fsck"); | |
1004 | err = "error in fsck"; | |
1005 | ret = bch2_fsck_full(c); | |
1006 | if (ret) | |
1007 | goto err; | |
1008 | bch_verbose(c, "fsck done"); | |
1009 | } | |
1c6fdbd8 | 1010 | |
7b512638 | 1011 | if (enabled_qtypes(c)) { |
619f5bee | 1012 | bch_verbose(c, "reading quotas"); |
7b512638 KO |
1013 | ret = bch2_fs_quota_read(c); |
1014 | if (ret) | |
1015 | goto err; | |
1016 | bch_verbose(c, "quotas done"); | |
1017 | } | |
1018 | ||
26609b61 KO |
1019 | mutex_lock(&c->sb_lock); |
1020 | if (c->opts.version_upgrade) { | |
1021 | if (c->sb.version < bcachefs_metadata_version_new_versioning) | |
1022 | c->disk_sb.sb->version_min = | |
1023 | le16_to_cpu(bcachefs_metadata_version_min); | |
1024 | c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); | |
b807a0c8 | 1025 | c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; |
932aa837 KO |
1026 | write_sb = true; |
1027 | } | |
1028 | ||
1029 | if (!test_bit(BCH_FS_ERROR, &c->flags)) { | |
1030 | c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; | |
1031 | write_sb = true; | |
88c07f73 KO |
1032 | } |
1033 | ||
0bc166ff KO |
1034 | if (c->opts.fsck && |
1035 | !test_bit(BCH_FS_ERROR, &c->flags)) { | |
1c3ff72c | 1036 | c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; |
0bc166ff | 1037 | SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); |
932aa837 | 1038 | write_sb = true; |
0bc166ff | 1039 | } |
932aa837 KO |
1040 | |
1041 | if (write_sb) | |
1042 | bch2_write_super(c); | |
26609b61 | 1043 | mutex_unlock(&c->sb_lock); |
1dd7f9d9 KO |
1044 | |
1045 | if (c->journal_seq_blacklist_table && | |
1046 | c->journal_seq_blacklist_table->nr > 128) | |
1047 | queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); | |
1c6fdbd8 | 1048 | out: |
619f5bee KO |
1049 | ret = 0; |
1050 | err: | |
1051 | fsck_err: | |
89b05118 | 1052 | set_bit(BCH_FS_FSCK_DONE, &c->flags); |
619f5bee | 1053 | bch2_flush_fsck_errs(c); |
89b05118 | 1054 | |
f1d786a0 KO |
1055 | if (!c->opts.keep_journal) { |
1056 | bch2_journal_keys_free(&c->journal_keys); | |
1057 | bch2_journal_entries_free(&c->journal_entries); | |
1058 | } | |
1c6fdbd8 | 1059 | kfree(clean); |
619f5bee KO |
1060 | if (ret) |
1061 | bch_err(c, "Error in recovery: %s (%i)", err, ret); | |
1062 | else | |
1063 | bch_verbose(c, "ret %i", ret); | |
1c6fdbd8 | 1064 | return ret; |
1c6fdbd8 KO |
1065 | } |
1066 | ||
1067 | int bch2_fs_initialize(struct bch_fs *c) | |
1068 | { | |
1069 | struct bch_inode_unpacked root_inode, lostfound_inode; | |
1070 | struct bkey_inode_buf packed_inode; | |
1c6fdbd8 KO |
1071 | struct qstr lostfound = QSTR("lost+found"); |
1072 | const char *err = "cannot allocate memory"; | |
1073 | struct bch_dev *ca; | |
1074 | LIST_HEAD(journal); | |
1075 | unsigned i; | |
1076 | int ret; | |
1077 | ||
1078 | bch_notice(c, "initializing new filesystem"); | |
1079 | ||
3e0745e2 KO |
1080 | mutex_lock(&c->sb_lock); |
1081 | for_each_online_member(ca, c, i) | |
1082 | bch2_mark_dev_superblock(c, ca, 0); | |
1083 | mutex_unlock(&c->sb_lock); | |
1084 | ||
1c6fdbd8 | 1085 | set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); |
f7e76361 | 1086 | set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); |
1c6fdbd8 | 1087 | |
dfe9bfb3 KO |
1088 | for (i = 0; i < BTREE_ID_NR; i++) |
1089 | bch2_btree_root_alloc(c, i); | |
1090 | ||
1c6fdbd8 | 1091 | err = "unable to allocate journal buckets"; |
1633e492 KO |
1092 | for_each_online_member(ca, c, i) { |
1093 | ret = bch2_dev_journal_alloc(ca); | |
1094 | if (ret) { | |
1c6fdbd8 KO |
1095 | percpu_ref_put(&ca->io_ref); |
1096 | goto err; | |
1097 | } | |
1633e492 | 1098 | } |
1c6fdbd8 | 1099 | |
1c6fdbd8 KO |
1100 | /* |
1101 | * journal_res_get() will crash if called before this has | |
1102 | * set up the journal.pin FIFO and journal.cur pointer: | |
1103 | */ | |
1dd7f9d9 | 1104 | bch2_fs_journal_start(&c->journal, 1, &journal); |
1c6fdbd8 KO |
1105 | bch2_journal_set_replay_done(&c->journal); |
1106 | ||
1c6fdbd8 KO |
1107 | bch2_inode_init(c, &root_inode, 0, 0, |
1108 | S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); | |
1109 | root_inode.bi_inum = BCACHEFS_ROOT_INO; | |
1c6fdbd8 KO |
1110 | bch2_inode_pack(&packed_inode, &root_inode); |
1111 | ||
1112 | err = "error creating root directory"; | |
1113 | ret = bch2_btree_insert(c, BTREE_ID_INODES, | |
1114 | &packed_inode.inode.k_i, | |
e731d466 | 1115 | NULL, NULL, BTREE_INSERT_LAZY_RW); |
1c6fdbd8 KO |
1116 | if (ret) |
1117 | goto err; | |
1118 | ||
96385742 | 1119 | bch2_inode_init_early(c, &lostfound_inode); |
1c6fdbd8 KO |
1120 | |
1121 | err = "error creating lost+found"; | |
58e2388f | 1122 | ret = bch2_trans_do(c, NULL, NULL, 0, |
96385742 KO |
1123 | bch2_create_trans(&trans, BCACHEFS_ROOT_INO, |
1124 | &root_inode, &lostfound_inode, | |
1125 | &lostfound, | |
b627c7d8 | 1126 | 0, 0, S_IFDIR|0700, 0, |
96385742 | 1127 | NULL, NULL)); |
1c6fdbd8 KO |
1128 | if (ret) |
1129 | goto err; | |
1130 | ||
1c6fdbd8 KO |
1131 | if (enabled_qtypes(c)) { |
1132 | ret = bch2_fs_quota_read(c); | |
1133 | if (ret) | |
1134 | goto err; | |
1135 | } | |
1136 | ||
1137 | err = "error writing first journal entry"; | |
1138 | ret = bch2_journal_meta(&c->journal); | |
1139 | if (ret) | |
1140 | goto err; | |
1141 | ||
1142 | mutex_lock(&c->sb_lock); | |
26609b61 KO |
1143 | c->disk_sb.sb->version = c->disk_sb.sb->version_min = |
1144 | le16_to_cpu(bcachefs_metadata_version_current); | |
1c3ff72c | 1145 | c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; |
b807a0c8 | 1146 | c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; |
26609b61 | 1147 | |
1c6fdbd8 KO |
1148 | SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); |
1149 | SET_BCH_SB_CLEAN(c->disk_sb.sb, false); | |
1150 | ||
1151 | bch2_write_super(c); | |
1152 | mutex_unlock(&c->sb_lock); | |
1153 | ||
1154 | return 0; | |
1155 | err: | |
8b335bae | 1156 | pr_err("Error initializing new filesystem: %s (%i)", err, ret); |
1c6fdbd8 KO |
1157 | return ret; |
1158 | } |