Commit | Line | Data |
---|---|---|
1c6fdbd8 KO |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
3 | #include "bcachefs.h" | |
07a1006a | 4 | #include "bkey_buf.h" |
7b3f84ea | 5 | #include "alloc_background.h" |
1c6fdbd8 KO |
6 | #include "btree_gc.h" |
7 | #include "btree_update.h" | |
8 | #include "btree_update_interior.h" | |
9 | #include "btree_io.h" | |
3e0745e2 | 10 | #include "buckets.h" |
1c6fdbd8 | 11 | #include "dirent.h" |
cd575ddf | 12 | #include "ec.h" |
1c6fdbd8 | 13 | #include "error.h" |
96385742 | 14 | #include "fs-common.h" |
1c6fdbd8 KO |
15 | #include "fsck.h" |
16 | #include "journal_io.h" | |
644d180b | 17 | #include "journal_reclaim.h" |
1dd7f9d9 | 18 | #include "journal_seq_blacklist.h" |
a4805d66 | 19 | #include "move.h" |
1c6fdbd8 KO |
20 | #include "quota.h" |
21 | #include "recovery.h" | |
42b72e0b | 22 | #include "replicas.h" |
1c6fdbd8 KO |
23 | #include "super-io.h" |
24 | ||
644d180b | 25 | #include <linux/sort.h> |
1c6fdbd8 KO |
26 | #include <linux/stat.h> |
27 | ||
/* Brace initializer setting .len to strlen(n) and .name to n (presumably for a struct qstr): */
#define QSTR(n) { { { .len = strlen(n) } }, .name = (n) }
29 | ||
33114c2d KO |
30 | /* for -o reconstruct_alloc: */ |
31 | static void drop_alloc_keys(struct journal_keys *keys) | |
32 | { | |
33 | size_t src, dst; | |
34 | ||
35 | for (src = 0, dst = 0; src < keys->nr; src++) | |
36 | if (keys->d[src].btree_id != BTREE_ID_ALLOC) | |
37 | keys->d[dst++] = keys->d[src]; | |
38 | ||
39 | keys->nr = dst; | |
40 | } | |
41 | ||
e222d206 KO |
42 | /* iterate over keys read from the journal: */ |
43 | ||
5b593ee1 KO |
44 | static int __journal_key_cmp(enum btree_id l_btree_id, |
45 | unsigned l_level, | |
46 | struct bpos l_pos, | |
47 | struct journal_key *r) | |
48 | { | |
49 | return (cmp_int(l_btree_id, r->btree_id) ?: | |
50 | cmp_int(l_level, r->level) ?: | |
51 | bkey_cmp(l_pos, r->k->k.p)); | |
52 | } | |
53 | ||
54 | static int journal_key_cmp(struct journal_key *l, struct journal_key *r) | |
55 | { | |
56 | return (cmp_int(l->btree_id, r->btree_id) ?: | |
57 | cmp_int(l->level, r->level) ?: | |
58 | bkey_cmp(l->k->k.p, r->k->k.p)); | |
59 | } | |
60 | ||
61 | static size_t journal_key_search(struct journal_keys *journal_keys, | |
62 | enum btree_id id, unsigned level, | |
63 | struct bpos pos) | |
e222d206 | 64 | { |
e62d65f2 | 65 | size_t l = 0, r = journal_keys->nr, m; |
e222d206 | 66 | |
e62d65f2 KO |
67 | while (l < r) { |
68 | m = l + ((r - l) >> 1); | |
5b593ee1 | 69 | if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0) |
e62d65f2 KO |
70 | l = m + 1; |
71 | else | |
72 | r = m; | |
e222d206 KO |
73 | } |
74 | ||
e62d65f2 | 75 | BUG_ON(l < journal_keys->nr && |
5b593ee1 | 76 | __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0); |
e62d65f2 KO |
77 | |
78 | BUG_ON(l && | |
5b593ee1 KO |
79 | __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0); |
80 | ||
81 | return l; | |
82 | } | |
83 | ||
84 | static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) | |
85 | { | |
86 | struct bkey_i *n = iter->keys->d[idx].k; | |
87 | struct btree_and_journal_iter *biter = | |
88 | container_of(iter, struct btree_and_journal_iter, journal); | |
89 | ||
90 | if (iter->idx > idx || | |
91 | (iter->idx == idx && | |
92 | biter->last && | |
93 | bkey_cmp(n->k.p, biter->unpacked.p) <= 0)) | |
94 | iter->idx++; | |
95 | } | |
96 | ||
97 | int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, | |
98 | unsigned level, struct bkey_i *k) | |
99 | { | |
100 | struct journal_key n = { | |
101 | .btree_id = id, | |
102 | .level = level, | |
103 | .k = k, | |
104 | .allocated = true | |
105 | }; | |
106 | struct journal_keys *keys = &c->journal_keys; | |
107 | struct journal_iter *iter; | |
108 | unsigned idx = journal_key_search(keys, id, level, k->k.p); | |
109 | ||
110 | if (idx < keys->nr && | |
111 | journal_key_cmp(&n, &keys->d[idx]) == 0) { | |
112 | if (keys->d[idx].allocated) | |
113 | kfree(keys->d[idx].k); | |
114 | keys->d[idx] = n; | |
115 | return 0; | |
116 | } | |
117 | ||
118 | if (keys->nr == keys->size) { | |
119 | struct journal_keys new_keys = { | |
120 | .nr = keys->nr, | |
121 | .size = keys->size * 2, | |
122 | .journal_seq_base = keys->journal_seq_base, | |
123 | }; | |
124 | ||
125 | new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); | |
dab9ef0d KO |
126 | if (!new_keys.d) { |
127 | bch_err(c, "%s: error allocating new key array (size %zu)", | |
128 | __func__, new_keys.size); | |
5b593ee1 | 129 | return -ENOMEM; |
dab9ef0d | 130 | } |
5b593ee1 KO |
131 | |
132 | memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); | |
133 | kvfree(keys->d); | |
134 | *keys = new_keys; | |
135 | } | |
136 | ||
137 | array_insert_item(keys->d, keys->nr, idx, n); | |
138 | ||
139 | list_for_each_entry(iter, &c->journal_iters, list) | |
140 | journal_iter_fix(c, iter, idx); | |
141 | ||
142 | return 0; | |
143 | } | |
144 | ||
145 | int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, | |
146 | unsigned level, struct bpos pos) | |
147 | { | |
148 | struct bkey_i *whiteout = | |
149 | kmalloc(sizeof(struct bkey), GFP_KERNEL); | |
150 | int ret; | |
151 | ||
dab9ef0d KO |
152 | if (!whiteout) { |
153 | bch_err(c, "%s: error allocating new key", __func__); | |
5b593ee1 | 154 | return -ENOMEM; |
dab9ef0d | 155 | } |
5b593ee1 KO |
156 | |
157 | bkey_init(&whiteout->k); | |
158 | whiteout->k.p = pos; | |
e62d65f2 | 159 | |
5b593ee1 KO |
160 | ret = bch2_journal_key_insert(c, id, level, whiteout); |
161 | if (ret) | |
162 | kfree(whiteout); | |
163 | return ret; | |
e62d65f2 KO |
164 | } |
165 | ||
166 | static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) | |
167 | { | |
5b593ee1 KO |
168 | struct journal_key *k = iter->idx - iter->keys->nr |
169 | ? iter->keys->d + iter->idx : NULL; | |
170 | ||
171 | if (k && | |
172 | k->btree_id == iter->btree_id && | |
173 | k->level == iter->level) | |
174 | return k->k; | |
e62d65f2 | 175 | |
5b593ee1 | 176 | iter->idx = iter->keys->nr; |
e62d65f2 KO |
177 | return NULL; |
178 | } | |
179 | ||
180 | static void bch2_journal_iter_advance(struct journal_iter *iter) | |
181 | { | |
5b593ee1 KO |
182 | if (iter->idx < iter->keys->nr) |
183 | iter->idx++; | |
184 | } | |
185 | ||
186 | static void bch2_journal_iter_exit(struct journal_iter *iter) | |
187 | { | |
188 | list_del(&iter->list); | |
e222d206 KO |
189 | } |
190 | ||
5b593ee1 KO |
191 | static void bch2_journal_iter_init(struct bch_fs *c, |
192 | struct journal_iter *iter, | |
e62d65f2 KO |
193 | enum btree_id id, unsigned level, |
194 | struct bpos pos) | |
e222d206 | 195 | { |
e62d65f2 KO |
196 | iter->btree_id = id; |
197 | iter->level = level; | |
5b593ee1 KO |
198 | iter->keys = &c->journal_keys; |
199 | iter->idx = journal_key_search(&c->journal_keys, id, level, pos); | |
200 | list_add(&iter->list, &c->journal_iters); | |
e62d65f2 | 201 | } |
e222d206 | 202 | |
e62d65f2 KO |
203 | static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) |
204 | { | |
5b593ee1 KO |
205 | return bch2_btree_node_iter_peek_unpack(&iter->node_iter, |
206 | iter->b, &iter->unpacked); | |
e62d65f2 | 207 | } |
5c4a5cd5 | 208 | |
e62d65f2 KO |
209 | static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) |
210 | { | |
5b593ee1 | 211 | bch2_btree_node_iter_advance(&iter->node_iter, iter->b); |
e222d206 KO |
212 | } |
213 | ||
5c4a5cd5 KO |
214 | void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) |
215 | { | |
216 | switch (iter->last) { | |
217 | case none: | |
218 | break; | |
219 | case btree: | |
e62d65f2 | 220 | bch2_journal_iter_advance_btree(iter); |
5c4a5cd5 KO |
221 | break; |
222 | case journal: | |
e62d65f2 | 223 | bch2_journal_iter_advance(&iter->journal); |
5c4a5cd5 KO |
224 | break; |
225 | } | |
226 | ||
227 | iter->last = none; | |
228 | } | |
229 | ||
230 | struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) | |
231 | { | |
232 | struct bkey_s_c ret; | |
233 | ||
234 | while (1) { | |
e62d65f2 KO |
235 | struct bkey_s_c btree_k = |
236 | bch2_journal_iter_peek_btree(iter); | |
237 | struct bkey_s_c journal_k = | |
238 | bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); | |
5c4a5cd5 KO |
239 | |
240 | if (btree_k.k && journal_k.k) { | |
241 | int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); | |
242 | ||
243 | if (!cmp) | |
e62d65f2 | 244 | bch2_journal_iter_advance_btree(iter); |
5c4a5cd5 KO |
245 | |
246 | iter->last = cmp < 0 ? btree : journal; | |
247 | } else if (btree_k.k) { | |
248 | iter->last = btree; | |
249 | } else if (journal_k.k) { | |
250 | iter->last = journal; | |
251 | } else { | |
252 | iter->last = none; | |
253 | return bkey_s_c_null; | |
254 | } | |
255 | ||
256 | ret = iter->last == journal ? journal_k : btree_k; | |
e62d65f2 KO |
257 | |
258 | if (iter->b && | |
259 | bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { | |
5b593ee1 | 260 | iter->journal.idx = iter->journal.keys->nr; |
e62d65f2 KO |
261 | iter->last = none; |
262 | return bkey_s_c_null; | |
263 | } | |
264 | ||
5c4a5cd5 KO |
265 | if (!bkey_deleted(ret.k)) |
266 | break; | |
267 | ||
268 | bch2_btree_and_journal_iter_advance(iter); | |
269 | } | |
270 | ||
271 | return ret; | |
272 | } | |
273 | ||
274 | struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) | |
275 | { | |
276 | bch2_btree_and_journal_iter_advance(iter); | |
277 | ||
278 | return bch2_btree_and_journal_iter_peek(iter); | |
279 | } | |
280 | ||
5b593ee1 | 281 | void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) |
5c4a5cd5 | 282 | { |
5b593ee1 | 283 | bch2_journal_iter_exit(&iter->journal); |
e62d65f2 KO |
284 | } |
285 | ||
286 | void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, | |
5b593ee1 | 287 | struct bch_fs *c, |
e62d65f2 KO |
288 | struct btree *b) |
289 | { | |
e62d65f2 KO |
290 | memset(iter, 0, sizeof(*iter)); |
291 | ||
292 | iter->b = b; | |
293 | bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); | |
5b593ee1 | 294 | bch2_journal_iter_init(c, &iter->journal, |
b58a181d | 295 | b->c.btree_id, b->c.level, b->data->min_key); |
5c4a5cd5 KO |
296 | } |
297 | ||
b2930396 KO |
298 | /* Walk btree, overlaying keys from the journal: */ |
299 | ||
edfbba58 KO |
300 | static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, |
301 | struct btree_and_journal_iter iter) | |
302 | { | |
303 | unsigned i = 0, nr = b->c.level > 1 ? 2 : 16; | |
304 | struct bkey_s_c k; | |
305 | struct bkey_buf tmp; | |
306 | ||
307 | BUG_ON(!b->c.level); | |
308 | ||
309 | bch2_bkey_buf_init(&tmp); | |
310 | ||
311 | while (i < nr && | |
312 | (k = bch2_btree_and_journal_iter_peek(&iter)).k) { | |
313 | bch2_bkey_buf_reassemble(&tmp, c, k); | |
314 | ||
315 | bch2_btree_node_prefetch(c, NULL, tmp.k, | |
316 | b->c.btree_id, b->c.level - 1); | |
317 | ||
318 | bch2_btree_and_journal_iter_advance(&iter); | |
319 | i++; | |
320 | } | |
321 | ||
322 | bch2_bkey_buf_exit(&tmp, c); | |
323 | } | |
324 | ||
b2930396 KO |
325 | static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, |
326 | struct journal_keys *journal_keys, | |
327 | enum btree_id btree_id, | |
328 | btree_walk_node_fn node_fn, | |
329 | btree_walk_key_fn key_fn) | |
330 | { | |
331 | struct btree_and_journal_iter iter; | |
332 | struct bkey_s_c k; | |
edfbba58 KO |
333 | struct bkey_buf tmp; |
334 | struct btree *child; | |
b2930396 KO |
335 | int ret = 0; |
336 | ||
edfbba58 | 337 | bch2_bkey_buf_init(&tmp); |
5b593ee1 | 338 | bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); |
b2930396 KO |
339 | |
340 | while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { | |
341 | ret = key_fn(c, btree_id, b->c.level, k); | |
342 | if (ret) | |
343 | break; | |
344 | ||
345 | if (b->c.level) { | |
07a1006a | 346 | bch2_bkey_buf_reassemble(&tmp, c, k); |
b2930396 KO |
347 | |
348 | bch2_btree_and_journal_iter_advance(&iter); | |
349 | ||
07a1006a | 350 | child = bch2_btree_node_get_noiter(c, tmp.k, |
a0b73c1c KO |
351 | b->c.btree_id, b->c.level - 1, |
352 | false); | |
07a1006a KO |
353 | |
354 | ret = PTR_ERR_OR_ZERO(child); | |
355 | if (ret) | |
356 | break; | |
357 | ||
edfbba58 KO |
358 | btree_and_journal_iter_prefetch(c, b, iter); |
359 | ||
07a1006a KO |
360 | ret = (node_fn ? node_fn(c, b) : 0) ?: |
361 | bch2_btree_and_journal_walk_recurse(c, child, | |
362 | journal_keys, btree_id, node_fn, key_fn); | |
363 | six_unlock_read(&child->c.lock); | |
364 | ||
365 | if (ret) | |
366 | break; | |
b2930396 KO |
367 | } else { |
368 | bch2_btree_and_journal_iter_advance(&iter); | |
369 | } | |
370 | } | |
371 | ||
5b593ee1 | 372 | bch2_btree_and_journal_iter_exit(&iter); |
edfbba58 | 373 | bch2_bkey_buf_exit(&tmp, c); |
b2930396 KO |
374 | return ret; |
375 | } | |
376 | ||
377 | int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_keys, | |
378 | enum btree_id btree_id, | |
379 | btree_walk_node_fn node_fn, | |
380 | btree_walk_key_fn key_fn) | |
381 | { | |
382 | struct btree *b = c->btree_roots[btree_id].b; | |
383 | int ret = 0; | |
384 | ||
385 | if (btree_node_fake(b)) | |
386 | return 0; | |
387 | ||
388 | six_lock_read(&b->c.lock, NULL, NULL); | |
389 | ret = (node_fn ? node_fn(c, b) : 0) ?: | |
390 | bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id, | |
391 | node_fn, key_fn) ?: | |
392 | key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key)); | |
393 | six_unlock_read(&b->c.lock); | |
394 | ||
395 | return ret; | |
396 | } | |
397 | ||
d0734356 | 398 | /* sort and dedup all keys in the journal: */ |
644d180b | 399 | |
f1d786a0 | 400 | void bch2_journal_entries_free(struct list_head *list) |
1c6fdbd8 | 401 | { |
1c6fdbd8 | 402 | |
644d180b KO |
403 | while (!list_empty(list)) { |
404 | struct journal_replay *i = | |
405 | list_first_entry(list, struct journal_replay, list); | |
406 | list_del(&i->list); | |
407 | kvpfree(i, offsetof(struct journal_replay, j) + | |
408 | vstruct_bytes(&i->j)); | |
1c6fdbd8 | 409 | } |
644d180b | 410 | } |
1c6fdbd8 | 411 | |
e3e464ac KO |
412 | /* |
413 | * When keys compare equal, oldest compares first: | |
414 | */ | |
d0734356 KO |
415 | static int journal_sort_key_cmp(const void *_l, const void *_r) |
416 | { | |
417 | const struct journal_key *l = _l; | |
418 | const struct journal_key *r = _r; | |
419 | ||
e62d65f2 KO |
420 | return cmp_int(l->btree_id, r->btree_id) ?: |
421 | cmp_int(l->level, r->level) ?: | |
e3e464ac | 422 | bkey_cmp(l->k->k.p, r->k->k.p) ?: |
d0734356 KO |
423 | cmp_int(l->journal_seq, r->journal_seq) ?: |
424 | cmp_int(l->journal_offset, r->journal_offset); | |
425 | } | |
426 | ||
f1d786a0 | 427 | void bch2_journal_keys_free(struct journal_keys *keys) |
d0734356 | 428 | { |
5b593ee1 KO |
429 | struct journal_key *i; |
430 | ||
431 | for (i = keys->d; i < keys->d + keys->nr; i++) | |
432 | if (i->allocated) | |
433 | kfree(i->k); | |
434 | ||
d0734356 KO |
435 | kvfree(keys->d); |
436 | keys->d = NULL; | |
437 | keys->nr = 0; | |
438 | } | |
439 | ||
440 | static struct journal_keys journal_keys_sort(struct list_head *journal_entries) | |
441 | { | |
adbcada4 | 442 | struct journal_replay *i; |
d0734356 KO |
443 | struct jset_entry *entry; |
444 | struct bkey_i *k, *_n; | |
e3e464ac KO |
445 | struct journal_keys keys = { NULL }; |
446 | struct journal_key *src, *dst; | |
d0734356 KO |
447 | size_t nr_keys = 0; |
448 | ||
7fffc85b KO |
449 | if (list_empty(journal_entries)) |
450 | return keys; | |
451 | ||
adbcada4 KO |
452 | list_for_each_entry(i, journal_entries, list) { |
453 | if (i->ignore) | |
7fffc85b KO |
454 | continue; |
455 | ||
adbcada4 KO |
456 | if (!keys.journal_seq_base) |
457 | keys.journal_seq_base = le64_to_cpu(i->j.seq); | |
458 | ||
459 | for_each_jset_key(k, _n, entry, &i->j) | |
d0734356 | 460 | nr_keys++; |
7fffc85b | 461 | } |
d0734356 | 462 | |
5b593ee1 KO |
463 | keys.size = roundup_pow_of_two(nr_keys); |
464 | ||
465 | keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL); | |
d0734356 KO |
466 | if (!keys.d) |
467 | goto err; | |
468 | ||
adbcada4 KO |
469 | list_for_each_entry(i, journal_entries, list) { |
470 | if (i->ignore) | |
7fffc85b KO |
471 | continue; |
472 | ||
adbcada4 KO |
473 | BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX); |
474 | ||
475 | for_each_jset_key(k, _n, entry, &i->j) | |
d0734356 KO |
476 | keys.d[keys.nr++] = (struct journal_key) { |
477 | .btree_id = entry->btree_id, | |
e62d65f2 | 478 | .level = entry->level, |
d0734356 | 479 | .k = k, |
adbcada4 | 480 | .journal_seq = le64_to_cpu(i->j.seq) - |
d0734356 | 481 | keys.journal_seq_base, |
adbcada4 | 482 | .journal_offset = k->_data - i->j._data, |
d0734356 | 483 | }; |
7fffc85b | 484 | } |
d0734356 | 485 | |
3186c80f | 486 | sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); |
d0734356 | 487 | |
e3e464ac KO |
488 | src = dst = keys.d; |
489 | while (src < keys.d + keys.nr) { | |
490 | while (src + 1 < keys.d + keys.nr && | |
e62d65f2 KO |
491 | src[0].btree_id == src[1].btree_id && |
492 | src[0].level == src[1].level && | |
e3e464ac KO |
493 | !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) |
494 | src++; | |
d0734356 | 495 | |
e3e464ac | 496 | *dst++ = *src++; |
d0734356 KO |
497 | } |
498 | ||
e3e464ac | 499 | keys.nr = dst - keys.d; |
d0734356 | 500 | err: |
e3e464ac | 501 | return keys; |
d0734356 KO |
502 | } |
503 | ||
504 | /* journal replay: */ | |
505 | ||
506 | static void replay_now_at(struct journal *j, u64 seq) | |
507 | { | |
508 | BUG_ON(seq < j->replay_journal_seq); | |
509 | BUG_ON(seq > j->replay_journal_seq_end); | |
510 | ||
511 | while (j->replay_journal_seq < seq) | |
512 | bch2_journal_pin_put(j, j->replay_journal_seq++); | |
513 | } | |
514 | ||
2d594dfb | 515 | static int __bch2_journal_replay_key(struct btree_trans *trans, |
f44a6a71 KO |
516 | enum btree_id id, unsigned level, |
517 | struct bkey_i *k) | |
2d594dfb KO |
518 | { |
519 | struct btree_iter *iter; | |
f6d0368e | 520 | int ret; |
2d594dfb | 521 | |
f44a6a71 KO |
522 | iter = bch2_trans_get_node_iter(trans, id, k->k.p, |
523 | BTREE_MAX_DEPTH, level, | |
524 | BTREE_ITER_INTENT); | |
2d594dfb | 525 | |
e3e464ac KO |
526 | /* |
527 | * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run | |
528 | * extent_handle_overwrites() and extent_update_to_keys() - but we don't | |
529 | * want that here, journal replay is supposed to treat extents like | |
530 | * regular keys: | |
531 | */ | |
532 | __bch2_btree_iter_set_pos(iter, k->k.p, false); | |
533 | ||
f6d0368e KO |
534 | ret = bch2_btree_iter_traverse(iter) ?: |
535 | bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); | |
27beb810 | 536 | bch2_trans_iter_put(trans, iter); |
f6d0368e | 537 | return ret; |
2d594dfb KO |
538 | } |
539 | ||
5b593ee1 | 540 | static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) |
2d594dfb | 541 | { |
5b593ee1 KO |
542 | unsigned commit_flags = BTREE_INSERT_NOFAIL| |
543 | BTREE_INSERT_LAZY_RW; | |
544 | ||
545 | if (!k->allocated) | |
546 | commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; | |
547 | ||
548 | return bch2_trans_do(c, NULL, NULL, commit_flags, | |
549 | __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k)); | |
2d594dfb KO |
550 | } |
551 | ||
5d20ba48 KO |
552 | static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) |
553 | { | |
554 | struct btree_iter *iter; | |
555 | int ret; | |
556 | ||
557 | iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p, | |
558 | BTREE_ITER_CACHED| | |
559 | BTREE_ITER_CACHED_NOFILL| | |
560 | BTREE_ITER_INTENT); | |
3eb26d01 | 561 | ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); |
5d20ba48 KO |
562 | bch2_trans_iter_put(trans, iter); |
563 | return ret; | |
564 | } | |
565 | ||
566 | static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) | |
567 | { | |
568 | return bch2_trans_do(c, NULL, NULL, | |
569 | BTREE_INSERT_NOFAIL| | |
570 | BTREE_INSERT_USE_RESERVE| | |
571 | BTREE_INSERT_LAZY_RW| | |
572 | BTREE_INSERT_JOURNAL_REPLAY, | |
573 | __bch2_alloc_replay_key(&trans, k)); | |
574 | } | |
575 | ||
576 | static int journal_sort_seq_cmp(const void *_l, const void *_r) | |
577 | { | |
578 | const struct journal_key *l = _l; | |
579 | const struct journal_key *r = _r; | |
580 | ||
581 | return cmp_int(r->level, l->level) ?: | |
582 | cmp_int(l->journal_seq, r->journal_seq) ?: | |
583 | cmp_int(l->btree_id, r->btree_id) ?: | |
584 | bkey_cmp(l->k->k.p, r->k->k.p); | |
585 | } | |
586 | ||
d0734356 KO |
587 | static int bch2_journal_replay(struct bch_fs *c, |
588 | struct journal_keys keys) | |
644d180b KO |
589 | { |
590 | struct journal *j = &c->journal; | |
d0734356 | 591 | struct journal_key *i; |
5d20ba48 | 592 | u64 seq; |
d0734356 | 593 | int ret; |
7b512638 | 594 | |
d0734356 | 595 | sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); |
7b512638 | 596 | |
2f194e16 KO |
597 | if (keys.nr) |
598 | replay_now_at(j, keys.journal_seq_base); | |
f44a6a71 | 599 | |
5d20ba48 KO |
600 | seq = j->replay_journal_seq; |
601 | ||
602 | /* | |
603 | * First replay updates to the alloc btree - these will only update the | |
604 | * btree key cache: | |
605 | */ | |
d0734356 | 606 | for_each_journal_key(keys, i) { |
5d20ba48 | 607 | cond_resched(); |
d0734356 | 608 | |
5d20ba48 KO |
609 | if (!i->level && i->btree_id == BTREE_ID_ALLOC) { |
610 | j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; | |
d0734356 | 611 | ret = bch2_alloc_replay_key(c, i->k); |
5d20ba48 KO |
612 | if (ret) |
613 | goto err; | |
614 | } | |
615 | } | |
644d180b | 616 | |
5d20ba48 KO |
617 | /* |
618 | * Next replay updates to interior btree nodes: | |
619 | */ | |
620 | for_each_journal_key(keys, i) { | |
621 | cond_resched(); | |
622 | ||
623 | if (i->level) { | |
624 | j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; | |
5b593ee1 | 625 | ret = bch2_journal_replay_key(c, i); |
5d20ba48 KO |
626 | if (ret) |
627 | goto err; | |
644d180b | 628 | } |
5d20ba48 KO |
629 | } |
630 | ||
631 | /* | |
632 | * Now that the btree is in a consistent state, we can start journal | |
633 | * reclaim (which will be flushing entries from the btree key cache back | |
634 | * to the btree: | |
635 | */ | |
636 | set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); | |
637 | set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); | |
5731cf01 | 638 | journal_reclaim_kick(j); |
5d20ba48 KO |
639 | |
640 | j->replay_journal_seq = seq; | |
d0734356 | 641 | |
5d20ba48 KO |
642 | /* |
643 | * Now replay leaf node updates: | |
644 | */ | |
645 | for_each_journal_key(keys, i) { | |
d0734356 | 646 | cond_resched(); |
5d20ba48 KO |
647 | |
648 | if (i->level || i->btree_id == BTREE_ID_ALLOC) | |
649 | continue; | |
650 | ||
651 | replay_now_at(j, keys.journal_seq_base + i->journal_seq); | |
652 | ||
8042b5b7 | 653 | ret = bch2_journal_replay_key(c, i); |
5d20ba48 KO |
654 | if (ret) |
655 | goto err; | |
7b512638 | 656 | } |
644d180b KO |
657 | |
658 | replay_now_at(j, j->replay_journal_seq_end); | |
659 | j->replay_journal_seq = 0; | |
660 | ||
661 | bch2_journal_set_replay_done(j); | |
662 | bch2_journal_flush_all_pins(j); | |
d0734356 | 663 | return bch2_journal_error(j); |
5d20ba48 | 664 | err: |
a0b73c1c KO |
665 | bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", |
666 | ret, bch2_btree_ids[i->btree_id], i->level); | |
5d20ba48 | 667 | return ret; |
7b512638 KO |
668 | } |
669 | ||
644d180b | 670 | /* journal replay early: */ |
7b512638 | 671 | |
42b72e0b KO |
672 | static int journal_replay_entry_early(struct bch_fs *c, |
673 | struct jset_entry *entry) | |
674 | { | |
675 | int ret = 0; | |
676 | ||
677 | switch (entry->type) { | |
678 | case BCH_JSET_ENTRY_btree_root: { | |
2ded276b KO |
679 | struct btree_root *r; |
680 | ||
681 | if (entry->btree_id >= BTREE_ID_NR) { | |
682 | bch_err(c, "filesystem has unknown btree type %u", | |
683 | entry->btree_id); | |
684 | return -EINVAL; | |
685 | } | |
686 | ||
687 | r = &c->btree_roots[entry->btree_id]; | |
42b72e0b KO |
688 | |
689 | if (entry->u64s) { | |
690 | r->level = entry->level; | |
691 | bkey_copy(&r->key, &entry->start[0]); | |
692 | r->error = 0; | |
693 | } else { | |
694 | r->error = -EIO; | |
695 | } | |
696 | r->alive = true; | |
697 | break; | |
698 | } | |
699 | case BCH_JSET_ENTRY_usage: { | |
700 | struct jset_entry_usage *u = | |
701 | container_of(entry, struct jset_entry_usage, entry); | |
702 | ||
3577df5f KO |
703 | switch (entry->btree_id) { |
704 | case FS_USAGE_RESERVED: | |
705 | if (entry->level < BCH_REPLICAS_MAX) | |
5e82a9a1 KO |
706 | c->usage_base->persistent_reserved[entry->level] = |
707 | le64_to_cpu(u->v); | |
42b72e0b KO |
708 | break; |
709 | case FS_USAGE_INODES: | |
5e82a9a1 | 710 | c->usage_base->nr_inodes = le64_to_cpu(u->v); |
42b72e0b KO |
711 | break; |
712 | case FS_USAGE_KEY_VERSION: | |
713 | atomic64_set(&c->key_version, | |
3577df5f | 714 | le64_to_cpu(u->v)); |
42b72e0b KO |
715 | break; |
716 | } | |
717 | ||
718 | break; | |
719 | } | |
3577df5f KO |
720 | case BCH_JSET_ENTRY_data_usage: { |
721 | struct jset_entry_data_usage *u = | |
722 | container_of(entry, struct jset_entry_data_usage, entry); | |
180fb49d | 723 | |
3577df5f KO |
724 | ret = bch2_replicas_set_usage(c, &u->r, |
725 | le64_to_cpu(u->v)); | |
726 | break; | |
727 | } | |
180fb49d KO |
728 | case BCH_JSET_ENTRY_dev_usage: { |
729 | struct jset_entry_dev_usage *u = | |
730 | container_of(entry, struct jset_entry_dev_usage, entry); | |
731 | struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev); | |
732 | unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); | |
733 | unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / | |
734 | sizeof(struct jset_entry_dev_usage_type); | |
735 | unsigned i; | |
736 | ||
737 | ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); | |
738 | ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); | |
739 | ||
740 | for (i = 0; i < nr_types; i++) { | |
741 | ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); | |
742 | ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); | |
743 | ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); | |
744 | } | |
745 | ||
746 | break; | |
747 | } | |
1dd7f9d9 KO |
748 | case BCH_JSET_ENTRY_blacklist: { |
749 | struct jset_entry_blacklist *bl_entry = | |
750 | container_of(entry, struct jset_entry_blacklist, entry); | |
751 | ||
752 | ret = bch2_journal_seq_blacklist_add(c, | |
753 | le64_to_cpu(bl_entry->seq), | |
754 | le64_to_cpu(bl_entry->seq) + 1); | |
755 | break; | |
756 | } | |
757 | case BCH_JSET_ENTRY_blacklist_v2: { | |
758 | struct jset_entry_blacklist_v2 *bl_entry = | |
759 | container_of(entry, struct jset_entry_blacklist_v2, entry); | |
760 | ||
761 | ret = bch2_journal_seq_blacklist_add(c, | |
762 | le64_to_cpu(bl_entry->start), | |
763 | le64_to_cpu(bl_entry->end) + 1); | |
764 | break; | |
765 | } | |
2abe5420 KO |
766 | case BCH_JSET_ENTRY_clock: { |
767 | struct jset_entry_clock *clock = | |
768 | container_of(entry, struct jset_entry_clock, entry); | |
769 | ||
770 | atomic64_set(&c->io_clock[clock->rw].now, clock->time); | |
771 | } | |
42b72e0b KO |
772 | } |
773 | ||
774 | return ret; | |
775 | } | |
776 | ||
1dd7f9d9 KO |
777 | static int journal_replay_early(struct bch_fs *c, |
778 | struct bch_sb_field_clean *clean, | |
779 | struct list_head *journal) | |
1c6fdbd8 | 780 | { |
adbcada4 | 781 | struct journal_replay *i; |
7b512638 KO |
782 | struct jset_entry *entry; |
783 | int ret; | |
1c6fdbd8 | 784 | |
7b512638 | 785 | if (clean) { |
7b512638 KO |
786 | for (entry = clean->start; |
787 | entry != vstruct_end(&clean->field); | |
788 | entry = vstruct_next(entry)) { | |
789 | ret = journal_replay_entry_early(c, entry); | |
790 | if (ret) | |
791 | return ret; | |
792 | } | |
793 | } else { | |
adbcada4 KO |
794 | list_for_each_entry(i, journal, list) { |
795 | if (i->ignore) | |
796 | continue; | |
7b512638 | 797 | |
7b512638 KO |
798 | vstruct_for_each(&i->j, entry) { |
799 | ret = journal_replay_entry_early(c, entry); | |
800 | if (ret) | |
801 | return ret; | |
802 | } | |
adbcada4 | 803 | } |
134915f3 | 804 | } |
1c6fdbd8 | 805 | |
7b512638 KO |
806 | bch2_fs_usage_initialize(c); |
807 | ||
808 | return 0; | |
809 | } | |
810 | ||
644d180b KO |
811 | /* sb clean section: */ |
812 | ||
813 | static struct bkey_i *btree_root_find(struct bch_fs *c, | |
814 | struct bch_sb_field_clean *clean, | |
815 | struct jset *j, | |
816 | enum btree_id id, unsigned *level) | |
817 | { | |
818 | struct bkey_i *k; | |
819 | struct jset_entry *entry, *start, *end; | |
820 | ||
821 | if (clean) { | |
822 | start = clean->start; | |
823 | end = vstruct_end(&clean->field); | |
824 | } else { | |
825 | start = j->start; | |
826 | end = vstruct_last(j); | |
827 | } | |
828 | ||
829 | for (entry = start; entry < end; entry = vstruct_next(entry)) | |
830 | if (entry->type == BCH_JSET_ENTRY_btree_root && | |
831 | entry->btree_id == id) | |
832 | goto found; | |
833 | ||
834 | return NULL; | |
835 | found: | |
836 | if (!entry->u64s) | |
837 | return ERR_PTR(-EINVAL); | |
838 | ||
839 | k = entry->start; | |
840 | *level = entry->level; | |
841 | return k; | |
842 | } | |
843 | ||
844 | static int verify_superblock_clean(struct bch_fs *c, | |
845 | struct bch_sb_field_clean **cleanp, | |
846 | struct jset *j) | |
847 | { | |
848 | unsigned i; | |
849 | struct bch_sb_field_clean *clean = *cleanp; | |
850 | int ret = 0; | |
851 | ||
644d180b KO |
852 | if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, |
853 | "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", | |
854 | le64_to_cpu(clean->journal_seq), | |
855 | le64_to_cpu(j->seq))) { | |
856 | kfree(clean); | |
857 | *cleanp = NULL; | |
858 | return 0; | |
859 | } | |
860 | ||
644d180b | 861 | for (i = 0; i < BTREE_ID_NR; i++) { |
00b8ccf7 | 862 | char buf1[200], buf2[200]; |
644d180b KO |
863 | struct bkey_i *k1, *k2; |
864 | unsigned l1 = 0, l2 = 0; | |
865 | ||
866 | k1 = btree_root_find(c, clean, NULL, i, &l1); | |
867 | k2 = btree_root_find(c, NULL, j, i, &l2); | |
868 | ||
869 | if (!k1 && !k2) | |
870 | continue; | |
871 | ||
872 | mustfix_fsck_err_on(!k1 || !k2 || | |
873 | IS_ERR(k1) || | |
874 | IS_ERR(k2) || | |
875 | k1->k.u64s != k2->k.u64s || | |
876 | memcmp(k1, k2, bkey_bytes(k1)) || | |
877 | l1 != l2, c, | |
00b8ccf7 KO |
878 | "superblock btree root %u doesn't match journal after clean shutdown\n" |
879 | "sb: l=%u %s\n" | |
880 | "journal: l=%u %s\n", i, | |
881 | l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), | |
882 | l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); | |
644d180b KO |
883 | } |
884 | fsck_err: | |
885 | return ret; | |
886 | } | |
887 | ||
888 | static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) | |
889 | { | |
890 | struct bch_sb_field_clean *clean, *sb_clean; | |
891 | int ret; | |
892 | ||
893 | mutex_lock(&c->sb_lock); | |
894 | sb_clean = bch2_sb_get_clean(c->disk_sb.sb); | |
895 | ||
896 | if (fsck_err_on(!sb_clean, c, | |
897 | "superblock marked clean but clean section not present")) { | |
898 | SET_BCH_SB_CLEAN(c->disk_sb.sb, false); | |
899 | c->sb.clean = false; | |
900 | mutex_unlock(&c->sb_lock); | |
901 | return NULL; | |
902 | } | |
903 | ||
904 | clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), | |
905 | GFP_KERNEL); | |
906 | if (!clean) { | |
907 | mutex_unlock(&c->sb_lock); | |
908 | return ERR_PTR(-ENOMEM); | |
909 | } | |
910 | ||
911 | if (le16_to_cpu(c->disk_sb.sb->version) < | |
912 | bcachefs_metadata_version_bkey_renumber) | |
913 | bch2_sb_clean_renumber(clean, READ); | |
914 | ||
915 | mutex_unlock(&c->sb_lock); | |
916 | ||
917 | return clean; | |
918 | fsck_err: | |
919 | mutex_unlock(&c->sb_lock); | |
920 | return ERR_PTR(ret); | |
921 | } | |
922 | ||
7b512638 KO |
923 | static int read_btree_roots(struct bch_fs *c) |
924 | { | |
925 | unsigned i; | |
926 | int ret = 0; | |
1c6fdbd8 KO |
927 | |
928 | for (i = 0; i < BTREE_ID_NR; i++) { | |
7b512638 | 929 | struct btree_root *r = &c->btree_roots[i]; |
1c6fdbd8 | 930 | |
7b512638 KO |
931 | if (!r->alive) |
932 | continue; | |
1c6fdbd8 | 933 | |
7b512638 | 934 | if (i == BTREE_ID_ALLOC && |
ad7e137e | 935 | c->opts.reconstruct_alloc) { |
19dd3172 | 936 | c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); |
1c6fdbd8 | 937 | continue; |
7b512638 | 938 | } |
1c6fdbd8 | 939 | |
7b512638 KO |
940 | if (r->error) { |
941 | __fsck_err(c, i == BTREE_ID_ALLOC | |
942 | ? FSCK_CAN_IGNORE : 0, | |
943 | "invalid btree root %s", | |
944 | bch2_btree_ids[i]); | |
945 | if (i == BTREE_ID_ALLOC) | |
19dd3172 | 946 | c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); |
7b512638 KO |
947 | } |
948 | ||
949 | ret = bch2_btree_root_read(c, i, &r->key, r->level); | |
950 | if (ret) { | |
951 | __fsck_err(c, i == BTREE_ID_ALLOC | |
952 | ? FSCK_CAN_IGNORE : 0, | |
953 | "error reading btree root %s", | |
954 | bch2_btree_ids[i]); | |
955 | if (i == BTREE_ID_ALLOC) | |
19dd3172 | 956 | c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); |
7b512638 | 957 | } |
1c6fdbd8 | 958 | } |
7b512638 KO |
959 | |
960 | for (i = 0; i < BTREE_ID_NR; i++) | |
961 | if (!c->btree_roots[i].b) | |
962 | bch2_btree_root_alloc(c, i); | |
1c6fdbd8 KO |
963 | fsck_err: |
964 | return ret; | |
965 | } | |
966 | ||
1c6fdbd8 KO |
967 | int bch2_fs_recovery(struct bch_fs *c) |
968 | { | |
969 | const char *err = "cannot allocate memory"; | |
1dd7f9d9 | 970 | struct bch_sb_field_clean *clean = NULL; |
adbcada4 KO |
971 | struct jset *last_journal_entry = NULL; |
972 | u64 blacklist_seq, journal_seq; | |
4291a331 | 973 | bool write_sb = false; |
1c6fdbd8 KO |
974 | int ret; |
975 | ||
1dd7f9d9 KO |
976 | if (c->sb.clean) |
977 | clean = read_superblock_clean(c); | |
978 | ret = PTR_ERR_OR_ZERO(clean); | |
979 | if (ret) | |
980 | goto err; | |
981 | ||
982 | if (c->sb.clean) | |
1c6fdbd8 KO |
983 | bch_info(c, "recovering from clean shutdown, journal seq %llu", |
984 | le64_to_cpu(clean->journal_seq)); | |
985 | ||
5d428c7c KO |
986 | if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { |
987 | bch_info(c, "alloc_v2 feature bit not set, fsck required"); | |
988 | c->opts.fsck = true; | |
989 | c->opts.fix_errors = FSCK_OPT_YES; | |
990 | c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_alloc_v2; | |
991 | } | |
992 | ||
f621e152 KO |
993 | if (!c->replicas.entries || |
994 | c->opts.rebuild_replicas) { | |
1dd7f9d9 KO |
995 | bch_info(c, "building replicas info"); |
996 | set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); | |
997 | } | |
998 | ||
adbcada4 KO |
999 | ret = bch2_blacklist_table_initialize(c); |
1000 | if (ret) { | |
1001 | bch_err(c, "error initializing blacklist table"); | |
1002 | goto err; | |
1003 | } | |
1004 | ||
5a655f06 | 1005 | if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { |
adbcada4 | 1006 | struct journal_replay *i; |
1dd7f9d9 | 1007 | |
adbcada4 KO |
1008 | ret = bch2_journal_read(c, &c->journal_entries, |
1009 | &blacklist_seq, &journal_seq); | |
1c6fdbd8 KO |
1010 | if (ret) |
1011 | goto err; | |
1012 | ||
adbcada4 KO |
1013 | list_for_each_entry_reverse(i, &c->journal_entries, list) |
1014 | if (!i->ignore) { | |
1015 | last_journal_entry = &i->j; | |
1016 | break; | |
1017 | } | |
1018 | ||
1019 | if (mustfix_fsck_err_on(c->sb.clean && | |
1020 | last_journal_entry && | |
1021 | !journal_entry_empty(last_journal_entry), c, | |
932aa837 | 1022 | "filesystem marked clean but journal not empty")) { |
19dd3172 | 1023 | c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); |
932aa837 KO |
1024 | SET_BCH_SB_CLEAN(c->disk_sb.sb, false); |
1025 | c->sb.clean = false; | |
1026 | } | |
1dd7f9d9 | 1027 | |
adbcada4 KO |
1028 | if (!last_journal_entry) { |
1029 | fsck_err_on(!c->sb.clean, c, "no journal entries found"); | |
1030 | goto use_clean; | |
1dd7f9d9 KO |
1031 | } |
1032 | ||
f1d786a0 KO |
1033 | c->journal_keys = journal_keys_sort(&c->journal_entries); |
1034 | if (!c->journal_keys.d) { | |
d0734356 KO |
1035 | ret = -ENOMEM; |
1036 | goto err; | |
1037 | } | |
1038 | ||
adbcada4 KO |
1039 | if (c->sb.clean && last_journal_entry) { |
1040 | ret = verify_superblock_clean(c, &clean, | |
1041 | last_journal_entry); | |
1042 | if (ret) | |
1043 | goto err; | |
1044 | } | |
1045 | } else { | |
1046 | use_clean: | |
1047 | if (!clean) { | |
1048 | bch_err(c, "no superblock clean section found"); | |
1049 | ret = BCH_FSCK_REPAIR_IMPOSSIBLE; | |
7b512638 | 1050 | goto err; |
1dd7f9d9 | 1051 | |
adbcada4 KO |
1052 | } |
1053 | blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; | |
1dd7f9d9 KO |
1054 | } |
1055 | ||
e3e464ac KO |
1056 | if (!c->sb.clean && |
1057 | !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { | |
1058 | bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); | |
1059 | ret = -EINVAL; | |
1060 | goto err; | |
1061 | } | |
1062 | ||
33114c2d | 1063 | if (c->opts.reconstruct_alloc) { |
19dd3172 | 1064 | c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); |
33114c2d KO |
1065 | drop_alloc_keys(&c->journal_keys); |
1066 | } | |
1067 | ||
f1d786a0 | 1068 | ret = journal_replay_early(c, clean, &c->journal_entries); |
1dd7f9d9 KO |
1069 | if (ret) |
1070 | goto err; | |
1071 | ||
adbcada4 KO |
1072 | /* |
1073 | * After an unclean shutdown, skip then next few journal sequence | |
1074 | * numbers as they may have been referenced by btree writes that | |
1075 | * happened before their corresponding journal writes - those btree | |
1076 | * writes need to be ignored, by skipping and blacklisting the next few | |
1077 | * journal sequence numbers: | |
1078 | */ | |
1079 | if (!c->sb.clean) | |
1080 | journal_seq += 8; | |
1081 | ||
1082 | if (blacklist_seq != journal_seq) { | |
1dd7f9d9 | 1083 | ret = bch2_journal_seq_blacklist_add(c, |
adbcada4 | 1084 | blacklist_seq, journal_seq); |
1dd7f9d9 KO |
1085 | if (ret) { |
1086 | bch_err(c, "error creating new journal seq blacklist entry"); | |
7b512638 | 1087 | goto err; |
1dd7f9d9 | 1088 | } |
f707e3d8 | 1089 | } |
1c6fdbd8 | 1090 | |
d0734356 | 1091 | ret = bch2_fs_journal_start(&c->journal, journal_seq, |
f1d786a0 | 1092 | &c->journal_entries); |
7b512638 KO |
1093 | if (ret) |
1094 | goto err; | |
1c6fdbd8 | 1095 | |
7b512638 KO |
1096 | ret = read_btree_roots(c); |
1097 | if (ret) | |
1098 | goto err; | |
1c6fdbd8 | 1099 | |
932aa837 | 1100 | bch_verbose(c, "starting alloc read"); |
1c6fdbd8 | 1101 | err = "error reading allocation information"; |
f1d786a0 | 1102 | ret = bch2_alloc_read(c, &c->journal_keys); |
1c6fdbd8 KO |
1103 | if (ret) |
1104 | goto err; | |
932aa837 | 1105 | bch_verbose(c, "alloc read done"); |
1c6fdbd8 | 1106 | |
94cd106f | 1107 | bch_verbose(c, "starting stripes_read"); |
932aa837 | 1108 | err = "error reading stripes"; |
f1d786a0 | 1109 | ret = bch2_stripes_read(c, &c->journal_keys); |
4e65431c KO |
1110 | if (ret) |
1111 | goto err; | |
94cd106f | 1112 | bch_verbose(c, "stripes_read done"); |
61c8d7c8 KO |
1113 | |
1114 | set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); | |
4e65431c | 1115 | |
7b512638 | 1116 | if (c->opts.fsck || |
19dd3172 KO |
1117 | !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || |
1118 | !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || | |
7b512638 | 1119 | test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { |
41e37786 KO |
1120 | bool metadata_only = c->opts.norecovery; |
1121 | ||
619f5bee | 1122 | bch_info(c, "starting mark and sweep"); |
932aa837 | 1123 | err = "error in mark and sweep"; |
41e37786 | 1124 | ret = bch2_gc(c, true, metadata_only); |
8d6b6222 | 1125 | if (ret) |
4291a331 | 1126 | goto err; |
1df42b57 KO |
1127 | bch_verbose(c, "mark and sweep done"); |
1128 | } | |
1c6fdbd8 | 1129 | |
ac958006 KO |
1130 | bch2_stripes_heap_start(c); |
1131 | ||
72644db1 | 1132 | clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); |
1df42b57 | 1133 | set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); |
72644db1 | 1134 | |
42b72e0b KO |
1135 | /* |
1136 | * Skip past versions that might have possibly been used (as nonces), | |
1137 | * but hadn't had their pointers written: | |
1138 | */ | |
1139 | if (c->sb.encryption_type && !c->sb.clean) | |
1140 | atomic64_add(1 << 16, &c->key_version); | |
1141 | ||
619f5bee | 1142 | if (c->opts.norecovery) |
7b512638 KO |
1143 | goto out; |
1144 | ||
619f5bee | 1145 | bch_verbose(c, "starting journal replay"); |
1c6fdbd8 | 1146 | err = "journal replay failed"; |
f1d786a0 | 1147 | ret = bch2_journal_replay(c, c->journal_keys); |
1c6fdbd8 KO |
1148 | if (ret) |
1149 | goto err; | |
1150 | bch_verbose(c, "journal replay done"); | |
1151 | ||
4291a331 KO |
1152 | if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && |
1153 | !c->opts.nochanges) { | |
619f5bee KO |
1154 | /* |
1155 | * note that even when filesystem was clean there might be work | |
1156 | * to do here, if we ran gc (because of fsck) which recalculated | |
1157 | * oldest_gen: | |
1158 | */ | |
1159 | bch_verbose(c, "writing allocation info"); | |
1160 | err = "error writing out alloc info"; | |
8d6b6222 KO |
1161 | ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?: |
1162 | bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); | |
619f5bee KO |
1163 | if (ret) { |
1164 | bch_err(c, "error writing alloc info"); | |
1165 | goto err; | |
1166 | } | |
1167 | bch_verbose(c, "alloc write done"); | |
932aa837 | 1168 | } |
932aa837 | 1169 | |
619f5bee | 1170 | if (!c->sb.clean) { |
1c3ff72c | 1171 | if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { |
619f5bee KO |
1172 | bch_info(c, "checking inode link counts"); |
1173 | err = "error in recovery"; | |
1174 | ret = bch2_fsck_inode_nlink(c); | |
1175 | if (ret) | |
1176 | goto err; | |
1177 | bch_verbose(c, "check inodes done"); | |
1c6fdbd8 | 1178 | |
619f5bee KO |
1179 | } else { |
1180 | bch_verbose(c, "checking for deleted inodes"); | |
1181 | err = "error in recovery"; | |
1182 | ret = bch2_fsck_walk_inodes_only(c); | |
1183 | if (ret) | |
1184 | goto err; | |
1185 | bch_verbose(c, "check inodes done"); | |
1186 | } | |
1187 | } | |
1188 | ||
1189 | if (c->opts.fsck) { | |
1190 | bch_info(c, "starting fsck"); | |
1191 | err = "error in fsck"; | |
1192 | ret = bch2_fsck_full(c); | |
1193 | if (ret) | |
1194 | goto err; | |
1195 | bch_verbose(c, "fsck done"); | |
1196 | } | |
1c6fdbd8 | 1197 | |
7b512638 | 1198 | if (enabled_qtypes(c)) { |
619f5bee | 1199 | bch_verbose(c, "reading quotas"); |
7b512638 KO |
1200 | ret = bch2_fs_quota_read(c); |
1201 | if (ret) | |
1202 | goto err; | |
1203 | bch_verbose(c, "quotas done"); | |
1204 | } | |
1205 | ||
19dd3172 KO |
1206 | if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || |
1207 | !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { | |
a4805d66 KO |
1208 | struct bch_move_stats stats = { 0 }; |
1209 | ||
19dd3172 | 1210 | bch_info(c, "scanning for old btree nodes"); |
a4805d66 KO |
1211 | ret = bch2_fs_read_write(c); |
1212 | if (ret) | |
1213 | goto err; | |
1214 | ||
1215 | ret = bch2_scan_old_btree_nodes(c, &stats); | |
1216 | if (ret) | |
1217 | goto err; | |
19dd3172 | 1218 | bch_info(c, "scanning for old btree nodes done"); |
a4805d66 KO |
1219 | } |
1220 | ||
26609b61 KO |
1221 | mutex_lock(&c->sb_lock); |
1222 | if (c->opts.version_upgrade) { | |
1223 | if (c->sb.version < bcachefs_metadata_version_new_versioning) | |
1224 | c->disk_sb.sb->version_min = | |
1225 | le16_to_cpu(bcachefs_metadata_version_min); | |
1226 | c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); | |
b807a0c8 | 1227 | c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; |
932aa837 KO |
1228 | write_sb = true; |
1229 | } | |
1230 | ||
1231 | if (!test_bit(BCH_FS_ERROR, &c->flags)) { | |
19dd3172 | 1232 | c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; |
932aa837 | 1233 | write_sb = true; |
88c07f73 KO |
1234 | } |
1235 | ||
0bc166ff KO |
1236 | if (c->opts.fsck && |
1237 | !test_bit(BCH_FS_ERROR, &c->flags)) { | |
1c3ff72c | 1238 | c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; |
0bc166ff | 1239 | SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); |
932aa837 | 1240 | write_sb = true; |
0bc166ff | 1241 | } |
932aa837 KO |
1242 | |
1243 | if (write_sb) | |
1244 | bch2_write_super(c); | |
26609b61 | 1245 | mutex_unlock(&c->sb_lock); |
1dd7f9d9 KO |
1246 | |
1247 | if (c->journal_seq_blacklist_table && | |
1248 | c->journal_seq_blacklist_table->nr > 128) | |
1249 | queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); | |
1c6fdbd8 | 1250 | out: |
619f5bee KO |
1251 | ret = 0; |
1252 | err: | |
1253 | fsck_err: | |
89b05118 | 1254 | set_bit(BCH_FS_FSCK_DONE, &c->flags); |
619f5bee | 1255 | bch2_flush_fsck_errs(c); |
89b05118 | 1256 | |
f1d786a0 KO |
1257 | if (!c->opts.keep_journal) { |
1258 | bch2_journal_keys_free(&c->journal_keys); | |
1259 | bch2_journal_entries_free(&c->journal_entries); | |
1260 | } | |
1c6fdbd8 | 1261 | kfree(clean); |
619f5bee KO |
1262 | if (ret) |
1263 | bch_err(c, "Error in recovery: %s (%i)", err, ret); | |
1264 | else | |
1265 | bch_verbose(c, "ret %i", ret); | |
1c6fdbd8 | 1266 | return ret; |
1c6fdbd8 KO |
1267 | } |
1268 | ||
1269 | int bch2_fs_initialize(struct bch_fs *c) | |
1270 | { | |
1271 | struct bch_inode_unpacked root_inode, lostfound_inode; | |
1272 | struct bkey_inode_buf packed_inode; | |
1c6fdbd8 KO |
1273 | struct qstr lostfound = QSTR("lost+found"); |
1274 | const char *err = "cannot allocate memory"; | |
1275 | struct bch_dev *ca; | |
1276 | LIST_HEAD(journal); | |
1277 | unsigned i; | |
1278 | int ret; | |
1279 | ||
1280 | bch_notice(c, "initializing new filesystem"); | |
1281 | ||
3e0745e2 KO |
1282 | mutex_lock(&c->sb_lock); |
1283 | for_each_online_member(ca, c, i) | |
1284 | bch2_mark_dev_superblock(c, ca, 0); | |
1285 | mutex_unlock(&c->sb_lock); | |
1286 | ||
61fc3c96 KO |
1287 | mutex_lock(&c->sb_lock); |
1288 | c->disk_sb.sb->version = c->disk_sb.sb->version_min = | |
1289 | le16_to_cpu(bcachefs_metadata_version_current); | |
1290 | c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; | |
1291 | c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; | |
19dd3172 KO |
1292 | c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; |
1293 | c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; | |
61fc3c96 KO |
1294 | |
1295 | bch2_write_super(c); | |
1296 | mutex_unlock(&c->sb_lock); | |
1297 | ||
1c6fdbd8 | 1298 | set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); |
f7e76361 | 1299 | set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); |
1c6fdbd8 | 1300 | |
dfe9bfb3 KO |
1301 | for (i = 0; i < BTREE_ID_NR; i++) |
1302 | bch2_btree_root_alloc(c, i); | |
1303 | ||
5d20ba48 KO |
1304 | set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); |
1305 | set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); | |
1306 | ||
1c6fdbd8 | 1307 | err = "unable to allocate journal buckets"; |
1633e492 KO |
1308 | for_each_online_member(ca, c, i) { |
1309 | ret = bch2_dev_journal_alloc(ca); | |
1310 | if (ret) { | |
1c6fdbd8 KO |
1311 | percpu_ref_put(&ca->io_ref); |
1312 | goto err; | |
1313 | } | |
1633e492 | 1314 | } |
1c6fdbd8 | 1315 | |
1c6fdbd8 KO |
1316 | /* |
1317 | * journal_res_get() will crash if called before this has | |
1318 | * set up the journal.pin FIFO and journal.cur pointer: | |
1319 | */ | |
1dd7f9d9 | 1320 | bch2_fs_journal_start(&c->journal, 1, &journal); |
1c6fdbd8 KO |
1321 | bch2_journal_set_replay_done(&c->journal); |
1322 | ||
8d6b6222 KO |
1323 | err = "error going read-write"; |
1324 | ret = bch2_fs_read_write_early(c); | |
1325 | if (ret) | |
1326 | goto err; | |
1327 | ||
1328 | /* | |
1329 | * Write out the superblock and journal buckets, now that we can do | |
1330 | * btree updates | |
1331 | */ | |
1332 | err = "error writing alloc info"; | |
1333 | ret = bch2_alloc_write(c, 0); | |
1334 | if (ret) | |
1335 | goto err; | |
1336 | ||
1c6fdbd8 KO |
1337 | bch2_inode_init(c, &root_inode, 0, 0, |
1338 | S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); | |
1339 | root_inode.bi_inum = BCACHEFS_ROOT_INO; | |
a3e72262 | 1340 | bch2_inode_pack(c, &packed_inode, &root_inode); |
1c6fdbd8 KO |
1341 | |
1342 | err = "error creating root directory"; | |
1343 | ret = bch2_btree_insert(c, BTREE_ID_INODES, | |
1344 | &packed_inode.inode.k_i, | |
8d6b6222 | 1345 | NULL, NULL, 0); |
1c6fdbd8 KO |
1346 | if (ret) |
1347 | goto err; | |
1348 | ||
96385742 | 1349 | bch2_inode_init_early(c, &lostfound_inode); |
1c6fdbd8 KO |
1350 | |
1351 | err = "error creating lost+found"; | |
58e2388f | 1352 | ret = bch2_trans_do(c, NULL, NULL, 0, |
96385742 KO |
1353 | bch2_create_trans(&trans, BCACHEFS_ROOT_INO, |
1354 | &root_inode, &lostfound_inode, | |
1355 | &lostfound, | |
b627c7d8 | 1356 | 0, 0, S_IFDIR|0700, 0, |
96385742 | 1357 | NULL, NULL)); |
dab9ef0d KO |
1358 | if (ret) { |
1359 | bch_err(c, "error creating lost+found"); | |
1c6fdbd8 | 1360 | goto err; |
dab9ef0d | 1361 | } |
1c6fdbd8 | 1362 | |
1c6fdbd8 KO |
1363 | if (enabled_qtypes(c)) { |
1364 | ret = bch2_fs_quota_read(c); | |
1365 | if (ret) | |
1366 | goto err; | |
1367 | } | |
1368 | ||
1369 | err = "error writing first journal entry"; | |
1370 | ret = bch2_journal_meta(&c->journal); | |
1371 | if (ret) | |
1372 | goto err; | |
1373 | ||
1374 | mutex_lock(&c->sb_lock); | |
1375 | SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); | |
1376 | SET_BCH_SB_CLEAN(c->disk_sb.sb, false); | |
1377 | ||
1378 | bch2_write_super(c); | |
1379 | mutex_unlock(&c->sb_lock); | |
1380 | ||
1381 | return 0; | |
1382 | err: | |
8b335bae | 1383 | pr_err("Error initializing new filesystem: %s (%i)", err, ret); |
1c6fdbd8 KO |
1384 | return ret; |
1385 | } |