// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "error.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "trace.h"

struct journal_list {
	struct closure		cl;
	struct mutex		lock;
	struct list_head	*head;
	int			ret;
};

#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_list *jlist, struct jset *j)
{
	struct journal_replay *i, *pos;
	struct list_head *where;
	size_t bytes = vstruct_bytes(j);
	__le64 last_seq;
	int ret;

	last_seq = !list_empty(jlist->head)
		? list_last_entry(jlist->head, struct journal_replay,
				  list)->j.last_seq
		: 0;

	if (!c->opts.read_entire_journal) {
		/* Is this entry older than the range we need? */
		if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
			ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
			goto out;
		}

		/* Drop entries we don't need anymore */
		list_for_each_entry_safe(i, pos, jlist->head, list) {
			if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
				break;
			list_del(&i->list);
			kvpfree(i, offsetof(struct journal_replay, j) +
				vstruct_bytes(&i->j));
		}
	}

	list_for_each_entry_reverse(i, jlist->head, list) {
		/* Duplicate? */
		if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
			fsck_err_on(bytes != vstruct_bytes(&i->j) ||
				    memcmp(j, &i->j, bytes), c,
				    "found duplicate but non identical journal entries (seq %llu)",
				    le64_to_cpu(j->seq));
			goto found;
		}

		if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
			where = &i->list;
			goto add;
		}
	}

	where = jlist->head;
add:
	i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
	if (!i) {
		ret = -ENOMEM;
		goto out;
	}

	list_add(&i->list, where);
	i->devs.nr = 0;
	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
found:
	if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
		bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
	else
		fsck_err_on(1, c, "duplicate journal entries on same device");
	ret = JOURNAL_ENTRY_ADD_OK;
out:
fsck_err:
	return ret;
}

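/*
 * Journal entries are checksummed/encrypted with a nonce derived from the
 * entry's sequence number, so every jset gets a unique nonce:
 */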
static struct nonce journal_nonce(const struct jset *jset)
{
	return (struct nonce) {{
		[0] = 0,
		[1] = ((__le32 *) &jset->seq)[0],
		[2] = ((__le32 *) &jset->seq)[1],
		[3] = BCH_NONCE_JOURNAL,
	}};
}

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
	struct jset_entry *entry;

	for (entry = start; entry != end; entry = vstruct_next(entry))
		memset(entry, 0, sizeof(*entry));
}

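/*
 * Status codes returned while reading the journal, distinct from negative
 * errnos (which are fatal): REREAD means the entry didn't fit in the read
 * buffer, NONE means no jset magic was found, BAD means a corrupt entry:
 */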
#define JOURNAL_ENTRY_REREAD	5
#define JOURNAL_ENTRY_NONE	6
#define JOURNAL_ENTRY_BAD	7

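/*
 * A journal entry error is a fixable fsck error on the read side; on the
 * write side it means we're about to write corrupt metadata, which we treat
 * as a filesystem inconsistency:
 */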
#define journal_entry_err(c, msg, ...)					\
({									\
	switch (write) {						\
	case READ:							\
		mustfix_fsck_err(c, msg, ##__VA_ARGS__);		\
		break;							\
	case WRITE:							\
		bch_err(c, "corrupt metadata before write:\n"		\
			msg, ##__VA_ARGS__);				\
		if (bch2_fs_inconsistent(c)) {				\
			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
	true;								\
})

#define journal_entry_err_on(cond, c, msg, ...)				\
	((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)

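/*
 * bch2_bkey_compat() handles key format/endianness conversion between
 * on-disk versions; note that below it runs before validation on the read
 * side (presumably so validation sees current-format keys) and after
 * validation on the write side, just before the key goes out to disk:
 */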
static int journal_validate_key(struct bch_fs *c, struct jset *jset,
				struct jset_entry *entry,
				unsigned level, enum btree_id btree_id,
				struct bkey_i *k,
				const char *type, int write)
{
	void *next = vstruct_next(entry);
	const char *invalid;
	unsigned version = le32_to_cpu(jset->version);
	int ret = 0;

	if (journal_entry_err_on(!k->k.u64s, c,
			"invalid %s in journal: k->u64s 0", type)) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	if (journal_entry_err_on((void *) bkey_next(k) >
				 (void *) vstruct_next(entry), c,
			"invalid %s in journal: extends past end of journal entry",
			type)) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
			"invalid %s in journal: bad format %u",
			type, k->k.format)) {
		le16_add_cpu(&entry->u64s, -k->k.u64s);
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	if (!write)
		bch2_bkey_compat(level, btree_id, version,
			    JSET_BIG_ENDIAN(jset), write,
			    NULL, bkey_to_packed(k));

	invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
				    __btree_node_type(level, btree_id));
	if (invalid) {
		char buf[160];

		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
		mustfix_fsck_err(c, "invalid %s in journal: %s\n%s",
				 type, invalid, buf);

		le16_add_cpu(&entry->u64s, -k->k.u64s);
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	if (write)
		bch2_bkey_compat(level, btree_id, version,
			    JSET_BIG_ENDIAN(jset), write,
			    NULL, bkey_to_packed(k));
fsck_err:
	return ret;
}

static int journal_entry_validate_btree_keys(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     int write)
{
	struct bkey_i *k;

	vstruct_for_each(entry, k) {
		int ret = journal_validate_key(c, jset, entry,
					       entry->level,
					       entry->btree_id,
					       k, "key", write);
		if (ret)
			return ret;
	}

	return 0;
}

static int journal_entry_validate_btree_root(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     int write)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s, c,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
				    "btree root", write);
fsck_err:
	return ret;
}

static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    int write)
{
	/* obsolete, don't care: */
	return 0;
}

static int journal_entry_validate_blacklist(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    int write)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}

static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
					       struct jset *jset,
					       struct jset_entry *entry,
					       int write)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end), c,
				 "invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}

static int journal_entry_validate_usage(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					int write)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static int journal_entry_validate_data_usage(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     int write)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

struct jset_entry_ops {
	int (*validate)(struct bch_fs *, struct jset *,
			struct jset_entry *, int);
};

static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)						\
	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
		.validate	= journal_entry_validate_##f,	\
	},
	BCH_JSET_ENTRY_TYPES()
#undef x
};

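/*
 * Note that entry types we don't know about (entry->type >=
 * BCH_JSET_ENTRY_NR, e.g. from a newer version) are accepted without
 * validation rather than treated as errors:
 */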
static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
				  struct jset_entry *entry, int write)
{
	return entry->type < BCH_JSET_ENTRY_NR
		? bch2_jset_entry_ops[entry->type].validate(c, jset,
							    entry, write)
		: 0;
}

static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 int write)
{
	struct jset_entry *entry;
	int ret = 0;

	vstruct_for_each(jset, entry) {
		if (journal_entry_err_on(vstruct_next(entry) >
					 vstruct_last(jset), c,
				"journal entry extends past end of jset")) {
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = journal_entry_validate(c, jset, entry, write);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}

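/*
 * Validate a single jset read off disk: check magic, version, size and
 * checksum, then decrypt the contents in place:
 */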
static int jset_validate(struct bch_fs *c,
			 struct jset *jset, u64 sector,
			 unsigned bucket_sectors_left,
			 unsigned sectors_read,
			 int write)
{
	size_t bytes = vstruct_bytes(jset);
	struct bch_csum csum;
	unsigned version;
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if ((version != BCH_JSET_VERSION_OLD &&
	     version < bcachefs_metadata_version_min) ||
	    version >= bcachefs_metadata_version_max) {
		bch_err(c, "unknown journal entry version %u", version);
		return BCH_FSCK_UNKNOWN_VERSION;
	}

	if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
				 "journal entry too big (%zu bytes), sector %llu",
				 bytes, sector)) {
		/* XXX: note we might have missing journal entries */
		return JOURNAL_ENTRY_BAD;
	}

	if (bytes > sectors_read << 9)
		return JOURNAL_ENTRY_REREAD;

	if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
			"journal entry with unknown csum type %llu sector %llu",
			JSET_CSUM_TYPE(jset), sector))
		return JOURNAL_ENTRY_BAD;

	csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
	if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
				 "journal checksum bad, sector %llu", sector)) {
		/* XXX: retry IO, when we start retrying checksum errors */
		/* XXX: note we might have missing journal entries */
		return JOURNAL_ENTRY_BAD;
	}

	bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
		     jset->encrypted_start,
		     vstruct_end(jset) - (void *) jset->encrypted_start);

	if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
				 "invalid journal entry: last_seq > seq"))
		jset->last_seq = jset->seq;

	return 0;
fsck_err:
	return ret;
}

struct journal_read_buf {
	void		*data;
	size_t		size;
};

static int journal_read_buf_realloc(struct journal_read_buf *b,
				    size_t new_size)
{
	void *n;

	/* the bios are sized for this many pages, max: */
	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
		return -ENOMEM;

	new_size = roundup_pow_of_two(new_size);
	n = kvpmalloc(new_size, GFP_KERNEL);
	if (!n)
		return -ENOMEM;

	kvpfree(b->data, b->size);
	b->data = n;
	b->size = new_size;
	return 0;
}

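/*
 * Read through one journal bucket, validating each jset found and adding
 * good entries to the replay list; if an entry is bigger than the current
 * buffer, the buffer is grown and the read retried:
 */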
static int journal_read_bucket(struct bch_dev *ca,
			       struct journal_read_buf *buf,
			       struct journal_list *jlist,
			       unsigned bucket)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	struct jset *j = NULL;
	unsigned sectors, sectors_read = 0;
	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
	    end = offset + ca->mi.bucket_size;
	bool saw_bad = false;
	int ret = 0;

	pr_debug("reading %u", bucket);

	while (offset < end) {
		if (!sectors_read) {
			struct bio *bio;
			unsigned nr_bvecs;
reread:
			sectors_read = min_t(unsigned,
					     end - offset, buf->size >> 9);
			nr_bvecs = buf_pages(buf->data, sectors_read << 9);

			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

			bio->bi_iter.bi_sector = offset;
			bch2_bio_map(bio, buf->data, sectors_read << 9);

			ret = submit_bio_wait(bio);
			kfree(bio);

			if (bch2_dev_io_err_on(ret, ca,
					       "journal read from sector %llu",
					       offset) ||
			    bch2_meta_read_fault("journal"))
				return -EIO;

			j = buf->data;
		}

		ret = jset_validate(c, j, offset,
				    end - offset, sectors_read,
				    READ);
		switch (ret) {
		case BCH_FSCK_OK:
			break;
		case JOURNAL_ENTRY_REREAD:
			if (vstruct_bytes(j) > buf->size) {
				ret = journal_read_buf_realloc(buf,
							vstruct_bytes(j));
				if (ret)
					return ret;
			}
			goto reread;
		case JOURNAL_ENTRY_NONE:
			if (!saw_bad)
				return 0;
			sectors = c->opts.block_size;
			goto next_block;
		case JOURNAL_ENTRY_BAD:
			saw_bad = true;
			sectors = c->opts.block_size;
			goto next_block;
		default:
			return ret;
		}

		/*
		 * This happens sometimes if we don't have discards on -
		 * when we've partially overwritten a bucket with new
		 * journal entries. We don't need the rest of the
		 * bucket:
		 */
		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
			return 0;

		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

		mutex_lock(&jlist->lock);
		ret = journal_entry_add(c, ca, jlist, j);
		mutex_unlock(&jlist->lock);

		switch (ret) {
		case JOURNAL_ENTRY_ADD_OK:
			break;
		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
			break;
		default:
			return ret;
		}

		sectors = vstruct_sectors(j, c->block_bits);
next_block:
		pr_debug("next");
		offset		+= sectors;
		sectors_read	-= sectors;
		j = ((void *) j) + (sectors << 9);
	}

	return 0;
}

static void bch2_journal_read_device(struct closure *cl)
{
	struct journal_device *ja =
		container_of(cl, struct journal_device, read);
	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
	struct journal_list *jlist =
		container_of(cl->parent, struct journal_list, cl);
	struct journal_read_buf buf = { NULL, 0 };
	u64 min_seq = U64_MAX;
	unsigned i;
	int ret;

	if (!ja->nr)
		goto out;

	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
	if (ret)
		goto err;

	pr_debug("%u journal buckets", ja->nr);

	for (i = 0; i < ja->nr; i++) {
		ret = journal_read_bucket(ca, &buf, jlist, i);
		if (ret)
			goto err;
	}

	/* Find the journal bucket with the highest sequence number: */
	for (i = 0; i < ja->nr; i++) {
		if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
			ja->cur_idx = i;

		min_seq = min(ja->bucket_seq[i], min_seq);
	}

	/*
	 * If there are duplicate journal entries in multiple buckets (which
	 * definitely isn't supposed to happen, but...) - make sure to start
	 * cur_idx at the last of those buckets, so we don't deadlock trying to
	 * allocate
	 */
	while (ja->bucket_seq[ja->cur_idx] > min_seq &&
	       ja->bucket_seq[ja->cur_idx] >
	       ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
		ja->cur_idx = (ja->cur_idx + 1) % ja->nr;

	ja->sectors_free = 0;

	/*
	 * Set dirty_idx to indicate the entire journal is full and needs to be
	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
	 * pinned when it first runs:
	 */
	ja->discard_idx = ja->dirty_idx_ondisk =
		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
	kvpfree(buf.data, buf.size);
	percpu_ref_put(&ca->io_ref);
	closure_return(cl);
	return;
err:
	mutex_lock(&jlist->lock);
	jlist->ret = ret;
	mutex_unlock(&jlist->lock);
	goto out;
}

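/*
 * Read the journal from every member device in parallel (one closure per
 * device), then validate the entries found and make sure each entry's
 * replicas are marked in the superblock:
 */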
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
	struct journal_list jlist;
	struct journal_replay *i;
	struct bch_dev *ca;
	unsigned iter;
	size_t keys = 0, entries = 0;
	bool degraded = false;
	int ret = 0;

	closure_init_stack(&jlist.cl);
	mutex_init(&jlist.lock);
	jlist.head = list;
	jlist.ret = 0;

	for_each_member_device(ca, c, iter) {
		if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
			continue;

		if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
		     ca->mi.state == BCH_MEMBER_STATE_RO) &&
		    percpu_ref_tryget(&ca->io_ref))
			closure_call(&ca->journal.read,
				     bch2_journal_read_device,
				     system_unbound_wq,
				     &jlist.cl);
		else
			degraded = true;
	}

	closure_sync(&jlist.cl);

	if (jlist.ret)
		return jlist.ret;

	list_for_each_entry(i, list, list) {
		struct jset_entry *entry;
		struct bkey_i *k, *_n;
		struct bch_replicas_padded replicas;
		char buf[80];

		ret = jset_validate_entries(c, &i->j, READ);
		if (ret)
			goto fsck_err;

		/*
		 * If we're mounting in degraded mode - if we didn't read all
		 * the devices - this is wrong:
		 */

		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs);

		if (!degraded &&
		    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
		     fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
				 "superblock not marked as containing replicas %s",
				 (bch2_replicas_entry_to_text(&PBUF(buf),
							      &replicas.e), buf)))) {
			ret = bch2_mark_replicas(c, &replicas.e);
			if (ret)
				return ret;
		}

		for_each_jset_key(k, _n, entry, &i->j)
			keys++;
		entries++;
	}

	if (!list_empty(list)) {
		i = list_last_entry(list, struct journal_replay, list);

		bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
			 keys, entries, le64_to_cpu(i->j.seq));
	}
fsck_err:
	return ret;
}

/* journal write: */

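/*
 * Add replicas to the journal write being set up, walking devices in the
 * order given by devs_sorted; devices that aren't RW, contribute no
 * durability, have no journal space free, or already have a copy of this
 * entry are skipped:
 */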
static void __journal_write_alloc(struct journal *j,
				  struct journal_buf *w,
				  struct dev_alloc_list *devs_sorted,
				  unsigned sectors,
				  unsigned *replicas,
				  unsigned replicas_want)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_device *ja;
	struct bch_dev *ca;
	unsigned i;

	if (*replicas >= replicas_want)
		return;

	for (i = 0; i < devs_sorted->nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		/*
		 * Check that we can use this device, and aren't already using
		 * it:
		 */
		if (!ca->mi.durability ||
		    ca->mi.state != BCH_MEMBER_STATE_RW ||
		    !ja->nr ||
		    bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
					 ca->dev_idx) ||
		    sectors > ja->sectors_free)
			continue;

		bch2_dev_stripe_increment(ca, &j->wp.stripe);

		bch2_bkey_append_ptr(&w->key,
			(struct bch_extent_ptr) {
				  .offset = bucket_to_sector(ca,
					ja->buckets[ja->cur_idx]) +
					ca->mi.bucket_size -
					ja->sectors_free,
				  .dev = ca->dev_idx,
		});

		ja->sectors_free -= sectors;
		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

		*replicas += ca->mi.durability;

		if (*replicas >= replicas_want)
			break;
	}
}

/**
 * journal_write_alloc - allocate space for a journal write, moving on to the
 * next journal bucket on each device if possible
 */
static int journal_write_alloc(struct journal *j, struct journal_buf *w,
			       unsigned sectors)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_device *ja;
	struct bch_dev *ca;
	struct dev_alloc_list devs_sorted;
	unsigned i, replicas = 0, replicas_want =
		READ_ONCE(c->opts.metadata_replicas);

	rcu_read_lock();

	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
					  &c->rw_devs[BCH_DATA_journal]);

	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas >= replicas_want)
		goto done;

	for (i = 0; i < devs_sorted.nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		if (sectors > ja->sectors_free &&
		    sectors <= ca->mi.bucket_size &&
		    bch2_journal_dev_buckets_available(j, ja,
					journal_space_discarded)) {
			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
			ja->sectors_free = ca->mi.bucket_size;

			/*
			 * ja->bucket_seq[ja->cur_idx] must always have
			 * something sensible:
			 */
			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
		}
	}

	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);
done:
	rcu_read_unlock();

	return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}

static void journal_write_compact(struct jset *jset)
{
	struct jset_entry *i, *next, *prev = NULL;

	/*
	 * Simple compaction, dropping empty jset_entries (from journal
	 * reservations that weren't fully used) and merging jset_entries that
	 * can be.
	 *
	 * If we wanted to be really fancy here, we could sort all the keys in
	 * the jset and drop keys that were overwritten - probably not worth it:
	 */
	vstruct_for_each_safe(jset, i, next) {
		unsigned u64s = le16_to_cpu(i->u64s);

		/* Empty entry: */
		if (!u64s)
			continue;

		/* Can we merge with previous entry? */
		if (prev &&
		    i->btree_id == prev->btree_id &&
		    i->level	== prev->level &&
		    i->type	== prev->type &&
		    i->type	== BCH_JSET_ENTRY_btree_keys &&
		    le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
			memmove_u64s_down(vstruct_next(prev),
					  i->_data,
					  u64s);
			le16_add_cpu(&prev->u64s, u64s);
			continue;
		}

		/* Couldn't merge, move i into new position (after prev): */
		prev = prev ? vstruct_next(prev) : jset->start;
		if (i != prev)
			memmove_u64s_down(prev, i, jset_u64s(u64s));
	}

	prev = prev ? vstruct_next(prev) : jset->start;
	jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
}

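/*
 * Growing the journal buffer to j->buf_size_want is best-effort
 * (__GFP_NOWARN): on allocation failure we just keep using the current,
 * smaller buffer:
 */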
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
	/* we aren't holding j->lock: */
	unsigned new_size = READ_ONCE(j->buf_size_want);
	void *new_buf;

	if (buf->buf_size >= new_size)
		return;

	new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
	if (!new_buf)
		return;

	memcpy(new_buf, buf->data, buf->buf_size);
	kvpfree(buf->data, buf->buf_size);
	buf->data	= new_buf;
	buf->buf_size	= new_size;
}

static void journal_write_done(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_buf *w = journal_prev_buf(j);
	struct bch_devs_list devs =
		bch2_bkey_devs(bkey_i_to_s_c(&w->key));
	struct bch_replicas_padded replicas;
	u64 seq = le64_to_cpu(w->data->seq);
	u64 last_seq = le64_to_cpu(w->data->last_seq);

	bch2_time_stats_update(j->write_time, j->write_start_time);

	if (!devs.nr) {
		bch_err(c, "unable to write journal to sufficient devices");
		goto err;
	}

	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);

	if (bch2_mark_replicas(c, &replicas.e))
		goto err;

	spin_lock(&j->lock);
	if (seq >= j->pin.front)
		journal_seq_pin(j, seq)->devs = devs;

	j->seq_ondisk		= seq;
	j->last_seq_ondisk	= last_seq;
	bch2_journal_space_available(j);

	/*
	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
	 * more buckets:
	 *
	 * Must come before signaling write completion, for
	 * bch2_fs_journal_stop():
	 */
	mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
out:
	/* also must come before signalling write completion: */
	closure_debug_destroy(cl);

	BUG_ON(!j->reservations.prev_buf_unwritten);
	atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
		     &j->reservations.counter);

	closure_wake_up(&w->wait);
	journal_wake(j);

	if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
		mod_delayed_work(system_freezable_wq, &j->write_work, 0);
	spin_unlock(&j->lock);
	return;
err:
	bch2_fatal_error(c);
	spin_lock(&j->lock);
	goto out;
}

static void journal_write_endio(struct bio *bio)
{
	struct bch_dev *ca = bio->bi_private;
	struct journal *j = &ca->fs->journal;

	if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s",
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("journal")) {
		struct journal_buf *w = journal_prev_buf(j);
		unsigned long flags;

		spin_lock_irqsave(&j->err_lock, flags);
		bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx);
		spin_unlock_irqrestore(&j->err_lock, flags);
	}

	closure_put(&j->io);
	percpu_ref_put(&ca->io_ref);
}

void bch2_journal_write(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	struct journal_buf *w = journal_prev_buf(j);
	struct jset_entry *start, *end;
	struct jset *jset;
	struct bio *bio;
	struct bch_extent_ptr *ptr;
	bool validate_before_checksum = false;
	unsigned i, sectors, bytes, u64s;
	int ret;

	bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));

	journal_buf_realloc(j, w);
	jset = w->data;

	j->write_start_time = local_clock();

	/*
	 * New btree roots are set by journalling them; when the journal entry
	 * gets written we have to propagate them to c->btree_roots
	 *
	 * But, every journal entry we write has to contain all the btree roots
	 * (at least for now); so after we copy btree roots to c->btree_roots we
	 * have to get any missing btree roots and add them to this journal
	 * entry:
	 */

	bch2_journal_entries_to_btree_roots(c, jset);

	start = end = vstruct_last(jset);

	end	= bch2_btree_roots_to_journal_entries(c, jset->start, end);

	end	= bch2_journal_super_entries_add_common(c, end,
						le64_to_cpu(jset->seq));
	u64s	= (u64 *) end - (u64 *) start;
	BUG_ON(u64s > j->entry_u64s_reserved);

	le32_add_cpu(&jset->u64s, u64s);
	BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);

	journal_write_compact(jset);

	jset->read_clock	= cpu_to_le16(c->bucket_clock[READ].hand);
	jset->write_clock	= cpu_to_le16(c->bucket_clock[WRITE].hand);
	jset->magic		= cpu_to_le64(jset_magic(c));

	jset->version		= c->sb.version < bcachefs_metadata_version_new_versioning
		? cpu_to_le32(BCH_JSET_VERSION_OLD)
		: cpu_to_le32(c->sb.version);

	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));

	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
		validate_before_checksum = true;

	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max)
		validate_before_checksum = true;

	if (validate_before_checksum &&
	    jset_validate_entries(c, jset, WRITE))
		goto err;

	bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
		     jset->encrypted_start,
		     vstruct_end(jset) - (void *) jset->encrypted_start);

	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
				  journal_nonce(jset), jset);

	if (!validate_before_checksum &&
	    jset_validate_entries(c, jset, WRITE))
		goto err;

	sectors = vstruct_sectors(jset, c->block_bits);
	BUG_ON(sectors > w->sectors);

	bytes = vstruct_bytes(jset);
	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);

retry_alloc:
	spin_lock(&j->lock);
	ret = journal_write_alloc(j, w, sectors);

	if (ret && j->can_discard) {
		spin_unlock(&j->lock);
		bch2_journal_do_discards(j);
		goto retry_alloc;
	}

	/*
	 * write is allocated, no longer need to account for it in
	 * bch2_journal_space_available():
	 */
	w->sectors = 0;

	/*
	 * journal entry has been compacted and allocated, recalculate space
	 * available:
	 */
	bch2_journal_space_available(j);
	spin_unlock(&j->lock);

	if (ret) {
		bch_err(c, "Unable to allocate journal write");
		bch2_fatal_error(c);
		continue_at(cl, journal_write_done, system_highpri_wq);
		return;
	}

	/*
	 * XXX: we really should just disable the entire journal in nochanges
	 * mode
	 */
	if (c->opts.nochanges)
		goto no_io;

	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
		ca = bch_dev_bkey_exists(c, ptr->dev);
		if (!percpu_ref_tryget(&ca->io_ref)) {
			/* XXX: fix this */
			bch_err(c, "missing device for journal write");
			continue;
		}

		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
			     sectors);

		bio = ca->journal.bio;
		bio_reset(bio, ca->disk_sb.bdev,
			  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
		bio->bi_iter.bi_sector	= ptr->offset;
		bio->bi_end_io		= journal_write_endio;
		bio->bi_private		= ca;
		bch2_bio_map(bio, jset, sectors << 9);

		trace_journal_write(bio);
		closure_bio_submit(bio, cl);

		ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
	}

	for_each_rw_member(ca, c, i)
		if (journal_flushes_device(ca) &&
		    !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
			percpu_ref_get(&ca->io_ref);

			bio = ca->journal.bio;
			bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
			bio->bi_end_io		= journal_write_endio;
			bio->bi_private		= ca;
			closure_bio_submit(bio, cl);
		}

no_io:
	bch2_bucket_seq_cleanup(c);

	continue_at(cl, journal_write_done, system_highpri_wq);
	return;
err:
	bch2_inconsistent_error(c);
	continue_at(cl, journal_write_done, system_highpri_wq);
}