// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "sb-clean.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/string_choices.h>
#include <linux/sched/sysctl.h>

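/*
 * Save each device's current journal position (bucket index and offset within
 * the bucket) in its superblock member info, so that journal writes can resume
 * at the same position on the next mount:
 */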
void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
{
	lockdep_assert_held(&c->sb_lock);

	for_each_member_device(c, ca) {
		struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);

		m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
		m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
	}
}

void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
{
	mutex_lock(&c->sb_lock);
	for_each_member_device(c, ca) {
		struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);

		unsigned idx = le32_to_cpu(m.last_journal_bucket);
		if (idx < ca->journal.nr)
			ca->journal.cur_idx = idx;
		unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
		if (offset <= ca->mi.bucket_size)
			ca->journal.sectors_free = ca->mi.bucket_size - offset;
	}
	mutex_unlock(&c->sb_lock);
}

static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p)
{
	struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev);
	prt_printf(out, "%s %u:%u:%u (sector %llu)",
		   ca ? ca->name : "(invalid dev)",
		   p->dev, p->bucket, p->bucket_offset, p->sector);
	bch2_dev_put(ca);
}

void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j)
{
	darray_for_each(j->ptrs, i) {
		if (i != j->ptrs.data)
			prt_printf(out, " ");
		bch2_journal_ptr_to_text(out, c, i);
	}
}

static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j)
{
	for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) {
		struct jset_entry_datetime *datetime =
			container_of(entry, struct jset_entry_datetime, entry);
		bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
		break;
	}
}

static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
					struct journal_replay *j)
{
	prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
	bch2_journal_datetime_to_text(out, &j->j);
	prt_char(out, ' ');
	bch2_journal_ptrs_to_text(out, c, j);
}

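/*
 * The nonce for a journal entry is derived from its sequence number: words 1
 * and 2 hold the 64 bit seq, and word 3 is tagged with BCH_NONCE_JOURNAL so
 * journal nonces can't collide with nonces used for other metadata types. The
 * same nonce is used for both checksumming and encryption:
 */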
static struct nonce journal_nonce(const struct jset *jset)
{
	return (struct nonce) {{
		[0] = 0,
		[1] = ((__le32 *) &jset->seq)[0],
		[2] = ((__le32 *) &jset->seq)[1],
		[3] = BCH_NONCE_JOURNAL,
	}};
}

static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
{
	if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
		*csum = (struct bch_csum) {};
		return false;
	}

	*csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
	return !bch2_crc_cmp(j->csum, *csum);
}

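/*
 * Journal entries live in a genradix indexed by
 * (seq - journal_entries_base_seq), masked down to 31 bits - genradixes are
 * indexed by ulong, which may only be 32 bits:
 */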
static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
	return (seq - c->journal_entries_base_seq) & (~0U >> 1);
}

static void __journal_replay_free(struct bch_fs *c,
				  struct journal_replay *i)
{
	struct journal_replay **p =
		genradix_ptr(&c->journal_entries,
			     journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));

	BUG_ON(*p != i);
	*p = NULL;
	kvfree(i);
}

static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
	if (blacklisted)
		i->ignore_blacklisted = true;
	else
		i->ignore_not_dirty = true;

	if (!c->opts.read_entire_journal)
		__journal_replay_free(c, i);
}

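/*
 * Shared state for reading the journal from every device in parallel: each
 * device's read closure adds the entries it finds under @lock, @last_seq
 * tracks the oldest sequence number we still need, and @ret records the first
 * error encountered:
 */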
struct journal_list {
	struct closure		cl;
	u64			last_seq;
	struct mutex		lock;
	int			ret;
};

#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_ptr entry_ptr,
			     struct journal_list *jlist, struct jset *j)
{
	struct genradix_iter iter;
	struct journal_replay **_i, *i, *dup;
	size_t bytes = vstruct_bytes(j);
	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
	struct printbuf buf = PRINTBUF;
	int ret = JOURNAL_ENTRY_ADD_OK;

	if (last_seq && c->opts.journal_rewind)
		last_seq = min(last_seq, c->opts.journal_rewind);

	if (!c->journal.oldest_seq_found_ondisk ||
	    le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
		c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);

	/* Is this entry older than the range we need? */
	if (!c->opts.read_entire_journal &&
	    le64_to_cpu(j->seq) < jlist->last_seq)
		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;

	/*
	 * genradixes are indexed by a ulong, not a u64, so we can't index them
	 * by sequence number directly: Assume instead that they will all fall
	 * within the range of +-2 billion of the first one we find.
	 */
	if (!c->journal_entries_base_seq)
		c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);

	/* Drop entries we don't need anymore */
	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
		genradix_for_each_from(&c->journal_entries, iter, _i,
				       journal_entry_radix_idx(c, jlist->last_seq)) {
			i = *_i;

			if (journal_replay_ignore(i))
				continue;

			if (le64_to_cpu(i->j.seq) >= last_seq)
				break;

			journal_replay_free(c, i, false);
		}
	}

	jlist->last_seq = max(jlist->last_seq, last_seq);

	_i = genradix_ptr_alloc(&c->journal_entries,
				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
				GFP_KERNEL);
	if (!_i)
		return bch_err_throw(c, ENOMEM_journal_entry_add);

	/*
	 * Duplicate journal entries? If so we want the one that didn't have a
	 * checksum error:
	 */
	dup = *_i;
	if (dup) {
		bool identical = bytes == vstruct_bytes(&dup->j) &&
			!memcmp(j, &dup->j, bytes);
		bool not_identical = !identical &&
			entry_ptr.csum_good &&
			dup->csum_good;

		bool same_device = false;
		darray_for_each(dup->ptrs, ptr)
			if (ptr->dev == ca->dev_idx)
				same_device = true;

		ret = darray_push(&dup->ptrs, entry_ptr);
		if (ret)
			goto out;

		bch2_journal_replay_to_text(&buf, c, dup);

		fsck_err_on(same_device,
			    c, journal_entry_dup_same_device,
			    "duplicate journal entry on same device\n%s",
			    buf.buf);

		fsck_err_on(not_identical,
			    c, journal_entry_replicas_data_mismatch,
			    "found duplicate but non identical journal entries\n%s",
			    buf.buf);

		if (entry_ptr.csum_good && !identical)
			goto replace;

		goto out;
	}
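	/*
	 * Either this is the first copy of this entry we've seen, or the copy
	 * we just read is checksum-good and differs from the one we already
	 * had: allocate a new journal_replay for it, carrying over the
	 * duplicate's list of pointers if there was one:
	 */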
replace:
	i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
	if (!i)
		return bch_err_throw(c, ENOMEM_journal_entry_add);

	darray_init(&i->ptrs);
	i->csum_good		= entry_ptr.csum_good;
	i->ignore_blacklisted	= false;
	i->ignore_not_dirty	= false;
	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");

	if (dup) {
		/* The first ptr should represent the jset we kept: */
		darray_for_each(dup->ptrs, ptr)
			darray_push(&i->ptrs, *ptr);
		__journal_replay_free(c, dup);
	} else {
		darray_push(&i->ptrs, entry_ptr);
	}

	*_i = i;
out:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
	struct jset_entry *entry;

	for (entry = start; entry != end; entry = vstruct_next(entry))
		memset(entry, 0, sizeof(*entry));
}

#define JOURNAL_ENTRY_REREAD	5
#define JOURNAL_ENTRY_NONE	6
#define JOURNAL_ENTRY_BAD	7

static void journal_entry_err_msg(struct printbuf *out,
				  u32 version,
				  struct jset *jset,
				  struct jset_entry *entry)
{
	prt_str(out, "invalid journal entry, version=");
	bch2_version_to_text(out, version);

	if (entry) {
		prt_str(out, " type=");
		bch2_prt_jset_entry_type(out, entry->type);
	}

	if (!jset) {
		prt_printf(out, " in superblock");
	} else {

		prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));

		if (entry)
			prt_printf(out, " offset=%zi/%u",
				   (u64 *) entry - jset->_data,
				   le32_to_cpu(jset->u64s));
	}

	prt_str(out, ": ");
}

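/*
 * How an invalid journal entry is handled depends on direction: at read time
 * it's a fixable fsck error, but if we see one at write time the in-memory
 * journal buffer is corrupt, which we can't fix here. Note that this expects
 * 'from', 'ret' and an fsck_err label in scope at the call site:
 */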
#define journal_entry_err(c, version, jset, entry, _err, msg, ...)	\
({									\
	struct printbuf _buf = PRINTBUF;				\
									\
	journal_entry_err_msg(&_buf, version, jset, entry);		\
	prt_printf(&_buf, msg, ##__VA_ARGS__);				\
									\
	switch (from.flags & BCH_VALIDATE_write) {			\
	case READ:							\
		mustfix_fsck_err(c, _err, "%s", _buf.buf);		\
		break;							\
	case WRITE:							\
		bch2_sb_error_count(c, BCH_FSCK_ERR_##_err);		\
		if (bch2_fs_inconsistent(c,				\
				"corrupt metadata before write: %s\n", _buf.buf)) {\
			ret = bch_err_throw(c, fsck_errors_not_fixed);	\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
									\
	printbuf_exit(&_buf);						\
	true;								\
})

#define journal_entry_err_on(cond, ...)					\
	((cond) ? journal_entry_err(__VA_ARGS__) : false)

#define FSCK_DELETED_KEY	5

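/*
 * Validate a single bkey in a journal entry. A key that's bad beyond repair is
 * deleted in place - either by truncating the entry at the key, or by shifting
 * the following keys down over it - and we return FSCK_DELETED_KEY so the
 * caller knows to revalidate at the same position:
 */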
static int journal_validate_key(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				struct bkey_i *k,
				struct bkey_validate_context from,
				unsigned version, int big_endian)
{
	enum bch_validate_flags flags = from.flags;
	int write = flags & BCH_VALIDATE_write;
	void *next = vstruct_next(entry);
	int ret = 0;

	if (journal_entry_err_on(!k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_bkey_u64s_0,
				 "k->u64s 0")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on((void *) bkey_next(k) >
				 (void *) vstruct_next(entry),
				 c, version, jset, entry,
				 journal_entry_bkey_past_end,
				 "extends past end of journal entry")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
				 c, version, jset, entry,
				 journal_entry_bkey_bad_format,
				 "bad format %u", k->k.format)) {
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (!write)
		bch2_bkey_compat(from.level, from.btree, version, big_endian,
				 write, NULL, bkey_to_packed(k));

	ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from);
	if (ret == -BCH_ERR_fsck_delete_bkey) {
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}
	if (ret)
		goto fsck_err;

	if (write)
		bch2_bkey_compat(from.level, from.btree, version, big_endian,
				 write, NULL, bkey_to_packed(k));
fsck_err:
	return ret;
}

static int journal_entry_btree_keys_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	struct bkey_i *k = entry->start;

	from.level	= entry->level;
	from.btree	= entry->btree_id;

	while (k != vstruct_last(entry)) {
		int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
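		/*
		 * A deleted key's space was either nulled out or taken over by
		 * the keys that followed it, so revalidate without advancing:
		 */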
		if (ret == FSCK_DELETED_KEY)
			continue;
		else if (ret)
			return ret;

		k = bkey_next(k);
	}

	return 0;
}

static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	bool first = true;

	jset_entry_for_each_key(entry, k) {
		/* We may be called on entries that haven't been validated: */
		if (!k->k.u64s)
			break;

		if (!first) {
			prt_newline(out);
			bch2_prt_jset_entry_type(out, entry->type);
			prt_str(out, ": ");
		}
		bch2_btree_id_level_to_text(out, entry->btree_id, entry->level);
		prt_char(out, ' ');
		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
		first = false;
	}
}

static int journal_entry_btree_root_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	from.root	= true;
	from.level	= entry->level + 1;
	from.btree	= entry->btree_id;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_btree_root_bad_size,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
	if (ret == FSCK_DELETED_KEY)
		ret = 0;
fsck_err:
	return ret;
}

static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	/* obsolete, don't care: */
	return 0;
}

static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
}

static int journal_entry_blacklist_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
				 c, version, jset, entry,
				 journal_entry_blacklist_bad_size,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}

static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_blacklist *bl =
		container_of(entry, struct jset_entry_blacklist, entry);

	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}

static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_bad_size,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end),
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_start_past_end,
		"invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}

static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
					       struct jset_entry *entry)
{
	struct jset_entry_blacklist_v2 *bl =
		container_of(entry, struct jset_entry_blacklist_v2, entry);

	prt_printf(out, "start=%llu end=%llu",
	       le64_to_cpu(bl->start),
	       le64_to_cpu(bl->end));
}

static int journal_entry_usage_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c, version, jset, entry,
				 journal_entry_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);

	prt_str(out, "type=");
	bch2_prt_fs_usage_type(out, u->entry.btree_id);
	prt_printf(out, " v=%llu", le64_to_cpu(u->v));
}

static int journal_entry_data_usage_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	struct printbuf err = PRINTBUF;
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: %s", err.buf)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}
out:
fsck_err:
	printbuf_exit(&err);
	return ret;
}

static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);

	bch2_replicas_entry_to_text(out, &u->r);
	prt_printf(out, "=%llu", le64_to_cpu(u->v));
}

static int journal_entry_clock_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes != sizeof(*clock),
				 c, version, jset, entry,
				 journal_entry_clock_bad_size,
				 "bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(clock->rw > 1,
				 c, version, jset, entry,
				 journal_entry_clock_bad_rw,
				 "bad rw")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);

	prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time));
}

static int journal_entry_dev_usage_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	unsigned expected = sizeof(*u);
	int ret = 0;

	if (journal_entry_err_on(bytes < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(u->pad,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_pad,
				 "bad pad")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

	if (vstruct_bytes(entry) < sizeof(*u))
		return;

	prt_printf(out, "dev=%u", le32_to_cpu(u->dev));

	printbuf_indent_add(out, 2);
	for (i = 0; i < nr_types; i++) {
		prt_newline(out);
		bch2_prt_data_type(out, i);
		prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
		       le64_to_cpu(u->d[i].buckets),
		       le64_to_cpu(u->d[i].sectors),
		       le64_to_cpu(u->d[i].fragmented));
	}
	printbuf_indent_sub(out, 2);
}

static int journal_entry_log_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	return 0;
}

static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
				      struct jset_entry *entry)
{
	struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);

	prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d);
}

static int journal_entry_overwrite_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	from.flags = 0;
	return journal_entry_btree_keys_validate(c, jset, entry,
				version, big_endian, from);
}

static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_log_bkey_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	from.flags = 0;
	return journal_entry_btree_keys_validate(c, jset, entry,
				version, big_endian, from);
}

static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c,
					   struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	return journal_entry_btree_keys_validate(c, jset, entry,
				version, big_endian, from);
}

static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_datetime_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	unsigned bytes = vstruct_bytes(entry);
	unsigned expected = 16;
	int ret = 0;

	if (journal_entry_err_on(vstruct_bytes(entry) < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}
fsck_err:
	return ret;
}

static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_datetime *datetime =
		container_of(entry, struct jset_entry_datetime, entry);

	bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
}

struct jset_entry_ops {
	int (*validate)(struct bch_fs *, struct jset *,
			struct jset_entry *, unsigned, int,
			struct bkey_validate_context);
	void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};

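/*
 * Per-entry-type dispatch table, generated from BCH_JSET_ENTRY_TYPES() so that
 * every entry type gets a validate and a to_text implementation:
 */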
static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)						\
	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
		.validate	= journal_entry_##f##_validate,	\
		.to_text	= journal_entry_##f##_to_text,	\
	},
	BCH_JSET_ENTRY_TYPES()
#undef x
};

int bch2_journal_entry_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	return entry->type < BCH_JSET_ENTRY_NR
		? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
				version, big_endian, from)
		: 0;
}

void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
				struct jset_entry *entry)
{
	bch2_prt_jset_entry_type(out, entry->type);

	if (entry->type < BCH_JSET_ENTRY_NR) {
		prt_str(out, ": ");
		bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
	}
}

static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 enum bch_validate_flags flags)
{
	struct bkey_validate_context from = {
		.flags		= flags,
		.from		= BKEY_VALIDATE_journal,
		.journal_seq	= le64_to_cpu(jset->seq),
	};

	unsigned version = le32_to_cpu(jset->version);
	int ret = 0;

	vstruct_for_each(jset, entry) {
		from.journal_offset = (u64 *) entry - jset->_data;

		if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
				c, version, jset, entry,
				journal_entry_past_jset_end,
				"journal entry extends past end of jset")) {
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = bch2_journal_entry_validate(c, jset, entry, version,
						  JSET_BIG_ENDIAN(jset), from);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}

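/*
 * Full validation of a journal entry: check the magic, version, checksum type
 * and last_seq ordering, then validate every entry it contains:
 */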
static int jset_validate(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 enum bch_validate_flags flags)
{
	struct bkey_validate_context from = {
		.flags		= flags,
		.from		= BKEY_VALIDATE_journal,
		.journal_seq	= le64_to_cpu(jset->seq),
	};
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	unsigned version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
			c, version, jset, NULL,
			jset_unsupported_version,
			"%s sector %llu seq %llu: incompatible journal entry version %u.%u",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq),
			BCH_VERSION_MAJOR(version),
			BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
			c, version, jset, NULL,
			jset_unknown_csum,
			"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq),
			JSET_CSUM_TYPE(jset)))
		ret = JOURNAL_ENTRY_BAD;

	/* last_seq is ignored when JSET_NO_FLUSH is true */
	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
				 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
				 c, version, jset, NULL,
				 jset_last_seq_newer_than_seq,
				 "invalid journal entry: last_seq > seq (%llu > %llu)",
				 le64_to_cpu(jset->last_seq),
				 le64_to_cpu(jset->seq))) {
		jset->last_seq = jset->seq;
		return JOURNAL_ENTRY_BAD;
	}

	ret = jset_validate_entries(c, jset, flags);
fsck_err:
	return ret;
}

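/*
 * A minimal version of jset_validate() for use while we're still reading: we
 * may only have part of the entry in the buffer, so just check what we need to
 * know how much to read, and whether to reread into a bigger buffer:
 */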
static int jset_validate_early(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 unsigned bucket_sectors_left,
			 unsigned sectors_read)
{
	struct bkey_validate_context from = {
		.from		= BKEY_VALIDATE_journal,
		.journal_seq	= le64_to_cpu(jset->seq),
	};
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	unsigned version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
			c, version, jset, NULL,
			jset_unsupported_version,
			"%s sector %llu seq %llu: unknown journal entry version %u.%u",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq),
			BCH_VERSION_MAJOR(version),
			BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	size_t bytes = vstruct_bytes(jset);
	if (bytes > (sectors_read << 9) &&
	    sectors_read < bucket_sectors_left)
		return JOURNAL_ENTRY_REREAD;

	if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
			c, version, jset, NULL,
			jset_past_bucket_end,
			"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq), bytes))
		le32_add_cpu(&jset->u64s,
			     -((bytes - (bucket_sectors_left << 9)) / 8));
fsck_err:
	return ret;
}

struct journal_read_buf {
	void		*data;
	size_t		size;
};

static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b,
				    size_t new_size)
{
	void *n;

	/* the bios are sized for this many pages, max: */
	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
		return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);

	new_size = roundup_pow_of_two(new_size);
	n = kvmalloc(new_size, GFP_KERNEL);
	if (!n)
		return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);

	kvfree(b->data);
	b->data = n;
	b->size = new_size;
	return 0;
}

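/*
 * Scan a single journal bucket: entries are written back to back, so we walk
 * forward from the start of the bucket, validating each entry header to learn
 * its size; an entry bigger than the read buffer triggers a reread with a
 * bigger buffer, and on a bad checksum we skip ahead and retry at the next
 * block boundary:
 */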
static int journal_read_bucket(struct bch_dev *ca,
			       struct journal_read_buf *buf,
			       struct journal_list *jlist,
			       unsigned bucket)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	struct jset *j = NULL;
	unsigned sectors, sectors_read = 0;
	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
	    end = offset + ca->mi.bucket_size;
	bool saw_bad = false, csum_good;
	int ret = 0;

	pr_debug("reading %u", bucket);

	while (offset < end) {
		if (!sectors_read) {
			struct bio *bio;
			unsigned nr_bvecs;
reread:
			sectors_read = min_t(unsigned,
				end - offset, buf->size >> 9);
			nr_bvecs = buf_pages(buf->data, sectors_read << 9);

			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
			if (!bio)
				return bch_err_throw(c, ENOMEM_journal_read_bucket);
			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

			bio->bi_iter.bi_sector = offset;
			bch2_bio_map(bio, buf->data, sectors_read << 9);

			u64 submit_time = local_clock();
			ret = submit_bio_wait(bio);
			kfree(bio);

			if (!ret && bch2_meta_read_fault("journal"))
				ret = bch_err_throw(c, EIO_fault_injected);

			bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
						   submit_time, !ret);

			if (ret) {
				bch_err_dev_ratelimited(ca,
					"journal read error: sector %llu", offset);
				/*
				 * We don't error out of the recovery process
				 * here, since the relevant journal entry may be
				 * found on a different device, and missing or
				 * no journal entries will be handled later
				 */
				return 0;
			}

			j = buf->data;
		}

		ret = jset_validate_early(c, ca, j, offset,
				    end - offset, sectors_read);
		switch (ret) {
		case 0:
			sectors = vstruct_sectors(j, c->block_bits);
			break;
		case JOURNAL_ENTRY_REREAD:
			if (vstruct_bytes(j) > buf->size) {
				ret = journal_read_buf_realloc(c, buf,
							vstruct_bytes(j));
				if (ret)
					return ret;
			}
			goto reread;
		case JOURNAL_ENTRY_NONE:
			if (!saw_bad)
				return 0;
			/*
			 * On checksum error we don't really trust the size
			 * field of the journal entry we read, so try reading
			 * again at next block boundary:
			 */
			sectors = block_sectors(c);
			goto next_block;
		default:
			return ret;
		}

		if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
			ja->highest_seq_found = le64_to_cpu(j->seq);
			ja->cur_idx = bucket;
			ja->sectors_free = ca->mi.bucket_size -
				bucket_remainder(ca, offset) - sectors;
		}

		/*
		 * This happens sometimes if we don't have discards on -
		 * when we've partially overwritten a bucket with new
		 * journal entries. We don't need the rest of the
		 * bucket:
		 */
		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
			return 0;

		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

		struct bch_csum csum;
		csum_good = jset_csum_good(c, j, &csum);

		bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

		if (!csum_good) {
			/*
			 * Don't print an error here, we'll print the error
			 * later if we need this journal entry
			 */
			saw_bad = true;
		}

		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
			     j->encrypted_start,
			     vstruct_end(j) - (void *) j->encrypted_start);
		bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));

		mutex_lock(&jlist->lock);
		ret = journal_entry_add(c, ca, (struct journal_ptr) {
					.csum_good	= csum_good,
					.csum		= csum,
					.dev		= ca->dev_idx,
					.bucket		= bucket,
					.bucket_offset	= offset -
						bucket_to_sector(ca, ja->buckets[bucket]),
					.sector		= offset,
					}, jlist, j);
		mutex_unlock(&jlist->lock);

		switch (ret) {
		case JOURNAL_ENTRY_ADD_OK:
			break;
		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
			break;
		default:
			return ret;
		}
next_block:
		pr_debug("next");
		offset		+= sectors;
		sectors_read	-= sectors;
		j = ((void *) j) + (sectors << 9);
	}

	return 0;
}

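/*
 * Closure callback that reads one device's journal; bch2_journal_read() kicks
 * one of these off per device, in parallel, on system_unbound_wq:
 */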
static CLOSURE_CALLBACK(bch2_journal_read_device)
{
	closure_type(ja, struct journal_device, read);
	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
	struct bch_fs *c = ca->fs;
	struct journal_list *jlist =
		container_of(cl->parent, struct journal_list, cl);
	struct journal_read_buf buf = { NULL, 0 };
	unsigned i;
	int ret = 0;

	if (!ja->nr)
		goto out;

	ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE);
	if (ret)
		goto err;

	pr_debug("%u journal buckets", ja->nr);

	for (i = 0; i < ja->nr; i++) {
		ret = journal_read_bucket(ca, &buf, jlist, i);
		if (ret)
			goto err;
	}

	/*
	 * Set dirty_idx to indicate the entire journal is full and needs to be
	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
	 * pinned when it first runs:
	 */
	ja->discard_idx = ja->dirty_idx_ondisk =
		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
	kvfree(buf.data);
	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read);
	closure_return(cl);
	return;
err:
	mutex_lock(&jlist->lock);
	jlist->ret = ret;
	mutex_unlock(&jlist->lock);
	goto out;
}

noinline_for_stack
static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
{
	struct printbuf buf = PRINTBUF;
	enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
	bool have_good = false;

	prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq));
	bch2_journal_datetime_to_text(&buf, &j->j);
	prt_newline(&buf);

	darray_for_each(j->ptrs, ptr)
		if (!ptr->csum_good) {
			bch2_journal_ptr_to_text(&buf, c, ptr);
			prt_char(&buf, ' ');
			bch2_csum_to_text(&buf, csum_type, ptr->csum);
			prt_newline(&buf);
		} else {
			have_good = true;
		}

	prt_printf(&buf, "should be ");
	bch2_csum_to_text(&buf, csum_type, j->j.csum);

	if (have_good)
		prt_printf(&buf, "\n(had good copy on another device)");

	bch2_print_str(c, KERN_ERR, buf.buf);
	printbuf_exit(&buf);
}

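/*
 * Check for gaps in the sequence numbers we found: a missing sequence number
 * is only OK if it's covered by a journal seq blacklist entry; anything else
 * is reported as an fsck error:
 */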
noinline_for_stack
static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
{
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	struct genradix_iter radix_iter;
	struct journal_replay *i, **_i, *prev = NULL;
	u64 seq = start_seq;

	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		BUG_ON(seq > le64_to_cpu(i->j.seq));

		while (seq < le64_to_cpu(i->j.seq)) {
			while (seq < le64_to_cpu(i->j.seq) &&
			       bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (seq == le64_to_cpu(i->j.seq))
				break;

			u64 missing_start = seq;

			while (seq < le64_to_cpu(i->j.seq) &&
			       !bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			u64 missing_end = seq - 1;

			printbuf_reset(&buf);
			prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
				   missing_start, missing_end,
				   start_seq, end_seq);

			prt_printf(&buf, "\nprev at ");
			if (prev) {
				bch2_journal_ptrs_to_text(&buf, c, prev);
				prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
			} else
				prt_printf(&buf, "(none)");

			prt_printf(&buf, "\nnext at ");
			bch2_journal_ptrs_to_text(&buf, c, i);
			prt_printf(&buf, ", continue?");

			fsck_err(c, journal_entries_missing, "%s", buf.buf);
		}

		prev = i;
		seq++;
	}
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

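/*
 * Read the journal from every device and figure out where replay should start:
 * on return, *last_seq is the oldest sequence number to replay from, *start_seq
 * is one past the newest sequence number found, and sequence numbers from
 * *blacklist_seq to *start_seq are entries newer than the most recent flush
 * that will be blacklisted:
 */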
1336 int bch2_journal_read(struct bch_fs *c,
1337                       u64 *last_seq,
1338                       u64 *blacklist_seq,
1339                       u64 *start_seq)
1340 {
1341         struct journal_list jlist;
1342         struct journal_replay *i, **_i;
1343         struct genradix_iter radix_iter;
1344         struct printbuf buf = PRINTBUF;
1345         bool degraded = false, last_write_torn = false;
1346         u64 seq;
1347         int ret = 0;
1348
1349         closure_init_stack(&jlist.cl);
1350         mutex_init(&jlist.lock);
1351         jlist.last_seq = 0;
1352         jlist.ret = 0;
1353
1354         for_each_member_device(c, ca) {
1355                 if (!c->opts.fsck &&
1356                     !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
1357                         continue;
1358
1359                 if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
1360                      ca->mi.state == BCH_MEMBER_STATE_ro) &&
1361                     enumerated_ref_tryget(&ca->io_ref[READ],
1362                                           BCH_DEV_READ_REF_journal_read))
1363                         closure_call(&ca->journal.read,
1364                                      bch2_journal_read_device,
1365                                      system_unbound_wq,
1366                                      &jlist.cl);
1367                 else
1368                         degraded = true;
1369         }
1370
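        /*
         * Wait for the per-device reads to complete, using a timeout so we
         * keep waking up rather than tripping the hung task watchdog:
         */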
1371         while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2))
1372                 ;
1373
1374         if (jlist.ret)
1375                 return jlist.ret;
1376
1377         *last_seq       = 0;
1378         *start_seq      = 0;
1379         *blacklist_seq  = 0;
1380
1381         /*
1382          * Find the most recent flush entry, and ignore newer non-flush entries -
1383          * those entries will be blacklisted:
1384          */
1385         genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
1386                 i = *_i;
1387
1388                 if (journal_replay_ignore(i))
1389                         continue;
1390
1391                 if (!*start_seq)
1392                         *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
1393
1394                 if (JSET_NO_FLUSH(&i->j)) {
1395                         i->ignore_blacklisted = true;
1396                         continue;
1397                 }
1398
1399                 if (!last_write_torn && !i->csum_good) {
1400                         last_write_torn = true;
1401                         i->ignore_blacklisted = true;
1402                         continue;
1403                 }
1404
1405                 struct bkey_validate_context from = {
1406                         .from           = BKEY_VALIDATE_journal,
1407                         .journal_seq    = le64_to_cpu(i->j.seq),
1408                 };
1409                 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
1410                                          c, le32_to_cpu(i->j.version), &i->j, NULL,
1411                                          jset_last_seq_newer_than_seq,
1412                                          "invalid journal entry: last_seq > seq (%llu > %llu)",
1413                                          le64_to_cpu(i->j.last_seq),
1414                                          le64_to_cpu(i->j.seq)))
1415                         i->j.last_seq = i->j.seq;
1416
1417                 *last_seq       = le64_to_cpu(i->j.last_seq);
1418                 *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
1419                 break;
1420         }
1421
1422         if (!*start_seq) {
1423                 bch_info(c, "journal read done, but no entries found");
1424                 return 0;
1425         }
1426
1427         if (!*last_seq) {
1428                 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
1429                          "journal read done, but no entries found after dropping non-flushes");
1430                 return 0;
1431         }
1432
1433         printbuf_reset(&buf);
1434         prt_printf(&buf, "journal read done, replaying entries %llu-%llu",
1435                    *last_seq, *blacklist_seq - 1);
1436
1437         /*
1438          * Drop blacklisted entries and entries older than last_seq (or the
1439          * start of journal rewind):
1440          */
1441         u64 drop_before = *last_seq;
1442         if (c->opts.journal_rewind) {
1443                 drop_before = min(drop_before, c->opts.journal_rewind);
1444                 prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind);
1445         }
1446
1447         *last_seq = drop_before;
1448         if (*start_seq != *blacklist_seq)
1449                 prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1);
1450         bch_info(c, "%s", buf.buf);
1451         genradix_for_each(&c->journal_entries, radix_iter, _i) {
1452                 i = *_i;
1453
1454                 if (journal_replay_ignore(i))
1455                         continue;
1456
1457                 seq = le64_to_cpu(i->j.seq);
1458                 if (seq < drop_before) {
1459                         journal_replay_free(c, i, false);
1460                         continue;
1461                 }
1462
1463                 if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
1464                         fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
1465                                     jset_seq_blacklisted,
1466                                     "found blacklisted journal entry %llu", seq);
1467                         i->ignore_blacklisted = true;
1468                 }
1469         }
1470
1471         ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1);
1472         if (ret)
1473                 goto err;
1474
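        /*
         * Validate each entry we're keeping, and make sure its replicas are
         * marked in the superblock so recovery will know to look for journal
         * entries on those devices:
         */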
1475         genradix_for_each(&c->journal_entries, radix_iter, _i) {
1476                 union bch_replicas_padded replicas = {
1477                         .e.data_type = BCH_DATA_journal,
1478                         .e.nr_devs = 0,
1479                         .e.nr_required = 1,
1480                 };
1481
1482                 i = *_i;
1483                 if (journal_replay_ignore(i))
1484                         continue;
1485
1486                 /*
1487                  * Don't print checksum errors until we know we're going to use
1488                  * a given journal entry:
1489                  */
1490                 darray_for_each(i->ptrs, ptr)
1491                         if (!ptr->csum_good) {
1492                                 bch2_journal_print_checksum_error(c, i);
1493                                 break;
1494                         }
1495
1496                 ret = jset_validate(c,
1497                                     bch2_dev_have_ref(c, i->ptrs.data[0].dev),
1498                                     &i->j,
1499                                     i->ptrs.data[0].sector,
1500                                     READ);
1501                 if (ret)
1502                         goto err;
1503
1504                 darray_for_each(i->ptrs, ptr)
1505                         replicas_entry_add_dev(&replicas.e, ptr->dev);
1506
1507                 bch2_replicas_entry_sort(&replicas.e);
1508
1509                 printbuf_reset(&buf);
1510                 bch2_replicas_entry_to_text(&buf, &replicas.e);
1511
1512                 if (!degraded &&
1513                     !bch2_replicas_marked(c, &replicas.e) &&
1514                     (le64_to_cpu(i->j.seq) == *last_seq ||
1515                      fsck_err(c, journal_entry_replicas_not_marked,
1516                               "superblock not marked as containing replicas for journal entry %llu\n%s",
1517                               le64_to_cpu(i->j.seq), buf.buf))) {
1518                         ret = bch2_mark_replicas(c, &replicas.e);
1519                         if (ret)
1520                                 goto err;
1521                 }
1522         }
1523 err:
1524 fsck_err:
1525         printbuf_exit(&buf);
1526         return ret;
1527 }
1528
1529 /* journal write: */
1530
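/*
 * The current journal bucket on a device may be too full for this write:
 * advance to the next bucket when one is available, so that the retry in
 * journal_write_alloc() can succeed:
 */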
1531 static void journal_advance_devs_to_next_bucket(struct journal *j,
1532                                                 struct dev_alloc_list *devs,
1533                                                 unsigned sectors, __le64 seq)
1534 {
1535         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1536
1537         guard(rcu)();
1538         darray_for_each(*devs, i) {
1539                 struct bch_dev *ca = rcu_dereference(c->devs[*i]);
1540                 if (!ca)
1541                         continue;
1542
1543                 struct journal_device *ja = &ca->journal;
1544
1545                 if (sectors > ja->sectors_free &&
1546                     sectors <= ca->mi.bucket_size &&
1547                     bch2_journal_dev_buckets_available(j, ja,
1548                                         journal_space_discarded)) {
1549                         ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
1550                         ja->sectors_free = ca->mi.bucket_size;
1551
1552                         /*
1553                          * ja->bucket_seq[ja->cur_idx] must always have
1554                          * something sensible:
1555                          */
1556                         ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
1557                 }
1558         }
1559 }
1560
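/*
 * Walk devices in stripe order, appending a pointer to the journal entry's
 * key for each device we can write to, until enough replicas have been
 * allocated:
 */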
1561 static void __journal_write_alloc(struct journal *j,
1562                                   struct journal_buf *w,
1563                                   struct dev_alloc_list *devs,
1564                                   unsigned sectors,
1565                                   unsigned *replicas,
1566                                   unsigned replicas_want)
1567 {
1568         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1569
1570         darray_for_each(*devs, i) {
1571                 struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE,
1572                                         BCH_DEV_WRITE_REF_journal_write);
1573                 if (!ca)
1574                         continue;
1575
1576                 struct journal_device *ja = &ca->journal;
1577
1578                 /*
1579                  * Check that we can use this device, and aren't already using
1580                  * it:
1581                  */
1582                 if (!ca->mi.durability ||
1583                     ca->mi.state != BCH_MEMBER_STATE_rw ||
1584                     !ja->nr ||
1585                     bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
1586                     sectors > ja->sectors_free) {
1587                         enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
1588                         continue;
1589                 }
1590
1591                 bch2_dev_stripe_increment(ca, &j->wp.stripe);
1592
1593                 bch2_bkey_append_ptr(&w->key,
1594                         (struct bch_extent_ptr) {
1595                                   .offset = bucket_to_sector(ca,
1596                                         ja->buckets[ja->cur_idx]) +
1597                                         ca->mi.bucket_size -
1598                                         ja->sectors_free,
1599                                   .dev = ca->dev_idx,
1600                 });
1601
1602                 ja->sectors_free -= sectors;
1603                 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
1604
1605                 *replicas += ca->mi.durability;
1606
1607                 if (*replicas >= replicas_want)
1608                         break;
1609         }
1610 }
1611
1612 static int journal_write_alloc(struct journal *j, struct journal_buf *w,
1613                                unsigned *replicas)
1614 {
1615         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1616         struct bch_devs_mask devs;
1617         struct dev_alloc_list devs_sorted;
1618         unsigned sectors = vstruct_sectors(w->data, c->block_bits);
1619         unsigned target = c->opts.metadata_target ?:
1620                 c->opts.foreground_target;
1621         unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas);
1622         unsigned replicas_need = min_t(unsigned, replicas_want,
1623                                        READ_ONCE(c->opts.metadata_replicas_required));
1624         bool advance_done = false;
1625
1626 retry_target:
1627         devs = target_rw_devs(c, BCH_DATA_journal, target);
1628         bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted);
1629 retry_alloc:
1630         __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want);
1631
1632         if (likely(*replicas >= replicas_want))
1633                 goto done;
1634
1635         if (!advance_done) {
1636                 journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq);
1637                 advance_done = true;
1638                 goto retry_alloc;
1639         }
1640
1641         if (*replicas < replicas_want && target) {
1642                 /* Retry from all devices: */
1643                 target = 0;
1644                 advance_done = false;
1645                 goto retry_target;
1646         }
1647 done:
1648         BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
1649
1650 #if 0
1651         /*
1652          * XXX: we need a way to alert the user when we go degraded for any
1653          * reason
1654          */
1655         if (*replicas < min(replicas_want,
1656                             dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) {
1657         }
1658 #endif
1659
1660         return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
1661 }
1662
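/*
 * Grow the journal buffer if recent entries wanted more space than it
 * currently has - along with the btree write buffer, which is sized as a
 * fraction of the journal buffer:
 */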
1663 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
1664 {
1665         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1666
1667         /* we aren't holding j->lock: */
1668         unsigned new_size = READ_ONCE(j->buf_size_want);
1669         void *new_buf;
1670
1671         if (buf->buf_size >= new_size)
1672                 return;
1673
1674         size_t btree_write_buffer_size = new_size / 64;
1675
1676         if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
1677                 return;
1678
1679         new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
1680         if (!new_buf)
1681                 return;
1682
1683         memcpy(new_buf, buf->data, buf->buf_size);
1684
1685         spin_lock(&j->lock);
1686         swap(buf->data,         new_buf);
1687         swap(buf->buf_size,     new_size);
1688         spin_unlock(&j->lock);
1689
1690         kvfree(new_buf);
1691 }
1692
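/*
 * Runs once every bio for this journal write has completed: update the
 * on-disk sequence numbers, recycle the buffer, and kick off any journal
 * writes that were waiting on this one:
 */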
1693 static CLOSURE_CALLBACK(journal_write_done)
1694 {
1695         closure_type(w, struct journal_buf, io);
1696         struct journal *j = container_of(w, struct journal, buf[w->idx]);
1697         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1698         union bch_replicas_padded replicas;
1699         u64 seq = le64_to_cpu(w->data->seq);
1700         int err = 0;
1701
1702         bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
1703                                ? j->flush_write_time
1704                                : j->noflush_write_time, j->write_start_time);
1705
1706         if (!w->devs_written.nr) {
1707                 err = bch_err_throw(c, journal_write_err);
1708         } else {
1709                 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
1710                                          w->devs_written);
1711                 err = bch2_mark_replicas(c, &replicas.e);
1712         }
1713
1714         if (err && !bch2_journal_error(j)) {
1715                 struct printbuf buf = PRINTBUF;
1716                 bch2_log_msg_start(c, &buf);
1717
1718                 if (err == -BCH_ERR_journal_write_err)
1719                         prt_printf(&buf, "unable to write journal to sufficient devices\n");
1720                 else
1721                         prt_printf(&buf, "journal write error marking replicas: %s\n",
1722                                    bch2_err_str(err));
1723
1724                 bch2_fs_emergency_read_only2(c, &buf);
1725
1726                 bch2_print_str(c, KERN_ERR, buf.buf);
1727                 printbuf_exit(&buf);
1728         }
1729
1730         closure_debug_destroy(cl);
1731
1732         spin_lock(&j->lock);
1733         if (seq >= j->pin.front)
1734                 journal_seq_pin(j, seq)->devs = w->devs_written;
1735         if (err && (!j->err_seq || seq < j->err_seq))
1736                 j->err_seq      = seq;
1737         w->write_done = true;
1738
1739         if (!j->free_buf || j->free_buf_size < w->buf_size) {
1740                 swap(j->free_buf,       w->data);
1741                 swap(j->free_buf_size,  w->buf_size);
1742         }
1743
1744         if (w->data) {
1745                 void *buf = w->data;
1746                 w->data = NULL;
1747                 w->buf_size = 0;
1748
1749                 spin_unlock(&j->lock);
1750                 kvfree(buf);
1751                 spin_lock(&j->lock);
1752         }
1753
1754         bool completed = false;
1755         bool do_discards = false;
1756
1757         for (seq = journal_last_unwritten_seq(j);
1758              seq <= journal_cur_seq(j);
1759              seq++) {
1760                 w = j->buf + (seq & JOURNAL_BUF_MASK);
1761                 if (!w->write_done)
1762                         break;
1763
1764                 if (!j->err_seq && !w->noflush) {
1765                         j->flushed_seq_ondisk = seq;
1766                         j->last_seq_ondisk = w->last_seq;
1767
1768                         closure_wake_up(&c->freelist_wait);
1769                         bch2_reset_alloc_cursors(c);
1770                 }
1771
1772                 j->seq_ondisk = seq;
1773
1774                 /*
1775                  * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
1776                  * more buckets:
1777                  *
1778                  * Must come before signaling write completion, for
1779                  * bch2_fs_journal_stop():
1780                  */
1781                 if (j->watermark != BCH_WATERMARK_stripe)
1782                         journal_reclaim_kick(&c->journal);
1783
1784                 closure_wake_up(&w->wait);
1785                 completed = true;
1786         }
1787
1788         if (completed) {
1789                 bch2_journal_reclaim_fast(j);
1790                 bch2_journal_space_available(j);
1791
1792                 track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
1793
1794                 journal_wake(j);
1795         }
1796
1797         if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
1798             j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
1799                 struct journal_buf *buf = journal_cur_buf(j);
1800                 long delta = buf->expires - jiffies;
1801
1802                 /*
1803                  * We don't close a journal entry to write it while there are
1804                  * previous entries still in flight - the current journal entry
1805                  * might want to be written now:
1806                  */
1807                 mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
1808         }
1809
1810         /*
1811          * We don't typically trigger journal writes from here - the next journal
1812          * write will be triggered immediately after the previous one is
1813          * allocated, in bch2_journal_write() - but the journal write error path
1814          * is special:
1815          */
1816         bch2_journal_do_writes(j);
1817         spin_unlock(&j->lock);
1818
1819         if (do_discards)
1820                 bch2_do_discards(c);
1821 }
1822
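/*
 * Per-device write completion: on error, drop this device from the set of
 * devices the entry was successfully written to, so it isn't counted as a
 * replica:
 */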
1823 static void journal_write_endio(struct bio *bio)
1824 {
1825         struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
1826         struct bch_dev *ca = jbio->ca;
1827         struct journal *j = &ca->fs->journal;
1828         struct journal_buf *w = j->buf + jbio->buf_idx;
1829
1830         bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
1831                                    jbio->submit_time, !bio->bi_status);
1832
1833         if (bio->bi_status) {
1834                 bch_err_dev_ratelimited(ca,
1835                                "error writing journal entry %llu: %s",
1836                                le64_to_cpu(w->data->seq),
1837                                bch2_blk_status_to_str(bio->bi_status));
1838
1839                 unsigned long flags;
1840                 spin_lock_irqsave(&j->err_lock, flags);
1841                 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
1842                 spin_unlock_irqrestore(&j->err_lock, flags);
1843         }
1844
1845         closure_put(&w->io);
1846         enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
1847 }
1848
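/*
 * Submit one bio per pointer in the journal entry's key;
 * journal_write_done runs once they have all completed:
 */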
1849 static CLOSURE_CALLBACK(journal_write_submit)
1850 {
1851         closure_type(w, struct journal_buf, io);
1852         struct journal *j = container_of(w, struct journal, buf[w->idx]);
1853         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1854         unsigned sectors = vstruct_sectors(w->data, c->block_bits);
1855
1856         extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
1857                 struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
1858
1859                 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
1860                              sectors);
1861
1862                 struct journal_device *ja = &ca->journal;
1863                 struct journal_bio *jbio = ja->bio[w->idx];
1864                 struct bio *bio = &jbio->bio;
1865
1866                 jbio->submit_time       = local_clock();
1867
1868                 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
1869                 bio->bi_iter.bi_sector  = ptr->offset;
1870                 bio->bi_end_io          = journal_write_endio;
1871                 bio->bi_private         = ca;
1872                 bio->bi_ioprio          = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);
1873
1874                 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
1875                 ca->prev_journal_sector = bio->bi_iter.bi_sector;
1876
1877                 if (!JSET_NO_FLUSH(w->data))
1878                         bio->bi_opf    |= REQ_FUA;
1879                 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
1880                         bio->bi_opf    |= REQ_PREFLUSH;
1881
1882                 bch2_bio_map(bio, w->data, sectors << 9);
1883
1884                 trace_and_count(c, journal_write, bio);
1885                 closure_bio_submit(bio, cl);
1886
1887                 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
1888         }
1889
1890         continue_at(cl, journal_write_done, j->wq);
1891 }
1892
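/*
 * A flush write must not complete before prior journal writes are on disk:
 * wait for them, then issue the flush - as separate flush-only bios when
 * writing to multiple devices, otherwise as REQ_PREFLUSH on the write
 * itself:
 */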
1893 static CLOSURE_CALLBACK(journal_write_preflush)
1894 {
1895         closure_type(w, struct journal_buf, io);
1896         struct journal *j = container_of(w, struct journal, buf[w->idx]);
1897         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1898
1899         /*
1900          * Wait for previous journal writes to complete; they won't necessarily
1901          * be flushed if they're still in flight
1902          */
1903         if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
1904                 spin_lock(&j->lock);
1905                 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
1906                         closure_wait(&j->async_wait, cl);
1907                         spin_unlock(&j->lock);
1908                         continue_at(cl, journal_write_preflush, j->wq);
1909                         return;
1910                 }
1911                 spin_unlock(&j->lock);
1912         }
1913
1914         if (w->separate_flush) {
1915                 for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) {
1916                         enumerated_ref_get(&ca->io_ref[WRITE],
1917                                            BCH_DEV_WRITE_REF_journal_write);
1918
1919                         struct journal_device *ja = &ca->journal;
1920                         struct bio *bio = &ja->bio[w->idx]->bio;
1921                         bio_reset(bio, ca->disk_sb.bdev,
1922                                   REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
1923                         bio->bi_end_io          = journal_write_endio;
1924                         bio->bi_private         = ca;
1925                         closure_bio_submit(bio, cl);
1926                 }
1927
1928                 continue_at(cl, journal_write_submit, j->wq);
1929         } else {
1930                 /*
1931                  * no need to punt to another work item if we're not waiting on
1932                  * preflushes
1933                  */
1934                 journal_write_submit(&cl->work);
1935         }
1936 }
1937
1938 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
1939 {
1940         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1941         struct jset_entry *start, *end;
1942         struct jset *jset = w->data;
1943         struct journal_keys_to_wb wb = { NULL };
1944         unsigned u64s;
1945         unsigned long btree_roots_have = 0;
1946         u64 seq = le64_to_cpu(jset->seq);
1947         int ret;
1948
1949         /*
1950          * Simple compaction, dropping empty jset_entries (from journal
1951          * reservations that weren't fully used) and merging jset_entries that
1952          * can be.
1953          *
1954          * If we wanted to be really fancy here, we could sort all the keys in
1955          * the jset and drop keys that were overwritten - probably not worth it:
1956          */
1957         vstruct_for_each(jset, i) {
1958                 unsigned u64s = le16_to_cpu(i->u64s);
1959
1960                 /* Empty entry: */
1961                 if (!u64s)
1962                         continue;
1963
1964                 /*
1965                  * New btree roots are set by journalling them; when the journal
1966                  * entry gets written we have to propagate them to
1967                  * c->btree_roots
1968                  *
1969                  * But, every journal entry we write has to contain all the
1970                  * btree roots (at least for now); so after we copy btree roots
1971                  * to c->btree_roots we have to get any missing btree roots and
1972                  * add them to this journal entry:
1973                  */
1974                 switch (i->type) {
1975                 case BCH_JSET_ENTRY_btree_root:
1976                         bch2_journal_entry_to_btree_root(c, i);
1977                         __set_bit(i->btree_id, &btree_roots_have);
1978                         break;
1979                 case BCH_JSET_ENTRY_write_buffer_keys:
1980                         EBUG_ON(!w->need_flush_to_write_buffer);
1981
1982                         if (!wb.wb)
1983                                 bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
1984
1985                         jset_entry_for_each_key(i, k) {
1986                                 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
1987                                 if (ret) {
1988                                         bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
1989                                                             bch2_err_str(ret));
1990                                         bch2_journal_keys_to_write_buffer_end(c, &wb);
1991                                         return ret;
1992                                 }
1993                         }
1994                         i->type = BCH_JSET_ENTRY_btree_keys;
1995                         break;
1996                 }
1997         }
1998
1999         if (wb.wb) {
2000                 ret = bch2_journal_keys_to_write_buffer_end(c, &wb);
2001                 if (ret) {
2002                         bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s",
2003                                             bch2_err_str(ret));
2004                         return ret;
2005                 }
2006         }
2007
2008         spin_lock(&c->journal.lock);
2009         w->need_flush_to_write_buffer = false;
2010         spin_unlock(&c->journal.lock);
2011
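        /*
         * Append the entries every journal write carries: btree roots not
         * already in the jset, a datetime entry, and the common superblock
         * entries:
         */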
2012         start = end = vstruct_last(jset);
2013
2014         end     = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
2015
2016         struct jset_entry_datetime *d =
2017                 container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
2018         d->entry.type   = BCH_JSET_ENTRY_datetime;
2019         d->seconds      = cpu_to_le64(ktime_get_real_seconds());
2020
2021         bch2_journal_super_entries_add_common(c, &end, seq);
2022         u64s    = (u64 *) end - (u64 *) start;
2023
2024         WARN_ON(u64s > j->entry_u64s_reserved);
2025
2026         le32_add_cpu(&jset->u64s, u64s);
2027
2028         unsigned sectors = vstruct_sectors(jset, c->block_bits);
2029
2030         if (sectors > w->sectors) {
2031                 bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
2032                                     vstruct_bytes(jset), w->sectors << 9,
2033                                     u64s, w->u64s_reserved, j->entry_u64s_reserved);
2034                 return -EINVAL;
2035         }
2036
2037         return 0;
2038 }
2039
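/*
 * Finalize the jset for writing: fill in magic and version, validate,
 * encrypt if encryption is enabled, compute the checksum, and zero out the
 * remainder of the last block:
 */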
2040 static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w)
2041 {
2042         struct bch_fs *c = container_of(j, struct bch_fs, journal);
2043         struct jset *jset = w->data;
2044         u64 seq = le64_to_cpu(jset->seq);
2045         bool validate_before_checksum = false;
2046         int ret = 0;
2047
2048         jset->magic             = cpu_to_le64(jset_magic(c));
2049         jset->version           = cpu_to_le32(c->sb.version);
2050
2051         SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
2052         SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
2053
2054         if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
2055                 j->last_empty_seq = seq;
2056
2057         if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
2058                 validate_before_checksum = true;
2059
2060         if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
2061                 validate_before_checksum = true;
2062
2063         if (validate_before_checksum &&
2064             (ret = jset_validate(c, NULL, jset, 0, WRITE)))
2065                 return ret;
2066
2067         ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
2068                     jset->encrypted_start,
2069                     vstruct_end(jset) - (void *) jset->encrypted_start);
2070         if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
2071                 return ret;
2072
2073         jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
2074                                   journal_nonce(jset), jset);
2075
2076         if (!validate_before_checksum &&
2077             (ret = jset_validate(c, NULL, jset, 0, WRITE)))
2078                 return ret;
2079
2080         unsigned sectors = vstruct_sectors(jset, c->block_bits);
2081         unsigned bytes  = vstruct_bytes(jset);
2082         memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
2083         return 0;
2084 }
2085
2086 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
2087 {
2088         struct bch_fs *c = container_of(j, struct bch_fs, journal);
2089         int error = bch2_journal_error(j);
2090
2091         /*
2092          * If the journal is in an error state - we did an emergency shutdown -
2093          * we prefer to continue doing journal writes. We just mark them as
2094          * noflush so they'll never be used, but they'll still be visible to the
2095          * list_journal tool - this helps in debugging.
2096          *
2097          * There's a caveat: the first journal write after marking the
2098          * superblock dirty must always be a flush write, because on startup
2099          * from a clean shutdown we didn't necessarily read the journal and the
2100          * new journal write might overwrite whatever was in the journal
2101          * previously - we can't leave the journal without any flush writes in
2102          * it.
2103          *
2104          * So if we're in an error state, and we're still starting up, we don't
2105          * write anything at all.
2106          */
2107         if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
2108                 return error;
2109
2110         if (error ||
2111             w->noflush ||
2112             (!w->must_flush &&
2113              time_before(jiffies, j->last_flush_write +
2114                  msecs_to_jiffies(c->opts.journal_flush_delay)) &&
2115              test_bit(JOURNAL_may_skip_flush, &j->flags))) {
2116                 w->noflush = true;
2117                 SET_JSET_NO_FLUSH(w->data, true);
2118                 w->data->last_seq       = 0;
2119                 w->last_seq             = 0;
2120
2121                 j->nr_noflush_writes++;
2122         } else {
2123                 w->must_flush = true;
2124                 j->last_flush_write = jiffies;
2125                 j->nr_flush_writes++;
2126                 clear_bit(JOURNAL_need_flush_write, &j->flags);
2127         }
2128
2129         return 0;
2130 }
2131
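/*
 * Main journal write path: compact and finalize the entry, allocate space
 * for it on each device, mark replicas, then submit the write:
 */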
2132 CLOSURE_CALLBACK(bch2_journal_write)
2133 {
2134         closure_type(w, struct journal_buf, io);
2135         struct journal *j = container_of(w, struct journal, buf[w->idx]);
2136         struct bch_fs *c = container_of(j, struct bch_fs, journal);
2137         union bch_replicas_padded replicas;
2138         unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
2139         int ret;
2140
2141         BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
2142         BUG_ON(!w->write_started);
2143         BUG_ON(w->write_allocated);
2144         BUG_ON(w->write_done);
2145
2146         j->write_start_time = local_clock();
2147
2148         spin_lock(&j->lock);
2149         if (nr_rw_members > 1)
2150                 w->separate_flush = true;
2151
2152         ret = bch2_journal_write_pick_flush(j, w);
2153         spin_unlock(&j->lock);
2154
2155         if (unlikely(ret))
2156                 goto err;
2157
2158         mutex_lock(&j->buf_lock);
2159         journal_buf_realloc(j, w);
2160
2161         ret = bch2_journal_write_prep(j, w);
2162         mutex_unlock(&j->buf_lock);
2163
2164         if (unlikely(ret))
2165                 goto err;
2166
2167         unsigned replicas_allocated = 0;
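        /*
         * Allocation may fail while buckets are still awaiting discard:
         * discard and retry until we succeed or run out of buckets to
         * discard:
         */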
2168         while (1) {
2169                 ret = journal_write_alloc(j, w, &replicas_allocated);
2170                 if (!ret || !j->can_discard)
2171                         break;
2172
2173                 bch2_journal_do_discards(j);
2174         }
2175
2176         if (unlikely(ret))
2177                 goto err_allocate_write;
2178
2179         ret = bch2_journal_write_checksum(j, w);
2180         if (unlikely(ret))
2181                 goto err;
2182
2183         spin_lock(&j->lock);
2184         /*
2185          * write is allocated, no longer need to account for it in
2186          * bch2_journal_space_available():
2187          */
2188         w->sectors = 0;
2189         w->write_allocated = true;
2190         j->entry_bytes_written += vstruct_bytes(w->data);
2191
2192         /*
2193          * journal entry has been compacted and allocated, recalculate space
2194          * available:
2195          */
2196         bch2_journal_space_available(j);
2197         bch2_journal_do_writes(j);
2198         spin_unlock(&j->lock);
2199
2200         w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
2201
2202         /*
2203          * Mark journal replicas before we submit the write to guarantee
2204          * recovery will find the journal entries after a crash.
2205          */
2206         bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
2207                                  w->devs_written);
2208         ret = bch2_mark_replicas(c, &replicas.e);
2209         if (ret)
2210                 goto err;
2211
2212         if (c->opts.nochanges)
2213                 goto no_io;
2214
2215         if (!JSET_NO_FLUSH(w->data))
2216                 continue_at(cl, journal_write_preflush, j->wq);
2217         else
2218                 continue_at(cl, journal_write_submit, j->wq);
2219         return;
2220 err_allocate_write:
2221         if (!bch2_journal_error(j)) {
2222                 struct printbuf buf = PRINTBUF;
2223
2224                 bch2_journal_debug_to_text(&buf, j);
2225                 prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
2226                                           le64_to_cpu(w->data->seq),
2227                                           vstruct_sectors(w->data, c->block_bits),
2228                                           bch2_err_str(ret));
2229                 bch2_print_str(c, KERN_ERR, buf.buf);
2230                 printbuf_exit(&buf);
2231         }
2232 err:
2233         bch2_fatal_error(c);
2234 no_io:
2235         extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
2236                 struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
2237                 enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
2238         }
2239
2240         continue_at(cl, journal_write_done, j->wq);
2241 }