// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "trace.h"
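
/*
 * Each jset gets a nonce derived from its sequence number, so every journal
 * entry in a filesystem is encrypted with a distinct nonce:
 */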
static struct nonce journal_nonce(const struct jset *jset)
{
	return (struct nonce) {{
		[0] = 0,
		[1] = ((__le32 *) &jset->seq)[0],
		[2] = ((__le32 *) &jset->seq)[1],
		[3] = BCH_NONCE_JOURNAL,
	}};
}

static bool jset_csum_good(struct bch_fs *c, struct jset *j)
{
	return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
		!bch2_crc_cmp(j->csum,
			      csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
}
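
/*
 * Journal entries are indexed in c->journal_entries relative to
 * journal_entries_base_seq; the mask keeps the index within the positive
 * half of the 32 bit range. E.g. with base_seq 1000, seq 1003 maps to
 * index 3.
 */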
static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
	return (seq - c->journal_entries_base_seq) & (~0U >> 1);
}

static void __journal_replay_free(struct bch_fs *c,
				  struct journal_replay *i)
{
	struct journal_replay **p =
		genradix_ptr(&c->journal_entries,
			     journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));

	BUG_ON(*p != i);
	*p = NULL;
	kvpfree(i, offsetof(struct journal_replay, j) +
		vstruct_bytes(&i->j));
}

static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
{
	i->ignore = true;

	if (!c->opts.read_entire_journal)
		__journal_replay_free(c, i);
}

struct journal_list {
	struct closure		cl;
	u64			last_seq;
	struct mutex		lock;
	int			ret;
};

#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_ptr entry_ptr,
			     struct journal_list *jlist, struct jset *j)
{
	struct genradix_iter iter;
	struct journal_replay **_i, *i, *dup;
	struct journal_ptr *ptr;
	size_t bytes = vstruct_bytes(j);
	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
	int ret = JOURNAL_ENTRY_ADD_OK;

	/* Is this entry older than the range we need? */
	if (!c->opts.read_entire_journal &&
	    le64_to_cpu(j->seq) < jlist->last_seq)
		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;

	/*
	 * genradixes are indexed by a ulong, not a u64, so we can't index them
	 * by sequence number directly: assume instead that they will all fall
	 * within the range of +/- 2 billion of the first one we find.
	 */
	if (!c->journal_entries_base_seq)
		c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);

	/* Drop entries we don't need anymore */
	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
		genradix_for_each_from(&c->journal_entries, iter, _i,
				       journal_entry_radix_idx(c, jlist->last_seq)) {
			i = *_i;

			if (!i || i->ignore)
				continue;

			if (le64_to_cpu(i->j.seq) >= last_seq)
				break;
			journal_replay_free(c, i);
		}
	}

	jlist->last_seq = max(jlist->last_seq, last_seq);

	_i = genradix_ptr_alloc(&c->journal_entries,
				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
				GFP_KERNEL);
	if (!_i)
		return -BCH_ERR_ENOMEM_journal_entry_add;

	/*
	 * Duplicate journal entries? If so we want the one that didn't have a
	 * checksum error:
	 */
	dup = *_i;
	if (dup) {
		if (bytes == vstruct_bytes(&dup->j) &&
		    !memcmp(j, &dup->j, bytes)) {
			i = dup;
			goto found;
		}

		if (!entry_ptr.csum_good) {
			i = dup;
			goto found;
		}

		if (!dup->csum_good)
			goto replace;

		fsck_err(c, "found duplicate but non identical journal entries (seq %llu)",
			 le64_to_cpu(j->seq));
		i = dup;
		goto found;
	}
replace:
	i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
	if (!i)
		return -BCH_ERR_ENOMEM_journal_entry_add;

	i->nr_ptrs	= 0;
	i->csum_good	= entry_ptr.csum_good;
	i->ignore	= false;
	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
	i->ptrs[i->nr_ptrs++] = entry_ptr;

	if (dup) {
		if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
			bch_err(c, "found too many copies of journal entry %llu",
				le64_to_cpu(i->j.seq));
			dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
		}

		/* The first ptr should represent the jset we kept: */
		memcpy(i->ptrs + i->nr_ptrs,
		       dup->ptrs,
		       sizeof(dup->ptrs[0]) * dup->nr_ptrs);
		i->nr_ptrs += dup->nr_ptrs;
		__journal_replay_free(c, dup);
	}

	*_i = i;
	return 0;
found:
	for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
		if (ptr->dev == ca->dev_idx) {
			bch_err(c, "duplicate journal entry %llu on same device",
				le64_to_cpu(i->j.seq));
			goto out;
		}
	}

	if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
		bch_err(c, "found too many copies of journal entry %llu",
			le64_to_cpu(i->j.seq));
		goto out;
	}

	i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
	return ret;
}

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
	struct jset_entry *entry;

	for (entry = start; entry != end; entry = vstruct_next(entry))
		memset(entry, 0, sizeof(*entry));
}

#define JOURNAL_ENTRY_REREAD	5
#define JOURNAL_ENTRY_NONE	6
#define JOURNAL_ENTRY_BAD	7
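
/*
 * Build the common prefix for journal entry error messages: the entry type,
 * plus where the entry was found - superblock, sequence number, or offset
 * within the jset:
 */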
static void journal_entry_err_msg(struct printbuf *out,
				  struct jset *jset,
				  struct jset_entry *entry)
{
	prt_str(out, "invalid journal entry ");
	if (entry)
		prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]);

	if (!jset)
		prt_printf(out, "in superblock");
	else if (!entry)
		prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq));
	else
		prt_printf(out, "at offset %zi/%u seq %llu",
			   (u64 *) entry - jset->_data,
			   le32_to_cpu(jset->u64s),
			   le64_to_cpu(jset->seq));
	prt_str(out, ": ");
}

#define journal_entry_err(c, jset, entry, msg, ...)			\
({									\
	struct printbuf buf = PRINTBUF;					\
									\
	journal_entry_err_msg(&buf, jset, entry);			\
	prt_printf(&buf, msg, ##__VA_ARGS__);				\
									\
	switch (write) {						\
	case READ:							\
		mustfix_fsck_err(c, "%s", buf.buf);			\
		break;							\
	case WRITE:							\
		bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\
		if (bch2_fs_inconsistent(c)) {				\
			ret = -BCH_ERR_fsck_errors_not_fixed;		\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
									\
	printbuf_exit(&buf);						\
	true;								\
})

#define journal_entry_err_on(cond, c, jset, entry, msg, ...)		\
	((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false)

#define FSCK_DELETED_KEY	5
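
/*
 * Validate a single key within a journal entry: invalid keys are dropped
 * (returning FSCK_DELETED_KEY) rather than failing the whole journal read,
 * and keys written by older versions are converted via bch2_bkey_compat():
 */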
static int journal_validate_key(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned level, enum btree_id btree_id,
				struct bkey_i *k,
				unsigned version, int big_endian, int write)
{
	void *next = vstruct_next(entry);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on((void *) bkey_next(k) >
				 (void *) vstruct_next(entry),
				 c, jset, entry,
				 "extends past end of journal entry")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
				 c, jset, entry,
				 "bad format %u", k->k.format)) {
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (!write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));

	if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
			      __btree_node_type(level, btree_id), write, &buf)) {
		printbuf_reset(&buf);
		prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:",
			   bch2_jset_entry_types[entry->type],
			   (u64 *) entry - jset->_data,
			   le32_to_cpu(jset->u64s),
			   le64_to_cpu(jset->seq));
		prt_newline(&buf);
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
		prt_newline(&buf);
		bch2_bkey_invalid(c, bkey_i_to_s_c(k),
				  __btree_node_type(level, btree_id), write, &buf);

		mustfix_fsck_err(c, "%s", buf.buf);

		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);

		printbuf_exit(&buf);
		return FSCK_DELETED_KEY;
	}

	if (write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int journal_entry_btree_keys_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian, int write)
{
	struct bkey_i *k = entry->start;

	while (k != vstruct_last(entry)) {
		int ret = journal_validate_key(c, jset, entry,
					       entry->level,
					       entry->btree_id,
					       k, version, big_endian,
					       write|BKEY_INVALID_JOURNAL);
		if (ret == FSCK_DELETED_KEY)
			continue;

		k = bkey_next(k);
	}

	return 0;
}

static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct bkey_i *k;
	bool first = true;

	jset_entry_for_each_key(entry, k) {
		if (!first) {
			prt_newline(out);
			prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
		}
		prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
		first = false;
	}
}

static int journal_entry_btree_root_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian, int write)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s,
				 c, jset, entry,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
				    version, big_endian, write);
fsck_err:
	return ret;
}

static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian, int write)
{
	/* obsolete, don't care: */
	return 0;
}

static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
}

static int journal_entry_blacklist_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian, int write)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
				 c, jset, entry,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}

static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_blacklist *bl =
		container_of(entry, struct jset_entry_blacklist, entry);

	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}

static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
					       struct jset *jset,
					       struct jset_entry *entry,
					       unsigned version, int big_endian, int write)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
				 c, jset, entry,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end),
				 c, jset, entry,
				 "invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}

static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
					       struct jset_entry *entry)
{
	struct jset_entry_blacklist_v2 *bl =
		container_of(entry, struct jset_entry_blacklist_v2, entry);

	prt_printf(out, "start=%llu end=%llu",
		   le64_to_cpu(bl->start),
		   le64_to_cpu(bl->end));
}

static int journal_entry_usage_validate(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					unsigned version, int big_endian, int write)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c, jset, entry,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);

	prt_printf(out, "type=%s v=%llu",
		   bch2_fs_usage_types[u->entry.btree_id],
		   le64_to_cpu(u->v));
}

static int journal_entry_data_usage_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian, int write)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c, jset, entry,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);

	bch2_replicas_entry_to_text(out, &u->r);
	prt_printf(out, "=%llu", le64_to_cpu(u->v));
}

static int journal_entry_clock_validate(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					unsigned version, int big_endian, int write)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes != sizeof(*clock),
				 c, jset, entry, "bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(clock->rw > 1,
				 c, jset, entry, "bad rw")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);

	prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
}

static int journal_entry_dev_usage_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian, int write)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	unsigned expected = sizeof(*u);
	unsigned dev;
	int ret = 0;

	if (journal_entry_err_on(bytes < expected,
				 c, jset, entry, "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	dev = le32_to_cpu(u->dev);

	if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
				 c, jset, entry, "bad dev")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(u->pad,
				 c, jset, entry, "bad pad")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

	prt_printf(out, "dev=%u", le32_to_cpu(u->dev));

	for (i = 0; i < nr_types; i++) {
		if (i < BCH_DATA_NR)
			prt_printf(out, " %s", bch2_data_types[i]);
		else
			prt_printf(out, " (unknown data type %u)", i);
		prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
			   le64_to_cpu(u->d[i].buckets),
			   le64_to_cpu(u->d[i].sectors),
			   le64_to_cpu(u->d[i].fragmented));
	}

	prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
}

static int journal_entry_log_validate(struct bch_fs *c,
				      struct jset *jset,
				      struct jset_entry *entry,
				      unsigned version, int big_endian, int write)
{
	return 0;
}

static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
				      struct jset_entry *entry)
{
	struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
	unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);

	prt_printf(out, "%.*s", bytes, l->d);
}

static int journal_entry_overwrite_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian, int write)
{
	return journal_entry_btree_keys_validate(c, jset, entry,
						 version, big_endian, READ);
}

static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

struct jset_entry_ops {
	int (*validate)(struct bch_fs *, struct jset *,
			struct jset_entry *, unsigned, int, int);
	void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};

static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)						\
	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
		.validate	= journal_entry_##f##_validate,	\
		.to_text	= journal_entry_##f##_to_text,	\
	},
	BCH_JSET_ENTRY_TYPES()
#undef x
};
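
/* Dispatch through bch2_jset_entry_ops, above: */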
int bch2_journal_entry_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian, int write)
{
	return entry->type < BCH_JSET_ENTRY_NR
		? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
							    version, big_endian, write)
		: 0;
}

void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
				struct jset_entry *entry)
{
	if (entry->type < BCH_JSET_ENTRY_NR) {
		prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
		bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
	} else {
		prt_printf(out, "(unknown type %u)", entry->type);
	}
}

static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 int write)
{
	struct jset_entry *entry;
	int ret = 0;

	vstruct_for_each(jset, entry) {
		if (journal_entry_err_on(vstruct_next(entry) >
					 vstruct_last(jset), c, jset, entry,
					 "journal entry extends past end of jset")) {
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = bch2_journal_entry_validate(c, jset, entry,
						  le32_to_cpu(jset->version),
						  JSET_BIG_ENDIAN(jset), write);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}

static int jset_validate(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 int write)
{
	unsigned version;
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL,
				 "%s sector %llu seq %llu: incompatible journal entry version %u",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq), version)) {
		/* don't try to continue: */
		return -EINVAL;
	}

	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
				 c, jset, NULL,
				 "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 JSET_CSUM_TYPE(jset)))
		ret = JOURNAL_ENTRY_BAD;

	/* last_seq is ignored when JSET_NO_FLUSH is true */
	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
				 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
				 c, jset, NULL,
				 "invalid journal entry: last_seq > seq (%llu > %llu)",
				 le64_to_cpu(jset->last_seq),
				 le64_to_cpu(jset->seq))) {
		jset->last_seq = jset->seq;
		return JOURNAL_ENTRY_BAD;
	}

	ret = jset_validate_entries(c, jset, write);
fsck_err:
	return ret;
}
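
/*
 * Early validation, done while still reading from disk: only checks what's
 * needed to size and trust the read - magic, version, and whether the entry
 * fits in the bucket; full validation happens later in jset_validate():
 */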
static int jset_validate_early(struct bch_fs *c,
			       struct bch_dev *ca,
			       struct jset *jset, u64 sector,
			       unsigned bucket_sectors_left,
			       unsigned sectors_read)
{
	size_t bytes = vstruct_bytes(jset);
	unsigned version;
	int write = READ;
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL,
				 "%s sector %llu seq %llu: unknown journal entry version %u",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq), version)) {
		/* don't try to continue: */
		return -EINVAL;
	}

	if (bytes > (sectors_read << 9) &&
	    sectors_read < bucket_sectors_left)
		return JOURNAL_ENTRY_REREAD;

	if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
				 c, jset, NULL,
				 "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq), bytes))
		le32_add_cpu(&jset->u64s,
			     -((bytes - (bucket_sectors_left << 9)) / 8));
fsck_err:
	return ret;
}

struct journal_read_buf {
	void		*data;
	size_t		size;
};

static int journal_read_buf_realloc(struct journal_read_buf *b,
				    size_t new_size)
{
	void *n;

	/* the bios are sized for this many pages, max: */
	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;

	new_size = roundup_pow_of_two(new_size);
	n = kvpmalloc(new_size, GFP_KERNEL);
	if (!n)
		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;

	kvpfree(b->data, b->size);
	b->data = n;
	b->size = new_size;
	return 0;
}
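
/*
 * Read all the journal entries in a single bucket, re-reading with a larger
 * buffer when an entry doesn't fit, and add each one found to the
 * journal_list:
 */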
static int journal_read_bucket(struct bch_dev *ca,
			       struct journal_read_buf *buf,
			       struct journal_list *jlist,
			       unsigned bucket)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	struct jset *j = NULL;
	unsigned sectors, sectors_read = 0;
	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
	    end = offset + ca->mi.bucket_size;
	bool saw_bad = false, csum_good;
	int ret = 0;

	pr_debug("reading %u", bucket);

	while (offset < end) {
		if (!sectors_read) {
			struct bio *bio;
			unsigned nr_bvecs;
reread:
			sectors_read = min_t(unsigned,
					     end - offset, buf->size >> 9);
			nr_bvecs = buf_pages(buf->data, sectors_read << 9);

			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

			bio->bi_iter.bi_sector = offset;
			bch2_bio_map(bio, buf->data, sectors_read << 9);

			ret = submit_bio_wait(bio);
			kfree(bio);

			if (bch2_dev_io_err_on(ret, ca,
					       "journal read error: sector %llu",
					       offset) ||
			    bch2_meta_read_fault("journal")) {
				/*
				 * We don't error out of the recovery process
				 * here, since the relevant journal entry may be
				 * found on a different device, and missing or
				 * no journal entries will be handled later
				 */
				return 0;
			}

			j = buf->data;
		}

		ret = jset_validate_early(c, ca, j, offset,
					  end - offset, sectors_read);
		switch (ret) {
		case 0:
			sectors = vstruct_sectors(j, c->block_bits);
			break;
		case JOURNAL_ENTRY_REREAD:
			if (vstruct_bytes(j) > buf->size) {
				ret = journal_read_buf_realloc(buf,
							       vstruct_bytes(j));
				if (ret)
					return ret;
			}
			goto reread;
		case JOURNAL_ENTRY_NONE:
			if (!saw_bad)
				return 0;
			/*
			 * On checksum error we don't really trust the size
			 * field of the journal entry we read, so try reading
			 * again at next block boundary:
			 */
			sectors = block_sectors(c);
			goto next_block;
		default:
			return ret;
		}

		/*
		 * This happens sometimes if we don't have discards on -
		 * when we've partially overwritten a bucket with new
		 * journal entries. We don't need the rest of the
		 * bucket:
		 */
		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
			return 0;

		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

		csum_good = jset_csum_good(c, j);
		if (!csum_good)
			saw_bad = true;

		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
				   j->encrypted_start,
				   vstruct_end(j) - (void *) j->encrypted_start);
		bch2_fs_fatal_err_on(ret, c,
				     "error decrypting journal entry: %i", ret);

		mutex_lock(&jlist->lock);
		ret = journal_entry_add(c, ca, (struct journal_ptr) {
					.csum_good	= csum_good,
					.dev		= ca->dev_idx,
					.bucket		= bucket,
					.bucket_offset	= offset -
						bucket_to_sector(ca, ja->buckets[bucket]),
					.sector		= offset,
					}, jlist, j);
		mutex_unlock(&jlist->lock);

		switch (ret) {
		case JOURNAL_ENTRY_ADD_OK:
			break;
		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
			break;
		default:
			return ret;
		}
next_block:
		pr_debug("next");
		offset		+= sectors;
		sectors_read	-= sectors;
		j = ((void *) j) + (sectors << 9);
	}

	return 0;
}
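
/*
 * Closure that reads every journal bucket on one device, then works out
 * where that device's journal write position (cur_idx, sectors_free) should
 * resume:
 */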
static void bch2_journal_read_device(struct closure *cl)
{
	struct journal_device *ja =
		container_of(cl, struct journal_device, read);
	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
	struct bch_fs *c = ca->fs;
	struct journal_list *jlist =
		container_of(cl->parent, struct journal_list, cl);
	struct journal_replay *r, **_r;
	struct genradix_iter iter;
	struct journal_read_buf buf = { NULL, 0 };
	unsigned i;
	int ret = 0;

	if (!ja->nr)
		goto out;

	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
	if (ret)
		goto err;

	pr_debug("%u journal buckets", ja->nr);

	for (i = 0; i < ja->nr; i++) {
		ret = journal_read_bucket(ca, &buf, jlist, i);
		if (ret)
			goto err;
	}

	ja->sectors_free = ca->mi.bucket_size;

	mutex_lock(&jlist->lock);
	genradix_for_each_reverse(&c->journal_entries, iter, _r) {
		r = *_r;

		if (!r)
			continue;

		for (i = 0; i < r->nr_ptrs; i++) {
			if (r->ptrs[i].dev == ca->dev_idx) {
				unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
					vstruct_sectors(&r->j, c->block_bits);

				ja->cur_idx = r->ptrs[i].bucket;
				ja->sectors_free = ca->mi.bucket_size - wrote;
				goto found;
			}
		}
	}
found:
	mutex_unlock(&jlist->lock);

	if (ja->bucket_seq[ja->cur_idx] &&
	    ja->sectors_free == ca->mi.bucket_size) {
		bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
		bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
		for (i = 0; i < 3; i++) {
			unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
			bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
		}
		ja->sectors_free = 0;
	}

	/*
	 * Set dirty_idx to indicate the entire journal is full and needs to be
	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
	 * pinned when it first runs:
	 */
	ja->discard_idx = ja->dirty_idx_ondisk =
		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
	kvpfree(buf.data, buf.size);
	percpu_ref_put(&ca->io_ref);
	closure_return(cl);
	return;
err:
	mutex_lock(&jlist->lock);
	jlist->ret = ret;
	mutex_unlock(&jlist->lock);
	goto out;
}

void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
			       struct journal_replay *j)
{
	unsigned i;

	for (i = 0; i < j->nr_ptrs; i++) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
		u64 offset;

		div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);

		if (i)
			prt_printf(out, " ");
		prt_printf(out, "%u:%u:%u (sector %llu)",
			   j->ptrs[i].dev,
			   j->ptrs[i].bucket,
			   j->ptrs[i].bucket_offset,
			   j->ptrs[i].sector);
	}
}
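
/*
 * Top level journal read path: read entries from every device, find the
 * most recent flush, drop blacklisted and too-old entries, flag gaps in the
 * sequence, then validate what's left and mark journal replicas:
 */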
int bch2_journal_read(struct bch_fs *c,
		      u64 *last_seq,
		      u64 *blacklist_seq,
		      u64 *start_seq)
{
	struct journal_list jlist;
	struct journal_replay *i, **_i, *prev = NULL;
	struct genradix_iter radix_iter;
	struct bch_dev *ca;
	unsigned iter;
	struct printbuf buf = PRINTBUF;
	bool degraded = false, last_write_torn = false;
	u64 seq;
	int ret = 0;

	closure_init_stack(&jlist.cl);
	mutex_init(&jlist.lock);
	jlist.last_seq = 0;
	jlist.ret = 0;

	for_each_member_device(ca, c, iter) {
		if (!c->opts.fsck &&
		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
			continue;

		if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
		     ca->mi.state == BCH_MEMBER_STATE_ro) &&
		    percpu_ref_tryget(&ca->io_ref))
			closure_call(&ca->journal.read,
				     bch2_journal_read_device,
				     system_unbound_wq,
				     &jlist.cl);
		else
			degraded = true;
	}

	closure_sync(&jlist.cl);

	if (jlist.ret)
		return jlist.ret;

	*last_seq	= 0;
	*start_seq	= 0;
	*blacklist_seq	= 0;

	/*
	 * Find most recent flush entry, and ignore newer non flush entries -
	 * those entries will be blacklisted:
	 */
	genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
		int write = READ;

		i = *_i;

		if (!i || i->ignore)
			continue;

		if (!*start_seq)
			*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;

		if (JSET_NO_FLUSH(&i->j)) {
			i->ignore = true;
			continue;
		}

		if (!last_write_torn && !i->csum_good) {
			last_write_torn = true;
			i->ignore = true;
			continue;
		}

		if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
					 c, &i->j, NULL,
					 "invalid journal entry: last_seq > seq (%llu > %llu)",
					 le64_to_cpu(i->j.last_seq),
					 le64_to_cpu(i->j.seq)))
			i->j.last_seq = i->j.seq;

		*last_seq	= le64_to_cpu(i->j.last_seq);
		*blacklist_seq	= le64_to_cpu(i->j.seq) + 1;
		break;
	}

	if (!*start_seq) {
		bch_info(c, "journal read done, but no entries found");
		return 0;
	}

	if (!*last_seq) {
		fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
		return 0;
	}

	bch_info(c, "journal read done, replaying entries %llu-%llu",
		 *last_seq, *blacklist_seq - 1);

	if (*start_seq != *blacklist_seq)
		bch_info(c, "dropped unflushed entries %llu-%llu",
			 *blacklist_seq, *start_seq - 1);

	/* Drop blacklisted entries and entries older than last_seq: */
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		seq = le64_to_cpu(i->j.seq);
		if (seq < *last_seq) {
			journal_replay_free(c, i);
			continue;
		}

		if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
				    "found blacklisted journal entry %llu", seq);
			i->ignore = true;
		}
	}

	/* Check for missing entries: */
	seq = *last_seq;
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		BUG_ON(seq > le64_to_cpu(i->j.seq));

		while (seq < le64_to_cpu(i->j.seq)) {
			u64 missing_start, missing_end;
			struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

			while (seq < le64_to_cpu(i->j.seq) &&
			       bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (seq == le64_to_cpu(i->j.seq))
				break;

			missing_start = seq;

			while (seq < le64_to_cpu(i->j.seq) &&
			       !bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (prev) {
				bch2_journal_ptrs_to_text(&buf1, c, prev);
				prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
			} else
				prt_printf(&buf1, "(none)");
			bch2_journal_ptrs_to_text(&buf2, c, i);

			missing_end = seq - 1;
			fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
				 "  prev at %s\n"
				 "  next at %s",
				 missing_start, missing_end,
				 *last_seq, *blacklist_seq - 1,
				 buf1.buf, buf2.buf);

			printbuf_exit(&buf1);
			printbuf_exit(&buf2);
		}

		prev = i;
		seq++;
	}

	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		struct bch_replicas_padded replicas = {
			.e.data_type = BCH_DATA_journal,
			.e.nr_required = 1,
		};
		unsigned ptr;

		i = *_i;
		if (!i || i->ignore)
			continue;

		for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);

			if (!i->ptrs[ptr].csum_good)
				bch_err_dev_offset(ca, i->ptrs[ptr].sector,
						   "invalid journal checksum, seq %llu%s",
						   le64_to_cpu(i->j.seq),
						   i->csum_good ? " (had good copy on another device)" : "");
		}

		ret = jset_validate(c,
				    bch_dev_bkey_exists(c, i->ptrs[0].dev),
				    &i->j,
				    i->ptrs[0].sector,
				    READ);
		if (ret)
			goto err;

		for (ptr = 0; ptr < i->nr_ptrs; ptr++)
			replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;

		bch2_replicas_entry_sort(&replicas.e);

		/*
		 * If we're mounting in degraded mode - if we didn't read all
		 * the devices - this is wrong:
		 */

		printbuf_reset(&buf);
		bch2_replicas_entry_to_text(&buf, &replicas.e);

		if (!degraded &&
		    fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
				"superblock not marked as containing replicas %s",
				buf.buf)) {
			ret = bch2_mark_replicas(c, &replicas.e);
			if (ret)
				goto err;
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* journal write: */

static void __journal_write_alloc(struct journal *j,
				  struct journal_buf *w,
				  struct dev_alloc_list *devs_sorted,
				  unsigned sectors,
				  unsigned *replicas,
				  unsigned replicas_want)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_device *ja;
	struct bch_dev *ca;
	unsigned i;

	if (*replicas >= replicas_want)
		return;

	for (i = 0; i < devs_sorted->nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		/*
		 * Check that we can use this device, and aren't already using
		 * it:
		 */
		if (!ca->mi.durability ||
		    ca->mi.state != BCH_MEMBER_STATE_rw ||
		    !ja->nr ||
		    bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
		    sectors > ja->sectors_free)
			continue;

		bch2_dev_stripe_increment(ca, &j->wp.stripe);

		bch2_bkey_append_ptr(&w->key,
			(struct bch_extent_ptr) {
				  .offset = bucket_to_sector(ca,
					ja->buckets[ja->cur_idx]) +
					ca->mi.bucket_size -
					ja->sectors_free,
				  .dev = ca->dev_idx,
		});

		ja->sectors_free -= sectors;
		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

		*replicas += ca->mi.durability;

		if (*replicas >= replicas_want)
			break;
	}
}

/**
 * journal_write_alloc - allocate space for the next journal write, moving
 * on to the next journal bucket if possible
 */
static int journal_write_alloc(struct journal *j, struct journal_buf *w,
			       unsigned sectors)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_devs_mask devs;
	struct journal_device *ja;
	struct bch_dev *ca;
	struct dev_alloc_list devs_sorted;
	unsigned target = c->opts.metadata_target ?:
		c->opts.foreground_target;
	unsigned i, replicas = 0, replicas_want =
		READ_ONCE(c->opts.metadata_replicas);

	rcu_read_lock();
retry:
	devs = target_rw_devs(c, BCH_DATA_journal, target);

	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);

	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas >= replicas_want)
		goto done;

	for (i = 0; i < devs_sorted.nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		if (sectors > ja->sectors_free &&
		    sectors <= ca->mi.bucket_size &&
		    bch2_journal_dev_buckets_available(j, ja,
					journal_space_discarded)) {
			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
			ja->sectors_free = ca->mi.bucket_size;

			/*
			 * ja->bucket_seq[ja->cur_idx] must always have
			 * something sensible:
			 */
			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
		}
	}

	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas < replicas_want && target) {
		/* Retry from all devices: */
		target = 0;
		goto retry;
	}
done:
	rcu_read_unlock();

	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);

	return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}

static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
	/* we aren't holding j->lock: */
	unsigned new_size = READ_ONCE(j->buf_size_want);
	void *new_buf;

	if (buf->buf_size >= new_size)
		return;

	new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
	if (!new_buf)
		return;

	memcpy(new_buf, buf->data, buf->buf_size);

	spin_lock(&j->lock);
	swap(buf->data,		new_buf);
	swap(buf->buf_size,	new_size);
	spin_unlock(&j->lock);

	kvpfree(new_buf, new_size);
}

static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
	return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}
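
/*
 * Called when all replicas of a journal write have completed (or failed):
 * update the journal's idea of what's on disk, advance unwritten_idx, and
 * kick off the next write if one has already been closed:
 */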
static void journal_write_done(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_buf *w = journal_last_unwritten_buf(j);
	union journal_res_state old, new;
	u64 v, seq;
	int err = 0;

	bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
			       ? j->flush_write_time
			       : j->noflush_write_time, j->write_start_time);

	if (!w->devs_written.nr) {
		bch_err(c, "unable to write journal to sufficient devices");
		err = -EIO;
	}
	if (err)
		bch2_fatal_error(c);

	spin_lock(&j->lock);
	seq = le64_to_cpu(w->data->seq);

	if (seq >= j->pin.front)
		journal_seq_pin(j, seq)->devs = w->devs_written;

	if (!err) {
		if (!JSET_NO_FLUSH(w->data)) {
			j->flushed_seq_ondisk = seq;
			j->last_seq_ondisk = w->last_seq;

			bch2_do_discards(c);
			closure_wake_up(&c->freelist_wait);

			bch2_reset_alloc_cursors(c);
		}
	} else if (!j->err_seq || seq < j->err_seq)
		j->err_seq	= seq;

	j->seq_ondisk		= seq;

	/*
	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
	 * more buckets:
	 *
	 * Must come before signaling write completion, for
	 * bch2_fs_journal_stop():
	 */
	if (j->watermark != BCH_WATERMARK_stripe)
		journal_reclaim_kick(&c->journal);

	/* also must come before signalling write completion: */
	closure_debug_destroy(cl);

	v = atomic64_read(&j->reservations.counter);
	do {
		old.v = new.v = v;
		BUG_ON(journal_state_count(new, new.unwritten_idx));

		new.unwritten_idx++;
	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
				       old.v, new.v)) != old.v);

	bch2_journal_space_available(j);

	closure_wake_up(&w->wait);
	journal_wake(j);

	if (!journal_state_count(new, new.unwritten_idx) &&
	    journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
		closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
	} else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
		   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
		struct journal_buf *buf = journal_cur_buf(j);
		long delta = buf->expires - jiffies;

		/*
		 * We don't close a journal entry to write it while there's
		 * previous entries still in flight - the current journal entry
		 * might want to be written now:
		 */

		mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
	}

	spin_unlock(&j->lock);
}

static void journal_write_endio(struct bio *bio)
{
	struct bch_dev *ca = bio->bi_private;
	struct journal *j = &ca->fs->journal;
	struct journal_buf *w = journal_last_unwritten_buf(j);
	unsigned long flags;

	if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
			       le64_to_cpu(w->data->seq),
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("journal")) {
		spin_lock_irqsave(&j->err_lock, flags);
		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
		spin_unlock_irqrestore(&j->err_lock, flags);
	}

	closure_put(&j->io);
	percpu_ref_put(&ca->io_ref);
}
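
/*
 * Submit the actual journal data write to each device that
 * journal_write_alloc() picked:
 */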
static void do_journal_write(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	struct journal_buf *w = journal_last_unwritten_buf(j);
	struct bch_extent_ptr *ptr;
	struct bio *bio;
	unsigned sectors = vstruct_sectors(w->data, c->block_bits);

	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
		ca = bch_dev_bkey_exists(c, ptr->dev);
		if (!percpu_ref_tryget(&ca->io_ref)) {
			/* XXX: fix this */
			bch_err(c, "missing device for journal write\n");
			continue;
		}

		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
			     sectors);

		bio = ca->journal.bio;
		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
		bio->bi_iter.bi_sector	= ptr->offset;
		bio->bi_end_io		= journal_write_endio;
		bio->bi_private		= ca;

		BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
		ca->prev_journal_sector = bio->bi_iter.bi_sector;

		if (!JSET_NO_FLUSH(w->data))
			bio->bi_opf |= REQ_FUA;
		if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
			bio->bi_opf |= REQ_PREFLUSH;

		bch2_bio_map(bio, w->data, sectors << 9);

		trace_and_count(c, journal_write, bio);
		closure_bio_submit(bio, cl);

		ca->journal.bucket_seq[ca->journal.cur_idx] =
			le64_to_cpu(w->data->seq);
	}

	continue_at(cl, journal_write_done, c->io_complete_wq);
	return;
}

static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset)
{
	struct jset_entry *i, *next, *prev = NULL;

	/*
	 * Simple compaction, dropping empty jset_entries (from journal
	 * reservations that weren't fully used) and merging jset_entries that
	 * can be.
	 *
	 * If we wanted to be really fancy here, we could sort all the keys in
	 * the jset and drop keys that were overwritten - probably not worth it:
	 */
	vstruct_for_each_safe(jset, i, next) {
		unsigned u64s = le16_to_cpu(i->u64s);

		/* Empty entry: */
		if (!u64s)
			continue;

		if (i->type == BCH_JSET_ENTRY_btree_root)
			bch2_journal_entry_to_btree_root(c, i);

		/* Can we merge with previous entry? */
		if (prev &&
		    i->btree_id	== prev->btree_id &&
		    i->level	== prev->level &&
		    i->type	== prev->type &&
		    i->type	== BCH_JSET_ENTRY_btree_keys &&
		    le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
			memmove_u64s_down(vstruct_next(prev),
					  i->_data,
					  u64s);
			le16_add_cpu(&prev->u64s, u64s);
			continue;
		}

		/* Couldn't merge, move i into new position (after prev): */
		prev = prev ? vstruct_next(prev) : jset->start;
		if (i != prev)
			memmove_u64s_down(prev, i, jset_u64s(u64s));
	}

	prev = prev ? vstruct_next(prev) : jset->start;
	jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
}

void bch2_journal_write(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	struct journal_buf *w = journal_last_unwritten_buf(j);
	struct bch_replicas_padded replicas;
	struct jset_entry *start, *end;
	struct jset *jset;
	struct bio *bio;
	struct printbuf journal_debug_buf = PRINTBUF;
	bool validate_before_checksum = false;
	unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
	int ret;

	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));

	journal_buf_realloc(j, w);
	jset = w->data;

	j->write_start_time = local_clock();

	spin_lock(&j->lock);

	/*
	 * If the journal is in an error state - we did an emergency shutdown -
	 * we prefer to continue doing journal writes. We just mark them as
	 * noflush so they'll never be used, but they'll still be visible by the
	 * list_journal tool - this helps in debugging.
	 *
	 * There's a caveat: the first journal write after marking the
	 * superblock dirty must always be a flush write, because on startup
	 * from a clean shutdown we didn't necessarily read the journal and the
	 * new journal write might overwrite whatever was in the journal
	 * previously - we can't leave the journal without any flush writes in
	 * it.
	 *
	 * So if we're in an error state, and we're still starting up, we don't
	 * write anything at all.
	 */
	if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
	    (bch2_journal_error(j) ||
	     w->noflush ||
	     (!w->must_flush &&
	      (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
	      test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
		w->noflush = true;
		SET_JSET_NO_FLUSH(jset, true);
		jset->last_seq	= 0;
		w->last_seq	= 0;

		j->nr_noflush_writes++;
	} else if (!bch2_journal_error(j)) {
		j->last_flush_write = jiffies;
		j->nr_flush_writes++;
		clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
	} else {
		spin_unlock(&j->lock);
		goto err;
	}
	spin_unlock(&j->lock);

	/*
	 * New btree roots are set by journalling them; when the journal entry
	 * gets written we have to propagate them to c->btree_roots
	 *
	 * But, every journal entry we write has to contain all the btree roots
	 * (at least for now); so after we copy btree roots to c->btree_roots we
	 * have to get any missing btree roots and add them to this journal
	 * entry:
	 */

	bch2_journal_entries_postprocess(c, jset);

	start = end = vstruct_last(jset);

	end	= bch2_btree_roots_to_journal_entries(c, jset->start, end);

	bch2_journal_super_entries_add_common(c, &end,
					      le64_to_cpu(jset->seq));
	u64s	= (u64 *) end - (u64 *) start;
	BUG_ON(u64s > j->entry_u64s_reserved);

	le32_add_cpu(&jset->u64s, u64s);

	sectors = vstruct_sectors(jset, c->block_bits);
	bytes	= vstruct_bytes(jset);

	if (sectors > w->sectors) {
		bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
				    vstruct_bytes(jset), w->sectors << 9,
				    u64s, w->u64s_reserved, j->entry_u64s_reserved);
		goto err;
	}

	jset->magic		= cpu_to_le64(jset_magic(c));
	jset->version		= cpu_to_le32(c->sb.version);

	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));

	if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
		j->last_empty_seq = le64_to_cpu(jset->seq);

	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
		validate_before_checksum = true;

	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

	if (validate_before_checksum &&
	    jset_validate(c, NULL, jset, 0, WRITE))
		goto err;

	ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
			   jset->encrypted_start,
			   vstruct_end(jset) - (void *) jset->encrypted_start);
	if (bch2_fs_fatal_err_on(ret, c,
				 "error encrypting journal entry: %i", ret))
		goto err;

	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
				  journal_nonce(jset), jset);

	if (!validate_before_checksum &&
	    jset_validate(c, NULL, jset, 0, WRITE))
		goto err;

	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);

retry_alloc:
	spin_lock(&j->lock);
	ret = journal_write_alloc(j, w, sectors);

	if (ret && j->can_discard) {
		spin_unlock(&j->lock);
		bch2_journal_do_discards(j);
		goto retry_alloc;
	}

	if (ret)
		__bch2_journal_debug_to_text(&journal_debug_buf, j);

	/*
	 * write is allocated, no longer need to account for it in
	 * bch2_journal_space_available():
	 */
	w->sectors = 0;

	/*
	 * journal entry has been compacted and allocated, recalculate space
	 * available:
	 */
	bch2_journal_space_available(j);
	spin_unlock(&j->lock);

	if (ret) {
		bch_err(c, "Unable to allocate journal write:\n%s",
			journal_debug_buf.buf);
		printbuf_exit(&journal_debug_buf);
		goto err;
	}

	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));

	if (c->opts.nochanges)
		goto no_io;

	for_each_rw_member(ca, c, i)
		nr_rw_members++;

	if (nr_rw_members > 1)
		w->separate_flush = true;

	/*
	 * Mark journal replicas before we submit the write to guarantee
	 * recovery will find the journal entries after a crash.
	 */
	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
				 w->devs_written);
	ret = bch2_mark_replicas(c, &replicas.e);
	if (ret)
		goto err;

	if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
		for_each_rw_member(ca, c, i) {
			percpu_ref_get(&ca->io_ref);

			bio = ca->journal.bio;
			bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
			bio->bi_end_io		= journal_write_endio;
			bio->bi_private		= ca;
			closure_bio_submit(bio, cl);
		}
	}

	continue_at(cl, do_journal_write, c->io_complete_wq);
	return;
no_io:
	continue_at(cl, journal_write_done, c->io_complete_wq);
	return;
err:
	bch2_fatal_error(c);
	continue_at(cl, journal_write_done, c->io_complete_wq);
}