bcachefs: Improve journal_entry_add()
[linux-block.git] / fs / bcachefs / journal_io.c
CommitLineData
1c6fdbd8
KO
1// SPDX-License-Identifier: GPL-2.0
2#include "bcachefs.h"
59cc38b8 3#include "alloc_background.h"
7b3f84ea 4#include "alloc_foreground.h"
39fb2983 5#include "btree_io.h"
00b8ccf7 6#include "btree_update_interior.h"
1c6fdbd8
KO
7#include "buckets.h"
8#include "checksum.h"
d042b040 9#include "disk_groups.h"
1c6fdbd8 10#include "error.h"
63b214e7 11#include "io.h"
1c6fdbd8
KO
12#include "journal.h"
13#include "journal_io.h"
14#include "journal_reclaim.h"
adbcada4 15#include "journal_seq_blacklist.h"
1c6fdbd8
KO
16#include "replicas.h"
17#include "trace.h"
18
17fe3b64
KO
/*
 * Nonce used for journal checksumming/encryption: derived from the jset's
 * sequence number, so every journal entry gets a distinct nonce.
 */
static struct nonce journal_nonce(const struct jset *jset)
{
	return (struct nonce) {{
		[0] = 0,
		[1] = ((__le32 *) &jset->seq)[0],
		[2] = ((__le32 *) &jset->seq)[1],
		[3] = BCH_NONCE_JOURNAL,
	}};
}
28
29static bool jset_csum_good(struct bch_fs *c, struct jset *j)
30{
31 return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
32 !bch2_crc_cmp(j->csum,
33 csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
34}
35
/*
 * Map a journal sequence number to a genradix index: genradixes are indexed
 * by ulong, so we offset from the first seq seen and mask down to 31 bits.
 */
static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
	return (seq - c->journal_entries_base_seq) & (~0U >> 1);
}
40
/*
 * Unconditionally remove a journal replay entry from the radix tree and free
 * it. @i must be the entry currently stored at its sequence number's slot.
 */
static void __journal_replay_free(struct bch_fs *c,
				  struct journal_replay *i)
{
	struct journal_replay **p =
		genradix_ptr(&c->journal_entries,
			     journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));

	BUG_ON(*p != i);
	*p = NULL;
	/* allocation size = header up to the embedded jset + the jset itself */
	kvpfree(i, offsetof(struct journal_replay, j) +
		vstruct_bytes(&i->j));
}
53
/*
 * Mark a journal replay entry as not needed; actually free it unless the
 * read_entire_journal option asks us to keep every entry around.
 */
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
{
	i->ignore = true;

	if (!c->opts.read_entire_journal)
		__journal_replay_free(c, i);
}
61
1c6fdbd8
KO
/*
 * Shared state for the per-device journal read closures: @lock protects
 * concurrent journal_entry_add() calls, @last_seq tracks the oldest sequence
 * number still needed, @ret collects the first error.
 */
struct journal_list {
	struct closure		cl;
	u64			last_seq;
	struct mutex		lock;
	int			ret;
};
68
69#define JOURNAL_ENTRY_ADD_OK 0
70#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
71
/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 *
 * Handles duplicates (the same seq read from multiple devices/buckets): if the
 * copies are identical we just record the extra pointer; otherwise we prefer
 * the copy whose checksum was good, replacing a bad-checksum copy in place.
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_ptr entry_ptr,
			     struct journal_list *jlist, struct jset *j)
{
	struct genradix_iter iter;
	struct journal_replay **_i, *i, *dup;
	struct journal_ptr *ptr;
	size_t bytes = vstruct_bytes(j);
	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
	int ret = JOURNAL_ENTRY_ADD_OK;

	/* Is this entry older than the range we need? */
	if (!c->opts.read_entire_journal &&
	    le64_to_cpu(j->seq) < jlist->last_seq)
		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;

	/*
	 * genradixes are indexed by a ulong, not a u64, so we can't index them
	 * by sequence number directly: Assume instead that they will all fall
	 * within the range of +-2billion of the first one we find.
	 */
	if (!c->journal_entries_base_seq)
		c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);

	/* Drop entries we don't need anymore */
	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
		genradix_for_each_from(&c->journal_entries, iter, _i,
				       journal_entry_radix_idx(c, jlist->last_seq)) {
			i = *_i;

			if (!i || i->ignore)
				continue;

			if (le64_to_cpu(i->j.seq) >= last_seq)
				break;
			journal_replay_free(c, i);
		}
	}

	jlist->last_seq = max(jlist->last_seq, last_seq);

	_i = genradix_ptr_alloc(&c->journal_entries,
				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
				GFP_KERNEL);
	if (!_i)
		return -ENOMEM;

	/*
	 * Duplicate journal entries? If so we want the one that didn't have a
	 * checksum error:
	 */
	dup = *_i;
	if (dup) {
		/* Byte-identical copy: just record the extra pointer below. */
		if (bytes == vstruct_bytes(&dup->j) &&
		    !memcmp(j, &dup->j, bytes)) {
			i = dup;
			goto found;
		}

		/* New copy's checksum was bad: keep what we already have. */
		if (!entry_ptr.csum_good) {
			i = dup;
			goto found;
		}

		/* Existing copy's checksum was bad: replace it with this one. */
		if (!dup->csum_good)
			goto replace;

		/* Both checksummed ok but differ - shouldn't happen: */
		fsck_err(c, "found duplicate but non identical journal entries (seq %llu)",
			 le64_to_cpu(j->seq));
		i = dup;
		goto found;
	}
replace:
	i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
	if (!i)
		return -ENOMEM;

	i->nr_ptrs	= 0;
	i->csum_good	= entry_ptr.csum_good;
	i->ignore	= false;
	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
	i->ptrs[i->nr_ptrs++] = entry_ptr;

	if (dup) {
		/* Carry over the old copy's pointers before freeing it: */
		if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
			bch_err(c, "found too many copies of journal entry %llu",
				le64_to_cpu(i->j.seq));
			dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
		}

		/* The first ptr should represent the jset we kept: */
		memcpy(i->ptrs + i->nr_ptrs,
		       dup->ptrs,
		       sizeof(dup->ptrs[0]) * dup->nr_ptrs);
		i->nr_ptrs += dup->nr_ptrs;
		__journal_replay_free(c, dup);
	}

	*_i = i;
	return 0;
found:
	for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
		if (ptr->dev == ca->dev_idx) {
			bch_err(c, "duplicate journal entry %llu on same device",
				le64_to_cpu(i->j.seq));
			goto out;
		}
	}

	if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
		bch_err(c, "found too many copies of journal entry %llu",
			le64_to_cpu(i->j.seq));
		goto out;
	}

	i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
	return ret;
}
196
1c6fdbd8
KO
197/* this fills in a range with empty jset_entries: */
198static void journal_entry_null_range(void *start, void *end)
199{
200 struct jset_entry *entry;
201
202 for (entry = start; entry != end; entry = vstruct_next(entry))
203 memset(entry, 0, sizeof(*entry));
204}
205
206#define JOURNAL_ENTRY_REREAD 5
207#define JOURNAL_ENTRY_NONE 6
208#define JOURNAL_ENTRY_BAD 7
209
c23a9e08
KO
/*
 * Format the common prefix for journal entry error messages: entry type (if
 * known) and where the entry came from (superblock, or seq/offset in a jset).
 */
static void journal_entry_err_msg(struct printbuf *out,
				  struct jset *jset,
				  struct jset_entry *entry)
{
	prt_str(out, "invalid journal entry ");
	if (entry)
		prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]);

	if (!jset)
		prt_printf(out, "in superblock");
	else if (!entry)
		prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq));
	else
		prt_printf(out, "at offset %zi/%u seq %llu",
			   (u64 *) entry - jset->_data,
			   le32_to_cpu(jset->u64s),
			   le64_to_cpu(jset->seq));
	prt_str(out, ": ");
}
229
/*
 * Report an invalid journal entry.  On READ this is a fixable fsck error; on
 * WRITE (validating an entry we're about to write) it means we generated
 * corrupt metadata, which is fatal unless fsck errors are being ignored.
 *
 * Relies on `write`, `ret` and a `fsck_err:` label in the enclosing scope.
 * Evaluates to true so journal_entry_err_on() can use it in boolean context.
 *
 * NOTE(review): mustfix_fsck_err() may itself jump to fsck_err, which would
 * also skip printbuf_exit() on the READ path - verify and restructure if so.
 */
#define journal_entry_err(c, jset, entry, msg, ...)			\
({									\
	struct printbuf buf = PRINTBUF;					\
									\
	journal_entry_err_msg(&buf, jset, entry);			\
	prt_printf(&buf, msg, ##__VA_ARGS__);				\
									\
	switch (write) {						\
	case READ:							\
		mustfix_fsck_err(c, "%s", buf.buf);			\
		break;							\
	case WRITE:							\
		bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\
		if (bch2_fs_inconsistent(c)) {				\
			ret = -BCH_ERR_fsck_errors_not_fixed;		\
			/* don't leak the printbuf on the error path: */\
			printbuf_exit(&buf);				\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
									\
	printbuf_exit(&buf);						\
	true;								\
})

#define journal_entry_err_on(cond, c, jset, entry, msg, ...)		\
	((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false)
1c6fdbd8 256
4d54337c
KO
257#define FSCK_DELETED_KEY 5
258
c23a9e08
KO
/*
 * Validate a single bkey within a journal entry.  Invalid keys are deleted in
 * place (the entry is shrunk and the tail nulled out) and FSCK_DELETED_KEY is
 * returned so the caller knows not to advance past the deleted slot.
 */
static int journal_validate_key(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned level, enum btree_id btree_id,
				struct bkey_i *k,
				unsigned version, int big_endian, int write)
{
	void *next = vstruct_next(entry);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) {
		/* truncate the entry at this key and null out the rest: */
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on((void *) bkey_next(k) >
				 (void *) vstruct_next(entry),
				 c, jset, entry,
				 "extends past end of journal entry")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
				 c, jset, entry,
				 "bad format %u", k->k.format)) {
		/* delete just this key: shift the following keys down */
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	/* on read, convert to current on-disk format before validating: */
	if (!write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));

	if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
			      __btree_node_type(level, btree_id), write, &buf)) {
		printbuf_reset(&buf);
		prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:",
			   bch2_jset_entry_types[entry->type],
			   (u64 *) entry - jset->_data,
			   le32_to_cpu(jset->u64s),
			   le64_to_cpu(jset->seq));
		prt_newline(&buf);
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
		prt_newline(&buf);
		/* second call appends the reason the key is invalid: */
		bch2_bkey_invalid(c, bkey_i_to_s_c(k),
				  __btree_node_type(level, btree_id), write, &buf);

		mustfix_fsck_err(c, "%s", buf.buf);

		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);

		printbuf_exit(&buf);
		return FSCK_DELETED_KEY;
	}

	/* on write, convert from in-memory to on-disk format after validating: */
	if (write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
331
/*
 * Validate every key in a btree_keys journal entry; deleted keys leave k
 * pointing at the slot that now holds the next key, so we don't advance.
 */
static int journal_entry_btree_keys_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian, int write)
{
	struct bkey_i *k = entry->start;

	while (k != vstruct_last(entry)) {
		int ret = journal_validate_key(c, jset, entry,
					       entry->level,
					       entry->btree_id,
					       k, version, big_endian, write);
		if (ret == FSCK_DELETED_KEY)
			continue;

		k = bkey_next(k);
	}

	return 0;
}
352
528b18e6
KO
/*
 * Print each key in a btree_keys entry; keys after the first are prefixed
 * with the entry type again so multi-key entries stay readable.
 */
static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct bkey_i *k;
	bool first = true;

	vstruct_for_each(entry, k) {
		if (!first) {
			prt_newline(out);
			prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
		}
		prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
		first = false;
	}
}
369
/*
 * A btree_root entry must contain exactly one key; validate it.
 */
static int journal_entry_btree_root_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian, int write)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s,
				 c, jset, entry,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	/* btree roots are always validated as level 1: */
	return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
				    version, big_endian, write);
fsck_err:
	return ret;
}
398
528b18e6
KO
/* A btree root prints the same way as a btree keys entry. */
static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}
404
/* prio_ptrs entries are obsolete; accept them without validation. */
static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian, int write)
{
	/* obsolete, don't care: */
	return 0;
}
413
528b18e6
KO
/* prio_ptrs entries are obsolete; nothing to print. */
static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
}
418
/*
 * A (v1) blacklist entry carries a single u64 seq; anything else is nulled.
 */
static int journal_entry_blacklist_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian, int write)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
				 c, jset, entry,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}
434
528b18e6
KO
/* Print the blacklisted sequence number. */
static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_blacklist *bl =
		container_of(entry, struct jset_entry_blacklist, entry);

	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}
443
/*
 * A v2 blacklist entry carries a [start, end] seq range (two u64s); check
 * the size and that the range isn't inverted.
 */
static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
					       struct jset *jset,
					       struct jset_entry *entry,
					       unsigned version, int big_endian, int write)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
				 c, jset, entry,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end),
				 c, jset, entry,
				 "invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}
471
528b18e6
KO
/* Print the blacklisted sequence number range. */
static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
					       struct jset_entry *entry)
{
	struct jset_entry_blacklist_v2 *bl =
		container_of(entry, struct jset_entry_blacklist_v2, entry);

	prt_printf(out, "start=%llu end=%llu",
		   le64_to_cpu(bl->start),
		   le64_to_cpu(bl->end));
}
482
/*
 * Validate a fs usage entry: only a minimum-size check, the payload itself
 * is applied during recovery.
 */
static int journal_entry_usage_validate(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					unsigned version, int big_endian, int write)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c, jset, entry,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}
503
528b18e6
KO
/* Print a fs usage entry: usage type and value. */
static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);

	/* for usage entries the btree_id field encodes the usage type: */
	prt_printf(out, "type=%s v=%llu",
		   bch2_fs_usage_types[u->entry.btree_id],
		   le64_to_cpu(u->v));
}
514
/*
 * Validate a data usage entry: must be big enough for the header plus the
 * variable-length replicas device list.
 */
static int journal_entry_data_usage_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian, int write)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c, jset, entry,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}
536
528b18e6
KO
/* Print a data usage entry: replicas descriptor and its sector count. */
static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);

	bch2_replicas_entry_to_text(out, &u->r);
	prt_printf(out, "=%llu", le64_to_cpu(u->v));
}
546
/*
 * Validate an IO clock entry: exact size, and rw must be 0 (read) or 1
 * (write).
 */
static int journal_entry_clock_validate(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					unsigned version, int big_endian, int write)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes != sizeof(*clock),
				 c, jset, entry, "bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(clock->rw > 1,
				 c, jset, entry, "bad rw")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}
572
528b18e6
KO
/* Print an IO clock entry: which clock (read/write) and its time. */
static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);

	prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
}
581
/*
 * Validate a device usage entry: minimum size, the device must exist, and
 * the padding field must be zero.
 */
static int journal_entry_dev_usage_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian, int write)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	unsigned expected = sizeof(*u);
	unsigned dev;
	int ret = 0;

	if (journal_entry_err_on(bytes < expected,
				 c, jset, entry, "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	dev = le32_to_cpu(u->dev);

	if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
				 c, jset, entry, "bad dev")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(u->pad,
				 c, jset, entry, "bad pad")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}
618
528b18e6
KO
/* Print a device usage entry: per-data-type bucket/sector counts. */
static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

	prt_printf(out, "dev=%u", le32_to_cpu(u->dev));

	for (i = 0; i < nr_types; i++) {
		/* types beyond what this version knows about get a fallback label: */
		if (i < BCH_DATA_NR)
			prt_printf(out, " %s", bch2_data_types[i]);
		else
			prt_printf(out, " (unknown data type %u)", i);
		prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
			   le64_to_cpu(u->d[i].buckets),
			   le64_to_cpu(u->d[i].sectors),
			   le64_to_cpu(u->d[i].fragmented));
	}

	prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
}
641
/* Log entries are free-form human-readable text; nothing to validate. */
static int journal_entry_log_validate(struct bch_fs *c,
				      struct jset *jset,
				      struct jset_entry *entry,
				      unsigned version, int big_endian, int write)
{
	return 0;
}
649
528b18e6
KO
650static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
651 struct jset_entry *entry)
652{
653 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
654 unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
655
401ec4db 656 prt_printf(out, "%.*s", bytes, l->d);
528b18e6
KO
657}
658
c23a9e08
KO
/* Overwrite entries hold bkeys; validate them like btree_keys entries. */
static int journal_entry_overwrite_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian, int write)
{
	return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, write);
}
666
/* Overwrite entries print the same way as btree_keys entries. */
static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}
672
1c6fdbd8 673struct jset_entry_ops {
c23a9e08 674 int (*validate)(struct bch_fs *, struct jset *,
7d6f07ed 675 struct jset_entry *, unsigned, int, int);
528b18e6 676 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
1c6fdbd8
KO
677};
678
/* Dispatch table, generated from BCH_JSET_ENTRY_TYPES(): one ops struct per type. */
static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)						\
	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
		.validate	= journal_entry_##f##_validate,	\
		.to_text	= journal_entry_##f##_to_text,	\
	},
	BCH_JSET_ENTRY_TYPES()
#undef x
};
688
c23a9e08
KO
/*
 * Validate one journal entry by dispatching to its type's validate op;
 * unknown (newer) entry types are accepted unchanged.
 */
int bch2_journal_entry_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian, int write)
{
	return entry->type < BCH_JSET_ENTRY_NR
		? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
				version, big_endian, write)
		: 0;
}
699
528b18e6
KO
/*
 * Pretty-print one journal entry via its type's to_text op; unknown types
 * just print the raw type number.
 */
void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
				struct jset_entry *entry)
{
	if (entry->type < BCH_JSET_ENTRY_NR) {
		prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
		bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
	} else {
		prt_printf(out, "(unknown type %u)", entry->type);
	}
}
710
1c6fdbd8
KO
/*
 * Validate every entry in a jset; an entry that extends past the end of the
 * jset truncates the jset at that point.
 */
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 int write)
{
	struct jset_entry *entry;
	int ret = 0;

	vstruct_for_each(jset, entry) {
		if (journal_entry_err_on(vstruct_next(entry) >
					 vstruct_last(jset), c, jset, entry,
				"journal entry extends past end of jset")) {
			/* truncate the jset just before this entry: */
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = bch2_journal_entry_validate(c, jset, entry,
					le32_to_cpu(jset->version),
					JSET_BIG_ENDIAN(jset), write);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}
734
/*
 * Validate a jset's envelope (magic, version, size, checksum type/value,
 * last_seq) and decrypt it.  Returns 0, one of the positive
 * JOURNAL_ENTRY_{NONE,REREAD,BAD} codes, or an errno-style error.
 */
static int jset_validate(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 unsigned bucket_sectors_left,
			 unsigned sectors_read,
			 int write)
{
	size_t bytes = vstruct_bytes(jset);
	struct bch_csum csum;
	unsigned version;
	int ret = 0;

	/* not a journal entry at all - not an error, just end of entries: */
	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
				  version < bcachefs_metadata_version_min) ||
				 version >= bcachefs_metadata_version_max,
				 c, jset, NULL,
			"%s sector %llu seq %llu: unknown journal entry version %u",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq),
			version)) {
		/* don't try to continue: */
		/* NOTE(review): positive EINVAL looks inconsistent with the
		 * file's negative-errno convention - verify callers. */
		return EINVAL;
	}

	/* entry is bigger than what we read so far - ask caller to re-read: */
	if (bytes > (sectors_read << 9) &&
	    sectors_read < bucket_sectors_left)
		return JOURNAL_ENTRY_REREAD;

	if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
				 c, jset, NULL,
			"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq), bytes)) {
		ret = JOURNAL_ENTRY_BAD;
		/* clamp u64s so the entry fits in what remains of the bucket: */
		le32_add_cpu(&jset->u64s,
			     -((bytes - (bucket_sectors_left << 9)) / 8));
	}

	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
				 c, jset, NULL,
			"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq),
			JSET_CSUM_TYPE(jset))) {
		ret = JOURNAL_ENTRY_BAD;
		goto csum_done;
	}

	/* when validating for write the checksum hasn't been computed yet: */
	if (write)
		goto csum_done;

	csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
	if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum),
				 c, jset, NULL,
			"%s sector %llu seq %llu: journal checksum bad",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq)))
		ret = JOURNAL_ENTRY_BAD;

	/* NOTE(review): this assignment clobbers a JOURNAL_ENTRY_BAD set just
	 * above on checksum mismatch; the caller compensates by checking
	 * jset_csum_good() separately - confirm this is intentional. */
	ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
			   jset->encrypted_start,
			   vstruct_end(jset) - (void *) jset->encrypted_start);
	bch2_fs_fatal_err_on(ret, c,
			"error decrypting journal entry: %i", ret);
csum_done:
	/* last_seq is ignored when JSET_NO_FLUSH is true */
	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
				 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
				 c, jset, NULL,
			"invalid journal entry: last_seq > seq (%llu > %llu)",
			le64_to_cpu(jset->last_seq),
			le64_to_cpu(jset->seq))) {
		jset->last_seq = jset->seq;
		return JOURNAL_ENTRY_BAD;
	}
fsck_err:
	return ret;
}
817
ed9d58a2
KO
/*
 * Validate a jset we're about to write: envelope checks plus per-entry
 * validation, with write semantics (no checksum verify).
 */
static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
{
	unsigned sectors = vstruct_sectors(jset, c->block_bits);

	return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
		jset_validate_entries(c, jset, WRITE);
}
825
1c6fdbd8
KO
/* Reusable, growable buffer for reading journal entries from disk. */
struct journal_read_buf {
	void		*data;
	size_t		size;
};
830
831static int journal_read_buf_realloc(struct journal_read_buf *b,
832 size_t new_size)
833{
834 void *n;
835
836 /* the bios are sized for this many pages, max: */
837 if (new_size > JOURNAL_ENTRY_SIZE_MAX)
838 return -ENOMEM;
839
840 new_size = roundup_pow_of_two(new_size);
841 n = kvpmalloc(new_size, GFP_KERNEL);
842 if (!n)
843 return -ENOMEM;
844
845 kvpfree(b->data, b->size);
846 b->data = n;
847 b->size = new_size;
848 return 0;
849}
850
/*
 * Scan one journal bucket on @ca for journal entries, adding each one found
 * to the shared journal_list.  Read errors are not fatal: the entry may exist
 * on another device, and missing entries are detected later in recovery.
 */
static int journal_read_bucket(struct bch_dev *ca,
			       struct journal_read_buf *buf,
			       struct journal_list *jlist,
			       unsigned bucket)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	struct jset *j = NULL;
	unsigned sectors, sectors_read = 0;
	u64 offset	= bucket_to_sector(ca, ja->buckets[bucket]),
	    end		= offset + ca->mi.bucket_size;
	bool saw_bad = false, csum_good;
	int ret = 0;

	pr_debug("reading %u", bucket);

	while (offset < end) {
		if (!sectors_read) {
			struct bio *bio;
			unsigned nr_bvecs;
reread:
			sectors_read = min_t(unsigned,
					     end - offset, buf->size >> 9);
			nr_bvecs = buf_pages(buf->data, sectors_read << 9);

			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

			bio->bi_iter.bi_sector = offset;
			bch2_bio_map(bio, buf->data, sectors_read << 9);

			ret = submit_bio_wait(bio);
			kfree(bio);

			if (bch2_dev_io_err_on(ret, ca,
					       "journal read error: sector %llu",
					       offset) ||
			    bch2_meta_read_fault("journal")) {
				/*
				 * We don't error out of the recovery process
				 * here, since the relevant journal entry may be
				 * found on a different device, and missing or
				 * no journal entries will be handled later
				 */
				return 0;
			}

			j = buf->data;
		}

		ret = jset_validate(c, ca, j, offset,
				    end - offset, sectors_read,
				    READ);
		switch (ret) {
		case 0:
			sectors = vstruct_sectors(j, c->block_bits);
			break;
		case JOURNAL_ENTRY_REREAD:
			/* entry bigger than our buffer - grow it and retry: */
			if (vstruct_bytes(j) > buf->size) {
				ret = journal_read_buf_realloc(buf,
							vstruct_bytes(j));
				if (ret)
					return ret;
			}
			goto reread;
		case JOURNAL_ENTRY_NONE:
			if (!saw_bad)
				return 0;
			sectors = block_sectors(c);
			goto next_block;
		case JOURNAL_ENTRY_BAD:
			saw_bad = true;
			/*
			 * On checksum error we don't really trust the size
			 * field of the journal entry we read, so try reading
			 * again at next block boundary:
			 */
			sectors = block_sectors(c);
			break;
		default:
			return ret;
		}

		/*
		 * This happens sometimes if we don't have discards on -
		 * when we've partially overwritten a bucket with new
		 * journal entries. We don't need the rest of the
		 * bucket:
		 */
		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
			return 0;

		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

		csum_good = jset_csum_good(c, j);
		if (!csum_good)
			saw_bad = true;

		mutex_lock(&jlist->lock);
		ret = journal_entry_add(c, ca, (struct journal_ptr) {
					.csum_good	= csum_good,
					.dev		= ca->dev_idx,
					.bucket		= bucket,
					.bucket_offset	= offset -
						bucket_to_sector(ca, ja->buckets[bucket]),
					.sector		= offset,
					}, jlist, j);
		mutex_unlock(&jlist->lock);

		switch (ret) {
		case JOURNAL_ENTRY_ADD_OK:
			break;
		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
			break;
		default:
			return ret;
		}
next_block:
		pr_debug("next");
		offset		+= sectors;
		sectors_read	-= sectors;
		j = ((void *) j) + (sectors << 9);
	}

	return 0;
}
977
/*
 * Per-device journal read, run as a closure under the parent journal_list
 * closure: scan every journal bucket on @ca, record entries into
 * c->journal_entries (via journal_read_bucket -> journal_entry_add), then
 * position ja->cur_idx / ja->sectors_free so that journal writes resume
 * after the newest entry found on this device.
 *
 * Errors are reported through jlist->ret (under jlist->lock); the buffer,
 * io_ref and closure are released on all paths.
 */
static void bch2_journal_read_device(struct closure *cl)
{
	struct journal_device *ja =
		container_of(cl, struct journal_device, read);
	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
	struct bch_fs *c = ca->fs;
	struct journal_list *jlist =
		container_of(cl->parent, struct journal_list, cl);
	struct journal_replay *r, **_r;
	struct genradix_iter iter;
	struct journal_read_buf buf = { NULL, 0 };
	u64 min_seq = U64_MAX;
	unsigned i;
	int ret = 0;

	if (!ja->nr)
		goto out;

	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
	if (ret)
		goto err;

	pr_debug("%u journal buckets", ja->nr);

	/* Scan all buckets; journal_read_bucket fills ja->bucket_seq[]: */
	for (i = 0; i < ja->nr; i++) {
		ret = journal_read_bucket(ca, &buf, jlist, i);
		if (ret)
			goto err;
	}

	/* Find the journal bucket with the highest sequence number: */
	for (i = 0; i < ja->nr; i++) {
		if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
			ja->cur_idx = i;

		min_seq = min(ja->bucket_seq[i], min_seq);
	}

	/*
	 * If there's duplicate journal entries in multiple buckets (which
	 * definitely isn't supposed to happen, but...) - make sure to start
	 * cur_idx at the last of those buckets, so we don't deadlock trying to
	 * allocate
	 */
	while (ja->bucket_seq[ja->cur_idx] > min_seq &&
	       ja->bucket_seq[ja->cur_idx] ==
	       ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
		ja->cur_idx = (ja->cur_idx + 1) % ja->nr;

	ja->sectors_free = ca->mi.bucket_size;

	/*
	 * Walk every entry read so far (from all devices) and, for entries
	 * that live in our current bucket, shrink sectors_free to account
	 * for what has already been written there:
	 */
	mutex_lock(&jlist->lock);
	genradix_for_each(&c->journal_entries, iter, _r) {
		r = *_r;

		if (!r)
			continue;

		for (i = 0; i < r->nr_ptrs; i++) {
			if (r->ptrs[i].dev == ca->dev_idx &&
			    sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
				unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
					vstruct_sectors(&r->j, c->block_bits);

				ja->sectors_free = min(ja->sectors_free,
						       ca->mi.bucket_size - wrote);
			}
		}
	}
	mutex_unlock(&jlist->lock);

	/*
	 * Sanity check: if the current bucket holds entries, some of it must
	 * have been consumed; a full sectors_free here indicates we failed to
	 * find our own writes - dump state and treat the bucket as full.
	 */
	if (ja->bucket_seq[ja->cur_idx] &&
	    ja->sectors_free == ca->mi.bucket_size) {
		bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
		bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
		for (i = 0; i < 3; i++) {
			unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
			bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
		}
		ja->sectors_free = 0;
	}

	/*
	 * Set dirty_idx to indicate the entire journal is full and needs to be
	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
	 * pinned when it first runs:
	 */
	ja->discard_idx = ja->dirty_idx_ondisk =
		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
	kvpfree(buf.data, buf.size);
	percpu_ref_put(&ca->io_ref);
	closure_return(cl);
	return;
err:
	mutex_lock(&jlist->lock);
	jlist->ret = ret;
	mutex_unlock(&jlist->lock);
	goto out;
}
1079
72b7d633
KO
1080void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
1081 struct journal_replay *j)
e4c3f386
KO
1082{
1083 unsigned i;
1084
1085 for (i = 0; i < j->nr_ptrs; i++) {
c0ebe3e4 1086 struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
514852c2
KO
1087 u64 offset;
1088
72b7d633 1089 div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
e4c3f386
KO
1090
1091 if (i)
401ec4db
KO
1092 prt_printf(out, " ");
1093 prt_printf(out, "%u:%u:%u (sector %llu)",
e4c3f386 1094 j->ptrs[i].dev,
72b7d633
KO
1095 j->ptrs[i].bucket,
1096 j->ptrs[i].bucket_offset,
1097 j->ptrs[i].sector);
e4c3f386
KO
1098 }
1099}
1100
/*
 * Read the journal from all member devices and decide what will be replayed.
 *
 * On success, c->journal_entries holds the entries to replay;
 * @start_seq is set to one past the newest entry read, and @blacklist_seq
 * to one past the newest *flush* entry (sequence numbers in between were
 * unflushed and are dropped/blacklisted).  Returns 0 on success (including
 * the empty-journal case) or a negative/fsck error.
 */
int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
{
	struct journal_list jlist;
	struct journal_replay *i, **_i, *prev = NULL;
	struct genradix_iter radix_iter;
	struct bch_dev *ca;
	unsigned iter;
	struct printbuf buf = PRINTBUF;
	size_t keys = 0, entries = 0;
	bool degraded = false;
	u64 seq, last_seq = 0;
	int ret = 0;

	closure_init_stack(&jlist.cl);
	mutex_init(&jlist.lock);
	jlist.last_seq = 0;
	jlist.ret = 0;

	/* Kick off a per-device read closure on every usable device: */
	for_each_member_device(ca, c, iter) {
		/* Unless fsck is running, skip devices with no journal data: */
		if (!c->opts.fsck &&
		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
			continue;

		if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
		     ca->mi.state == BCH_MEMBER_STATE_ro) &&
		    percpu_ref_tryget(&ca->io_ref))
			closure_call(&ca->journal.read,
				     bch2_journal_read_device,
				     system_unbound_wq,
				     &jlist.cl);
		else
			degraded = true;
	}

	/* Wait for all per-device reads to finish: */
	closure_sync(&jlist.cl);

	if (jlist.ret)
		return jlist.ret;

	*start_seq = 0;

	/*
	 * Find most recent flush entry, and ignore newer non flush entries -
	 * those entries will be blacklisted:
	 */
	genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		/* First (newest) live entry determines start_seq: */
		if (!*start_seq)
			*start_seq = le64_to_cpu(i->j.seq) + 1;

		if (!JSET_NO_FLUSH(&i->j)) {
			last_seq = le64_to_cpu(i->j.last_seq);
			*blacklist_seq = le64_to_cpu(i->j.seq) + 1;
			break;
		}

		journal_replay_free(c, i);
	}

	if (!*start_seq) {
		bch_info(c, "journal read done, but no entries found");
		return 0;
	}

	if (!last_seq) {
		fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
		ret = -1;
		goto err;
	}

	/* Drop blacklisted entries and entries older than last_seq: */
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		seq = le64_to_cpu(i->j.seq);
		if (seq < last_seq) {
			journal_replay_free(c, i);
			continue;
		}

		if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
			/* A blacklisted *flush* entry would mean lost data: */
			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
				    "found blacklisted journal entry %llu", seq);

			journal_replay_free(c, i);
		}
	}

	/* Check for missing entries: */
	seq = last_seq;
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		BUG_ON(seq > le64_to_cpu(i->j.seq));

		/* Report each gap that isn't covered by a blacklist range: */
		while (seq < le64_to_cpu(i->j.seq)) {
			u64 missing_start, missing_end;
			struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

			while (seq < le64_to_cpu(i->j.seq) &&
			       bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (seq == le64_to_cpu(i->j.seq))
				break;

			missing_start = seq;

			while (seq < le64_to_cpu(i->j.seq) &&
			       !bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (prev) {
				bch2_journal_ptrs_to_text(&buf1, c, prev);
				prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
			} else
				prt_printf(&buf1, "(none)");
			bch2_journal_ptrs_to_text(&buf2, c, i);

			missing_end = seq - 1;
			fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
				 " prev at %s\n"
				 " next at %s",
				 missing_start, missing_end,
				 last_seq, *blacklist_seq - 1,
				 buf1.buf, buf2.buf);

			printbuf_exit(&buf1);
			printbuf_exit(&buf2);
		}

		prev = i;
		seq++;
	}

	/* Validate surviving entries and mark their replicas: */
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		struct jset_entry *entry;
		struct bkey_i *k, *_n;
		struct bch_replicas_padded replicas = {
			.e.data_type = BCH_DATA_journal,
			.e.nr_required = 1,
		};
		unsigned ptr;

		i = *_i;
		if (!i || i->ignore)
			continue;

		ret = jset_validate_entries(c, &i->j, READ);
		if (ret)
			goto err;

		for (ptr = 0; ptr < i->nr_ptrs; ptr++)
			replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;

		bch2_replicas_entry_sort(&replicas.e);

		/*
		 * If we're mounting in degraded mode - if we didn't read all
		 * the devices - this is wrong:
		 */

		printbuf_reset(&buf);
		bch2_replicas_entry_to_text(&buf, &replicas.e);

		if (!degraded &&
		    fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
				"superblock not marked as containing replicas %s",
				buf.buf)) {
			ret = bch2_mark_replicas(c, &replicas.e);
			if (ret)
				goto err;
		}

		for_each_jset_key(k, _n, entry, &i->j)
			keys++;
		entries++;
	}

	bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
		 keys, entries, *start_seq);

	if (*start_seq != *blacklist_seq)
		bch_info(c, "dropped unflushed entries %llu-%llu",
			 *blacklist_seq, *start_seq - 1);
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
1301
1c6fdbd8
KO
1302/* journal write: */
1303
a9ec3454
KO
/*
 * Try to add pointers for the journal write @w to devices from @devs_sorted
 * until @replicas reaches @replicas_want.  For each chosen device, a pointer
 * is appended to w->key at the current position in the device's current
 * journal bucket, and that device's sectors_free / bucket_seq are updated.
 *
 * Caller holds rcu_read_lock() (we rcu_dereference c->devs) and j->lock.
 * *replicas is updated in place; may return with *replicas < replicas_want
 * if not enough devices qualify.
 */
static void __journal_write_alloc(struct journal *j,
				  struct journal_buf *w,
				  struct dev_alloc_list *devs_sorted,
				  unsigned sectors,
				  unsigned *replicas,
				  unsigned replicas_want)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_device *ja;
	struct bch_dev *ca;
	unsigned i;

	if (*replicas >= replicas_want)
		return;

	for (i = 0; i < devs_sorted->nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		/*
		 * Check that we can use this device, and aren't already using
		 * it:
		 */
		if (!ca->mi.durability ||
		    ca->mi.state != BCH_MEMBER_STATE_rw ||
		    !ja->nr ||
		    bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
					 ca->dev_idx) ||
		    sectors > ja->sectors_free)
			continue;

		bch2_dev_stripe_increment(ca, &j->wp.stripe);

		/* Pointer offset: end of bucket minus what's still free: */
		bch2_bkey_append_ptr(&w->key,
			(struct bch_extent_ptr) {
				  .offset = bucket_to_sector(ca,
					ja->buckets[ja->cur_idx]) +
					ca->mi.bucket_size -
					ja->sectors_free,
				  .dev = ca->dev_idx,
		});

		ja->sectors_free -= sectors;
		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

		*replicas += ca->mi.durability;

		if (*replicas >= replicas_want)
			break;
	}
}
1c6fdbd8 1358
a9ec3454
KO
/**
 * journal_write_alloc - allocate on-disk space for a journal write
 *
 * (The old kernel-doc header named this "journal_next_bucket", which is not
 * this function's name.)  Picks devices for the write @w of @sectors,
 * advancing each candidate device to its next journal bucket when the
 * current one is too full, and falling back from the configured metadata/
 * foreground target to all devices if the target alone can't provide enough
 * replicas.  Returns 0 if at least metadata_replicas_required replicas were
 * allocated, -EROFS otherwise.  Caller holds j->lock.
 */
static int journal_write_alloc(struct journal *j, struct journal_buf *w,
			       unsigned sectors)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_devs_mask devs;
	struct journal_device *ja;
	struct bch_dev *ca;
	struct dev_alloc_list devs_sorted;
	unsigned target = c->opts.metadata_target ?:
		c->opts.foreground_target;
	unsigned i, replicas = 0, replicas_want =
		READ_ONCE(c->opts.metadata_replicas);

	rcu_read_lock();
retry:
	devs = target_rw_devs(c, BCH_DATA_journal, target);

	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);

	/* First pass: use whatever fits in devices' current buckets: */
	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas >= replicas_want)
		goto done;

	/* Not enough - advance devices to fresh buckets where possible: */
	for (i = 0; i < devs_sorted.nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		if (sectors > ja->sectors_free &&
		    sectors <= ca->mi.bucket_size &&
		    bch2_journal_dev_buckets_available(j, ja,
					journal_space_discarded)) {
			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
			ja->sectors_free = ca->mi.bucket_size;

			/*
			 * ja->bucket_seq[ja->cur_idx] must always have
			 * something sensible:
			 */
			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
		}
	}

	/* Second pass with the freshly-advanced buckets: */
	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas < replicas_want && target) {
		/* Retry from all devices: */
		target = 0;
		goto retry;
	}
done:
	rcu_read_unlock();

	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);

	return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}
1424
1c6fdbd8
KO
1425static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
1426{
1427 /* we aren't holding j->lock: */
1428 unsigned new_size = READ_ONCE(j->buf_size_want);
1429 void *new_buf;
1430
d16b4a77 1431 if (buf->buf_size >= new_size)
1c6fdbd8
KO
1432 return;
1433
1434 new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
1435 if (!new_buf)
1436 return;
1437
d16b4a77 1438 memcpy(new_buf, buf->data, buf->buf_size);
c859430b
KO
1439
1440 spin_lock(&j->lock);
1441 swap(buf->data, new_buf);
1442 swap(buf->buf_size, new_size);
1443 spin_unlock(&j->lock);
1444
1445 kvpfree(new_buf, new_size);
1c6fdbd8
KO
1446}
1447
ebb84d09
KO
1448static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
1449{
30ef633a 1450 return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
ebb84d09
KO
1451}
1452
1c6fdbd8
KO
/*
 * Completion path for a journal write: record which devices the entry
 * reached, update on-disk sequence state, advance the unwritten-buffer
 * index, and kick off the next write (or arm the delayed write timer)
 * if one is pending.  Runs as the journal io closure's completion.
 */
static void journal_write_done(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_buf *w = journal_last_unwritten_buf(j);
	struct bch_replicas_padded replicas;
	union journal_res_state old, new;
	u64 v, seq;
	int err = 0;

	bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
			       ? j->flush_write_time
			       : j->noflush_write_time, j->write_start_time);

	/* devs_written was pruned by journal_write_endio on per-device error: */
	if (!w->devs_written.nr) {
		bch_err(c, "unable to write journal to sufficient devices");
		err = -EIO;
	} else {
		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
					 w->devs_written);
		if (bch2_mark_replicas(c, &replicas.e))
			err = -EIO;
	}

	if (err)
		bch2_fatal_error(c);

	spin_lock(&j->lock);
	seq = le64_to_cpu(w->data->seq);

	if (seq >= j->pin.front)
		journal_seq_pin(j, seq)->devs = w->devs_written;

	if (!err) {
		if (!JSET_NO_FLUSH(w->data)) {
			/* A flush write makes everything up to seq durable: */
			j->flushed_seq_ondisk = seq;
			j->last_seq_ondisk = w->last_seq;

			bch2_do_discards(c);
			closure_wake_up(&c->freelist_wait);

			bch2_reset_alloc_cursors(c);
		}
	} else if (!j->err_seq || seq < j->err_seq)
		j->err_seq = seq;

	j->seq_ondisk = seq;

	/*
	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
	 * more buckets:
	 *
	 * Must come before signaling write completion, for
	 * bch2_fs_journal_stop():
	 */
	if (j->watermark)
		journal_reclaim_kick(&c->journal);

	/* also must come before signalling write completion: */
	closure_debug_destroy(cl);

	/* Atomically bump unwritten_idx in the packed reservation state: */
	v = atomic64_read(&j->reservations.counter);
	do {
		old.v = new.v = v;
		BUG_ON(journal_state_count(new, new.unwritten_idx));

		new.unwritten_idx++;
	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
				       old.v, new.v)) != old.v);

	bch2_journal_space_available(j);

	closure_wake_up(&w->wait);
	journal_wake(j);

	if (!journal_state_count(new, new.unwritten_idx) &&
	    journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
		/* Next buffer is already closed - write it immediately: */
		closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
	} else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
		   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
		struct journal_buf *buf = journal_cur_buf(j);
		long delta = buf->expires - jiffies;

		/*
		 * We don't close a journal entry to write it while there's
		 * previous entries still in flight - the current journal entry
		 * might want to be written now:
		 */

		mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
	}

	spin_unlock(&j->lock);
}
1547
1548static void journal_write_endio(struct bio *bio)
1549{
1550 struct bch_dev *ca = bio->bi_private;
1551 struct journal *j = &ca->fs->journal;
d797ca3d
KO
1552 struct journal_buf *w = journal_last_unwritten_buf(j);
1553 unsigned long flags;
1c6fdbd8 1554
d797ca3d
KO
1555 if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
1556 le64_to_cpu(w->data->seq),
63b214e7 1557 bch2_blk_status_to_str(bio->bi_status)) ||
1c6fdbd8 1558 bch2_meta_write_fault("journal")) {
1c6fdbd8 1559 spin_lock_irqsave(&j->err_lock, flags);
d797ca3d 1560 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
1c6fdbd8
KO
1561 spin_unlock_irqrestore(&j->err_lock, flags);
1562 }
1563
1564 closure_put(&j->io);
1565 percpu_ref_put(&ca->io_ref);
1566}
1567
280249b9
KO
/*
 * Submit the actual data bios for a journal write, one per pointer in
 * w->key (i.e. one per replica device), then continue to
 * journal_write_done when they all complete.  Flush semantics:
 * REQ_FUA always for flush writes; REQ_PREFLUSH only when a separate
 * flush bio wasn't already issued (w->separate_flush).
 */
static void do_journal_write(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	struct journal_buf *w = journal_last_unwritten_buf(j);
	struct bch_extent_ptr *ptr;
	struct bio *bio;
	unsigned sectors = vstruct_sectors(w->data, c->block_bits);

	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
		ca = bch_dev_bkey_exists(c, ptr->dev);
		if (!percpu_ref_tryget(&ca->io_ref)) {
			/* XXX: fix this */
			bch_err(c, "missing device for journal write\n");
			continue;
		}

		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
			     sectors);

		bio = ca->journal.bio;
		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
		bio->bi_iter.bi_sector	= ptr->offset;
		bio->bi_end_io		= journal_write_endio;
		bio->bi_private		= ca;

		/* Two consecutive writes to the same sector would be a bug: */
		BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
		ca->prev_journal_sector = bio->bi_iter.bi_sector;

		if (!JSET_NO_FLUSH(w->data))
			bio->bi_opf |= REQ_FUA;
		if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
			bio->bi_opf |= REQ_PREFLUSH;

		bch2_bio_map(bio, w->data, sectors << 9);

		trace_and_count(c, journal_write, bio);
		closure_bio_submit(bio, cl);

		ca->journal.bucket_seq[ca->journal.cur_idx] =
			le64_to_cpu(w->data->seq);
	}

	continue_at(cl, journal_write_done, c->io_complete_wq);
	return;
}
1615
1c6fdbd8
KO
/*
 * Top half of a journal write, run as a closure: decide flush vs no-flush,
 * finalize the jset (btree roots, superblock entries, version, checksum/
 * encryption), allocate on-disk space, optionally issue separate flush
 * bios, then continue to do_journal_write to submit the data bios.
 * Allocation failure or validation failure is fatal to the filesystem.
 */
void bch2_journal_write(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	struct journal_buf *w = journal_last_unwritten_buf(j);
	struct jset_entry *start, *end;
	struct jset *jset;
	struct bio *bio;
	struct printbuf journal_debug_buf = PRINTBUF;
	bool validate_before_checksum = false;
	unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
	int ret;

	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));

	journal_buf_realloc(j, w);
	jset = w->data;

	j->write_start_time = local_clock();

	spin_lock(&j->lock);
	/*
	 * Downgrade to a no-flush write when allowed and no flush was
	 * explicitly requested within the flush-delay window:
	 */
	if (bch2_journal_error(j) ||
	    w->noflush ||
	    (!w->must_flush &&
	     (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
	     test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
		w->noflush = true;
		SET_JSET_NO_FLUSH(jset, true);
		jset->last_seq	= 0;
		w->last_seq	= 0;

		j->nr_noflush_writes++;
	} else {
		j->last_flush_write = jiffies;
		j->nr_flush_writes++;
	}
	spin_unlock(&j->lock);

	/*
	 * New btree roots are set by journalling them; when the journal entry
	 * gets written we have to propagate them to c->btree_roots
	 *
	 * But, every journal entry we write has to contain all the btree roots
	 * (at least for now); so after we copy btree roots to c->btree_roots we
	 * have to get any missing btree roots and add them to this journal
	 * entry:
	 */

	bch2_journal_entries_to_btree_roots(c, jset);

	start = end = vstruct_last(jset);

	end	= bch2_btree_roots_to_journal_entries(c, jset->start, end);

	bch2_journal_super_entries_add_common(c, &end,
				le64_to_cpu(jset->seq));
	u64s	= (u64 *) end - (u64 *) start;
	BUG_ON(u64s > j->entry_u64s_reserved);

	le32_add_cpu(&jset->u64s, u64s);
	BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);

	jset->magic		= cpu_to_le64(jset_magic(c));
	jset->version		= c->sb.version < bcachefs_metadata_version_bkey_renumber
		? cpu_to_le32(BCH_JSET_VERSION_OLD)
		: cpu_to_le32(c->sb.version);

	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));

	if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
		j->last_empty_seq = le64_to_cpu(jset->seq);

	/* Encrypted/old-version entries must be validated pre-checksum: */
	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
		validate_before_checksum = true;

	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

	if (validate_before_checksum &&
	    jset_validate_for_write(c, jset))
		goto err;

	ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
		    jset->encrypted_start,
		    vstruct_end(jset) - (void *) jset->encrypted_start);
	if (bch2_fs_fatal_err_on(ret, c,
			"error decrypting journal entry: %i", ret))
		goto err;

	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
				  journal_nonce(jset), jset);

	if (!validate_before_checksum &&
	    jset_validate_for_write(c, jset))
		goto err;

	sectors = vstruct_sectors(jset, c->block_bits);
	BUG_ON(sectors > w->sectors);

	/* Zero-pad out to the sector boundary: */
	bytes = vstruct_bytes(jset);
	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);

retry_alloc:
	spin_lock(&j->lock);
	ret = journal_write_alloc(j, w, sectors);

	/* On failure, try discarding old journal buckets and retry once more: */
	if (ret && j->can_discard) {
		spin_unlock(&j->lock);
		bch2_journal_do_discards(j);
		goto retry_alloc;
	}

	if (ret)
		__bch2_journal_debug_to_text(&journal_debug_buf, j);

	/*
	 * write is allocated, no longer need to account for it in
	 * bch2_journal_space_available():
	 */
	w->sectors = 0;

	/*
	 * journal entry has been compacted and allocated, recalculate space
	 * available:
	 */
	bch2_journal_space_available(j);
	spin_unlock(&j->lock);

	if (ret) {
		bch_err(c, "Unable to allocate journal write:\n%s",
			journal_debug_buf.buf);
		printbuf_exit(&journal_debug_buf);
		bch2_fatal_error(c);
		continue_at(cl, journal_write_done, c->io_complete_wq);
		return;
	}

	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));

	if (c->opts.nochanges)
		goto no_io;

	for_each_rw_member(ca, c, i)
		nr_rw_members++;

	/* With multiple devices, issue the flush as its own bio per device: */
	if (nr_rw_members > 1)
		w->separate_flush = true;

	if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
		for_each_rw_member(ca, c, i) {
			percpu_ref_get(&ca->io_ref);

			bio = ca->journal.bio;
			bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
			bio->bi_end_io		= journal_write_endio;
			bio->bi_private		= ca;
			closure_bio_submit(bio, cl);
		}
	}

	continue_at(cl, do_journal_write, c->io_complete_wq);
	return;
no_io:
	continue_at(cl, journal_write_done, c->io_complete_wq);
	return;
err:
	bch2_fatal_error(c);
	continue_at(cl, journal_write_done, c->io_complete_wq);
}