fs/bcachefs/journal_io.c

// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "sb-clean.h"
#include "trace.h"

void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
			       struct journal_replay *j)
{
	darray_for_each(j->ptrs, i) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
		u64 offset;

		div64_u64_rem(i->sector, ca->mi.bucket_size, &offset);

		if (i != j->ptrs.data)
			prt_printf(out, " ");
		prt_printf(out, "%u:%u:%u (sector %llu)",
			   i->dev, i->bucket, i->bucket_offset, i->sector);
	}
}

static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
					struct journal_replay *j)
{
	prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));

	bch2_journal_ptrs_to_text(out, c, j);

	for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
		struct jset_entry_datetime *datetime =
			container_of(entry, struct jset_entry_datetime, entry);
		bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
		break;
	}
}

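/*
 * Journal entries are checksummed (and optionally encrypted) with a nonce
 * derived from the entry's 64-bit sequence number, split into two __le32
 * words and tagged with BCH_NONCE_JOURNAL, so every journal entry gets a
 * distinct nonce:
 */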
static struct nonce journal_nonce(const struct jset *jset)
{
	return (struct nonce) {{
		[0] = 0,
		[1] = ((__le32 *) &jset->seq)[0],
		[2] = ((__le32 *) &jset->seq)[1],
		[3] = BCH_NONCE_JOURNAL,
	}};
}

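/*
 * Note: a jset with an unknown checksum type is reported as a checksum
 * failure (with *csum zeroed), since we can't verify it either way:
 */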
static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
{
	if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
		*csum = (struct bch_csum) {};
		return false;
	}

	*csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
	return !bch2_crc_cmp(j->csum, *csum);
}

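/*
 * Journal entries live in a genradix indexed by sequence number relative
 * to journal_entries_base_seq; masking with (~0U >> 1) keeps the index in
 * the positive 31-bit range assumed by the +-2 billion window below:
 */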
static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
	return (seq - c->journal_entries_base_seq) & (~0U >> 1);
}

static void __journal_replay_free(struct bch_fs *c,
				  struct journal_replay *i)
{
	struct journal_replay **p =
		genradix_ptr(&c->journal_entries,
			     journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));

	BUG_ON(*p != i);
	*p = NULL;
	kvfree(i);
}

static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
	if (blacklisted)
		i->ignore_blacklisted = true;
	else
		i->ignore_not_dirty = true;

	if (!c->opts.read_entire_journal)
		__journal_replay_free(c, i);
}

struct journal_list {
	struct closure		cl;
	u64			last_seq;
	struct mutex		lock;
	int			ret;
};

#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_ptr entry_ptr,
			     struct journal_list *jlist, struct jset *j)
{
	struct genradix_iter iter;
	struct journal_replay **_i, *i, *dup;
	size_t bytes = vstruct_bytes(j);
	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
	struct printbuf buf = PRINTBUF;
	int ret = JOURNAL_ENTRY_ADD_OK;

	/* Is this entry older than the range we need? */
	if (!c->opts.read_entire_journal &&
	    le64_to_cpu(j->seq) < jlist->last_seq)
		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;

	/*
	 * genradixes are indexed by a ulong, not a u64, so we can't index them
	 * by sequence number directly: Assume instead that they will all fall
	 * within the range of +-2billion of the first one we find.
	 */
	if (!c->journal_entries_base_seq)
		c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);

	/* Drop entries we don't need anymore */
	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
		genradix_for_each_from(&c->journal_entries, iter, _i,
				       journal_entry_radix_idx(c, jlist->last_seq)) {
			i = *_i;

			if (journal_replay_ignore(i))
				continue;

			if (le64_to_cpu(i->j.seq) >= last_seq)
				break;

			journal_replay_free(c, i, false);
		}
	}

	jlist->last_seq = max(jlist->last_seq, last_seq);

	_i = genradix_ptr_alloc(&c->journal_entries,
				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
				GFP_KERNEL);
	if (!_i)
		return -BCH_ERR_ENOMEM_journal_entry_add;

	/*
	 * Duplicate journal entries? If so we want the one that didn't have a
	 * checksum error:
	 */
	dup = *_i;
	if (dup) {
		bool identical = bytes == vstruct_bytes(&dup->j) &&
			!memcmp(j, &dup->j, bytes);
		bool not_identical = !identical &&
			entry_ptr.csum_good &&
			dup->csum_good;

		bool same_device = false;
		darray_for_each(dup->ptrs, ptr)
			if (ptr->dev == ca->dev_idx)
				same_device = true;

		ret = darray_push(&dup->ptrs, entry_ptr);
		if (ret)
			goto out;

		bch2_journal_replay_to_text(&buf, c, dup);

		fsck_err_on(same_device,
			    c, journal_entry_dup_same_device,
			    "duplicate journal entry on same device\n  %s",
			    buf.buf);

		fsck_err_on(not_identical,
			    c, journal_entry_replicas_data_mismatch,
			    "found duplicate but non identical journal entries\n  %s",
			    buf.buf);

		if (entry_ptr.csum_good && !identical)
			goto replace;

		goto out;
	}
replace:
	i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
	if (!i)
		return -BCH_ERR_ENOMEM_journal_entry_add;

	darray_init(&i->ptrs);
	i->csum_good		= entry_ptr.csum_good;
	i->ignore_blacklisted	= false;
	i->ignore_not_dirty	= false;
	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");

	if (dup) {
		/* The first ptr should represent the jset we kept: */
		darray_for_each(dup->ptrs, ptr)
			darray_push(&i->ptrs, *ptr);
		__journal_replay_free(c, dup);
	} else {
		darray_push(&i->ptrs, entry_ptr);
	}

	*_i = i;
out:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
	struct jset_entry *entry;

	for (entry = start; entry != end; entry = vstruct_next(entry))
		memset(entry, 0, sizeof(*entry));
}

#define JOURNAL_ENTRY_REREAD	5
#define JOURNAL_ENTRY_NONE	6
#define JOURNAL_ENTRY_BAD	7

static void journal_entry_err_msg(struct printbuf *out,
				  u32 version,
				  struct jset *jset,
				  struct jset_entry *entry)
{
	prt_str(out, "invalid journal entry, version=");
	bch2_version_to_text(out, version);

	if (entry) {
		prt_str(out, " type=");
		prt_str(out, bch2_jset_entry_types[entry->type]);
	}

	if (!jset) {
		prt_printf(out, " in superblock");
	} else {
		prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));

		if (entry)
			prt_printf(out, " offset=%zi/%u",
				   (u64 *) entry - jset->_data,
				   le32_to_cpu(jset->u64s));
	}

	prt_str(out, ": ");
}

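/*
 * Report a journal entry error: on READ, ask fsck to fix it; on WRITE,
 * count it in the superblock and flag the filesystem inconsistent before
 * the write goes out. Evaluates to true, so it can be used in boolean
 * context via journal_entry_err_on():
 */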
#define journal_entry_err(c, version, jset, entry, _err, msg, ...)	\
({									\
	struct printbuf _buf = PRINTBUF;				\
									\
	journal_entry_err_msg(&_buf, version, jset, entry);		\
	prt_printf(&_buf, msg, ##__VA_ARGS__);				\
									\
	switch (flags & BKEY_INVALID_WRITE) {				\
	case READ:							\
		mustfix_fsck_err(c, _err, "%s", _buf.buf);		\
		break;							\
	case WRITE:							\
		bch2_sb_error_count(c, BCH_FSCK_ERR_##_err);		\
		bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
		if (bch2_fs_inconsistent(c)) {				\
			ret = -BCH_ERR_fsck_errors_not_fixed;		\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
									\
	printbuf_exit(&_buf);						\
	true;								\
})

#define journal_entry_err_on(cond, ...)					\
	((cond) ? journal_entry_err(__VA_ARGS__) : false)

#define FSCK_DELETED_KEY	5

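/*
 * Validate a single bkey in a journal entry; on error the key is dropped
 * by shifting the remaining keys down and nulling the freed range, and
 * FSCK_DELETED_KEY tells the caller not to advance past it:
 */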
static int journal_validate_key(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned level, enum btree_id btree_id,
				struct bkey_i *k,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	int write = flags & BKEY_INVALID_WRITE;
	void *next = vstruct_next(entry);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (journal_entry_err_on(!k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_bkey_u64s_0,
				 "k->u64s 0")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on((void *) bkey_next(k) >
				 (void *) vstruct_next(entry),
				 c, version, jset, entry,
				 journal_entry_bkey_past_end,
				 "extends past end of journal entry")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
				 c, version, jset, entry,
				 journal_entry_bkey_bad_format,
				 "bad format %u", k->k.format)) {
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (!write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));

	if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
			      __btree_node_type(level, btree_id), write, &buf)) {
		printbuf_reset(&buf);
		journal_entry_err_msg(&buf, version, jset, entry);
		prt_newline(&buf);
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
		prt_newline(&buf);
		bch2_bkey_invalid(c, bkey_i_to_s_c(k),
				  __btree_node_type(level, btree_id), write, &buf);

		mustfix_fsck_err(c, journal_entry_bkey_invalid,
				 "%s", buf.buf);

		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);

		printbuf_exit(&buf);
		return FSCK_DELETED_KEY;
	}

	if (write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int journal_entry_btree_keys_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     enum bkey_invalid_flags flags)
{
	struct bkey_i *k = entry->start;

	while (k != vstruct_last(entry)) {
		int ret = journal_validate_key(c, jset, entry,
					       entry->level,
					       entry->btree_id,
					       k, version, big_endian,
					       flags|BKEY_INVALID_JOURNAL);
		if (ret == FSCK_DELETED_KEY)
			continue;

		k = bkey_next(k);
	}

	return 0;
}

static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	bool first = true;

	jset_entry_for_each_key(entry, k) {
		if (!first) {
			prt_newline(out);
			prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
		}
		prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
		first = false;
	}
}

static int journal_entry_btree_root_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     enum bkey_invalid_flags flags)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_btree_root_bad_size,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
				   version, big_endian, flags);
	if (ret == FSCK_DELETED_KEY)
		ret = 0;
fsck_err:
	return ret;
}

static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    enum bkey_invalid_flags flags)
{
	/* obsolete, don't care: */
	return 0;
}

static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
}

static int journal_entry_blacklist_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    enum bkey_invalid_flags flags)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
				 c, version, jset, entry,
				 journal_entry_blacklist_bad_size,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}

static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_blacklist *bl =
		container_of(entry, struct jset_entry_blacklist, entry);

	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}

static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
					       struct jset *jset,
					       struct jset_entry *entry,
					       unsigned version, int big_endian,
					       enum bkey_invalid_flags flags)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_bad_size,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end),
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_start_past_end,
				 "invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}

static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
					       struct jset_entry *entry)
{
	struct jset_entry_blacklist_v2 *bl =
		container_of(entry, struct jset_entry_blacklist_v2, entry);

	prt_printf(out, "start=%llu end=%llu",
		   le64_to_cpu(bl->start),
		   le64_to_cpu(bl->end));
}

static int journal_entry_usage_validate(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					unsigned version, int big_endian,
					enum bkey_invalid_flags flags)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c, version, jset, entry,
				 journal_entry_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);

	prt_printf(out, "type=%s v=%llu",
		   bch2_fs_usage_types[u->entry.btree_id],
		   le64_to_cpu(u->v));
}

static int journal_entry_data_usage_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     enum bkey_invalid_flags flags)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	struct printbuf err = PRINTBUF;
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err),
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: %s", err.buf)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}
out:
fsck_err:
	printbuf_exit(&err);
	return ret;
}

static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);

	bch2_replicas_entry_to_text(out, &u->r);
	prt_printf(out, "=%llu", le64_to_cpu(u->v));
}

static int journal_entry_clock_validate(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					unsigned version, int big_endian,
					enum bkey_invalid_flags flags)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes != sizeof(*clock),
				 c, version, jset, entry,
				 journal_entry_clock_bad_size,
				 "bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(clock->rw > 1,
				 c, version, jset, entry,
				 journal_entry_clock_bad_rw,
				 "bad rw")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);

	prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
}

static int journal_entry_dev_usage_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    enum bkey_invalid_flags flags)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	unsigned expected = sizeof(*u);
	unsigned dev;
	int ret = 0;

	if (journal_entry_err_on(bytes < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	dev = le32_to_cpu(u->dev);

	if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_dev,
				 "bad dev")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(u->pad,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_pad,
				 "bad pad")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

	prt_printf(out, "dev=%u", le32_to_cpu(u->dev));

	for (i = 0; i < nr_types; i++) {
		bch2_prt_data_type(out, i);
		prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
			   le64_to_cpu(u->d[i].buckets),
			   le64_to_cpu(u->d[i].sectors),
			   le64_to_cpu(u->d[i].fragmented));
	}
}

static int journal_entry_log_validate(struct bch_fs *c,
				      struct jset *jset,
				      struct jset_entry *entry,
				      unsigned version, int big_endian,
				      enum bkey_invalid_flags flags)
{
	return 0;
}

static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
				      struct jset_entry *entry)
{
	struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
	unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);

	prt_printf(out, "%.*s", bytes, l->d);
}

static int journal_entry_overwrite_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    enum bkey_invalid_flags flags)
{
	return journal_entry_btree_keys_validate(c, jset, entry,
						 version, big_endian, READ);
}

static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
						    struct jset *jset,
						    struct jset_entry *entry,
						    unsigned version, int big_endian,
						    enum bkey_invalid_flags flags)
{
	return journal_entry_btree_keys_validate(c, jset, entry,
						 version, big_endian, READ);
}

static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
						    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_datetime_validate(struct bch_fs *c,
					   struct jset *jset,
					   struct jset_entry *entry,
					   unsigned version, int big_endian,
					   enum bkey_invalid_flags flags)
{
	unsigned bytes = vstruct_bytes(entry);
	unsigned expected = 16;
	int ret = 0;

	if (journal_entry_err_on(vstruct_bytes(entry) < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}
fsck_err:
	return ret;
}

static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
					   struct jset_entry *entry)
{
	struct jset_entry_datetime *datetime =
		container_of(entry, struct jset_entry_datetime, entry);

	bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
}

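/*
 * Each BCH_JSET_ENTRY_* type gets a validate/to_text pair, generated into
 * the dispatch table below by the BCH_JSET_ENTRY_TYPES() x-macro:
 */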
struct jset_entry_ops {
	int (*validate)(struct bch_fs *, struct jset *,
			struct jset_entry *, unsigned, int,
			enum bkey_invalid_flags);
	void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};

static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)						\
	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
		.validate	= journal_entry_##f##_validate,	\
		.to_text	= journal_entry_##f##_to_text,	\
	},
	BCH_JSET_ENTRY_TYPES()
#undef x
};

int bch2_journal_entry_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	return entry->type < BCH_JSET_ENTRY_NR
		? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
							    version, big_endian, flags)
		: 0;
}

void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
				struct jset_entry *entry)
{
	if (entry->type < BCH_JSET_ENTRY_NR) {
		prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
		bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
	} else {
		prt_printf(out, "(unknown type %u)", entry->type);
	}
}

static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 enum bkey_invalid_flags flags)
{
	unsigned version = le32_to_cpu(jset->version);
	int ret = 0;

	vstruct_for_each(jset, entry) {
		if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
					 c, version, jset, entry,
					 journal_entry_past_jset_end,
					 "journal entry extends past end of jset")) {
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = bch2_journal_entry_validate(c, jset, entry,
						  version, JSET_BIG_ENDIAN(jset), flags);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}

static int jset_validate(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 enum bkey_invalid_flags flags)
{
	unsigned version;
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
				 c, version, jset, NULL,
				 jset_unsupported_version,
				 "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 BCH_VERSION_MAJOR(version),
				 BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
				 c, version, jset, NULL,
				 jset_unknown_csum,
				 "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 JSET_CSUM_TYPE(jset)))
		ret = JOURNAL_ENTRY_BAD;

	/* last_seq is ignored when JSET_NO_FLUSH is true */
	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
				 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
				 c, version, jset, NULL,
				 jset_last_seq_newer_than_seq,
				 "invalid journal entry: last_seq > seq (%llu > %llu)",
				 le64_to_cpu(jset->last_seq),
				 le64_to_cpu(jset->seq))) {
		jset->last_seq = jset->seq;
		return JOURNAL_ENTRY_BAD;
	}

	ret = jset_validate_entries(c, jset, flags);
fsck_err:
	return ret;
}

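/*
 * A trimmed-down version of jset_validate() used while scanning journal
 * buckets, before checksums have been verified: it checks only what's
 * needed to safely size the read and step to the next entry:
 */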
static int jset_validate_early(struct bch_fs *c,
			       struct bch_dev *ca,
			       struct jset *jset, u64 sector,
			       unsigned bucket_sectors_left,
			       unsigned sectors_read)
{
	size_t bytes = vstruct_bytes(jset);
	unsigned version;
	enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
				 c, version, jset, NULL,
				 jset_unsupported_version,
				 "%s sector %llu seq %llu: unknown journal entry version %u.%u",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 BCH_VERSION_MAJOR(version),
				 BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	if (bytes > (sectors_read << 9) &&
	    sectors_read < bucket_sectors_left)
		return JOURNAL_ENTRY_REREAD;

	if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
				 c, version, jset, NULL,
				 jset_past_bucket_end,
				 "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq), bytes))
		le32_add_cpu(&jset->u64s,
			     -((bytes - (bucket_sectors_left << 9)) / 8));
fsck_err:
	return ret;
}

struct journal_read_buf {
	void		*data;
	size_t		size;
};

static int journal_read_buf_realloc(struct journal_read_buf *b,
				    size_t new_size)
{
	void *n;

	/* the bios are sized for this many pages, max: */
	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;

	new_size = roundup_pow_of_two(new_size);
	n = kvmalloc(new_size, GFP_KERNEL);
	if (!n)
		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;

	kvfree(b->data);
	b->data = n;
	b->size = new_size;
	return 0;
}

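/*
 * Read one journal bucket, walking the entries it contains and adding
 * them to the journal_list; a checksum error isn't fatal here, since the
 * same entry may be found intact on another device:
 */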
static int journal_read_bucket(struct bch_dev *ca,
			       struct journal_read_buf *buf,
			       struct journal_list *jlist,
			       unsigned bucket)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	struct jset *j = NULL;
	unsigned sectors, sectors_read = 0;
	u64 offset	= bucket_to_sector(ca, ja->buckets[bucket]),
	    end		= offset + ca->mi.bucket_size;
	bool saw_bad = false, csum_good;
	struct printbuf err = PRINTBUF;
	int ret = 0;

	pr_debug("reading %u", bucket);

	while (offset < end) {
		if (!sectors_read) {
			struct bio *bio;
			unsigned nr_bvecs;
reread:
			sectors_read = min_t(unsigned,
					     end - offset, buf->size >> 9);
			nr_bvecs = buf_pages(buf->data, sectors_read << 9);

			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

			bio->bi_iter.bi_sector = offset;
			bch2_bio_map(bio, buf->data, sectors_read << 9);

			ret = submit_bio_wait(bio);
			kfree(bio);

			if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
					       "journal read error: sector %llu",
					       offset) ||
			    bch2_meta_read_fault("journal")) {
				/*
				 * We don't error out of the recovery process
				 * here, since the relevant journal entry may be
				 * found on a different device, and missing or
				 * no journal entries will be handled later
				 */
				goto out;
			}

			j = buf->data;
		}

		ret = jset_validate_early(c, ca, j, offset,
					  end - offset, sectors_read);
		switch (ret) {
		case 0:
			sectors = vstruct_sectors(j, c->block_bits);
			break;
		case JOURNAL_ENTRY_REREAD:
			if (vstruct_bytes(j) > buf->size) {
				ret = journal_read_buf_realloc(buf,
							       vstruct_bytes(j));
				if (ret)
					goto err;
			}
			goto reread;
		case JOURNAL_ENTRY_NONE:
			if (!saw_bad)
				goto out;
			/*
			 * On checksum error we don't really trust the size
			 * field of the journal entry we read, so try reading
			 * again at next block boundary:
			 */
			sectors = block_sectors(c);
			goto next_block;
		default:
			goto err;
		}

		/*
		 * This happens sometimes if we don't have discards on -
		 * when we've partially overwritten a bucket with new
		 * journal entries. We don't need the rest of the
		 * bucket:
		 */
		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
			goto out;

		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

		enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
		struct bch_csum csum;
		csum_good = jset_csum_good(c, j, &csum);

		if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
				       "%s",
				       (printbuf_reset(&err),
					prt_str(&err, "journal "),
					bch2_csum_err_msg(&err, csum_type, j->csum, csum),
					err.buf)))
			saw_bad = true;

		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
				   j->encrypted_start,
				   vstruct_end(j) - (void *) j->encrypted_start);
		bch2_fs_fatal_err_on(ret, c,
				     "error decrypting journal entry: %s",
				     bch2_err_str(ret));

		mutex_lock(&jlist->lock);
		ret = journal_entry_add(c, ca, (struct journal_ptr) {
					.csum_good	= csum_good,
					.dev		= ca->dev_idx,
					.bucket		= bucket,
					.bucket_offset	= offset -
						bucket_to_sector(ca, ja->buckets[bucket]),
					.sector		= offset,
					}, jlist, j);
		mutex_unlock(&jlist->lock);

		switch (ret) {
		case JOURNAL_ENTRY_ADD_OK:
			break;
		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
			break;
		default:
			goto err;
		}
next_block:
		pr_debug("next");
		offset		+= sectors;
		sectors_read	-= sectors;
		j = ((void *) j) + (sectors << 9);
	}

out:
	ret = 0;
err:
	printbuf_exit(&err);
	return ret;
}

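/*
 * Per-device read path, run in its own closure: scans every journal
 * bucket on the device, then uses the newest entry found on this device
 * to position cur_idx and sectors_free for subsequent journal writes:
 */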
static CLOSURE_CALLBACK(bch2_journal_read_device)
{
	closure_type(ja, struct journal_device, read);
	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
	struct bch_fs *c = ca->fs;
	struct journal_list *jlist =
		container_of(cl->parent, struct journal_list, cl);
	struct journal_replay *r, **_r;
	struct genradix_iter iter;
	struct journal_read_buf buf = { NULL, 0 };
	unsigned i;
	int ret = 0;

	if (!ja->nr)
		goto out;

	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
	if (ret)
		goto err;

	pr_debug("%u journal buckets", ja->nr);

	for (i = 0; i < ja->nr; i++) {
		ret = journal_read_bucket(ca, &buf, jlist, i);
		if (ret)
			goto err;
	}

	ja->sectors_free = ca->mi.bucket_size;

	mutex_lock(&jlist->lock);
	genradix_for_each_reverse(&c->journal_entries, iter, _r) {
		r = *_r;

		if (!r)
			continue;

		darray_for_each(r->ptrs, i)
			if (i->dev == ca->dev_idx) {
				unsigned wrote = bucket_remainder(ca, i->sector) +
					vstruct_sectors(&r->j, c->block_bits);

				ja->cur_idx = i->bucket;
				ja->sectors_free = ca->mi.bucket_size - wrote;
				goto found;
			}
	}
found:
	mutex_unlock(&jlist->lock);

	if (ja->bucket_seq[ja->cur_idx] &&
	    ja->sectors_free == ca->mi.bucket_size) {
#if 0
		/*
		 * Debug code for ZNS support, where we (probably) want to
		 * correlate where we stopped in the journal with the zone
		 * write points:
		 */
		bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
		bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
		for (i = 0; i < 3; i++) {
			unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;

			bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
		}
#endif
		ja->sectors_free = 0;
	}

	/*
	 * Set dirty_idx to indicate the entire journal is full and needs to be
	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
	 * pinned when it first runs:
	 */
	ja->discard_idx = ja->dirty_idx_ondisk =
		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
	kvfree(buf.data);
	percpu_ref_put(&ca->io_ref);
	closure_return(cl);
	return;
err:
	mutex_lock(&jlist->lock);
	jlist->ret = ret;
	mutex_unlock(&jlist->lock);
	goto out;
}

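/*
 * Read the journal from every device, then decide what's actually
 * replayable: find the newest flush entry, blacklist any newer non-flush
 * entries, drop everything older than its last_seq, and complain about
 * gaps in the sequence numbers that aren't covered by a blacklist:
 */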
int bch2_journal_read(struct bch_fs *c,
		      u64 *last_seq,
		      u64 *blacklist_seq,
		      u64 *start_seq)
{
	struct journal_list jlist;
	struct journal_replay *i, **_i, *prev = NULL;
	struct genradix_iter radix_iter;
	struct printbuf buf = PRINTBUF;
	bool degraded = false, last_write_torn = false;
	u64 seq;
	int ret = 0;

	closure_init_stack(&jlist.cl);
	mutex_init(&jlist.lock);
	jlist.last_seq = 0;
	jlist.ret = 0;

	for_each_member_device(c, ca) {
		if (!c->opts.fsck &&
		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
			continue;

		if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
		     ca->mi.state == BCH_MEMBER_STATE_ro) &&
		    percpu_ref_tryget(&ca->io_ref))
			closure_call(&ca->journal.read,
				     bch2_journal_read_device,
				     system_unbound_wq,
				     &jlist.cl);
		else
			degraded = true;
	}

	closure_sync(&jlist.cl);

	if (jlist.ret)
		return jlist.ret;

	*last_seq	= 0;
	*start_seq	= 0;
	*blacklist_seq	= 0;

	/*
	 * Find most recent flush entry, and ignore newer non flush entries -
	 * those entries will be blacklisted:
	 */
	genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
		enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;

		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		if (!*start_seq)
			*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;

		if (JSET_NO_FLUSH(&i->j)) {
			i->ignore_blacklisted = true;
			continue;
		}

		if (!last_write_torn && !i->csum_good) {
			last_write_torn = true;
			i->ignore_blacklisted = true;
			continue;
		}

		if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
					 c, le32_to_cpu(i->j.version), &i->j, NULL,
					 jset_last_seq_newer_than_seq,
					 "invalid journal entry: last_seq > seq (%llu > %llu)",
					 le64_to_cpu(i->j.last_seq),
					 le64_to_cpu(i->j.seq)))
			i->j.last_seq = i->j.seq;

		*last_seq	= le64_to_cpu(i->j.last_seq);
		*blacklist_seq	= le64_to_cpu(i->j.seq) + 1;
		break;
	}

	if (!*start_seq) {
		bch_info(c, "journal read done, but no entries found");
		return 0;
	}

	if (!*last_seq) {
		fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
			 "journal read done, but no entries found after dropping non-flushes");
		return 0;
	}

	bch_info(c, "journal read done, replaying entries %llu-%llu",
		 *last_seq, *blacklist_seq - 1);

	if (*start_seq != *blacklist_seq)
		bch_info(c, "dropped unflushed entries %llu-%llu",
			 *blacklist_seq, *start_seq - 1);

	/* Drop blacklisted entries and entries older than last_seq: */
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		seq = le64_to_cpu(i->j.seq);
		if (seq < *last_seq) {
			journal_replay_free(c, i, false);
			continue;
		}

		if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
				    jset_seq_blacklisted,
				    "found blacklisted journal entry %llu", seq);
			i->ignore_blacklisted = true;
		}
	}

	/* Check for missing entries: */
	seq = *last_seq;
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		BUG_ON(seq > le64_to_cpu(i->j.seq));

		while (seq < le64_to_cpu(i->j.seq)) {
			u64 missing_start, missing_end;
			struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

			while (seq < le64_to_cpu(i->j.seq) &&
			       bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (seq == le64_to_cpu(i->j.seq))
				break;

			missing_start = seq;

			while (seq < le64_to_cpu(i->j.seq) &&
			       !bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (prev) {
				bch2_journal_ptrs_to_text(&buf1, c, prev);
				prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
			} else
				prt_printf(&buf1, "(none)");
			bch2_journal_ptrs_to_text(&buf2, c, i);

			missing_end = seq - 1;
			fsck_err(c, journal_entries_missing,
				 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
				 "  prev at %s\n"
				 "  next at %s",
				 missing_start, missing_end,
				 *last_seq, *blacklist_seq - 1,
				 buf1.buf, buf2.buf);

			printbuf_exit(&buf1);
			printbuf_exit(&buf2);
		}

		prev = i;
		seq++;
	}

	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		struct bch_replicas_padded replicas = {
			.e.data_type = BCH_DATA_journal,
			.e.nr_required = 1,
		};

		i = *_i;
		if (journal_replay_ignore(i))
			continue;

		darray_for_each(i->ptrs, ptr) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

			if (!ptr->csum_good)
				bch_err_dev_offset(ca, ptr->sector,
						   "invalid journal checksum, seq %llu%s",
						   le64_to_cpu(i->j.seq),
						   i->csum_good ? " (had good copy on another device)" : "");
		}

		ret = jset_validate(c,
				    bch_dev_bkey_exists(c, i->ptrs.data[0].dev),
				    &i->j,
				    i->ptrs.data[0].sector,
				    READ);
		if (ret)
			goto err;

		darray_for_each(i->ptrs, ptr)
			replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;

		bch2_replicas_entry_sort(&replicas.e);

		printbuf_reset(&buf);
		bch2_replicas_entry_to_text(&buf, &replicas.e);

		if (!degraded &&
		    !bch2_replicas_marked(c, &replicas.e) &&
		    (le64_to_cpu(i->j.seq) == *last_seq ||
		     fsck_err(c, journal_entry_replicas_not_marked,
			      "superblock not marked as containing replicas for journal entry %llu\n  %s",
			      le64_to_cpu(i->j.seq), buf.buf))) {
			ret = bch2_mark_replicas(c, &replicas.e);
			if (ret)
				goto err;
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* journal write: */

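/*
 * Helper for journal_write_alloc(): walks the devices in stripe order,
 * appending a pointer to w->key for each device that's usable and doesn't
 * already hold a copy of this entry, until replicas_want is reached:
 */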
static void __journal_write_alloc(struct journal *j,
				  struct journal_buf *w,
				  struct dev_alloc_list *devs_sorted,
				  unsigned sectors,
				  unsigned *replicas,
				  unsigned replicas_want)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_device *ja;
	struct bch_dev *ca;
	unsigned i;

	if (*replicas >= replicas_want)
		return;

	for (i = 0; i < devs_sorted->nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		/*
		 * Check that we can use this device, and aren't already using
		 * it:
		 */
		if (!ca->mi.durability ||
		    ca->mi.state != BCH_MEMBER_STATE_rw ||
		    !ja->nr ||
		    bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
		    sectors > ja->sectors_free)
			continue;

		bch2_dev_stripe_increment(ca, &j->wp.stripe);

		bch2_bkey_append_ptr(&w->key,
			(struct bch_extent_ptr) {
				  .offset = bucket_to_sector(ca,
					ja->buckets[ja->cur_idx]) +
					ca->mi.bucket_size -
					ja->sectors_free,
				  .dev = ca->dev_idx,
		});

		ja->sectors_free -= sectors;
		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

		*replicas += ca->mi.durability;

		if (*replicas >= replicas_want)
			break;
	}
}

/**
 * journal_write_alloc - decide where to write next journal entry
 *
 * @j:		journal object
 * @w:		journal buf (entry to be written)
 *
 * Returns: 0 on success, or -EROFS on failure
 */
static int journal_write_alloc(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_devs_mask devs;
	struct journal_device *ja;
	struct bch_dev *ca;
	struct dev_alloc_list devs_sorted;
	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
	unsigned target = c->opts.metadata_target ?:
		c->opts.foreground_target;
	unsigned i, replicas = 0, replicas_want =
		READ_ONCE(c->opts.metadata_replicas);
	unsigned replicas_need = min_t(unsigned, replicas_want,
				       READ_ONCE(c->opts.metadata_replicas_required));

	rcu_read_lock();
retry:
	devs = target_rw_devs(c, BCH_DATA_journal, target);

	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);

	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas >= replicas_want)
		goto done;

	for (i = 0; i < devs_sorted.nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		if (sectors > ja->sectors_free &&
		    sectors <= ca->mi.bucket_size &&
		    bch2_journal_dev_buckets_available(j, ja,
					journal_space_discarded)) {
			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
			ja->sectors_free = ca->mi.bucket_size;

			/*
			 * ja->bucket_seq[ja->cur_idx] must always have
			 * something sensible:
			 */
			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
		}
	}

	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas < replicas_want && target) {
		/* Retry from all devices: */
		target = 0;
		goto retry;
	}
done:
	rcu_read_unlock();

	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);

	return replicas >= replicas_need ? 0 : -EROFS;
}

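/*
 * Grow the journal buffer to buf_size_want if a larger size has been
 * requested; the btree write buffer is resized along with it (to
 * new_size / 64), since its capacity scales with journal entry size:
 */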
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	/* we aren't holding j->lock: */
	unsigned new_size = READ_ONCE(j->buf_size_want);
	void *new_buf;

	if (buf->buf_size >= new_size)
		return;

	size_t btree_write_buffer_size = new_size / 64;

	if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
		return;

	new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
	if (!new_buf)
		return;

	memcpy(new_buf, buf->data, buf->buf_size);

	spin_lock(&j->lock);
	swap(buf->data,		new_buf);
	swap(buf->buf_size,	new_size);
	spin_unlock(&j->lock);

	kvfree(new_buf);
}

static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
	return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}

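/*
 * Write completion: journal writes can complete out of order, so only
 * advance the on-disk sequence numbers over the contiguous prefix of
 * buffers whose writes have finished, waking waiters and kicking reclaim
 * as that state becomes visible:
 */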
static CLOSURE_CALLBACK(journal_write_done)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_replicas_padded replicas;
	union journal_res_state old, new;
	u64 v, seq = le64_to_cpu(w->data->seq);
	int err = 0;

	bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
			       ? j->flush_write_time
			       : j->noflush_write_time, j->write_start_time);

	if (!w->devs_written.nr) {
		bch_err(c, "unable to write journal to sufficient devices");
		err = -EIO;
	} else {
		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
					 w->devs_written);
		if (bch2_mark_replicas(c, &replicas.e))
			err = -EIO;
	}

	if (err)
		bch2_fatal_error(c);

	closure_debug_destroy(cl);

	spin_lock(&j->lock);
	if (seq >= j->pin.front)
		journal_seq_pin(j, seq)->devs = w->devs_written;
	if (err && (!j->err_seq || seq < j->err_seq))
		j->err_seq = seq;
	w->write_done = true;

	bool completed = false;

	for (seq = journal_last_unwritten_seq(j);
	     seq <= journal_cur_seq(j);
	     seq++) {
		w = j->buf + (seq & JOURNAL_BUF_MASK);
		if (!w->write_done)
			break;

		if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
			j->flushed_seq_ondisk = seq;
			j->last_seq_ondisk = w->last_seq;

			bch2_do_discards(c);
			closure_wake_up(&c->freelist_wait);
			bch2_reset_alloc_cursors(c);
		}

		j->seq_ondisk = seq;

		/*
		 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
		 * more buckets:
		 *
		 * Must come before signaling write completion, for
		 * bch2_fs_journal_stop():
		 */
		if (j->watermark != BCH_WATERMARK_stripe)
			journal_reclaim_kick(&c->journal);

		v = atomic64_read(&j->reservations.counter);
		do {
			old.v = new.v = v;
			BUG_ON(journal_state_count(new, new.unwritten_idx));
			BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));

			new.unwritten_idx++;
		} while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);

		closure_wake_up(&w->wait);
		completed = true;
	}

	if (completed) {
		bch2_journal_reclaim_fast(j);
		bch2_journal_space_available(j);

		track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);

		journal_wake(j);
	}

	if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
	    new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
		struct journal_buf *buf = journal_cur_buf(j);
		long delta = buf->expires - jiffies;

		/*
		 * We don't close a journal entry to write it while there are
		 * previous entries still in flight - the current journal entry
		 * might want to be written now:
		 */
		mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
	}

	spin_unlock(&j->lock);
}

static void journal_write_endio(struct bio *bio)
{
	struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
	struct bch_dev *ca = jbio->ca;
	struct journal *j = &ca->fs->journal;
	struct journal_buf *w = j->buf + jbio->buf_idx;

	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
			       "error writing journal entry %llu: %s",
			       le64_to_cpu(w->data->seq),
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("journal")) {
		unsigned long flags;

		spin_lock_irqsave(&j->err_lock, flags);
		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
		spin_unlock_irqrestore(&j->err_lock, flags);
	}

	closure_put(&w->io);
	percpu_ref_put(&ca->io_ref);
}

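/*
 * Submit the write: one bio per device pointer in w->key. Flush writes
 * get REQ_FUA, plus REQ_PREFLUSH unless a separate flush has already been
 * issued for this write:
 */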
d4e3b928 1727static CLOSURE_CALLBACK(do_journal_write)
280249b9 1728{
38789c25
KO
1729 closure_type(w, struct journal_buf, io);
1730 struct journal *j = container_of(w, struct journal, buf[w->idx]);
280249b9 1731 struct bch_fs *c = container_of(j, struct bch_fs, journal);
280249b9
KO
1732 unsigned sectors = vstruct_sectors(w->data, c->block_bits);
1733
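	/*
	 * One bio per replica: w->key carries a pointer for each device this
	 * entry was allocated on.
	 */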
1734 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
51654002
KO
1735 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
1736 struct journal_device *ja = &ca->journal;
1737
280249b9
KO
1738 if (!percpu_ref_tryget(&ca->io_ref)) {
1739 /* XXX: fix this */
1740 bch_err(c, "missing device for journal write");
1741 continue;
1742 }
1743
1744 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
1745 sectors);
1746
38789c25 1747 struct bio *bio = &ja->bio[w->idx]->bio;
280249b9
KO
1748 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
1749 bio->bi_iter.bi_sector = ptr->offset;
1750 bio->bi_end_io = journal_write_endio;
1751 bio->bi_private = ca;
1752
a28bd48a
KO
1753 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
1754 ca->prev_journal_sector = bio->bi_iter.bi_sector;
1755
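		/*
		 * Flush entries need REQ_FUA so the entry itself is durable
		 * on completion; REQ_PREFLUSH additionally flushes prior
		 * writes from the device cache, unless that flush was already
		 * issued as a separate bio (w->separate_flush).
		 */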
280249b9
KO
1756 if (!JSET_NO_FLUSH(w->data))
1757 bio->bi_opf |= REQ_FUA;
1758 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
1759 bio->bi_opf |= REQ_PREFLUSH;
1760
1761 bch2_bio_map(bio, w->data, sectors << 9);
1762
674cfc26 1763 trace_and_count(c, journal_write, bio);
280249b9
KO
1764 closure_bio_submit(bio, cl);
1765
51654002 1766 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
280249b9
KO
1767 }
1768
656f05d8 1769 continue_at(cl, journal_write_done, j->wq);
280249b9
KO
1770}
1771
769b3600 1772static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
9f6db127 1773{
769b3600 1774 struct bch_fs *c = container_of(j, struct bch_fs, journal);
cea07a7b 1775 struct jset_entry *start, *end;
769b3600 1776 struct jset *jset = w->data;
09caeabe 1777 struct journal_keys_to_wb wb = { NULL };
769b3600 1778 unsigned sectors, bytes, u64s;
769b3600 1779 unsigned long btree_roots_have = 0;
09caeabe
KO
1780 bool validate_before_checksum = false;
1781 u64 seq = le64_to_cpu(jset->seq);
769b3600 1782 int ret;
9f6db127
KO
1783
1784 /*
1785 * Simple compaction, dropping empty jset_entries (from journal
1786 * reservations that weren't fully used) and merging jset_entries that
1787 * can be.
1788 *
1789 * If we wanted to be really fancy here, we could sort all the keys in
1790 * the jset and drop keys that were overwritten - probably not worth it:
1791 */
73ffa530 1792 vstruct_for_each(jset, i) {
9f6db127
KO
1793 unsigned u64s = le16_to_cpu(i->u64s);
1794
1795 /* Empty entry: */
1796 if (!u64s)
1797 continue;
1798
769b3600
KO
1799 /*
1800 * New btree roots are set by journalling them; when the journal
1801 * entry gets written we have to propagate them to
1802 * c->btree_roots
1803 *
1804 * But, every journal entry we write has to contain all the
1805 * btree roots (at least for now); so after we copy btree roots
1806 * to c->btree_roots we have to get any missing btree roots and
1807 * add them to this journal entry:
1808 */
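		/*
		 * Keys bound for the btree write buffer are copied into it
		 * here; the jset entry is then retyped to plain btree_keys
		 * for the on-disk journal:
		 */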
09caeabe
KO
1809 switch (i->type) {
1810 case BCH_JSET_ENTRY_btree_root:
9f6db127 1811 bch2_journal_entry_to_btree_root(c, i);
769b3600 1812 __set_bit(i->btree_id, &btree_roots_have);
09caeabe
KO
1813 break;
1814 case BCH_JSET_ENTRY_write_buffer_keys:
1815 EBUG_ON(!w->need_flush_to_write_buffer);
1816
1817 if (!wb.wb)
1818 bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
1819
09caeabe
KO
1820 jset_entry_for_each_key(i, k) {
1821 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
1822 if (ret) {
1823 bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
1824 bch2_journal_keys_to_write_buffer_end(c, &wb);
1825 return ret;
1826 }
1827 }
1828 i->type = BCH_JSET_ENTRY_btree_keys;
1829 break;
769b3600 1830 }
9f6db127
KO
1831 }
1832
09caeabe
KO
1833 if (wb.wb)
1834 bch2_journal_keys_to_write_buffer_end(c, &wb);
d9290c99
KO
1835
1836 spin_lock(&c->journal.lock);
09caeabe 1837 w->need_flush_to_write_buffer = false;
d9290c99 1838 spin_unlock(&c->journal.lock);
09caeabe 1839
00b8ccf7
KO
1840 start = end = vstruct_last(jset);
1841
769b3600 1842 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
00b8ccf7 1843
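	/*
	 * Stamp the entry with wall-clock time, so journal inspection tools
	 * can date it:
	 */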
52f7d75e
KO
1844 struct jset_entry_datetime *d =
1845 container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
1846 d->entry.type = BCH_JSET_ENTRY_datetime;
1847 d->seconds = cpu_to_le64(ktime_get_real_seconds());
1848
09caeabe 1849 bch2_journal_super_entries_add_common(c, &end, seq);
3ccc5c50
KO
1850 u64s = (u64 *) end - (u64 *) start;
1851 BUG_ON(u64s > j->entry_u64s_reserved);
1852
d16b4a77 1853 le32_add_cpu(&jset->u64s, u64s);
4a2e5d7b
KO
1854
1855 sectors = vstruct_sectors(jset, c->block_bits);
1856 bytes = vstruct_bytes(jset);
1857
1858 if (sectors > w->sectors) {
1859 bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
1860 vstruct_bytes(jset), w->sectors << 9,
1861 u64s, w->u64s_reserved, j->entry_u64s_reserved);
80396a47 1862 return -EINVAL;
4a2e5d7b 1863 }
1c6fdbd8 1864
1c6fdbd8 1865 jset->magic = cpu_to_le64(jset_magic(c));
a02a0121 1866 jset->version = cpu_to_le32(c->sb.version);
1c6fdbd8
KO
1867
1868 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
1869 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
1870
4141fde0 1871 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
09caeabe 1872 j->last_empty_seq = seq;
158eecb8 1873
26609b61
KO
1874 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
1875 validate_before_checksum = true;
1876
e751c01a 1877 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
26609b61
KO
1878 validate_before_checksum = true;
1879
1880 if (validate_before_checksum &&
80396a47
KO
1881 (ret = jset_validate(c, NULL, jset, 0, WRITE)))
1882 return ret;
1c6fdbd8 1883
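	/*
	 * When an encryption checksum type is in use, everything from
	 * encrypted_start on is encrypted in place; the header (magic, seq,
	 * csum) stays in the clear, and the checksum below covers the
	 * encrypted contents.
	 */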
a9de137b 1884 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
1c6fdbd8
KO
1885 jset->encrypted_start,
1886 vstruct_end(jset) - (void *) jset->encrypted_start);
a9de137b
KO
1887 if (bch2_fs_fatal_err_on(ret, c,
1888 "error encrypting journal entry: %i", ret))
80396a47 1889 return ret;
1c6fdbd8
KO
1890
1891 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
1892 journal_nonce(jset), jset);
1893
26609b61 1894 if (!validate_before_checksum &&
80396a47
KO
1895 (ret = jset_validate(c, NULL, jset, 0, WRITE)))
1896 return ret;
1c6fdbd8 1897
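	/* Zero the tail so whole sectors are written without stale memory: */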
d16b4a77 1898 memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
80396a47
KO
1899 return 0;
1900}
1901
1902static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
1903{
1904 struct bch_fs *c = container_of(j, struct bch_fs, journal);
1905 int error = bch2_journal_error(j);
1906
1907 /*
1908 * If the journal is in an error state - we did an emergency shutdown -
1909 * we prefer to continue doing journal writes. We just mark them as
1910 * noflush so they'll never be used, but they'll still be visible to the
1911 * list_journal tool - this helps in debugging.
1912 *
1913 * There's a caveat: the first journal write after marking the
1914 * superblock dirty must always be a flush write, because on startup
1915 * from a clean shutdown we didn't necessarily read the journal and the
1916 * new journal write might overwrite whatever was in the journal
1917 * previously - we can't leave the journal without any flush writes in
1918 * it.
1919 *
1920 * So if we're in an error state, and we're still starting up, we don't
1921 * write anything at all.
1922 */
1923 if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
1924 return -EIO;
1925
1926 if (error ||
1927 w->noflush ||
1928 (!w->must_flush &&
1929 (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
1930 test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
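		/*
		 * Noflush entry: not a commit point, so it advertises no
		 * last_seq - nothing may be reclaimed on the strength of it:
		 */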
225879f4 1931 w->noflush = true;
80396a47
KO
1932 SET_JSET_NO_FLUSH(w->data, true);
1933 w->data->last_seq = 0;
1934 w->last_seq = 0;
1935
1936 j->nr_noflush_writes++;
1937 } else {
7efa2875 1938 w->must_flush = true;
80396a47
KO
1939 j->last_flush_write = jiffies;
1940 j->nr_flush_writes++;
1941 clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
1942 }
1943
1944 return 0;
1945}
1946
d4e3b928 1947CLOSURE_CALLBACK(bch2_journal_write)
80396a47 1948{
38789c25
KO
1949 closure_type(w, struct journal_buf, io);
1950 struct journal *j = container_of(w, struct journal, buf[w->idx]);
80396a47 1951 struct bch_fs *c = container_of(j, struct bch_fs, journal);
80396a47 1952 struct bch_replicas_padded replicas;
80396a47 1953 struct printbuf journal_debug_buf = PRINTBUF;
9fea2274 1954 unsigned nr_rw_members = 0;
80396a47
KO
1955 int ret;
1956
d9290c99
KO
1957 for_each_rw_member(c, ca)
1958 nr_rw_members++;
1959
80396a47 1960 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
d9290c99 1961 BUG_ON(!w->write_started);
916abefd 1962 BUG_ON(w->write_allocated);
d9290c99 1963 BUG_ON(w->write_done);
80396a47
KO
1964
1965 j->write_start_time = local_clock();
1c6fdbd8 1966
e5a66496 1967 spin_lock(&j->lock);
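	/*
	 * With multiple rw devices, the cache flush is sent as its own bio
	 * to every device instead of a REQ_PREFLUSH on the journal write
	 * itself:
	 */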
d9290c99
KO
1968 if (nr_rw_members > 1)
1969 w->separate_flush = true;
1970
80396a47
KO
1971 ret = bch2_journal_write_pick_flush(j, w);
1972 spin_unlock(&j->lock);
1973 if (ret)
1974 goto err;
1975
b05c0e93 1976 mutex_lock(&j->buf_lock);
769b3600
KO
1977 journal_buf_realloc(j, w);
1978
80396a47 1979 ret = bch2_journal_write_prep(j, w);
b05c0e93 1980 mutex_unlock(&j->buf_lock);
80396a47
KO
1981 if (ret)
1982 goto err;
1983
fa5df9e7
KO
1984 j->entry_bytes_written += vstruct_bytes(w->data);
1985
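	/*
	 * If allocation fails while journal buckets are still awaiting
	 * discard, do the discards and retry before giving up:
	 */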
80396a47
KO
1986 while (1) {
1987 spin_lock(&j->lock);
1988 ret = journal_write_alloc(j, w);
1989 if (!ret || !j->can_discard)
1990 break;
e5a66496 1991
c18dade6
KO
1992 spin_unlock(&j->lock);
1993 bch2_journal_do_discards(j);
c18dade6
KO
1994 }
1995
80396a47 1996 if (ret) {
fa8e94fa 1997 __bch2_journal_debug_to_text(&journal_debug_buf, j);
80396a47
KO
1998 spin_unlock(&j->lock);
1999 bch_err(c, "Unable to allocate journal write:\n%s",
2000 journal_debug_buf.buf);
2001 printbuf_exit(&journal_debug_buf);
2002 goto err;
2003 }
85674154 2004
e5a66496
KO
2005 /*
2006 * write is allocated, no longer need to account for it in
2007 * bch2_journal_space_available():
2008 */
2009 w->sectors = 0;
916abefd 2010 w->write_allocated = true;
e5a66496
KO
2011
2012 /*
2013 * journal entry has been compacted and allocated, recalculate space
2014 * available:
2015 */
2016 bch2_journal_space_available(j);
916abefd 2017 bch2_journal_do_writes(j);
e5a66496
KO
2018 spin_unlock(&j->lock);
2019
d797ca3d
KO
2020 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
2021
b66b2bc0 2022 if (c->opts.nochanges)
1c6fdbd8
KO
2023 goto no_io;
2024
a7b29b8d
BF
2025 /*
2026 * Mark journal replicas before we submit the write to guarantee
2027 * recovery will find the journal entries after a crash.
2028 */
2029 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
2030 w->devs_written);
2031 ret = bch2_mark_replicas(c, &replicas.e);
2032 if (ret)
2033 goto err;
2034
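	/*
	 * A flush entry must wait for all previous entries to hit the
	 * device before being issued: the preflush only covers writes that
	 * have already completed.
	 */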
38789c25
KO
2035 if (!JSET_NO_FLUSH(w->data))
2036 closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));
2037
80396a47 2038 if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
9fea2274 2039 for_each_rw_member(c, ca) {
280249b9 2040 percpu_ref_get(&ca->io_ref);
1c6fdbd8 2041
51654002 2042 struct journal_device *ja = &ca->journal;
38789c25 2043 struct bio *bio = &ja->bio[w->idx]->bio;
3e44f325 2044 bio_reset(bio, ca->disk_sb.bdev,
bdec47f5 2045 REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
280249b9
KO
2046 bio->bi_end_io = journal_write_endio;
2047 bio->bi_private = ca;
2048 closure_bio_submit(bio, cl);
2049 }
1c6fdbd8
KO
2050 }
2051
656f05d8 2052 continue_at(cl, do_journal_write, j->wq);
280249b9 2053 return;
1c6fdbd8 2054no_io:
656f05d8 2055 continue_at(cl, journal_write_done, j->wq);
1c6fdbd8
KO
2056 return;
2057err:
b74b147d 2058 bch2_fatal_error(c);
656f05d8 2059 continue_at(cl, journal_write_done, j->wq);
1c6fdbd8 2060}