fs/bcachefs/journal_io.c

// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "error.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "trace.h"

struct journal_list {
	struct closure		cl;
	struct mutex		lock;
	struct list_head	*head;
	int			ret;
};

#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_list *jlist, struct jset *j,
			     bool bad)
{
	struct journal_replay *i, *pos;
	struct bch_devs_list devs = { .nr = 0 };
	struct list_head *where;
	size_t bytes = vstruct_bytes(j);
	__le64 last_seq;
	int ret;

	last_seq = !list_empty(jlist->head)
		? list_last_entry(jlist->head, struct journal_replay,
				  list)->j.last_seq
		: 0;

	if (!c->opts.read_entire_journal) {
		/* Is this entry older than the range we need? */
		if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
			ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
			goto out;
		}

		/* Drop entries we don't need anymore */
		list_for_each_entry_safe(i, pos, jlist->head, list) {
			if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
				break;
			list_del(&i->list);
			kvpfree(i, offsetof(struct journal_replay, j) +
				vstruct_bytes(&i->j));
		}
	}

	list_for_each_entry_reverse(i, jlist->head, list) {
		if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
			where = &i->list;
			goto add;
		}
	}

	where = jlist->head;
add:
	i = where->next != jlist->head
		? container_of(where->next, struct journal_replay, list)
		: NULL;

	/*
	 * Duplicate journal entries? If so we want the one that didn't have a
	 * checksum error:
	 */
	if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
		if (i->bad) {
			devs = i->devs;
			list_del(&i->list);
			kvpfree(i, offsetof(struct journal_replay, j) +
				vstruct_bytes(&i->j));
		} else if (bad) {
			goto found;
		} else {
			fsck_err_on(bytes != vstruct_bytes(&i->j) ||
				    memcmp(j, &i->j, bytes), c,
				    "found duplicate but non identical journal entries (seq %llu)",
				    le64_to_cpu(j->seq));
			goto found;
		}
	}

	i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
	if (!i) {
		ret = -ENOMEM;
		goto out;
	}

	list_add(&i->list, where);
	i->devs	= devs;
	i->bad	= bad;
	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
found:
	if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
		bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
	else
		fsck_err_on(1, c, "duplicate journal entries on same device");
	ret = JOURNAL_ENTRY_ADD_OK;
out:
fsck_err:
	return ret;
}

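/*
 * The nonce used for journal checksums and encryption is derived from the
 * entry's sequence number, so it is unique per journal entry:
 */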
static struct nonce journal_nonce(const struct jset *jset)
{
	return (struct nonce) {{
		[0] = 0,
		[1] = ((__le32 *) &jset->seq)[0],
		[2] = ((__le32 *) &jset->seq)[1],
		[3] = BCH_NONCE_JOURNAL,
	}};
}

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
	struct jset_entry *entry;

	for (entry = start; entry != end; entry = vstruct_next(entry))
		memset(entry, 0, sizeof(*entry));
}

#define JOURNAL_ENTRY_REREAD	5
#define JOURNAL_ENTRY_NONE	6
#define JOURNAL_ENTRY_BAD	7

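/*
 * Error handling differs by direction: on the read path a validation failure
 * is a fixable fsck error; on the write path it means we are about to write
 * corrupt metadata, so we log it and mark the filesystem inconsistent instead.
 */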
#define journal_entry_err(c, msg, ...)					\
({									\
	switch (write) {						\
	case READ:							\
		mustfix_fsck_err(c, msg, ##__VA_ARGS__);		\
		break;							\
	case WRITE:							\
		bch_err(c, "corrupt metadata before write:\n"		\
			msg, ##__VA_ARGS__);				\
		if (bch2_fs_inconsistent(c)) {				\
			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
	true;								\
})

#define journal_entry_err_on(cond, c, msg, ...)				\
	((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)

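/*
 * Validate a single key from a journal entry: if it is invalid (zero u64s,
 * runs past the end of the entry, bad format, or fails bkey validation) it is
 * dropped from the entry in place so replay can continue with the rest:
 */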
static int journal_validate_key(struct bch_fs *c, struct jset *jset,
				struct jset_entry *entry,
				unsigned level, enum btree_id btree_id,
				struct bkey_i *k,
				const char *type, int write)
{
	void *next = vstruct_next(entry);
	const char *invalid;
	unsigned version = le32_to_cpu(jset->version);
	int ret = 0;

	if (journal_entry_err_on(!k->k.u64s, c,
			"invalid %s in journal entry %llu offset %zi: k->u64s 0",
			type, le64_to_cpu(jset->seq),
			(u64 *) entry - jset->_data)) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	if (journal_entry_err_on((void *) bkey_next(k) >
				 (void *) vstruct_next(entry), c,
			"invalid %s in journal entry %llu offset %zi: extends past end of journal entry",
			type, le64_to_cpu(jset->seq),
			(u64 *) entry - jset->_data)) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
			"invalid %s in journal entry %llu offset %zi: bad format %u",
			type, le64_to_cpu(jset->seq),
			(u64 *) entry - jset->_data,
			k->k.format)) {
		le16_add_cpu(&entry->u64s, -k->k.u64s);
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	if (!write)
		bch2_bkey_compat(level, btree_id, version,
				 JSET_BIG_ENDIAN(jset), write,
				 NULL, bkey_to_packed(k));

	invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
				    __btree_node_type(level, btree_id));
	if (invalid) {
		char buf[160];

		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
		mustfix_fsck_err(c, "invalid %s in journal entry %llu offset %zi: %s\n%s",
				 type, le64_to_cpu(jset->seq),
				 (u64 *) entry - jset->_data,
				 invalid, buf);

		le16_add_cpu(&entry->u64s, -k->k.u64s);
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	if (write)
		bch2_bkey_compat(level, btree_id, version,
				 JSET_BIG_ENDIAN(jset), write,
				 NULL, bkey_to_packed(k));
fsck_err:
	return ret;
}

static int journal_entry_validate_btree_keys(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     int write)
{
	struct bkey_i *k;

	vstruct_for_each(entry, k) {
		int ret = journal_validate_key(c, jset, entry,
					       entry->level,
					       entry->btree_id,
					       k, "key", write);
		if (ret)
			return ret;
	}

	return 0;
}

static int journal_entry_validate_btree_root(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     int write)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s, c,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
				    "btree root", write);
fsck_err:
	return ret;
}

static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    int write)
{
	/* obsolete, don't care: */
	return 0;
}

static int journal_entry_validate_blacklist(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    int write)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}

static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
					       struct jset *jset,
					       struct jset_entry *entry,
					       int write)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end), c,
		"invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}

static int journal_entry_validate_usage(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					int write)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static int journal_entry_validate_data_usage(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     int write)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

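/*
 * Dispatch table mapping each jset entry type to its validate function,
 * generated from the BCH_JSET_ENTRY_TYPES() x-macro:
 */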
struct jset_entry_ops {
	int (*validate)(struct bch_fs *, struct jset *,
			struct jset_entry *, int);
};

static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)						\
	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
		.validate	= journal_entry_validate_##f,	\
	},
	BCH_JSET_ENTRY_TYPES()
#undef x
};

static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
				  struct jset_entry *entry, int write)
{
	return entry->type < BCH_JSET_ENTRY_NR
		? bch2_jset_entry_ops[entry->type].validate(c, jset,
							    entry, write)
		: 0;
}

static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 int write)
{
	struct jset_entry *entry;
	int ret = 0;

	vstruct_for_each(jset, entry) {
		if (journal_entry_err_on(vstruct_next(entry) >
					 vstruct_last(jset), c,
				"journal entry extends past end of jset")) {
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = journal_entry_validate(c, jset, entry, write);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}

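/*
 * Validate a single journal entry (jset) read off disk: check the magic,
 * version, size, checksum type and checksum, decrypt the payload (the cipher
 * is symmetric, so the same helper covers both directions), and sanity check
 * last_seq against seq:
 */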
static int jset_validate(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 unsigned bucket_sectors_left,
			 unsigned sectors_read,
			 int write)
{
	size_t bytes = vstruct_bytes(jset);
	struct bch_csum csum;
	unsigned version;
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
				  version < bcachefs_metadata_version_min) ||
				 version >= bcachefs_metadata_version_max, c,
			"%s sector %llu seq %llu: unknown journal entry version %u",
			ca->name, sector, le64_to_cpu(jset->seq),
			version)) {
		/* don't try to continue: */
		return EINVAL;
	}

	if (bytes > (sectors_read << 9) &&
	    sectors_read < bucket_sectors_left)
		return JOURNAL_ENTRY_REREAD;

	if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
			"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
			ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
		ret = JOURNAL_ENTRY_BAD;
		le32_add_cpu(&jset->u64s,
			     -((bytes - (bucket_sectors_left << 9)) / 8));
	}

	if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
			"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
			ca->name, sector, le64_to_cpu(jset->seq),
			JSET_CSUM_TYPE(jset))) {
		ret = JOURNAL_ENTRY_BAD;
		goto bad_csum_type;
	}

	csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
	if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
				 "%s sector %llu seq %llu: journal checksum bad",
				 ca->name, sector, le64_to_cpu(jset->seq)))
		ret = JOURNAL_ENTRY_BAD;

	bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
		     jset->encrypted_start,
		     vstruct_end(jset) - (void *) jset->encrypted_start);
bad_csum_type:
	if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
				 "invalid journal entry: last_seq > seq")) {
		jset->last_seq = jset->seq;
		return JOURNAL_ENTRY_BAD;
	}
fsck_err:
	return ret;
}

struct journal_read_buf {
	void		*data;
	size_t		size;
};

static int journal_read_buf_realloc(struct journal_read_buf *b,
				    size_t new_size)
{
	void *n;

	/* the bios are sized for this many pages, max: */
	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
		return -ENOMEM;

	new_size = roundup_pow_of_two(new_size);
	n = kvpmalloc(new_size, GFP_KERNEL);
	if (!n)
		return -ENOMEM;

	kvpfree(b->data, b->size);
	b->data = n;
	b->size = new_size;
	return 0;
}

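/*
 * Read and validate all the journal entries in a single journal bucket,
 * growing the read buffer and rereading when an entry turns out to be bigger
 * than the buffer; entries that pass validation are added to the replay list:
 */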
static int journal_read_bucket(struct bch_dev *ca,
			       struct journal_read_buf *buf,
			       struct journal_list *jlist,
			       unsigned bucket)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	struct jset *j = NULL;
	unsigned sectors, sectors_read = 0;
	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
	    end = offset + ca->mi.bucket_size;
	bool saw_bad = false;
	int ret = 0;

	pr_debug("reading %u", bucket);

	while (offset < end) {
		if (!sectors_read) {
			struct bio *bio;
			unsigned nr_bvecs;
reread:
			sectors_read = min_t(unsigned,
				end - offset, buf->size >> 9);
			nr_bvecs = buf_pages(buf->data, sectors_read << 9);

			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

			bio->bi_iter.bi_sector = offset;
			bch2_bio_map(bio, buf->data, sectors_read << 9);

			ret = submit_bio_wait(bio);
			kfree(bio);

			if (bch2_dev_io_err_on(ret, ca,
					       "journal read from sector %llu",
					       offset) ||
			    bch2_meta_read_fault("journal"))
				return -EIO;

			j = buf->data;
		}

		ret = jset_validate(c, ca, j, offset,
				    end - offset, sectors_read,
				    READ);
		switch (ret) {
		case BCH_FSCK_OK:
			sectors = vstruct_sectors(j, c->block_bits);
			break;
		case JOURNAL_ENTRY_REREAD:
			if (vstruct_bytes(j) > buf->size) {
				ret = journal_read_buf_realloc(buf,
							vstruct_bytes(j));
				if (ret)
					return ret;
			}
			goto reread;
		case JOURNAL_ENTRY_NONE:
			if (!saw_bad)
				return 0;
			sectors = c->opts.block_size;
			goto next_block;
		case JOURNAL_ENTRY_BAD:
			saw_bad = true;
			/*
			 * On checksum error we don't really trust the size
			 * field of the journal entry we read, so try reading
			 * again at next block boundary:
			 */
			sectors = c->opts.block_size;
			break;
		default:
			return ret;
		}

		/*
		 * This happens sometimes if we don't have discards on -
		 * when we've partially overwritten a bucket with new
		 * journal entries. We don't need the rest of the
		 * bucket:
		 */
		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
			return 0;

		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

		mutex_lock(&jlist->lock);
		ret = journal_entry_add(c, ca, jlist, j, ret != 0);
		mutex_unlock(&jlist->lock);

		switch (ret) {
		case JOURNAL_ENTRY_ADD_OK:
			break;
		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
			break;
		default:
			return ret;
		}
next_block:
		pr_debug("next");
		offset += sectors;
		sectors_read -= sectors;
		j = ((void *) j) + (sectors << 9);
	}

	return 0;
}

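/*
 * Closure callback, run once per device: scan every journal bucket on the
 * device, then work out which bucket writes should resume from (cur_idx) and
 * mark the rest of the journal as needing reclaim:
 */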
static void bch2_journal_read_device(struct closure *cl)
{
	struct journal_device *ja =
		container_of(cl, struct journal_device, read);
	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
	struct journal_list *jlist =
		container_of(cl->parent, struct journal_list, cl);
	struct journal_read_buf buf = { NULL, 0 };
	u64 min_seq = U64_MAX;
	unsigned i;
	int ret;

	if (!ja->nr)
		goto out;

	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
	if (ret)
		goto err;

	pr_debug("%u journal buckets", ja->nr);

	for (i = 0; i < ja->nr; i++) {
		ret = journal_read_bucket(ca, &buf, jlist, i);
		if (ret)
			goto err;
	}

	/* Find the journal bucket with the highest sequence number: */
	for (i = 0; i < ja->nr; i++) {
		if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
			ja->cur_idx = i;

		min_seq = min(ja->bucket_seq[i], min_seq);
	}

	/*
	 * If there are duplicate journal entries in multiple buckets (which
	 * definitely isn't supposed to happen, but...) - make sure to start
	 * cur_idx at the last of those buckets, so we don't deadlock trying
	 * to allocate:
	 */
	while (ja->bucket_seq[ja->cur_idx] > min_seq &&
	       ja->bucket_seq[ja->cur_idx] >
	       ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
		ja->cur_idx = (ja->cur_idx + 1) % ja->nr;

	ja->sectors_free = 0;

	/*
	 * Set dirty_idx to indicate the entire journal is full and needs to be
	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
	 * pinned when it first runs:
	 */
	ja->discard_idx = ja->dirty_idx_ondisk =
		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
	kvpfree(buf.data, buf.size);
	percpu_ref_put(&ca->io_ref);
	closure_return(cl);
	return;
err:
	mutex_lock(&jlist->lock);
	jlist->ret = ret;
	mutex_unlock(&jlist->lock);
	goto out;
}

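/*
 * Read the journal from every member device in parallel (one closure per
 * device), then validate the collected entries and make sure their replica
 * lists are marked in the superblock:
 */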
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
	struct journal_list jlist;
	struct journal_replay *i;
	struct bch_dev *ca;
	unsigned iter;
	size_t keys = 0, entries = 0;
	bool degraded = false;
	int ret = 0;

	closure_init_stack(&jlist.cl);
	mutex_init(&jlist.lock);
	jlist.head = list;
	jlist.ret = 0;

	for_each_member_device(ca, c, iter) {
		if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
			continue;

		if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
		     ca->mi.state == BCH_MEMBER_STATE_RO) &&
		    percpu_ref_tryget(&ca->io_ref))
			closure_call(&ca->journal.read,
				     bch2_journal_read_device,
				     system_unbound_wq,
				     &jlist.cl);
		else
			degraded = true;
	}

	closure_sync(&jlist.cl);

	if (jlist.ret)
		return jlist.ret;

	list_for_each_entry(i, list, list) {
		struct jset_entry *entry;
		struct bkey_i *k, *_n;
		struct bch_replicas_padded replicas;
		char buf[80];

		ret = jset_validate_entries(c, &i->j, READ);
		if (ret)
			goto fsck_err;

		/*
		 * If we're mounting in degraded mode - if we didn't read all
		 * the devices - this is wrong:
		 */

		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs);

		if (!degraded &&
		    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
		     fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
				 "superblock not marked as containing replicas %s",
				 (bch2_replicas_entry_to_text(&PBUF(buf),
							      &replicas.e), buf)))) {
			ret = bch2_mark_replicas(c, &replicas.e);
			if (ret)
				return ret;
		}

		for_each_jset_key(k, _n, entry, &i->j)
			keys++;
		entries++;
	}

	if (!list_empty(list)) {
		i = list_last_entry(list, struct journal_replay, list);

		bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
			 keys, entries, le64_to_cpu(i->j.seq));
	}
fsck_err:
	return ret;
}

/* journal write: */

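/*
 * Try to add pointers to w->key until we have replicas_want replicas, walking
 * devices in stripe order and skipping devices with zero durability, devices
 * that aren't RW, have no journal buckets, are already in the key, or don't
 * have room left in their current journal bucket:
 */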
static void __journal_write_alloc(struct journal *j,
				  struct journal_buf *w,
				  struct dev_alloc_list *devs_sorted,
				  unsigned sectors,
				  unsigned *replicas,
				  unsigned replicas_want)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_device *ja;
	struct bch_dev *ca;
	unsigned i;

	if (*replicas >= replicas_want)
		return;

	for (i = 0; i < devs_sorted->nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		/*
		 * Check that we can use this device, and aren't already using
		 * it:
		 */
		if (!ca->mi.durability ||
		    ca->mi.state != BCH_MEMBER_STATE_RW ||
		    !ja->nr ||
		    bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
					 ca->dev_idx) ||
		    sectors > ja->sectors_free)
			continue;

		bch2_dev_stripe_increment(ca, &j->wp.stripe);

		bch2_bkey_append_ptr(&w->key,
			(struct bch_extent_ptr) {
				.offset = bucket_to_sector(ca,
					ja->buckets[ja->cur_idx]) +
					ca->mi.bucket_size -
					ja->sectors_free,
				.dev = ca->dev_idx,
		});

		ja->sectors_free -= sectors;
		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

		*replicas += ca->mi.durability;

		if (*replicas >= replicas_want)
			break;
	}
}

/**
 * journal_write_alloc - decide which devices the next journal entry gets
 * written to, moving on to a new journal bucket on a device where necessary
 */
static int journal_write_alloc(struct journal *j, struct journal_buf *w,
			       unsigned sectors)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_device *ja;
	struct bch_dev *ca;
	struct dev_alloc_list devs_sorted;
	unsigned i, replicas = 0, replicas_want =
		READ_ONCE(c->opts.metadata_replicas);

	rcu_read_lock();

	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
					  &c->rw_devs[BCH_DATA_journal]);

	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas >= replicas_want)
		goto done;

	for (i = 0; i < devs_sorted.nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		if (sectors > ja->sectors_free &&
		    sectors <= ca->mi.bucket_size &&
		    bch2_journal_dev_buckets_available(j, ja,
					journal_space_discarded)) {
			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
			ja->sectors_free = ca->mi.bucket_size;

			/*
			 * ja->bucket_seq[ja->cur_idx] must always have
			 * something sensible:
			 */
			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
		}
	}

	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);
done:
	rcu_read_unlock();

	return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}

static void journal_write_compact(struct jset *jset)
{
	struct jset_entry *i, *next, *prev = NULL;

	/*
	 * Simple compaction, dropping empty jset_entries (from journal
	 * reservations that weren't fully used) and merging jset_entries that
	 * can be.
	 *
	 * If we wanted to be really fancy here, we could sort all the keys in
	 * the jset and drop keys that were overwritten - probably not worth it:
	 */
	vstruct_for_each_safe(jset, i, next) {
		unsigned u64s = le16_to_cpu(i->u64s);

		/* Empty entry: */
		if (!u64s)
			continue;

		/* Can we merge with previous entry? */
		if (prev &&
		    i->btree_id == prev->btree_id &&
		    i->level	== prev->level &&
		    i->type	== prev->type &&
		    i->type	== BCH_JSET_ENTRY_btree_keys &&
		    le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
			memmove_u64s_down(vstruct_next(prev),
					  i->_data,
					  u64s);
			le16_add_cpu(&prev->u64s, u64s);
			continue;
		}

		/* Couldn't merge, move i into new position (after prev): */
		prev = prev ? vstruct_next(prev) : jset->start;
		if (i != prev)
			memmove_u64s_down(prev, i, jset_u64s(u64s));
	}

	prev = prev ? vstruct_next(prev) : jset->start;
	jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
}

static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
	/* we aren't holding j->lock: */
	unsigned new_size = READ_ONCE(j->buf_size_want);
	void *new_buf;

	if (buf->buf_size >= new_size)
		return;

	new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
	if (!new_buf)
		return;

	memcpy(new_buf, buf->data, buf->buf_size);
	kvpfree(buf->data, buf->buf_size);
	buf->data	= new_buf;
	buf->buf_size	= new_size;
}

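/*
 * Journal write completion: record which devices the entry made it to, update
 * the on-disk sequence numbers, release the write buffer and wake up anyone
 * waiting on the write or on journal reclaim:
 */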
static void journal_write_done(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_buf *w = journal_prev_buf(j);
	struct bch_devs_list devs =
		bch2_bkey_devs(bkey_i_to_s_c(&w->key));
	struct bch_replicas_padded replicas;
	u64 seq = le64_to_cpu(w->data->seq);
	u64 last_seq = le64_to_cpu(w->data->last_seq);
	int err = 0;

	bch2_time_stats_update(j->write_time, j->write_start_time);

	if (!devs.nr) {
		bch_err(c, "unable to write journal to sufficient devices");
		err = -EIO;
	} else {
		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
		if (bch2_mark_replicas(c, &replicas.e))
			err = -EIO;
	}

	if (err)
		bch2_fatal_error(c);

	spin_lock(&j->lock);
	if (seq >= j->pin.front)
		journal_seq_pin(j, seq)->devs = devs;

	j->seq_ondisk		= seq;
	if (err && (!j->err_seq || seq < j->err_seq))
		j->err_seq	= seq;
	j->last_seq_ondisk	= last_seq;
	bch2_journal_space_available(j);

	/*
	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
	 * more buckets:
	 *
	 * Must come before signaling write completion, for
	 * bch2_fs_journal_stop():
	 */
	mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);

	/* also must come before signalling write completion: */
	closure_debug_destroy(cl);

	BUG_ON(!j->reservations.prev_buf_unwritten);
	atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
		     &j->reservations.counter);

	closure_wake_up(&w->wait);
	journal_wake(j);

	if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
		mod_delayed_work(system_freezable_wq, &j->write_work, 0);
	spin_unlock(&j->lock);
}

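/*
 * Per-device write completion: on an IO error, drop this device from the key
 * so that journal_write_done() sees the entry as not having been written to
 * it:
 */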
static void journal_write_endio(struct bio *bio)
{
	struct bch_dev *ca = bio->bi_private;
	struct journal *j = &ca->fs->journal;

	if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s",
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("journal")) {
		struct journal_buf *w = journal_prev_buf(j);
		unsigned long flags;

		spin_lock_irqsave(&j->err_lock, flags);
		bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx);
		spin_unlock_irqrestore(&j->err_lock, flags);
	}

	closure_put(&j->io);
	percpu_ref_put(&ca->io_ref);
}

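/*
 * Write out the current journal entry: add btree roots and superblock entries,
 * compact, checksum/encrypt, pick devices to write to, submit the writes, and
 * then issue flushes to any remaining devices that weren't written to:
 */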
void bch2_journal_write(struct closure *cl)
{
	struct journal *j = container_of(cl, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	struct journal_buf *w = journal_prev_buf(j);
	struct jset_entry *start, *end;
	struct jset *jset;
	struct bio *bio;
	struct bch_extent_ptr *ptr;
	bool validate_before_checksum = false;
	unsigned i, sectors, bytes, u64s;
	int ret;

	bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));

	journal_buf_realloc(j, w);
	jset = w->data;

	j->write_start_time = local_clock();

	/*
	 * New btree roots are set by journalling them; when the journal entry
	 * gets written we have to propagate them to c->btree_roots
	 *
	 * But, every journal entry we write has to contain all the btree roots
	 * (at least for now); so after we copy btree roots to c->btree_roots we
	 * have to get any missing btree roots and add them to this journal
	 * entry:
	 */

	bch2_journal_entries_to_btree_roots(c, jset);

	start = end = vstruct_last(jset);

	end	= bch2_btree_roots_to_journal_entries(c, jset->start, end);

	end	= bch2_journal_super_entries_add_common(c, end,
						le64_to_cpu(jset->seq));
	u64s	= (u64 *) end - (u64 *) start;
	BUG_ON(u64s > j->entry_u64s_reserved);

	le32_add_cpu(&jset->u64s, u64s);
	BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);

	journal_write_compact(jset);

	jset->read_clock	= cpu_to_le16(c->bucket_clock[READ].hand);
	jset->write_clock	= cpu_to_le16(c->bucket_clock[WRITE].hand);
	jset->magic		= cpu_to_le64(jset_magic(c));

	jset->version		= c->sb.version < bcachefs_metadata_version_new_versioning
		? cpu_to_le32(BCH_JSET_VERSION_OLD)
		: cpu_to_le32(c->sb.version);

	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));

	if (journal_entry_empty(jset))
		j->last_empty_seq = le64_to_cpu(jset->seq);

	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
		validate_before_checksum = true;

	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max)
		validate_before_checksum = true;

	if (validate_before_checksum &&
	    jset_validate_entries(c, jset, WRITE))
		goto err;

	bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
		     jset->encrypted_start,
		     vstruct_end(jset) - (void *) jset->encrypted_start);

	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
				  journal_nonce(jset), jset);

	if (!validate_before_checksum &&
	    jset_validate_entries(c, jset, WRITE))
		goto err;

	sectors = vstruct_sectors(jset, c->block_bits);
	BUG_ON(sectors > w->sectors);

	bytes = vstruct_bytes(jset);
	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);

retry_alloc:
	spin_lock(&j->lock);
	ret = journal_write_alloc(j, w, sectors);

	if (ret && j->can_discard) {
		spin_unlock(&j->lock);
		bch2_journal_do_discards(j);
		goto retry_alloc;
	}

	/*
	 * write is allocated, no longer need to account for it in
	 * bch2_journal_space_available():
	 */
	w->sectors = 0;

	/*
	 * journal entry has been compacted and allocated, recalculate space
	 * available:
	 */
	bch2_journal_space_available(j);
	spin_unlock(&j->lock);

	if (ret) {
		bch_err(c, "Unable to allocate journal write");
		bch2_fatal_error(c);
		continue_at(cl, journal_write_done, system_highpri_wq);
		return;
	}

	/*
	 * XXX: we really should just disable the entire journal in nochanges
	 * mode
	 */
	if (c->opts.nochanges)
		goto no_io;

	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
		ca = bch_dev_bkey_exists(c, ptr->dev);
		if (!percpu_ref_tryget(&ca->io_ref)) {
			/* XXX: fix this */
			bch_err(c, "missing device for journal write\n");
			continue;
		}

		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
			     sectors);

		bio = ca->journal.bio;
		bio_reset(bio, ca->disk_sb.bdev,
			  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
		bio->bi_iter.bi_sector	= ptr->offset;
		bio->bi_end_io		= journal_write_endio;
		bio->bi_private		= ca;
		bch2_bio_map(bio, jset, sectors << 9);

		trace_journal_write(bio);
		closure_bio_submit(bio, cl);

		ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
	}

	for_each_rw_member(ca, c, i)
		if (journal_flushes_device(ca) &&
		    !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
			percpu_ref_get(&ca->io_ref);

			bio = ca->journal.bio;
			bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
			bio->bi_end_io		= journal_write_endio;
			bio->bi_private		= ca;
			closure_bio_submit(bio, cl);
		}

no_io:
	bch2_bucket_seq_cleanup(c);

	continue_at(cl, journal_write_done, system_highpri_wq);
	return;
err:
	bch2_inconsistent_error(c);
	continue_at(cl, journal_write_done, system_highpri_wq);
}