bcachefs: New on disk format: Backpointers
authorKent Overstreet <kent.overstreet@gmail.com>
Fri, 18 Mar 2022 00:51:27 +0000 (20:51 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:50 +0000 (17:09 -0400)
This patch adds backpointers: we now have a reverse index from device
and offset on that device (specifically, offset within a bucket) back to
btree nodes and (non cached) data extents.

The first 40 backpointers within a bucket are stored in the alloc key;
after that, backpointers spill over to the backpointers btree. This
is to help avoid performance regressions from additional btree updates
on large streaming workloads.

This patch adds all the code for creating, checking and repairing
backpointers. The next patch in the series is going to use backpointers
for copygc - finally getting rid of the need to scan all extents to do
copygc.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
14 files changed:
fs/bcachefs/Makefile
fs/bcachefs/alloc_background.c
fs/bcachefs/alloc_background.h
fs/bcachefs/alloc_foreground.c
fs/bcachefs/backpointers.c [new file with mode: 0644]
fs/bcachefs/backpointers.h [new file with mode: 0644]
fs/bcachefs/bcachefs.h
fs/bcachefs/bcachefs_format.h
fs/bcachefs/bkey_methods.c
fs/bcachefs/buckets.c
fs/bcachefs/buckets.h
fs/bcachefs/errcode.h
fs/bcachefs/recovery.c
fs/bcachefs/super.c

index c0e715760c8be51a097de2be93c4c42c99276087..456d540441ce2840fe03f5aa6d9dba0d92d7563a 100644 (file)
@@ -4,6 +4,7 @@ obj-$(CONFIG_BCACHEFS_FS)       += bcachefs.o
 bcachefs-y             :=      \
        alloc_background.o      \
        alloc_foreground.o      \
+       backpointers.o          \
        bkey.o                  \
        bkey_methods.o          \
        bkey_sort.o             \
index f75d05beaf3146b85f0a7b0e839578a341d6028f..58ec650a512cedbb86a9a3650ccccac00ab553af 100644 (file)
@@ -2,6 +2,7 @@
 #include "bcachefs.h"
 #include "alloc_background.h"
 #include "alloc_foreground.h"
+#include "backpointers.h"
 #include "btree_cache.h"
 #include "btree_io.h"
 #include "btree_key_cache.h"
@@ -266,12 +267,34 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
 {
        struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
 
-       if (bkey_val_bytes(k.k) != sizeof(struct bch_alloc_v4)) {
-               prt_printf(err, "bad val size (%zu != %zu)",
-                      bkey_val_bytes(k.k), sizeof(struct bch_alloc_v4));
+       if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) {
+               prt_printf(err, "bad val size (%lu != %u)",
+                      bkey_val_u64s(k.k), alloc_v4_u64s(a.v));
                return -BCH_ERR_invalid_bkey;
        }
 
+       if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
+           BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) {
+               prt_printf(err, "invalid backpointers_start");
+               return -BCH_ERR_invalid_bkey;
+       }
+
+       /*
+        * XXX this is wrong, we'll be checking updates that happened from
+        * before BCH_FS_CHECK_BACKPOINTERS_DONE
+        */
+       if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+               unsigned i, bp_len = 0;
+
+               for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++)
+                       bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len;
+
+               if (bp_len > a.v->dirty_sectors) {
+                       prt_printf(err, "too many backpointers");
+                       return -BCH_ERR_invalid_bkey;
+               }
+       }
+
        if (rw == WRITE) {
                if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
                        prt_printf(err, "invalid data type (got %u should be %u)",
@@ -328,9 +351,19 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
        return 0;
 }
 
+static inline u64 swab40(u64 x)
+{
+       return (((x & 0x00000000ffULL) << 32)|
+               ((x & 0x000000ff00ULL) << 16)|
+               ((x & 0x0000ff0000ULL) >>  0)|
+               ((x & 0x00ff000000ULL) >> 16)|
+               ((x & 0xff00000000ULL) >> 32));
+}
+
 void bch2_alloc_v4_swab(struct bkey_s k)
 {
        struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
+       struct bch_backpointer *bp, *bps;
 
        a->journal_seq          = swab64(a->journal_seq);
        a->flags                = swab32(a->flags);
@@ -340,12 +373,20 @@ void bch2_alloc_v4_swab(struct bkey_s k)
        a->io_time[1]           = swab64(a->io_time[1]);
        a->stripe               = swab32(a->stripe);
        a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+
+       bps = alloc_v4_backpointers(a);
+       for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
+               bp->bucket_offset       = swab40(bp->bucket_offset);
+               bp->bucket_len          = swab32(bp->bucket_len);
+               bch2_bpos_swab(&bp->pos);
+       }
 }
 
 void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
 {
        struct bch_alloc_v4 _a;
        const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+       unsigned i;
 
        prt_newline(out);
        printbuf_indent_add(out, 2);
@@ -374,14 +415,25 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
        prt_newline(out);
        prt_printf(out, "io_time[WRITE]    %llu",       a->io_time[WRITE]);
        prt_newline(out);
-       prt_printf(out, "backpointers:     %llu",       BCH_ALLOC_V4_NR_BACKPOINTERS(a));
+       prt_printf(out, "bp_start          %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
+       prt_newline(out);
 
-       printbuf_indent_sub(out, 2);
-}
+       if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
+               struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
+               const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
 
-static inline void *alloc_v4_backpointers(struct bch_alloc_v4 *a)
-{
-       return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
+               prt_printf(out, "backpointers:     %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
+               printbuf_indent_add(out, 2);
+
+               for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
+                       prt_newline(out);
+                       bch2_backpointer_to_text(out, &bps[i]);
+               }
+
+               printbuf_indent_sub(out, 2);
+       }
+
+       printbuf_indent_sub(out, 2);
 }
 
 void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
@@ -422,12 +474,18 @@ static noinline struct bkey_i_alloc_v4 *
 __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
 {
        struct bkey_i_alloc_v4 *ret;
-
        if (k.k->type == KEY_TYPE_alloc_v4) {
-               unsigned bytes = min(sizeof(struct bkey_i_alloc_v4), bkey_bytes(k.k));
+               struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
+               unsigned bytes = sizeof(struct bkey_i_alloc_v4) +
+                       BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) *
+                       sizeof(struct bch_backpointer);
                void *src, *dst;
 
-               ret = bch2_trans_kmalloc(trans, bytes);
+               /*
+                * Reserve space for one more backpointer here:
+                * Not sketchy at doing it this way, nope...
+                */
+               ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer));
                if (IS_ERR(ret))
                        return ret;
 
@@ -437,16 +495,20 @@ __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
                SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
                dst = alloc_v4_backpointers(&ret->v);
 
+               memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) *
+                       sizeof(struct bch_backpointer));
                if (src < dst)
                        memset(src, 0, dst - src);
                set_alloc_v4_u64s(ret);
        } else {
-               ret = bch2_trans_kmalloc(trans, sizeof(*ret));
-               if (!IS_ERR(ret)) {
-                       bkey_alloc_v4_init(&ret->k_i);
-                       ret->k.p = k.k->p;
-                       bch2_alloc_to_v4(k, &ret->v);
-               }
+               ret = bch2_trans_kmalloc(trans, sizeof(struct bkey_i_alloc_v4) +
+                                        sizeof(struct bch_backpointer));
+               if (IS_ERR(ret))
+                       return ret;
+
+               bkey_alloc_v4_init(&ret->k_i);
+               ret->k.p = k.k->p;
+               bch2_alloc_to_v4(k, &ret->v);
        }
        return ret;
 }
@@ -455,8 +517,12 @@ static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_
 {
        if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
            BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) {
+               /*
+                * Reserve space for one more backpointer here:
+                * Not sketchy at doing it this way, nope...
+                */
                struct bkey_i_alloc_v4 *ret =
-                       bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k));
+                       bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + sizeof(struct bch_backpointer));
                if (!IS_ERR(ret))
                        bkey_reassemble(&ret->k_i, k);
                return ret;
index c562aff3ac3390896e946f0cc8ac67e7c60401db..b843316d38465cbd0d15d2f494becaac8a88275f 100644 (file)
@@ -73,7 +73,9 @@ static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_
 static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a)
 {
        unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
-                       BCH_ALLOC_V4_U64s_V0);
+                       BCH_ALLOC_V4_U64s_V0) +
+               BCH_ALLOC_V4_NR_BACKPOINTERS(a) *
+               (sizeof(struct bch_backpointer) / sizeof(u64));
 
        BUG_ON(ret > U8_MAX - BKEY_U64s);
        return ret;
@@ -175,6 +177,18 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
 
 void bch2_do_invalidates(struct bch_fs *);
 
+static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a)
+{
+       return (void *) ((u64 *) &a->v +
+                        (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?:
+                         BCH_ALLOC_V4_U64s_V0));
+}
+
+static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a)
+{
+       return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
+}
+
 int bch2_fs_freespace_init(struct bch_fs *);
 
 void bch2_recalc_capacity(struct bch_fs *);
index ba14cfe06515ca1af116ebe22b4256cfa6ee2ac2..5988aa288c98b5e3206cf8fe24a87a4a3788529f 100644 (file)
@@ -14,6 +14,7 @@
 #include "bcachefs.h"
 #include "alloc_background.h"
 #include "alloc_foreground.h"
+#include "backpointers.h"
 #include "btree_iter.h"
 #include "btree_update.h"
 #include "btree_gc.h"
@@ -346,6 +347,28 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
 
        }
 
+       if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+               struct bch_backpointer bp;
+               u64 bp_offset = 0;
+
+               ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
+                                               &bp_offset, &bp);
+               if (ret) {
+                       ob = ERR_PTR(ret);
+                       goto err;
+               }
+
+               if (bp_offset != U64_MAX) {
+                       /*
+                        * Bucket may have data in it - we don't call
+                        * bch2_trans_inconsistent() because fsck hasn't
+                        * finished yet
+                        */
+                       ob = NULL;
+                       goto err;
+               }
+       }
+
        ob = __try_alloc_bucket(c, ca, b, reserve, a, s, cl);
        if (!ob)
                iter.path->preserve = false;
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
new file mode 100644 (file)
index 0000000..6efc286
--- /dev/null
@@ -0,0 +1,799 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_cache.h"
+#include "btree_update.h"
+#include "error.h"
+
+static bool extent_matches_bp(struct bch_fs *c,
+                             enum btree_id btree_id, unsigned level,
+                             struct bkey_s_c k,
+                             struct bpos bucket,
+                             struct bch_backpointer bp)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               struct bpos bucket2;
+               struct bch_backpointer bp2;
+
+               if (p.ptr.cached)
+                       continue;
+
+               bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
+                                     &bucket2, &bp2);
+               if (bpos_eq(bucket, bucket2) &&
+                   !memcmp(&bp, &bp2, sizeof(bp)))
+                       return true;
+       }
+
+       return false;
+}
+
+int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                            int rw, struct printbuf *err)
+{
+       struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+       struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
+
+       if (bkey_val_bytes(bp.k) < sizeof(*bp.v)) {
+               prt_str(err, "incorrect value size");
+               return -BCH_ERR_invalid_bkey;
+       }
+
+       if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) {
+               prt_str(err, "backpointer at wrong pos");
+               return -BCH_ERR_invalid_bkey;
+       }
+
+       return 0;
+}
+
+void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
+{
+       prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
+              bch2_btree_ids[bp->btree_id],
+              bp->level,
+              (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+              (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
+              bp->bucket_len);
+       bch2_bpos_to_text(out, bp->pos);
+}
+
+void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+       bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
+}
+
+void bch2_backpointer_swab(struct bkey_s k)
+{
+       struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
+
+       bp.v->bucket_offset     = swab32(bp.v->bucket_offset);
+       bp.v->bucket_len        = swab32(bp.v->bucket_len);
+       bch2_bpos_swab(&bp.v->pos);
+}
+
+#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1)
+
+static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r)
+{
+       return cmp_int(l.bucket_offset, r.bucket_offset);
+}
+
+static int bch2_backpointer_del_by_offset(struct btree_trans *trans,
+                                         struct bpos bucket,
+                                         u64 bp_offset,
+                                         struct bch_backpointer bp)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       if (bp_offset < BACKPOINTER_OFFSET_MAX) {
+               struct bch_backpointer *bps;
+               struct bkey_i_alloc_v4 *a;
+               unsigned i, nr;
+
+               bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+                                    bucket,
+                                    BTREE_ITER_INTENT|
+                                    BTREE_ITER_SLOTS|
+                                    BTREE_ITER_WITH_UPDATES);
+               k = bch2_btree_iter_peek_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_alloc_v4) {
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               a = bch2_alloc_to_v4_mut(trans, k);
+               ret = PTR_ERR_OR_ZERO(a);
+               if (ret)
+                       goto err;
+               bps = alloc_v4_backpointers(&a->v);
+               nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v);
+
+               for (i = 0; i < nr; i++) {
+                       if (bps[i].bucket_offset == bp_offset)
+                               goto found;
+                       if (bps[i].bucket_offset > bp_offset)
+                               break;
+               }
+
+               ret = -ENOENT;
+               goto err;
+found:
+               if (memcmp(&bps[i], &bp, sizeof(bp))) {
+                       ret = -ENOENT;
+                       goto err;
+               }
+               array_remove_item(bps, nr, i);
+               SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr);
+               set_alloc_v4_u64s(a);
+               ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+       } else {
+               bp_offset -= BACKPOINTER_OFFSET_MAX;
+
+               bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers,
+                                    bucket_pos_to_bp(c, bucket, bp_offset),
+                                    BTREE_ITER_INTENT|
+                                    BTREE_ITER_SLOTS|
+                                    BTREE_ITER_WITH_UPDATES);
+               k = bch2_btree_iter_peek_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_backpointer ||
+                   memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) {
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               ret = bch2_btree_delete_at(trans, &iter, 0);
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+bool bch2_bucket_backpointer_del(struct btree_trans *trans,
+                                struct bkey_i_alloc_v4 *a,
+                                struct bch_backpointer bp)
+{
+       struct bch_backpointer *bps = alloc_v4_backpointers(&a->v);
+       unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v);
+
+       for (i = 0; i < nr; i++) {
+               int cmp = backpointer_cmp(bps[i], bp) ?:
+                       memcmp(&bps[i], &bp, sizeof(bp));
+               if (!cmp) {
+                       array_remove_item(bps, nr, i);
+                       SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr);
+                       set_alloc_v4_u64s(a);
+                       return true;
+               }
+               if (cmp >= 0)
+                       break;
+       }
+
+       return false;
+}
+
+static noinline int backpointer_mod_err(struct btree_trans *trans,
+                                       struct bch_backpointer bp,
+                                       struct bkey_s_c bp_k,
+                                       struct bkey_s_c orig_k,
+                                       bool insert)
+{
+       struct bch_fs *c = trans->c;
+       struct printbuf buf = PRINTBUF;
+
+       if (insert) {
+               prt_printf(&buf, "existing backpointer found when inserting ");
+               bch2_backpointer_to_text(&buf, &bp);
+               prt_newline(&buf);
+               printbuf_indent_add(&buf, 2);
+
+               prt_printf(&buf, "found ");
+               bch2_bkey_val_to_text(&buf, c, bp_k);
+               prt_newline(&buf);
+
+               prt_printf(&buf, "for ");
+               bch2_bkey_val_to_text(&buf, c, orig_k);
+
+               bch_err(c, "%s", buf.buf);
+       } else if (test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+               prt_printf(&buf, "backpointer not found when deleting");
+               prt_newline(&buf);
+               printbuf_indent_add(&buf, 2);
+
+               prt_printf(&buf, "searching for ");
+               bch2_backpointer_to_text(&buf, &bp);
+               prt_newline(&buf);
+
+               prt_printf(&buf, "got ");
+               bch2_bkey_val_to_text(&buf, c, bp_k);
+               prt_newline(&buf);
+
+               prt_printf(&buf, "for ");
+               bch2_bkey_val_to_text(&buf, c, orig_k);
+
+               bch_err(c, "%s", buf.buf);
+       }
+
+       printbuf_exit(&buf);
+
+       if (test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) {
+               bch2_inconsistent_error(c);
+               return -EIO;
+       } else {
+               return 0;
+       }
+}
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
+                               struct bkey_i_alloc_v4 *a,
+                               struct bch_backpointer bp,
+                               struct bkey_s_c orig_k,
+                               bool insert)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_i_backpointer *bp_k;
+       struct btree_iter bp_iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
+       ret = PTR_ERR_OR_ZERO(bp_k);
+       if (ret)
+               return ret;
+
+       bkey_backpointer_init(&bp_k->k_i);
+       bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset);
+       bp_k->v = bp;
+
+       if (!insert) {
+               bp_k->k.type = KEY_TYPE_deleted;
+               set_bkey_val_u64s(&bp_k->k, 0);
+       }
+
+       bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+                            bucket_pos_to_bp(c, a->k.p, bp.bucket_offset),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_SLOTS|
+                            BTREE_ITER_WITH_UPDATES);
+       k = bch2_btree_iter_peek_slot(&bp_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (insert
+           ? k.k->type
+           : (k.k->type != KEY_TYPE_backpointer ||
+              memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) {
+               ret = backpointer_mod_err(trans, bp, k, orig_k, insert);
+               if (ret)
+                       goto err;
+       }
+
+       ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0);
+err:
+       bch2_trans_iter_exit(trans, &bp_iter);
+       return ret;
+}
+
+/*
+ * Find the next backpointer >= *bp_offset:
+ */
+int bch2_get_next_backpointer(struct btree_trans *trans,
+                             struct bpos bucket, int gen,
+                             u64 *bp_offset,
+                             struct bch_backpointer *dst)
+{
+       struct bch_fs *c = trans->c;
+       struct bpos bp_pos, bp_end_pos;
+       struct btree_iter alloc_iter, bp_iter = { NULL };
+       struct bkey_s_c k;
+       struct bkey_s_c_alloc_v4 a;
+       size_t i;
+       int ret;
+
+       if (*bp_offset == U64_MAX)
+               return 0;
+
+       bp_pos = bucket_pos_to_bp(c, bucket,
+                                 max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX);
+       bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0);
+
+       bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+                            bucket, BTREE_ITER_CACHED);
+       k = bch2_btree_iter_peek_slot(&alloc_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto out;
+
+       if (k.k->type != KEY_TYPE_alloc_v4)
+               goto done;
+
+       a = bkey_s_c_to_alloc_v4(k);
+       if (gen >= 0 && a.v->gen != gen)
+               goto done;
+
+       for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) {
+               if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset)
+                       continue;
+
+               *dst = alloc_v4_backpointers_c(a.v)[i];
+               *bp_offset = dst->bucket_offset;
+               goto out;
+       }
+
+       for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers,
+                                    bp_pos, 0, k, ret) {
+               if (bpos_ge(k.k->p, bp_end_pos))
+                       break;
+
+               if (k.k->type != KEY_TYPE_backpointer)
+                       continue;
+
+               *dst = *bkey_s_c_to_backpointer(k).v;
+               *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX;
+               goto out;
+       }
+done:
+       *bp_offset = U64_MAX;
+out:
+       bch2_trans_iter_exit(trans, &bp_iter);
+       bch2_trans_iter_exit(trans, &alloc_iter);
+       return ret;
+}
+
+static void backpointer_not_found(struct btree_trans *trans,
+                                 struct bpos bucket,
+                                 u64 bp_offset,
+                                 struct bch_backpointer bp,
+                                 struct bkey_s_c k,
+                                 const char *thing_it_points_to)
+{
+       struct bch_fs *c = trans->c;
+       struct printbuf buf = PRINTBUF;
+
+       if (likely(!bch2_backpointers_no_use_write_buffer))
+               return;
+
+       prt_printf(&buf, "backpointer doesn't match %s it points to:\n  ",
+                  thing_it_points_to);
+       prt_printf(&buf, "bucket: ");
+       bch2_bpos_to_text(&buf, bucket);
+       prt_printf(&buf, "\n  ");
+
+       if (bp_offset >= BACKPOINTER_OFFSET_MAX) {
+               struct bpos bp_pos =
+                       bucket_pos_to_bp(c, bucket,
+                                       bp_offset - BACKPOINTER_OFFSET_MAX);
+               prt_printf(&buf, "backpointer pos: ");
+               bch2_bpos_to_text(&buf, bp_pos);
+               prt_printf(&buf, "\n  ");
+       }
+
+       bch2_backpointer_to_text(&buf, &bp);
+       prt_printf(&buf, "\n  ");
+       bch2_bkey_val_to_text(&buf, c, k);
+       if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags))
+               bch_err_ratelimited(c, "%s", buf.buf);
+       else
+               bch2_trans_inconsistent(trans, "%s", buf.buf);
+
+       printbuf_exit(&buf);
+}
+
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+                                        struct btree_iter *iter,
+                                        struct bpos bucket,
+                                        u64 bp_offset,
+                                        struct bch_backpointer bp)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c k;
+
+       bch2_trans_node_iter_init(trans, iter,
+                                 bp.btree_id,
+                                 bp.pos,
+                                 0,
+                                 min(bp.level, c->btree_roots[bp.btree_id].level),
+                                 0);
+       k = bch2_btree_iter_peek_slot(iter);
+       if (bkey_err(k)) {
+               bch2_trans_iter_exit(trans, iter);
+               return k;
+       }
+
+       if (bp.level == c->btree_roots[bp.btree_id].level + 1)
+               k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key);
+
+       if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+               return k;
+
+       bch2_trans_iter_exit(trans, iter);
+
+       if (unlikely(bch2_backpointers_no_use_write_buffer)) {
+               if (bp.level) {
+                       struct btree *b;
+
+                       /*
+                        * If a backpointer for a btree node wasn't found, it may be
+                        * because it was overwritten by a new btree node that hasn't
+                        * been written out yet - backpointer_get_node() checks for
+                        * this:
+                        */
+                       b = bch2_backpointer_get_node(trans, iter, bucket, bp_offset, bp);
+                       if (!IS_ERR_OR_NULL(b))
+                               return bkey_i_to_s_c(&b->key);
+
+                       bch2_trans_iter_exit(trans, iter);
+
+                       if (IS_ERR(b))
+                               return bkey_s_c_err(PTR_ERR(b));
+                       return bkey_s_c_null;
+               }
+
+               backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent");
+       }
+
+       return bkey_s_c_null;
+}
+
+struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
+                                       struct btree_iter *iter,
+                                       struct bpos bucket,
+                                       u64 bp_offset,
+                                       struct bch_backpointer bp)
+{
+       struct bch_fs *c = trans->c;
+       struct btree *b;
+
+       BUG_ON(!bp.level);
+
+       bch2_trans_node_iter_init(trans, iter,
+                                 bp.btree_id,
+                                 bp.pos,
+                                 0,
+                                 bp.level - 1,
+                                 0);
+       b = bch2_btree_iter_peek_node(iter);
+       if (IS_ERR(b))
+               goto err;
+
+       if (b && extent_matches_bp(c, bp.btree_id, bp.level,
+                                  bkey_i_to_s_c(&b->key),
+                                  bucket, bp))
+               return b;
+
+       if (b && btree_node_will_make_reachable(b)) {
+               b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+       } else {
+               backpointer_not_found(trans, bucket, bp_offset, bp,
+                                     bkey_i_to_s_c(&b->key), "btree node");
+               b = NULL;
+       }
+err:
+       bch2_trans_iter_exit(trans, iter);
+       return b;
+}
+
+static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter,
+                                       struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter alloc_iter = { NULL };
+       struct bch_dev *ca;
+       struct bkey_s_c alloc_k;
+       struct printbuf buf = PRINTBUF;
+       int ret = 0;
+
+       if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
+                       "backpointer for mising device:\n%s",
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+               ret = bch2_btree_delete_at(trans, bp_iter, 0);
+               goto out;
+       }
+
+       ca = bch_dev_bkey_exists(c, k.k->p.inode);
+
+       bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+                            bp_pos_to_bucket(c, k.k->p), 0);
+
+       alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
+       ret = bkey_err(alloc_k);
+       if (ret)
+               goto out;
+
+       if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
+                       "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
+                       alloc_iter.pos.inode, alloc_iter.pos.offset,
+                       (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+               ret = bch2_btree_delete_at(trans, bp_iter, 0);
+               goto out;
+       }
+out:
+fsck_err:
+       bch2_trans_iter_exit(trans, &alloc_iter);
+       printbuf_exit(&buf);
+       return ret;
+}
+
+/* verify that every backpointer has a corresponding alloc key */
+int bch2_check_btree_backpointers(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+
+       return bch2_trans_run(c,
+               for_each_btree_key_commit(&trans, iter,
+                       BTREE_ID_backpointers, POS_MIN, 0, k,
+                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                 bch2_check_btree_backpointer(&trans, &iter, k)));
+}
+
+static int check_bp_exists(struct btree_trans *trans,
+                          struct bpos bucket_pos,
+                          struct bch_backpointer bp,
+                          struct bkey_s_c orig_k)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter alloc_iter, bp_iter = { NULL };
+       struct printbuf buf = PRINTBUF;
+       struct bkey_s_c alloc_k, bp_k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0);
+       alloc_k = bch2_btree_iter_peek_slot(&alloc_iter);
+       ret = bkey_err(alloc_k);
+       if (ret)
+               goto err;
+
+       if (alloc_k.k->type == KEY_TYPE_alloc_v4) {
+               struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k);
+               const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v);
+               unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v);
+
+               for (i = 0; i < nr; i++) {
+                       int cmp = backpointer_cmp(bps[i], bp) ?:
+                               memcmp(&bps[i], &bp, sizeof(bp));
+                       if (!cmp)
+                               goto out;
+                       if (cmp >= 0)
+                               break;
+               }
+       } else {
+               goto missing;
+       }
+
+       bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
+                            bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset),
+                            0);
+       bp_k = bch2_btree_iter_peek_slot(&bp_iter);
+       ret = bkey_err(bp_k);
+       if (ret)
+               goto err;
+
+       if (bp_k.k->type != KEY_TYPE_backpointer ||
+           memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp)))
+               goto missing;
+out:
+err:
+fsck_err:
+       bch2_trans_iter_exit(trans, &bp_iter);
+       bch2_trans_iter_exit(trans, &alloc_iter);
+       printbuf_exit(&buf);
+       return ret;
+missing:
+       prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
+              bch2_btree_ids[bp.btree_id], bp.level);
+       bch2_bkey_val_to_text(&buf, c, orig_k);
+       prt_printf(&buf, "\nbp pos ");
+       bch2_bpos_to_text(&buf, bp_iter.pos);
+
+       if (c->sb.version < bcachefs_metadata_version_backpointers ||
+           c->opts.reconstruct_alloc ||
+           fsck_err(c, "%s", buf.buf)) {
+               struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k);
+
+               ret   = PTR_ERR_OR_ZERO(a) ?:
+                       bch2_bucket_backpointer_mod(trans, a, bp, orig_k, true);
+       }
+
+       goto out;
+}
+
+static int check_extent_to_backpointers(struct btree_trans *trans,
+                                       struct btree_iter *iter)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_ptrs_c ptrs;
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+       struct bkey_s_c k;
+       int ret;
+
+       k = bch2_btree_iter_peek_all_levels(iter);
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+       if (!k.k)
+               return 0;
+
+       ptrs = bch2_bkey_ptrs_c(k);
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               struct bpos bucket_pos;
+               struct bch_backpointer bp;
+
+               if (p.ptr.cached)
+                       continue;
+
+               bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+                                     k, p, &bucket_pos, &bp);
+
+               ret = check_bp_exists(trans, bucket_pos, bp, k);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static int check_btree_root_to_backpointers(struct btree_trans *trans,
+                                           enum btree_id btree_id)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct btree *b;
+       struct bkey_s_c k;
+       struct bkey_ptrs_c ptrs;
+       struct extent_ptr_decoded p;
+       const union bch_extent_entry *entry;
+       int ret;
+
+       bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+                                 c->btree_roots[btree_id].level, 0);
+       b = bch2_btree_iter_peek_node(&iter);
+       ret = PTR_ERR_OR_ZERO(b);
+       if (ret)
+               goto err;
+
+       BUG_ON(b != btree_node_root(c, b));
+
+       k = bkey_i_to_s_c(&b->key);
+       ptrs = bch2_bkey_ptrs_c(k);
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               struct bpos bucket_pos;
+               struct bch_backpointer bp;
+
+               if (p.ptr.cached)
+                       continue;
+
+               bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1,
+                                     k, p, &bucket_pos, &bp);
+
+               ret = check_bp_exists(trans, bucket_pos, bp, k);
+               if (ret)
+                       goto err;
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_check_extents_to_backpointers(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       enum btree_id btree_id;
+       int ret = 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
+       for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+               unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+
+               bch2_trans_node_iter_init(&trans, &iter, btree_id, POS_MIN, 0,
+                                         depth,
+                                         BTREE_ITER_ALL_LEVELS|
+                                         BTREE_ITER_PREFETCH);
+
+               do {
+                       ret = commit_do(&trans, NULL, NULL,
+                                             BTREE_INSERT_LAZY_RW|
+                                             BTREE_INSERT_NOFAIL,
+                                             check_extent_to_backpointers(&trans, &iter));
+                       if (ret)
+                               break;
+               } while (!bch2_btree_iter_advance(&iter));
+
+               bch2_trans_iter_exit(&trans, &iter);
+
+               if (ret)
+                       break;
+
+               ret = commit_do(&trans, NULL, NULL,
+                                     BTREE_INSERT_LAZY_RW|
+                                     BTREE_INSERT_NOFAIL,
+                                     check_btree_root_to_backpointers(&trans, btree_id));
+               if (ret)
+                       break;
+       }
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
+static int check_one_backpointer(struct btree_trans *trans,
+                                struct bpos bucket,
+                                u64 *bp_offset)
+{
+       struct btree_iter iter;
+       struct bch_backpointer bp;
+       struct bkey_s_c k;
+       struct printbuf buf = PRINTBUF;
+       int ret;
+
+       ret = bch2_get_next_backpointer(trans, bucket, -1,
+                                       bp_offset, &bp);
+       if (ret || *bp_offset == U64_MAX)
+               return ret;
+
+       k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp);
+       ret = bkey_err(k);
+       if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+               return 0;
+       if (ret)
+               return ret;
+
+       if (fsck_err_on(!k.k, trans->c,
+                       "%s backpointer points to missing extent\n%s",
+                       *bp_offset < BACKPOINTER_OFFSET_MAX ? "alloc" : "btree",
+                       (bch2_backpointer_to_text(&buf, &bp), buf.buf))) {
+               ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp);
+               if (ret == -ENOENT)
+                       bch_err(trans->c, "backpointer at %llu not found", *bp_offset);
+       }
+
+       bch2_trans_iter_exit(trans, &iter);
+fsck_err:
+       printbuf_exit(&buf);
+       return ret;
+}
+
+int bch2_check_backpointers_to_extents(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
+       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               u64 bp_offset = 0;
+
+               while (!(ret = commit_do(&trans, NULL, NULL,
+                                              BTREE_INSERT_LAZY_RW|
+                                              BTREE_INSERT_NOFAIL,
+                               check_one_backpointer(&trans, iter.pos, &bp_offset))) &&
+                      bp_offset < U64_MAX)
+                       bp_offset++;
+
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_exit(&trans);
+       return ret < 0 ? ret : 0;
+}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
new file mode 100644 (file)
index 0000000..e150649
--- /dev/null
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "super.h"
+
+int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
+                            int, struct printbuf *);
+void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
+void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_backpointer_swab(struct bkey_s);
+
+#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
+       .key_invalid    = bch2_backpointer_invalid,     \
+       .val_to_text    = bch2_backpointer_k_to_text,   \
+       .swab           = bch2_backpointer_swab,        \
+})
+
+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT                10
+
+/*
+ * Convert from pos in backpointer btree to pos of corresponding bucket in alloc
+ * btree:
+ */
+static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c,
+                                          struct bpos bp_pos)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode);
+       u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+       return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
+}
+
+/*
+ * Convert from pos in alloc btree + bucket offset to pos in backpointer btree:
+ */
+static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
+                                          struct bpos bucket,
+                                          u64 bucket_offset)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+       struct bpos ret;
+
+       ret = POS(bucket.inode,
+                 (bucket_to_sector(ca, bucket.offset) <<
+                  MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+
+       BUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
+
+       return ret;
+}
+
+bool bch2_bucket_backpointer_del(struct btree_trans *,
+                               struct bkey_i_alloc_v4 *,
+                               struct bch_backpointer);
+
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *,
+                               struct bkey_i_alloc_v4 *,
+                               struct bch_backpointer, struct bkey_s_c, bool);
+
+static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
+                               struct bkey_i_alloc_v4 *a,
+                               struct bch_backpointer bp,
+                               struct bkey_s_c orig_k,
+                               bool insert)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_i_backpointer *bp_k;
+       int ret;
+
+       if (!insert &&
+           unlikely(BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v)) &&
+           bch2_bucket_backpointer_del(trans, a, bp))
+               return 0;
+
+       if (unlikely(bch2_backpointers_no_use_write_buffer))
+               return bch2_bucket_backpointer_mod_nowritebuffer(trans, a, bp, orig_k, insert);
+
+       bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
+       ret = PTR_ERR_OR_ZERO(bp_k);
+       if (ret)
+               return ret;
+
+       bkey_backpointer_init(&bp_k->k_i);
+       bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset);
+       bp_k->v = bp;
+
+       if (!insert) {
+               bp_k->k.type = KEY_TYPE_deleted;
+               set_bkey_val_u64s(&bp_k->k, 0);
+       }
+
+       return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i);
+}
+
+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
+                          enum btree_id btree_id, unsigned level,
+                          struct bkey_s_c k, struct extent_ptr_decoded p,
+                          struct bpos *bucket_pos, struct bch_backpointer *bp)
+{
+       enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user;
+       s64 sectors = level ? btree_sectors(c) : k.k->size;
+       u32 bucket_offset;
+
+       *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset);
+       *bp = (struct bch_backpointer) {
+               .btree_id       = btree_id,
+               .level          = level,
+               .data_type      = data_type,
+               .bucket_offset  = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
+                       p.crc.offset,
+               .bucket_len     = ptr_disk_sectors(sectors, p),
+               .pos            = k.k->p,
+       };
+}
+
+int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
+                             u64 *, struct bch_backpointer *);
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
+                                        struct bpos, u64, struct bch_backpointer);
+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
+                                       struct bpos, u64, struct bch_backpointer);
+
+int bch2_check_btree_backpointers(struct bch_fs *);
+int bch2_check_extents_to_backpointers(struct bch_fs *);
+int bch2_check_backpointers_to_extents(struct bch_fs *);
+
+#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
index 91f635faccb06232cd2de331d535a3ff0d281ec6..6d048e5d8843de18f0bc83cb11ab10316a96206a 100644 (file)
@@ -315,7 +315,10 @@ do {                                                                       \
                "done in memory")                                       \
        BCH_DEBUG_PARAM(verify_all_btree_replicas,                      \
                "When reading btree nodes, read all replicas and "      \
-               "compare them")
+               "compare them")                                         \
+       BCH_DEBUG_PARAM(backpointers_no_use_write_buffer,               \
+               "Don't use the write buffer for backpointers, enabling "\
+               "extra runtime checks")
 
 /* Parameters that should only be compiled in debug mode: */
 #define BCH_DEBUG_PARAMS_DEBUG()                                       \
@@ -435,6 +438,7 @@ enum gc_phase {
        GC_PHASE_BTREE_lru,
        GC_PHASE_BTREE_freespace,
        GC_PHASE_BTREE_need_discard,
+       GC_PHASE_BTREE_backpointers,
 
        GC_PHASE_PENDING_DELETE,
 };
@@ -552,6 +556,7 @@ enum {
        BCH_FS_INITIAL_GC_DONE,         /* kill when we enumerate fsck passes */
        BCH_FS_CHECK_ALLOC_DONE,
        BCH_FS_CHECK_LRUS_DONE,
+       BCH_FS_CHECK_BACKPOINTERS_DONE,
        BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE,
        BCH_FS_FSCK_DONE,
        BCH_FS_INITIAL_GC_UNFIXED,      /* kill when we enumerate fsck errors */
index 8e070402e73f131293970e673c45c8eacab829c9..66c88518616016f465071e960dc5259e7ca7b14e 100644 (file)
@@ -369,7 +369,8 @@ static inline void bkey_init(struct bkey *k)
        x(alloc_v3,             24)                     \
        x(set,                  25)                     \
        x(lru,                  26)                     \
-       x(alloc_v4,             27)
+       x(alloc_v4,             27)                     \
+       x(backpointer,          28)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -890,6 +891,12 @@ struct bch_alloc {
        x(stripe,               32)             \
        x(stripe_redundancy,    8)
 
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+       BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
 struct bch_alloc_v2 {
        struct bch_val          v;
        __u8                    nr_fields;
@@ -918,6 +925,9 @@ struct bch_alloc_v3 {
        __u8                    data[];
 } __packed __aligned(8);
 
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags,  0,  1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags,  1,  2)
+
 struct bch_alloc_v4 {
        struct bch_val          v;
        __u64                   journal_seq;
@@ -931,25 +941,27 @@ struct bch_alloc_v4 {
        __u64                   io_time[2];
        __u32                   stripe;
        __u32                   nr_external_backpointers;
-       struct bpos             backpointers[0];
 } __packed __aligned(8);
 
 #define BCH_ALLOC_V4_U64s_V0   6
 #define BCH_ALLOC_V4_U64s      (sizeof(struct bch_alloc_v4) / sizeof(u64))
 
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags,  0,  1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags,  1,  2)
-
 BITMASK(BCH_ALLOC_V4_NEED_DISCARD,     struct bch_alloc_v4, flags,  0,  1)
 BITMASK(BCH_ALLOC_V4_NEED_INC_GEN,     struct bch_alloc_v4, flags,  1,  2)
 BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags,  2,  8)
 BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS,  struct bch_alloc_v4, flags,  8,  14)
 
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
-       BCH_ALLOC_FIELDS_V1()
-#undef x
-};
+#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX       40
+
+struct bch_backpointer {
+       struct bch_val          v;
+       __u8                    btree_id;
+       __u8                    level;
+       __u8                    data_type;
+       __u64                   bucket_offset:40;
+       __u32                   bucket_len;
+       struct bpos             pos;
+} __packed __aligned(8);
 
 /* Quotas: */
 
@@ -1486,7 +1498,8 @@ struct bch_sb_field_journal_seq_blacklist {
        x(inode_v2,                     18)             \
        x(freespace,                    19)             \
        x(alloc_v4,                     20)             \
-       x(new_data_types,               21)
+       x(new_data_types,               21)             \
+       x(backpointers,                 22)
 
 enum bcachefs_metadata_version {
        bcachefs_metadata_version_min = 9,
@@ -2007,19 +2020,20 @@ LE32_BITMASK(JSET_NO_FLUSH,     struct jset, flags, 5, 6);
 /* Btree: */
 
 #define BCH_BTREE_IDS()                                \
-       x(extents,      0)                      \
-       x(inodes,       1)                      \
-       x(dirents,      2)                      \
-       x(xattrs,       3)                      \
-       x(alloc,        4)                      \
-       x(quotas,       5)                      \
-       x(stripes,      6)                      \
-       x(reflink,      7)                      \
-       x(subvolumes,   8)                      \
-       x(snapshots,    9)                      \
-       x(lru,          10)                     \
-       x(freespace,    11)                     \
-       x(need_discard, 12)
+       x(extents,              0)              \
+       x(inodes,               1)              \
+       x(dirents,              2)              \
+       x(xattrs,               3)              \
+       x(alloc,                4)              \
+       x(quotas,               5)              \
+       x(stripes,              6)              \
+       x(reflink,              7)              \
+       x(subvolumes,           8)              \
+       x(snapshots,            9)              \
+       x(lru,                  10)             \
+       x(freespace,            11)             \
+       x(need_discard,         12)             \
+       x(backpointers,         13)
 
 enum btree_id {
 #define x(kwd, val) BTREE_ID_##kwd = val,
index 29809da5e9cf229e7f2365d963f3133faf5d3ee5..45c8b2c61c5b912fb911bcf1de3af0a1e2cb8257 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "backpointers.h"
 #include "bkey_methods.h"
 #include "btree_types.h"
 #include "alloc_background.h"
@@ -191,6 +192,9 @@ static unsigned bch2_key_types_allowed[] = {
        [BKEY_TYPE_need_discard] =
                (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_set),
+       [BKEY_TYPE_backpointers] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_backpointer),
        [BKEY_TYPE_btree] =
                (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_btree_ptr)|
index 86f48f5762dd4067e148599fc40380a85d2e5e0e..b657f8545a3b96fb13fe3456ef9a0eb2e65556bb 100644 (file)
@@ -7,6 +7,7 @@
 
 #include "bcachefs.h"
 #include "alloc_background.h"
+#include "backpointers.h"
 #include "bset.h"
 #include "btree_gc.h"
 #include "btree_update.h"
@@ -662,16 +663,6 @@ err:
        return ret;
 }
 
-static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
-{
-       EBUG_ON(sectors < 0);
-
-       return crc_is_compressed(p.crc)
-               ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
-                                  p.crc.uncompressed_size)
-               : sectors;
-}
-
 static int check_bucket_ref(struct bch_fs *c,
                            struct bkey_s_c k,
                            const struct bch_extent_ptr *ptr,
@@ -1399,22 +1390,42 @@ need_mark:
 
 /* trans_mark: */
 
-static int bch2_trans_mark_pointer(struct btree_trans *trans,
-                       struct bkey_s_c k, struct extent_ptr_decoded p,
-                       s64 sectors, enum bch_data_type data_type)
+static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
+                                  enum btree_id btree_id, unsigned level,
+                                  struct bkey_s_c k, struct extent_ptr_decoded p,
+                                  unsigned flags)
 {
+       bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
        struct btree_iter iter;
        struct bkey_i_alloc_v4 *a;
+       struct bpos bucket_pos;
+       struct bch_backpointer bp;
+       s64 sectors;
        int ret;
 
-       a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(trans->c, &p.ptr));
+       bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp);
+       sectors = bp.bucket_len;
+       if (!insert)
+               sectors = -sectors;
+
+       a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos);
        if (IS_ERR(a))
                return PTR_ERR(a);
 
-       ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type,
+       ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
                             a->v.gen, &a->v.data_type,
-                            &a->v.dirty_sectors, &a->v.cached_sectors) ?:
-               bch2_trans_update(trans, &iter, &a->k_i, 0);
+                            &a->v.dirty_sectors, &a->v.cached_sectors);
+       if (ret)
+               goto err;
+
+       if (!p.ptr.cached) {
+               ret = bch2_bucket_backpointer_mod(trans, a, bp, k, insert);
+               if (ret)
+                       goto err;
+       }
+
+       ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -1497,8 +1508,7 @@ int bch2_trans_mark_extent(struct btree_trans *trans,
                if (flags & BTREE_TRIGGER_OVERWRITE)
                        disk_sectors = -disk_sectors;
 
-               ret = bch2_trans_mark_pointer(trans, k, p,
-                                       disk_sectors, data_type);
+               ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
                if (ret < 0)
                        return ret;
 
index e8e3a3b097146d3e685db1b1662670a5469b7e2a..3398c9c3a81b353cbb1fc3bfc15d34f5c4612274 100644 (file)
@@ -75,6 +75,15 @@ static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c,
        return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
 }
 
+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c,
+                                               const struct bch_extent_ptr *ptr,
+                                               u32 *bucket_offset)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+       return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset));
+}
+
 static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
                                           const struct bch_extent_ptr *ptr)
 {
@@ -90,6 +99,16 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k,
        return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
 }
 
+static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
+{
+       EBUG_ON(sectors < 0);
+
+       return crc_is_compressed(p.crc)
+               ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
+                                  p.crc.uncompressed_size)
+               : sectors;
+}
+
 static inline int gen_cmp(u8 a, u8 b)
 {
        return (s8) (a - b);
index 7a6448f48fcaf5d2b76ded915f9abc51ae1d32b9..804bc15dce3196ebb1ff81a35723bf751911985a 100644 (file)
@@ -60,6 +60,7 @@
        x(BCH_ERR_btree_insert_fail,    btree_insert_need_journal_res)          \
        x(BCH_ERR_btree_insert_fail,    btree_insert_need_journal_reclaim)      \
        x(BCH_ERR_btree_insert_fail,    btree_insert_need_flush_buffer)         \
+       x(0,                            backpointer_to_overwritten_btree_node)  \
        x(0,                            lock_fail_root_changed)                 \
        x(0,                            journal_reclaim_would_deadlock)         \
        x(0,                            fsck)                                   \
index 61890755d33517de71a639c779fb9cf40bca16a9..55356c117737c53a1c33f9a32ca470819496fc63 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "backpointers.h"
 #include "bkey_buf.h"
 #include "alloc_background.h"
 #include "btree_gc.h"
@@ -925,6 +926,7 @@ static bool btree_id_is_alloc(enum btree_id id)
 {
        switch (id) {
        case BTREE_ID_alloc:
+       case BTREE_ID_backpointers:
        case BTREE_ID_need_discard:
        case BTREE_ID_freespace:
                return true;
@@ -1091,8 +1093,8 @@ int bch2_fs_recovery(struct bch_fs *c)
        }
 
        if (!c->opts.nochanges) {
-               if (c->sb.version < bcachefs_metadata_version_new_data_types) {
-                       bch_info(c, "version prior to new_data_types, upgrade and fsck required");
+               if (c->sb.version < bcachefs_metadata_version_backpointers) {
+                       bch_info(c, "version prior to backpointers, upgrade and fsck required");
                        c->opts.version_upgrade = true;
                        c->opts.fsck            = true;
                        c->opts.fix_errors      = FSCK_OPT_YES;
@@ -1301,6 +1303,28 @@ use_clean:
                bch_verbose(c, "done checking lrus");
                set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
 
+               bch_info(c, "checking backpointers to alloc keys");
+               err = "error checking backpointers to alloc keys";
+               ret = bch2_check_btree_backpointers(c);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "done checking backpointers to alloc keys");
+
+               bch_info(c, "checking backpointers to extents");
+               err = "error checking backpointers to extents";
+               ret = bch2_check_backpointers_to_extents(c);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "done checking backpointers to extents");
+
+               bch_info(c, "checking extents to backpointers");
+               err = "error checking extents to backpointers";
+               ret = bch2_check_extents_to_backpointers(c);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "done checking extents to backpointers");
+               set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
+
                bch_info(c, "checking alloc to lru refs");
                err = "error checking alloc to lru refs";
                ret = bch2_check_alloc_to_lru_refs(c);
@@ -1312,6 +1336,7 @@ use_clean:
                set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
                set_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags);
                set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+               set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
                set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
                set_bit(BCH_FS_FSCK_DONE, &c->flags);
 
@@ -1471,6 +1496,9 @@ int bch2_fs_initialize(struct bch_fs *c)
        c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
        c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
 
+       if (c->sb.version < bcachefs_metadata_version_backpointers)
+               c->opts.version_upgrade = true;
+
        if (c->opts.version_upgrade) {
                c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
                c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
@@ -1479,6 +1507,9 @@ int bch2_fs_initialize(struct bch_fs *c)
        mutex_unlock(&c->sb_lock);
 
        set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+       set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+       set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags);
+       set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
        set_bit(BCH_FS_MAY_GO_RW, &c->flags);
        set_bit(BCH_FS_FSCK_DONE, &c->flags);
 
index ade8d074e88709eaffbfd6830eb827181ae896ab..c5efaa7d38a8bf6f3b1c7c468601640e70ebe2cb 100644 (file)
@@ -1431,6 +1431,8 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
                                        BTREE_TRIGGER_NORUN, NULL) ?:
                bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
                                        BTREE_TRIGGER_NORUN, NULL) ?:
+               bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end,
+                                       BTREE_TRIGGER_NORUN, NULL) ?:
                bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
                                        BTREE_TRIGGER_NORUN, NULL);
        if (ret)