bcachefs: Fix for bad stripe pointers
authorKent Overstreet <kent.overstreet@gmail.com>
Tue, 20 Oct 2020 02:36:24 +0000 (22:36 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:44 +0000 (17:08 -0400)
The allocator usually doesn't increment bucket gens right away on
buckets that it's about to hand out (for reasons that need to be
documented), instead deferring that to whatever extent update first
references that bucket.

But stripe pointers reference buckets without changing bucket sector
counts, meaning we could end up with a pointer in a stripe with a gen
newer than the bucket it points to.

Fix this by adding a transactional trigger for KEY_TYPE_stripe that just
writes out the keys in the alloc btree for the buckets it points to.

Also - consolidate the code that checks pointer validity.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_background.c
fs/bcachefs/alloc_background.h
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/buckets.c

index b0448d2f1916548f19427d8132770f5abebd999f..8f0c1f378b776514982f5ead7b4319e90f3bd90a 100644 (file)
@@ -497,8 +497,6 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
  * commands to the newly free buckets, then puts them on the various freelists.
  */
 
-#define BUCKET_GC_GEN_MAX      96U
-
 /**
  * wait_buckets_available - wait on reclaimable buckets
  *
index 56a846fde8dd8d1855586f16e72a0a84f6beb454..66ce54724e939a8ad18e5d875c4f99cbaa0d2855 100644 (file)
@@ -13,6 +13,9 @@ struct bkey_alloc_unpacked {
 #undef  x
 };
 
+/* How out of date a pointer gen is allowed to be: */
+#define BUCKET_GC_GEN_MAX      96U
+
 /* returns true if not equal */
 static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
                                           struct bkey_alloc_unpacked r)
index b295e46de05904d3f58fbf1455b09fff12a6012d..f02518f9d9ec49f93f21c69c290df257c65481c8 100644 (file)
@@ -591,6 +591,7 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter)
 #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS             \
        ((1U << BKEY_TYPE_EXTENTS)|                     \
         (1U << BKEY_TYPE_INODES)|                      \
+        (1U << BKEY_TYPE_EC)|                          \
         (1U << BKEY_TYPE_REFLINK))
 
 enum btree_trigger_flags {
index a4a5e084aad3d9ba458789d8a68142b9cfaa40f6..9c33a8be2c580517dfb8f760dbe07a2397a6a42a 100644 (file)
@@ -337,8 +337,9 @@ static inline bool iter_has_trans_triggers(struct btree_iter *iter)
 
 static inline bool iter_has_nontrans_triggers(struct btree_iter *iter)
 {
-       return (BTREE_NODE_TYPE_HAS_TRIGGERS &
-               ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) &
+       return (((BTREE_NODE_TYPE_HAS_TRIGGERS &
+                 ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) |
+               (1U << BTREE_ID_EC)) &
                (1U << iter->btree_id);
 }
 
index 7bc51f397c7b1deba11480e9b4735fea89e3f552..80d11decb71ebb871ea7e3d719ed34bdd26d33b3 100644 (file)
@@ -883,124 +883,140 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
                                        p.crc.uncompressed_size);
 }
 
-static void bucket_set_stripe(struct bch_fs *c,
-                             const struct bch_extent_ptr *ptr,
-                             struct bch_fs_usage *fs_usage,
-                             u64 journal_seq,
-                             unsigned flags,
-                             bool enabled)
-{
-       bool gc = flags & BTREE_TRIGGER_GC;
-       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-       struct bucket *g = PTR_BUCKET(ca, ptr, gc);
-       struct bucket_mark new, old;
-
-       old = bucket_cmpxchg(g, new, ({
-               new.stripe                      = enabled;
-               if (journal_seq) {
-                       new.journal_seq_valid   = 1;
-                       new.journal_seq         = journal_seq;
-               }
-       }));
-
-       bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
-
-       /*
-        * XXX write repair code for these, flag stripe as possibly bad
-        */
-       if (old.gen != ptr->gen)
-               bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-                             "stripe with stale pointer");
-#if 0
-       /*
-        * We'd like to check for these, but these checks don't work
-        * yet:
-        */
-       if (old.stripe && enabled)
-               bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-                             "multiple stripes using same bucket");
-
-       if (!old.stripe && !enabled)
-               bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-                             "deleting stripe but bucket not marked as stripe bucket");
-#endif
-}
-
-static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
-                         struct extent_ptr_decoded p,
-                         s64 sectors, enum bch_data_type ptr_data_type,
-                         u8 bucket_gen, u8 *bucket_data_type,
-                         u16 *dirty_sectors, u16 *cached_sectors)
-{
-       u16 *dst_sectors = !p.ptr.cached
+static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
+                           const struct bch_extent_ptr *ptr,
+                           s64 sectors, enum bch_data_type ptr_data_type,
+                           u8 bucket_gen, u8 bucket_data_type,
+                           u16 dirty_sectors, u16 cached_sectors)
+{
+       size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr);
+       u16 bucket_sectors = !ptr->cached
                ? dirty_sectors
                : cached_sectors;
-       u16 orig_sectors = *dst_sectors;
        char buf[200];
 
-       if (gen_after(p.ptr.gen, bucket_gen)) {
+       if (gen_after(ptr->gen, bucket_gen)) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
                        "while marking %s",
-                       p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
-                       bucket_gen,
-                       bch2_data_types[*bucket_data_type ?: ptr_data_type],
-                       p.ptr.gen,
+                       ptr->dev, bucket_nr, bucket_gen,
+                       bch2_data_types[bucket_data_type ?: ptr_data_type],
+                       ptr->gen,
                        (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
                return -EIO;
        }
 
-       if (gen_cmp(bucket_gen, p.ptr.gen) > 96U) {
+       if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
                        "while marking %s",
-                       p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
-                       bucket_gen,
-                       bch2_data_types[*bucket_data_type ?: ptr_data_type],
-                       p.ptr.gen,
+                       ptr->dev, bucket_nr, bucket_gen,
+                       bch2_data_types[bucket_data_type ?: ptr_data_type],
+                       ptr->gen,
                        (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
                return -EIO;
        }
 
-       if (bucket_gen != p.ptr.gen && !p.ptr.cached) {
+       if (bucket_gen != ptr->gen && !ptr->cached) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
                        "while marking %s",
-                       p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
-                       bucket_gen,
-                       bch2_data_types[*bucket_data_type ?: ptr_data_type],
-                       p.ptr.gen,
+                       ptr->dev, bucket_nr, bucket_gen,
+                       bch2_data_types[bucket_data_type ?: ptr_data_type],
+                       ptr->gen,
                        (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
                return -EIO;
        }
 
-       if (bucket_gen != p.ptr.gen)
+       if (bucket_gen != ptr->gen)
                return 1;
 
-       if (*bucket_data_type && *bucket_data_type != ptr_data_type) {
+       if (bucket_data_type && ptr_data_type &&
+           bucket_data_type != ptr_data_type) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
                        "while marking %s",
-                       p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
-                       bucket_gen,
-                       bch2_data_types[*bucket_data_type],
+                       ptr->dev, bucket_nr, bucket_gen,
+                       bch2_data_types[bucket_data_type],
                        bch2_data_types[ptr_data_type],
                        (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
                return -EIO;
        }
 
-       if (checked_add(*dst_sectors, sectors)) {
+       if ((unsigned) (bucket_sectors + sectors) > U16_MAX) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
                        "while marking %s",
-                       p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
-                       bucket_gen,
-                       bch2_data_types[*bucket_data_type ?: ptr_data_type],
-                       orig_sectors, sectors,
+                       ptr->dev, bucket_nr, bucket_gen,
+                       bch2_data_types[bucket_data_type ?: ptr_data_type],
+                       bucket_sectors, sectors,
                        (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
                return -EIO;
        }
 
+       return 0;
+}
+
+static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
+                            const struct bch_extent_ptr *ptr,
+                            struct bch_fs_usage *fs_usage,
+                            u64 journal_seq,
+                            unsigned flags,
+                            bool enabled)
+{
+       bool gc = flags & BTREE_TRIGGER_GC;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+       struct bucket *g = PTR_BUCKET(ca, ptr, gc);
+       struct bucket_mark new, old;
+       char buf[200];
+       int ret;
+
+       old = bucket_cmpxchg(g, new, ({
+               ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
+                                      new.dirty_sectors, new.cached_sectors);
+               if (ret)
+                       return ret;
+
+               if (new.stripe && enabled)
+                       bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+                                     "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+                                     ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
+                                     (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+
+               if (!new.stripe && !enabled)
+                       bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+                                     "bucket %u:%zu gen %u: deleting stripe but not marked\n%s",
+                                     ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
+                                     (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+
+               new.stripe                      = enabled;
+               if (journal_seq) {
+                       new.journal_seq_valid   = 1;
+                       new.journal_seq         = journal_seq;
+               }
+       }));
+
+       bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+       return 0;
+}
+
+static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
+                         const struct bch_extent_ptr *ptr,
+                         s64 sectors, enum bch_data_type ptr_data_type,
+                         u8 bucket_gen, u8 *bucket_data_type,
+                         u16 *dirty_sectors, u16 *cached_sectors)
+{
+       u16 *dst_sectors = !ptr->cached
+               ? dirty_sectors
+               : cached_sectors;
+       int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type,
+                                  bucket_gen, *bucket_data_type,
+                                  *dirty_sectors, *cached_sectors);
+
+       if (ret)
+               return ret;
+
+       *dst_sectors += sectors;
        *bucket_data_type = *dirty_sectors || *cached_sectors
                ? ptr_data_type : 0;
        return 0;
@@ -1025,7 +1041,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
                new.v.counter = old.v.counter = v;
                bucket_data_type = new.data_type;
 
-               ret = __mark_pointer(c, k, p, sectors, data_type, new.gen,
+               ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen,
                                     &bucket_data_type,
                                     &new.dirty_sectors,
                                     &new.cached_sectors);
@@ -1189,6 +1205,7 @@ static int bch2_mark_stripe(struct bch_fs *c,
                ? bkey_s_c_to_stripe(new).v : NULL;
        struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
        unsigned i;
+       int ret;
 
        if (!m || (old_s && !m->alive)) {
                bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
@@ -1198,9 +1215,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
 
        if (!new_s) {
                /* Deleting: */
-               for (i = 0; i < old_s->nr_blocks; i++)
-                       bucket_set_stripe(c, old_s->ptrs + i, fs_usage,
-                                         journal_seq, flags, false);
+               for (i = 0; i < old_s->nr_blocks; i++) {
+                       ret = bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage,
+                                               journal_seq, flags, false);
+                       if (ret)
+                               return ret;
+               }
 
                if (!gc && m->on_heap) {
                        spin_lock(&c->ec_stripes_heap_lock);
@@ -1219,11 +1239,16 @@ static int bch2_mark_stripe(struct bch_fs *c,
                                   old_s->ptrs + i,
                                   sizeof(struct bch_extent_ptr))) {
 
-                               if (old_s)
-                                       bucket_set_stripe(c, old_s->ptrs + i, fs_usage,
+                               if (old_s) {
+                                       bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage,
                                                          journal_seq, flags, false);
-                               bucket_set_stripe(c, new_s->ptrs + i, fs_usage,
-                                                 journal_seq, flags, true);
+                                       if (ret)
+                                               return ret;
+                               }
+                               ret = bucket_set_stripe(c, new, new_s->ptrs + i, fs_usage,
+                                                       journal_seq, flags, true);
+                               if (ret)
+                                       return ret;
                        }
                }
 
@@ -1549,23 +1574,21 @@ static int trans_get_key(struct btree_trans *trans,
        return ret;
 }
 
-static int bch2_trans_mark_pointer(struct btree_trans *trans,
-                       struct bkey_s_c k, struct extent_ptr_decoded p,
-                       s64 sectors, enum bch_data_type data_type)
+static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
+                                        const struct bch_extent_ptr *ptr,
+                                        struct bkey_alloc_unpacked *u)
 {
        struct bch_fs *c = trans->c;
-       struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
-       struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr));
-       struct btree_iter *iter;
-       struct bkey_s_c k_a;
-       struct bkey_alloc_unpacked u;
-       struct bkey_i_alloc *a;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+       struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
        struct bucket *g;
+       struct btree_iter *iter;
+       struct bkey_s_c k;
        int ret;
 
-       iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a);
+       iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k);
        if (iter) {
-               u = bch2_alloc_unpack(k_a);
+               *u = bch2_alloc_unpack(k);
        } else {
                iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos,
                                           BTREE_ITER_CACHED|
@@ -1575,16 +1598,36 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
                        return PTR_ERR(iter);
 
                ret = bch2_btree_iter_traverse(iter);
-               if (ret)
-                       goto out;
+               if (ret) {
+                       bch2_trans_iter_put(trans, iter);
+                       return ret;
+               }
 
                percpu_down_read(&c->mark_lock);
                g = bucket(ca, pos.offset);
-               u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+               *u = alloc_mem_to_key(g, READ_ONCE(g->mark));
                percpu_up_read(&c->mark_lock);
        }
 
-       ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type,
+       *_iter = iter;
+       return 0;
+}
+
+static int bch2_trans_mark_pointer(struct btree_trans *trans,
+                       struct bkey_s_c k, struct extent_ptr_decoded p,
+                       s64 sectors, enum bch_data_type data_type)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter *iter;
+       struct bkey_alloc_unpacked u;
+       struct bkey_i_alloc *a;
+       int ret;
+
+       ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
+       if (ret)
+               return ret;
+
+       ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
                             &u.dirty_sectors, &u.cached_sectors);
        if (ret)
                goto out;
@@ -1595,7 +1638,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
                goto out;
 
        bkey_alloc_init(&a->k_i);
-       a->k.p = pos;
+       a->k.p = iter->pos;
        bch2_alloc_pack(a, u);
        bch2_trans_update(trans, iter, &a->k_i, 0);
 out:
@@ -1716,6 +1759,44 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
        return 0;
 }
 
+static int bch2_trans_mark_stripe(struct btree_trans *trans,
+                                 struct bkey_s_c k)
+{
+       const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+       struct bkey_alloc_unpacked u;
+       struct bkey_i_alloc *a;
+       struct btree_iter *iter;
+       unsigned i;
+       int ret = 0;
+
+       /*
+        * The allocator code doesn't necessarily update bucket gens in the
+        * btree when incrementing them, right before handing out new buckets -
+        * we just need to persist those updates here along with the new stripe:
+        */
+
+       for (i = 0; i < s->nr_blocks && !ret; i++) {
+               ret = bch2_trans_start_alloc_update(trans, &iter,
+                                                   &s->ptrs[i], &u);
+               if (ret)
+                       break;
+
+               a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
+               ret = PTR_ERR_OR_ZERO(a);
+               if (ret)
+                       goto put_iter;
+
+               bkey_alloc_init(&a->k_i);
+               a->k.p = iter->pos;
+               bch2_alloc_pack(a, u);
+               bch2_trans_update(trans, iter, &a->k_i, 0);
+put_iter:
+               bch2_trans_iter_put(trans, iter);
+       }
+
+       return ret;
+}
+
 static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
                        struct bkey_s_c_reflink_p p,
                        u64 idx, unsigned sectors,
@@ -1815,6 +1896,8 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
        case KEY_TYPE_reflink_v:
                return bch2_trans_mark_extent(trans, k, offset, sectors,
                                              flags, BCH_DATA_user);
+       case KEY_TYPE_stripe:
+               return bch2_trans_mark_stripe(trans, k);
        case KEY_TYPE_inode:
                d = replicas_deltas_realloc(trans, 0);