bcachefs: New and improved topology repair code
Author:     Kent Overstreet <kent.overstreet@gmail.com>
AuthorDate: Sat, 24 Apr 2021 20:32:35 +0000 (16:32 -0400)
Commit:     Kent Overstreet <kent.overstreet@linux.dev>
CommitDate: Sun, 22 Oct 2023 21:09:02 +0000 (17:09 -0400)
This splits out btree topology repair into a separate pass, and makes
some improvements:

 - When we have to pick which of two overlapping nodes to drop keys
   from, we use the btree node header sequence number to preserve the
   newer node (see the sketch after this list)

 - The gc code no longer bails out when we're continuing on/ignoring
   fsck errors - this way the dump tool can skip running the repair
   pass but still walk all reachable metadata

 - Add a new superblock flag indicating when a filesystem is known to
   have btree topology issues and the topology repair pass should be
   run (a toy model of the flag round-trip follows the super-io.c hunk
   below)

 - Changing the start/end of a node might mean keys in that node have
   to be deleted: this patch handles that better by splitting the
   deletion out into a separate function and running it explicitly in
   the topology repair code; previously those keys were only dropped
   when the btree node was read in
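
The tie-break between overlapping nodes can be illustrated with a toy
model - plain C with integer keys standing in for struct bpos, mirroring
the logic of btree_repair_node_start() in the btree_gc.c hunk below
rather than reproducing the kernel code itself:

    #include <stdint.h>

    struct node {
            uint64_t min_key, max_key;  /* stand-ins for struct bpos */
            uint64_t seq;               /* btree node header sequence number */
    };

    /* prev is NULL for the first child; expected_start is then the
     * parent's min_key, otherwise the successor of prev's max_key */
    static void repair_overlap(struct node *prev, struct node *cur,
                               uint64_t expected_start)
    {
            if (cur->min_key == expected_start)
                    return;                         /* topology consistent */

            if (prev &&
                expected_start > cur->min_key &&    /* true overlap */
                cur->seq > prev->seq)
                    /* cur is newer: trim the end of prev instead */
                    prev->max_key = cur->min_key - 1;
            else
                    /* close the gap or overlap by moving cur's start */
                    cur->min_key = expected_start;
    }

Only a true overlap with a newer cur trims prev; a gap between siblings
is always repaired by pulling cur's start key back.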

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/bcachefs.h
fs/bcachefs/bcachefs_format.h
fs/bcachefs/btree_gc.c
fs/bcachefs/btree_io.c
fs/bcachefs/btree_io.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/error.c
fs/bcachefs/error.h
fs/bcachefs/recovery.c
fs/bcachefs/super-io.c
fs/bcachefs/super.c

diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 64a9094ec748cffe91219d406a138a21f8f6ee13..323705f352dee80ff6d7d7ea7c929afb12a89520 100644
@@ -485,6 +485,7 @@ enum {
        BCH_FS_ALLOCATOR_RUNNING,
        BCH_FS_ALLOCATOR_STOPPING,
        BCH_FS_INITIAL_GC_DONE,
+       BCH_FS_INITIAL_GC_UNFIXED,
        BCH_FS_BTREE_INTERIOR_REPLAY_DONE,
        BCH_FS_FSCK_DONE,
        BCH_FS_STARTED,
@@ -498,7 +499,9 @@ enum {
 
        /* errors: */
        BCH_FS_ERROR,
+       BCH_FS_TOPOLOGY_ERROR,
        BCH_FS_ERRORS_FIXED,
+       BCH_FS_ERRORS_NOT_FIXED,
 
        /* misc: */
        BCH_FS_NEED_ANOTHER_GC,
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index f2b5f5c06ee047768084cb508e2c7a324c04b655..98a87e4928abf7d03fa95f53b68420d40ba0051a 100644
@@ -1317,12 +1317,10 @@ LE64_BITMASK(BCH_SB_GRPQUOTA,           struct bch_sb, flags[0], 58, 59);
 LE64_BITMASK(BCH_SB_PRJQUOTA,          struct bch_sb, flags[0], 59, 60);
 
 LE64_BITMASK(BCH_SB_HAS_ERRORS,                struct bch_sb, flags[0], 60, 61);
+LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
 
-/* bit 61 was reflink option */
 LE64_BITMASK(BCH_SB_BIG_ENDIAN,                struct bch_sb, flags[0], 62, 63);
 
-/* 61-64 unused */
-
 LE64_BITMASK(BCH_SB_STR_HASH_TYPE,     struct bch_sb, flags[1],  0,  4);
 LE64_BITMASK(BCH_SB_COMPRESSION_TYPE,  struct bch_sb, flags[1],  4,  8);
 LE64_BITMASK(BCH_SB_INODE_32BIT,       struct bch_sb, flags[1],  8,  9);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 6983a11979059cc40da42c97c3ba79c1902abbdd..09a49dc63144084068bfd3567be3a00f4560cd85 100644
@@ -66,8 +66,6 @@ static int bch2_gc_check_topology(struct bch_fs *c,
                ? node_start
                : bpos_successor(prev->k->k.p);
        char buf1[200], buf2[200];
-       bool update_min = false;
-       bool update_max = false;
        int ret = 0;
 
        if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
@@ -81,83 +79,341 @@ static int bch2_gc_check_topology(struct bch_fs *c,
                        bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
                }
 
-               if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c,
-                               "btree node with incorrect min_key at btree %s level %u:\n"
-                               "  prev %s\n"
-                               "  cur %s",
-                               bch2_btree_ids[b->c.btree_id], b->c.level,
-                               buf1,
-                               (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)))
-                       update_min = true;
+               if (bpos_cmp(expected_start, bp->v.min_key)) {
+                       bch2_topology_error(c);
+
+                       if (fsck_err(c, "btree node with incorrect min_key at btree %s level %u:\n"
+                                    "  prev %s\n"
+                                    "  cur %s",
+                                    bch2_btree_ids[b->c.btree_id], b->c.level,
+                                    buf1,
+                                    (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) {
+                               bch_info(c, "Halting mark and sweep to start topology repair pass");
+                               return FSCK_ERR_START_TOPOLOGY_REPAIR;
+                       } else {
+                               set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+                       }
+               }
+       }
+
+       if (is_last && bpos_cmp(cur.k->k.p, node_end)) {
+               bch2_topology_error(c);
+
+               if (fsck_err(c, "btree node with incorrect max_key at btree %s level %u:\n"
+                            "  %s\n"
+                            "  expected %s",
+                            bch2_btree_ids[b->c.btree_id], b->c.level,
+                            (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
+                            (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) {
+                       bch_info(c, "Halting mark and sweep to start topology repair pass");
+                       return FSCK_ERR_START_TOPOLOGY_REPAIR;
+               } else {
+                       set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+               }
+       }
+
+       bch2_bkey_buf_copy(prev, c, cur.k);
+fsck_err:
+       return ret;
+}
+
+static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
+{
+       switch (b->key.k.type) {
+       case KEY_TYPE_btree_ptr: {
+               struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
+
+               dst->k.p                = src->k.p;
+               dst->v.mem_ptr          = 0;
+               dst->v.seq              = b->data->keys.seq;
+               dst->v.sectors_written  = 0;
+               dst->v.flags            = 0;
+               dst->v.min_key          = b->data->min_key;
+               set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
+               memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
+               break;
+       }
+       case KEY_TYPE_btree_ptr_v2:
+               bkey_copy(&dst->k_i, &b->key);
+               break;
+       default:
+               BUG();
+       }
+}
+
+static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
+{
+       struct bkey_i_btree_ptr_v2 *new;
+       int ret;
+
+       new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL);
+       if (!new)
+               return -ENOMEM;
+
+       btree_ptr_to_v2(b, new);
+       b->data->min_key        = new_min;
+       new->v.min_key          = new_min;
+       SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+
+       ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+       if (ret) {
+               kfree(new);
+               return ret;
+       }
+
+       bch2_btree_node_drop_keys_outside_node(b);
+
+       return 0;
+}
+
+static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
+{
+       struct bkey_i_btree_ptr_v2 *new;
+       int ret;
+
+       ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
+       if (ret)
+               return ret;
+
+       new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL);
+       if (!new)
+               return -ENOMEM;
+
+       btree_ptr_to_v2(b, new);
+       b->data->max_key        = new_max;
+       new->k.p                = new_max;
+       SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+
+       ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+       if (ret) {
+               kfree(new);
+               return ret;
+       }
+
+       bch2_btree_node_drop_keys_outside_node(b);
+
+       mutex_lock(&c->btree_cache.lock);
+       bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+       bkey_copy(&b->key, &new->k_i);
+       ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+       BUG_ON(ret);
+       mutex_unlock(&c->btree_cache.lock);
+       return 0;
+}
+
+static int btree_repair_node_start(struct bch_fs *c, struct btree *b,
+                                  struct btree *prev, struct btree *cur)
+{
+       struct bpos expected_start = !prev
+               ? b->data->min_key
+               : bpos_successor(prev->key.k.p);
+       char buf1[200], buf2[200];
+       int ret = 0;
+
+       if (!prev) {
+               struct printbuf out = PBUF(buf1);
+               pr_buf(&out, "start of node: ");
+               bch2_bpos_to_text(&out, b->data->min_key);
+       } else {
+               bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key));
+       }
+
+       if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c,
+                       "btree node with incorrect min_key at btree %s level %u:\n"
+                       "  prev %s\n"
+                       "  cur %s",
+                       bch2_btree_ids[b->c.btree_id], b->c.level,
+                       buf1,
+                       (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)), buf2))) {
+               if (prev &&
+                   bpos_cmp(expected_start, cur->data->min_key) > 0 &&
+                   BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data))
+                       ret = set_node_max(c, prev,
+                               bpos_predecessor(cur->data->min_key));
+               else
+                       ret = set_node_min(c, cur, expected_start);
+               if (ret)
+                       return ret;
        }
+fsck_err:
+       return ret;
+}
 
-       if (fsck_err_on(is_last &&
-                       bpos_cmp(cur.k->k.p, node_end), c,
+static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
+                                struct btree *child)
+{
+       char buf1[200], buf2[200];
+       int ret = 0;
+
+       if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c,
                        "btree node with incorrect max_key at btree %s level %u:\n"
                        "  %s\n"
                        "  expected %s",
                        bch2_btree_ids[b->c.btree_id], b->c.level,
-                       (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
-                       (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)))
-               update_max = true;
+                       (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1),
+                       (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) {
+               ret = set_node_max(c, child, b->key.k.p);
+               if (ret)
+                       return ret;
+       }
+fsck_err:
+       return ret;
+}
 
-       bch2_bkey_buf_copy(prev, c, cur.k);
+#define DROP_THIS_NODE         10
 
-       if (update_min || update_max) {
-               struct bkey_i *new;
-               struct bkey_i_btree_ptr_v2 *bp = NULL;
-               struct btree *n;
+static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
+{
+       struct btree_and_journal_iter iter;
+       struct bkey_s_c k;
+       struct bkey_buf tmp;
+       struct btree *prev = NULL, *cur = NULL;
+       bool have_child, dropped_children = false;
+       char buf[200];
+       int ret = 0;
 
-               if (update_max) {
+       if (!b->c.level)
+               return 0;
+again:
+       have_child = dropped_children = false;
+       bch2_bkey_buf_init(&tmp);
+       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+       while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+               BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
+               BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
+
+               bch2_btree_and_journal_iter_advance(&iter);
+               bch2_bkey_buf_reassemble(&tmp, c, k);
+
+               cur = bch2_btree_node_get_noiter(c, tmp.k,
+                                       b->c.btree_id, b->c.level - 1,
+                                       false);
+               ret = PTR_ERR_OR_ZERO(cur);
+
+               if (mustfix_fsck_err_on(ret == -EIO, c,
+                               "Unreadable btree node at btree %s level %u:\n"
+                               "  %s",
+                               bch2_btree_ids[b->c.btree_id],
+                               b->c.level - 1,
+                               (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(tmp.k)), buf))) {
                        ret = bch2_journal_key_delete(c, b->c.btree_id,
-                                                     b->c.level, cur.k->k.p);
+                                                     b->c.level, tmp.k->k.p);
                        if (ret)
-                               return ret;
+                               goto err;
+                       continue;
                }
 
-               new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL);
-               if (!new) {
-                       bch_err(c, "%s: error allocating new key", __func__);
-                       return -ENOMEM;
+               if (ret) {
+                       bch_err(c, "%s: error %i getting btree node",
+                               __func__, ret);
+                       break;
                }
 
-               bkey_copy(new, cur.k);
+               ret = btree_repair_node_start(c, b, prev, cur);
+               if (prev)
+                       six_unlock_read(&prev->c.lock);
+               prev = cur;
+               cur = NULL;
+
+               if (ret)
+                       break;
+       }
+
+       if (!ret && !IS_ERR_OR_NULL(prev)) {
+               BUG_ON(cur);
+               ret = btree_repair_node_end(c, b, prev);
+       }
+
+       if (!IS_ERR_OR_NULL(prev))
+               six_unlock_read(&prev->c.lock);
+       prev = NULL;
+       if (!IS_ERR_OR_NULL(cur))
+               six_unlock_read(&cur->c.lock);
+       cur = NULL;
 
-               if (new->k.type == KEY_TYPE_btree_ptr_v2)
-                       bp = bkey_i_to_btree_ptr_v2(new);
+       if (ret)
+               goto err;
+
+       bch2_btree_and_journal_iter_exit(&iter);
+       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+       while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+               bch2_bkey_buf_reassemble(&tmp, c, k);
+               bch2_btree_and_journal_iter_advance(&iter);
 
-               if (update_min)
-                       bp->v.min_key = expected_start;
-               if (update_max)
-                       new->k.p = node_end;
-               if (bp)
-                       SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true);
+               cur = bch2_btree_node_get_noiter(c, tmp.k,
+                                       b->c.btree_id, b->c.level - 1,
+                                       false);
+               ret = PTR_ERR_OR_ZERO(cur);
 
-               ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new);
                if (ret) {
-                       kfree(new);
-                       return ret;
+                       bch_err(c, "%s: error %i getting btree node",
+                               __func__, ret);
+                       goto err;
                }
 
-               n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id,
-                                              b->c.level - 1, true);
-               if (n) {
-                       mutex_lock(&c->btree_cache.lock);
-                       bch2_btree_node_hash_remove(&c->btree_cache, n);
-
-                       bkey_copy(&n->key, new);
-                       if (update_min)
-                               n->data->min_key = expected_start;
-                       if (update_max)
-                               n->data->max_key = node_end;
-
-                       ret = __bch2_btree_node_hash_insert(&c->btree_cache, n);
-                       BUG_ON(ret);
-                       mutex_unlock(&c->btree_cache.lock);
-                       six_unlock_read(&n->c.lock);
+               ret = bch2_btree_repair_topology_recurse(c, cur);
+               six_unlock_read(&cur->c.lock);
+               cur = NULL;
+
+               if (ret == DROP_THIS_NODE) {
+                       ret = bch2_journal_key_delete(c, b->c.btree_id,
+                                                     b->c.level, tmp.k->k.p);
+                       dropped_children = true;
                }
+
+               if (ret)
+                       goto err;
+
+               have_child = true;
        }
+
+       if (mustfix_fsck_err_on(!have_child, c,
+                       "empty interior btree node at btree %s level %u\n"
+                       "  %s",
+                       bch2_btree_ids[b->c.btree_id],
+                       b->c.level,
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf)))
+               ret = DROP_THIS_NODE;
+err:
 fsck_err:
+       if (!IS_ERR_OR_NULL(prev))
+               six_unlock_read(&prev->c.lock);
+       if (!IS_ERR_OR_NULL(cur))
+               six_unlock_read(&cur->c.lock);
+
+       bch2_btree_and_journal_iter_exit(&iter);
+       bch2_bkey_buf_exit(&tmp, c);
+
+       if (!ret && dropped_children)
+               goto again;
+
+       return ret;
+}
+
+static int bch2_repair_topology(struct bch_fs *c)
+{
+       struct btree *b;
+       unsigned i;
+       int ret = 0;
+
+       for (i = 0; i < BTREE_ID_NR && !ret; i++) {
+               b = c->btree_roots[i].b;
+               if (btree_node_fake(b))
+                       continue;
+
+               six_lock_read(&b->c.lock, NULL, NULL);
+               ret = bch2_btree_repair_topology_recurse(c, b);
+               six_unlock_read(&b->c.lock);
+
+               if (ret == DROP_THIS_NODE) {
+                       bch_err(c, "empty btree root - repair unimplemented");
+                       ret = FSCK_ERR_EXIT;
+               }
+       }
+
        return ret;
 }
 
@@ -483,6 +739,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
        struct bkey_s_c k;
        struct bkey_buf cur, prev;
        u8 max_stale = 0;
+       char buf[200];
        int ret = 0;
 
        bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
@@ -498,7 +755,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                                       &k, &max_stale, true);
                if (ret) {
                        bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
-                       break;
+                       goto fsck_err;
                }
 
                if (b->c.level) {
@@ -511,7 +768,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                                        &prev, cur,
                                        !bch2_btree_and_journal_iter_peek(&iter).k);
                        if (ret)
-                               break;
+                               goto fsck_err;
                } else {
                        bch2_btree_and_journal_iter_advance(&iter);
                }
@@ -532,18 +789,25 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                                                false);
                        ret = PTR_ERR_OR_ZERO(child);
 
-                       if (fsck_err_on(ret == -EIO, c,
-                                       "unreadable btree node")) {
-                               ret = bch2_journal_key_delete(c, b->c.btree_id,
-                                                             b->c.level, cur.k->k.p);
-                               if (ret)
-                                       return ret;
-
-                               set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
-                               continue;
-                       }
-
-                       if (ret) {
+                       if (ret == -EIO) {
+                               bch2_topology_error(c);
+
+                               if (fsck_err(c, "Unreadable btree node at btree %s level %u:\n"
+                                       "  %s",
+                                       bch2_btree_ids[b->c.btree_id],
+                                       b->c.level - 1,
+                                       (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) {
+                                       ret = FSCK_ERR_START_TOPOLOGY_REPAIR;
+                                       bch_info(c, "Halting mark and sweep to start topology repair pass");
+                                       goto fsck_err;
+                               } else {
+                                       /* Continue marking when opted to not
+                                        * fix the error: */
+                                       ret = 0;
+                                       set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+                                       continue;
+                               }
+                       } else if (ret) {
                                bch_err(c, "%s: error %i getting btree node",
                                        __func__, ret);
                                break;
@@ -583,16 +847,20 @@ static int bch2_gc_btree_init(struct bch_fs *c,
                return 0;
 
        six_lock_read(&b->c.lock, NULL, NULL);
-       if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
+       if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
                        "btree root with incorrect min_key: %s",
                        (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) {
-               BUG();
+               bch_err(c, "repair unimplemented");
+               ret = FSCK_ERR_EXIT;
+               goto fsck_err;
        }
 
-       if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c,
+       if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c,
                        "btree root with incorrect max_key: %s",
                        (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
-               BUG();
+               bch_err(c, "repair unimplemented");
+               ret = FSCK_ERR_EXIT;
+               goto fsck_err;
        }
 
        if (b->c.level >= target_depth)
@@ -607,7 +875,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
 fsck_err:
        six_unlock_read(&b->c.lock);
 
-       if (ret)
+       if (ret < 0)
                bch_err(c, "%s: ret %i", __func__, ret);
        return ret;
 }
@@ -622,23 +890,20 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
 {
        enum btree_id ids[BTREE_ID_NR];
        unsigned i;
+       int ret = 0;
 
        for (i = 0; i < BTREE_ID_NR; i++)
                ids[i] = i;
        bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
 
-       for (i = 0; i < BTREE_ID_NR; i++) {
-               enum btree_id id = ids[i];
-               int ret = initial
-                       ? bch2_gc_btree_init(c, id, metadata_only)
-                       : bch2_gc_btree(c, id, initial, metadata_only);
-               if (ret) {
-                       bch_err(c, "%s: ret %i", __func__, ret);
-                       return ret;
-               }
-       }
+       for (i = 0; i < BTREE_ID_NR && !ret; i++)
+               ret = initial
+                       ? bch2_gc_btree_init(c, ids[i], metadata_only)
+                       : bch2_gc_btree(c, ids[i], initial, metadata_only);
 
-       return 0;
+       if (ret < 0)
+               bch_err(c, "%s: ret %i", __func__, ret);
+       return ret;
 }
 
 static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
@@ -1025,7 +1290,27 @@ again:
 
        bch2_mark_superblocks(c);
 
+       if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags) &&
+           !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) &&
+           c->opts.fix_errors != FSCK_OPT_NO) {
+               bch_info(c, "starting topology repair pass");
+               ret = bch2_repair_topology(c);
+               if (ret)
+                       goto out;
+               bch_info(c, "topology repair pass done");
+       }
+
        ret = bch2_gc_btrees(c, initial, metadata_only);
+
+       if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR &&
+           !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
+               set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+               ret = 0;
+       }
+
+       if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR)
+               ret = FSCK_ERR_EXIT;
+
        if (ret)
                goto out;
 
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index cea151a5d4f807037f9e71ca94367e1b64b03ca3..389524ce1fb6525a76a633f9c2341ba461336075 100644
@@ -558,6 +558,55 @@ out:                                                                       \
 
 #define btree_err_on(cond, ...)        ((cond) ? btree_err(__VA_ARGS__) : false)
 
+/*
+ * When btree topology repair changes the start or end of a node, that might
+ * mean we have to drop keys that are no longer inside the node:
+ */
+__cold
+void bch2_btree_node_drop_keys_outside_node(struct btree *b)
+{
+       struct bset_tree *t;
+       struct bkey_s_c k;
+       struct bkey unpacked;
+       struct btree_node_iter iter;
+
+       for_each_bset(b, t) {
+               struct bset *i = bset(b, t);
+               struct bkey_packed *k;
+
+               for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
+                       if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
+                               break;
+
+               if (k != i->start) {
+                       unsigned shift = (u64 *) k - (u64 *) i->start;
+
+                       memmove_u64s_down(i->start, k,
+                                         (u64 *) vstruct_end(i) - (u64 *) k);
+                       i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
+                       set_btree_bset_end(b, t);
+                       bch2_bset_set_no_aux_tree(b, t);
+               }
+
+               for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
+                       if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
+                               break;
+
+               if (k != vstruct_last(i)) {
+                       i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
+                       set_btree_bset_end(b, t);
+                       bch2_bset_set_no_aux_tree(b, t);
+               }
+       }
+
+       bch2_btree_build_aux_trees(b);
+
+       for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
+               BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
+               BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
+       }
+}
+
 static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                         struct btree *b, struct bset *i,
                         unsigned sectors, int write, bool have_retry)
@@ -680,6 +729,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
 {
        unsigned version = le16_to_cpu(i->version);
        struct bkey_packed *k, *prev = NULL;
+       bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+               BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
        int ret = 0;
 
        for (k = i->start;
@@ -713,7 +764,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                u = __bkey_disassemble(b, k, &tmp);
 
                invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?:
-                       bch2_bkey_in_btree_node(b, u.s_c) ?:
+                       (!updated_range ?  bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?:
                        (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL);
                if (invalid) {
                        char buf[160];
@@ -770,6 +821,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        struct bch_extent_ptr *ptr;
        struct bset *i;
        bool used_mempool, blacklisted;
+       bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+               BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
        unsigned u64s;
        int ret, retry_read = 0, write = READ;
 
@@ -917,6 +970,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
        btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
 
+       if (updated_range)
+               bch2_btree_node_drop_keys_outside_node(b);
+
        i = &b->data->keys;
        for (k = i->start; k != vstruct_last(i);) {
                struct bkey tmp;
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index c8a8b05a19b0f1b1bc74fe254e28facd6a98b4a6..cadcf7f886d73759167ce8f177e0e55723ebf9a5 100644
@@ -131,6 +131,8 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse
 
 void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
 
+void bch2_btree_node_drop_keys_outside_node(struct btree *);
+
 void bch2_btree_build_aux_trees(struct btree *);
 void bch2_btree_init_next(struct bch_fs *, struct btree *,
                         struct btree_iter *);
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 113e81e8730c1b5a5863e05c682e801d294f0b5b..ea522b4583fd0cfa7c9d4f83e1be7e4083821b20 100644
@@ -1609,11 +1609,12 @@ retry:
 
                bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key);
                bch2_bpos_to_text(&PBUF(buf2), next->data->min_key);
-               bch2_fs_inconsistent(c,
-                                    "btree topology error in btree merge:\n"
-                                    "prev ends at   %s\n"
-                                    "next starts at %s\n",
-                                    buf1, buf2);
+               bch_err(c,
+                       "btree topology error in btree merge:\n"
+                       "  prev ends at   %s\n"
+                       "  next starts at %s",
+                       buf1, buf2);
+               bch2_topology_error(c);
                ret = -EIO;
                goto err;
        }
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index a8ee1db8aa3917851dfdd011e9d00e63bf8a84bd..90c3b986c264020253a2938941c6c9e6b1ae9bdf 100644
@@ -25,6 +25,13 @@ bool bch2_inconsistent_error(struct bch_fs *c)
        }
 }
 
+void bch2_topology_error(struct bch_fs *c)
+{
+       set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
+       if (test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
+               bch2_inconsistent_error(c);
+}
+
 void bch2_fatal_error(struct bch_fs *c)
 {
        if (bch2_fs_emergency_read_only(c))
@@ -74,9 +81,13 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
                vprintk(fmt, args);
                va_end(args);
 
-               return bch2_inconsistent_error(c)
-                       ? FSCK_ERR_EXIT
-                       : FSCK_ERR_FIX;
+               if (c->opts.errors == BCH_ON_ERROR_continue) {
+                       bch_err(c, "fixing");
+                       return FSCK_ERR_FIX;
+               } else {
+                       bch2_inconsistent_error(c);
+                       return FSCK_ERR_EXIT;
+               }
        }
 
        mutex_lock(&c->fsck_error_lock);
@@ -146,6 +157,7 @@ print:
                set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
                return FSCK_ERR_FIX;
        } else {
+               set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
                set_bit(BCH_FS_ERROR, &c->flags);
                return c->opts.fix_errors == FSCK_OPT_EXIT ||
                        !(flags & FSCK_CAN_IGNORE)
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 0e49fd728e440cb5be02bf1da3e399fa52e3e9f0..d8cd19b3f63c83c73b675b54ed767607eb638cf9 100644
@@ -29,6 +29,8 @@ struct work_struct;
 
 bool bch2_inconsistent_error(struct bch_fs *);
 
+void bch2_topology_error(struct bch_fs *);
+
 #define bch2_fs_inconsistent(c, ...)                                   \
 ({                                                                     \
        bch_err(c, __VA_ARGS__);                                        \
@@ -88,6 +90,7 @@ enum fsck_err_ret {
        FSCK_ERR_IGNORE = 0,
        FSCK_ERR_FIX    = 1,
        FSCK_ERR_EXIT   = 2,
+       FSCK_ERR_START_TOPOLOGY_REPAIR = 3,
 };
 
 struct fsck_err_state {
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index fe6886e42216b0ce3de98900d38fe6fa625f8ff4..a9ccd14effe7297fc61cf2dc9217719bdc342d90 100644
@@ -1241,8 +1241,9 @@ use_clean:
 
        if (c->opts.fsck &&
            !test_bit(BCH_FS_ERROR, &c->flags) &&
-           BCH_SB_HAS_ERRORS(c->disk_sb.sb)) {
+           !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
                SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
+               SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
                write_sb = true;
        }
 
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index bf36a5743607ac2842be3e6072e3002e0198faab..e0de6f0c0cb48ddf4fb271461c2ec76188957f6f 100644
@@ -439,6 +439,11 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
 
        __copy_super(&c->disk_sb, src);
 
+       if (BCH_SB_HAS_ERRORS(c->disk_sb.sb))
+               set_bit(BCH_FS_ERROR, &c->flags);
+       if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))
+               set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
+
        ret = bch2_sb_replicas_to_cpu_replicas(c);
        if (ret)
                return ret;
@@ -715,6 +720,8 @@ int bch2_write_super(struct bch_fs *c)
 
        if (test_bit(BCH_FS_ERROR, &c->flags))
                SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
+       if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
+               SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
 
        SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
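
Taken together with the error.c and recovery.c hunks above, the new
superblock flag is deliberately sticky: set whenever a topology error
is seen, persisted across reboots, and cleared only by a fully clean
fsck. A toy model of the round-trip (plain C, not the kernel code;
field names stand in for the real flags):

    #include <stdbool.h>

    struct sb { bool has_topology_errors; };    /* BCH_SB_HAS_TOPOLOGY_ERRORS */
    struct fs {
            bool topology_error;                /* BCH_FS_TOPOLOGY_ERROR   */
            bool errors_not_fixed;              /* BCH_FS_ERRORS_NOT_FIXED */
            struct sb sb;
    };

    /* error.c: mark the filesystem as having topology damage */
    static void topology_error(struct fs *c)
    {
            c->topology_error = true;
    }

    /* super-io.c: the flag round-trips through the superblock */
    static void write_super(struct fs *c)
    {
            if (c->topology_error)
                    c->sb.has_topology_errors = true;
    }

    static void sb_to_fs(struct fs *c)
    {
            if (c->sb.has_topology_errors)
                    c->topology_error = true;
    }

    /* recovery.c: only a fsck that fixed every error clears the flag */
    static void fsck_done(struct fs *c)
    {
            if (!c->errors_not_fixed)
                    c->sb.has_topology_errors = false;
    }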
 
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 64bc5ed33203e658f7b424bc69be29f76ab394ef..78db2c0a5f5a01c8e19d2382b17d431a4571605d 100644
@@ -388,6 +388,11 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
        unsigned i;
        int ret;
 
+       if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) {
+               bch_err(c, "cannot go rw, unfixed btree errors");
+               return -EROFS;
+       }
+
        if (test_bit(BCH_FS_RW, &c->flags))
                return 0;