bcachefs: Fix journal replay with unreadable btree roots
author Kent Overstreet <kent.overstreet@linux.dev>
Sat, 9 Mar 2024 00:57:22 +0000 (19:57 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 10 Mar 2024 19:18:13 +0000 (15:18 -0400)
When a btree root is unreadable, we still might be able to get some data
back by replaying what's in the journal. Previously though, we got
confused when journal replay would attempt to replay a key for a level
that didn't exist.

This adds bch2_btree_increase_depth(), so that journal replay can handle
this.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_interior.h
fs/bcachefs/recovery.c

index 3ef338df82f5e46228f583a85a7cacdba233a64b..cab2e3fa900b4634f050250fa893ada53fa80321 100644 (file)
@@ -1729,7 +1729,9 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
        if (ret)
                return ret;
 
-       btree_path_set_should_be_locked(trans->paths + iter->path);
+       struct btree_path *path = btree_iter_path(trans, iter);
+       if (btree_path_node(path, path->level))
+               btree_path_set_should_be_locked(path);
        return 0;
 }
 
index 4530b14ff2c3717ec15e92615385c04e185e28e1..7203ea8d502692b173a3bec85b789f24d946ddd8 100644 (file)
@@ -1208,10 +1208,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
        mutex_unlock(&c->btree_cache.lock);
 
        mutex_lock(&c->btree_root_lock);
-       BUG_ON(btree_node_root(c, b) &&
-              (b->c.level < btree_node_root(c, b)->c.level ||
-               !btree_node_dying(btree_node_root(c, b))));
-
        bch2_btree_id_root(c, b->c.btree_id)->b = b;
        mutex_unlock(&c->btree_root_lock);
 
@@ -1747,7 +1743,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
                          unsigned flags)
 {
        /* btree_split & merge may both cause paths array to be reallocated */
-
        struct btree *b = path_l(trans->paths + path)->b;
        struct btree_update *as;
        unsigned l;
@@ -1775,6 +1770,60 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
        return ret;
 }
 
+static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
+                                  btree_path_idx_t path_idx)
+{
+       struct bch_fs *c = as->c;
+       struct btree_path *path = trans->paths + path_idx;
+       struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
+
+       BUG_ON(!btree_node_locked(path, b->c.level));
+
+       n = __btree_root_alloc(as, trans, b->c.level + 1);
+
+       bch2_btree_update_add_new_node(as, n);
+       six_unlock_write(&n->c.lock);
+
+       path->locks_want++;
+       BUG_ON(btree_node_locked(path, n->c.level));
+       six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+       mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+       bch2_btree_path_level_init(trans, path, n);
+
+       n->sib_u64s[0] = U16_MAX;
+       n->sib_u64s[1] = U16_MAX;
+
+       bch2_keylist_add(&as->parent_keys, &b->key);
+       btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
+
+       bch2_btree_set_root(as, trans, path, n);
+       bch2_btree_update_get_open_buckets(as, n);
+       bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+       bch2_trans_node_add(trans, path, n);
+       six_unlock_intent(&n->c.lock);
+
+       mutex_lock(&c->btree_cache.lock);
+       list_add_tail(&b->list, &c->btree_cache.live);
+       mutex_unlock(&c->btree_cache.lock);
+
+       bch2_trans_verify_locks(trans);
+}
+
+int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
+{
+       struct bch_fs *c = trans->c;
+       struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
+       struct btree_update *as =
+               bch2_btree_update_start(trans, trans->paths + path,
+                                       b->c.level, true, flags);
+       if (IS_ERR(as))
+               return PTR_ERR(as);
+
+       __btree_increase_depth(as, trans, path);
+       bch2_btree_update_done(as, trans);
+       return 0;
+}
+
 int __bch2_foreground_maybe_merge(struct btree_trans *trans,
                                  btree_path_idx_t path,
                                  unsigned level,
index c593c925d1e3b03cfae5b4e7fdf0f7bc4b99df5c..3439b03719c7b505bf43e5f76bf703f6778c9899 100644 (file)
@@ -119,6 +119,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
 
 int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
 
+int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
+
 int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
                                  unsigned, unsigned, enum btree_node_sibling);
 
index 1aa21adc7ee5f48910fbb16e113109d745c9bad4..39271d2d63d17e4fc16f7c06b294c1df42b05698 100644 (file)
@@ -124,6 +124,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
        if (ret)
                goto out;
 
+       struct btree_path *path = btree_iter_path(trans, &iter);
+       if (unlikely(!btree_path_node(path, k->level))) {
+               bch2_trans_iter_exit(trans, &iter);
+               bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+                                         BTREE_MAX_DEPTH, 0, iter_flags);
+               ret =   bch2_btree_iter_traverse(&iter) ?:
+                       bch2_btree_increase_depth(trans, iter.path, 0) ?:
+                       -BCH_ERR_transaction_restart_nested;
+               goto out;
+       }
+
        /* Must be checked with btree locked: */
        if (k->overwritten)
                goto out;