bcachefs: trans_commit() path can now insert to interior nodes
authorKent Overstreet <kent.overstreet@gmail.com>
Mon, 16 Mar 2020 03:29:43 +0000 (23:29 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:37 +0000 (17:08 -0400)
This will be needed by the upcoming patches that journal updates to
interior btree nodes.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/bkey_methods.c
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_cache.h
fs/bcachefs/btree_gc.c
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/recovery.c
fs/bcachefs/recovery.h

index c064cf468a9b9a2d0570c95ce210e0f3afed75b5..0aa3d3b9a281c7fad15df4914b7f007861e3cad0 100644 (file)
@@ -134,7 +134,7 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 
 const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
 {
-       if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
+       if (bkey_cmp(k.k->p, b->data->min_key) < 0)
                return "key before start of btree node";
 
        if (bkey_cmp(k.k->p, b->data->max_key) > 0)
index cb843a362cb4be8aa3ac8b80ad874b22be545972..0711bde8d68c0f05e2685f2d2cc0d0a2f313bb41 100644 (file)
@@ -589,6 +589,7 @@ err:
 static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
                                struct btree_iter *iter,
                                const struct bkey_i *k,
+                               enum btree_id btree_id,
                                unsigned level,
                                enum six_lock_type lock_type,
                                bool sync)
@@ -601,7 +602,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
         * Parent node must be locked, else we could read in a btree node that's
         * been freed:
         */
-       if (!bch2_btree_node_relock(iter, level + 1))
+       if (iter && !bch2_btree_node_relock(iter, level + 1))
                return ERR_PTR(-EINTR);
 
        b = bch2_btree_node_mem_alloc(c);
@@ -609,7 +610,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
                return b;
 
        bkey_copy(&b->key, k);
-       if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
+       if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
                /* raced with another fill: */
 
                /* mark as unhashed... */
@@ -629,7 +630,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
         *
         * XXX: ideally should be dropping all btree node locks here
         */
-       if (btree_node_read_locked(iter, level + 1))
+       if (iter && btree_node_read_locked(iter, level + 1))
                btree_node_unlock(iter, level + 1);
 
        bch2_btree_node_read(c, b, sync);
@@ -677,7 +678,8 @@ retry:
                 * else we could read in a btree node from disk that's been
                 * freed:
                 */
-               b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
+               b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
+                                        level, lock_type, true);
 
                /* We raced and found the btree node in the cache */
                if (!b)
@@ -763,6 +765,74 @@ lock_node:
        return b;
 }
 
+struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
+                                        const struct bkey_i *k,
+                                        enum btree_id btree_id,
+                                        unsigned level)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       struct btree *b;
+       struct bset_tree *t;
+
+       EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+       b = btree_node_mem_ptr(k);
+       if (b)
+               goto lock_node;
+retry:
+       b = btree_cache_find(bc, k);
+       if (unlikely(!b)) {
+               b = bch2_btree_node_fill(c, NULL, k, btree_id,
+                                        level, SIX_LOCK_read, true);
+
+               /* We raced and found the btree node in the cache */
+               if (!b)
+                       goto retry;
+
+               if (IS_ERR(b))
+                       return b;
+       } else {
+lock_node:
+               six_lock_read(&b->c.lock, NULL, NULL);
+
+               if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+                            b->c.btree_id != btree_id ||
+                            b->c.level != level)) {
+                       six_unlock_read(&b->c.lock);
+                       goto retry;
+               }
+       }
+
+       /* XXX: waiting on IO with btree locks held: */
+       wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+                      TASK_UNINTERRUPTIBLE);
+
+       prefetch(b->aux_data);
+
+       for_each_bset(b, t) {
+               void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+               prefetch(p + L1_CACHE_BYTES * 0);
+               prefetch(p + L1_CACHE_BYTES * 1);
+               prefetch(p + L1_CACHE_BYTES * 2);
+       }
+
+       /* avoid atomic set bit if it's not needed: */
+       if (!btree_node_accessed(b))
+               set_btree_node_accessed(b);
+
+       if (unlikely(btree_node_read_error(b))) {
+               six_unlock_read(&b->c.lock);
+               return ERR_PTR(-EIO);
+       }
+
+       EBUG_ON(b->c.btree_id != btree_id ||
+               BTREE_NODE_LEVEL(b->data) != level ||
+               bkey_cmp(b->data->max_key, k->k.p));
+
+       return b;
+}
+
 struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
                                          struct btree_iter *iter,
                                          struct btree *b,
@@ -877,7 +947,8 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
        if (b)
                return;
 
-       bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
+       bch2_btree_node_fill(c, iter, k, iter->btree_id,
+                            level, SIX_LOCK_read, false);
 }
 
 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
index 5d85987457bf72fcf5e9d9789f134937a7b4e39e..abde6c2658c692c0d62a0c5722010af1d201888c 100644 (file)
@@ -25,6 +25,9 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
                                  const struct bkey_i *, unsigned,
                                  enum six_lock_type);
 
+struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
+                                        enum btree_id, unsigned);
+
 struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
                                struct btree *, enum btree_node_sibling);
 
index f85fbc057fb31fd25365b08fcd8bb5f7f8479723..ee5eafdb1222c87e866914232a28ab27a553b6b0 100644 (file)
@@ -186,16 +186,8 @@ fsck_err:
        return ret;
 }
 
-static bool pos_in_journal_keys(struct journal_keys *journal_keys,
-                               enum btree_id id, struct bpos pos)
-{
-       struct journal_key *k = journal_key_search(journal_keys, id, pos);
-
-       return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos);
-}
-
 static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
-                             struct journal_keys *journal_keys, bool initial)
+                             bool initial)
 {
        struct btree_node_iter iter;
        struct bkey unpacked;
@@ -209,10 +201,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
 
        for_each_btree_node_key_unpack(b, k, &iter,
                                       &unpacked) {
-               if (!b->c.level && journal_keys &&
-                   pos_in_journal_keys(journal_keys, b->c.btree_id, k.k->p))
-                       continue;
-
                bch2_bkey_debugcheck(c, b, k);
 
                ret = bch2_gc_mark_key(c, k, max_stale, initial);
@@ -224,7 +212,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
 }
 
 static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
-                        struct journal_keys *journal_keys,
                         bool initial, bool metadata_only)
 {
        struct btree_trans trans;
@@ -252,8 +239,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 
                gc_pos_set(c, gc_pos_btree_node(b));
 
-               ret = btree_gc_mark_node(c, b, &max_stale,
-                                        journal_keys, initial);
+               ret = btree_gc_mark_node(c, b, &max_stale, initial);
                if (ret)
                        break;
 
@@ -289,6 +275,78 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
        return ret;
 }
 
+static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
+                                        struct journal_keys *journal_keys,
+                                        unsigned target_depth)
+{
+       struct btree_and_journal_iter iter;
+       struct bkey_s_c k;
+       u8 max_stale = 0;
+       int ret = 0;
+
+       bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+
+       while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+               bch2_bkey_debugcheck(c, b, k);
+
+               ret = bch2_gc_mark_key(c, k, &max_stale, true);
+               if (ret)
+                       break;
+
+               if (b->c.level > target_depth) {
+                       struct btree *child;
+                       BKEY_PADDED(k) tmp;
+
+                       bkey_reassemble(&tmp.k, k);
+
+                       child = bch2_btree_node_get_noiter(c, &tmp.k,
+                                               b->c.btree_id, b->c.level - 1);
+                       ret = PTR_ERR_OR_ZERO(child);
+                       if (ret)
+                               break;
+
+                       bch2_gc_btree_init_recurse(c, child,
+                                       journal_keys, target_depth);
+                       six_unlock_read(&child->c.lock);
+               }
+
+               bch2_btree_and_journal_iter_advance(&iter);
+       }
+
+       return ret;
+}
+
+static int bch2_gc_btree_init(struct bch_fs *c,
+                             struct journal_keys *journal_keys,
+                             enum btree_id btree_id,
+                             bool metadata_only)
+{
+       struct btree *b;
+       unsigned target_depth = metadata_only           ? 1
+               : expensive_debug_checks(c)             ? 0
+               : !btree_node_type_needs_gc(btree_id)   ? 1
+               : 0;
+       u8 max_stale = 0;
+       int ret = 0;
+
+       b = c->btree_roots[btree_id].b;
+
+       if (btree_node_fake(b))
+               return 0;
+
+       six_lock_read(&b->c.lock, NULL, NULL);
+       if (b->c.level >= target_depth)
+               ret = bch2_gc_btree_init_recurse(c, b,
+                                       journal_keys, target_depth);
+
+       if (!ret)
+               ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+                                      &max_stale, true);
+       six_unlock_read(&b->c.lock);
+
+       return ret;
+}
+
 static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
 {
        return  (int) btree_id_to_gc_phase(l) -
@@ -307,27 +365,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
 
        for (i = 0; i < BTREE_ID_NR; i++) {
                enum btree_id id = ids[i];
-               enum btree_node_type type = __btree_node_type(0, id);
-
-               int ret = bch2_gc_btree(c, id, journal_keys,
-                                       initial, metadata_only);
+               int ret = initial
+                       ? bch2_gc_btree_init(c, journal_keys,
+                                            id, metadata_only)
+                       : bch2_gc_btree(c, id, initial, metadata_only);
                if (ret)
                        return ret;
-
-               if (journal_keys && !metadata_only &&
-                   btree_node_type_needs_gc(type)) {
-                       struct journal_key *j;
-                       u8 max_stale;
-                       int ret;
-
-                       for_each_journal_key(*journal_keys, j)
-                               if (j->btree_id == id) {
-                                       ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k),
-                                                              &max_stale, initial);
-                                       if (ret)
-                                               return ret;
-                               }
-               }
        }
 
        return 0;
index fdfa7a265850d6ccb6aacdebe3cf41cdce548789..885cc9500f36b85f1dd0bc8753e9c7853d9805ce 100644 (file)
@@ -264,6 +264,11 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
        return iter->flags & BTREE_ITER_TYPE;
 }
 
+static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
+{
+       return iter->l + iter->level;
+}
+
 struct btree_insert_entry {
        unsigned                trigger_flags;
        unsigned                trans_triggers_run:1;
index c1a4d6559d01dcab26e9dec325fe9c8bc05a8b2c..fa9c7f5e0bb92bd77dde05163f89768116f2d3fb 100644 (file)
@@ -1630,7 +1630,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
                          unsigned flags)
 {
        struct btree_trans *trans = iter->trans;
-       struct btree *b = iter->l[0].b;
+       struct btree *b = iter_l(iter)->b;
        struct btree_update *as;
        struct closure cl;
        int ret = 0;
index a5362ecb4f5ddbf8a0e6a189bea4c12fcd7d54a3..a8487f8275b682524689b839abc7ace0c73eb933 100644 (file)
@@ -24,7 +24,7 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans,
                                     struct btree_insert_entry *i)
 {
        return i != trans->updates2 &&
-               i[0].iter->l[0].b == i[-1].iter->l[0].b;
+               iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b;
 }
 
 inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
@@ -172,13 +172,12 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct journal *j = &c->journal;
-       struct btree *b = iter->l[0].b;
+       struct btree *b = iter_l(iter)->b;
        struct btree_write *w = btree_current_write(b);
        u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
                ? trans->journal_res.seq
                : j->replay_journal_seq;
 
-       EBUG_ON(iter->level || b->c.level);
        EBUG_ON(trans->journal_res.ref !=
                !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
 
@@ -205,17 +204,15 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
                                  struct bkey_i *insert)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b = iter->l[0].b;
+       struct btree *b = iter_l(iter)->b;
        struct bset_tree *t = bset_tree_last(b);
        int old_u64s = bset_u64s(t);
        int old_live_u64s = b->nr.live_u64s;
        int live_u64s_added, u64s_added;
 
-       EBUG_ON(iter->level);
-
        insert->k.needs_whiteout = false;
 
-       if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert)))
+       if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert)))
                bch2_btree_journal_key(trans, iter, insert);
 
        live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
@@ -241,7 +238,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
 
-       BUG_ON(iter->level);
        BUG_ON(bkey_cmp(insert->k.p, iter->pos));
        BUG_ON(debug_check_bkeys(c) &&
               bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
@@ -290,7 +286,7 @@ btree_key_can_insert(struct btree_trans *trans,
                     unsigned *u64s)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b = iter->l[0].b;
+       struct btree *b = iter_l(iter)->b;
        static enum btree_insert_ret ret;
 
        if (unlikely(btree_node_fake(b)))
@@ -345,7 +341,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
        struct btree_insert_entry *i;
 
        trans_for_each_update(trans, i)
-               if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
+               if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b)))
                        bch2_mark_update(trans, i->iter, i->k, NULL,
                                         i->trigger_flags|BTREE_TRIGGER_GC);
 }
@@ -461,7 +457,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
        int ret;
 
        trans_for_each_update2(trans, i)
-               BUG_ON(!btree_node_intent_locked(i->iter, 0));
+               BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level));
 
        ret = bch2_journal_preres_get(&trans->c->journal,
                        &trans->journal_preres, trans->journal_preres_u64s,
@@ -495,13 +491,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
        trans_for_each_update2(trans, i)
                if (!same_leaf_as_prev(trans, i))
                        bch2_btree_node_lock_for_insert(trans->c,
-                                               i->iter->l[0].b, i->iter);
+                                       iter_l(i->iter)->b, i->iter);
 
        ret = bch2_trans_commit_write_locked(trans, stopped_at);
 
        trans_for_each_update2(trans, i)
                if (!same_leaf_as_prev(trans, i))
-                       bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
+                       bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
                                                             i->iter);
 
        /*
index 02b381cb567b38e484d4c6c6a3256371e576842e..0d4abaa3ba10b2153646672b109e027a4f393452 100644 (file)
 
 /* iterate over keys read from the journal: */
 
-struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+static struct journal_key *journal_key_search(struct journal_keys *journal_keys,
+                                             enum btree_id id, unsigned level,
+                                             struct bpos pos)
 {
-       while (iter->k) {
-               if (iter->k->btree_id == iter->btree_id)
-                       return bkey_i_to_s_c(iter->k->k);
+       size_t l = 0, r = journal_keys->nr, m;
 
-               iter->k++;
-               if (iter->k == iter->keys->d + iter->keys->nr)
-                       iter->k = NULL;
+       while (l < r) {
+               m = l + ((r - l) >> 1);
+               if ((cmp_int(id,        journal_keys->d[m].btree_id) ?:
+                    cmp_int(level,     journal_keys->d[m].level) ?:
+                    bkey_cmp(pos,      journal_keys->d[m].k->k.p)) > 0)
+                       l = m + 1;
+               else
+                       r = m;
        }
 
-       return bkey_s_c_null;
+       BUG_ON(l < journal_keys->nr &&
+              (cmp_int(id,     journal_keys->d[l].btree_id) ?:
+               cmp_int(level,  journal_keys->d[l].level) ?:
+               bkey_cmp(pos,   journal_keys->d[l].k->k.p)) > 0);
+
+       BUG_ON(l &&
+              (cmp_int(id,     journal_keys->d[l - 1].btree_id) ?:
+               cmp_int(level,  journal_keys->d[l - 1].level) ?:
+               bkey_cmp(pos,   journal_keys->d[l - 1].k->k.p)) <= 0);
+
+       return l < journal_keys->nr ? journal_keys->d + l : NULL;
+}
+
+static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
+{
+       if (iter->k &&
+           iter->k < iter->keys->d + iter->keys->nr &&
+           iter->k->btree_id   == iter->btree_id &&
+           iter->k->level      == iter->level)
+               return iter->k->k;
+
+       iter->k = NULL;
+       return NULL;
+}
+
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+       if (iter->k)
+               iter->k++;
 }
 
-struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter)
+static void bch2_journal_iter_init(struct journal_iter *iter,
+                                  struct journal_keys *journal_keys,
+                                  enum btree_id id, unsigned level,
+                                  struct bpos pos)
 {
-       if (!iter->k)
-               return bkey_s_c_null;
+       iter->btree_id  = id;
+       iter->level     = level;
+       iter->keys      = journal_keys;
+       iter->k         = journal_key_search(journal_keys, id, level, pos);
+}
 
-       iter->k++;
-       if (iter->k == iter->keys->d + iter->keys->nr)
-               iter->k = NULL;
+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
+{
+       return iter->btree
+               ? bch2_btree_iter_peek(iter->btree)
+               : bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+                                                  iter->b, &iter->unpacked);
+}
 
-       return bch2_journal_iter_peek(iter);
+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
+{
+       if (iter->btree)
+               bch2_btree_iter_next(iter->btree);
+       else
+               bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
 }
 
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
@@ -59,10 +107,10 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
        case none:
                break;
        case btree:
-               bch2_btree_iter_next(iter->btree);
+               bch2_journal_iter_advance_btree(iter);
                break;
        case journal:
-               bch2_journal_iter_next(&iter->journal);
+               bch2_journal_iter_advance(&iter->journal);
                break;
        }
 
@@ -74,14 +122,16 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
        struct bkey_s_c ret;
 
        while (1) {
-               struct bkey_s_c btree_k         = bch2_btree_iter_peek(iter->btree);
-               struct bkey_s_c journal_k       = bch2_journal_iter_peek(&iter->journal);
+               struct bkey_s_c btree_k         =
+                       bch2_journal_iter_peek_btree(iter);
+               struct bkey_s_c journal_k       =
+                       bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
 
                if (btree_k.k && journal_k.k) {
                        int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
 
                        if (!cmp)
-                               bch2_btree_iter_next(iter->btree);
+                               bch2_journal_iter_advance_btree(iter);
 
                        iter->last = cmp < 0 ? btree : journal;
                } else if (btree_k.k) {
@@ -94,6 +144,14 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
                }
 
                ret = iter->last == journal ? journal_k : btree_k;
+
+               if (iter->b &&
+                   bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
+                       iter->journal.k = NULL;
+                       iter->last = none;
+                       return bkey_s_c_null;
+               }
+
                if (!bkey_deleted(ret.k))
                        break;
 
@@ -110,41 +168,32 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *
        return bch2_btree_and_journal_iter_peek(iter);
 }
 
-struct journal_key *journal_key_search(struct journal_keys *journal_keys,
-                                      enum btree_id id, struct bpos pos)
-{
-       size_t l = 0, r = journal_keys->nr, m;
-
-       while (l < r) {
-               m = l + ((r - l) >> 1);
-               if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
-                    bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
-                       l = m + 1;
-               else
-                       r = m;
-       }
-
-       BUG_ON(l < journal_keys->nr &&
-              (cmp_int(id, journal_keys->d[l].btree_id) ?:
-               bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
-
-       BUG_ON(l &&
-              (cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
-               bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
-
-       return l < journal_keys->nr ? journal_keys->d + l : NULL;
-}
-
 void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
                                      struct btree_trans *trans,
                                      struct journal_keys *journal_keys,
                                      enum btree_id id, struct bpos pos)
 {
-       iter->journal.keys      = journal_keys;
-       iter->journal.k         = journal_key_search(journal_keys, id, pos);
-       iter->journal.btree_id  = id;
+       memset(iter, 0, sizeof(*iter));
 
        iter->btree = bch2_trans_get_iter(trans, id, pos, 0);
+       bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos);
+}
+
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+                                               struct journal_keys *journal_keys,
+                                               struct btree *b)
+{
+       struct bpos start = b->data->min_key;
+
+       if (btree_node_type_is_extents(b->c.btree_id))
+               start = bkey_successor(start);
+
+       memset(iter, 0, sizeof(*iter));
+
+       iter->b = b;
+       bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
+       bch2_journal_iter_init(&iter->journal, journal_keys,
+                              b->c.btree_id, b->c.level, start);
 }
 
 /* sort and dedup all keys in the journal: */
@@ -169,7 +218,8 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
        const struct journal_key *l = _l;
        const struct journal_key *r = _r;
 
-       return cmp_int(l->btree_id, r->btree_id) ?:
+       return  cmp_int(l->btree_id,    r->btree_id) ?:
+               cmp_int(l->level,       r->level) ?:
                bkey_cmp(l->k->k.p, r->k->k.p) ?:
                cmp_int(l->journal_seq, r->journal_seq) ?:
                cmp_int(l->journal_offset, r->journal_offset);
@@ -180,9 +230,10 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
        const struct journal_key *l = _l;
        const struct journal_key *r = _r;
 
-       return cmp_int(l->journal_seq, r->journal_seq) ?:
-               cmp_int(l->btree_id, r->btree_id) ?:
-               bkey_cmp(l->k->k.p, r->k->k.p);
+       return  cmp_int(l->journal_seq, r->journal_seq) ?:
+               cmp_int(l->btree_id,    r->btree_id) ?:
+               cmp_int(l->level,       r->level) ?:
+               bkey_cmp(l->k->k.p,     r->k->k.p);
 }
 
 static void journal_keys_free(struct journal_keys *keys)
@@ -218,6 +269,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
                for_each_jset_key(k, _n, entry, &p->j)
                        keys.d[keys.nr++] = (struct journal_key) {
                                .btree_id       = entry->btree_id,
+                               .level          = entry->level,
                                .k              = k,
                                .journal_seq    = le64_to_cpu(p->j.seq) -
                                        keys.journal_seq_base,
@@ -229,7 +281,8 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
        src = dst = keys.d;
        while (src < keys.d + keys.nr) {
                while (src + 1 < keys.d + keys.nr &&
-                      src[0].btree_id == src[1].btree_id &&
+                      src[0].btree_id  == src[1].btree_id &&
+                      src[0].level     == src[1].level &&
                       !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
                        src++;
 
@@ -864,7 +917,7 @@ int bch2_fs_recovery(struct bch_fs *c)
                 */
                bch_info(c, "starting metadata mark and sweep");
                err = "error in mark and sweep";
-               ret = bch2_gc(c, NULL, true, true);
+               ret = bch2_gc(c, &journal_keys, true, true);
                if (ret)
                        goto err;
                bch_verbose(c, "mark and sweep done");
index c913093015635a3dde895a9ee629389551779ea6..fa1f2818817d4b10cfd305275c554a773fa92c42 100644 (file)
@@ -5,6 +5,7 @@
 struct journal_keys {
        struct journal_key {
                enum btree_id   btree_id:8;
+               unsigned        level:8;
                struct bkey_i   *k;
                u32             journal_seq;
                u32             journal_offset;
@@ -17,15 +18,23 @@ struct journal_keys {
        for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
 
 struct journal_iter {
+       enum btree_id           btree_id;
+       unsigned                level;
        struct journal_keys     *keys;
        struct journal_key      *k;
-       enum btree_id           btree_id;
 };
 
-struct btree_and_journal_iter {
-       enum btree_id           btree_id;
+/*
+ * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ */
 
+struct btree_and_journal_iter {
        struct btree_iter       *btree;
+
+       struct btree            *b;
+       struct btree_node_iter  node_iter;
+       struct bkey             unpacked;
+
        struct journal_iter     journal;
 
        enum last_key_returned {
@@ -38,12 +47,14 @@ struct btree_and_journal_iter {
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
-struct journal_key *journal_key_search(struct journal_keys *,
-                                      enum btree_id, struct bpos);
+
 void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *,
                                      struct btree_trans *,
                                      struct journal_keys *,
                                      enum btree_id, struct bpos);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+                                               struct journal_keys *,
+                                               struct btree *);
 
 int bch2_fs_recovery(struct bch_fs *);
 int bch2_fs_initialize(struct bch_fs *);