bcachefs: BTREE_ITER_WITH_JOURNAL
author Kent Overstreet <kent.overstreet@gmail.com>
Sun, 26 Dec 2021 01:07:00 +0000 (20:07 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:21 +0000 (17:09 -0400)
This adds a new btree iterator flag, BTREE_ITER_WITH_JOURNAL, that is
automatically enabled when a btree iterator is initialized before journal
replay has completed - it overlays the contents of the journal on top of
the btree, so iteration sees keys that have not yet been replayed.
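
Conceptually this is a two-way merge: each peek looks at the next key
in the btree and the next non-overwritten key in the sorted
journal_keys array, and returns whichever sorts first, with journal
keys winning ties since they're newer. A minimal sketch of that merge
step, using hypothetical simplified types in place of the real bkey
machinery:

  #include <stdbool.h>
  #include <stdint.h>

  /* stand-in for struct bpos plus the overwritten flag on journal keys: */
  struct skey {
          uint64_t        pos;
          bool            overwritten;
  };

  /*
   * One step of the overlay: return whichever of the next btree key and
   * the next journal key sorts first; the journal key wins ties, since
   * it's newer than what's in the btree:
   */
  static const struct skey *overlay_peek(const struct skey *btree_k,
                                         const struct skey *journal_k)
  {
          if (journal_k && journal_k->overwritten)
                  journal_k = NULL;       /* skipped, as in bch2_journal_iter_peek() */

          if (!journal_k)
                  return btree_k;
          if (!btree_k || journal_k->pos <= btree_k->pos)
                  return journal_k;
          return btree_k;
  }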

This lets us delete bch2_btree_and_journal_walk() and use the normal
btree iterator interface instead, removing a significant amount of
duplicated code.

Note that BTREE_ITER_WITH_JOURNAL is still unoptimized in this patch -
we're redoing the binary search over keys in the journal every time we
call bch2_btree_iter_peek().
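
That search is a plain lower-bound binary search over the flat
journal_keys array, sorted by (btree_id, level, pos) - see
bch2_journal_key_search()/__journal_key_cmp() in the recovery.c hunks
below. A sketch of the lookup, continuing the simplified stand-in
types from the sketch above:

  #include <stddef.h>

  struct jkey {
          uint32_t        btree_id;
          uint32_t        level;
          uint64_t        pos;
  };

  static int jkey_cmp(uint32_t btree_id, uint32_t level, uint64_t pos,
                      const struct jkey *r)
  {
          if (btree_id != r->btree_id)
                  return btree_id < r->btree_id ? -1 : 1;
          if (level != r->level)
                  return level < r->level ? -1 : 1;
          return pos < r->pos ? -1 : pos > r->pos ? 1 : 0;
  }

  /* index of the first element >= (btree_id, level, pos): */
  static size_t jkey_search(const struct jkey *d, size_t nr,
                            uint32_t btree_id, uint32_t level, uint64_t pos)
  {
          size_t l = 0, r = nr;

          while (l < r) {
                  size_t m = l + (r - l) / 2;

                  if (jkey_cmp(btree_id, level, pos, &d[m]) > 0)
                          l = m + 1;
                  else
                          r = m;
          }
          return l;
  }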

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_background.c
fs/bcachefs/bcachefs.h
fs/bcachefs/btree_gc.c
fs/bcachefs/btree_iter.c
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/ec.c
fs/bcachefs/recovery.c
fs/bcachefs/recovery.h

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 30bf363d2ff3757fc743488880551a5427c269ca..cb4b059e796ce3b5ee2aa017cd5dfaae552ad47a 100644
@@ -340,46 +340,46 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
 #undef  x
 }
 
-static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
+int bch2_alloc_read(struct bch_fs *c)
 {
-       struct bch_fs *c = trans->c;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
        struct bch_dev *ca;
        struct bucket *g;
        struct bkey_alloc_unpacked u;
-
-       if (!bkey_is_alloc(k.k))
-               return 0;
-
-       ca = bch_dev_bkey_exists(c, k.k->p.inode);
-       g = bucket(ca, k.k->p.offset);
-       u = bch2_alloc_unpack(k);
-
-       *bucket_gen(ca, k.k->p.offset) = u.gen;
-       g->_mark.gen            = u.gen;
-       g->_mark.data_type      = u.data_type;
-       g->_mark.dirty_sectors  = u.dirty_sectors;
-       g->_mark.cached_sectors = u.cached_sectors;
-       g->_mark.stripe         = u.stripe != 0;
-       g->stripe               = u.stripe;
-       g->stripe_redundancy    = u.stripe_redundancy;
-       g->io_time[READ]        = u.read_time;
-       g->io_time[WRITE]       = u.write_time;
-       g->oldest_gen           = u.oldest_gen;
-       g->gen_valid            = 1;
-
-       return 0;
-}
-
-int bch2_alloc_read(struct bch_fs *c)
-{
-       struct btree_trans trans;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
        down_read(&c->gc_lock);
-       ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               if (!bkey_is_alloc(k.k))
+                       continue;
+
+               ca = bch_dev_bkey_exists(c, k.k->p.inode);
+               g = bucket(ca, k.k->p.offset);
+               u = bch2_alloc_unpack(k);
+
+               *bucket_gen(ca, k.k->p.offset) = u.gen;
+               g->_mark.gen            = u.gen;
+               g->_mark.data_type      = u.data_type;
+               g->_mark.dirty_sectors  = u.dirty_sectors;
+               g->_mark.cached_sectors = u.cached_sectors;
+               g->_mark.stripe         = u.stripe != 0;
+               g->stripe               = u.stripe;
+               g->stripe_redundancy    = u.stripe_redundancy;
+               g->io_time[READ]        = u.read_time;
+               g->io_time[WRITE]       = u.write_time;
+               g->oldest_gen           = u.oldest_gen;
+               g->gen_valid            = 1;
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
        up_read(&c->gc_lock);
        bch2_trans_exit(&trans);
+
        if (ret) {
                bch_err(c, "error reading alloc info: %i", ret);
                return ret;
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 9452b6cf04a5c1d6388b1205515576f1390d4882..431cf25b38dbf48a9ec41a11fb6bd5a20f9e96e4 100644
@@ -860,7 +860,6 @@ mempool_t           bio_bounce_pages;
        u64                     reflink_hint;
        reflink_gc_table        reflink_gc_table;
        size_t                  reflink_gc_nr;
-       size_t                  reflink_gc_idx;
 
        /* VFS IO PATH - fs-io.c */
        struct bio_set          writepage_bioset;
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 77c30157792befe2a1f1c34e8b94c73b89f989bb..d7de00af81c95445005c1d76d91cb17db5843869 100644
@@ -1342,59 +1342,6 @@ static int bch2_gc_start(struct bch_fs *c,
        return 0;
 }
 
-static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans,
-                                          struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct reflink_gc *r;
-       const __le64 *refcount = bkey_refcount_c(k);
-       char buf[200];
-       int ret = 0;
-
-       if (!refcount)
-               return 0;
-
-       r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
-       if (!r)
-               return -ENOMEM;
-
-       if (!r ||
-           r->offset != k.k->p.offset ||
-           r->size != k.k->size) {
-               bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
-               return -EINVAL;
-       }
-
-       if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
-                       "reflink key has wrong refcount:\n"
-                       "  %s\n"
-                       "  should be %u",
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
-                       r->refcount)) {
-               struct bkey_i *new;
-
-               new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
-               if (!new) {
-                       ret = -ENOMEM;
-                       goto fsck_err;
-               }
-
-               bkey_reassemble(new, k);
-
-               if (!r->refcount) {
-                       new->k.type = KEY_TYPE_deleted;
-                       new->k.size = 0;
-               } else {
-                       *bkey_refcount(new) = cpu_to_le64(r->refcount);
-               }
-
-               ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
-               kfree(new);
-       }
-fsck_err:
-       return ret;
-}
-
 static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
                                bool metadata_only)
 {
@@ -1411,14 +1358,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       if (initial) {
-               c->reflink_gc_idx = 0;
-
-               ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
-                               bch2_gc_reflink_done_initial_fn);
-               goto out;
-       }
-
        for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                const __le64 *refcount = bkey_refcount_c(k);
@@ -1426,7 +1365,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
                if (!refcount)
                        continue;
 
-               r = genradix_ptr(&c->reflink_gc_table, idx);
+               r = genradix_ptr(&c->reflink_gc_table, idx++);
                if (!r ||
                    r->offset != k.k->p.offset ||
                    r->size != k.k->size) {
@@ -1456,7 +1395,9 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
                        else
                                *bkey_refcount(new) = cpu_to_le64(r->refcount);
 
-                       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       ret = initial
+                              ? bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new)
+                              : __bch2_trans_do(&trans, NULL, NULL, 0,
                                        __bch2_btree_insert(&trans, BTREE_ID_reflink, new));
                        kfree(new);
 
@@ -1466,104 +1407,74 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
        }
 fsck_err:
        bch2_trans_iter_exit(&trans, &iter);
-out:
        c->reflink_gc_nr = 0;
        bch2_trans_exit(&trans);
        return ret;
 }
 
-static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans,
-                                          struct bkey_s_c k)
+static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
+                               bool metadata_only)
 {
-       struct bch_fs *c = trans->c;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
        struct gc_stripe *m;
        const struct bch_stripe *s;
        char buf[200];
        unsigned i;
        int ret = 0;
 
-       if (k.k->type != KEY_TYPE_stripe)
+       if (metadata_only)
                return 0;
 
-       s = bkey_s_c_to_stripe(k).v;
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               if (k.k->type != KEY_TYPE_stripe)
+                       continue;
 
-       m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
+               s = bkey_s_c_to_stripe(k).v;
+               m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
 
-       for (i = 0; i < s->nr_blocks; i++)
-               if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
-                       goto inconsistent;
-       return 0;
+               for (i = 0; i < s->nr_blocks; i++)
+                       if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
+                               goto inconsistent;
+               continue;
 inconsistent:
-       if (fsck_err_on(true, c,
-                       "stripe has wrong block sector count %u:\n"
-                       "  %s\n"
-                       "  should be %u", i,
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
-                       m ? m->block_sectors[i] : 0)) {
-               struct bkey_i_stripe *new;
+               if (fsck_err_on(true, c,
+                               "stripe has wrong block sector count %u:\n"
+                               "  %s\n"
+                               "  should be %u", i,
+                               (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+                               m ? m->block_sectors[i] : 0)) {
+                       struct bkey_i_stripe *new;
 
-               new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
-               if (!new) {
-                       ret = -ENOMEM;
-                       goto fsck_err;
-               }
+                       new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+                       if (!new) {
+                               ret = -ENOMEM;
+                               break;
+                       }
 
-               bkey_reassemble(&new->k_i, k);
+                       bkey_reassemble(&new->k_i, k);
 
-               for (i = 0; i < new->v.nr_blocks; i++)
-                       stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+                       for (i = 0; i < new->v.nr_blocks; i++)
+                               stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
 
-               ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i);
-               kfree(new);
+                       ret = initial
+                               ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i)
+                               : __bch2_trans_do(&trans, NULL, NULL, 0,
+                                       __bch2_btree_insert(&trans, BTREE_ID_stripes, &new->k_i));
+                       kfree(new);
+               }
        }
 fsck_err:
-       return ret;
-}
-
-static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
-                               bool metadata_only)
-{
-       struct btree_trans trans;
-       int ret = 0;
-
-       if (metadata_only)
-               return 0;
-
-       bch2_trans_init(&trans, c, 0, 0);
-
-       if (initial) {
-               ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
-                               bch2_gc_stripes_done_initial_fn);
-       } else {
-               BUG();
-       }
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
        return ret;
 }
 
-static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans,
-                                           struct bkey_s_c k)
-{
-
-       struct bch_fs *c = trans->c;
-       struct reflink_gc *r;
-       const __le64 *refcount = bkey_refcount_c(k);
-
-       if (!refcount)
-               return 0;
-
-       r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
-                              GFP_KERNEL);
-       if (!r)
-               return -ENOMEM;
-
-       r->offset       = k.k->p.offset;
-       r->size         = k.k->size;
-       r->refcount     = 0;
-       return 0;
-}
-
 static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
                                 bool metadata_only)
 {
@@ -1579,12 +1490,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
        bch2_trans_init(&trans, c, 0, 0);
        c->reflink_gc_nr = 0;
 
-       if (initial) {
-               ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
-                                               bch2_gc_reflink_start_initial_fn);
-               goto out;
-       }
-
        for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                const __le64 *refcount = bkey_refcount_c(k);
@@ -1604,7 +1509,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
                r->refcount     = 0;
        }
        bch2_trans_iter_exit(&trans, &iter);
-out:
+
        bch2_trans_exit(&trans);
        return ret;
 }
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 0b5bf75fbf89a4d86779c065df3011f3f8e91533..01c130a3ce8db3abda4035a3b53c5b6bd0f4e25b 100644
@@ -12,6 +12,7 @@
 #include "error.h"
 #include "extents.h"
 #include "journal.h"
+#include "recovery.h"
 #include "replicas.h"
 #include "subvolume.h"
 #include "trace.h"
@@ -1064,6 +1065,7 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path,
 static void btree_path_verify_new_node(struct btree_trans *trans,
                                       struct btree_path *path, struct btree *b)
 {
+       struct bch_fs *c = trans->c;
        struct btree_path_level *l;
        unsigned plevel;
        bool parent_locked;
@@ -1072,6 +1074,9 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
        if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
                return;
 
+       if (trans->journal_replay_not_finished)
+               return;
+
        plevel = b->c.level + 1;
        if (!btree_path_node(path, plevel))
                return;
@@ -1092,7 +1097,7 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
                char buf4[100];
                struct bkey uk = bkey_unpack_key(b, k);
 
-               bch2_dump_btree_node(trans->c, l->b);
+               bch2_dump_btree_node(c, l->b);
                bch2_bpos_to_text(&PBUF(buf1), path->pos);
                bch2_bkey_to_text(&PBUF(buf2), &uk);
                bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
@@ -1283,6 +1288,41 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
        return ret;
 }
 
+static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
+                                struct btree_and_journal_iter *jiter)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c k;
+       struct bkey_buf tmp;
+       unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+               ? (path->level > 1 ? 0 :  2)
+               : (path->level > 1 ? 1 : 16);
+       bool was_locked = btree_node_locked(path, path->level);
+       int ret = 0;
+
+       bch2_bkey_buf_init(&tmp);
+
+       while (nr-- && !ret) {
+               if (!bch2_btree_node_relock(trans, path, path->level))
+                       break;
+
+               bch2_btree_and_journal_iter_advance(jiter);
+               k = bch2_btree_and_journal_iter_peek(jiter);
+               if (!k.k)
+                       break;
+
+               bch2_bkey_buf_reassemble(&tmp, c, k);
+               ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+                                              path->level - 1);
+       }
+
+       if (!was_locked)
+               btree_node_unlock(path, path->level);
+
+       bch2_bkey_buf_exit(&tmp, c);
+       return ret;
+}
+
 static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
                                            struct btree_path *path,
                                            unsigned plevel, struct btree *b)
@@ -1305,6 +1345,30 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
                btree_node_unlock(path, plevel);
 }
 
+static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
+                                                    struct btree_path *path,
+                                                    unsigned flags,
+                                                    struct bkey_buf *out)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_path_level *l = path_l(path);
+       struct btree_and_journal_iter jiter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+
+       k = bch2_btree_and_journal_iter_peek(&jiter);
+
+       bch2_bkey_buf_reassemble(out, c, k);
+
+       if (flags & BTREE_ITER_PREFETCH)
+               ret = btree_path_prefetch_j(trans, path, &jiter);
+
+       bch2_btree_and_journal_iter_exit(&jiter);
+       return ret;
+}
+
 static __always_inline int btree_path_down(struct btree_trans *trans,
                                           struct btree_path *path,
                                           unsigned flags,
@@ -1321,8 +1385,21 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
        EBUG_ON(!btree_node_locked(path, path->level));
 
        bch2_bkey_buf_init(&tmp);
-       bch2_bkey_buf_unpack(&tmp, c, l->b,
-                        bch2_btree_node_iter_peek(&l->iter, l->b));
+
+       if (unlikely(trans->journal_replay_not_finished)) {
+               ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+               if (ret)
+                       goto err;
+       } else {
+               bch2_bkey_buf_unpack(&tmp, c, l->b,
+                                bch2_btree_node_iter_peek(&l->iter, l->b));
+
+               if (flags & BTREE_ITER_PREFETCH) {
+                       ret = btree_path_prefetch(trans, path);
+                       if (ret)
+                               goto err;
+               }
+       }
 
        b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
        ret = PTR_ERR_OR_ZERO(b);
@@ -1332,13 +1409,11 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
        mark_btree_node_locked(path, level, lock_type);
        btree_path_level_init(trans, path, b);
 
-       if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
+       if (likely(!trans->journal_replay_not_finished &&
+                  tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
            unlikely(b != btree_node_mem_ptr(tmp.k)))
                btree_node_mem_ptr_set(trans, path, level + 1, b);
 
-       if (flags & BTREE_ITER_PREFETCH)
-               ret = btree_path_prefetch(trans, path);
-
        if (btree_node_read_locked(path, level + 1))
                btree_node_unlock(path, level + 1);
        path->level = level;
@@ -2113,6 +2188,55 @@ struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
        return ret;
 }
 
+static struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
+                                                struct btree_path *path)
+{
+       struct journal_keys *keys = &trans->c->journal_keys;
+       size_t idx = bch2_journal_key_search(keys, path->btree_id,
+                                            path->level, path->pos);
+
+       while (idx < keys->nr && keys->d[idx].overwritten)
+               idx++;
+
+       return (idx < keys->nr &&
+               keys->d[idx].btree_id   == path->btree_id &&
+               keys->d[idx].level      == path->level)
+               ? keys->d[idx].k
+               : NULL;
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
+                                             struct btree_iter *iter)
+{
+       struct bkey_i *k = __btree_trans_peek_journal(trans, iter->path);
+
+       if (k && !bpos_cmp(k->k.p, iter->pos)) {
+               iter->k = k->k;
+               return bkey_i_to_s_c(k);
+       } else {
+               return bkey_s_c_null;
+       }
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
+                                        struct btree_iter *iter,
+                                        struct bkey_s_c k)
+{
+       struct bkey_i *next_journal =
+               __btree_trans_peek_journal(trans, iter->path);
+
+       if (next_journal &&
+           bpos_cmp(next_journal->k.p,
+                    k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+               iter->k = next_journal->k;
+               k = bkey_i_to_s_c(next_journal);
+       }
+
+       return k;
+}
+
 /**
  * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
  * current position
@@ -2141,16 +2265,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
                        goto out;
                }
 
-               next_update = btree_trans_peek_updates(iter);
                k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
 
-               /*
-                * In the btree, deleted keys sort before non deleted: */
-               if (k.k && bkey_deleted(k.k) &&
-                   (!next_update ||
-                    bpos_cmp(k.k->p, next_update->k.p) <= 0)) {
-                       search_key = k.k->p;
-                       continue;
-               }
+               if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
+                       k = btree_trans_peek_journal(trans, iter, k);
+
+               next_update = btree_trans_peek_updates(iter);
 
                if (next_update &&
                    bpos_cmp(next_update->k.p,
@@ -2159,6 +2279,20 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
                        k = bkey_i_to_s_c(next_update);
                }
 
+               if (k.k && bkey_deleted(k.k)) {
+                       /*
+                        * If we've got a whiteout, and it's after the search
+                        * key, advance the search key to the whiteout instead
+                        * of just after the whiteout - it might be a btree
+                        * whiteout, with a real key at the same position, since
+                        * in the btree deleted keys sort before non deleted.
+                        */
+                       search_key = bpos_cmp(search_key, k.k->p)
+                               ? k.k->p
+                               : bpos_successor(k.k->p);
+                       continue;
+               }
+
                if (likely(k.k)) {
                        /*
                         * We can never have a key in a leaf node at POS_MAX, so
@@ -2249,6 +2383,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
 
        EBUG_ON(iter->path->cached || iter->path->level);
        EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+
+       if (iter->flags & BTREE_ITER_WITH_JOURNAL)
+               return bkey_s_c_err(-EIO);
+
        bch2_btree_iter_verify(iter);
        bch2_btree_iter_verify_entry_exit(iter);
 
@@ -2395,23 +2533,18 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
            !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
                struct bkey_i *next_update;
 
-               next_update = btree_trans_peek_updates(iter);
-               if (next_update &&
+               if ((next_update = btree_trans_peek_updates(iter)) &&
                    !bpos_cmp(next_update->k.p, iter->pos)) {
                        iter->k = next_update->k;
                        k = bkey_i_to_s_c(next_update);
-               } else {
-                       k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+                       goto out;
                }
 
-               if (!k.k ||
-                   ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
-                    ? bpos_cmp(iter->pos, k.k->p)
-                    : bkey_cmp(iter->pos, k.k->p))) {
-                       bkey_init(&iter->k);
-                       iter->k.p = iter->pos;
-                       k = (struct bkey_s_c) { &iter->k, NULL };
-               }
+               if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+                   (k = btree_trans_peek_slot_journal(trans, iter)).k)
+                       goto out;
+
+               k = bch2_btree_path_peek_slot(iter->path, &iter->k);
        } else {
                struct bpos next;
 
@@ -2455,7 +2588,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
                        k = (struct bkey_s_c) { &iter->k, NULL };
                }
        }
-
+out:
        iter->path->should_be_locked = true;
 
        bch2_btree_iter_verify_entry_exit(iter);
@@ -2635,6 +2768,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
            btree_type_has_snapshots(btree_id))
                flags |= BTREE_ITER_FILTER_SNAPSHOTS;
 
+       if (trans->journal_replay_not_finished)
+               flags |= BTREE_ITER_WITH_JOURNAL;
+
        iter->trans     = trans;
        iter->path      = NULL;
        iter->btree_id  = btree_id;
@@ -2801,6 +2937,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
        memset(trans, 0, sizeof(*trans));
        trans->c                = c;
        trans->ip               = _RET_IP_;
+       trans->journal_replay_not_finished =
+               !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
 
        bch2_trans_alloc_paths(trans, c);
 
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 2c8b30949e6f86925f2fae67cb590e91249571a1..1fd0cebe30acba5dff6cf7babb158b59c4cf9c74 100644
@@ -207,10 +207,11 @@ struct btree_node_iter {
 #define BTREE_ITER_CACHED_NOFILL       (1 << 8)
 #define BTREE_ITER_CACHED_NOCREATE     (1 << 9)
 #define BTREE_ITER_WITH_UPDATES                (1 << 10)
-#define __BTREE_ITER_ALL_SNAPSHOTS     (1 << 11)
-#define BTREE_ITER_ALL_SNAPSHOTS       (1 << 12)
-#define BTREE_ITER_FILTER_SNAPSHOTS    (1 << 13)
-#define BTREE_ITER_NOPRESERVE          (1 << 14)
+#define BTREE_ITER_WITH_JOURNAL                (1 << 11)
+#define __BTREE_ITER_ALL_SNAPSHOTS     (1 << 12)
+#define BTREE_ITER_ALL_SNAPSHOTS       (1 << 13)
+#define BTREE_ITER_FILTER_SNAPSHOTS    (1 << 14)
+#define BTREE_ITER_NOPRESERVE          (1 << 15)
 
 enum btree_path_uptodate {
        BTREE_ITER_UPTODATE             = 0,
@@ -381,6 +382,7 @@ struct btree_trans {
        bool                    restarted:1;
        bool                    paths_sorted:1;
        bool                    journal_transaction_names:1;
+       bool                    journal_replay_not_finished:1;
        /*
         * For when bch2_trans_update notices we'll be splitting a compressed
         * extent:
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 6ef0711431a1f13fdd30e42b32d23fe19e3c793a..17111c4228bd2a999bf5d6542e76d31e63d65f71 100644
@@ -16,6 +16,7 @@
 #include "journal.h"
 #include "journal_reclaim.h"
 #include "keylist.h"
+#include "recovery.h"
 #include "replicas.h"
 #include "super-io.h"
 #include "trace.h"
@@ -1146,6 +1147,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
        BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
               !btree_ptr_sectors_written(insert));
 
+       if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+               bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
+
        invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
                bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
        if (invalid) {
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 8af9ba464b253ab1bc56e7bba65aa3406f2178ea..e95940ffad6bc9db07d635d336fc2129d20590d0 100644
@@ -711,7 +711,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
 
        ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
 
-       if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+       if (!ret && unlikely(trans->journal_replay_not_finished))
                bch2_drop_overwrites_from_journal(trans);
 
        trans_for_each_update(trans, i)
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index e18d2ecf7f071f37954811d3f8556d4993473ebe..86421f65d139d789d83d780e60ffc327ba869244 100644
@@ -1558,50 +1558,48 @@ void bch2_stripes_heap_start(struct bch_fs *c)
                        bch2_stripes_heap_insert(c, m, iter.pos);
 }
 
-static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k)
+int bch2_stripes_read(struct bch_fs *c)
 {
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
        const struct bch_stripe *s;
-       struct bch_fs *c = trans->c;
        struct stripe *m;
        unsigned i;
-       int ret = 0;
+       int ret;
 
-       if (k.k->type != KEY_TYPE_stripe)
-               return 0;
+       bch2_trans_init(&trans, c, 0, 0);
 
-       ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
-       if (ret)
-               return ret;
+       for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               if (k.k->type != KEY_TYPE_stripe)
+                       continue;
 
-       s = bkey_s_c_to_stripe(k).v;
+               ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+               if (ret)
+                       break;
 
-       m = genradix_ptr(&c->stripes, k.k->p.offset);
-       m->alive        = true;
-       m->sectors      = le16_to_cpu(s->sectors);
-       m->algorithm    = s->algorithm;
-       m->nr_blocks    = s->nr_blocks;
-       m->nr_redundant = s->nr_redundant;
-       m->blocks_nonempty = 0;
+               s = bkey_s_c_to_stripe(k).v;
 
-       for (i = 0; i < s->nr_blocks; i++)
-               m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+               m = genradix_ptr(&c->stripes, k.k->p.offset);
+               m->alive        = true;
+               m->sectors      = le16_to_cpu(s->sectors);
+               m->algorithm    = s->algorithm;
+               m->nr_blocks    = s->nr_blocks;
+               m->nr_redundant = s->nr_redundant;
+               m->blocks_nonempty = 0;
 
-       spin_lock(&c->ec_stripes_heap_lock);
-       bch2_stripes_heap_update(c, m, k.k->p.offset);
-       spin_unlock(&c->ec_stripes_heap_lock);
-
-       return ret;
-}
+               for (i = 0; i < s->nr_blocks; i++)
+                       m->blocks_nonempty += !!stripe_blockcount_get(s, i);
 
-int bch2_stripes_read(struct bch_fs *c)
-{
-       struct btree_trans trans;
-       int ret;
+               spin_lock(&c->ec_stripes_heap_lock);
+               bch2_stripes_heap_update(c, m, k.k->p.offset);
+               spin_unlock(&c->ec_stripes_heap_lock);
+       }
+       bch2_trans_iter_exit(&trans, &iter);
 
-       bch2_trans_init(&trans, c, 0, 0);
-       ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
-                                         bch2_stripes_read_fn);
        bch2_trans_exit(&trans);
+
        if (ret)
                bch_err(c, "error reading stripes: %i", ret);
 
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 219351654564adcf10089f4c2422f9faf4123790..57311ad283c75b987c8a39bfabbd41043cddfcd0 100644
@@ -59,23 +59,21 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys)
 static int __journal_key_cmp(enum btree_id     l_btree_id,
                             unsigned           l_level,
                             struct bpos        l_pos,
-                            struct journal_key *r)
+                            const struct journal_key *r)
 {
        return (cmp_int(l_btree_id,     r->btree_id) ?:
                cmp_int(l_level,        r->level) ?:
                bpos_cmp(l_pos, r->k->k.p));
 }
 
-static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
 {
-       return (cmp_int(l->btree_id,    r->btree_id) ?:
-               cmp_int(l->level,       r->level) ?:
-               bpos_cmp(l->k->k.p,     r->k->k.p));
+       return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
 }
 
-static size_t journal_key_search(struct journal_keys *journal_keys,
-                                enum btree_id id, unsigned level,
-                                struct bpos pos)
+size_t bch2_journal_key_search(struct journal_keys *journal_keys,
+                              enum btree_id id, unsigned level,
+                              struct bpos pos)
 {
        size_t l = 0, r = journal_keys->nr, m;
 
@@ -125,7 +123,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
        };
        struct journal_keys *keys = &c->journal_keys;
        struct journal_iter *iter;
-       unsigned idx = journal_key_search(keys, id, level, k->k.p);
+       size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
 
        BUG_ON(test_bit(BCH_FS_RW, &c->flags));
 
@@ -164,6 +162,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
        return 0;
 }
 
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've got RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
 int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
                            unsigned level, struct bkey_i *k)
 {
@@ -196,7 +199,7 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
                                  unsigned level, struct bpos pos)
 {
        struct journal_keys *keys = &c->journal_keys;
-       size_t idx = journal_key_search(keys, btree, level, pos);
+       size_t idx = bch2_journal_key_search(keys, btree, level, pos);
 
        if (idx < keys->nr &&
            keys->d[idx].btree_id       == btree &&
@@ -207,15 +210,18 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
 
 static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
 {
-       struct journal_key *k = iter->idx - iter->keys->nr
-               ? iter->keys->d + iter->idx : NULL;
+       struct journal_key *k = iter->keys->d + iter->idx;
 
-       if (k &&
-           k->btree_id == iter->btree_id &&
-           k->level    == iter->level)
-               return k->k;
+       while (k < iter->keys->d + iter->keys->nr &&
+              k->btree_id      == iter->btree_id &&
+              k->level         == iter->level) {
+               if (!k->overwritten)
+                       return k->k;
+
+               iter->idx++;
+               k = iter->keys->d + iter->idx;
+       }
 
-       iter->idx = iter->keys->nr;
        return NULL;
 }
 
@@ -238,8 +244,7 @@ static void bch2_journal_iter_init(struct bch_fs *c,
        iter->btree_id  = id;
        iter->level     = level;
        iter->keys      = &c->journal_keys;
-       iter->idx       = journal_key_search(&c->journal_keys, id, level, pos);
-       list_add(&iter->list, &c->journal_iters);
+       iter->idx       = bch2_journal_key_search(&c->journal_keys, id, level, pos);
 }
 
 static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -325,106 +330,33 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
        bch2_journal_iter_exit(&iter->journal);
 }
 
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-                                               struct bch_fs *c,
-                                               struct btree *b)
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+                                                 struct bch_fs *c,
+                                                 struct btree *b,
+                                                 struct btree_node_iter node_iter,
+                                                 struct bpos pos)
 {
        memset(iter, 0, sizeof(*iter));
 
        iter->b = b;
-       bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
-       bch2_journal_iter_init(c, &iter->journal,
-                              b->c.btree_id, b->c.level, b->data->min_key);
-}
-
-/* Walk btree, overlaying keys from the journal: */
-
-static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
-                                          struct btree_and_journal_iter iter)
-{
-       unsigned i = 0, nr = b->c.level > 1 ? 2 : 16;
-       struct bkey_s_c k;
-       struct bkey_buf tmp;
-
-       BUG_ON(!b->c.level);
-
-       bch2_bkey_buf_init(&tmp);
-
-       while (i < nr &&
-              (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-               bch2_bkey_buf_reassemble(&tmp, c, k);
-
-               bch2_btree_node_prefetch(c, NULL, NULL, tmp.k,
-                                       b->c.btree_id, b->c.level - 1);
-
-               bch2_btree_and_journal_iter_advance(&iter);
-               i++;
-       }
-
-       bch2_bkey_buf_exit(&tmp, c);
-}
-
-static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b,
-                               enum btree_id btree_id,
-                               btree_walk_key_fn key_fn)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_and_journal_iter iter;
-       struct bkey_s_c k;
-       struct bkey_buf tmp;
-       struct btree *child;
-       int ret = 0;
-
-       bch2_bkey_buf_init(&tmp);
-       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
-
-       while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-               if (b->c.level) {
-                       bch2_bkey_buf_reassemble(&tmp, c, k);
-
-                       child = bch2_btree_node_get_noiter(c, tmp.k,
-                                               b->c.btree_id, b->c.level - 1,
-                                               false);
-
-                       ret = PTR_ERR_OR_ZERO(child);
-                       if (ret)
-                               break;
-
-                       btree_and_journal_iter_prefetch(c, b, iter);
-
-                       ret = bch2_btree_and_journal_walk_recurse(trans, child,
-                                       btree_id, key_fn);
-                       six_unlock_read(&child->c.lock);
-               } else {
-                       ret = key_fn(trans, k);
-               }
-
-               if (ret)
-                       break;
-
-               bch2_btree_and_journal_iter_advance(&iter);
-       }
-
-       bch2_btree_and_journal_iter_exit(&iter);
-       bch2_bkey_buf_exit(&tmp, c);
-       return ret;
+       iter->node_iter = node_iter;
+       bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+       INIT_LIST_HEAD(&iter->journal.list);
 }
 
-int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id,
-                               btree_walk_key_fn key_fn)
+/*
+ * this version is used by btree_gc before the filesystem has gone RW and
+ * multithreaded, so it uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+                                               struct bch_fs *c,
+                                               struct btree *b)
 {
-       struct bch_fs *c = trans->c;
-       struct btree *b = c->btree_roots[btree_id].b;
-       int ret = 0;
-
-       if (btree_node_fake(b))
-               return 0;
-
-       six_lock_read(&b->c.lock, NULL, NULL);
-       ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn);
-       six_unlock_read(&b->c.lock);
+       struct btree_node_iter node_iter;
 
-       return ret;
+       bch2_btree_node_iter_init_from_start(&node_iter, b);
+       __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
+       list_add(&iter->journal.list, &c->journal_iters);
 }
 
 /* sort and dedup all keys in the journal: */
@@ -449,9 +381,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
        const struct journal_key *l = _l;
        const struct journal_key *r = _r;
 
-       return  cmp_int(l->btree_id,    r->btree_id) ?:
-               cmp_int(l->level,       r->level) ?:
-               bpos_cmp(l->k->k.p, r->k->k.p) ?:
+       return  journal_key_cmp(l, r) ?:
                cmp_int(l->journal_seq, r->journal_seq) ?:
                cmp_int(l->journal_offset, r->journal_offset);
 }
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index a7a9496afb95c63183c727d59345f743cdd8b48c..21bdad9db2493668b0f5c2a6bcf6f2913315b8db 100644
@@ -31,6 +31,9 @@ struct btree_and_journal_iter {
        }                       last;
 };
 
+size_t bch2_journal_key_search(struct journal_keys *, enum btree_id,
+                              unsigned, struct bpos);
+
 int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
                                 unsigned, struct bkey_i *);
 int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
@@ -45,14 +48,13 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
 struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
 
 void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+                               struct bch_fs *, struct btree *,
+                               struct btree_node_iter, struct bpos);
 void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
                                                struct bch_fs *,
                                                struct btree *);
 
-typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c);
-
-int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn);
-
 void bch2_journal_keys_free(struct journal_keys *);
 void bch2_journal_entries_free(struct list_head *);
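
With the walk helpers gone, recovery-time readers need no special
casing. A sketch of the resulting usage pattern, mirroring the
converted bch2_alloc_read()/bch2_stripes_read() above (walk_stripes is
a hypothetical name):

  static int walk_stripes(struct bch_fs *c)
  {
          struct btree_trans trans;
          struct btree_iter iter;
          struct bkey_s_c k;
          int ret;

          /*
           * Before journal replay finishes, bch2_trans_init() sets
           * trans->journal_replay_not_finished, and __bch2_trans_iter_init()
           * then sets BTREE_ITER_WITH_JOURNAL automatically:
           */
          bch2_trans_init(&trans, c, 0, 0);

          for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
                             BTREE_ITER_PREFETCH, k, ret) {
                  /* k is the btree with unreplayed journal keys overlaid */
          }
          bch2_trans_iter_exit(&trans, &iter);

          bch2_trans_exit(&trans);
          return ret;
  }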