bcachefs: Gap buffer for journal keys
authorKent Overstreet <kent.overstreet@gmail.com>
Mon, 4 Apr 2022 05:09:26 +0000 (01:09 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:30 +0000 (17:09 -0400)
Btree updates before we go RW work by inserting into the array of keys
that journal replay will insert - but inserting into a flat array is
O(n), meaning if btree_gc needs to update many alloc keys, we're O(n^2).

Fortunately, the updates btree_gc does happens in sequential order,
which means a gap buffer works nicely here - this patch implements a gap
buffer for journal keys.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
fs/bcachefs/bcachefs.h
fs/bcachefs/recovery.c
fs/bcachefs/recovery.h
fs/bcachefs/util.h

index c06837612bdf59e435c1d9ce8e9fa61a9c127acc..f2bb23162b4ac328a56342c4ed8cd863a2ac37c5 100644 (file)
@@ -548,6 +548,12 @@ struct journal_keys {
                u32             journal_seq;
                u32             journal_offset;
        }                       *d;
+       /*
+        * Gap buffer: instead of all the empty space in the array being at the
+        * end of the buffer - from @nr to @size - the empty space is at @gap.
+        * This means that sequential insertions are O(n) instead of O(n^2).
+        */
+       size_t                  gap;
        size_t                  nr;
        size_t                  size;
        u64                     journal_seq_base;
index 8291e58089fd48a8788eb237e188edb3cfa61136..f9215cc7cb09dabfe7bf03207e3665dcccfb65f4 100644 (file)
@@ -72,58 +72,97 @@ static int journal_key_cmp(const struct journal_key *l, const struct journal_key
        return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
 }
 
-size_t bch2_journal_key_search(struct journal_keys *journal_keys,
+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
+{
+       size_t gap_size = keys->size - keys->nr;
+
+       if (idx >= keys->gap)
+               idx += gap_size;
+       return idx;
+}
+
+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
+{
+       return keys->d + idx_to_pos(keys, idx);
+}
+
+size_t bch2_journal_key_search(struct journal_keys *keys,
                               enum btree_id id, unsigned level,
                               struct bpos pos)
 {
-       size_t l = 0, r = journal_keys->nr, m;
+       size_t l = 0, r = keys->nr, m;
 
        while (l < r) {
                m = l + ((r - l) >> 1);
-               if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0)
+               if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
                        l = m + 1;
                else
                        r = m;
        }
 
-       BUG_ON(l < journal_keys->nr &&
-              __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0);
+       BUG_ON(l < keys->nr &&
+              __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
 
        BUG_ON(l &&
-              __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0);
+              __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
 
-       return l;
+       return idx_to_pos(keys, l);
 }
 
 struct bkey_i *bch2_journal_keys_peek(struct bch_fs *c, enum btree_id btree_id,
                                      unsigned level, struct bpos pos)
 {
        struct journal_keys *keys = &c->journal_keys;
-       struct journal_key *end = keys->d + keys->nr;
-       struct journal_key *k = keys->d +
-               bch2_journal_key_search(keys, btree_id, level, pos);
+       size_t idx = bch2_journal_key_search(keys, btree_id, level, pos);
 
-       while (k < end && k->overwritten)
-               k++;
+       while (idx < keys->size &&
+              keys->d[idx].overwritten) {
+               idx++;
+               if (idx == keys->gap)
+                       idx += keys->size - keys->nr;
+       }
 
-       if (k < end &&
-           k->btree_id == btree_id &&
-           k->level    == level)
-               return k->k;
+       if (idx < keys->size &&
+           keys->d[idx].btree_id == btree_id &&
+           keys->d[idx].level == level)
+               return keys->d[idx].k;
        return NULL;
 }
 
-static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx)
+static void journal_iters_fix(struct bch_fs *c)
 {
-       struct bkey_i *n = iter->keys->d[idx].k;
-       struct btree_and_journal_iter *biter =
-               container_of(iter, struct btree_and_journal_iter, journal);
-
-       if (iter->idx > idx ||
-           (iter->idx == idx &&
-            biter->last &&
-            bpos_cmp(n->k.p, biter->unpacked.p) <= 0))
-               iter->idx++;
+       struct journal_keys *keys = &c->journal_keys;
+       /* The key we just inserted is immediately before the gap: */
+       struct journal_key *n = &keys->d[keys->gap - 1];
+       size_t gap_end = keys->gap + (keys->size - keys->nr);
+       struct btree_and_journal_iter *iter;
+
+       /*
+        * If an iterator points one after the key we just inserted,
+        * and the key we just inserted compares >= the iterator's position,
+        * decrement the iterator so it points at the key we just inserted:
+        */
+       list_for_each_entry(iter, &c->journal_iters, journal.list)
+               if (iter->journal.idx == gap_end &&
+                   iter->last &&
+                   iter->b->c.btree_id == n->btree_id &&
+                   iter->b->c.level    == n->level &&
+                   bpos_cmp(n->k->k.p, iter->unpacked.p) >= 0)
+                       iter->journal.idx = keys->gap - 1;
+}
+
+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       struct journal_iter *iter;
+       size_t gap_size = keys->size - keys->nr;
+
+       list_for_each_entry(iter, &c->journal_iters, list) {
+               if (iter->idx > old_gap)
+                       iter->idx -= gap_size;
+               if (iter->idx >= new_gap)
+                       iter->idx += gap_size;
+       }
 }
 
 int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
@@ -141,12 +180,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
                .journal_seq    = U32_MAX,
        };
        struct journal_keys *keys = &c->journal_keys;
-       struct journal_iter *iter;
        size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
 
        BUG_ON(test_bit(BCH_FS_RW, &c->flags));
 
-       if (idx < keys->nr &&
+       if (idx < keys->size &&
            journal_key_cmp(&n, &keys->d[idx]) == 0) {
                if (keys->d[idx].allocated)
                        kfree(keys->d[idx].k);
@@ -154,6 +192,9 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
                return 0;
        }
 
+       if (idx > keys->gap)
+               idx -= keys->size - keys->nr;
+
        if (keys->nr == keys->size) {
                struct journal_keys new_keys = {
                        .nr                     = keys->nr,
@@ -168,15 +209,24 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
                        return -ENOMEM;
                }
 
+               /* Since @keys was full, there was no gap: */
                memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
                kvfree(keys->d);
                *keys = new_keys;
+
+               /* And now the gap is at the end: */
+               keys->gap = keys->nr;
        }
 
-       array_insert_item(keys->d, keys->nr, idx, n);
+       journal_iters_move_gap(c, keys->gap, idx);
+
+       move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
+       keys->gap = idx;
+
+       keys->nr++;
+       keys->d[keys->gap++] = n;
 
-       list_for_each_entry(iter, &c->journal_iters, list)
-               journal_iter_fix(c, iter, idx);
+       journal_iters_fix(c);
 
        return 0;
 }
@@ -220,36 +270,39 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
        struct journal_keys *keys = &c->journal_keys;
        size_t idx = bch2_journal_key_search(keys, btree, level, pos);
 
-       if (idx < keys->nr &&
+       if (idx < keys->size &&
            keys->d[idx].btree_id       == btree &&
            keys->d[idx].level          == level &&
            !bpos_cmp(keys->d[idx].k->k.p, pos))
                keys->d[idx].overwritten = true;
 }
 
-static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+       if (iter->idx < iter->keys->size) {
+               iter->idx++;
+               if (iter->idx == iter->keys->gap)
+                       iter->idx += iter->keys->size - iter->keys->nr;
+       }
+}
+
+struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
 {
        struct journal_key *k = iter->keys->d + iter->idx;
 
-       while (k < iter->keys->d + iter->keys->nr &&
+       while (k < iter->keys->d + iter->keys->size &&
               k->btree_id      == iter->btree_id &&
               k->level         == iter->level) {
                if (!k->overwritten)
                        return k->k;
 
-               iter->idx++;
+               bch2_journal_iter_advance(iter);
                k = iter->keys->d + iter->idx;
        }
 
        return NULL;
 }
 
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
-       if (iter->idx < iter->keys->nr)
-               iter->idx++;
-}
-
 static void bch2_journal_iter_exit(struct journal_iter *iter)
 {
        list_del(&iter->list);
@@ -409,13 +462,16 @@ void bch2_journal_keys_free(struct journal_keys *keys)
 {
        struct journal_key *i;
 
+       move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+       keys->gap = keys->nr;
+
        for (i = keys->d; i < keys->d + keys->nr; i++)
                if (i->allocated)
                        kfree(i->k);
 
        kvfree(keys->d);
        keys->d = NULL;
-       keys->nr = 0;
+       keys->nr = keys->gap = keys->size = 0;
 }
 
 static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
@@ -478,6 +534,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
        }
 
        keys.nr = dst - keys.d;
+       keys.gap = keys.nr;
 err:
        return keys;
 }
@@ -538,6 +595,9 @@ static int bch2_journal_replay(struct bch_fs *c)
        size_t i;
        int ret;
 
+       move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+       keys->gap = keys->nr;
+
        keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL);
        if (!keys_sorted)
                return -ENOMEM;
index e6927a918df3bfbe1f36a160513eaa80881b85e4..30580a8984a13b2de02f9ef8ace9d3999f28da02 100644 (file)
@@ -2,9 +2,6 @@
 #ifndef _BCACHEFS_RECOVERY_H
 #define _BCACHEFS_RECOVERY_H
 
-#define for_each_journal_key(keys, i)                          \
-       for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
-
 struct journal_iter {
        struct list_head        list;
        enum btree_id           btree_id;
index 1629d279f49404e4a212cb7be2d0fff2bce90d34..74bfa5faf470ccbb28d84fb3dc85ae91156ffc36 100644 (file)
@@ -826,6 +826,31 @@ do {                                                                       \
 #define array_remove_item(_array, _nr, _pos)                           \
        array_remove_items(_array, _nr, _pos, 1)
 
+static inline void __move_gap(void *array, size_t element_size,
+                             size_t nr, size_t size,
+                             size_t old_gap, size_t new_gap)
+{
+       size_t gap_end = old_gap + size - nr;
+
+       if (new_gap < old_gap) {
+               size_t move = old_gap - new_gap;
+
+               memmove(array + element_size * (gap_end - move),
+                       array + element_size * (old_gap - move),
+                               element_size * move);
+       } else if (new_gap > old_gap) {
+               size_t move = new_gap - old_gap;
+
+               memmove(array + element_size * old_gap,
+                       array + element_size * gap_end,
+                               element_size * move);
+       }
+}
+
+/* Move the gap in a gap buffer: */
+#define move_gap(_array, _nr, _size, _old_gap, _new_gap)       \
+       __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
+
 #define bubble_sort(_base, _nr, _cmp)                                  \
 do {                                                                   \
        ssize_t _i, _end;                                               \