bcachefs: Convert bucket invalidation to key marking path
author Kent Overstreet <kent.overstreet@gmail.com>
Wed, 13 Feb 2019 19:46:32 +0000 (14:46 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:16 +0000 (17:08 -0400)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_background.c
fs/bcachefs/alloc_background.h
fs/bcachefs/bcachefs_format.h
fs/bcachefs/btree_types.h
fs/bcachefs/btree_update.h
fs/bcachefs/buckets.c
fs/bcachefs/buckets.h
fs/bcachefs/fifo.h
fs/bcachefs/journal_io.c

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 5b9d6c77d037b2365f786316a93c701527436cba..04b75367fcde5ba47cede82f237bbd66214e1474 100644
@@ -129,6 +129,34 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
        *p += bytes;
 }
 
+struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a)
+{
+       struct bkey_alloc_unpacked ret = { .gen = a->gen };
+       const void *d = a->data;
+       unsigned idx = 0;
+
+#define x(_name, _bits)        ret._name = get_alloc_field(a, &d, idx++);
+       BCH_ALLOC_FIELDS()
+#undef  x
+       return ret;
+}
+
+static void bch2_alloc_pack(struct bkey_i_alloc *dst,
+                           const struct bkey_alloc_unpacked src)
+{
+       unsigned idx = 0;
+       void *d = dst->v.data;
+
+       dst->v.fields   = 0;
+       dst->v.gen      = src.gen;
+
+#define x(_name, _bits)        put_alloc_field(dst, &d, idx++, src._name);
+       BCH_ALLOC_FIELDS()
+#undef  x
+
+       set_bkey_val_bytes(&dst->k, (void *) d - (void *) &dst->v);
+}
+
 static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
 {
        unsigned i, bytes = offsetof(struct bch_alloc, data);
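
Both helpers above are driven by the BCH_ALLOC_FIELDS() x-macro, so adding a
field to that list updates the unpacked struct, the pack path and the unpack
path in one place. A minimal, self-contained sketch of the same round-trip
pattern — MODEL_FIELDS, model_pack and model_unpack are hypothetical
stand-ins, and the little-endian byte copies stand in for the real
get_alloc_field()/put_alloc_field() helpers:

#include <assert.h>
#include <stdint.h>

#define MODEL_FIELDS()                  \
	x(read_time,            16)     \
	x(write_time,           16)     \
	x(oldest_gen,           8)

struct model_unpacked {
	uint8_t gen;
#define x(_name, _bits) uint##_bits##_t _name;
	MODEL_FIELDS()
#undef x
};

/* pack: write gen, then each listed field, little-endian, _bits / 8 bytes wide */
static size_t model_pack(uint8_t *d, const struct model_unpacked *src)
{
	uint8_t *p = d;

	*p++ = src->gen;
#define x(_name, _bits)                                 \
	for (unsigned i = 0; i < _bits / 8; i++)        \
		*p++ = src->_name >> (i * 8);
	MODEL_FIELDS()
#undef x
	return p - d;
}

/* unpack: read the fields back in the same order */
static void model_unpack(struct model_unpacked *dst, const uint8_t *p)
{
	dst->gen = *p++;
#define x(_name, _bits)                                 \
	dst->_name = 0;                                 \
	for (unsigned i = 0; i < _bits / 8; i++)        \
		dst->_name |= (uint##_bits##_t) *p++ << (i * 8);
	MODEL_FIELDS()
#undef x
}

int main(void)
{
	struct model_unpacked in = { .gen = 3, .read_time = 40000, .oldest_gen = 2 }, out;
	uint8_t buf[16];

	assert(model_pack(buf, &in) == 6);	/* 1 + 2 + 2 + 1 bytes */
	model_unpack(&out, buf);
	assert(in.gen == out.gen && in.read_time == out.read_time &&
	       in.write_time == out.write_time && in.oldest_gen == out.oldest_gen);
	return 0;
}
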
@@ -174,16 +202,24 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
 static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
 {
        const void *d = a->data;
-       unsigned idx = 0;
+       unsigned idx = 0, data_type, dirty_sectors, cached_sectors;
+       struct bucket_mark m;
 
-       g->_mark.gen            = a->gen;
-       g->gen_valid            = 1;
        g->io_time[READ]        = get_alloc_field(a, &d, idx++);
        g->io_time[WRITE]       = get_alloc_field(a, &d, idx++);
-       g->_mark.data_type      = get_alloc_field(a, &d, idx++);
-       g->_mark.dirty_sectors  = get_alloc_field(a, &d, idx++);
-       g->_mark.cached_sectors = get_alloc_field(a, &d, idx++);
+       data_type               = get_alloc_field(a, &d, idx++);
+       dirty_sectors           = get_alloc_field(a, &d, idx++);
+       cached_sectors          = get_alloc_field(a, &d, idx++);
        g->oldest_gen           = get_alloc_field(a, &d, idx++);
+
+       bucket_cmpxchg(g, m, ({
+               m.gen                   = a->gen;
+               m.data_type             = data_type;
+               m.dirty_sectors         = dirty_sectors;
+               m.cached_sectors        = cached_sectors;
+       }));
+
+       g->gen_valid            = 1;
 }
 
 static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
@@ -318,6 +354,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
                                   BTREE_INSERT_NOFAIL|
                                   BTREE_INSERT_USE_RESERVE|
                                   BTREE_INSERT_USE_ALLOC_RESERVE|
+                                  BTREE_INSERT_NOMARK|
                                   flags,
                                   BTREE_INSERT_ENTRY(iter, &a->k_i));
        if (ret)
@@ -361,7 +398,8 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
                ? 0
                : bch2_btree_insert_at(c, NULL, NULL,
                                       BTREE_INSERT_NOFAIL|
-                                      BTREE_INSERT_JOURNAL_REPLAY,
+                                      BTREE_INSERT_JOURNAL_REPLAY|
+                                      BTREE_INSERT_NOMARK,
                                       BTREE_INSERT_ENTRY(&iter, k));
 err:
        bch2_btree_iter_unlock(&iter);
@@ -827,6 +865,142 @@ static inline long next_alloc_bucket(struct bch_dev *ca)
        return -1;
 }
 
+/*
+ * returns sequence number of most recent journal entry that updated this
+ * bucket:
+ */
+static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
+{
+       if (m.journal_seq_valid) {
+               u64 journal_seq = atomic64_read(&c->journal.seq);
+               u64 bucket_seq  = journal_seq;
+
+               bucket_seq &= ~((u64) U16_MAX);
+               bucket_seq |= m.journal_seq;
+
+               if (bucket_seq > journal_seq)
+                       bucket_seq -= 1 << 16;
+
+               return bucket_seq;
+       } else {
+               return 0;
+       }
+}
+
+static int bch2_invalidate_one_bucket2(struct bch_fs *c, struct bch_dev *ca,
+                                      struct btree_iter *iter,
+                                      u64 *journal_seq, unsigned flags)
+{
+#if 0
+       __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
+#else
+       /* hack: */
+       __BKEY_PADDED(k, 8) alloc_key;
+#endif
+       struct bkey_i_alloc *a;
+       struct bkey_alloc_unpacked u;
+       struct bucket_mark m;
+       struct bkey_s_c k;
+       bool invalidating_cached_data;
+       size_t b;
+       int ret;
+
+       BUG_ON(!ca->alloc_heap.used ||
+              !ca->alloc_heap.data[0].nr);
+       b = ca->alloc_heap.data[0].bucket;
+
+       /* first, put on free_inc and mark as owned by allocator: */
+       percpu_down_read(&c->mark_lock);
+       spin_lock(&c->freelist_lock);
+
+       verify_not_on_freelist(c, ca, b);
+
+       BUG_ON(!fifo_push(&ca->free_inc, b));
+
+       bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
+       m = bucket(ca, b)->mark;
+
+       spin_unlock(&c->freelist_lock);
+       percpu_up_read(&c->mark_lock);
+
+       bch2_btree_iter_cond_resched(iter);
+
+       BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
+
+       bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
+retry:
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = btree_iter_err(k);
+       if (ret)
+               return ret;
+
+       if (k.k && k.k->type == KEY_TYPE_alloc)
+               u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
+       else
+               memset(&u, 0, sizeof(u));
+
+       invalidating_cached_data = u.cached_sectors != 0;
+
+       //BUG_ON(u.dirty_sectors);
+       u.data_type     = 0;
+       u.dirty_sectors = 0;
+       u.cached_sectors = 0;
+       u.read_time     = c->bucket_clock[READ].hand;
+       u.write_time    = c->bucket_clock[WRITE].hand;
+       u.gen++;
+
+       a = bkey_alloc_init(&alloc_key.k);
+       a->k.p = iter->pos;
+       bch2_alloc_pack(a, u);
+
+       ret = bch2_btree_insert_at(c, NULL,
+                       invalidating_cached_data ? journal_seq : NULL,
+                       BTREE_INSERT_ATOMIC|
+                       BTREE_INSERT_NOCHECK_RW|
+                       BTREE_INSERT_NOFAIL|
+                       BTREE_INSERT_USE_RESERVE|
+                       BTREE_INSERT_USE_ALLOC_RESERVE|
+                       flags,
+                       BTREE_INSERT_ENTRY(iter, &a->k_i));
+       if (ret == -EINTR)
+               goto retry;
+
+       if (!ret) {
+               /* remove from alloc_heap: */
+               struct alloc_heap_entry e, *top = ca->alloc_heap.data;
+
+               top->bucket++;
+               top->nr--;
+
+               if (!top->nr)
+                       heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+
+               /*
+                * Make sure we flush the last journal entry that updated this
+                * bucket (i.e. deleting the last reference) before writing to
+                * this bucket again:
+                */
+               *journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
+       } else {
+               size_t b2;
+
+               /* remove from free_inc: */
+               percpu_down_read(&c->mark_lock);
+               spin_lock(&c->freelist_lock);
+
+               bch2_mark_alloc_bucket(c, ca, b, false,
+                                      gc_pos_alloc(c, NULL), 0);
+
+               BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
+               BUG_ON(b != b2);
+
+               spin_unlock(&c->freelist_lock);
+               percpu_up_read(&c->mark_lock);
+       }
+
+       return ret;
+}
+
 static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
                                       size_t bucket, u64 *flush_seq)
 {
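
bucket_journal_seq() reconstructs a full 64-bit sequence number from the 16
low bits stored in the bucket mark, which works because a bucket's last
update can be at most 2^16 journal entries behind the current sequence. A
standalone sketch of that arithmetic with concrete values — reconstruct_seq
is a hypothetical stand-in, and the journal_seq_valid guard is omitted:

#include <assert.h>
#include <stdint.h>

static uint64_t reconstruct_seq(uint64_t journal_seq, uint16_t bucket_lo16)
{
	uint64_t bucket_seq = journal_seq;

	bucket_seq &= ~(uint64_t) UINT16_MAX;	/* high bits of current seq */
	bucket_seq |= bucket_lo16;		/* splice in bucket's low 16 bits */

	/* overshot the current seq: bucket was written one 16-bit wrap earlier */
	if (bucket_seq > journal_seq)
		bucket_seq -= 1 << 16;

	return bucket_seq;
}

int main(void)
{
	/* current seq 0x10005; bucket updated at ...FFFE, i.e. before the wrap: */
	assert(reconstruct_seq(0x10005, 0xFFFE) == 0xFFFE);
	/* no wrap in between: */
	assert(reconstruct_seq(0x10005, 0x0003) == 0x10003);
	return 0;
}
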
@@ -847,18 +1021,7 @@ static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
 
        percpu_up_read(&c->mark_lock);
 
-       if (m.journal_seq_valid) {
-               u64 journal_seq = atomic64_read(&c->journal.seq);
-               u64 bucket_seq  = journal_seq;
-
-               bucket_seq &= ~((u64) U16_MAX);
-               bucket_seq |= m.journal_seq;
-
-               if (bucket_seq > journal_seq)
-                       bucket_seq -= 1 << 16;
-
-               *flush_seq = max(*flush_seq, bucket_seq);
-       }
+       *flush_seq = max(*flush_seq, bucket_journal_seq(c, m));
 
        return m.cached_sectors != 0;
 }
@@ -871,7 +1034,6 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
        struct btree_iter iter;
        u64 journal_seq = 0;
        int ret = 0;
-       long b;
 
        bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
                             BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
@@ -879,16 +1041,11 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
        /* Only use nowait if we've already invalidated at least one bucket: */
        while (!ret &&
               !fifo_full(&ca->free_inc) &&
-              (b = next_alloc_bucket(ca)) >= 0) {
-               bool must_flush =
-                       bch2_invalidate_one_bucket(c, ca, b, &journal_seq);
-
-               ret = __bch2_alloc_write_key(c, ca, b, &iter,
-                               must_flush ? &journal_seq : NULL,
+              ca->alloc_heap.used)
+               ret = bch2_invalidate_one_bucket2(c, ca, &iter, &journal_seq,
                                BTREE_INSERT_GC_LOCK_HELD|
                                (!fifo_empty(&ca->free_inc)
                                 ? BTREE_INSERT_NOWAIT : 0));
-       }
 
        bch2_btree_iter_unlock(&iter);
 
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 04f1e9152494a0f19ee2a301938a61ad8f0cd914..ff6eccf904af349eef7edefed6ee6dfd1d632aae 100644
@@ -6,6 +6,15 @@
 #include "alloc_types.h"
 #include "debug.h"
 
+struct bkey_alloc_unpacked {
+       u8              gen;
+#define x(_name, _bits)        u##_bits _name;
+       BCH_ALLOC_FIELDS()
+#undef  x
+};
+
+struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *);
+
 #define ALLOC_SCAN_BATCH(ca)           max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
 
 const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
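
For reference, with the bit widths given by BCH_ALLOC_FIELDS() (see the
bcachefs_format.h hunk below), the x-macro above expands the struct to the
equivalent of the following, u8/u16 being the kernel's fixed-width types:

struct bkey_alloc_unpacked {
	u8	gen;
	u16	read_time;
	u16	write_time;
	u8	data_type;
	u16	dirty_sectors;
	u16	cached_sectors;
	u8	oldest_gen;
};
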
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index c7971e5c7c3639f2d591497f08d019e0fce2def6..9a3ca6fa30b74821e706b1042f14e7f360ce8644 100644
@@ -826,12 +826,12 @@ struct bch_alloc {
 } __attribute__((packed, aligned(8)));
 
 #define BCH_ALLOC_FIELDS()                     \
-       x(read_time, 2)                         \
-       x(write_time, 2)                        \
-       x(data_type, 1)                         \
-       x(dirty_sectors, 2)                     \
-       x(cached_sectors, 2)                    \
-       x(oldest_gen, 1)
+       x(read_time,            16)             \
+       x(write_time,           16)             \
+       x(data_type,            8)              \
+       x(dirty_sectors,        16)             \
+       x(cached_sectors,       16)             \
+       x(oldest_gen,           8)
 
 enum {
 #define x(name, bytes) BCH_ALLOC_FIELD_##name,
@@ -841,12 +841,12 @@ enum {
 };
 
 static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
-#define x(name, bytes) [BCH_ALLOC_FIELD_##name] = bytes,
+#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
        BCH_ALLOC_FIELDS()
 #undef x
 };
 
-#define x(name, bytes) + bytes
+#define x(name, bits) + (bits / 8)
 static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
        DIV_ROUND_UP(offsetof(struct bch_alloc, data)
                     BCH_ALLOC_FIELDS(), sizeof(u64));
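
The "x(name, bits) + (bits / 8)" trick above turns the field list into a
compile-time sum of byte counts — 2 + 2 + 1 + 2 + 2 + 1 = 10 here — which
BKEY_ALLOC_VAL_U64s_MAX then adds to the value header offset and rounds up
to whole u64s. A minimal self-contained sketch of the same x-macro summing
pattern, with hypothetical names:

#include <assert.h>

#define FIELDS()                        \
	x(read_time,            16)     \
	x(write_time,           16)     \
	x(data_type,            8)      \
	x(dirty_sectors,        16)     \
	x(cached_sectors,       16)     \
	x(oldest_gen,           8)

/* expands to: 0 + (16/8) + (16/8) + (8/8) + (16/8) + (16/8) + (8/8) */
#define x(name, bits) + (bits / 8)
enum { FIELD_BYTES_TOTAL = 0 FIELDS() };
#undef x

static_assert(FIELD_BYTES_TOTAL == 10, "six fields, ten bytes");

int main(void) { return 0; }
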
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 5f0e0009ec5dd3ed762175bc85f4724774be19dc..7e58e82daec1c8a0654dfaf45636972d79cd1cad 100644
@@ -456,6 +456,7 @@ static inline bool btree_node_is_extents(struct btree *b)
 static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 {
        switch (type) {
+       case BKEY_TYPE_ALLOC:
        case BKEY_TYPE_BTREE:
        case BKEY_TYPE_EXTENTS:
        case BKEY_TYPE_INODES:
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 9bcab29bd03340ef4d88493d1ea04bdddb9325fa..1fd01fb404821d5c93d2840a623ec98e85b62174 100644
@@ -82,6 +82,7 @@ enum {
        __BTREE_INSERT_USE_RESERVE,
        __BTREE_INSERT_USE_ALLOC_RESERVE,
        __BTREE_INSERT_JOURNAL_REPLAY,
+       __BTREE_INSERT_NOMARK,
        __BTREE_INSERT_NOWAIT,
        __BTREE_INSERT_GC_LOCK_HELD,
        __BCH_HASH_SET_MUST_CREATE,
@@ -108,12 +109,12 @@ enum {
 #define BTREE_INSERT_USE_RESERVE       (1 << __BTREE_INSERT_USE_RESERVE)
 #define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
 
-/*
- * Insert is for journal replay: don't get journal reservations, or mark extents
- * (bch_mark_key)
- */
+/* Insert is for journal replay - don't get journal reservations: */
 #define BTREE_INSERT_JOURNAL_REPLAY    (1 << __BTREE_INSERT_JOURNAL_REPLAY)
 
+/* Don't call bch2_mark_key: */
+#define BTREE_INSERT_NOMARK            (1 << __BTREE_INSERT_NOMARK)
+
 /* Don't block on allocation failure (for new btree nodes): */
 #define BTREE_INSERT_NOWAIT            (1 << __BTREE_INSERT_NOWAIT)
 #define BTREE_INSERT_GC_LOCK_HELD      (1 << __BTREE_INSERT_GC_LOCK_HELD)
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 949541f15e7d5d17bb9aea801bb28a0b956d76f4..3286ee26f7e23bf5f0c614b3175a4bfa2c9ab6a4 100644
@@ -476,6 +476,60 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
                   ca, b, owned_by_allocator);
 }
 
+static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
+                          bool inserting,
+                          struct bch_fs_usage *fs_usage,
+                          unsigned journal_seq, unsigned flags,
+                          bool gc)
+{
+       struct bkey_alloc_unpacked u;
+       struct bch_dev *ca;
+       struct bucket *g;
+       struct bucket_mark old, m;
+
+       if (!inserting)
+               return 0;
+
+       /*
+        * alloc btree is read in by bch2_alloc_read, not gc:
+        */
+       if (flags & BCH_BUCKET_MARK_GC)
+               return 0;
+
+       u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
+       ca = bch_dev_bkey_exists(c, k.k->p.inode);
+       g = __bucket(ca, k.k->p.offset, gc);
+
+       /*
+        * this should currently only be getting called from the bucket
+        * invalidate path:
+        */
+       BUG_ON(u.dirty_sectors);
+       BUG_ON(u.cached_sectors);
+       BUG_ON(!g->mark.owned_by_allocator);
+
+       old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({
+               m.gen                   = u.gen;
+               m.data_type             = u.data_type;
+               m.dirty_sectors         = u.dirty_sectors;
+               m.cached_sectors        = u.cached_sectors;
+       }));
+
+       g->io_time[READ]        = u.read_time;
+       g->io_time[WRITE]       = u.write_time;
+       g->oldest_gen           = u.oldest_gen;
+       g->gen_valid            = 1;
+
+       if (old.cached_sectors) {
+               update_cached_sectors(c, fs_usage, ca->dev_idx,
+                                     -old.cached_sectors);
+               trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset),
+                                old.cached_sectors);
+       }
+
+       return 0;
+}
+
 #define checked_add(a, b)                                      \
 do {                                                           \
        unsigned _res = (unsigned) (a) + (b);                   \
@@ -840,18 +894,21 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
                fs_usage = this_cpu_ptr(c->usage[gc]);
 
        switch (k.k->type) {
+       case KEY_TYPE_alloc:
+               return bch2_mark_alloc(c, k, inserting,
+                               fs_usage, journal_seq, flags, gc);
        case KEY_TYPE_btree_ptr:
                return bch2_mark_extent(c, k, inserting
-                                       ?  c->opts.btree_node_size
-                                       : -c->opts.btree_node_size,
-                                       BCH_DATA_BTREE,
-                                       fs_usage, journal_seq, flags, gc);
+                               ?  c->opts.btree_node_size
+                               : -c->opts.btree_node_size,
+                               BCH_DATA_BTREE,
+                               fs_usage, journal_seq, flags, gc);
        case KEY_TYPE_extent:
                return bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
-                                       fs_usage, journal_seq, flags, gc);
+                               fs_usage, journal_seq, flags, gc);
        case KEY_TYPE_stripe:
                return bch2_mark_stripe(c, k, inserting,
-                                       fs_usage, journal_seq, flags, gc);
+                               fs_usage, journal_seq, flags, gc);
        case KEY_TYPE_inode:
                if (inserting)
                        fs_usage->s.nr_inodes++;
@@ -922,7 +979,7 @@ void bch2_mark_update(struct btree_insert *trans,
        preempt_disable();
        fs_usage = bch2_fs_usage_get_scratch(c);
 
-       if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+       if (!(trans->flags & BTREE_INSERT_NOMARK))
                bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
                        bpos_min(insert->k->k.p, b->key.k.p).offset -
                        bkey_start_offset(&insert->k->k),
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index ffdf176d7ed267d28f40af8fdbbd5701f2066925..973bf605cbd93849c69398b083044a26b26ee075 100644
 
 #define bucket_cmpxchg(g, new, expr)                           \
 ({                                                             \
+       struct bucket *_g = g;                                  \
        u64 _v = atomic64_read(&(g)->_mark.v);                  \
        struct bucket_mark _old;                                \
                                                                \
        do {                                                    \
                (new).v.counter = _old.v.counter = _v;          \
                expr;                                           \
-       } while ((_v = atomic64_cmpxchg(&(g)->_mark.v,          \
+       } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v,         \
                               _old.v.counter,                  \
                               (new).v.counter)) != _old.v.counter);\
        _old;                                                   \
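
The new _g local pins down the macro's g argument so the cmpxchg retry loop
no longer re-evaluates the argument expression on every iteration. A reduced
model of the same idiom, using C11 atomics in place of atomic64_cmpxchg —
mark_cmpxchg and get_mark are hypothetical:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long mark;
static int evaluations;

/* stands in for an argument expression with side effects or lookup cost */
static _Atomic unsigned long *get_mark(void)
{
	evaluations++;
	return &mark;
}

/*
 * Same shape as bucket_cmpxchg(): capture the pointer argument in _g once,
 * so CAS retries don't re-evaluate it. (GNU C statement expression, as in
 * the kernel macro.)
 */
#define mark_cmpxchg(g, new, expr)                                      \
({                                                                      \
	_Atomic unsigned long *_g = (g);                                \
	unsigned long _old = atomic_load(_g);                           \
	do {                                                            \
		(new) = _old;                                           \
		expr;                                                   \
	} while (!atomic_compare_exchange_weak(_g, &_old, (new)));      \
	_old;                                                           \
})

int main(void)
{
	unsigned long n;

	mark_cmpxchg(get_mark(), n, n += 2);
	printf("mark=%lu after %d evaluation(s) of get_mark()\n",
	       atomic_load(&mark), evaluations);	/* mark=2, 1 evaluation */
	return 0;
}
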
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
index 0cd5f1931aac2a1d193074eb7d81b7e322e27122..cdb272708a4bdacf94093a7c0351570189abf973 100644
@@ -101,7 +101,7 @@ do {                                                                        \
 ({                                                                     \
        bool _r = !fifo_empty((fifo));                                  \
        if (_r)                                                         \
-               (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]       \
+               (i) = (fifo)->data[--(fifo)->back & (fifo)->mask];      \
        _r;                                                             \
 })
 
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index bfa1045b0eb58b73536957081113498e03b2fb3e..17eba4269719b2554f14bab419b0e724c06b5383 100644
@@ -854,7 +854,8 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
                                ret = bch2_btree_insert(c, entry->btree_id, k,
                                                &disk_res, NULL,
                                                BTREE_INSERT_NOFAIL|
-                                               BTREE_INSERT_JOURNAL_REPLAY);
+                                               BTREE_INSERT_JOURNAL_REPLAY|
+                                               BTREE_INSERT_NOMARK);
                        }
 
                        if (ret) {