bcachefs: bcachefs_metadata_version_persistent_inode_cursors
author Kent Overstreet <kent.overstreet@linux.dev>
Mon, 2 Dec 2024 02:44:38 +0000 (21:44 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Fri, 10 Jan 2025 04:38:41 +0000 (23:38 -0500)
Persistent cursors for inode allocation.

A free inodes btree would add substantial overhead to inode allocation
and freeing - a "next num to allocate" cursor is always going to be
faster.

We just need it to be persistent, to avoid scanning the inodes btree
from the start on startup.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
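
To make the mechanism concrete, below is a minimal userspace sketch of such an allocation cursor (illustrative only; struct inode_cursor, inum_in_use() and the number ranges are invented for the example and are not the bcachefs API). In the patch itself the cursor is stored as a KEY_TYPE_inode_alloc_cursor key in the logged_ops btree, one per inode-number shard, so it is persisted and journalled by the ordinary btree machinery.

/*
 * Minimal userspace sketch of a persistent allocation cursor.  Illustrative
 * only: struct inode_cursor, inum_in_use() and the ranges below are invented
 * for the example and are not the bcachefs API.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct inode_cursor {
	uint64_t idx;	/* next inode number to try */
	uint32_t gen;	/* bumped whenever the cursor wraps back to min */
};

/* Stand-in for "is this inode number already allocated?" (a btree lookup in bcachefs). */
static bool inum_in_use(uint64_t inum)
{
	return false;	/* pretend the keyspace is empty */
}

/* Find a free inode number in [min, max), resuming from the cursor position. */
static int cursor_alloc(struct inode_cursor *c, uint64_t min, uint64_t max,
			uint64_t *inum, uint32_t *gen)
{
	uint64_t start = (c->idx < min || c->idx >= max) ? min : c->idx;
	uint64_t pos = start;
	bool wrapped = false;

	while (inum_in_use(pos)) {
		if (++pos >= max) {
			if (wrapped)
				return -1;	/* every number in the range is taken */
			wrapped = true;
			pos = min;
			c->gen++;	/* reused numbers get a new generation */
		}
	}

	*inum = pos;
	*gen  = c->gen;
	c->idx = pos + 1;	/* persisting this is what avoids the startup scan */
	return 0;
}

int main(void)
{
	struct inode_cursor c = { .idx = 0, .gen = 0 };
	uint64_t inum;
	uint32_t gen;

	if (!cursor_alloc(&c, 4096, 1ULL << 31, &inum, &gen))
		printf("allocated inode %llu, generation %u\n",
		       (unsigned long long) inum, gen);
	return 0;
}

The generation counter is bumped whenever the cursor wraps back to the start of its range, so a reused inode number can be distinguished from its previous incarnation; that is also why bch2_inode_rm() in the diff below no longer needs to leave a bch_inode_generation key behind when an inode is deleted.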
12 files changed:
fs/bcachefs/bcachefs.h
fs/bcachefs/bcachefs_format.h
fs/bcachefs/btree_update.c
fs/bcachefs/inode.c
fs/bcachefs/inode.h
fs/bcachefs/inode_format.h
fs/bcachefs/logged_ops.c
fs/bcachefs/logged_ops_format.h
fs/bcachefs/opts.h
fs/bcachefs/sb-errors_format.h
fs/bcachefs/super-io.c
fs/bcachefs/super.c

fs/bcachefs/bcachefs.h
index b749c4ecad1b4b63ef854ee07877e1e55e1fca50..161cf2f05d2ad97444b0f297cd4702a4e0495216 100644 (file)
@@ -1063,9 +1063,6 @@ struct bch_fs {
        struct btree_node       *verify_ondisk;
        struct mutex            verify_lock;
 
-       u64                     *unused_inode_hints;
-       unsigned                inode_shard_bits;
-
        /*
         * A btree node on disk could have too many bsets for an iterator to fit
         * on the stack - have to dynamically allocate them
fs/bcachefs/bcachefs_format.h
index f140c3366e65ef5dda7512078e178f2d97f482fa..09e53bef1a304427b74d63027251b556874c65cd 100644 (file)
@@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k)
        x(snapshot_tree,        31)                     \
        x(logged_op_truncate,   32)                     \
        x(logged_op_finsert,    33)                     \
-       x(accounting,           34)
+       x(accounting,           34)                     \
+       x(inode_alloc_cursor,   35)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -682,7 +683,8 @@ struct bch_sb_field_ext {
        x(backpointer_bucket_gen,       BCH_VERSION(1, 14))             \
        x(disk_accounting_big_endian,   BCH_VERSION(1, 15))             \
        x(reflink_p_may_update_opts,    BCH_VERSION(1, 16))             \
-       x(inode_depth,                  BCH_VERSION(1, 17))
+       x(inode_depth,                  BCH_VERSION(1, 17))             \
+       x(persistent_inode_cursors,     BCH_VERSION(1, 18))
 
 enum bcachefs_metadata_version {
        bcachefs_metadata_version_min = 9,
@@ -850,6 +852,7 @@ LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
 LE64_BITMASK(BCH_SB_VERSION_INCOMPAT,  struct bch_sb, flags[5], 32, 48);
 LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
                                        struct bch_sb, flags[5], 48, 64);
+LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6],  0,  4);
 
 static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
 {
@@ -1347,7 +1350,8 @@ enum btree_id_flags {
          BIT_ULL(KEY_TYPE_set))                                                \
        x(logged_ops,           17,     0,                                      \
          BIT_ULL(KEY_TYPE_logged_op_truncate)|                                 \
-         BIT_ULL(KEY_TYPE_logged_op_finsert))                                  \
+         BIT_ULL(KEY_TYPE_logged_op_finsert)|                                  \
+         BIT_ULL(KEY_TYPE_inode_alloc_cursor))                                 \
        x(rebalance_work,       18,     BTREE_ID_SNAPSHOT_FIELD,                \
          BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))                       \
        x(subvolume_children,   19,     0,                                      \
fs/bcachefs/btree_update.c
index a4b70e3fe4c33d629290598ffd2a60d137673606..13d794f201a5638e9bc79c1b03e172d06fb4bc33 100644 (file)
@@ -588,7 +588,7 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi
 int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
                             enum btree_id btree, struct bpos end)
 {
-       bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent);
+       bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
        struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
        int ret = bkey_err(k);
        if (ret)
fs/bcachefs/inode.c
index f6245b78eb7840e7744d18cc082ed375eea09c50..04ec05206f8cfd5838c7e299cf9e05694edf7d06 100644 (file)
@@ -799,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
        prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
 }
 
+int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k,
+                                  struct bkey_validate_context from)
+{
+       int ret = 0;
+
+       bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors,
+                        c, inode_alloc_cursor_inode_bad,
+                        "k.p.inode bad");
+fsck_err:
+       return ret;
+}
+
+void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c,
+                                    struct bkey_s_c k)
+{
+       struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k);
+
+       prt_printf(out, "idx %llu generation %llu",
+                  le64_to_cpu(i.v->idx),
+                  le64_to_cpu(i.v->gen));
+}
+
 void bch2_inode_init_early(struct bch_fs *c,
                           struct bch_inode_unpacked *inode_u)
 {
@@ -859,43 +881,78 @@ static inline u32 bkey_generation(struct bkey_s_c k)
        }
 }
 
-/*
- * This just finds an empty slot:
- */
-int bch2_inode_create(struct btree_trans *trans,
-                     struct btree_iter *iter,
-                     struct bch_inode_unpacked *inode_u,
-                     u32 snapshot, u64 cpu)
+static struct bkey_i_inode_alloc_cursor *
+bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k;
-       u64 min, max, start, pos, *hint;
-       int ret = 0;
-       unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
 
-       if (c->opts.shard_inode_numbers) {
-               bits -= c->inode_shard_bits;
+       u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1;
+
+       cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits);
 
-               min = (cpu << bits);
-               max = (cpu << bits) | ~(ULLONG_MAX << bits);
+       struct btree_iter iter;
+       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+                                       BTREE_ID_logged_ops,
+                                       POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx),
+                                       BTREE_ITER_cached);
+       int ret = bkey_err(k);
+       if (ret)
+               return ERR_PTR(ret);
+
+       struct bkey_i_inode_alloc_cursor *cursor =
+               k.k->type == KEY_TYPE_inode_alloc_cursor
+               ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor)
+               : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor);
+       ret = PTR_ERR_OR_ZERO(cursor);
+       if (ret)
+               goto err;
 
-               min = max_t(u64, min, BLOCKDEV_INODE_MAX);
-               hint = c->unused_inode_hints + cpu;
+       if (c->opts.inodes_32bit) {
+               *min = BLOCKDEV_INODE_MAX;
+               *max = INT_MAX;
        } else {
-               min = BLOCKDEV_INODE_MAX;
-               max = ~(ULLONG_MAX << bits);
-               hint = c->unused_inode_hints;
+               cursor->v.bits = c->opts.shard_inode_numbers_bits;
+
+               unsigned bits = 63 - c->opts.shard_inode_numbers_bits;
+
+               *min = max(cpu << bits, (u64) INT_MAX + 1);
+               *max = (cpu << bits) | ~(ULLONG_MAX << bits);
        }
 
-       start = READ_ONCE(*hint);
+       if (le64_to_cpu(cursor->v.idx)  < *min)
+               cursor->v.idx = cpu_to_le64(*min);
+
+       if (le64_to_cpu(cursor->v.idx) >= *max) {
+               cursor->v.idx = cpu_to_le64(*min);
+               le32_add_cpu(&cursor->v.gen, 1);
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret ? ERR_PTR(ret) : cursor;
+}
+
+/*
+ * This just finds an empty slot:
+ */
+int bch2_inode_create(struct btree_trans *trans,
+                     struct btree_iter *iter,
+                     struct bch_inode_unpacked *inode_u,
+                     u32 snapshot, u64 cpu)
+{
+       u64 min, max;
+       struct bkey_i_inode_alloc_cursor *cursor =
+               bch2_inode_alloc_cursor_get(trans, cpu, &min, &max);
+       int ret = PTR_ERR_OR_ZERO(cursor);
+       if (ret)
+               return ret;
 
-       if (start >= max || start < min)
-               start = min;
+       u64 start = le64_to_cpu(cursor->v.idx);
+       u64 pos = start;
 
-       pos = start;
        bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
                             BTREE_ITER_all_snapshots|
                             BTREE_ITER_intent);
+       struct bkey_s_c k;
 again:
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k)) &&
@@ -925,6 +982,7 @@ again:
        /* Retry from start */
        pos = start = min;
        bch2_btree_iter_set_pos(iter, POS(0, pos));
+       le32_add_cpu(&cursor->v.gen, 1);
        goto again;
 found_slot:
        bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
@@ -935,9 +993,9 @@ found_slot:
                return ret;
        }
 
-       *hint                   = k.k->p.offset;
        inode_u->bi_inum        = k.k->p.offset;
-       inode_u->bi_generation  = bkey_generation(k);
+       inode_u->bi_generation  = le64_to_cpu(cursor->v.gen);
+       cursor->v.idx           = cpu_to_le64(k.k->p.offset + 1);
        return 0;
 }
 
@@ -999,8 +1057,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
 {
        struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
-       struct bkey_i_inode_generation delete;
-       struct bch_inode_unpacked inode_u;
        struct bkey_s_c k;
        u32 snapshot;
        int ret;
@@ -1040,13 +1096,7 @@ retry:
                goto err;
        }
 
-       bch2_inode_unpack(k, &inode_u);
-
-       bkey_inode_generation_init(&delete.k_i);
-       delete.k.p = iter.pos;
-       delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
-
-       ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+       ret   = bch2_btree_delete_at(trans, &iter, 0) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                BCH_TRANS_COMMIT_no_enospc);
 err:
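
For reference, the per-shard inode number range computed by bch2_inode_alloc_cursor_get() above can be reproduced with a few lines of standalone arithmetic. A userspace sketch: the shard_bits value and the printing are illustrative, only the min/max expressions mirror the patch.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned shard_bits = 3;		/* e.g. BCH_SB_SHARD_INUMS_NBITS for 8 CPUs */
	unsigned bits = 63 - shard_bits;	/* low bits left for the per-shard counter */

	for (uint64_t cpu = 0; cpu < (1ULL << shard_bits); cpu++) {
		uint64_t min = cpu << bits;
		uint64_t max = (cpu << bits) | ~(~0ULL << bits);

		/* stay above the 32-bit range, which is reserved for inodes_32bit allocation */
		if (min < (uint64_t) INT32_MAX + 1)
			min = (uint64_t) INT32_MAX + 1;

		printf("shard %2llu: inums %#018llx..%#018llx\n",
		       (unsigned long long) cpu,
		       (unsigned long long) min,
		       (unsigned long long) max);
	}
	return 0;
}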
fs/bcachefs/inode.h
index 5bca6950f20e36ebcf5bf47d2e0609d920296a1f..d2e134528f0e6d8720221e6a1b0b72224eca1d29 100644 (file)
@@ -68,6 +68,16 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bk
        .min_val_size   = 8,                                    \
 })
 
+int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c,
+                                    struct bkey_validate_context);
+void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) {  \
+       .key_validate   = bch2_inode_alloc_cursor_validate,     \
+       .val_to_text    = bch2_inode_alloc_cursor_to_text,      \
+       .min_val_size   = 16,                                   \
+})
+
 #if 0
 typedef struct {
        u64                     lo;
fs/bcachefs/inode_format.h
index be1e747629d20e8786827bced3d83107373707a3..b99a5bf1a75edb57f9a72f7dd11cfe4b245ab6f2 100644 (file)
@@ -102,7 +102,8 @@ struct bch_inode_generation {
        x(bi_subvol,                    32)     \
        x(bi_parent_subvol,             32)     \
        x(bi_nocow,                     8)      \
-       x(bi_depth,                     32)
+       x(bi_depth,                     32)     \
+       x(bi_inodes_32bit,              8)
 
 /* subset of BCH_INODE_FIELDS */
 #define BCH_INODE_OPTS()                       \
@@ -115,7 +116,8 @@ struct bch_inode_generation {
        x(foreground_target,            16)     \
        x(background_target,            16)     \
        x(erasure_code,                 16)     \
-       x(nocow,                        8)
+       x(nocow,                        8)      \
+       x(inodes_32bit,                 8)
 
 enum inode_opt_id {
 #define x(name, ...)                           \
@@ -165,4 +167,12 @@ LE64_BITMASK(INODEv3_FIELDS_START,
                                struct bch_inode_v3, bi_flags, 31, 36);
 LE64_BITMASK(INODEv3_MODE,     struct bch_inode_v3, bi_flags, 36, 52);
 
+struct bch_inode_alloc_cursor {
+       struct bch_val          v;
+       __u8                    bits;
+       __u8                    pad;
+       __le32                  gen;
+       __le64                  idx;
+};
+
 #endif /* _BCACHEFS_INODE_FORMAT_H */
fs/bcachefs/logged_ops.c
index 1ac51af16299f3d4ac74de43495b8b01df83a50e..75f27ec26f85f8000ac1a090d83b94d92f7f9db6 100644 (file)
@@ -65,7 +65,8 @@ int bch2_resume_logged_ops(struct bch_fs *c)
        int ret = bch2_trans_run(c,
                for_each_btree_key_max(trans, iter,
                                   BTREE_ID_logged_ops,
-                                  POS(LOGGED_OPS_INUM, 0), POS(LOGGED_OPS_INUM, U64_MAX),
+                                  POS(LOGGED_OPS_INUM_logged_ops, 0),
+                                  POS(LOGGED_OPS_INUM_logged_ops, U64_MAX),
                                   BTREE_ITER_prefetch, k,
                        resume_logged_op(trans, &iter, k)));
        bch_err_fn(c, ret);
@@ -76,7 +77,7 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
 {
        struct btree_iter iter;
        int ret = bch2_bkey_get_empty_slot(trans, &iter,
-                                BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM, U64_MAX));
+                                BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX));
        if (ret)
                return ret;
 
fs/bcachefs/logged_ops_format.h
index 0b370a963ac609ce814fb030cd87b775d11d95b4..cfb67c95d4c8a0f9fbfd9325fc1928e6cc20de1b 100644 (file)
@@ -2,7 +2,10 @@
 #ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
 #define _BCACHEFS_LOGGED_OPS_FORMAT_H
 
-#define LOGGED_OPS_INUM                0
+enum logged_ops_inums {
+       LOGGED_OPS_INUM_logged_ops,
+       LOGGED_OPS_INUM_inode_cursors,
+};
 
 struct bch_logged_op_truncate {
        struct bch_val          v;
fs/bcachefs/opts.h
index ea69099e681de32a827d36b431690fdb471dbea6..e763d52e0f389d59f26430215594d1d11c02a36c 100644 (file)
@@ -222,14 +222,14 @@ enum fsck_err_opts {
          BCH_SB_ERASURE_CODE,          false,                          \
          NULL,         "Enable erasure coding (DO NOT USE YET)")       \
        x(inodes_32bit,                 u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
+         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_BOOL(),                                                   \
          BCH_SB_INODE_32BIT,           true,                           \
          NULL,         "Constrain inode numbers to 32 bits")           \
-       x(shard_inode_numbers,          u8,                             \
-         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
-         OPT_BOOL(),                                                   \
-         BCH_SB_SHARD_INUMS,           true,                           \
+       x(shard_inode_numbers_bits,     u8,                             \
+         OPT_FS|OPT_FORMAT,                                            \
+         OPT_UINT(0, 8),                                               \
+         BCH_SB_SHARD_INUMS_NBITS,     0,                              \
          NULL,         "Shard new inode numbers by CPU id")            \
        x(inodes_use_key_cache, u8,                                     \
          OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
fs/bcachefs/sb-errors_format.h
index 8064866350757093f5649633b96bf8c351bb5fd1..e26317c367f74575af88e222c9718a14274db5d9 100644 (file)
@@ -211,6 +211,7 @@ enum bch_fsck_flags {
        x(bkey_in_missing_snapshot,                             190,    0)              \
        x(inode_pos_inode_nonzero,                              191,    0)              \
        x(inode_pos_blockdev_range,                             192,    0)              \
+       x(inode_alloc_cursor_inode_bad,                         301,    0)              \
        x(inode_unpack_error,                                   193,    0)              \
        x(inode_str_hash_invalid,                               194,    0)              \
        x(inode_v3_fields_start_bad,                            195,    0)              \
@@ -311,7 +312,7 @@ enum bch_fsck_flags {
        x(logged_op_but_clean,                                  283,    FSCK_AUTOFIX)   \
        x(compression_opt_not_marked_in_sb,                     295,    FSCK_AUTOFIX)   \
        x(compression_type_not_marked_in_sb,                    296,    FSCK_AUTOFIX)   \
-       x(MAX,                                                  301,    0)
+       x(MAX,                                                  302,    0)
 
 enum bch_sb_error_id {
 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,
fs/bcachefs/super-io.c
index b0d52b6ccad4e74f4bc24fa55ee2c76a9bded0b3..dbc09e305c2759e175b578718804ff5f44c031ea 100644 (file)
@@ -460,6 +460,11 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
                        SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
        }
 
+#ifdef __KERNEL__
+       if (!BCH_SB_SHARD_INUMS_NBITS(sb))
+               SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus())));
+#endif
+
        for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
                const struct bch_option *opt = bch2_opt_table + opt_id;
 
fs/bcachefs/super.c
index 2b2e0835c8fea7adbd70255c07daad4ead0587ca..7e97c198efe2d629d047427cd3ac285013b665b8 100644 (file)
@@ -586,7 +586,6 @@ static void __bch2_fs_free(struct bch_fs *c)
 #endif
        kfree(rcu_dereference_protected(c->disk_groups, 1));
        kfree(c->journal_seq_blacklist_table);
-       kfree(c->unused_inode_hints);
 
        if (c->write_ref_wq)
                destroy_workqueue(c->write_ref_wq);
@@ -872,8 +871,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                (btree_blocks(c) + 1) * 2 *
                sizeof(struct sort_iter_set);
 
-       c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
-
        if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
                                WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
            !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
@@ -900,9 +897,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            !(c->online_reserved = alloc_percpu(u64)) ||
            mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
                                       c->opts.btree_node_size) ||
-           mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
-           !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
-                                             sizeof(u64), GFP_KERNEL))) {
+           mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
                ret = -BCH_ERR_ENOMEM_fs_other_alloc;
                goto err;
        }