bcachefs: Log finsert/fcollapse operations
author		Kent Overstreet <kent.overstreet@linux.dev>
Sun, 10 Sep 2023 23:11:47 +0000 (19:11 -0400)
committer	Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:10:12 +0000 (17:10 -0400)
Now that we have the logged operations btree, we can make
finsert/fcollapse atomic w.r.t. unclean shutdown as well.

This adds bch_logged_op_finsert to represent the state of an finsert or
fcollapse, which is a bit more complicated than truncate since we need
to track our position in the "shift extents" operation.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
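
For readers unfamiliar with the logged-ops scheme, here is a minimal user-space sketch (toy names, not the actual bcachefs API; host-endian integers in place of the on-disk __le32/__le64 fields) of how the new key's fields encode an finsert/fcollapse and how the resume path recovers what it needs after an unclean shutdown: the shift is implied by dst_offset - src_offset, its sign distinguishes insert from collapse, and pos records how far the "shift extents" loop has gotten.

/*
 * Toy sketch only -- not bcachefs code.  It mirrors how the fields of the
 * new logged-op key are filled in before the operation starts and decoded
 * again on resume.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

struct toy_op {				/* stand-in for bch_logged_op_finsert */
	uint32_t subvol;
	uint64_t inum;
	uint64_t dst_offset;
	uint64_t src_offset;
	uint64_t pos;			/* how far the shift has progressed */
};

/* Encode an finsert/fcollapse request, as done before logging the op: */
static struct toy_op encode(uint32_t subvol, uint64_t inum,
			    uint64_t offset, uint64_t len, bool insert)
{
	int64_t shift = insert ? (int64_t) len : -(int64_t) len;

	return (struct toy_op) {
		.subvol		= subvol,
		.inum		= inum,
		.dst_offset	= offset + shift,
		.src_offset	= offset,
		/* finsert walks extents backwards from the end of the file,
		 * fcollapse walks forwards from offset: */
		.pos		= insert ? UINT64_MAX : offset,
	};
}

int main(void)
{
	/* An fcollapse of 8 sectors at offset 128 in a hypothetical inode: */
	struct toy_op op = encode(1, 4096, 128, 8, false);

	/* Decode, as the resume path would after an unclean shutdown: */
	int64_t  shift  = (int64_t) (op.dst_offset - op.src_offset);
	bool     insert = shift > 0;
	uint64_t len    = shift > 0 ? (uint64_t) shift : (uint64_t) -shift;

	printf("%s: len=%" PRIu64 ", resume from pos=%" PRIu64 "\n",
	       insert ? "finsert" : "fcollapse", len, op.pos);
	return 0;
}

In the patch itself the same decoding happens at the top of __bch2_resume_logged_op_finsert(), and pos is re-read after every committed transaction, so replaying the op during recovery continues from the last extent shift that actually reached disk.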
fs/bcachefs/bcachefs_format.h
fs/bcachefs/io_misc.c
fs/bcachefs/io_misc.h
fs/bcachefs/logged_ops.h

diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 3c9e788f1c9ddfd9c9fdc4f402ae00bb4b18eb95..c434202f351ae50e3ada8041895ab09bf23f9263 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -371,7 +371,8 @@ static inline void bkey_init(struct bkey *k)
        x(inode_v3,             29)                     \
        x(bucket_gens,          30)                     \
        x(snapshot_tree,        31)                     \
-       x(logged_op_truncate,   32)
+       x(logged_op_truncate,   32)                     \
+       x(logged_op_finsert,    33)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -1194,6 +1195,23 @@ struct bch_logged_op_truncate {
        __le64                  new_i_size;
 };
 
+enum logged_op_finsert_state {
+       LOGGED_OP_FINSERT_start,
+       LOGGED_OP_FINSERT_shift_extents,
+       LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+       struct bch_val          v;
+       __u8                    state;
+       __u8                    pad[3];
+       __le32                  subvol;
+       __le64                  inum;
+       __le64                  dst_offset;
+       __le64                  src_offset;
+       __le64                  pos;
+};
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -2262,7 +2280,8 @@ enum btree_id_flags {
        x(deleted_inodes,       16,     BTREE_ID_SNAPSHOTS,                     \
          BIT_ULL(KEY_TYPE_set))                                                \
        x(logged_ops,           17,     0,                                      \
-         BIT_ULL(KEY_TYPE_logged_op_truncate))
+         BIT_ULL(KEY_TYPE_logged_op_truncate)|                                 \
+         BIT_ULL(KEY_TYPE_logged_op_finsert))
 
 enum btree_id {
 #define x(name, nr, ...) BTREE_ID_##name = nr,
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 327b3dd642de2bd23726e0da42515549e194d259..b1be70e15c6007f8b79c000399240691dcef01c8 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -293,6 +293,18 @@ int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sec
                __bch2_resume_logged_op_truncate(&trans, &op.k_i, i_sectors_delta));
 }
 
+/* finsert/fcollapse: */
+
+void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
+
+       prt_printf(out, "subvol=%u",            le32_to_cpu(op.v->subvol));
+       prt_printf(out, " inum=%llu",           le64_to_cpu(op.v->inum));
+       prt_printf(out, " dst_offset=%lli",     le64_to_cpu(op.v->dst_offset));
+       prt_printf(out, " src_offset=%llu",     le64_to_cpu(op.v->src_offset));
+}
+
 static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len)
 {
        struct btree_iter iter;
@@ -327,145 +339,160 @@ err:
        return ret;
 }
 
-int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
-                          u64 offset, u64 len, bool insert,
-                          s64 *i_sectors_delta)
+static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
+                                          struct bkey_i *op_k,
+                                          u64 *i_sectors_delta)
 {
-       struct bkey_buf copy;
-       struct btree_trans trans;
-       struct btree_iter src = { NULL }, dst = { NULL }, del = { NULL };
-       s64 shift = insert ? len : -len;
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
+       subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+       u64 dst_offset = le64_to_cpu(op->v.dst_offset);
+       u64 src_offset = le64_to_cpu(op->v.src_offset);
+       s64 shift = dst_offset - src_offset;
+       u64 len = abs(shift);
+       u64 pos = le64_to_cpu(op->v.pos);
+       bool insert = shift > 0;
        int ret = 0;
 
-       bch2_bkey_buf_init(&copy);
-       bch2_trans_init(&trans, c, 0, 1024);
-
-       bch2_trans_iter_init(&trans, &src, BTREE_ID_extents,
-                            POS(inum.inum, U64_MAX),
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                            POS(inum.inum, 0),
                             BTREE_ITER_INTENT);
-       bch2_trans_copy_iter(&dst, &src);
-       bch2_trans_copy_iter(&del, &src);
+
+       switch (op->v.state) {
+case LOGGED_OP_FINSERT_start:
+       op->v.state = LOGGED_OP_FINSERT_shift_extents;
 
        if (insert) {
-               ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL,
-                               adjust_i_size(&trans, inum, offset, len));
+               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                               adjust_i_size(trans, inum, src_offset, len) ?:
+                               bch2_logged_op_update(trans, &op->k_i));
                if (ret)
                        goto err;
        } else {
-               bch2_btree_iter_set_pos(&src, POS(inum.inum, offset));
+               bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
 
-               ret = bch2_fpunch_at(&trans, &src, inum, offset + len, i_sectors_delta);
+               ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
                if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto err;
 
-               bch2_btree_iter_set_pos(&src, POS(inum.inum, offset + len));
+               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                               bch2_logged_op_update(trans, &op->k_i));
        }
 
-       while (ret == 0 || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+       fallthrough;
+case LOGGED_OP_FINSERT_shift_extents:
+       while (1) {
                struct disk_reservation disk_res =
                        bch2_disk_reservation_init(c, 0);
-               struct bkey_i delete;
+               struct bkey_i delete, *copy;
                struct bkey_s_c k;
-               struct bpos next_pos;
-               struct bpos move_pos = POS(inum.inum, offset);
-               struct bpos atomic_end;
-               unsigned trigger_flags = 0;
+               struct bpos src_pos = POS(inum.inum, src_offset);
                u32 snapshot;
 
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
-               ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
                if (ret)
-                       continue;
+                       goto btree_err;
 
-               bch2_btree_iter_set_snapshot(&src, snapshot);
-               bch2_btree_iter_set_snapshot(&dst, snapshot);
-               bch2_btree_iter_set_snapshot(&del, snapshot);
-
-               bch2_trans_begin(&trans);
+               bch2_btree_iter_set_snapshot(&iter, snapshot);
+               bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
 
                k = insert
-                       ? bch2_btree_iter_peek_prev(&src)
-                       : bch2_btree_iter_peek_upto(&src, POS(inum.inum, U64_MAX));
+                       ? bch2_btree_iter_peek_prev(&iter)
+                       : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
                if ((ret = bkey_err(k)))
-                       continue;
+                       goto btree_err;
 
-               if (!k.k || k.k->p.inode != inum.inum)
+               if (!k.k ||
+                   k.k->p.inode != inum.inum ||
+                   bkey_le(k.k->p, POS(inum.inum, src_offset)))
                        break;
 
-               if (insert &&
-                   bkey_le(k.k->p, POS(inum.inum, offset)))
-                       break;
-reassemble:
-               bch2_bkey_buf_reassemble(&copy, c, k);
+               copy = bch2_bkey_make_mut_noupdate(trans, k);
+               if ((ret = PTR_ERR_OR_ZERO(copy)))
+                       goto btree_err;
 
                if (insert &&
-                   bkey_lt(bkey_start_pos(k.k), move_pos))
-                       bch2_cut_front(move_pos, copy.k);
-
-               copy.k->k.p.offset += shift;
-               bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k));
+                   bkey_lt(bkey_start_pos(k.k), src_pos)) {
+                       bch2_cut_front(src_pos, copy);
 
-               ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end);
-               if (ret)
-                       continue;
-
-               if (!bkey_eq(atomic_end, copy.k->k.p)) {
-                       if (insert) {
-                               move_pos = atomic_end;
-                               move_pos.offset -= shift;
-                               goto reassemble;
-                       } else {
-                               bch2_cut_back(atomic_end, copy.k);
-                       }
+                       /* Splitting compressed extent? */
+                       bch2_disk_reservation_add(c, &disk_res,
+                                       copy->k.size *
+                                       bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
+                                       BCH_DISK_RESERVATION_NOFAIL);
                }
 
                bkey_init(&delete.k);
-               delete.k.p = copy.k->k.p;
-               delete.k.size = copy.k->k.size;
-               delete.k.p.offset -= shift;
-               bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k));
+               delete.k.p = copy->k.p;
+               delete.k.p.snapshot = snapshot;
+               delete.k.size = copy->k.size;
 
-               next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
+               copy->k.p.offset += shift;
+               copy->k.p.snapshot = snapshot;
 
-               if (copy.k->k.size != k.k->size) {
-                       /* We might end up splitting compressed extents: */
-                       unsigned nr_ptrs =
-                               bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
+               op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
 
-                       ret = bch2_disk_reservation_get(c, &disk_res,
-                                       copy.k->k.size, nr_ptrs,
-                                       BCH_DISK_RESERVATION_NOFAIL);
-                       BUG_ON(ret);
-               }
-
-               ret =   bch2_btree_iter_traverse(&del) ?:
-                       bch2_trans_update(&trans, &del, &delete, trigger_flags) ?:
-                       bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?:
-                       bch2_trans_commit(&trans, &disk_res, NULL,
-                                         BTREE_INSERT_NOFAIL);
+               ret =   bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
+                       bch2_logged_op_update(trans, &op->k_i) ?:
+                       bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
+btree_err:
                bch2_disk_reservation_put(c, &disk_res);
 
-               if (!ret)
-                       bch2_btree_iter_set_pos(&src, next_pos);
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       continue;
+               if (ret)
+                       goto err;
+
+               pos = le64_to_cpu(op->v.pos);
        }
 
-       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto err;
+       op->v.state = LOGGED_OP_FINSERT_finish;
 
        if (!insert) {
-               ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL,
-                               adjust_i_size(&trans, inum, offset, -len));
+               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                               adjust_i_size(trans, inum, src_offset, shift) ?:
+                               bch2_logged_op_update(trans, &op->k_i));
        } else {
                /* We need an inode update to update bi_journal_seq for fsync: */
-               ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL,
-                               adjust_i_size(&trans, inum, 0, 0));
+               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                               adjust_i_size(trans, inum, 0, 0) ?:
+                               bch2_logged_op_update(trans, &op->k_i));
+       }
+
+       fallthrough;
+case LOGGED_OP_FINSERT_finish:
+       break;
        }
 err:
-       bch2_trans_iter_exit(&trans, &del);
-       bch2_trans_iter_exit(&trans, &dst);
-       bch2_trans_iter_exit(&trans, &src);
-       bch2_trans_exit(&trans);
-       bch2_bkey_buf_exit(&copy, c);
+       bch2_logged_op_finish(trans, op_k);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
+
+int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
+{
+       return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
+}
+
+int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
+                          u64 offset, u64 len, bool insert,
+                          s64 *i_sectors_delta)
+{
+       struct bkey_i_logged_op_finsert op;
+       s64 shift = insert ? len : -len;
+
+       bkey_logged_op_finsert_init(&op.k_i);
+       op.v.subvol     = cpu_to_le32(inum.subvol);
+       op.v.inum       = cpu_to_le64(inum.inum);
+       op.v.dst_offset = cpu_to_le64(offset + shift);
+       op.v.src_offset = cpu_to_le64(offset);
+       op.v.pos        = cpu_to_le64(insert ? U64_MAX : offset);
+
+       return bch2_trans_run(c,
+               bch2_logged_op_start(&trans, &op.k_i) ?:
+               __bch2_resume_logged_op_finsert(&trans, &op.k_i, i_sectors_delta));
+}
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
index 1b792451fff24cc1d11e1f754ef0e470511c27b3..c9e6ed40e1b80c6582d40d0db40b9c1e3eef807d 100644
--- a/fs/bcachefs/io_misc.h
+++ b/fs/bcachefs/io_misc.h
@@ -19,6 +19,16 @@ void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct
 int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
 
 int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
+
+void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) {   \
+       .val_to_text    = bch2_logged_op_finsert_to_text,       \
+       .min_val_size   = 24,                                   \
+})
+
+int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
+
 int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
 
 #endif /* _BCACHEFS_IO_MISC_H */
diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h
index b2f2ebea54b6fc3604687623e68f231f76fc8673..4d1e786a27a89fcf733d6cd937662274c885fdcc 100644
--- a/fs/bcachefs/logged_ops.h
+++ b/fs/bcachefs/logged_ops.h
@@ -5,7 +5,8 @@
 #include "bkey.h"
 
 #define BCH_LOGGED_OPS()                       \
-       x(truncate)
+       x(truncate)                             \
+       x(finsert)
 
 static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op)
 {