bcachefs: Kick off btree node writes from write completions
author Kent Overstreet <kent.overstreet@gmail.com>
Sun, 11 Jul 2021 20:41:14 +0000 (16:41 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:08 +0000 (17:09 -0400)
This improves performance by removing the need to wait for the in-flight
btree write to complete before kicking off the next one. It is going to
be needed to avoid a performance regression with the upcoming patch to
update btree ptrs after every btree write.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_io.c
fs/bcachefs/btree_io.h

index 051d2867ad63761bef18124982cce1e9249fbf3b..430d5951263f551ef1caafc65e5c2c5bec89808c 100644 (file)
@@ -234,7 +234,7 @@ wait_on_io:
                if (bch2_verify_btree_ondisk)
                        bch2_btree_node_write(c, b, SIX_LOCK_intent);
                else
-                       __bch2_btree_node_write(c, b);
+                       __bch2_btree_node_write(c, b, false);
 
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
@@ -1009,7 +1009,7 @@ wait_on_io:
        six_lock_write(&b->c.lock, NULL, NULL);
 
        if (btree_node_dirty(b)) {
-               __bch2_btree_node_write(c, b);
+               __bch2_btree_node_write(c, b, false);
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
index 2974b2ad69660b89b28476b265bda2c5c57af435..1d4b5fcd1e3910570349356c1f4d353a455b6c6c 100644 (file)
@@ -1562,9 +1562,47 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
 {
        struct btree_write *w = btree_prev_write(b);
+       unsigned long old, new, v;
 
        bch2_btree_complete_write(c, b, w);
-       bch2_btree_node_io_unlock(b);
+
+       v = READ_ONCE(b->flags);
+       do {
+               old = new = v;
+
+               if (old & (1U << BTREE_NODE_need_write))
+                       goto do_write;
+
+               new &= ~(1U << BTREE_NODE_write_in_flight);
+       } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+       wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+       return;
+
+do_write:
+       six_lock_read(&b->c.lock, NULL, NULL);
+       v = READ_ONCE(b->flags);
+       do {
+               old = new = v;
+
+               if ((old & (1U << BTREE_NODE_dirty)) &&
+                   (old & (1U << BTREE_NODE_need_write)) &&
+                   !(old & (1U << BTREE_NODE_never_write)) &&
+                   btree_node_may_write(b)) {
+                       new &= ~(1U << BTREE_NODE_dirty);
+                       new &= ~(1U << BTREE_NODE_need_write);
+                       new |=  (1U << BTREE_NODE_write_in_flight);
+                       new |=  (1U << BTREE_NODE_just_written);
+                       new ^=  (1U << BTREE_NODE_write_idx);
+               } else {
+                       new &= ~(1U << BTREE_NODE_write_in_flight);
+               }
+       } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+       if (new & (1U << BTREE_NODE_write_in_flight))
+               __bch2_btree_node_write(c, b, true);
+
+       six_unlock_read(&b->c.lock);
 }
 
 static void bch2_btree_node_write_error(struct bch_fs *c,
@@ -1729,7 +1767,7 @@ static void btree_write_submit(struct work_struct *work)
        bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key);
 }
 
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
 {
        struct btree_write_bio *wbio;
        struct bset_tree *t;
@@ -1746,7 +1784,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
        bool validate_before_checksum = false;
        void *data;
 
-       BUG_ON(btree_node_write_in_flight(b));
+       if (already_started)
+               goto do_write;
 
        if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
                return;
@@ -1770,14 +1809,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
                if (old & (1 << BTREE_NODE_never_write))
                        return;
 
-               if (old & (1 << BTREE_NODE_write_in_flight)) {
-                       /*
-                        * XXX waiting on btree writes with btree locks held -
-                        * this can deadlock, and we hit the write error path
-                        */
-                       bch2_btree_node_wait_on_write(b);
-                       continue;
-               }
+               BUG_ON(old & (1 << BTREE_NODE_write_in_flight));
 
                new &= ~(1 << BTREE_NODE_dirty);
                new &= ~(1 << BTREE_NODE_need_write);
@@ -1786,6 +1818,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
                new ^=  (1 << BTREE_NODE_write_idx);
        } while (cmpxchg_acquire(&b->flags, old, new) != old);
 
+       if (new & (1U << BTREE_NODE_need_write))
+               return;
+do_write:
        atomic_dec(&c->btree_cache.dirty);
 
        BUG_ON(btree_node_fake(b));
@@ -2041,7 +2076,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        if (lock_type_held == SIX_LOCK_intent ||
            (lock_type_held == SIX_LOCK_read &&
             six_lock_tryupgrade(&b->c.lock))) {
-               __bch2_btree_node_write(c, b);
+               __bch2_btree_node_write(c, b, false);
 
                /* don't cycle lock unnecessarily: */
                if (btree_node_just_written(b) &&
@@ -2053,7 +2088,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                if (lock_type_held == SIX_LOCK_read)
                        six_lock_downgrade(&b->c.lock);
        } else {
-               __bch2_btree_node_write(c, b);
+               __bch2_btree_node_write(c, b, false);
                if (lock_type_held == SIX_LOCK_write &&
                    btree_node_just_written(b))
                        bch2_btree_post_write_cleanup(c, b);
index 89fd4aba521880624e39d73faf28aaff9a9241ae..3732d135de8dd44ca6134fd55a1dde7732632c55 100644 (file)
@@ -139,7 +139,7 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *,
                              struct btree_write *);
 void bch2_btree_write_error_work(struct work_struct *);
 
-void __bch2_btree_node_write(struct bch_fs *, struct btree *);
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool);
 bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
@@ -148,18 +148,11 @@ void bch2_btree_node_write(struct bch_fs *, struct btree *,
 static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
                                            enum six_lock_type lock_held)
 {
-       while (b->written &&
-              btree_node_need_write(b) &&
-              btree_node_may_write(b)) {
-               if (!btree_node_write_in_flight(b)) {
-                       bch2_btree_node_write(c, b, lock_held);
-                       break;
-               }
-
-               six_unlock_type(&b->c.lock, lock_held);
-               bch2_btree_node_wait_on_write(b);
-               btree_node_lock_type(c, b, lock_held);
-       }
+       if (b->written &&
+           btree_node_need_write(b) &&
+           btree_node_may_write(b) &&
+           !btree_node_write_in_flight(b))
+               bch2_btree_node_write(c, b, lock_held);
 }
 
 #define bch2_btree_node_write_cond(_c, _b, cond)                       \