bcachefs: use dedicated workqueue for tasks holding write refs
author: Brian Foster <bfoster@redhat.com>
Thu, 23 Mar 2023 18:09:05 +0000 (14:09 -0400)
committer: Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:58 +0000 (17:09 -0400)
A workqueue resource deadlock has been observed when running fsck
on a filesystem with a full/stuck journal. fsck is not currently
able to repair the fs due to fairly rapid emergency shutdown, but
rather than exit gracefully the fsck process hangs during the
shutdown sequence. Fortunately this is easily recoverable from
userspace, but the root cause involves code shared between the
kernel and userspace and so should be addressed.

The deadlock scenario involves the main task in the bch2_fs_stop()
-> bch2_fs_read_only() path waiting on write references to drain
with the fs state lock held. A bch2_read_only_work() workqueue task
is scheduled on the system_long_wq, blocked on the state lock.
Finally, various other write ref holding workqueue tasks are
scheduled to run on the same workqueue and must complete in order to
release references that the initial task is waiting on.

To avoid this problem, we can split the dependent workqueue tasks
across different workqueues. It's a bit of a waste to create a
dedicated wq for the read-only worker, but there are several tasks
throughout the fs that follow the pattern of acquiring a write
reference and then scheduling to the system wq. Use a local wq
for such tasks to break the subtle dependency between these and the
read-only worker.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_background.c
fs/bcachefs/bcachefs.h
fs/bcachefs/ec.c
fs/bcachefs/subvolume.c
fs/bcachefs/super.c

index 17bcebbd1f2a828fe2e46ae22be78c810c443f2b..23de3ecc6a1ee4cec0598b59b3a73f32c4ad1911 100644 (file)
@@ -1760,7 +1760,7 @@ static void bch2_do_discards_work(struct work_struct *work)
 void bch2_do_discards(struct bch_fs *c)
 {
        if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
-           !queue_work(system_long_wq, &c->discard_work))
+           !queue_work(c->write_ref_wq, &c->discard_work))
                bch2_write_ref_put(c, BCH_WRITE_REF_discard);
 }
 
@@ -1886,7 +1886,7 @@ err:
 void bch2_do_invalidates(struct bch_fs *c)
 {
        if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
-           !queue_work(system_long_wq, &c->invalidate_work))
+           !queue_work(c->write_ref_wq, &c->invalidate_work))
                bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
 
index c1f27b4910a0f64818f97df28910211a877c8050..fcbbc88d77c214962006f71a895974db954356ca 100644 (file)
@@ -808,6 +808,12 @@ struct bch_fs {
        struct workqueue_struct *btree_io_complete_wq;
        /* copygc needs its own workqueue for index updates.. */
        struct workqueue_struct *copygc_wq;
+       /*
+        * Use a dedicated wq for write ref holder tasks. Required to avoid
+        * dependency problems with other wq tasks that can block on ref
+        * draining, such as read-only transition.
+        */
+       struct workqueue_struct *write_ref_wq;
 
        /* ALLOCATION */
        struct bch_devs_mask    rw_devs[BCH_DATA_NR];
index 1e621dcc1d3724a44605356fc5e09cbaf54b8bd3..a444f6d513e5eac85f4e8dbdf582d066667fa82d 100644 (file)
@@ -826,7 +826,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
 void bch2_do_stripe_deletes(struct bch_fs *c)
 {
        if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
-           !schedule_work(&c->ec_stripe_delete_work))
+           !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
                bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
 }
 
index 43d83705a7aee6ce209ca3c3023b058ac3eda1f0..6407d19edc0e5cc87be979376a906858d0137ae3 100644 (file)
@@ -714,7 +714,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
 {
        if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
-           !queue_work(system_long_wq, &c->snapshot_delete_work))
+           !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
                bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
 }
 
@@ -926,7 +926,7 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
        if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
                return -EROFS;
 
-       if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
+       if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
                bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
        return 0;
 }
index d6f2f453c0275feaf682c9c7a38a26503a129484..a209de24064cb116eb61cd3926e19e3910eb6736 100644 (file)
@@ -493,6 +493,8 @@ static void __bch2_fs_free(struct bch_fs *c)
        kfree(c->journal_seq_blacklist_table);
        kfree(c->unused_inode_hints);
 
+       if (c->write_ref_wq)
+               destroy_workqueue(c->write_ref_wq);
        if (c->io_complete_wq)
                destroy_workqueue(c->io_complete_wq);
        if (c->copygc_wq)
@@ -787,6 +789,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
            !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
                                WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
+           !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
+                               WQ_FREEZABLE, 0)) ||
 #ifndef BCH_WRITE_REF_DEBUG
            percpu_ref_init(&c->writes, bch2_writes_disabled,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||