bcachefs: Fixes for going RO
author Kent Overstreet <kent.overstreet@gmail.com>
Thu, 28 May 2020 20:06:13 +0000 (16:06 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:40 +0000 (17:08 -0400)
Now that interior btree updates are fully transactional, we don't need
to write out alloc info in a loop. However, interior btree updates do
put more things in the journal, so we still need a loop in the RO
sequence.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_background.c
fs/bcachefs/bcachefs.h
fs/bcachefs/journal_reclaim.c
fs/bcachefs/journal_reclaim.h
fs/bcachefs/super.c

index 38173f662d1efa3875e88915e1480569892c405f..09a719b256b3d960b82ee0d945a8cf9e6cc9650f 100644 (file)
@@ -869,6 +869,15 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
        if (!invalidating_cached_data)
                goto out;
 
+       /*
+        * If the read-only path is trying to shut down, we can't be generating
+        * new btree updates:
+        */
+       if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) {
+               ret = 1;
+               goto out;
+       }
+
        BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
 
        bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
@@ -956,7 +965,7 @@ out:
                percpu_up_read(&c->mark_lock);
        }
 
-       return ret;
+       return ret < 0 ? ret : 0;
 }
 
 static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
index e12946d686ddecf63d34ddb249c9796a55628b63..a9007250881961153018998ec0dd21220bb513b9 100644 (file)
@@ -482,6 +482,7 @@ enum {
        BCH_FS_ALLOC_CLEAN,
        BCH_FS_ALLOCATOR_STARTED,
        BCH_FS_ALLOCATOR_RUNNING,
+       BCH_FS_ALLOCATOR_STOPPING,
        BCH_FS_INITIAL_GC_DONE,
        BCH_FS_FSCK_DONE,
        BCH_FS_STARTED,
index 6cb37045cf685fc2193ccffa3051a7464d316d13..556f12602fcfa7cd665f15965bebeaa6cad2867f 100644 (file)
@@ -413,10 +413,12 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
        return ret;
 }
 
-static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
+/* returns true if we did work */
+static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
                               unsigned min_nr)
 {
        struct journal_entry_pin *pin;
+       bool ret = false;
        u64 seq;
 
        lockdep_assert_held(&j->reclaim_lock);
@@ -431,7 +433,10 @@ static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
                BUG_ON(j->flush_in_progress != pin);
                j->flush_in_progress = NULL;
                wake_up(&j->pin_flush_wait);
+               ret = true;
        }
+
+       return ret;
 }
 
 /**
@@ -523,7 +528,8 @@ void bch2_journal_reclaim_work(struct work_struct *work)
        mutex_unlock(&j->reclaim_lock);
 }
 
-static int journal_flush_done(struct journal *j, u64 seq_to_flush)
+static int journal_flush_done(struct journal *j, u64 seq_to_flush,
+                             bool *did_work)
 {
        int ret;
 
@@ -533,7 +539,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush)
 
        mutex_lock(&j->reclaim_lock);
 
-       journal_flush_pins(j, seq_to_flush, 0);
+       *did_work = journal_flush_pins(j, seq_to_flush, 0);
 
        spin_lock(&j->lock);
        /*
@@ -551,12 +557,17 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush)
        return ret;
 }
 
-void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
+bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
 {
+       bool did_work = false;
+
        if (!test_bit(JOURNAL_STARTED, &j->flags))
-               return;
+               return false;
+
+       closure_wait_event(&j->async_wait,
+               journal_flush_done(j, seq_to_flush, &did_work));
 
-       closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush));
+       return did_work;
 }
 
 int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
index 3ef641f7ce3030c2844ae59bcc987fe8e634e875..272ba8a37967c8c0e1605d833e1b1df735b2035b 100644 (file)
@@ -53,11 +53,11 @@ void bch2_journal_do_discards(struct journal *);
 void bch2_journal_reclaim(struct journal *);
 void bch2_journal_reclaim_work(struct work_struct *);
 
-void bch2_journal_flush_pins(struct journal *, u64);
+bool bch2_journal_flush_pins(struct journal *, u64);
 
-static inline void bch2_journal_flush_all_pins(struct journal *j)
+static inline bool bch2_journal_flush_all_pins(struct journal *j)
 {
-       bch2_journal_flush_pins(j, U64_MAX);
+       return bch2_journal_flush_pins(j, U64_MAX);
 }
 
 int bch2_journal_flush_device_pins(struct journal *, int);
index 3cf75ac1b8047c63f694d28743324b57c9968bd6..9da64d9d52e571ed9b9463f565ea49661ea82592 100644 (file)
@@ -175,7 +175,7 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
 static void __bch2_fs_read_only(struct bch_fs *c)
 {
        struct bch_dev *ca;
-       bool wrote;
+       bool wrote = false;
        unsigned i, clean_passes = 0;
        int ret;
 
@@ -200,39 +200,46 @@ static void __bch2_fs_read_only(struct bch_fs *c)
                goto nowrote_alloc;
 
        bch_verbose(c, "writing alloc info");
+       /*
+        * This should normally just be writing the bucket read/write clocks:
+        */
+       ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
+               bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
+       bch_verbose(c, "writing alloc info complete");
 
-       do {
-               wrote = false;
+       if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+               bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
 
-               ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?:
-                       bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote);
+       if (ret)
+               goto nowrote_alloc;
 
-               if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
-                       bch2_fs_inconsistent(c, "error writing out alloc info %i", ret);
+       bch_verbose(c, "flushing journal and stopping allocators");
 
-               if (ret)
-                       goto nowrote_alloc;
+       bch2_journal_flush_all_pins(&c->journal);
+       set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
 
-               for_each_member_device(ca, c, i)
-                       bch2_dev_allocator_quiesce(c, ca);
+       do {
+               clean_passes++;
 
-               bch2_journal_flush_all_pins(&c->journal);
+               if (bch2_journal_flush_all_pins(&c->journal))
+                       clean_passes = 0;
 
                /*
-                * We need to explicitly wait on btree interior updates to complete
-                * before stopping the journal, flushing all journal pins isn't
-                * sufficient, because in the BTREE_INTERIOR_UPDATING_ROOT case btree
-                * interior updates have to drop their journal pin before they're
-                * fully complete:
+                * In flight interior btree updates will generate more journal
+                * updates and btree updates (alloc btree):
                 */
-               closure_wait_event(&c->btree_interior_update_wait,
-                                  !bch2_btree_interior_updates_nr_pending(c));
+               if (bch2_btree_interior_updates_nr_pending(c)) {
+                       closure_wait_event(&c->btree_interior_update_wait,
+                                          !bch2_btree_interior_updates_nr_pending(c));
+                       clean_passes = 0;
+               }
                flush_work(&c->btree_interior_update_work);
 
-               clean_passes = wrote ? 0 : clean_passes + 1;
+               if (bch2_journal_flush_all_pins(&c->journal))
+                       clean_passes = 0;
        } while (clean_passes < 2);
+       bch_verbose(c, "flushing journal and stopping allocators complete");
 
-       bch_verbose(c, "writing alloc info complete");
        set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
 nowrote_alloc:
        closure_wait_event(&c->btree_interior_update_wait,
@@ -243,11 +250,10 @@ nowrote_alloc:
                bch2_dev_allocator_stop(ca);
 
        clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
+       clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
 
        bch2_fs_journal_stop(&c->journal);
 
-       /* XXX: mark super that alloc info is persistent */
-
        /*
         * the journal kicks off btree writes via reclaim - wait for in flight
         * writes after stopping journal: