bcachefs: Improve journal pin flushing
authorKent Overstreet <kent.overstreet@linux.dev>
Sun, 26 Jan 2025 00:22:50 +0000 (19:22 -0500)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 26 Jan 2025 00:37:43 +0000 (19:37 -0500)
Running the preempt tiering tests with a lower than normal journal
reclaim delay turned up a shutdown hang - a lost wakeup, caused because
flushing a journal pin (e.g. key cache/write buffer) can generate a new
journal pin.

The "simple" fix of adding the correct wakeup didn't work because of
ordering issues; if we flush btree node pins too aggressively before
other pins have completed, we end up spinning where each flush iteration
generates new work.

So to fix this correctly:
- The list of flushed journal pins is now broken out by type, so that
  we can wait for key cache/write buffer pin flushing to complete
  before flushing dirty btree nodes

- A new closure_waitlist is added for bch2_journal_flush_pins; this one
  is only used under or when we're taking the journal lock, so it's
  pretty cheap to add rigorously correct wakeups to journal_pin_set()
  and journal_pin_drop().

Additionally, bch2_journal_seq_pins_to_text() is moved to
journal_reclaim.c, where it belongs, along with a bit of other small
renaming and refactoring.

Besides fixing the hang, the better ordering between key cache/write
buffer flushing and btree node flushing should help or fix the "unmount
taking excessively long" a few users have been noticing.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/debug.c
fs/bcachefs/journal.c
fs/bcachefs/journal.h
fs/bcachefs/journal_reclaim.c
fs/bcachefs/journal_reclaim.h
fs/bcachefs/journal_types.h

index b5de52a50d10b0c69a5e51aa7a1450f2d8f53cd4..55333e82d1fe8f9549e46506b0a50ee1485f428e 100644 (file)
@@ -20,6 +20,7 @@
 #include "extents.h"
 #include "fsck.h"
 #include "inode.h"
+#include "journal_reclaim.h"
 #include "super.h"
 
 #include <linux/console.h>
index 46d53d3ba018996ea68dbaa99830a46fbf662dcc..cb2c3722f674198948968519a22dfc8f9ad784eb 100644 (file)
@@ -113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq)
 
 static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
 {
-       unsigned i;
-
-       for (i = 0; i < ARRAY_SIZE(p->list); i++)
-               INIT_LIST_HEAD(&p->list[i]);
-       INIT_LIST_HEAD(&p->flushed);
+       for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++)
+               INIT_LIST_HEAD(&p->unflushed[i]);
+       for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
+               INIT_LIST_HEAD(&p->flushed[i]);
        atomic_set(&p->count, count);
        p->devs.nr = 0;
 }
@@ -1626,54 +1625,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        __bch2_journal_debug_to_text(out, j);
        spin_unlock(&j->lock);
 }
-
-bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
-{
-       struct journal_entry_pin_list *pin_list;
-       struct journal_entry_pin *pin;
-
-       spin_lock(&j->lock);
-       if (!test_bit(JOURNAL_running, &j->flags)) {
-               spin_unlock(&j->lock);
-               return true;
-       }
-
-       *seq = max(*seq, j->pin.front);
-
-       if (*seq >= j->pin.back) {
-               spin_unlock(&j->lock);
-               return true;
-       }
-
-       out->atomic++;
-
-       pin_list = journal_seq_pin(j, *seq);
-
-       prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
-       printbuf_indent_add(out, 2);
-
-       for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
-               list_for_each_entry(pin, &pin_list->list[i], list)
-                       prt_printf(out, "\t%px %ps\n", pin, pin->flush);
-
-       if (!list_empty(&pin_list->flushed))
-               prt_printf(out, "flushed:\n");
-
-       list_for_each_entry(pin, &pin_list->flushed, list)
-               prt_printf(out, "\t%px %ps\n", pin, pin->flush);
-
-       printbuf_indent_sub(out, 2);
-
-       --out->atomic;
-       spin_unlock(&j->lock);
-
-       return false;
-}
-
-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
-{
-       u64 seq = 0;
-
-       while (!bch2_journal_seq_pins_to_text(out, j, &seq))
-               seq++;
-}
index a01dae1a57e317c30451f9fa6b1ddbc8ee4b5d45..dccddd5420adf6c8f58ddf9bf3def6e04b2225aa 100644 (file)
@@ -430,8 +430,6 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u
 
 void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
 void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
-bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
 
 int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
                                unsigned nr);
index 3c8242606da7e3db5a475549a2822ac3179cafe6..6a9cefb635d63bf04ff06294b59b7192ee3d7b60 100644 (file)
@@ -327,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j)
                popped = true;
        }
 
-       if (popped)
+       if (popped) {
                bch2_journal_space_available(j);
+               __closure_wake_up(&j->reclaim_flush_wait);
+       }
 }
 
 bool __bch2_journal_pin_put(struct journal *j, u64 seq)
@@ -362,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j,
        pin->seq = 0;
        list_del_init(&pin->list);
 
+       if (j->reclaim_flush_wait.list.first)
+               __closure_wake_up(&j->reclaim_flush_wait);
+
        /*
         * Unpinning a journal entry may make journal_next_bucket() succeed, if
         * writing a new last_seq will now make another bucket available:
@@ -383,11 +388,11 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
 {
        if (fn == bch2_btree_node_flush0 ||
            fn == bch2_btree_node_flush1)
-               return JOURNAL_PIN_btree;
+               return JOURNAL_PIN_TYPE_btree;
        else if (fn == bch2_btree_key_cache_journal_flush)
-               return JOURNAL_PIN_key_cache;
+               return JOURNAL_PIN_TYPE_key_cache;
        else
-               return JOURNAL_PIN_other;
+               return JOURNAL_PIN_TYPE_other;
 }
 
 static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
@@ -406,7 +411,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
        atomic_inc(&pin_list->count);
        pin->seq        = seq;
        pin->flush      = flush_fn;
-       list_add(&pin->list, &pin_list->list[type]);
+
+       if (list_empty(&pin_list->unflushed[type]) &&
+           j->reclaim_flush_wait.list.first)
+               __closure_wake_up(&j->reclaim_flush_wait);
+
+       list_add(&pin->list, &pin_list->unflushed[type]);
 }
 
 void bch2_journal_pin_copy(struct journal *j,
@@ -499,16 +509,15 @@ journal_get_next_pin(struct journal *j,
 {
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;
-       unsigned i;
 
        fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
                if (*seq > seq_to_flush && !allowed_above_seq)
                        break;
 
-               for (i = 0; i < JOURNAL_PIN_NR; i++)
-                       if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
-                           ((1U << i) & allowed_above_seq)) {
-                               ret = list_first_entry_or_null(&pin_list->list[i],
+               for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
+                       if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) ||
+                           (BIT(i) & allowed_above_seq)) {
+                               ret = list_first_entry_or_null(&pin_list->unflushed[i],
                                        struct journal_entry_pin, list);
                                if (ret)
                                        return ret;
@@ -544,8 +553,8 @@ static size_t journal_flush_pins(struct journal *j,
                }
 
                if (min_key_cache) {
-                       allowed_above |= 1U << JOURNAL_PIN_key_cache;
-                       allowed_below |= 1U << JOURNAL_PIN_key_cache;
+                       allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache);
+                       allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache);
                }
 
                cond_resched();
@@ -553,7 +562,9 @@ static size_t journal_flush_pins(struct journal *j,
                j->last_flushed = jiffies;
 
                spin_lock(&j->lock);
-               pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
+               pin = journal_get_next_pin(j, seq_to_flush,
+                                          allowed_below,
+                                          allowed_above, &seq);
                if (pin) {
                        BUG_ON(j->flush_in_progress);
                        j->flush_in_progress = pin;
@@ -576,7 +587,7 @@ static size_t journal_flush_pins(struct journal *j,
                spin_lock(&j->lock);
                /* Pin might have been dropped or rearmed: */
                if (likely(!err && !j->flush_in_progress_dropped))
-                       list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
+                       list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
                j->flush_in_progress = NULL;
                j->flush_in_progress_dropped = false;
                spin_unlock(&j->lock);
@@ -816,10 +827,41 @@ int bch2_journal_reclaim_start(struct journal *j)
        return 0;
 }
 
+static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
+                                       unsigned types)
+{
+       struct journal_entry_pin_list *pin_list;
+       u64 seq;
+
+       spin_lock(&j->lock);
+       fifo_for_each_entry_ptr(pin_list, &j->pin, seq) {
+               if (seq > seq_to_flush)
+                       break;
+
+               for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
+                       if ((BIT(i) & types) &&
+                           (!list_empty(&pin_list->unflushed[i]) ||
+                            !list_empty(&pin_list->flushed[i]))) {
+                               spin_unlock(&j->lock);
+                               return true;
+                       }
+       }
+       spin_unlock(&j->lock);
+
+       return false;
+}
+
+static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush,
+                                                unsigned types)
+{
+       return  journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) ||
+               journal_pins_still_flushing(j, seq_to_flush, types);
+}
+
 static int journal_flush_done(struct journal *j, u64 seq_to_flush,
                              bool *did_work)
 {
-       int ret;
+       int ret = 0;
 
        ret = bch2_journal_error(j);
        if (ret)
@@ -827,12 +869,18 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 
        mutex_lock(&j->reclaim_lock);
 
-       if (journal_flush_pins(j, seq_to_flush,
-                              (1U << JOURNAL_PIN_key_cache)|
-                              (1U << JOURNAL_PIN_other), 0, 0, 0) ||
-           journal_flush_pins(j, seq_to_flush,
-                              (1U << JOURNAL_PIN_btree), 0, 0, 0))
+       if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
+                              BIT(JOURNAL_PIN_TYPE_key_cache)|
+                              BIT(JOURNAL_PIN_TYPE_other))) {
+               *did_work = true;
+               goto unlock;
+       }
+
+       if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
+                              BIT(JOURNAL_PIN_TYPE_btree))) {
                *did_work = true;
+               goto unlock;
+       }
 
        if (seq_to_flush > journal_cur_seq(j))
                bch2_journal_entry_close(j);
@@ -847,6 +895,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
                !fifo_used(&j->pin);
 
        spin_unlock(&j->lock);
+unlock:
        mutex_unlock(&j->reclaim_lock);
 
        return ret;
@@ -860,7 +909,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
        if (!test_bit(JOURNAL_running, &j->flags))
                return false;
 
-       closure_wait_event(&j->async_wait,
+       closure_wait_event(&j->reclaim_flush_wait,
                journal_flush_done(j, seq_to_flush, &did_work));
 
        return did_work;
@@ -926,3 +975,54 @@ err:
 
        return ret;
 }
+
+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
+{
+       struct journal_entry_pin_list *pin_list;
+       struct journal_entry_pin *pin;
+
+       spin_lock(&j->lock);
+       if (!test_bit(JOURNAL_running, &j->flags)) {
+               spin_unlock(&j->lock);
+               return true;
+       }
+
+       *seq = max(*seq, j->pin.front);
+
+       if (*seq >= j->pin.back) {
+               spin_unlock(&j->lock);
+               return true;
+       }
+
+       out->atomic++;
+
+       pin_list = journal_seq_pin(j, *seq);
+
+       prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
+       printbuf_indent_add(out, 2);
+
+       prt_printf(out, "unflushed:\n");
+       for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
+               list_for_each_entry(pin, &pin_list->unflushed[i], list)
+                       prt_printf(out, "\t%px %ps\n", pin, pin->flush);
+
+       prt_printf(out, "flushed:\n");
+       for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++)
+               list_for_each_entry(pin, &pin_list->flushed[i], list)
+                       prt_printf(out, "\t%px %ps\n", pin, pin->flush);
+
+       printbuf_indent_sub(out, 2);
+
+       --out->atomic;
+       spin_unlock(&j->lock);
+
+       return false;
+}
+
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
+{
+       u64 seq = 0;
+
+       while (!bch2_journal_seq_pins_to_text(out, j, &seq))
+               seq++;
+}
index ec84c334528177e8c865ebdbf9b9d7e265270718..0a73d7134e1cc6c6a770b4f933760f843e58d458 100644 (file)
@@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j)
 
 int bch2_journal_flush_device_pins(struct journal *, int);
 
+void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
+
 #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
index e9bd716fbb71083bea09a9c2105817c2bed95708..3ba433a48eb8aed01f8c45907b3458cd120851ef 100644 (file)
@@ -53,15 +53,15 @@ struct journal_buf {
  */
 
 enum journal_pin_type {
-       JOURNAL_PIN_btree,
-       JOURNAL_PIN_key_cache,
-       JOURNAL_PIN_other,
-       JOURNAL_PIN_NR,
+       JOURNAL_PIN_TYPE_btree,
+       JOURNAL_PIN_TYPE_key_cache,
+       JOURNAL_PIN_TYPE_other,
+       JOURNAL_PIN_TYPE_NR,
 };
 
 struct journal_entry_pin_list {
-       struct list_head                list[JOURNAL_PIN_NR];
-       struct list_head                flushed;
+       struct list_head                unflushed[JOURNAL_PIN_TYPE_NR];
+       struct list_head                flushed[JOURNAL_PIN_TYPE_NR];
        atomic_t                        count;
        struct bch_devs_list            devs;
 };
@@ -226,6 +226,7 @@ struct journal {
        /* Used when waiting because the journal was full */
        wait_queue_head_t       wait;
        struct closure_waitlist async_wait;
+       struct closure_waitlist reclaim_flush_wait;
 
        struct delayed_work     write_work;
        struct workqueue_struct *wq;