bcachefs: Don't block on journal reservation with btree locks held
author Kent Overstreet <kent.overstreet@gmail.com>
Mon, 19 Nov 2018 02:35:59 +0000 (21:35 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:11 +0000 (17:08 -0400)
Fixes a deadlock between the allocator thread (when it first starts
up) and journal replay.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/journal.c
fs/bcachefs/journal.h
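
This commit also changes the journal reservation API from a
(u64s_min, u64s_max) pair to a fixed size plus a flags argument. A
condensed sketch of the new interface, taken from the journal.h hunk
below (the comments are editorial, not part of the patch):

    #define JOURNAL_RES_GET_NONBLOCK  (1 << 0)  /* return -EAGAIN instead of sleeping */
    #define JOURNAL_RES_GET_CHECK     (1 << 1)  /* wait for space, take no reservation */

    int bch2_journal_res_get(struct journal *j, struct journal_res *res,
                             unsigned u64s, unsigned flags);

Callers that hold btree node locks first try JOURNAL_RES_GET_NONBLOCK;
on -EAGAIN they drop their locks, block with JOURNAL_RES_GET_CHECK,
then relock and retry (see do_btree_insert_at() below).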

diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 01e476d72595cb39eee870a74e022274476cd4ef..af31819c88c78e9339156e59714e17d61e2dca33 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -638,12 +638,12 @@ static void btree_update_wait_on_journal(struct closure *cl)
        int ret;
 
        ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
-       if (ret < 0)
-               goto err;
-       if (!ret) {
+       if (ret == -EAGAIN) {
                continue_at(cl, btree_update_wait_on_journal, system_wq);
                return;
        }
+       if (ret < 0)
+               goto err;
 
        bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
 err:
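
Note the ordering in the rewritten btree_update_wait_on_journal():
-EAGAIN is itself a negative errno, so the -EAGAIN case must be tested
before the generic "ret < 0" error path, or a transient "journal entry
not yet open" would be treated as a hard failure. The resulting return
convention for bch2_journal_open_seq_async() (editorial summary, not
part of the patch):

    /*
     * 0        seq is open, or already past: go ahead and flush
     * -EAGAIN  the closure was parked on j->async_wait; continue_at()
     *          re-runs btree_update_wait_on_journal() when it's woken
     * other <0 hard error (e.g. -EROFS): give up
     */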
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 093e480977c713eb47c38c917f26c39e716cceea..41691bebf6795cc40035d28ad92e9268d27b6bc3 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -344,19 +344,35 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
        trans_for_each_entry(trans, i)
                BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
 
-       u64s = 0;
-       trans_for_each_entry(trans, i)
-               u64s += jset_u64s(i->k->k.u64s);
-
        memset(&trans->journal_res, 0, sizeof(trans->journal_res));
 
-       ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
-               ? bch2_journal_res_get(&c->journal,
-                                     &trans->journal_res,
-                                     u64s, u64s)
-               : 0;
-       if (ret)
-               return ret;
+       if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+               u64s = 0;
+               trans_for_each_entry(trans, i)
+                       u64s += jset_u64s(i->k->k.u64s);
+
+               while ((ret = bch2_journal_res_get(&c->journal,
+                                       &trans->journal_res, u64s,
+                                       JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
+                       struct btree_iter *iter = trans->entries[0].iter;
+
+                       bch2_btree_iter_unlock(iter);
+
+                       ret = bch2_journal_res_get(&c->journal,
+                                       &trans->journal_res, u64s,
+                                       JOURNAL_RES_GET_CHECK);
+                       if (ret)
+                               return ret;
+
+                       if (!bch2_btree_iter_relock(iter)) {
+                               trans_restart(" (iter relock after journal res get blocked)");
+                               return -EINTR;
+                       }
+               }
+
+               if (ret)
+                       return ret;
+       }
 
        multi_lock_write(c, trans);
 
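
The loop above is the heart of the fix: if a non-blocking reservation
fails, drop btree locks before blocking, then relock and retry. A
self-contained sketch of the idiom (the helper name is hypothetical;
the patch open-codes this in do_btree_insert_at()):

    static int journal_res_get_unlocking(struct bch_fs *c,
                                         struct btree_iter *iter,
                                         struct journal_res *res,
                                         unsigned u64s)
    {
            int ret;

            while ((ret = bch2_journal_res_get(&c->journal, res, u64s,
                                    JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
                    /* can't sleep while holding btree node locks: */
                    bch2_btree_iter_unlock(iter);

                    /* wait for space without taking a reservation: */
                    ret = bch2_journal_res_get(&c->journal, res, u64s,
                                    JOURNAL_RES_GET_CHECK);
                    if (ret)
                            return ret;

                    /* the btree may have changed while unlocked: */
                    if (!bch2_btree_iter_relock(iter))
                            return -EINTR;
            }
            return ret;
    }

Because the iterator was unlocked while waiting, relocking can fail;
-EINTR tells the transaction machinery to restart rather than proceed
with stale iterators.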
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index b4d03766462811e1546e0d0311686f64d1ab94d6..5db0a469ac24ac0de19c75a615bc905fba696fcd 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -335,15 +335,14 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
 }
 
 static int __journal_res_get(struct journal *j, struct journal_res *res,
-                             unsigned u64s_min, unsigned u64s_max)
+                            unsigned flags)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_buf *buf;
        int ret;
 retry:
-       ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
-       if (ret)
-               return ret;
+       if (journal_res_get_fast(j, res, flags))
+               return 0;
 
        spin_lock(&j->lock);
        /*
@@ -351,10 +350,9 @@ retry:
         * that just did journal_entry_open() and call journal_entry_close()
         * unnecessarily
         */
-       ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
-       if (ret) {
+       if (journal_res_get_fast(j, res, flags)) {
                spin_unlock(&j->lock);
-               return 1;
+               return 0;
        }
 
        /*
@@ -377,7 +375,12 @@ retry:
                spin_unlock(&j->lock);
                return -EROFS;
        case JOURNAL_ENTRY_INUSE:
-               /* haven't finished writing out the previous one: */
+               /*
+                * The current journal entry is still open, but we failed to get
+                * a journal reservation because there's not enough space in it,
+                * and we can't close it and start another because we haven't
+                * finished writing out the previous entry:
+                */
                spin_unlock(&j->lock);
                trace_journal_entry_full(c);
                goto blocked;
@@ -408,7 +411,7 @@ retry:
 blocked:
        if (!j->res_get_blocked_start)
                j->res_get_blocked_start = local_clock() ?: 1;
-       return 0;
+       return -EAGAIN;
 }
 
 /*
@@ -422,14 +425,14 @@ blocked:
  * btree node write locks.
  */
 int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
-                                unsigned u64s_min, unsigned u64s_max)
+                                 unsigned flags)
 {
        int ret;
 
        wait_event(j->wait,
-                  (ret = __journal_res_get(j, res, u64s_min,
-                                           u64s_max)));
-       return ret < 0 ? ret : 0;
+                  (ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
+                  (flags & JOURNAL_RES_GET_NONBLOCK));
+       return ret;
 }
 
 u64 bch2_journal_last_unwritten_seq(struct journal *j)
@@ -453,28 +456,55 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j)
  * btree root - every journal entry contains the roots of all the btrees, so it
  * doesn't need to bother with getting a journal reservation
  */
-int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent)
+int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
 {
-       int ret;
-
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       bool need_reclaim = false;
+retry:
        spin_lock(&j->lock);
-       BUG_ON(seq > journal_cur_seq(j));
 
        if (seq < journal_cur_seq(j) ||
            journal_entry_is_open(j)) {
                spin_unlock(&j->lock);
-               return 1;
+               return 0;
+       }
+
+       if (journal_cur_seq(j) < seq) {
+               switch (journal_buf_switch(j, false)) {
+               case JOURNAL_ENTRY_ERROR:
+                       spin_unlock(&j->lock);
+                       return -EROFS;
+               case JOURNAL_ENTRY_INUSE:
+                       /* haven't finished writing out the previous one: */
+                       trace_journal_entry_full(c);
+                       goto blocked;
+               case JOURNAL_ENTRY_CLOSED:
+                       break;
+               case JOURNAL_UNLOCKED:
+                       goto retry;
+               }
+       }
+
+       BUG_ON(journal_cur_seq(j) < seq);
+
+       if (!journal_entry_open(j)) {
+               need_reclaim = true;
+               goto blocked;
        }
 
-       ret = journal_entry_open(j);
-       if (!ret)
-               closure_wait(&j->async_wait, parent);
        spin_unlock(&j->lock);
 
-       if (!ret)
-               bch2_journal_reclaim_work(&j->reclaim_work.work);
+       return 0;
+blocked:
+       if (!j->res_get_blocked_start)
+               j->res_get_blocked_start = local_clock() ?: 1;
 
-       return ret;
+       closure_wait(&j->async_wait, cl);
+       spin_unlock(&j->lock);
+
+       if (need_reclaim)
+               bch2_journal_reclaim_work(&j->reclaim_work.work);
+       return -EAGAIN;
 }
 
 static int journal_seq_error(struct journal *j, u64 seq)
@@ -594,11 +624,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
 void bch2_journal_meta_async(struct journal *j, struct closure *parent)
 {
        struct journal_res res;
-       unsigned u64s = jset_u64s(0);
 
        memset(&res, 0, sizeof(res));
 
-       bch2_journal_res_get(j, &res, u64s, u64s);
+       bch2_journal_res_get(j, &res, jset_u64s(0), 0);
        bch2_journal_res_put(j, &res);
 
        bch2_journal_flush_seq_async(j, res.seq, parent);
@@ -607,12 +636,11 @@ void bch2_journal_meta_async(struct journal *j, struct closure *parent)
 int bch2_journal_meta(struct journal *j)
 {
        struct journal_res res;
-       unsigned u64s = jset_u64s(0);
        int ret;
 
        memset(&res, 0, sizeof(res));
 
-       ret = bch2_journal_res_get(j, &res, u64s, u64s);
+       ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
        if (ret)
                return ret;
 
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 77cf39cc64ff57b42a5e152710f6b9083572893b..d9c094ba2ca0b2fc809b8ac98ec21f2f8c682e4e 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -272,12 +272,14 @@ static inline void bch2_journal_res_put(struct journal *j,
 }
 
 int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
-                                unsigned, unsigned);
+                                 unsigned);
+
+#define JOURNAL_RES_GET_NONBLOCK       (1 << 0)
+#define JOURNAL_RES_GET_CHECK          (1 << 1)
 
 static inline int journal_res_get_fast(struct journal *j,
                                       struct journal_res *res,
-                                      unsigned u64s_min,
-                                      unsigned u64s_max)
+                                      unsigned flags)
 {
        union journal_res_state old, new;
        u64 v = atomic64_read(&j->reservations.counter);
@@ -289,42 +291,45 @@ static inline int journal_res_get_fast(struct journal *j,
                 * Check if there is still room in the current journal
                 * entry:
                 */
-               if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
+               if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
                        return 0;
 
-               res->offset     = old.cur_entry_offset;
-               res->u64s       = min(u64s_max, j->cur_entry_u64s -
-                                     old.cur_entry_offset);
+               if (flags & JOURNAL_RES_GET_CHECK)
+                       return 1;
 
-               journal_state_inc(&new);
                new.cur_entry_offset += res->u64s;
+               journal_state_inc(&new);
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
-       res->ref = true;
-       res->idx = new.idx;
-       res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
+       res->ref        = true;
+       res->idx        = old.idx;
+       res->offset     = old.cur_entry_offset;
+       res->seq        = le64_to_cpu(j->buf[old.idx].data->seq);
        return 1;
 }
 
 static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
-                                     unsigned u64s_min, unsigned u64s_max)
+                                      unsigned u64s, unsigned flags)
 {
        int ret;
 
        EBUG_ON(res->ref);
-       EBUG_ON(u64s_max < u64s_min);
        EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
 
-       if (journal_res_get_fast(j, res, u64s_min, u64s_max))
+       res->u64s = u64s;
+
+       if (journal_res_get_fast(j, res, flags))
                goto out;
 
-       ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
+       ret = bch2_journal_res_get_slowpath(j, res, flags);
        if (ret)
                return ret;
 out:
-       lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
-       EBUG_ON(!res->ref);
+       if (!(flags & JOURNAL_RES_GET_CHECK)) {
+               lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
+               EBUG_ON(!res->ref);
+       }
        return 0;
 }
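
With the reworked journal.h interface there are three usage patterns
(assumed usage, condensed from the hunks above):

    struct journal_res res;
    int ret;

    memset(&res, 0, sizeof(res));

    /* 1) blocking get (flags == 0), as in bch2_journal_meta(): */
    ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);

    /* 2) non-blocking: returns -EAGAIN instead of sleeping, so it's
     *    safe while holding btree node locks: */
    ret = bch2_journal_res_get(j, &res, u64s, JOURNAL_RES_GET_NONBLOCK);

    /* 3) check only: blocks until u64s of space exist, but takes no
     *    reservation (res.ref stays false), so there is nothing to
     *    put afterwards: */
    ret = bch2_journal_res_get(j, &res, u64s, JOURNAL_RES_GET_CHECK);

Also note the change folded into journal_res_get_fast(): reservations
are now fixed-size (exactly the caller's res->u64s) rather than the
old variable min/max size.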