bcachefs: Journal resize fixes
authorKent Overstreet <kent.overstreet@linux.dev>
Mon, 6 Mar 2023 10:29:12 +0000 (05:29 -0500)
committerKent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:56 +0000 (17:09 -0400)
 - Fix a sleeping-in-atomic bug due to calling
   bch2_journal_buckets_to_sb() under the journal lock.
 - Additionally, now we mark buckets as journal buckets before adding
   them to the journal in memory and the superblock. This ensures that
   if we crash part way through we'll never be writing to journal
   buckets that aren't marked correctly.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/buckets.c
fs/bcachefs/journal.c
fs/bcachefs/journal_sb.c
fs/bcachefs/journal_sb.h

index c7139dd8e1dc995796b44a3a80a6ee9d3c4ccd08..19b4e2bde399804bd32aae514521dd23ff9e349c 100644 (file)
@@ -1855,7 +1855,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
        if (IS_ERR(a))
                return PTR_ERR(a);
 
-       if (a->v.data_type && a->v.data_type != type) {
+       if (a->v.data_type && type && a->v.data_type != type) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
                        "while marking %s",
index 00e806a6424721810c3e5b8a4b2a5d33e3f8873c..3cc93caf563a8e23bfd53c248ddf3d7627ccad30 100644 (file)
@@ -757,19 +757,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
        u64 *new_bucket_seq = NULL, *new_buckets = NULL;
        struct open_bucket **ob = NULL;
        long *bu = NULL;
-       unsigned i, nr_got = 0, nr_want = nr - ja->nr;
-       unsigned old_nr                 = ja->nr;
-       unsigned old_discard_idx        = ja->discard_idx;
-       unsigned old_dirty_idx_ondisk   = ja->dirty_idx_ondisk;
-       unsigned old_dirty_idx          = ja->dirty_idx;
-       unsigned old_cur_idx            = ja->cur_idx;
+       unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr;
        int ret = 0;
 
-       if (c) {
-               bch2_journal_flush_all_pins(&c->journal);
-               bch2_journal_block(&c->journal);
-               mutex_lock(&c->sb_lock);
-       }
+       BUG_ON(nr <= ja->nr);
 
        bu              = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL);
        ob              = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL);
@@ -777,7 +768,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
        new_bucket_seq  = kcalloc(nr, sizeof(u64), GFP_KERNEL);
        if (!bu || !ob || !new_buckets || !new_bucket_seq) {
                ret = -ENOMEM;
-               goto err_unblock;
+               goto err_free;
        }
 
        for (nr_got = 0; nr_got < nr_want; nr_got++) {
@@ -794,87 +785,92 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                        if (ret)
                                break;
 
+                       ret = bch2_trans_run(c,
+                               bch2_trans_mark_metadata_bucket(&trans, ca,
+                                               ob[nr_got]->bucket, BCH_DATA_journal,
+                                               ca->mi.bucket_size));
+                       if (ret) {
+                               bch2_open_bucket_put(c, ob[nr_got]);
+                               bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret));
+                               break;
+                       }
+
                        bu[nr_got] = ob[nr_got]->bucket;
                }
        }
 
        if (!nr_got)
-               goto err_unblock;
+               goto err_free;
 
-       /*
-        * We may be called from the device add path, before the new device has
-        * actually been added to the running filesystem:
-        */
-       if (!new_fs)
-               spin_lock(&c->journal.lock);
+       /* Don't return an error if we successfully allocated some buckets: */
+       ret = 0;
+
+       if (c) {
+               bch2_journal_flush_all_pins(&c->journal);
+               bch2_journal_block(&c->journal);
+               mutex_lock(&c->sb_lock);
+       }
 
        memcpy(new_buckets,     ja->buckets,    ja->nr * sizeof(u64));
        memcpy(new_bucket_seq,  ja->bucket_seq, ja->nr * sizeof(u64));
-       swap(new_buckets,       ja->buckets);
-       swap(new_bucket_seq,    ja->bucket_seq);
+
+       BUG_ON(ja->discard_idx > ja->nr);
+
+       pos = ja->discard_idx ?: ja->nr;
+
+       memmove(new_buckets + pos + nr_got,
+               new_buckets + pos,
+               sizeof(new_buckets[0]) * (ja->nr - pos));
+       memmove(new_bucket_seq + pos + nr_got,
+               new_bucket_seq + pos,
+               sizeof(new_bucket_seq[0]) * (ja->nr - pos));
 
        for (i = 0; i < nr_got; i++) {
-               unsigned pos = ja->discard_idx ?: ja->nr;
-               long b = bu[i];
-
-               __array_insert_item(ja->buckets,                ja->nr, pos);
-               __array_insert_item(ja->bucket_seq,             ja->nr, pos);
-               ja->nr++;
-
-               ja->buckets[pos] = b;
-               ja->bucket_seq[pos] = 0;
-
-               if (pos <= ja->discard_idx)
-                       ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
-               if (pos <= ja->dirty_idx_ondisk)
-                       ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
-               if (pos <= ja->dirty_idx)
-                       ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
-               if (pos <= ja->cur_idx)
-                       ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+               new_buckets[pos + i] = bu[i];
+               new_bucket_seq[pos + i] = 0;
        }
 
-       ret = bch2_journal_buckets_to_sb(c, ca);
-       if (ret) {
-               /* Revert: */
-               swap(new_buckets,       ja->buckets);
-               swap(new_bucket_seq,    ja->bucket_seq);
-               ja->nr                  = old_nr;
-               ja->discard_idx         = old_discard_idx;
-               ja->dirty_idx_ondisk    = old_dirty_idx_ondisk;
-               ja->dirty_idx           = old_dirty_idx;
-               ja->cur_idx             = old_cur_idx;
-       }
+       nr = ja->nr + nr_got;
 
-       if (!new_fs)
-               spin_unlock(&c->journal.lock);
+       ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr);
+       if (ret)
+               goto err_unblock;
 
-       if (ja->nr != old_nr && !new_fs)
+       if (!new_fs)
                bch2_write_super(c);
 
+       /* Commit: */
        if (c)
-               bch2_journal_unblock(&c->journal);
+               spin_lock(&c->journal.lock);
 
-       if (ret)
-               goto err;
+       swap(new_buckets,       ja->buckets);
+       swap(new_bucket_seq,    ja->bucket_seq);
+       ja->nr = nr;
+
+       if (pos <= ja->discard_idx)
+               ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr;
+       if (pos <= ja->dirty_idx_ondisk)
+               ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr;
+       if (pos <= ja->dirty_idx)
+               ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr;
+       if (pos <= ja->cur_idx)
+               ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr;
 
-       if (!new_fs) {
-               for (i = 0; i < nr_got; i++) {
-                       ret = bch2_trans_run(c,
-                               bch2_trans_mark_metadata_bucket(&trans, ca,
-                                               bu[i], BCH_DATA_journal,
-                                               ca->mi.bucket_size));
-                       if (ret) {
-                               bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret);
-                               goto err;
-                       }
-               }
-       }
-err:
        if (c)
+               spin_unlock(&c->journal.lock);
+err_unblock:
+       if (c) {
+               bch2_journal_unblock(&c->journal);
                mutex_unlock(&c->sb_lock);
+       }
 
-       if (ob && !new_fs)
+       if (ret && !new_fs)
+               for (i = 0; i < nr_got; i++)
+                       bch2_trans_run(c,
+                               bch2_trans_mark_metadata_bucket(&trans, ca,
+                                               bu[i], BCH_DATA_free, 0));
+err_free:
+       if (!new_fs)
                for (i = 0; i < nr_got; i++)
                        bch2_open_bucket_put(c, ob[i]);
 
@@ -882,12 +878,7 @@ err:
        kfree(new_buckets);
        kfree(ob);
        kfree(bu);
-
        return ret;
-err_unblock:
-       if (c)
-               bch2_journal_unblock(&c->journal);
-       goto err;
 }
 
 /*
@@ -901,13 +892,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
        struct closure cl;
        int ret = 0;
 
+       closure_init_stack(&cl);
+
+       down_write(&c->state_lock);
+
        /* don't handle reducing nr of buckets yet: */
        if (nr < ja->nr)
-               return 0;
-
-       closure_init_stack(&cl);
+               goto unlock;
 
-       while (ja->nr != nr) {
+       while (ja->nr < nr) {
                struct disk_reservation disk_res = { 0, 0 };
 
                /*
@@ -938,7 +931,8 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
 
        if (ret)
                bch_err(c, "%s: err %s", __func__, bch2_err_str(ret));
-
+unlock:
+       up_write(&c->state_lock);
        return ret;
 }
 
index 9b933330a4c374e0aa9b00fd08a0ee178c582a41..5be7882342e0f1d2b26e0ab80c9c08d4d77e9a39 100644 (file)
@@ -175,46 +175,45 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
        .to_text        = bch2_sb_journal_v2_to_text,
 };
 
-int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca)
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
+                              u64 *buckets, unsigned nr)
 {
-       struct journal_device *ja = &ca->journal;
        struct bch_sb_field_journal_v2 *j;
-       unsigned i, dst = 0, nr = 1;
+       unsigned i, dst = 0, nr_compacted = 1;
 
        if (c)
                lockdep_assert_held(&c->sb_lock);
 
-       if (!ja->nr) {
+       if (!nr) {
                bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
                bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
                return 0;
        }
 
-       for (i = 0; i + 1 < ja->nr; i++)
-               if (ja->buckets[i] + 1 != ja->buckets[i + 1])
-                       nr++;
+       for (i = 0; i + 1 < nr; i++)
+               if (buckets[i] + 1 != buckets[i + 1])
+                       nr_compacted++;
 
        j = bch2_sb_resize_journal_v2(&ca->disk_sb,
-                                (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64));
+                        (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
        if (!j)
                return -BCH_ERR_ENOSPC_sb_journal;
 
        bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
 
-       j->d[dst].start = le64_to_cpu(ja->buckets[0]);
+       j->d[dst].start = le64_to_cpu(buckets[0]);
        j->d[dst].nr    = le64_to_cpu(1);
 
-       for (i = 1; i < ja->nr; i++) {
-               if (ja->buckets[i] == ja->buckets[i - 1] + 1) {
+       for (i = 1; i < nr; i++) {
+               if (buckets[i] == buckets[i - 1] + 1) {
                        le64_add_cpu(&j->d[dst].nr, 1);
                } else {
                        dst++;
-                       j->d[dst].start = le64_to_cpu(ja->buckets[i]);
+                       j->d[dst].start = le64_to_cpu(buckets[i]);
                        j->d[dst].nr    = le64_to_cpu(1);
                }
        }
 
-       BUG_ON(dst + 1 != nr);
-
+       BUG_ON(dst + 1 != nr_compacted);
        return 0;
 }
index a39192e9f6f4c5a0ff251366ce1e2268fae38172..ba40a7e8d90a32d391a3d99dd2b4a8ca1418fb23 100644 (file)
@@ -21,4 +21,4 @@ static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_j
 extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
 extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
 
-int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *);
+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned);