bcachefs: New erasure coding shutdown path
author     Kent Overstreet <kent.overstreet@linux.dev>
Tue, 14 Mar 2023 02:01:47 +0000 (22:01 -0400)
committer  Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:09:57 +0000 (17:09 -0400)
This implements a new shutdown path for erasure coding, which is needed
for the upcoming BCH_WRITE_WAIT_FOR_EC write path.

The process (sketched below) is:
 - Cancel new stripes being built up
 - Close out/cancel open buckets on write points or the partial list
   that are for stripes
 - Shut down rebalance/copygc
 - Then wait for in-flight new stripes to finish
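
A minimal sketch of the intended ordering, following the
__bch2_fs_read_only() hunk in this patch (unrelated teardown elided):

  static void __bch2_fs_read_only(struct bch_fs *c)
  {
          /* cancel pending stripes, close stripe open buckets: */
          bch2_fs_ec_stop(c);
          bch2_open_buckets_stop(c, NULL, true);

          /* background threads can now stop without blocking on EC: */
          bch2_rebalance_stop(c);
          bch2_copygc_stop(c);
          bch2_gc_thread_stop(c);

          /* wait for stripes already in flight to finish: */
          bch2_fs_ec_flush(c);

          /* ... rest of the read-only path ... */
  }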

With BCH_WRITE_WAIT_FOR_EC, move ops will wait for stripes to fill up
before they complete; the new ec shutdown path is needed so that
copygc/rebalance can be shut down without deadlocking.
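
The final wait is just a waitqueue keyed off ec_stripe_new_list (see the
ec.c hunk below); the assumption is that the stripe-create completion
path removes each finished stripe from that list and wakes
ec_stripe_new_wait, so the wait below can only block on stripes that
were already in flight:

  void bch2_fs_ec_flush(struct bch_fs *c)
  {
          /* returns once no new stripes remain on ec_stripe_new_list */
          wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
  }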

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/alloc_background.c
fs/bcachefs/alloc_foreground.c
fs/bcachefs/alloc_foreground.h
fs/bcachefs/bcachefs.h
fs/bcachefs/data_update.c
fs/bcachefs/ec.c
fs/bcachefs/ec.h
fs/bcachefs/io.c
fs/bcachefs/move.c
fs/bcachefs/super.c

index e5abe6406afe22211ead53c6f8cb0997a3f16579..17bcebbd1f2a828fe2e46ae22be78c810c443f2b 100644 (file)
@@ -2158,44 +2158,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
         */
        bch2_recalc_capacity(c);
 
-       /* Next, close write points that point to this device... */
-       for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
-               bch2_writepoint_stop(c, ca, &c->write_points[i]);
-
-       bch2_writepoint_stop(c, ca, &c->copygc_write_point);
-       bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
-       bch2_writepoint_stop(c, ca, &c->btree_write_point);
-
-       mutex_lock(&c->btree_reserve_cache_lock);
-       while (c->btree_reserve_cache_nr) {
-               struct btree_alloc *a =
-                       &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
-
-               bch2_open_buckets_put(c, &a->ob);
-       }
-       mutex_unlock(&c->btree_reserve_cache_lock);
-
-       spin_lock(&c->freelist_lock);
-       i = 0;
-       while (i < c->open_buckets_partial_nr) {
-               struct open_bucket *ob =
-                       c->open_buckets + c->open_buckets_partial[i];
-
-               if (ob->dev == ca->dev_idx) {
-                       --c->open_buckets_partial_nr;
-                       swap(c->open_buckets_partial[i],
-                            c->open_buckets_partial[c->open_buckets_partial_nr]);
-                       ob->on_partial_list = false;
-                       spin_unlock(&c->freelist_lock);
-                       bch2_open_bucket_put(c, ob);
-                       spin_lock(&c->freelist_lock);
-               } else {
-                       i++;
-               }
-       }
-       spin_unlock(&c->freelist_lock);
-
-       bch2_ec_stop_dev(c, ca);
+       bch2_open_buckets_stop(c, ca, false);
 
        /*
         * Wake up threads that were blocked on allocation, so they can notice
index 7c81189bcd621e77d385abc960a3cb20a9c05405..20c64882104e263f86dea2c7a31dc4168947d2de 100644 (file)
@@ -1023,45 +1023,96 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
        return ret < 0 ? ret : 0;
 }
 
-void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
-                               struct open_buckets *obs)
+static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
+                              struct bch_dev *ca, bool ec)
 {
-       struct open_buckets ptrs = { .nr = 0 };
-       struct open_bucket *ob, *ob2;
-       unsigned i, j;
-
-       open_bucket_for_each(c, obs, ob, i) {
-               bool drop = !ca || ob->dev == ca->dev_idx;
+       if (ec) {
+               return ob->ec != NULL;
+       } else if (ca) {
+               bool drop = ob->dev == ca->dev_idx;
+               struct open_bucket *ob2;
+               unsigned i;
 
                if (!drop && ob->ec) {
                        mutex_lock(&ob->ec->lock);
-                       for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) {
-                               if (!ob->ec->blocks[j])
+                       for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) {
+                               if (!ob->ec->blocks[i])
                                        continue;
 
-                               ob2 = c->open_buckets + ob->ec->blocks[j];
+                               ob2 = c->open_buckets + ob->ec->blocks[i];
                                drop |= ob2->dev == ca->dev_idx;
                        }
                        mutex_unlock(&ob->ec->lock);
                }
 
-               if (drop)
-                       bch2_open_bucket_put(c, ob);
-               else
-                       ob_push(c, &ptrs, ob);
+               return drop;
+       } else {
+               return true;
        }
-
-       *obs = ptrs;
 }
 
-void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
-                         struct write_point *wp)
+static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
+                                bool ec, struct write_point *wp)
 {
+       struct open_buckets ptrs = { .nr = 0 };
+       struct open_bucket *ob;
+       unsigned i;
+
        mutex_lock(&wp->lock);
-       bch2_open_buckets_stop_dev(c, ca, &wp->ptrs);
+       open_bucket_for_each(c, &wp->ptrs, ob, i)
+               if (should_drop_bucket(ob, c, ca, ec))
+                       bch2_open_bucket_put(c, ob);
+               else
+                       ob_push(c, &ptrs, ob);
+       wp->ptrs = ptrs;
        mutex_unlock(&wp->lock);
 }
 
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
+                           bool ec)
+{
+       unsigned i;
+
+       /* Next, close write points that point to this device... */
+       for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+               bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
+
+       bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
+       bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
+       bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
+
+       mutex_lock(&c->btree_reserve_cache_lock);
+       while (c->btree_reserve_cache_nr) {
+               struct btree_alloc *a =
+                       &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
+
+               bch2_open_buckets_put(c, &a->ob);
+       }
+       mutex_unlock(&c->btree_reserve_cache_lock);
+
+       spin_lock(&c->freelist_lock);
+       i = 0;
+       while (i < c->open_buckets_partial_nr) {
+               struct open_bucket *ob =
+                       c->open_buckets + c->open_buckets_partial[i];
+
+               if (should_drop_bucket(ob, c, ca, ec)) {
+                       --c->open_buckets_partial_nr;
+                       swap(c->open_buckets_partial[i],
+                            c->open_buckets_partial[c->open_buckets_partial_nr]);
+                       ob->on_partial_list = false;
+                       spin_unlock(&c->freelist_lock);
+                       bch2_open_bucket_put(c, ob);
+                       spin_lock(&c->freelist_lock);
+               } else {
+                       i++;
+               }
+       }
+       spin_unlock(&c->freelist_lock);
+
+       bch2_ec_stop_dev(c, ca);
+}
+
 static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
                                                 unsigned long write_point)
 {
@@ -1107,8 +1158,7 @@ static bool try_increase_writepoints(struct bch_fs *c)
        return true;
 }
 
-static bool try_decrease_writepoints(struct bch_fs *c,
-                                    unsigned old_nr)
+static bool try_decrease_writepoints(struct bch_fs *c, unsigned old_nr)
 {
        struct write_point *wp;
 
@@ -1129,7 +1179,7 @@ static bool try_decrease_writepoints(struct bch_fs *c,
        hlist_del_rcu(&wp->node);
        mutex_unlock(&c->write_points_hash_lock);
 
-       bch2_writepoint_stop(c, NULL, wp);
+       bch2_writepoint_stop(c, NULL, false, wp);
        return true;
 }
 
index 1fa96f8c68796882e282c4880f69ef5d1b702878..8a1cf425091b8aacd8438f1222b688074c60bffb 100644 (file)
@@ -202,11 +202,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
                                    struct bkey_i *, unsigned, bool);
 void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
 
-void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
-                               struct open_buckets *);
-
-void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
-                         struct write_point *);
+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
 
 static inline struct write_point_specifier writepoint_hashed(unsigned long v)
 {
index 8be65ebb34adf5d5c9c20223f939b82a2615ede2..05fc0f7434dd2530766430b44e23068e022d9720 100644 (file)
@@ -655,7 +655,6 @@ typedef struct {
        x(fallocate)                                                    \
        x(discard)                                                      \
        x(invalidate)                                                   \
-       x(move)                                                         \
        x(delete_dead_snapshots)                                        \
        x(snapshot_delete_pagecache)                                    \
        x(sysfs)
@@ -958,14 +957,14 @@ struct bch_fs {
 
        struct list_head        ec_stripe_new_list;
        struct mutex            ec_stripe_new_lock;
+       wait_queue_head_t       ec_stripe_new_wait;
 
        struct work_struct      ec_stripe_create_work;
        u64                     ec_stripe_hint;
 
-       struct bio_set          ec_bioset;
-
        struct work_struct      ec_stripe_delete_work;
-       struct llist_head       ec_stripe_delete_list;
+
+       struct bio_set          ec_bioset;
 
        /* REFLINK */
        u64                     reflink_hint;
index 447863825a8923580e195c50db9af0e062df5be9..5ec884a222f8c1a2a8cb432e7463304369ff6839 100644 (file)
@@ -252,6 +252,7 @@ restart_drop_extra_replicas:
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                        bch2_trans_commit(trans, &op->res,
                                NULL,
+                               BTREE_INSERT_NOCHECK_RW|
                                BTREE_INSERT_NOFAIL|
                                m->data_opts.btree_insert_flags);
                if (!ret) {
index af3a72acc67f474563958500e04419bfa32b9c1c..1e621dcc1d3724a44605356fc5e09cbaf54b8bd3 100644 (file)
@@ -989,6 +989,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
 
        while (1) {
                ret = commit_do(trans, NULL, NULL,
+                               BTREE_INSERT_NOCHECK_RW|
                                BTREE_INSERT_NOFAIL,
                        ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
                                                s, &bp_offset));
@@ -1127,7 +1128,9 @@ static void ec_stripe_create(struct ec_stripe_new *s)
                goto err;
        }
 
-       ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
+       ret = bch2_trans_do(c, &s->res, NULL,
+                           BTREE_INSERT_NOCHECK_RW|
+                           BTREE_INSERT_NOFAIL,
                            ec_stripe_key_update(&trans, &s->new_stripe.key,
                                                 !s->have_existing_stripe));
        if (ret) {
@@ -1409,6 +1412,11 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans,
        if (ret)
                return ERR_PTR(ret);
 
+       if (test_bit(BCH_FS_GOING_RO, &c->flags)) {
+               h = ERR_PTR(-EROFS);
+               goto found;
+       }
+
        list_for_each_entry(h, &c->ec_stripe_head_list, list)
                if (h->target           == target &&
                    h->algo             == algo &&
@@ -1753,7 +1761,7 @@ err:
        return ERR_PTR(ret);
 }
 
-void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
 {
        struct ec_stripe_head *h;
        struct open_bucket *ob;
@@ -1761,11 +1769,13 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
 
        mutex_lock(&c->ec_stripe_head_lock);
        list_for_each_entry(h, &c->ec_stripe_head_list, list) {
-
                mutex_lock(&h->lock);
                if (!h->s)
                        goto unlock;
 
+               if (!ca)
+                       goto found;
+
                for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
                        if (!h->s->blocks[i])
                                continue;
@@ -1784,6 +1794,32 @@ unlock:
        mutex_unlock(&c->ec_stripe_head_lock);
 }
 
+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+       __bch2_ec_stop(c, ca);
+}
+
+void bch2_fs_ec_stop(struct bch_fs *c)
+{
+       __bch2_ec_stop(c, NULL);
+}
+
+static bool bch2_fs_ec_flush_done(struct bch_fs *c)
+{
+       bool ret;
+
+       mutex_lock(&c->ec_stripe_new_lock);
+       ret = list_empty(&c->ec_stripe_new_list);
+       mutex_unlock(&c->ec_stripe_new_lock);
+
+       return ret;
+}
+
+void bch2_fs_ec_flush(struct bch_fs *c)
+{
+       wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
+}
+
 int bch2_stripes_read(struct bch_fs *c)
 {
        struct btree_trans trans;
@@ -1915,14 +1951,22 @@ void bch2_fs_ec_exit(struct bch_fs *c)
 
 void bch2_fs_ec_init_early(struct bch_fs *c)
 {
+       spin_lock_init(&c->ec_stripes_new_lock);
+       mutex_init(&c->ec_stripes_heap_lock);
+
+       INIT_LIST_HEAD(&c->ec_stripe_head_list);
+       mutex_init(&c->ec_stripe_head_lock);
+
+       INIT_LIST_HEAD(&c->ec_stripe_new_list);
+       mutex_init(&c->ec_stripe_new_lock);
+       init_waitqueue_head(&c->ec_stripe_new_wait);
+
        INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
        INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
 }
 
 int bch2_fs_ec_init(struct bch_fs *c)
 {
-       spin_lock_init(&c->ec_stripes_new_lock);
-
        return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
                           BIOSET_NEED_BVECS);
 }
index 8f777a37e43d77591506271a8a205f61261c8cfa..7c08a49d741956534a229e92e2d787ceacb3b407 100644 (file)
@@ -245,8 +245,8 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
 }
 
 void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
-
-void bch2_ec_flush_new_stripes(struct bch_fs *);
+void bch2_fs_ec_stop(struct bch_fs *);
+void bch2_fs_ec_flush(struct bch_fs *);
 
 int bch2_stripes_read(struct bch_fs *);
 
index 1b093650ff9a0ad2d85ed19b87649567fe026b6d..e82da496b3f8ceb25c30efd1317d196a103cdd3d 100644 (file)
@@ -705,7 +705,8 @@ static void bch2_write_done(struct closure *cl)
        struct bch_fs *c = op->c;
 
        bch2_disk_reservation_put(c, &op->res);
-       bch2_write_ref_put(c, BCH_WRITE_REF_write);
+       if (!(op->flags & BCH_WRITE_MOVE))
+               bch2_write_ref_put(c, BCH_WRITE_REF_write);
        bch2_keylist_free(&op->insert_keys, op->inline_keys);
 
        bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
@@ -1842,7 +1843,12 @@ void bch2_write(struct closure *cl)
                goto err;
        }
 
-       if (c->opts.nochanges ||
+       if (c->opts.nochanges) {
+               op->error = -BCH_ERR_erofs_no_writes;
+               goto err;
+       }
+
+       if (!(op->flags & BCH_WRITE_MOVE) &&
            !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
                op->error = -BCH_ERR_erofs_no_writes;
                goto err;
index f74ef947cac503603f185f2dc20ff7381470ca51..4a9ffca7be629b95d9af247379a65b0ba5498d75 100644 (file)
@@ -59,7 +59,6 @@ struct moving_io {
 static void move_free(struct moving_io *io)
 {
        struct moving_context *ctxt = io->write.ctxt;
-       struct bch_fs *c = ctxt->c;
 
        if (io->b)
                atomic_dec(&io->b->count);
@@ -71,7 +70,6 @@ static void move_free(struct moving_io *io)
        wake_up(&ctxt->wait);
        mutex_unlock(&ctxt->lock);
 
-       bch2_write_ref_put(c, BCH_WRITE_REF_move);
        kfree(io);
 }
 
@@ -280,9 +278,6 @@ static int bch2_move_extent(struct btree_trans *trans,
                return 0;
        }
 
-       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move))
-               return -BCH_ERR_erofs_no_writes;
-
        /*
         * Before memory allocations & taking nocow locks in
         * bch2_data_update_init():
@@ -378,7 +373,6 @@ err_free_pages:
 err_free:
        kfree(io);
 err:
-       bch2_write_ref_put(c, BCH_WRITE_REF_move);
        trace_and_count(c, move_extent_alloc_mem_fail, k.k);
        return ret;
 }
index bf3aabdb0fc97d89161b83aa2293d7cf926481f0..278f8f19a2307c9b9eb78a11c26280635cda1126 100644 (file)
@@ -205,9 +205,12 @@ static void __bch2_fs_read_only(struct bch_fs *c)
        unsigned i, clean_passes = 0;
        u64 seq = 0;
 
+       bch2_fs_ec_stop(c);
+       bch2_open_buckets_stop(c, NULL, true);
        bch2_rebalance_stop(c);
        bch2_copygc_stop(c);
        bch2_gc_thread_stop(c);
+       bch2_fs_ec_flush(c);
 
        bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
                    journal_cur_seq(&c->journal));
@@ -700,15 +703,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        INIT_LIST_HEAD(&c->fsck_errors);
        mutex_init(&c->fsck_error_lock);
 
-       INIT_LIST_HEAD(&c->ec_stripe_head_list);
-       mutex_init(&c->ec_stripe_head_lock);
-
-       INIT_LIST_HEAD(&c->ec_stripe_new_list);
-       mutex_init(&c->ec_stripe_new_lock);
-
-
-       mutex_init(&c->ec_stripes_heap_lock);
-
        seqcount_init(&c->gc_pos_lock);
 
        seqcount_init(&c->usage_lock);