dm bio prison v1: improve concurrent IO performance
author Joe Thornber <ejt@redhat.com>
Thu, 2 Mar 2023 09:35:37 +0000 (09:35 +0000)
committer Mike Snitzer <snitzer@kernel.org>
Thu, 30 Mar 2023 19:57:51 +0000 (15:57 -0400)
Split the bio prison into multiple regions, with a separate rbtree and
associated lock for each region.
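
As a rough illustration of the striping scheme (not part of the patch; a
stand-alone user-space sketch using the constants the patch introduces,
NR_LOCKS = 64 and BIO_PRISON_MAX_RANGE_SHIFT = 10), a cell key is assigned
to a region by the aligned 1024-block window its block_begin falls into,
mirroring the lock_nr() helper added below:

  #include <stdint.h>
  #include <stdio.h>

  #define NR_LOCKS 64
  #define LOCK_MASK (NR_LOCKS - 1)
  #define BIO_PRISON_MAX_RANGE_SHIFT 10   /* 1024-block windows */

  /* Same shift-and-mask as the patch's lock_nr() helper. */
  static unsigned lock_nr(uint64_t block_begin)
  {
          return (block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & LOCK_MASK;
  }

  int main(void)
  {
          /*
           * Blocks 0..1023 share region 0, blocks 1024..2047 use region 1,
           * and the mapping wraps after 64 regions (block 65536 lands in
           * region 0 again), so unrelated IO tends to take different locks.
           */
          printf("%u %u %u %u\n",
                 lock_nr(0), lock_nr(1023), lock_nr(1024), lock_nr(65536));
          return 0;
  }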

To keep bio prison locking fast without hurting discard performance
too much, the bio-prison now stipulates that discards must not cross
a BIO_PRISON_MAX_RANGE boundary.
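
For example, with BIO_PRISON_MAX_RANGE = 1024 a key covering blocks
1000..1100 would straddle the boundary at block 1024 and therefore span
two regions (and two locks), so it could not be detained under a single
region lock.  The new check_range() helper BUG_ONs on such keys; discards
are split beforehand so this never happens.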

Because the range of a key (block_end - block_begin) must not exceed
BIO_PRISON_MAX_RANGE, break_up_discard_bio() now ensures the data
range reflected in the PHYSICAL key doesn't exceed BIO_PRISON_MAX_RANGE.
Splitting the thin target's discards (handled with a VIRTUAL key) is
achieved by updating dm-thin.c to set limits->max_discard_sectors in
terms of BIO_PRISON_MAX_RANGE _and_ setting the thin and thin-pool
targets' max_discard_granularity to true.
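
As a worked example (the block size here is purely illustrative): with a
pool block size of 64KiB, i.e. pool->sectors_per_block = 128,
limits->max_discard_sectors becomes 128 * BIO_PRISON_MAX_RANGE = 131072
sectors (64MiB), so each discard that reaches the thin target covers at
most BIO_PRISON_MAX_RANGE thin blocks.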

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
drivers/md/dm-bio-prison-v1.c
drivers/md/dm-bio-prison-v1.h
drivers/md/dm-thin.c

diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index c4c05d5d890928332c53cea1e6ba5216df3513f0..2b8af861e5f6d32450b913d556015f248557aefb 100644
 
 /*----------------------------------------------------------------*/
 
+#define NR_LOCKS 64
+#define LOCK_MASK (NR_LOCKS - 1)
 #define MIN_CELLS 1024
 
-struct dm_bio_prison {
+struct prison_region {
        spinlock_t lock;
-       struct rb_root cells;
+       struct rb_root cell;
+} ____cacheline_aligned_in_smp;
+
+struct dm_bio_prison {
+       struct prison_region regions[NR_LOCKS];
        mempool_t cell_pool;
 };
 
@@ -34,13 +40,17 @@ static struct kmem_cache *_cell_cache;
  */
 struct dm_bio_prison *dm_bio_prison_create(void)
 {
-       struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
        int ret;
+       unsigned i;
+       struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
 
        if (!prison)
                return NULL;
 
-       spin_lock_init(&prison->lock);
+       for (i = 0; i < NR_LOCKS; i++) {
+               spin_lock_init(&prison->regions[i].lock);
+               prison->regions[i].cell = RB_ROOT;
+       }
 
        ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache);
        if (ret) {
@@ -48,8 +58,6 @@ struct dm_bio_prison *dm_bio_prison_create(void)
                return NULL;
        }
 
-       prison->cells = RB_ROOT;
-
        return prison;
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_create);
@@ -107,14 +115,26 @@ static int cmp_keys(struct dm_cell_key *lhs,
        return 0;
 }
 
-static int __bio_detain(struct dm_bio_prison *prison,
+static unsigned lock_nr(struct dm_cell_key *key)
+{
+       return (key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & LOCK_MASK;
+}
+
+static void check_range(struct dm_cell_key *key)
+{
+       BUG_ON(key->block_end - key->block_begin > BIO_PRISON_MAX_RANGE);
+       BUG_ON((key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) !=
+              ((key->block_end - 1) >> BIO_PRISON_MAX_RANGE_SHIFT));
+}
+
+static int __bio_detain(struct rb_root *root,
                        struct dm_cell_key *key,
                        struct bio *inmate,
                        struct dm_bio_prison_cell *cell_prealloc,
                        struct dm_bio_prison_cell **cell_result)
 {
        int r;
-       struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
+       struct rb_node **new = &root->rb_node, *parent = NULL;
 
        while (*new) {
                struct dm_bio_prison_cell *cell =
@@ -139,7 +159,7 @@ static int __bio_detain(struct dm_bio_prison *prison,
        *cell_result = cell_prealloc;
 
        rb_link_node(&cell_prealloc->node, parent, new);
-       rb_insert_color(&cell_prealloc->node, &prison->cells);
+       rb_insert_color(&cell_prealloc->node, root);
 
        return 0;
 }
@@ -151,10 +171,12 @@ static int bio_detain(struct dm_bio_prison *prison,
                      struct dm_bio_prison_cell **cell_result)
 {
        int r;
+       unsigned l = lock_nr(key);
+       check_range(key);
 
-       spin_lock_irq(&prison->lock);
-       r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
-       spin_unlock_irq(&prison->lock);
+       spin_lock_irq(&prison->regions[l].lock);
+       r = __bio_detain(&prison->regions[l].cell, key, inmate, cell_prealloc, cell_result);
+       spin_unlock_irq(&prison->regions[l].lock);
 
        return r;
 }
@@ -181,11 +203,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell);
 /*
  * @inmates must have been initialised prior to this call
  */
-static void __cell_release(struct dm_bio_prison *prison,
+static void __cell_release(struct rb_root *root,
                           struct dm_bio_prison_cell *cell,
                           struct bio_list *inmates)
 {
-       rb_erase(&cell->node, &prison->cells);
+       rb_erase(&cell->node, root);
 
        if (inmates) {
                if (cell->holder)
@@ -198,20 +220,22 @@ void dm_cell_release(struct dm_bio_prison *prison,
                     struct dm_bio_prison_cell *cell,
                     struct bio_list *bios)
 {
-       spin_lock_irq(&prison->lock);
-       __cell_release(prison, cell, bios);
-       spin_unlock_irq(&prison->lock);
+       unsigned l = lock_nr(&cell->key);
+
+       spin_lock_irq(&prison->regions[l].lock);
+       __cell_release(&prison->regions[l].cell, cell, bios);
+       spin_unlock_irq(&prison->regions[l].lock);
 }
 EXPORT_SYMBOL_GPL(dm_cell_release);
 
 /*
  * Sometimes we don't want the holder, just the additional bios.
  */
-static void __cell_release_no_holder(struct dm_bio_prison *prison,
+static void __cell_release_no_holder(struct rb_root *root,
                                     struct dm_bio_prison_cell *cell,
                                     struct bio_list *inmates)
 {
-       rb_erase(&cell->node, &prison->cells);
+       rb_erase(&cell->node, root);
        bio_list_merge(inmates, &cell->bios);
 }
 
@@ -219,11 +243,12 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
                               struct dm_bio_prison_cell *cell,
                               struct bio_list *inmates)
 {
+       unsigned l = lock_nr(&cell->key);
        unsigned long flags;
 
-       spin_lock_irqsave(&prison->lock, flags);
-       __cell_release_no_holder(prison, cell, inmates);
-       spin_unlock_irqrestore(&prison->lock, flags);
+       spin_lock_irqsave(&prison->regions[l].lock, flags);
+       __cell_release_no_holder(&prison->regions[l].cell, cell, inmates);
+       spin_unlock_irqrestore(&prison->regions[l].lock, flags);
 }
 EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
 
@@ -248,18 +273,19 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
                           void *context,
                           struct dm_bio_prison_cell *cell)
 {
-       spin_lock_irq(&prison->lock);
+       unsigned l = lock_nr(&cell->key);
+       spin_lock_irq(&prison->regions[l].lock);
        visit_fn(context, cell);
-       rb_erase(&cell->node, &prison->cells);
-       spin_unlock_irq(&prison->lock);
+       rb_erase(&cell->node, &prison->regions[l].cell);
+       spin_unlock_irq(&prison->regions[l].lock);
 }
 EXPORT_SYMBOL_GPL(dm_cell_visit_release);
 
-static int __promote_or_release(struct dm_bio_prison *prison,
+static int __promote_or_release(struct rb_root *root,
                                struct dm_bio_prison_cell *cell)
 {
        if (bio_list_empty(&cell->bios)) {
-               rb_erase(&cell->node, &prison->cells);
+               rb_erase(&cell->node, root);
                return 1;
        }
 
@@ -271,10 +297,11 @@ int dm_cell_promote_or_release(struct dm_bio_prison *prison,
                               struct dm_bio_prison_cell *cell)
 {
        int r;
+       unsigned l = lock_nr(&cell->key);
 
-       spin_lock_irq(&prison->lock);
-       r = __promote_or_release(prison, cell);
-       spin_unlock_irq(&prison->lock);
+       spin_lock_irq(&prison->regions[l].lock);
+       r = __promote_or_release(&prison->regions[l].cell, cell);
+       spin_unlock_irq(&prison->regions[l].lock);
 
        return r;
 }
diff --git a/drivers/md/dm-bio-prison-v1.h b/drivers/md/dm-bio-prison-v1.h
index dfbf1e94cb7585d4f45d318298fca2e3dc6ec855..0b8acd6708fb5cac524a09b4499debb243ea9bd0 100644
@@ -34,6 +34,16 @@ struct dm_cell_key {
        dm_block_t block_begin, block_end;
 };
 
+/*
+ * The range of a key (block_end - block_begin) must not
+ * exceed BIO_PRISON_MAX_RANGE.  Also the range must not
+ * cross a similarly sized boundary.
+ *
+ * Must be a power of 2.
+ */
+#define BIO_PRISON_MAX_RANGE 1024
+#define BIO_PRISON_MAX_RANGE_SHIFT 10
+
 /*
  * Treat this as opaque, only in header so callers can manage allocation
  * themselves.
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 00323428919eaee2e3469c3035c0bacd3f7779e0..33ad5695f9598726e468c180f067737554c80b6b 100644
@@ -1674,54 +1674,69 @@ static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t
        struct dm_cell_key data_key;
        struct dm_bio_prison_cell *data_cell;
        struct dm_thin_new_mapping *m;
-       dm_block_t virt_begin, virt_end, data_begin;
+       dm_block_t virt_begin, virt_end, data_begin, data_end;
+       dm_block_t len, next_boundary;
 
        while (begin != end) {
-               r = ensure_next_mapping(pool);
-               if (r)
-                       /* we did our best */
-                       return;
-
                r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
                                              &data_begin, &maybe_shared);
-               if (r)
+               if (r) {
                        /*
                         * Silently fail, letting any mappings we've
                         * created complete.
                         */
                        break;
-
-               build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
-               if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
-                       /* contention, we'll give up with this range */
-                       begin = virt_end;
-                       continue;
                }
 
-               /*
-                * IO may still be going to the destination block.  We must
-                * quiesce before we can do the removal.
-                */
-               m = get_next_mapping(pool);
-               m->tc = tc;
-               m->maybe_shared = maybe_shared;
-               m->virt_begin = virt_begin;
-               m->virt_end = virt_end;
-               m->data_block = data_begin;
-               m->cell = data_cell;
-               m->bio = bio;
+               data_end = data_begin + (virt_end - virt_begin);
 
                /*
-                * The parent bio must not complete before sub discard bios are
-                * chained to it (see end_discard's bio_chain)!
-                *
-                * This per-mapping bi_remaining increment is paired with
-                * the implicit decrement that occurs via bio_endio() in
-                * end_discard().
+                * Make sure the data region obeys the bio prison restrictions.
                 */
-               bio_inc_remaining(bio);
-               if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
-                       pool->process_prepared_discard(m);
+               while (data_begin < data_end) {
+                       r = ensure_next_mapping(pool);
+                       if (r)
+                               return; /* we did our best */
+
+                       next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + 1)
+                               << BIO_PRISON_MAX_RANGE_SHIFT;
+                       len = min_t(sector_t, data_end - data_begin, next_boundary - data_begin);
+
+                       build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key);
+                       if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
+                               /* contention, we'll give up with this range */
+                               data_begin += len;
+                               continue;
+                       }
+
+                       /*
+                        * IO may still be going to the destination block.  We must
+                        * quiesce before we can do the removal.
+                        */
+                       m = get_next_mapping(pool);
+                       m->tc = tc;
+                       m->maybe_shared = maybe_shared;
+                       m->virt_begin = virt_begin;
+                       m->virt_end = virt_begin + len;
+                       m->data_block = data_begin;
+                       m->cell = data_cell;
+                       m->bio = bio;
+
+                       /*
+                        * The parent bio must not complete before sub discard bios are
+                        * chained to it (see end_discard's bio_chain)!
+                        *
+                        * This per-mapping bi_remaining increment is paired with
+                        * the implicit decrement that occurs via bio_endio() in
+                        * end_discard().
+                        */
+                       bio_inc_remaining(bio);
+                       if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+                               pool->process_prepared_discard(m);
+
+                       virt_begin += len;
+                       data_begin += len;
+               }
 
                begin = virt_end;
        }
@@ -3380,13 +3395,13 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
         */
        if (pf.discard_enabled && pf.discard_passdown) {
                ti->num_discard_bios = 1;
-
                /*
                 * Setting 'discards_supported' circumvents the normal
                 * stacking of discard limits (this keeps the pool and
                 * thin devices' discard limits consistent).
                 */
                ti->discards_supported = true;
+               ti->max_discard_granularity = true;
        }
        ti->private = pt;
 
@@ -4096,7 +4111,7 @@ static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
-       .version = {1, 22, 0},
+       .version = {1, 23, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
@@ -4261,6 +4276,7 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        if (tc->pool->pf.discard_enabled) {
                ti->discards_supported = true;
                ti->num_discard_bios = 1;
+               ti->max_discard_granularity = true;
        }
 
        mutex_unlock(&dm_thin_pool_table.mutex);
@@ -4476,12 +4492,12 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
                return;
 
        limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-       limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
+       limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
 }
 
 static struct target_type thin_target = {
        .name = "thin",
-       .version = {1, 22, 0},
+       .version = {1, 23, 0},
        .module = THIS_MODULE,
        .ctr = thin_ctr,
        .dtr = thin_dtr,