blk-mq: allocate tags in batches [for-5.6/block-test]
author Jens Axboe <axboe@kernel.dk>
Mon, 30 Dec 2019 05:15:21 +0000 (22:15 -0700)
committer Jens Axboe <axboe@kernel.dk>
Fri, 17 Jan 2020 15:27:16 +0000 (08:27 -0700)
Instead of grabbing tags one by one, grab a batch and store it as a local
cache in the software queue. Subsequent tag allocations can then be served
from that cache, without having to hit the shared tag map.

These batches are flushed back to the shared map if we run out of tags on
the hardware queue; the intent is that this should rarely happen.

This works very well in practice, with batch counts of anywhere from 40 to
60 seen regularly in testing.
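
As a rough userspace sketch of the idea (not the patch itself; the names
tag_cache, shared_map, get_batch() and get_tag() are invented for the
example, where the real code uses struct blk_mq_ctx_type,
__sbitmap_queue_get_batch() and __ffs()): a whole word of free bits is
claimed from the shared map once, and individual tags are then handed out
from the cached word until it runs dry.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct tag_cache {
        uint64_t tags;           /* bitmask of locally cached free tags */
        unsigned int tag_offset; /* tag number corresponding to bit 0 */
};

/* claim a whole word of free tags from the (toy) shared map at once */
static bool get_batch(uint64_t *shared_map, unsigned int base,
                      struct tag_cache *tc)
{
        if (!*shared_map)
                return false;
        tc->tags = *shared_map;
        tc->tag_offset = base;
        *shared_map = 0;
        return true;
}

/* hand out one tag from the local cache, refilling it when empty */
static int get_tag(uint64_t *shared_map, unsigned int base,
                   struct tag_cache *tc)
{
        int bit;

        if (!tc->tags && !get_batch(shared_map, base, tc))
                return -1;               /* shared map exhausted too */

        bit = __builtin_ctzll(tc->tags); /* like __ffs() */
        tc->tags &= ~(1ULL << bit);
        return bit + tc->tag_offset;
}

int main(void)
{
        uint64_t shared_map = ~0ULL;     /* 64 free tags: 0..63 */
        struct tag_cache tc = { 0, 0 };

        for (int i = 0; i < 4; i++)
                printf("tag %d\n", get_tag(&shared_map, 0, &tc));
        return 0;
}

The real implementation additionally has to protect the per-ctx cache
against the IPI-driven flush (hence the local_irq_disable() in
blk_mq_get_tag_batch()) and to return unused cached bits to the shared map
via __sbitmap_queue_clear_batch() when a hardware queue runs out of tags.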

Signed-off-by: Jens Axboe <axboe@kernel.dk>
block/blk-mq-debugfs.c
block/blk-mq-tag.c
block/blk-mq-tag.h
block/blk-mq.c
block/blk-mq.h
include/linux/blk-mq.h

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index e789f830ff596e33ef5396f321c74907452ae094..914be72d080ecc76d7944d1850a8db7617fe3d10 100644
@@ -659,6 +659,23 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
 CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
 CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
 
+static ssize_t ctx_tag_hit_write(void *data, const char __user *buf,
+                                   size_t count, loff_t *ppos)
+{
+       struct blk_mq_ctx *ctx = data;
+
+       ctx->tag_hit = ctx->tag_refill = 0;
+       return count;
+}
+
+static int ctx_tag_hit_show(void *data, struct seq_file *m)
+{
+       struct blk_mq_ctx *ctx = data;
+
+       seq_printf(m, "hit=%lu refills=%lu\n", ctx->tag_hit, ctx->tag_refill);
+       return 0;
+}
+
 static int ctx_dispatched_show(void *data, struct seq_file *m)
 {
        struct blk_mq_ctx *ctx = data;
@@ -800,6 +817,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
        {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
        {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
        {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
+       {"tag_hit", 0600, ctx_tag_hit_show, ctx_tag_hit_write},
        {"merged", 0600, ctx_merged_show, ctx_merged_write},
        {"completed", 0600, ctx_completed_show, ctx_completed_write},
        {},
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index fbacde454718583555b38f07d0fe88ad89b712c3..fe2273eec3128533e74a8f067869b60f8b207182 100644
@@ -8,9 +8,10 @@
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
-
 #include <linux/blk-mq.h>
 #include <linux/delay.h>
+#include <linux/cpu.h>
+
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
@@ -99,6 +100,103 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                return __sbitmap_queue_get(bt);
 }
 
+void blk_mq_tag_ctx_flush_batch(struct blk_mq_hw_ctx *hctx,
+                               struct blk_mq_ctx *ctx)
+{
+       struct sbitmap_queue *bt = &hctx->tags->bitmap_tags;
+       unsigned int i;
+
+       for (i = 0; i < hctx->queue->tag_set->nr_maps; i++) {
+               struct blk_mq_ctx_type *type = &ctx->type[i];
+
+               if (!type->tags)
+                       continue;
+
+               __sbitmap_queue_clear_batch(bt, type->tag_offset, type->tags);
+               type->tags = 0;
+       }
+}
+
+static void ctx_flush_ipi(void *data)
+{
+       struct blk_mq_hw_ctx *hctx = data;
+       struct blk_mq_ctx *ctx;
+
+       ctx = __blk_mq_get_ctx(hctx->queue, smp_processor_id());
+       blk_mq_tag_ctx_flush_batch(hctx, ctx);
+       atomic_dec(&hctx->flush_pending);
+}
+
+static void blk_mq_tag_flush_batches(struct blk_mq_hw_ctx *hctx)
+{
+       int cpu, err;
+
+       if (atomic_cmpxchg(&hctx->flush_pending, 0, hctx->nr_ctx))
+               return;
+       cpus_read_lock();
+       for_each_cpu(cpu, hctx->cpumask) {
+               err = smp_call_function_single(cpu, ctx_flush_ipi, hctx, 0);
+               if (err)
+                       atomic_dec(&hctx->flush_pending);
+       }
+       cpus_read_unlock();
+}
+
+void blk_mq_tag_queue_flush_batches(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       unsigned int i;
+
+       queue_for_each_hw_ctx(q, hctx, i)
+               blk_mq_tag_flush_batches(hctx);
+}
+
+static int blk_mq_get_tag_batch(struct blk_mq_alloc_data *data)
+{
+       struct blk_mq_hw_ctx *hctx = data->hctx;
+       struct blk_mq_ctx_type *type;
+       struct blk_mq_ctx *ctx = data->ctx;
+       struct blk_mq_tags *tags;
+       struct sbitmap_queue *bt;
+       int tag = -1;
+
+       if (!ctx || (data->flags & BLK_MQ_REQ_INTERNAL))
+               return -1;
+
+       tags = hctx->tags;
+       bt = &tags->bitmap_tags;
+       /* don't do batches for round-robin or (very) sparse maps */
+       if (bt->round_robin || bt->sb.shift < ilog2(BITS_PER_LONG) - 1)
+               return -1;
+
+       /* we could make do with preempt disable, but we need to block flush */
+       local_irq_disable();
+       if (unlikely(ctx->cpu != smp_processor_id()))
+               goto out;
+
+       type = &ctx->type[hctx->type];
+
+       if (type->tags) {
+get_tag:
+               ctx->tag_hit++;
+
+               tag = __ffs(type->tags);
+               type->tags &= ~(1UL << tag);
+               tag += type->tag_offset;
+out:
+               local_irq_enable();
+               return tag;
+       }
+
+       /* no current tag cache, attempt to refill a batch */
+       if (!__sbitmap_queue_get_batch(bt, &type->tag_offset, &type->tags)) {
+               ctx->tag_refill++;
+               goto get_tag;
+       }
+
+       goto out;
+}
+
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@@ -116,8 +214,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
                bt = &tags->breserved_tags;
                tag_offset = 0;
        } else {
-               bt = &tags->bitmap_tags;
                tag_offset = tags->nr_reserved_tags;
+
+               tag = blk_mq_get_tag_batch(data);
+               if (tag != -1)
+                       goto found_tag;
+
+               bt = &tags->bitmap_tags;
        }
 
        tag = __blk_mq_get_tag(data, bt);
@@ -146,6 +249,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
                if (tag != -1)
                        break;
 
+               if (!(data->flags & BLK_MQ_REQ_RESERVED))
+                       blk_mq_tag_flush_batches(data->hctx);
+
                sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
                tag = __blk_mq_get_tag(data, bt);
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 15bc74acb57eca1c56bf0564cb000c002ea08618..b5964fff163080099ac3d5caea87ef87f9df76d6 100644
@@ -34,6 +34,9 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                void *priv);
+void blk_mq_tag_queue_flush_batches(struct request_queue *q);
+void blk_mq_tag_ctx_flush_batch(struct blk_mq_hw_ctx *hctx,
+                               struct blk_mq_ctx *ctx);
 
 static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
                                                 struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cc48a0ffa5ec0ac8a94061bfd5657e67a35da801..81140f61a7c9ca31fc81272540a64884b758b78b 100644
@@ -2255,6 +2255,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
        ctx = __blk_mq_get_ctx(hctx->queue, cpu);
        type = hctx->type;
 
+       blk_mq_tag_ctx_flush_batch(hctx, ctx);
+
        spin_lock(&ctx->lock);
        if (!list_empty(&ctx->type[type].rq_list)) {
                list_splice_init(&ctx->type[type].rq_list, &tmp);
@@ -2436,8 +2438,10 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 
                __ctx->cpu = i;
                spin_lock_init(&__ctx->lock);
-               for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
+               for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) {
                        INIT_LIST_HEAD(&__ctx->type[k].rq_list);
+                       __ctx->type[k].tags = 0;
+               }
 
                /*
                 * Set local node, IFF we have more than one hw queue. If
@@ -2521,6 +2525,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                        }
 
                        hctx = blk_mq_map_queue_type(q, j, i);
+                       ctx->type[j].tags = 0;
                        ctx->hctxs[j] = hctx;
                        /*
                         * If the CPU is already set in the mask, then we've
@@ -2542,9 +2547,11 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                        BUG_ON(!hctx->nr_ctx);
                }
 
-               for (; j < HCTX_MAX_TYPES; j++)
+               for (; j < HCTX_MAX_TYPES; j++) {
                        ctx->hctxs[j] = blk_mq_map_queue_type(q,
                                        HCTX_TYPE_DEFAULT, i);
+                       ctx->type[j].tags = 0;
+               }
        }
 
        queue_for_each_hw_ctx(q, hctx, i) {
@@ -3298,8 +3305,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
        if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
                return;
 
-       list_for_each_entry(q, &set->tag_list, tag_set_list)
+       list_for_each_entry(q, &set->tag_list, tag_set_list) {
+               blk_mq_tag_queue_flush_batches(q);
                blk_mq_freeze_queue(q);
+       }
+
        /*
         * Switch IO scheduler to 'none', cleaning up the data associated
         * with the previous scheduler. We will switch back once we are done
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 271f16771499c0b5a22f1ff1e2c14f5f2f9eb8db..b6095cc50921741a7b81cf0e3a93fe3154f400ad 100644
@@ -14,6 +14,10 @@ struct blk_mq_ctxs {
 
 struct blk_mq_ctx_type {
        struct list_head                rq_list;
+
+       /* tag batch cache */
+       unsigned long                   tags;
+       unsigned int                    tag_offset;
 };
 
 /**
@@ -23,6 +27,7 @@ struct blk_mq_ctx {
        struct {
                spinlock_t              lock;
                struct blk_mq_ctx_type  type[HCTX_MAX_TYPES];
+               unsigned long           tag_hit, tag_refill;
        } ____cacheline_aligned_in_smp;
 
        unsigned int            cpu;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 11cfd6470b1a3081fcca14944c7df0cd14c748c6..2c6a8657a72c32e97a67647c0c4e827ed037a03c 100644
@@ -140,6 +140,8 @@ struct blk_mq_hw_ctx {
         */
        atomic_t                nr_active;
 
+       atomic_t                flush_pending;
+
        /** @cpuhp_dead: List to store request if some CPU die. */
        struct hlist_node       cpuhp_dead;
        /** @kobj: Kernel object for sysfs. */