blk-mq: allocate tags in batches [for-5.6/block-test]
author Jens Axboe <axboe@kernel.dk>
Mon, 30 Dec 2019 05:15:21 +0000 (22:15 -0700)
committer Jens Axboe <axboe@kernel.dk>
Fri, 17 Jan 2020 15:27:16 +0000 (08:27 -0700)
Instead of grabbing tags one by one, grab a batch and store it as a local
cache in the software queue. Subsequent tag allocations can then be served
from that cache, without having to hit the shared tag map.

These batches are flushed back to the shared map if we run out of tags on
the hardware queue; the intent is that this should rarely happen.

This works very well in practice, with batch counts of anywhere from 40 to
60 seen regularly in testing.
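
As a rough userspace sketch of the idea (not the patch itself; the names
tag_cache, shared_map, get_batch() and get_tag() are invented for the
example, where the real code uses struct blk_mq_ctx_type,
__sbitmap_queue_get_batch() and __ffs()): a whole word of free bits is
claimed from the shared map once, and individual tags are then handed out
from the cached word until it runs dry.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct tag_cache {
        uint64_t tags;           /* bitmask of locally cached free tags */
        unsigned int tag_offset; /* tag number corresponding to bit 0 */
};

/* claim a whole word of free tags from the (toy) shared map at once */
static bool get_batch(uint64_t *shared_map, unsigned int base,
                      struct tag_cache *tc)
{
        if (!*shared_map)
                return false;
        tc->tags = *shared_map;
        tc->tag_offset = base;
        *shared_map = 0;
        return true;
}

/* hand out one tag from the local cache, refilling it when empty */
static int get_tag(uint64_t *shared_map, unsigned int base,
                   struct tag_cache *tc)
{
        int bit;

        if (!tc->tags && !get_batch(shared_map, base, tc))
                return -1;               /* shared map exhausted too */

        bit = __builtin_ctzll(tc->tags); /* like __ffs() */
        tc->tags &= ~(1ULL << bit);
        return bit + tc->tag_offset;
}

int main(void)
{
        uint64_t shared_map = ~0ULL;     /* 64 free tags: 0..63 */
        struct tag_cache tc = { 0, 0 };

        for (int i = 0; i < 4; i++)
                printf("tag %d\n", get_tag(&shared_map, 0, &tc));
        return 0;
}

The real implementation additionally has to protect the per-ctx cache
against the IPI-driven flush (hence the local_irq_disable() in
blk_mq_get_tag_batch()) and to return unused cached bits to the shared map
via __sbitmap_queue_clear_batch() when a hardware queue runs out of tags.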

Signed-off-by: Jens Axboe <axboe@kernel.dk>
block/blk-mq-debugfs.c
block/blk-mq-tag.c
block/blk-mq-tag.h
block/blk-mq.c
block/blk-mq.h
include/linux/blk-mq.h

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index e789f830ff596e33ef5396f321c74907452ae094..914be72d080ecc76d7944d1850a8db7617fe3d10 100644
@@ -659,6 +659,23 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
 CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
 CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
 
+static ssize_t ctx_tag_hit_write(void *data, const char __user *buf,
+                                   size_t count, loff_t *ppos)
+{
+       struct blk_mq_ctx *ctx = data;
+
+       ctx->tag_hit = ctx->tag_refill = 0;
+       return count;
+}
+
+static int ctx_tag_hit_show(void *data, struct seq_file *m)
+{
+       struct blk_mq_ctx *ctx = data;
+
+       seq_printf(m, "hit=%lu refills=%lu\n", ctx->tag_hit, ctx->tag_refill);
+       return 0;
+}
+
 static int ctx_dispatched_show(void *data, struct seq_file *m)
 {
        struct blk_mq_ctx *ctx = data;
@@ -800,6 +817,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
        {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
        {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
        {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
+       {"tag_hit", 0600, ctx_tag_hit_show, ctx_tag_hit_write},
        {"merged", 0600, ctx_merged_show, ctx_merged_write},
        {"completed", 0600, ctx_completed_show, ctx_completed_write},
        {},
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index fbacde454718583555b38f07d0fe88ad89b712c3..fe2273eec3128533e74a8f067869b60f8b207182 100644
@@ -8,9 +8,10 @@
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
-
 #include <linux/blk-mq.h>
 #include <linux/delay.h>
+#include <linux/cpu.h>
+
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
@@ -99,6 +100,103 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                return __sbitmap_queue_get(bt);
 }
 
+void blk_mq_tag_ctx_flush_batch(struct blk_mq_hw_ctx *hctx,
+                               struct blk_mq_ctx *ctx)
+{
+       struct sbitmap_queue *bt = &hctx->tags->bitmap_tags;
+       unsigned int i;
+
+       for (i = 0; i < hctx->queue->tag_set->nr_maps; i++) {
+               struct blk_mq_ctx_type *type = &ctx->type[i];
+
+               if (!type->tags)
+                       continue;
+
+               __sbitmap_queue_clear_batch(bt, type->tag_offset, type->tags);
+               type->tags = 0;
+       }
+}
+
+static void ctx_flush_ipi(void *data)
+{
+       struct blk_mq_hw_ctx *hctx = data;
+       struct blk_mq_ctx *ctx;
+
+       ctx = __blk_mq_get_ctx(hctx->queue, smp_processor_id());
+       blk_mq_tag_ctx_flush_batch(hctx, ctx);
+       atomic_dec(&hctx->flush_pending);
+}
+
+static void blk_mq_tag_flush_batches(struct blk_mq_hw_ctx *hctx)
+{
+       int cpu, err;
+
+       if (atomic_cmpxchg(&hctx->flush_pending, 0, hctx->nr_ctx))
+               return;
+       cpus_read_lock();
+       for_each_cpu(cpu, hctx->cpumask) {
+               err = smp_call_function_single(cpu, ctx_flush_ipi, hctx, 0);
+               if (err)
+                       atomic_dec(&hctx->flush_pending);
+       }
+       cpus_read_unlock();
+}
+
+void blk_mq_tag_queue_flush_batches(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       unsigned int i;
+
+       queue_for_each_hw_ctx(q, hctx, i)
+               blk_mq_tag_flush_batches(hctx);
+}
+
+static int blk_mq_get_tag_batch(struct blk_mq_alloc_data *data)
+{
+       struct blk_mq_hw_ctx *hctx = data->hctx;
+       struct blk_mq_ctx_type *type;
+       struct blk_mq_ctx *ctx = data->ctx;
+       struct blk_mq_tags *tags;
+       struct sbitmap_queue *bt;
+       int tag = -1;
+
+       if (!ctx || (data->flags & BLK_MQ_REQ_INTERNAL))
+               return -1;
+
+       tags = hctx->tags;
+       bt = &tags->bitmap_tags;
+       /* don't do batches for round-robin or (very) sparse maps */
+       if (bt->round_robin || bt->sb.shift < ilog2(BITS_PER_LONG) - 1)
+               return -1;
+
+       /* we could make do with preempt disable, but we need to block flush */
+       local_irq_disable();
+       if (unlikely(ctx->cpu != smp_processor_id()))
+               goto out;
+
+       type = &ctx->type[hctx->type];
+
+       if (type->tags) {
+get_tag:
+               ctx->tag_hit++;
+
+               tag = __ffs(type->tags);
+               type->tags &= ~(1UL << tag);
+               tag += type->tag_offset;
+out:
+               local_irq_enable();
+               return tag;
+       }
+
+       /* no current tag cache, attempt to refill a batch */
+       if (!__sbitmap_queue_get_batch(bt, &type->tag_offset, &type->tags)) {
+               ctx->tag_refill++;
+               goto get_tag;
+       }
+
+       goto out;
+}
+
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@@ -116,8 +214,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
                bt = &tags->breserved_tags;
                tag_offset = 0;
        } else {
-               bt = &tags->bitmap_tags;
                tag_offset = tags->nr_reserved_tags;
+
+               tag = blk_mq_get_tag_batch(data);
+               if (tag != -1)
+                       goto found_tag;
+
+               bt = &tags->bitmap_tags;
        }
 
        tag = __blk_mq_get_tag(data, bt);
@@ -146,6 +249,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
                if (tag != -1)
                        break;
 
+               if (!(data->flags & BLK_MQ_REQ_RESERVED))
+                       blk_mq_tag_flush_batches(data->hctx);
+
                sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
                tag = __blk_mq_get_tag(data, bt);
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 15bc74acb57eca1c56bf0564cb000c002ea08618..b5964fff163080099ac3d5caea87ef87f9df76d6 100644
@@ -34,6 +34,9 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                void *priv);
+void blk_mq_tag_queue_flush_batches(struct request_queue *q);
+void blk_mq_tag_ctx_flush_batch(struct blk_mq_hw_ctx *hctx,
+                               struct blk_mq_ctx *ctx);
 
 static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
                                                 struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cc48a0ffa5ec0ac8a94061bfd5657e67a35da801..81140f61a7c9ca31fc81272540a64884b758b78b 100644
@@ -2255,6 +2255,8 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
        ctx = __blk_mq_get_ctx(hctx->queue, cpu);
        type = hctx->type;
 
+       blk_mq_tag_ctx_flush_batch(hctx, ctx);
+
        spin_lock(&ctx->lock);
        if (!list_empty(&ctx->type[type].rq_list)) {
                list_splice_init(&ctx->type[type].rq_list, &tmp);
@@ -2436,8 +2438,10 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 
                __ctx->cpu = i;
                spin_lock_init(&__ctx->lock);
-               for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
+               for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) {
                        INIT_LIST_HEAD(&__ctx->type[k].rq_list);
+                       __ctx->type[k].tags = 0;
+               }
 
                /*
                 * Set local node, IFF we have more than one hw queue. If
@@ -2521,6 +2525,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                        }
 
                        hctx = blk_mq_map_queue_type(q, j, i);
+                       ctx->type[j].tags = 0;
                        ctx->hctxs[j] = hctx;
                        /*
                         * If the CPU is already set in the mask, then we've
@@ -2542,9 +2547,11 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                        BUG_ON(!hctx->nr_ctx);
                }
 
-               for (; j < HCTX_MAX_TYPES; j++)
+               for (; j < HCTX_MAX_TYPES; j++) {
                        ctx->hctxs[j] = blk_mq_map_queue_type(q,
                                        HCTX_TYPE_DEFAULT, i);
+                       ctx->type[j].tags = 0;
+               }
        }
 
        queue_for_each_hw_ctx(q, hctx, i) {
@@ -3298,8 +3305,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
        if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
                return;
 
-       list_for_each_entry(q, &set->tag_list, tag_set_list)
+       list_for_each_entry(q, &set->tag_list, tag_set_list) {
+               blk_mq_tag_queue_flush_batches(q);
                blk_mq_freeze_queue(q);
+       }
+
        /*
         * Switch IO scheduler to 'none', cleaning up the data associated
         * with the previous scheduler. We will switch back once we are done
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 271f16771499c0b5a22f1ff1e2c14f5f2f9eb8db..b6095cc50921741a7b81cf0e3a93fe3154f400ad 100644
@@ -14,6 +14,10 @@ struct blk_mq_ctxs {
 
 struct blk_mq_ctx_type {
        struct list_head                rq_list;
+
+       /* tag batch cache */
+       unsigned long                   tags;
+       unsigned int                    tag_offset;
 };
 
 /**
@@ -23,6 +27,7 @@ struct blk_mq_ctx {
        struct {
                spinlock_t              lock;
                struct blk_mq_ctx_type  type[HCTX_MAX_TYPES];
+               unsigned long           tag_hit, tag_refill;
        } ____cacheline_aligned_in_smp;
 
        unsigned int            cpu;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 11cfd6470b1a3081fcca14944c7df0cd14c748c6..2c6a8657a72c32e97a67647c0c4e827ed037a03c 100644
@@ -140,6 +140,8 @@ struct blk_mq_hw_ctx {
         */
        atomic_t                nr_active;
 
+       atomic_t                flush_pending;
+
        /** @cpuhp_dead: List to store request if some CPU die. */
        struct hlist_node       cpuhp_dead;
        /** @kobj: Kernel object for sysfs. */