blk-mq: scalable per-ctx/hctx request completion stats [mq-stats]
author    Jens Axboe <axboe@fb.com>
          Tue, 10 Nov 2015 21:52:24 +0000 (14:52 -0700)
committer Jens Axboe <axboe@fb.com>
          Wed, 11 Nov 2015 02:45:30 +0000 (19:45 -0700)

Stats are tracked on a per-cpu/ctx basis, but summed on a per-hctx
basis. This could trivially be extended to per-queue stats, which
would just sum the per-hctx stats.

That extension isn't done yet. The stats themselves should work, and
there are the beginnings of stat windows. Additionally, it might be
worth tracking depths along with the stats.

Signed-off-by: Jens Axboe <axboe@fb.com>
block/blk-mq-sysfs.c
block/blk-mq.c
block/blk-mq.h
include/linux/blkdev.h

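The per-queue extension mentioned in the message above really is just a
sum over the hardware contexts. Below is a minimal sketch of what such a
helper could look like, assuming the blk_mq_hctx_get_stat() and
blk_mq_init_stat() helpers introduced by this patch plus the existing
queue_for_each_hw_ctx() iterator; the function itself is hypothetical and
not part of the patch.

	/*
	 * Hypothetical per-queue aggregation (not in this patch): start from
	 * freshly initialized read/write buckets and fold in every hardware
	 * context's stats, the same way the hctx sysfs handler folds in its
	 * software contexts.
	 */
	static void blk_mq_queue_get_stat(struct request_queue *q,
					  struct blk_rq_stat *dst)
	{
		struct blk_mq_hw_ctx *hctx;
		unsigned int i;

		blk_mq_init_stat(&dst[0]);	/* reads */
		blk_mq_init_stat(&dst[1]);	/* writes */

		queue_for_each_hw_ctx(q, hctx, i)
			blk_mq_hctx_get_stat(hctx, dst);
	}

Because blk_mq_hctx_get_stat() accumulates into dst rather than
overwriting it, looping over all hardware queues yields a whole-queue
view for reads and writes.
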
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 1cf18784c5cf3c44be94dbd003ca9d7088f883e0..a8ecb233b71805ce2fe5d1090a0e37f66e57f959 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -247,6 +247,35 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
        return ret;
 }
 
+static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
+                                         const char *page, size_t count)
+{
+       blk_mq_hctx_clear_stat(hctx);
+       return count;
+}
+
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
+{
+       return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
+                       pre, (long long) stat->nr_samples,
+                       (long long) stat->mean, (long long) stat->min,
+                       (long long) stat->max);
+}
+
+static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+       struct blk_rq_stat stat[2];
+       ssize_t ret;
+
+       blk_mq_init_stat(&stat[0]);
+       blk_mq_init_stat(&stat[1]);
+
+       blk_mq_hctx_get_stat(hctx, stat);
+       ret = print_stat(page, &stat[0], "read :");
+       ret += print_stat(page + ret, &stat[1], "write:");
+       return ret;
+}
+
 static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
        .attr = {.name = "dispatched", .mode = S_IRUGO },
        .show = blk_mq_sysfs_dispatched_show,
@@ -304,6 +333,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
        .attr = {.name = "io_poll", .mode = S_IRUGO },
        .show = blk_mq_hw_sysfs_poll_show,
 };
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
+       .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
+       .show = blk_mq_hw_sysfs_stat_show,
+       .store = blk_mq_hw_sysfs_stat_store,
+};
 
 static struct attribute *default_hw_ctx_attrs[] = {
        &blk_mq_hw_sysfs_queued.attr,
@@ -314,6 +348,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
        &blk_mq_hw_sysfs_cpus.attr,
        &blk_mq_hw_sysfs_active.attr,
        &blk_mq_hw_sysfs_poll.attr,
+       &blk_mq_hw_sysfs_stat.attr,
        NULL,
 };
 
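For reference, the new attribute sits next to the other per-hctx files
(on a typical setup something like /sys/block/<disk>/mq/<N>/stats; the
exact path is given for illustration). Reading it returns one line per
data direction in the format produced by print_stat() above, with all
times in nanoseconds; the values here are made up:

	read : samples=8192, mean=41231, min=12004, max=1203321
	write: samples=1024, mean=88410, min=23110, max=2410009

Writing anything to the file goes through blk_mq_hw_sysfs_stat_store(),
which simply clears the stats of every software context behind that
hardware queue.
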
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 86bd5b25288e29c0415700db4993cd239183366e..7c8937b02f5a2eaf9aee50d8658ad2650547cc12 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -158,6 +158,84 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL(blk_mq_can_queue);
 
+void blk_mq_hctx_clear_stat(struct blk_mq_hw_ctx *hctx)
+{
+       struct blk_mq_ctx *ctx;
+       unsigned int i;
+
+       hctx_for_each_ctx(hctx, ctx, i) {
+               blk_mq_init_stat(&ctx->stat[0]);
+               blk_mq_init_stat(&ctx->stat[1]);
+       }
+}
+
+static void sum_stat(struct blk_rq_stat *dst, struct blk_rq_stat *src)
+{
+       if (!src->nr_samples)
+               return;
+
+       dst->min = min(dst->min, src->min);
+       dst->max = max(dst->max, src->max);
+
+       if (!dst->nr_samples)
+               dst->mean = src->mean;
+       else {
+               dst->mean = div64_s64((src->mean * src->nr_samples) +
+                                       (dst->mean * dst->nr_samples),
+                                       dst->nr_samples + src->nr_samples);
+       }
+       dst->nr_samples += src->nr_samples;
+}
+
+void blk_mq_hctx_get_stat(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
+{
+       struct blk_mq_ctx *ctx;
+       unsigned int i;
+
+       hctx_for_each_ctx(hctx, ctx, i) {
+               sum_stat(&dst[0], &ctx->stat[0]);
+               sum_stat(&dst[1], &ctx->stat[1]);
+       }
+}
+
+static void __blk_mq_init_stat(struct blk_rq_stat *stat, s64 time_now)
+{
+       memset(stat, 0, sizeof(*stat));
+       stat->min = -1ULL;
+       stat->time = time_now;
+}
+
+void blk_mq_init_stat(struct blk_rq_stat *stat)
+{
+       __blk_mq_init_stat(stat, ktime_to_ns(ktime_get()));
+}
+
+static void blk_mq_add_stat(struct request *rq)
+{
+       struct blk_mq_ctx *ctx = rq->mq_ctx;
+       struct blk_rq_stat *stat = &ctx->stat[rq_data_dir(rq)];
+       s64 delta, now, value;
+
+       now = ktime_to_ns(ktime_get());
+       if (now < rq->issue_time)
+               return;
+
+       if (now - stat->time >= BLK_MQ_STAT_NSEC)
+               __blk_mq_init_stat(stat, now);
+
+       value = now - rq->issue_time;
+       if (value > stat->max)
+               stat->max = value;
+       if (value < stat->min)
+               stat->min = value;
+
+       delta = value - stat->mean;
+       if (delta)
+               stat->mean += div64_s64(delta, stat->nr_samples + 1);
+
+       stat->nr_samples++;
+}
+
 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
                               struct request *rq, unsigned int rw_flags)
 {
@@ -362,6 +440,8 @@ void __blk_mq_complete_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
 
+       blk_mq_add_stat(rq);
+
        if (!q->softirq_done_fn)
                blk_mq_end_request(rq, rq->errors);
        else
@@ -405,6 +485,8 @@ void blk_mq_start_request(struct request *rq)
        if (unlikely(blk_bidi_rq(rq)))
                rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
 
+       rq->issue_time = ktime_to_ns(ktime_get());
+
        blk_add_timer(rq);
 
        /*
@@ -1778,6 +1860,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
                spin_lock_init(&__ctx->lock);
                INIT_LIST_HEAD(&__ctx->rq_list);
                __ctx->queue = q;
+               blk_mq_init_stat(&__ctx->stat[0]);
+               blk_mq_init_stat(&__ctx->stat[1]);
 
                /* If the cpu isn't online, the cpu is mapped to first hctx */
                if (!cpu_online(i))
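
Two standard mean computations are at work in the hunk above.
blk_mq_add_stat() keeps a running mean without storing a sum: with n
samples already recorded and a new latency x, it applies
mean = mean + (x - mean) / (n + 1), which is what the delta and
div64_s64() pair implement (integer division, so the mean is truncated).
sum_stat() then merges per-ctx buckets into the hctx view using a
sample-count-weighted average, (mean_a * n_a + mean_b * n_b) / (n_a + n_b),
together with min-of-mins and max-of-maxes. The windowing is also visible
here: if a completion arrives more than BLK_MQ_STAT_NSEC (0.5 s) after the
bucket's start time, the bucket is reinitialized before the sample is
recorded, which is the "beginnings of stat windows" the commit message
refers to.
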
diff --git a/block/blk-mq.h b/block/blk-mq.h
index b44dce165761268c1f0a6bd64db78451f68d912f..472d80c9c91254d2d1c35897bea3bc2c9a8a97fe 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -3,10 +3,25 @@
 
 struct blk_mq_tag_set;
 
+/*
+ * 0.5s window
+ */
+#define BLK_MQ_STAT_NSEC       500000000ULL
+
+struct blk_rq_stat {
+       s64 mean;
+       u64 min;
+       u64 max;
+       s64 nr_samples;
+       s64 time;
+};
+
 struct blk_mq_ctx {
        struct {
                spinlock_t              lock;
                struct list_head        rq_list;
+               unsigned int            rq_list_cnt;
+               unsigned int            in_flight;
        }  ____cacheline_aligned_in_smp;
 
        unsigned int            cpu;
@@ -20,6 +35,7 @@ struct blk_mq_ctx {
 
        /* incremented at completion time */
        unsigned long           ____cacheline_aligned_in_smp rq_completed[2];
+       struct blk_rq_stat      stat[2];
 
        struct request_queue    *queue;
        struct kobject          kobj;
@@ -122,4 +138,9 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
        return hctx->nr_ctx && hctx->tags;
 }
 
+
+void blk_mq_hctx_get_stat(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
+void blk_mq_hctx_clear_stat(struct blk_mq_hw_ctx *);
+void blk_mq_init_stat(struct blk_rq_stat *);
+
 #endif
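
A note on the new structure: mean, min, max and time are all in
nanoseconds (they come from ktime_get()), nr_samples is a plain sample
count, and time marks the start of the current window checked against
BLK_MQ_STAT_NSEC. min is initialized to -1ULL (i.e. U64_MAX) so the first
recorded sample always lowers it. The rq_list_cnt and in_flight fields
added to struct blk_mq_ctx are not referenced by the other hunks in this
patch; they appear to be groundwork for the depth tracking mentioned in
the commit message.
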
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3fe27f8d91f04f83751453f19f4e4bb6a4f9a2b4..709344b7b1270bcfea27d0472304ed977c294d23 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -152,6 +152,7 @@ struct request {
        struct gendisk *rq_disk;
        struct hd_struct *part;
        unsigned long start_time;
+       s64 issue_time;
 #ifdef CONFIG_BLK_CGROUP
        struct request_list *rl;                /* rl this rq is alloced from */
        unsigned long long start_time_ns;