setting from "write back" to "write through", since that will also
eliminate cache flushes issued by the kernel.
+wbt_lat_usec (RW)
+-----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency. If this latency is exceeded in a given
+window of time (see wbt_window_usec), then the writeback throttling will
+start scaling back writes.
+
+wbt_window_usec (RW)
+--------------------
+If the device is registered for writeback throttling, then this file shows
+the length of the monitoring window over which the target read latency
+(see wbt_lat_usec) is evaluated.
+
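As a usage illustration only (not part of the patch), the sketch below programs
the two new knobs from userspace through sysfs. It assumes the usual
/sys/block/<dev>/queue layout; the device name "sda", the helper
set_queue_attr(), and the 10 msec / 100 msec values are made up for the
example.

#include <stdio.h>

/* Write a decimal microsecond value to /sys/block/<dev>/queue/<attr>. */
static int set_queue_attr(const char *dev, const char *attr,
			  unsigned long long usec)
{
	char path[256];
	FILE *f;
	int ret;

	snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", dev, attr);
	f = fopen(path, "w");
	if (!f)
		return -1;
	ret = fprintf(f, "%llu\n", usec) < 0 ? -1 : 0;
	if (fclose(f))
		ret = -1;
	return ret;
}

int main(void)
{
	/* Hypothetical targets: 10 msec read latency over a 100 msec window */
	if (set_queue_attr("sda", "wbt_lat_usec", 10000))
		perror("wbt_lat_usec");
	if (set_queue_attr("sda", "wbt_window_usec", 100000))
		perror("wbt_window_usec");
	return 0;
}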
Jens Axboe <jens.axboe@oracle.com>, February 2009
menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
+ select WBT
help
Provide block layer support for the kernel.
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>
+#include <linux/wbt.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
fail:
blk_free_flush_queue(q->fq);
+ wbt_exit(q->rq_wb);
+ q->rq_wb = NULL;
return NULL;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
blk_delete_timer(rq);
blk_clear_rq_complete(rq);
trace_block_rq_requeue(q, rq);
+ wbt_requeue(q->rq_wb, &rq->wb_stat);
if (rq->cmd_flags & REQ_QUEUED)
blk_queue_end_tag(q, rq);
/* this is a bio leak */
WARN_ON(req->bio != NULL);
+ wbt_done(q->rq_wb, &req->wb_stat);
+
/*
* Request may not have originated from ll_rw_blk. if not,
* it didn't come out of our reserved rq pools
int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
+ bool wb_acct;
/*
* low level driver can indicate that it wants pages above a
}
get_rq:
+ wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, q->queue_lock);
+
/*
* This sync check and mask will be re-done in init_request_from_bio(),
* but we need to set it earlier to expose the sync flag to the
*/
req = get_request(q, rw_flags, bio, GFP_NOIO);
if (IS_ERR(req)) {
+ if (wb_acct)
+ __wbt_done(q->rq_wb);
bio->bi_error = PTR_ERR(req);
bio_endio(bio);
goto out_unlock;
}
+ if (wb_acct)
+ wbt_mark_tracked(&req->wb_stat);
+
/*
* After dropping the lock and possibly sleeping here, our request
* may now be mergeable after it had proven unmergeable (above).
{
blk_dequeue_request(req);
- req->issue_time = ktime_to_ns(ktime_get());
+ wbt_issue(req->q->rq_wb, &req->wb_stat);
/*
* We are now handing the request to the hardware, initialize
blk_account_io_done(req);
- if (req->end_io)
+ if (req->end_io) {
+ wbt_done(req->q->rq_wb, &req->wb_stat);
req->end_io(req, error);
- else {
+ } else {
if (blk_bidi_rq(req))
__blk_put_request(req->next_rq->q, req->next_rq);
#include <linux/sched/sysctl.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
+#include <linux/wbt.h>
#include <trace/events/block.h>
if (rq->cmd_flags & REQ_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
+
+ wbt_done(q->rq_wb, &rq->wb_stat);
rq->cmd_flags = 0;
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
blk_account_io_done(rq);
if (rq->end_io) {
+ wbt_done(rq->q->rq_wb, &rq->wb_stat);
rq->end_io(rq, error);
} else {
if (unlikely(blk_bidi_rq(rq)))
if (unlikely(blk_bidi_rq(rq)))
rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
- rq->issue_time = ktime_to_ns(ktime_get());
+ wbt_issue(q->rq_wb, &rq->wb_stat);
blk_add_timer(rq);
struct request_queue *q = rq->q;
trace_block_rq_requeue(q, rq);
+ wbt_requeue(q->rq_wb, &rq->wb_stat);
if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
if (q->dma_drain_size && blk_rq_bytes(rq))
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
+ bool wb_acct;
blk_queue_bounce(q, &bio);
} else
request_count = blk_plug_queued_count(q);
+ wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ if (wb_acct)
+ __wbt_done(q->rq_wb);
return BLK_QC_T_NONE;
+ }
+
+ if (wb_acct)
+ wbt_mark_tracked(&rq->wb_stat);
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
struct blk_map_ctx data;
struct request *rq;
blk_qc_t cookie;
+ bool wb_acct;
blk_queue_bounce(q, &bio);
blk_attempt_plug_merge(q, bio, &request_count, NULL))
return BLK_QC_T_NONE;
+ wb_acct = wbt_wait(q->rq_wb, bio->bi_rw, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ if (wb_acct)
+ __wbt_done(q->rq_wb);
return BLK_QC_T_NONE;
+ }
+
+ if (wb_acct)
+ wbt_mark_tracked(&rq->wb_stat);
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
list_del_init(&q->all_q_node);
mutex_unlock(&all_q_mutex);
+ wbt_exit(q->rq_wb);
+ q->rq_wb = NULL;
+
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
{
q->queue_depth = depth;
+ wbt_set_queue_depth(q->rq_wb, depth);
}
EXPORT_SYMBOL(blk_set_queue_depth);
else
queue_flag_clear(QUEUE_FLAG_FUA, q);
spin_unlock_irq(q->queue_lock);
+
+ wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
{
s64 delta, now, value;
+ u64 rq_time = wbt_issue_stat_get_time(&rq->wb_stat);
now = ktime_to_ns(ktime_get());
- if (now < rq->issue_time)
+ if (now < rq_time)
return;
if ((now & BLK_STAT_MASK) != (stat->time & BLK_STAT_MASK))
__blk_stat_init(stat, now);
- value = now - rq->issue_time;
+ value = now - rq_time;
if (value > stat->max)
stat->max = value;
if (value < stat->min)
#include <linux/blktrace_api.h>
#include <linux/blk-mq.h>
#include <linux/blk-cgroup.h>
+#include <linux/wbt.h>
#include "blk.h"
#include "blk-mq.h"
return count;
}
+/*
+ * Parse a decimal u64 from a sysfs store and update *var on success.
+ */
+static ssize_t queue_var_store64(u64 *var, const char *page)
+{
+	int err;
+	u64 v;
+
+	err = kstrtou64(page, 10, &v);
+	if (err < 0)
+		return err;
+
+	*var = v;
+	return 0;
+}
+
static ssize_t queue_requests_show(struct request_queue *q, char *page)
{
return queue_var_show(q->nr_requests, (page));
return ret;
}
+static ssize_t queue_wb_win_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000));
+}
+
+static ssize_t queue_wb_win_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ ssize_t ret;
+ u64 val;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ ret = queue_var_store64(&val, page);
+ if (ret < 0)
+ return ret;
+
+ q->rq_wb->win_nsec = val * 1000ULL;
+ wbt_update_limits(q->rq_wb);
+ return count;
+}
+
+static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
+}
+
+static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ ssize_t ret;
+ u64 val;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ ret = queue_var_store64(&val, page);
+ if (ret < 0)
+ return ret;
+
+ q->rq_wb->min_lat_nsec = val * 1000ULL;
+ wbt_update_limits(q->rq_wb);
+ return count;
+}
+
static ssize_t queue_wc_show(struct request_queue *q, char *page)
{
if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
.show = queue_stats_show,
};
+static struct queue_sysfs_entry queue_wb_lat_entry = {
+ .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_lat_show,
+ .store = queue_wb_lat_store,
+};
+
+static struct queue_sysfs_entry queue_wb_win_entry = {
+ .attr = {.name = "wbt_window_usec", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_win_show,
+ .store = queue_wb_win_store,
+};
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
&queue_poll_entry.attr,
&queue_wc_entry.attr,
&queue_stats_entry.attr,
+ &queue_wb_lat_entry.attr,
+ &queue_wb_win_entry.attr,
NULL,
};
.release = blk_release_queue,
};
+/*
+ * Glue between the queue-wide blk-stat interface and the wbt stat ops.
+ */
+static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat)
+{
+	blk_queue_stat_get(data, stat);
+}
+
+static void blk_wb_stat_clear(void *data)
+{
+	blk_stat_clear(data);
+}
+
+static struct wb_stat_ops wb_stat_ops = {
+ .get = blk_wb_stat_get,
+ .clear = blk_wb_stat_clear,
+};
+
+static void blk_wb_init(struct request_queue *q)
+{
+ struct rq_wb *rwb;
+
+ rwb = wbt_init(&q->backing_dev_info, &wb_stat_ops, q);
+
+ /*
+ * If this fails, we don't get throttling
+ */
+ if (IS_ERR(rwb))
+ return;
+
+	/*
+	 * Default to a 2 msec latency target for non-rotational storage,
+	 * and a 75 msec target for rotational devices.
+	 */
+	if (blk_queue_nonrot(q))
+		rwb->min_lat_nsec = 2000000ULL;
+	else
+		rwb->min_lat_nsec = 75000000ULL;
+
+ wbt_set_queue_depth(rwb, blk_queue_depth(q));
+ wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+ q->rq_wb = rwb;
+}
+
int blk_register_queue(struct gendisk *disk)
{
int ret;
if (q->mq_ops)
blk_mq_register_disk(disk);
+ blk_wb_init(q);
+
if (!q->request_fn)
return 0;
#include <linux/rcupdate.h>
#include <linux/percpu-refcount.h>
#include <linux/scatterlist.h>
+#include <linux/wbt.h>
struct module;
struct scsi_ioctl_command;
struct blkcg_gq;
struct blk_flush_queue;
struct pr_ops;
+struct rq_wb;
#define BLKDEV_MIN_RQ 4
#define BLKDEV_MAX_RQ 128 /* Default maximum */
struct gendisk *rq_disk;
struct hd_struct *part;
unsigned long start_time;
- s64 issue_time;
+ struct wb_issue_stat wb_stat;
#ifdef CONFIG_BLK_CGROUP
struct request_list *rl; /* rl this rq is alloced from */
unsigned long long start_time_ns;
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
+ struct rq_wb *rq_wb;
+
/*
* If blkcg is not used, @q->root_rl serves all requests. If blkcg
* is used, root blkg allocates from @q->root_rl and all other