obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-lib.o blk-mq.o blk-mq-tag.o \
+ blk-lib.o blk-mq.o blk-mq-tag.o blk-wb.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
#include "blk.h"
#include "blk-mq.h"
+#include "blk-wb.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
goto fail;
+ if (blk_buffered_writeback_init(q))
+ goto fail;
+
INIT_WORK(&q->timeout_work, blk_timeout_work);
q->request_fn = rfn;
q->prep_rq_fn = NULL;
fail:
blk_free_flush_queue(q->fq);
+ blk_buffered_writeback_exit(q);
return NULL;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
/* this is a bio leak */
WARN_ON(req->bio != NULL);
+ blk_buffered_writeback_done(q->rq_wb, req);
+
/*
* Request may not have originated from ll_rw_blk. if not,
* it didn't come out of our reserved rq pools
int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
+ bool wb_acct;
/*
* low level driver can indicate that it wants pages above a
}
get_rq:
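+ /*
+ * Account and, if need be, throttle this bio before allocating a
+ * request for it. If it was accounted, the request gets flagged
+ * REQ_BUF_INFLIGHT below so completion drops the in-flight count
+ * again; if request allocation fails, we drop it right here.
+ */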
+ wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, q->queue_lock);
+
/*
* This sync check and mask will be re-done in init_request_from_bio(),
* but we need to set it earlier to expose the sync flag to the
*/
req = get_request(q, rw_flags, bio, GFP_NOIO);
if (IS_ERR(req)) {
+ if (wb_acct)
+ __blk_buffered_writeback_done(q->rq_wb);
bio->bi_error = PTR_ERR(req);
bio_endio(bio);
goto out_unlock;
}
+ if (wb_acct)
+ req->cmd_flags |= REQ_BUF_INFLIGHT;
+
/*
* After dropping the lock and possibly sleeping here, our request
* may now be mergeable after it had proven unmergeable (above).
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-wb.h"
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
if (rq->cmd_flags & REQ_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
+
+ blk_buffered_writeback_done(q->rq_wb, rq);
+
rq->cmd_flags = 0;
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
+ bool wb_acct;
blk_queue_bounce(q, &bio);
} else
request_count = blk_plug_queued_count(q);
+ wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ if (wb_acct)
+ __blk_buffered_writeback_done(q->rq_wb);
return BLK_QC_T_NONE;
+ }
+
+ if (wb_acct)
+ rq->cmd_flags |= REQ_BUF_INFLIGHT;
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
struct blk_map_ctx data;
struct request *rq;
blk_qc_t cookie;
+ bool wb_acct;
blk_queue_bounce(q, &bio);
blk_attempt_plug_merge(q, bio, &request_count, NULL))
return BLK_QC_T_NONE;
+ wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ if (wb_acct)
+ __blk_buffered_writeback_done(q->rq_wb);
return BLK_QC_T_NONE;
+ }
+
+ if (wb_acct)
+ rq->cmd_flags |= REQ_BUF_INFLIGHT;
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
/* mark the queue as mq asap */
q->mq_ops = set->ops;
+ if (blk_buffered_writeback_init(q))
+ return ERR_PTR(-ENOMEM);
+
q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
if (!q->queue_ctx)
return ERR_PTR(-ENOMEM);
kfree(q->queue_hw_ctx);
err_percpu:
free_percpu(q->queue_ctx);
+ blk_buffered_writeback_exit(q);
return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
list_del_init(&q->all_q_node);
mutex_unlock(&all_q_mutex);
+ blk_buffered_writeback_exit(q);
+
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
#include "blk.h"
#include "blk-mq.h"
+#include "blk-wb.h"
struct queue_sysfs_entry {
struct attribute attr;
return ret;
}
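+/*
+ * Writeback throttling state and tunables, exported per queue as
+ * /sys/block/<dev>/queue/wb_stats (read-only state dump),
+ * /sys/block/<dev>/queue/wb_depth (in-flight depth limit) and
+ * /sys/block/<dev>/queue/wb_cache_usecs (completion delay used on
+ * write back caching devices).
+ */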
+static ssize_t queue_wb_stats_show(struct request_queue *q, char *page)
+{
+ struct rq_wb *wb = q->rq_wb;
+
+ if (!wb)
+ return -EINVAL;
+
+ return sprintf(page, "limit=%u, batch=%u, inflight=%d, wait=%d, timer=%d\n",
+ wb->limit, wb->batch, atomic_read(&wb->inflight),
+ waitqueue_active(&wb->wait), timer_pending(&wb->timer));
+}
+
+static ssize_t queue_wb_depth_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return queue_var_show(q->rq_wb->limit, page);
+}
+
+static ssize_t queue_wb_depth_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ unsigned long var;
+ ssize_t ret;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ ret = queue_var_store(&var, page, count);
+ if (ret < 0)
+ return ret;
+ if (var != (unsigned int) var)
+ return -EINVAL;
+
+ blk_update_wb_limit(q->rq_wb, var);
+ return ret;
+}
+
+static ssize_t queue_wb_cache_delay_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return queue_var_show(q->rq_wb->cache_delay_usecs, page);
+}
+
+static ssize_t queue_wb_cache_delay_store(struct request_queue *q,
+ const char *page, size_t count)
+{
+ unsigned long var;
+ ssize_t ret;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ ret = queue_var_store(&var, page, count);
+ if (ret < 0)
+ return ret;
+
+ q->rq_wb->cache_delay_usecs = var;
+ q->rq_wb->cache_delay = usecs_to_jiffies(var);
+ return ret;
+}
+
static ssize_t queue_wc_show(struct request_queue *q, char *page)
{
if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
.store = queue_wc_store,
};
+static struct queue_sysfs_entry queue_wb_stats_entry = {
+ .attr = {.name = "wb_stats", .mode = S_IRUGO },
+ .show = queue_wb_stats_show,
+};
+static struct queue_sysfs_entry queue_wb_cache_delay_entry = {
+ .attr = {.name = "wb_cache_usecs", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_cache_delay_show,
+ .store = queue_wb_cache_delay_store,
+};
+static struct queue_sysfs_entry queue_wb_depth_entry = {
+ .attr = {.name = "wb_depth", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_depth_show,
+ .store = queue_wb_depth_store,
+};
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
&queue_random_entry.attr,
&queue_poll_entry.attr,
&queue_wc_entry.attr,
+ &queue_wb_stats_entry.attr,
+ &queue_wb_cache_delay_entry.attr,
+ &queue_wb_depth_entry.attr,
NULL,
};
--- /dev/null
+/*
+ * buffered writeback throttling
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ * Things that need changing:
+ *
+ * - Auto-detection of most of this, no tunables. Cache type we can get,
+ * and most other settings we can tweak/gather based on time.
+ * - Better solution for rwb->bdp_wait?
+ * - Higher depth for WB_SYNC_ALL?
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+
+#include "blk.h"
+#include "blk-wb.h"
+
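+/*
+ * Drop one tracked write from the in-flight count. On devices with a
+ * write back cache, if only background writeback is running and other
+ * IO completed within the cache delay window, defer the wakeup via the
+ * timer; otherwise wake one waiter once a full batch of slots has been
+ * freed up.
+ */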
+void __blk_buffered_writeback_done(struct rq_wb *rwb)
+{
+ int inflight;
+
+ inflight = atomic_dec_return(&rwb->inflight);
+ if (inflight >= rwb->limit)
+ return;
+
+ /*
+ * Even at a low depth we can flood a device that does write back
+ * caching. If the cache is enabled, delay a bit before allowing
+ * the next submission, as long as we are still purely background
+ * activity.
+ */
+ if (test_bit(QUEUE_FLAG_WC, &rwb->q->queue_flags) && !*rwb->bdp_wait &&
+ time_before(jiffies, rwb->last_comp + rwb->cache_delay)) {
+ if (!timer_pending(&rwb->timer))
+ mod_timer(&rwb->timer, jiffies + rwb->cache_delay);
+ return;
+ }
+
+ if (waitqueue_active(&rwb->wait)) {
+ int diff = rwb->limit - inflight;
+
+ if (diff >= rwb->batch)
+ wake_up_nr(&rwb->wait, 1);
+ }
+}
+
+/*
+ * Called on completion of a request. Note that it is also called when
+ * a request is merged, at the point where the merged request gets freed.
+ */
+void blk_buffered_writeback_done(struct rq_wb *rwb, struct request *rq)
+{
+ if (!(rq->cmd_flags & REQ_BUF_INFLIGHT)) {
+ const unsigned long cur = jiffies;
+
+ if (rwb->limit && cur != rwb->last_comp)
+ rwb->last_comp = cur;
+ } else {
+ __blk_buffered_writeback_done(rwb);
+ }
+}
+
+/*
+ * Atomically increment 'v' if it is below 'below'. Returns true if the
+ * increment happened, false if 'v' + 1 would exceed 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, int below)
+{
+ int cur = atomic_read(v);
+
+ for (;;) {
+ int old;
+
+ if (cur >= below)
+ return false;
+ old = atomic_cmpxchg(v, cur, cur + 1);
+ if (old == cur)
+ break;
+ cur = old;
+ }
+
+ return true;
+}
+
+/*
+ * Block if we will exceed our limit, or if we are currently waiting for
+ * the timer to kick off queuing again.
+ */
+static void __blk_buffered_writeback_wait(struct rq_wb *rwb, unsigned int limit,
+ spinlock_t *lock)
+{
+ DEFINE_WAIT(wait);
+
+ if (!timer_pending(&rwb->timer) &&
+ atomic_inc_below(&rwb->inflight, limit))
+ return;
+
+ do {
+ prepare_to_wait_exclusive(&rwb->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (!timer_pending(&rwb->timer) &&
+ atomic_inc_below(&rwb->inflight, limit))
+ break;
+
+ if (lock)
+ spin_unlock_irq(lock);
+
+ io_schedule();
+
+ if (lock)
+ spin_lock_irq(lock);
+ } while (1);
+
+ finish_wait(&rwb->wait, &wait);
+}
+
+/*
+ * Returns true if the IO request should be accounted, false if not.
+ * May sleep if we have exceeded the writeback limits. The caller can
+ * pass in a spinlock held with irqs disabled, if it holds one when
+ * calling this function; if we do sleep, we release and re-grab it
+ * around the sleep.
+ */
+bool blk_buffered_writeback_wait(struct rq_wb *rwb, struct bio *bio,
+ spinlock_t *lock)
+{
+ unsigned int limit;
+
+ /*
+ * If throttling is disabled, if this isn't a write, or if it's a
+ * discard, do nothing.
+ */
+ if (!rwb->limit || !(bio->bi_rw & REQ_WRITE) ||
+ (bio->bi_rw & REQ_DISCARD))
+ return false;
+
+ /*
+ * Don't throttle WRITE_ODIRECT
+ */
+ if ((bio->bi_rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC)
+ return false;
+
+ /*
+ * At this point we know it's a buffered write. If REQ_SYNC is
+ * set, then it's WB_SYNC_ALL writeback. Bump the limit 4x for
+ * those, since someone is (or will be) waiting on that.
+ */
+ limit = rwb->limit;
+ if (bio->bi_rw & REQ_SYNC)
+ limit <<= 2;
+ else if (limit != 1) {
+ /*
+ * If less than 100ms since we completed unrelated IO,
+ * limit us to a depth of 1 for background writeback.
+ */
+ if (time_before(jiffies, rwb->last_comp + HZ / 10))
+ limit = 1;
+ else if (!*rwb->bdp_wait)
+ limit >>= 1;
+ }
+
+ __blk_buffered_writeback_wait(rwb, limit, lock);
+ return true;
+}
+
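+/*
+ * Install a new depth limit and derive the wakeup batch from it: half
+ * the limit, clamped to at least 1 (for a non-zero limit) and at most
+ * 4. Waiters are woken so the new limit takes effect immediately.
+ */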
+void blk_update_wb_limit(struct rq_wb *rwb, unsigned int limit)
+{
+ rwb->limit = limit;
+ rwb->batch = rwb->limit / 2;
+ if (!rwb->batch && rwb->limit)
+ rwb->batch = 1;
+ else if (rwb->batch > 4)
+ rwb->batch = 4;
+
+ wake_up_all(&rwb->wait);
+}
+
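+/*
+ * Cache delay timer: once the delay window has passed, let a waiting
+ * task resume issuing background writeback.
+ */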
+static void blk_buffered_writeback_timer(unsigned long data)
+{
+ struct rq_wb *rwb = (struct rq_wb *) data;
+
+ if (waitqueue_active(&rwb->wait))
+ wake_up_nr(&rwb->wait, 1);
+}
+
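+/* Defaults: at most 4 tracked writes in flight, 10 msec cache delay */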
+#define DEF_WB_LIMIT 4
+#define DEF_WB_CACHE_DELAY 10000
+
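+/*
+ * Allocate and attach throttling state to a queue. Called from both
+ * the legacy and blk-mq queue init paths, with
+ * blk_buffered_writeback_exit() as the teardown counterpart.
+ */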
+int blk_buffered_writeback_init(struct request_queue *q)
+{
+ struct rq_wb *rwb;
+
+ rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+ if (!rwb)
+ return -ENOMEM;
+
+ atomic_set(&rwb->inflight, 0);
+ init_waitqueue_head(&rwb->wait);
+ rwb->last_comp = jiffies;
+ rwb->bdp_wait = &q->backing_dev_info.wb.dirty_sleeping;
+ setup_timer(&rwb->timer, blk_buffered_writeback_timer,
+ (unsigned long) rwb);
+ rwb->cache_delay_usecs = DEF_WB_CACHE_DELAY;
+ rwb->cache_delay = usecs_to_jiffies(rwb->cache_delay_usecs);
+ rwb->q = q;
+ blk_update_wb_limit(rwb, DEF_WB_LIMIT);
+ q->rq_wb = rwb;
+ return 0;
+}
+
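+/* Tear down the throttling state for a queue; safe if none was set up */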
+void blk_buffered_writeback_exit(struct request_queue *q)
+{
+ if (q->rq_wb)
+ del_timer_sync(&q->rq_wb->timer);
+
+ kfree(q->rq_wb);
+ q->rq_wb = NULL;
+}
--- /dev/null
+#ifndef BLK_WB_H
+#define BLK_WB_H
+
+#include <linux/atomic.h>
+#include <linux/wait.h>
+
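+/* Per-queue state for buffered writeback throttling */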
+struct rq_wb {
+ unsigned int limit; /* max tracked writes in flight */
+ unsigned int batch; /* free slots needed before waking a waiter */
+ unsigned int cache_delay; /* post-completion delay, in jiffies */
+ unsigned int cache_delay_usecs; /* same, as set via sysfs, in usecs */
+ unsigned long last_comp; /* when unrelated IO last completed */
+ unsigned int *bdp_wait; /* set while balance_dirty_pages() waits */
+ struct request_queue *q; /* owning queue */
+ atomic_t inflight; /* tracked writes currently in flight */
+ wait_queue_head_t wait; /* throttled submitters */
+ struct timer_list timer; /* delayed wakeup for caching devices */
+};
+
+void __blk_buffered_writeback_done(struct rq_wb *);
+void blk_buffered_writeback_done(struct rq_wb *, struct request *);
+bool blk_buffered_writeback_wait(struct rq_wb *, struct bio *, spinlock_t *);
+int blk_buffered_writeback_init(struct request_queue *);
+void blk_buffered_writeback_exit(struct request_queue *);
+void blk_update_wb_limit(struct rq_wb *, unsigned int);
+
+#endif
__REQ_PM, /* runtime pm request */
__REQ_HASHED, /* on IO scheduler merge hash */
__REQ_MQ_INFLIGHT, /* track inflight for MQ */
+ __REQ_BUF_INFLIGHT, /* track inflight for buffered writeback */
__REQ_NR_BITS, /* stops here */
};
#define REQ_PM (1ULL << __REQ_PM)
#define REQ_HASHED (1ULL << __REQ_HASHED)
#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT)
+#define REQ_BUF_INFLIGHT (1ULL << __REQ_BUF_INFLIGHT)
typedef unsigned int blk_qc_t;
#define BLK_QC_T_NONE -1U
struct blkcg_gq;
struct blk_flush_queue;
struct pr_ops;
+struct rq_wb;
#define BLKDEV_MIN_RQ 4
#define BLKDEV_MAX_RQ 128 /* Default maximum */
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
+ struct rq_wb *rq_wb;
+
/*
* If blkcg is not used, @q->root_rl serves all requests. If blkcg
* is used, root blkg allocates from @q->root_rl and all other