obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-lib.o blk-mq.o blk-mq-tag.o \
+ blk-lib.o blk-mq.o blk-mq-tag.o blk-wb.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
#include "blk.h"
#include "blk-mq.h"
+#include "blk-wb.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
goto fail;
+ if (blk_buffered_writeback_init(q))
+ goto fail;
+
INIT_WORK(&q->timeout_work, blk_timeout_work);
q->request_fn = rfn;
q->prep_rq_fn = NULL;
fail:
blk_free_flush_queue(q->fq);
+ blk_buffered_writeback_exit(q);
return NULL;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
/* this is a bio leak */
WARN_ON(req->bio != NULL);
+ blk_buffered_writeback_done(q->rq_wb, req);
+
/*
* Request may not have originated from ll_rw_blk. if not,
* it didn't come out of our reserved rq pools
int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
+ bool wb_acct;
/*
* low level driver can indicate that it wants pages above a
}
get_rq:
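+ /*
+ * Account and, if need be, throttle this bio before allocating a
+ * request for it. If it was accounted, the request gets flagged
+ * REQ_BUF_INFLIGHT below so completion drops the in-flight count
+ * again; if request allocation fails, we drop it right here.
+ */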
+ wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, q->queue_lock);
+
/*
* This sync check and mask will be re-done in init_request_from_bio(),
* but we need to set it earlier to expose the sync flag to the
*/
req = get_request(q, rw_flags, bio, GFP_NOIO);
if (IS_ERR(req)) {
+ if (wb_acct)
+ __blk_buffered_writeback_done(q->rq_wb);
bio->bi_error = PTR_ERR(req);
bio_endio(bio);
goto out_unlock;
}
+ if (wb_acct)
+ req->cmd_flags |= REQ_BUF_INFLIGHT;
+
/*
* After dropping the lock and possibly sleeping here, our request
* may now be mergeable after it had proven unmergeable (above).
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-wb.h"
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
if (rq->cmd_flags & REQ_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
+
+ blk_buffered_writeback_done(q->rq_wb, rq);
+
rq->cmd_flags = 0;
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
+ bool wb_acct;
blk_queue_bounce(q, &bio);
} else
request_count = blk_plug_queued_count(q);
+ wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ if (wb_acct)
+ __blk_buffered_writeback_done(q->rq_wb);
return BLK_QC_T_NONE;
+ }
+
+ if (wb_acct)
+ rq->cmd_flags |= REQ_BUF_INFLIGHT;
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
struct blk_map_ctx data;
struct request *rq;
blk_qc_t cookie;
+ bool wb_acct;
blk_queue_bounce(q, &bio);
blk_attempt_plug_merge(q, bio, &request_count, NULL))
return BLK_QC_T_NONE;
+ wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ if (wb_acct)
+ __blk_buffered_writeback_done(q->rq_wb);
return BLK_QC_T_NONE;
+ }
+
+ if (wb_acct)
+ rq->cmd_flags |= REQ_BUF_INFLIGHT;
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
/* mark the queue as mq asap */
q->mq_ops = set->ops;
+ if (blk_buffered_writeback_init(q))
+ return ERR_PTR(-ENOMEM);
+
q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
if (!q->queue_ctx)
return ERR_PTR(-ENOMEM);
kfree(q->queue_hw_ctx);
err_percpu:
free_percpu(q->queue_ctx);
+ blk_buffered_writeback_exit(q);
return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
list_del_init(&q->all_q_node);
mutex_unlock(&all_q_mutex);
+ blk_buffered_writeback_exit(q);
+
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
#include "blk.h"
#include "blk-mq.h"
+#include "blk-wb.h"
struct queue_sysfs_entry {
struct attribute attr;
return ret;
}
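+/*
+ * Writeback throttling state and tunables, exported per queue as
+ * /sys/block/<dev>/queue/wb_stats (read-only state dump),
+ * /sys/block/<dev>/queue/wb_depth (in-flight depth limit) and
+ * /sys/block/<dev>/queue/wb_cache_usecs (completion delay used on
+ * write back caching devices).
+ */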
+static ssize_t queue_wb_stats_show(struct request_queue *q, char *page)
+{
+ struct rq_wb *wb = q->rq_wb;
+
+ if (!wb)
+ return -EINVAL;
+
+ return sprintf(page, "limit=%u, batch=%u, inflight=%d, wait=%d, timer=%d\n",
+ wb->limit, wb->batch, atomic_read(&wb->inflight),
+ waitqueue_active(&wb->wait), timer_pending(&wb->timer));
+}
+
+static ssize_t queue_wb_depth_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return queue_var_show(q->rq_wb->limit, page);
+}
+
+static ssize_t queue_wb_depth_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ unsigned long var;
+ ssize_t ret;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ ret = queue_var_store(&var, page, count);
+ if (ret < 0)
+ return ret;
+ if (var != (unsigned int) var)
+ return -EINVAL;
+
+ blk_update_wb_limit(q->rq_wb, var);
+ return ret;
+}
+
+static ssize_t queue_wb_cache_delay_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return queue_var_show(q->rq_wb->cache_delay_usecs, page);
+}
+
+static ssize_t queue_wb_cache_delay_store(struct request_queue *q,
+ const char *page, size_t count)
+{
+ unsigned long var;
+ ssize_t ret;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ ret = queue_var_store(&var, page, count);
+ if (ret < 0)
+ return ret;
+
+ q->rq_wb->cache_delay_usecs = var;
+ q->rq_wb->cache_delay = usecs_to_jiffies(var);
+ return ret;
+}
+
static ssize_t queue_wc_show(struct request_queue *q, char *page)
{
if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
.store = queue_wc_store,
};
+static struct queue_sysfs_entry queue_wb_stats_entry = {
+ .attr = {.name = "wb_stats", .mode = S_IRUGO },
+ .show = queue_wb_stats_show,
+};
+static struct queue_sysfs_entry queue_wb_cache_delay_entry = {
+ .attr = {.name = "wb_cache_usecs", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_cache_delay_show,
+ .store = queue_wb_cache_delay_store,
+};
+static struct queue_sysfs_entry queue_wb_depth_entry = {
+ .attr = {.name = "wb_depth", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_depth_show,
+ .store = queue_wb_depth_store,
+};
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
&queue_random_entry.attr,
&queue_poll_entry.attr,
&queue_wc_entry.attr,
+ &queue_wb_stats_entry.attr,
+ &queue_wb_cache_delay_entry.attr,
+ &queue_wb_depth_entry.attr,
NULL,
};
--- /dev/null
+/*
+ * buffered writeback throttling
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ * Things that need changing:
+ *
+ * - Auto-detection of most of this, no tunables. Cache type we can get,
+ * and most other settings we can tweak/gather based on time.
+ * - Better solution for rwb->bdp_wait?
+ * - Higher depth for WB_SYNC_ALL?
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+
+#include "blk.h"
+#include "blk-wb.h"
+
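+/*
+ * Drop one tracked write from the in-flight count. On devices with a
+ * write back cache, if only background writeback is running and other
+ * IO completed within the cache delay window, defer the wakeup via the
+ * timer; otherwise wake one waiter once a full batch of slots has been
+ * freed up.
+ */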
+void __blk_buffered_writeback_done(struct rq_wb *rwb)
+{
+ int inflight;
+
+ inflight = atomic_dec_return(&rwb->inflight);
+ if (inflight >= rwb->limit)
+ return;
+
+ /*
+ * Even at a low depth we can flood a device that does write back
+ * caching. If the cache is enabled, delay a bit before allowing
+ * the next submission, as long as we are still purely background
+ * activity.
+ */
+ if (test_bit(QUEUE_FLAG_WC, &rwb->q->queue_flags) && !*rwb->bdp_wait &&
+ time_before(jiffies, rwb->last_comp + rwb->cache_delay)) {
+ if (!timer_pending(&rwb->timer))
+ mod_timer(&rwb->timer, jiffies + rwb->cache_delay);
+ return;
+ }
+
+ if (waitqueue_active(&rwb->wait)) {
+ int diff = rwb->limit - inflight;
+
+ if (diff >= rwb->batch)
+ wake_up_nr(&rwb->wait, 1);
+ }
+}
+
+/*
+ * Called on completion of a request. Note that it is also called when
+ * a request is merged, at the point where the merged request gets freed.
+ */
+void blk_buffered_writeback_done(struct rq_wb *rwb, struct request *rq)
+{
+ if (!(rq->cmd_flags & REQ_BUF_INFLIGHT)) {
+ const unsigned long cur = jiffies;
+
+ if (rwb->limit && cur != rwb->last_comp)
+ rwb->last_comp = cur;
+ } else {
+ __blk_buffered_writeback_done(rwb);
+ }
+}
+
+/*
+ * Atomically increment 'v' if it is below 'below'. Returns true if the
+ * increment happened, false if 'v' + 1 would exceed 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, int below)
+{
+ int cur = atomic_read(v);
+
+ for (;;) {
+ int old;
+
+ if (cur >= below)
+ return false;
+ old = atomic_cmpxchg(v, cur, cur + 1);
+ if (old == cur)
+ break;
+ cur = old;
+ }
+
+ return true;
+}
+
+/*
+ * Block if we will exceed our limit, or if we are currently waiting for
+ * the timer to kick off queuing again.
+ */
+static void __blk_buffered_writeback_wait(struct rq_wb *rwb, unsigned int limit,
+ spinlock_t *lock)
+{
+ DEFINE_WAIT(wait);
+
+ if (!timer_pending(&rwb->timer) &&
+ atomic_inc_below(&rwb->inflight, limit))
+ return;
+
+ do {
+ prepare_to_wait_exclusive(&rwb->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (!timer_pending(&rwb->timer) &&
+ atomic_inc_below(&rwb->inflight, limit))
+ break;
+
+ if (lock)
+ spin_unlock_irq(lock);
+
+ io_schedule();
+
+ if (lock)
+ spin_lock_irq(lock);
+ } while (1);
+
+ finish_wait(&rwb->wait, &wait);
+}
+
+/*
+ * Returns true if the IO request should be accounted, false if not.
+ * May sleep if we have exceeded the writeback limits. The caller can
+ * pass in a spinlock held with irqs disabled, if it holds one when
+ * calling this function; if we do sleep, we release and re-grab it
+ * around the sleep.
+ */
+bool blk_buffered_writeback_wait(struct rq_wb *rwb, struct bio *bio,
+ spinlock_t *lock)
+{
+ unsigned int limit;
+
+ /*
+ * If throttling is disabled, if this isn't a write, or if it's a
+ * discard, do nothing.
+ */
+ if (!rwb->limit || !(bio->bi_rw & REQ_WRITE) ||
+ (bio->bi_rw & REQ_DISCARD))
+ return false;
+
+ /*
+ * Don't throttle WRITE_ODIRECT
+ */
+ if ((bio->bi_rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC)
+ return false;
+
+ /*
+ * At this point we know it's a buffered write. If REQ_SYNC is
+ * set, then it's WB_SYNC_ALL writeback. Bump the limit 4x for
+ * those, since someone is (or will be) waiting on that.
+ */
+ limit = rwb->limit;
+ if (bio->bi_rw & REQ_SYNC)
+ limit <<= 2;
+ else if (limit != 1) {
+ /*
+ * If less than 100ms since we completed unrelated IO,
+ * limit us to a depth of 1 for background writeback.
+ */
+ if (time_before(jiffies, rwb->last_comp + HZ / 10))
+ limit = 1;
+ else if (!*rwb->bdp_wait)
+ limit >>= 1;
+ }
+
+ __blk_buffered_writeback_wait(rwb, limit, lock);
+ return true;
+}
+
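+/*
+ * Install a new depth limit and derive the wakeup batch from it: half
+ * the limit, clamped to at least 1 (for a non-zero limit) and at most
+ * 4. Waiters are woken so the new limit takes effect immediately.
+ */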
+void blk_update_wb_limit(struct rq_wb *rwb, unsigned int limit)
+{
+ rwb->limit = limit;
+ rwb->batch = rwb->limit / 2;
+ if (!rwb->batch && rwb->limit)
+ rwb->batch = 1;
+ else if (rwb->batch > 4)
+ rwb->batch = 4;
+
+ wake_up_all(&rwb->wait);
+}
+
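+/*
+ * Cache delay timer: once the delay window has passed, let a waiting
+ * task resume issuing background writeback.
+ */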
+static void blk_buffered_writeback_timer(unsigned long data)
+{
+ struct rq_wb *rwb = (struct rq_wb *) data;
+
+ if (waitqueue_active(&rwb->wait))
+ wake_up_nr(&rwb->wait, 1);
+}
+
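+/* Defaults: at most 4 tracked writes in flight, 10 msec cache delay */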
+#define DEF_WB_LIMIT 4
+#define DEF_WB_CACHE_DELAY 10000
+
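+/*
+ * Allocate and attach throttling state to a queue. Called from both
+ * the legacy and blk-mq queue init paths, with
+ * blk_buffered_writeback_exit() as the teardown counterpart.
+ */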
+int blk_buffered_writeback_init(struct request_queue *q)
+{
+ struct rq_wb *rwb;
+
+ rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+ if (!rwb)
+ return -ENOMEM;
+
+ atomic_set(&rwb->inflight, 0);
+ init_waitqueue_head(&rwb->wait);
+ rwb->last_comp = jiffies;
+ rwb->bdp_wait = &q->backing_dev_info.wb.dirty_sleeping;
+ setup_timer(&rwb->timer, blk_buffered_writeback_timer,
+ (unsigned long) rwb);
+ rwb->cache_delay_usecs = DEF_WB_CACHE_DELAY;
+ rwb->cache_delay = usecs_to_jiffies(rwb->cache_delay_usecs);
+ rwb->q = q;
+ blk_update_wb_limit(rwb, DEF_WB_LIMIT);
+ q->rq_wb = rwb;
+ return 0;
+}
+
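+/* Tear down the throttling state for a queue; safe if none was set up */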
+void blk_buffered_writeback_exit(struct request_queue *q)
+{
+ if (q->rq_wb)
+ del_timer_sync(&q->rq_wb->timer);
+
+ kfree(q->rq_wb);
+ q->rq_wb = NULL;
+}
--- /dev/null
+#ifndef BLK_WB_H
+#define BLK_WB_H
+
+#include <linux/atomic.h>
+#include <linux/wait.h>
+
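+/* Per-queue state for buffered writeback throttling */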
+struct rq_wb {
+ unsigned int limit; /* max tracked writes in flight */
+ unsigned int batch; /* free slots needed before waking a waiter */
+ unsigned int cache_delay; /* post-completion delay, in jiffies */
+ unsigned int cache_delay_usecs; /* same, as set via sysfs, in usecs */
+ unsigned long last_comp; /* when unrelated IO last completed */
+ unsigned int *bdp_wait; /* set while balance_dirty_pages() waits */
+ struct request_queue *q; /* owning queue */
+ atomic_t inflight; /* tracked writes currently in flight */
+ wait_queue_head_t wait; /* throttled submitters */
+ struct timer_list timer; /* delayed wakeup for caching devices */
+};
+
+void __blk_buffered_writeback_done(struct rq_wb *);
+void blk_buffered_writeback_done(struct rq_wb *, struct request *);
+bool blk_buffered_writeback_wait(struct rq_wb *, struct bio *, spinlock_t *);
+int blk_buffered_writeback_init(struct request_queue *);
+void blk_buffered_writeback_exit(struct request_queue *);
+void blk_update_wb_limit(struct rq_wb *, unsigned int);
+
+#endif
__REQ_PM, /* runtime pm request */
__REQ_HASHED, /* on IO scheduler merge hash */
__REQ_MQ_INFLIGHT, /* track inflight for MQ */
+ __REQ_BUF_INFLIGHT, /* track inflight for buffered writeback */
__REQ_NR_BITS, /* stops here */
};
#define REQ_PM (1ULL << __REQ_PM)
#define REQ_HASHED (1ULL << __REQ_HASHED)
#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT)
+#define REQ_BUF_INFLIGHT (1ULL << __REQ_BUF_INFLIGHT)
typedef unsigned int blk_qc_t;
#define BLK_QC_T_NONE -1U
struct blkcg_gq;
struct blk_flush_queue;
struct pr_ops;
+struct rq_wb;
#define BLKDEV_MIN_RQ 4
#define BLKDEV_MAX_RQ 128 /* Default maximum */
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
+ struct rq_wb *rq_wb;
+
/*
* If blkcg is not used, @q->root_rl serves all requests. If blkcg
* is used, root blkg allocates from @q->root_rl and all other