obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-lib.o blk-mq.o blk-mq-tag.o \
+ blk-lib.o blk-mq.o blk-mq-tag.o blk-wb.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
#include "blk.h"
#include "blk-mq.h"
+#include "blk-wb.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
*/
blk_queue_make_request(q, blk_queue_bio);
+ if (blk_wb_init(q))
+ goto fail;
+
q->sg_reserved_size = INT_MAX;
/* Protect q->elevator from elevator_change */
fail:
blk_free_flush_queue(q->fq);
+ blk_wb_exit(q);
return NULL;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
/* this is a bio leak */
WARN_ON(req->bio != NULL);
+ blk_wb_done(q->rq_wb, req);
+
/*
* Request may not have originated from ll_rw_blk. if not,
* it didn't come out of our reserved rq pools
int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
+ bool wb_acct;
/*
* low level driver can indicate that it wants pages above a
}
get_rq:
+ wb_acct = blk_wb_wait(q->rq_wb, bio, q->queue_lock);
+
/*
* This sync check and mask will be re-done in init_request_from_bio(),
* but we need to set it earlier to expose the sync flag to the
*/
req = get_request(q, rw_flags, bio, GFP_NOIO);
if (IS_ERR(req)) {
+ if (wb_acct)
+ __blk_wb_done(q->rq_wb);
bio->bi_error = PTR_ERR(req);
bio_endio(bio);
goto out_unlock;
}
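+ /* flag the request so blk_wb_done() drops the inflight count */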
+ if (wb_acct)
+ req->cmd_flags |= REQ_BUF_INFLIGHT;
+
/*
* After dropping the lock and possibly sleeping here, our request
* may now be mergeable after it had proven unmergeable (above).
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-wb.h"
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
if (rq->cmd_flags & REQ_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
+
+ blk_wb_done(q->rq_wb, rq);
+
rq->cmd_flags = 0;
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
+ bool wb_acct;
blk_queue_bounce(q, &bio);
} else
request_count = blk_plug_queued_count(q);
+ wb_acct = blk_wb_wait(q->rq_wb, bio, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ if (wb_acct)
+ __blk_wb_done(q->rq_wb);
return BLK_QC_T_NONE;
+ }
+
+ if (wb_acct)
+ rq->cmd_flags |= REQ_BUF_INFLIGHT;
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
struct blk_map_ctx data;
struct request *rq;
blk_qc_t cookie;
+ bool wb_acct;
blk_queue_bounce(q, &bio);
blk_attempt_plug_merge(q, bio, &request_count, NULL))
return BLK_QC_T_NONE;
+ wb_acct = blk_wb_wait(q->rq_wb, bio, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ if (wb_acct)
+ __blk_wb_done(q->rq_wb);
return BLK_QC_T_NONE;
+ }
+
+ if (wb_acct)
+ rq->cmd_flags |= REQ_BUF_INFLIGHT;
cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
*/
q->nr_requests = set->queue_depth;
+ if (blk_wb_init(q))
+ goto err_hctxs;
+
if (set->ops->complete)
blk_queue_softirq_done(q, set->ops->complete);
list_del_init(&q->all_q_node);
mutex_unlock(&all_q_mutex);
+ blk_wb_exit(q);
+
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
#include <linux/gfp.h>
#include "blk.h"
+#include "blk-wb.h"
unsigned long blk_max_low_pfn;
EXPORT_SYMBOL(blk_max_low_pfn);
void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
{
q->queue_depth = depth;
+ if (q->rq_wb)
+ blk_wb_update_limits(q->rq_wb, depth);
}
EXPORT_SYMBOL(blk_set_queue_depth);
#include "blk.h"
#include "blk-mq.h"
+#include "blk-wb.h"
struct queue_sysfs_entry {
struct attribute attr;
return ret;
}
+static ssize_t queue_wb_stats_show(struct request_queue *q, char *page)
+{
+ struct rq_wb *wb = q->rq_wb;
+
+ if (!wb)
+ return -EINVAL;
+
+ return sprintf(page, "idle=%d, normal=%d, max=%d, inflight=%d, wait=%d,"
+ " timer=%d, bdp_wait=%d\n", wb->wb_idle,
+ wb->wb_normal, wb->wb_max,
+ atomic_read(&wb->inflight),
+ waitqueue_active(&wb->wait),
+ timer_pending(&wb->timer),
+ *wb->bdp_wait);
+}
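+
+/*
+ * Example wb_stats output with the defaults on an idle queue whose
+ * depth is 64 or more:
+ *
+ *   idle=8, normal=16, max=32, inflight=0, wait=0, timer=0, bdp_wait=0
+ */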
+
+static ssize_t queue_wb_perc_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return queue_var_show(q->rq_wb->perc, page);
+}
+
+static ssize_t queue_wb_perc_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ unsigned long perc;
+ ssize_t ret;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ ret = queue_var_store(&perc, page, count);
+ if (ret < 0)
+ return ret;
+ if (perc > 100)
+ return -EINVAL;
+
+ q->rq_wb->perc = perc;
+ blk_wb_update_limits(q->rq_wb, blk_queue_depth(q));
+ return ret;
+}
+
+static ssize_t queue_wb_cache_delay_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return queue_var_show(q->rq_wb->cache_delay_usecs, page);
+}
+
+static ssize_t queue_wb_cache_delay_store(struct request_queue *q,
+ const char *page, size_t count)
+{
+ unsigned long var;
+ ssize_t ret;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ ret = queue_var_store(&var, page, count);
+ if (ret < 0)
+ return ret;
+
+ q->rq_wb->cache_delay_usecs = var;
+ q->rq_wb->cache_delay = usecs_to_jiffies(var);
+ return ret;
+}
+
static ssize_t queue_wc_show(struct request_queue *q, char *page)
{
if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
.store = queue_wc_store,
};
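+
+/*
+ * Writeback throttling exposes three attributes under
+ * /sys/block/<dev>/queue/: wb_stats (read-only) dumps the current
+ * limits, inflight count and wait/timer state; wb_percent sets the
+ * share of the queue depth usable by buffered writeback (0 disables
+ * throttling); wb_cache_usecs sets how long background writeback is
+ * held off after unrelated IO on devices with a write cache.
+ */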
+static struct queue_sysfs_entry queue_wb_stats_entry = {
+ .attr = {.name = "wb_stats", .mode = S_IRUGO },
+ .show = queue_wb_stats_show,
+};
+static struct queue_sysfs_entry queue_wb_cache_delay_entry = {
+ .attr = {.name = "wb_cache_usecs", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_cache_delay_show,
+ .store = queue_wb_cache_delay_store,
+};
+static struct queue_sysfs_entry queue_wb_perc_entry = {
+ .attr = {.name = "wb_percent", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_perc_show,
+ .store = queue_wb_perc_store,
+};
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
&queue_random_entry.attr,
&queue_poll_entry.attr,
&queue_wc_entry.attr,
+ &queue_wb_stats_entry.attr,
+ &queue_wb_cache_delay_entry.attr,
+ &queue_wb_perc_entry.attr,
NULL,
};
--- /dev/null
+/*
+ * buffered writeback throttling
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ * Things that need changing:
+ *
+ * - Auto-detection of optimal wb_percent setting. A lower setting
+ * is appropriate on rotating storage (wb_percent=15 gives good
+ * separation, while still getting full bandwidth with wb cache).
+ *
+ */
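+
+/*
+ * In short: blk_wb_wait() runs before a request is allocated; for a
+ * buffered write it sleeps until the number of tracked writes drops
+ * below a limit derived from the queue depth (wb_percent of it for
+ * WB_SYNC_ALL/bdp writeback, half of that normally, a quarter right
+ * after unrelated IO has completed). Tracked requests are flagged
+ * REQ_BUF_INFLIGHT; their completion goes through blk_wb_done(),
+ * which drops the inflight count and wakes the next waiter. On
+ * devices with a write cache, that wakeup may be deferred by
+ * cache_delay while writeback is purely background activity.
+ */
+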
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+
+#include "blk.h"
+#include "blk-wb.h"
+
+static inline bool rwb_enabled(struct rq_wb *rwb)
+{
+ return rwb->wb_normal != 0;
+}
+
+void __blk_wb_done(struct rq_wb *rwb)
+{
+ int inflight, limit = rwb->wb_normal;
+
+ inflight = atomic_dec_return(&rwb->inflight);
+ if (inflight >= limit)
+ return;
+
+ /*
+ * If the device has a write cache, we can still flood it with IO
+ * even at a low queue depth. So if caching is on, hold off waking
+ * the next waiter for a bit, as long as we're still purely
+ * background activity.
+ */
+ if (test_bit(QUEUE_FLAG_WC, &rwb->q->queue_flags) && !*rwb->bdp_wait &&
+ time_before(jiffies, rwb->last_comp + rwb->cache_delay)) {
+ if (!timer_pending(&rwb->timer))
+ mod_timer(&rwb->timer, jiffies + rwb->cache_delay);
+ return;
+ }
+
+ if (waitqueue_active(&rwb->wait)) {
+ int diff = limit - inflight;
+
+ if (diff >= rwb->wb_idle / 2)
+ wake_up_nr(&rwb->wait, 1);
+ }
+}
+
+/*
+ * Called on completion of a request. Note that it's also called when
+ * a request is merged into another and then freed without completing.
+ */
+void blk_wb_done(struct rq_wb *rwb, struct request *rq)
+{
+ if (!(rq->cmd_flags & REQ_BUF_INFLIGHT)) {
+ if (rwb_enabled(rwb)) {
+ const unsigned long cur = jiffies;
+
+ if (cur != rwb->last_comp)
+ rwb->last_comp = cur;
+ }
+ } else
+ __blk_wb_done(rwb);
+}
+
+/*
+ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
+ * false if 'v' + 1 would be bigger than 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, int below)
+{
+ int cur = atomic_read(v);
+
+ for (;;) {
+ int old;
+
+ if (cur >= below)
+ return false;
+ old = atomic_cmpxchg(v, cur, cur + 1);
+ if (old == cur)
+ break;
+ cur = old;
+ }
+
+ return true;
+}
+
+static inline unsigned int get_limit(struct rq_wb *rwb, unsigned int rw)
+{
+ unsigned int limit;
+
+ /*
+ * At this point we know it's a buffered write. If REQ_SYNC is set,
+ * then it's WB_SYNC_ALL writeback, so allow the full wb_max limit
+ * (roughly 4x the idle limit), since someone is (or will be) waiting
+ * on that. The same goes for writes issued while a task is throttled
+ * in balance_dirty_pages().
+ */
+ if ((rw & REQ_SYNC) || *rwb->bdp_wait)
+ limit = rwb->wb_max;
+ else if (time_before(jiffies, rwb->last_comp + HZ / 10)) {
+ /*
+ * If it's been less than 100ms since we completed unrelated IO,
+ * limit background writeback to the idle depth.
+ */
+ limit = rwb->wb_idle;
+ } else
+ limit = rwb->wb_normal;
+
+ return limit;
+}
+
+/*
+ * Block if we will exceed our limit, or if we are currently waiting for
+ * the timer to kick off queuing again.
+ */
+static void __blk_wb_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
+{
+ DEFINE_WAIT(wait);
+
+ if (!timer_pending(&rwb->timer) &&
+ atomic_inc_below(&rwb->inflight, get_limit(rwb, rw)))
+ return;
+
+ do {
+ prepare_to_wait_exclusive(&rwb->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (!timer_pending(&rwb->timer) &&
+ atomic_inc_below(&rwb->inflight, get_limit(rwb, rw)))
+ break;
+
+ if (lock)
+ spin_unlock_irq(lock);
+
+ io_schedule();
+
+ if (lock)
+ spin_lock_irq(lock);
+ } while (1);
+
+ finish_wait(&rwb->wait, &wait);
+}
+
+/*
+ * Returns true if the IO request should be accounted, false if not.
+ * May sleep if we have exceeded the writeback limits. The caller may
+ * pass in the spinlock it holds (taken with irqs disabled); if we have
+ * to sleep, the lock is released and re-acquired around the sleep.
+ */
+bool blk_wb_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
+{
+ /*
+ * If throttling is disabled, if this isn't a write, or if it's a
+ * discard, do nothing.
+ */
+ if (!rwb_enabled(rwb) || !(bio->bi_rw & REQ_WRITE) ||
+ (bio->bi_rw & REQ_DISCARD))
+ return false;
+
+ /*
+ * Don't throttle WRITE_ODIRECT
+ */
+ if ((bio->bi_rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC)
+ return false;
+
+ __blk_wb_wait(rwb, bio->bi_rw, lock);
+ return true;
+}
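+
+/*
+ * Note for callers of blk_wb_wait(): a true return means the IO is
+ * being tracked. The allocated request must be flagged with
+ * REQ_BUF_INFLIGHT, or __blk_wb_done() must be called directly if no
+ * request could be allocated.
+ */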
+
+static void calc_wb_limits(struct rq_wb *rwb, unsigned int depth,
+ unsigned int perc)
+{
+ /*
+ * Cap the depth at 64; that's a reasonable upper limit that should
+ * still achieve full device bandwidth anywhere.
+ */
+ depth = min(64U, depth);
+
+ /*
+ * Full-performance writes get at most 'perc' percent of the depth
+ */
+ rwb->wb_max = (perc * depth + 1) / 100;
+ if (!rwb->wb_max && perc)
+ rwb->wb_max = 1;
+ rwb->wb_normal = (rwb->wb_max + 1) / 2;
+ rwb->wb_idle = (rwb->wb_max + 3) / 4;
+}
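+
+/*
+ * Example for calc_wb_limits(): with the default wb_percent of 50 and
+ * an effective depth of 64 (the cap), wb_max = 32, wb_normal = 16 and
+ * wb_idle = 8.
+ */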
+
+void blk_wb_update_limits(struct rq_wb *rwb, unsigned int depth)
+{
+ calc_wb_limits(rwb, depth, rwb->perc);
+ wake_up_all(&rwb->wait);
+}
+
+static void blk_wb_timer(unsigned long data)
+{
+ struct rq_wb *rwb = (struct rq_wb *) data;
+
+ if (waitqueue_active(&rwb->wait))
+ wake_up_nr(&rwb->wait, 1);
+}
+
+#define DEF_WB_PERC 50
+#define DEF_WB_CACHE_DELAY 10000
+
+int blk_wb_init(struct request_queue *q)
+{
+ struct rq_wb *rwb;
+
+ rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+ if (!rwb)
+ return -ENOMEM;
+
+ atomic_set(&rwb->inflight, 0);
+ init_waitqueue_head(&rwb->wait);
+ rwb->last_comp = jiffies;
+ rwb->bdp_wait = &q->backing_dev_info.wb.dirty_sleeping;
+ setup_timer(&rwb->timer, blk_wb_timer, (unsigned long) rwb);
+ rwb->perc = DEF_WB_PERC;
+ rwb->cache_delay_usecs = DEF_WB_CACHE_DELAY;
+ rwb->cache_delay = usecs_to_jiffies(rwb->cache_delay_usecs);
+ rwb->q = q;
+ blk_wb_update_limits(rwb, blk_queue_depth(q));
+ q->rq_wb = rwb;
+ return 0;
+}
+
+void blk_wb_exit(struct request_queue *q)
+{
+ if (q->rq_wb)
+ del_timer_sync(&q->rq_wb->timer);
+
+ kfree(q->rq_wb);
+ q->rq_wb = NULL;
+}
--- /dev/null
+#ifndef BLK_WB_H
+#define BLK_WB_H
+
+#include <linux/atomic.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+
+struct rq_wb {
+ /*
+ * Settings that govern how we throttle
+ */
+ unsigned int perc; /* percentage of queue depth to use (sysfs input) */
+ unsigned int wb_idle; /* idle/background writeback limit */
+ unsigned int wb_normal; /* normal writeback limit */
+ unsigned int wb_max; /* max throughput writeback limit */
+
+ unsigned int cache_delay; /* write cache hold-off, in jiffies */
+ unsigned int cache_delay_usecs; /* same, in usecs (sysfs input) */
+ unsigned long last_comp; /* jiffies of last untracked completion */
+ unsigned int *bdp_wait; /* set while a task sleeps in balance_dirty_pages() */
+ struct request_queue *q; /* owning queue */
+ atomic_t inflight; /* tracked writes in flight */
+ wait_queue_head_t wait; /* throttled submitters wait here */
+ struct timer_list timer; /* write cache hold-off timer */
+};
+
+void __blk_wb_done(struct rq_wb *);
+void blk_wb_done(struct rq_wb *, struct request *);
+bool blk_wb_wait(struct rq_wb *, struct bio *, spinlock_t *);
+int blk_wb_init(struct request_queue *);
+void blk_wb_exit(struct request_queue *);
+void blk_wb_update_limits(struct rq_wb *, unsigned int);
+
+#endif
__REQ_PM, /* runtime pm request */
__REQ_HASHED, /* on IO scheduler merge hash */
__REQ_MQ_INFLIGHT, /* track inflight for MQ */
+ __REQ_BUF_INFLIGHT, /* track inflight for buffered */
__REQ_NR_BITS, /* stops here */
};
#define REQ_PM (1ULL << __REQ_PM)
#define REQ_HASHED (1ULL << __REQ_HASHED)
#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT)
+#define REQ_BUF_INFLIGHT (1ULL << __REQ_BUF_INFLIGHT)
typedef unsigned int blk_qc_t;
#define BLK_QC_T_NONE -1U
struct blkcg_gq;
struct blk_flush_queue;
struct pr_ops;
+struct rq_wb;
#define BLKDEV_MIN_RQ 4
#define BLKDEV_MAX_RQ 128 /* Default maximum */
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
+ struct rq_wb *rq_wb;
+
/*
* If blkcg is not used, @q->root_rl serves all requests. If blkcg
* is used, root blkg allocates from @q->root_rl and all other