author     Jens Axboe <axboe@kernel.dk>  2017-10-25 14:47:00 -0600
committer  Jens Axboe <axboe@kernel.dk>  2017-10-25 14:47:00 -0600
commit     374e800f2140e66802400343211a627810053974 (patch)
tree       95cda22eeb1d25b05843c2ed032c3b923fbe17c5
parent     16f1477ebeeef4170f1f16eb66ad565daecdc7a8 (diff)
blk-wbt: account and throttle O_DIRECT (wbt-odirect)
Before this patch, we had two buckets - one for kswapd writes and one
for normal buffered writes. The two bucket approach was introduced to
avoid starvation between the two different types of writes. Before
that, we treated all non-direct writes identically.

This patch adds a third bucket, for O_DIRECT writes. In mixed
workloads with lots of both buffered and O_DIRECT writes, the
O_DIRECT writes can unnecessarily starve the buffered writes.
Account and throttle O_DIRECT writes independently.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
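As context for the diff below, here is a minimal user-space sketch of the
three-bucket selection this patch introduces. The WBT_RWQ_* names mirror
the patch; the *_sim types and the get_bucket() helper are simplified
stand-ins invented for illustration, not kernel code:

	/* Illustrative user-space sketch, not kernel code */
	#include <stdbool.h>
	#include <stdio.h>

	enum {
		WBT_RWQ_BUFFERED = 0,	/* normal buffered writeback */
		WBT_RWQ_KSWAPD,		/* writeback issued by kswapd */
		WBT_RWQ_ODIRECT,	/* O_DIRECT writes */
		WBT_NUM_RWQ,
	};

	struct rq_wait_sim {
		int inflight;		/* stand-in for atomic_t inflight */
	};

	struct rq_wb_sim {
		struct rq_wait_sim rq_wait[WBT_NUM_RWQ];
	};

	/* Same selection logic as the patched get_rq_wait() */
	static struct rq_wait_sim *get_bucket(struct rq_wb_sim *rwb,
					      bool is_kswapd, bool is_odirect)
	{
		if (is_kswapd)
			return &rwb->rq_wait[WBT_RWQ_KSWAPD];
		else if (is_odirect)
			return &rwb->rq_wait[WBT_RWQ_ODIRECT];

		return &rwb->rq_wait[WBT_RWQ_BUFFERED];
	}

	int main(void)
	{
		struct rq_wb_sim rwb = { 0 };

		/* An O_DIRECT write is charged to its own bucket ... */
		get_bucket(&rwb, false, true)->inflight++;
		/* ... so it no longer competes with buffered writeback */
		get_bucket(&rwb, false, false)->inflight++;

		printf("buffered=%d kswapd=%d odirect=%d\n",
		       rwb.rq_wait[WBT_RWQ_BUFFERED].inflight,
		       rwb.rq_wait[WBT_RWQ_KSWAPD].inflight,
		       rwb.rq_wait[WBT_RWQ_ODIRECT].inflight);
		return 0;
	}

Note the ordering: the kswapd check takes precedence over the O_DIRECT
check, matching the patched get_rq_wait() below.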
-rw-r--r--  block/blk-wbt.c  62
-rw-r--r--  block/blk-wbt.h  20
2 files changed, 48 insertions, 34 deletions
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 2512442873d4..990aa0700b31 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -15,7 +15,7 @@
* positive scaling steps where we shrink the monitoring window, a negative
* scaling step retains the default step==0 window size.
*
- * Copyright (C) 2016 Jens Axboe
+ * Copyright (C) 2016-2017 Jens Axboe
*
*/
#include <linux/kernel.h>
@@ -101,9 +101,20 @@ static bool wb_recent_wait(struct rq_wb *rwb)
return time_before(jiffies, wb->dirty_sleep + HZ);
}
-static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+static bool rw_is_odirect_write(unsigned int rw)
{
- return &rwb->rq_wait[is_kswapd];
+ return (rw & REQ_OP_FLAGS_ODIRECT) == REQ_OP_FLAGS_ODIRECT;
+}
+
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd,
+ bool is_odirect)
+{
+ if (is_kswapd)
+ return &rwb->rq_wait[WBT_RWQ_KSWAPD];
+ else if (is_odirect)
+ return &rwb->rq_wait[WBT_RWQ_ODIRECT];
+
+ return &rwb->rq_wait[WBT_RWQ_BUFFERED];
}
static void rwb_wake_all(struct rq_wb *rwb)
@@ -126,7 +137,7 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
if (!(wb_acct & WBT_TRACKED))
return;
- rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
+ rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD, wb_acct & WBT_ODIRECT);
inflight = atomic_dec_return(&rqw->inflight);
/*
@@ -140,7 +151,9 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
/*
* If the device does write back caching, drop further down
- * before we wake people up.
+ * before we wake people up. The exception is if we recently
+ * waited in balance_dirty_pages() - if that's the case, use
+ * the normal wake limit to boost throughput a bit.
*/
if (rwb->wc && !wb_recent_wait(rwb))
limit = 0;
@@ -502,9 +515,9 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
unsigned int limit;
/*
- * At this point we know it's a buffered write. If this is
- * kswapd trying to free memory, or REQ_SYNC is set, then
- * it's WB_SYNC_ALL writeback, and we'll use the max limit for
+ * At this point we know it's a write that we should throttle.
+ * If this is kswapd trying to free memory, or REQ_SYNC is set,
+ * then it's WB_SYNC_ALL writeback, and we'll use the max limit for
* that. If the write is marked as a background write, then use
* the idle limit, or go to normal if we haven't had competing
* IO for a bit.
@@ -555,9 +568,10 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
__releases(lock)
__acquires(lock)
{
- struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
+ struct rq_wait *rqw;
DEFINE_WAIT(wait);
+ rqw = get_rq_wait(rwb, current_is_kswapd(), rw_is_odirect_write(rw));
if (may_queue(rwb, rqw, &wait, rw))
return;
@@ -579,23 +593,12 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
finish_wait(&rqw->wait, &wait);
}
-static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
+static inline bool wbt_should_throttle(struct bio *bio)
{
- const int op = bio_op(bio);
-
/*
- * If not a WRITE, do nothing
+ * Throttle all writes - we'll bucketize them appropriately later
*/
- if (op != REQ_OP_WRITE)
- return false;
-
- /*
- * Don't throttle WRITE_ODIRECT
- */
- if ((bio->bi_opf & REQ_OP_FLAGS_ODIRECT) == REQ_OP_FLAGS_ODIRECT)
- return false;
-
- return true;
+ return bio_op(bio) == REQ_OP_WRITE;
}
/*
@@ -606,18 +609,15 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
*/
enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
{
- unsigned int ret = 0;
+ enum wbt_flags ret = 0;
if (!rwb_enabled(rwb))
return 0;
- if (bio_op(bio) == REQ_OP_READ)
- ret = WBT_READ;
-
- if (!wbt_should_throttle(rwb, bio)) {
- if (ret & WBT_READ)
+ if (!wbt_should_throttle(bio)) {
+ if (bio_op(bio) == REQ_OP_READ)
wb_timestamp(rwb, &rwb->last_issue);
- return ret;
+ return 0;
}
__wbt_wait(rwb, bio->bi_opf, lock);
@@ -627,6 +627,8 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
if (current_is_kswapd())
ret |= WBT_KSWAPD;
+ else if (rw_is_odirect_write(bio->bi_opf))
+ ret |= WBT_ODIRECT;
return ret | WBT_TRACKED;
}
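The other half of the change is the flag round-trip: wbt_wait() tags each
throttled write with WBT_TRACKED plus at most one of WBT_KSWAPD or
WBT_ODIRECT, and __wbt_done() decodes those same bits to drain the matching
bucket. A stand-alone sketch of that encode/decode pairing - the
encode_write()/decode_bucket() helper names are illustrative, not from the
patch:

	/* Illustrative user-space sketch, not kernel code */
	#include <assert.h>
	#include <stdbool.h>

	/* Flag bits, mirroring the patch's reworked enum wbt_flags */
	enum wbt_flags {
		WBT_TRACKED = 1,	/* write, tracked for throttling */
		WBT_KSWAPD = 2,
		WBT_ODIRECT = 4,
	};

	/* Encode: what the patched wbt_wait() returns for a throttled write */
	static enum wbt_flags encode_write(bool is_kswapd, bool is_odirect)
	{
		enum wbt_flags ret = 0;

		if (is_kswapd)
			ret |= WBT_KSWAPD;
		else if (is_odirect)
			ret |= WBT_ODIRECT;

		return ret | WBT_TRACKED;
	}

	/* Decode: the bucket index __wbt_done() derives from the same bits */
	static int decode_bucket(enum wbt_flags wb_acct)
	{
		if (wb_acct & WBT_KSWAPD)
			return 1;	/* WBT_RWQ_KSWAPD */
		else if (wb_acct & WBT_ODIRECT)
			return 2;	/* WBT_RWQ_ODIRECT */

		return 0;		/* WBT_RWQ_BUFFERED */
	}

	int main(void)
	{
		/* Completion drains the same bucket the submission charged */
		assert(decode_bucket(encode_write(false, false)) == 0);
		assert(decode_bucket(encode_write(true, false)) == 1);
		assert(decode_bucket(encode_write(false, true)) == 2);
		return 0;
	}

Reads, by contrast, carry no flags at all (wbt_wait() returns 0 for them),
which is why the reworked wbt_is_read() in the blk-wbt.h hunk below can
simply compare the stored flag value against zero.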
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index e7c086374e20..a6470175ac9e 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -9,16 +9,28 @@
#include "blk-stat.h"
+/*
+ * First bit is tracked or not, the next two bits are a value shifted up
+ */
enum wbt_flags {
WBT_TRACKED = 1, /* write, tracked for throttling */
- WBT_READ = 2, /* read */
- WBT_KSWAPD = 4, /* write, from kswapd */
+ WBT_KSWAPD = 2,
+ WBT_ODIRECT = 4,
WBT_NR_BITS = 3, /* number of bits */
};
+/*
+ * We have three buckets for accounting and waiting - one for buffered
+ * IO, one for kswapd, and one for O_DIRECT writes. This helps us ensure
+ * fairness between different types of writes, without starving one of
+ * them unnecessarily.
+ */
enum {
- WBT_NUM_RWQ = 2,
+ WBT_RWQ_BUFFERED = 0,
+ WBT_RWQ_KSWAPD,
+ WBT_RWQ_ODIRECT,
+ WBT_NUM_RWQ,
};
/*
@@ -52,7 +64,7 @@ static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
static inline bool wbt_is_read(struct blk_issue_stat *stat)
{
- return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ;
+ return (stat->stat >> BLK_STAT_RES_SHIFT) == 0;
}
struct rq_wait {