diff options
authorPaolo Valente <>2017-12-20 12:38:33 +0100
committerJens Axboe <>2018-01-05 09:26:09 -0700
commit7b8fa3b900a087bc03b11329a92398fde563ba37 (patch)
parent1be6e8a964ee9aa8d4daac523ce29e5f486dd756 (diff)
block, bfq: let a queue be merged only shortly after starting I/O
In BFQ and CFQ, two processes are said to be cooperating if they do I/O in such a way that the union of their I/O requests yields a sequential I/O pattern. To get such a sequential I/O pattern out of the non-sequential pattern of each cooperating process, BFQ and CFQ merge the queues associated with these processes. In more detail, cooperating processes, and thus their associated queues, usually start, or restart, to do I/O shortly after each other. This is the case, e.g., for the I/O threads of KVM/QEMU and of the dump utility. Basing on this assumption, this commit allows a bfq_queue to be merged only during a short time interval (100ms) after it starts, or re-starts, to do I/O. This filtering provides two important benefits. First, it greatly reduces the probability that two non-cooperating processes have their queues merged by mistake, if they just happen to do I/O close to each other for a short time interval. These spurious merges cause loss of service guarantees. A low-weight bfq_queue may unjustly get more than its expected share of the throughput: if such a low-weight queue is merged with a high-weight queue, then the I/O for the low-weight queue is served as if the queue had a high weight. This may damage other high-weight queues unexpectedly. For instance, because of this issue, lxterminal occasionally took 7.5 seconds to start, instead of 6.5 seconds, when some sequential readers and writers did I/O in the background on a FUJITSU MHX2300BT HDD. The reason is that the bfq_queues associated with some of the readers or the writers were merged with the high-weight queues of some processes that had to do some urgent but little I/O. The readers then exploited the inherited high weight for all or most of their I/O, during the start-up of terminal. The filtering introduced by this commit eliminated any outlier caused by spurious queue merges in our start-up time tests. This filtering also provides a little boost of the throughput sustainable by BFQ: 3-4%, depending on the CPU. The reason is that, once a bfq_queue cannot be merged any longer, this commit makes BFQ stop updating the data needed to handle merging for the queue. Signed-off-by: Paolo Valente <> Signed-off-by: Angelo Ruocco <> Signed-off-by: Jens Axboe <>
3 files changed, 52 insertions, 11 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 2cf395daee80..7066d90f09df 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10;
/* Default timeout values, in jiffies, approximating CFQ defaults. */
const int bfq_timeout = HZ / 8;
+ * Time limit for merging (see comments in bfq_setup_cooperator). Set
+ * to the slowest value that, in our tests, proved to be effective in
+ * removing false positives, while not causing true positives to miss
+ * queue merging.
+ *
+ * As can be deduced from the low time limit below, queue merging, if
+ * successful, happens at the very beggining of the I/O of the involved
+ * cooperating processes, as a consequence of the arrival of the very
+ * first requests from each cooperator. After that, there is very
+ * little chance to find cooperators.
+ */
+static const unsigned long bfq_merge_time_limit = HZ/10;
static struct kmem_cache *bfq_pool;
/* Below this threshold (in ns), we consider thinktime immediate. */
@@ -444,6 +458,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
return bfqq;
+static bool bfq_too_late_for_merging(struct bfq_queue *bfqq)
+ return bfqq->service_from_backlogged > 0 &&
+ time_is_before_jiffies(bfqq->first_IO_time +
+ bfq_merge_time_limit);
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
struct rb_node **p, *parent;
@@ -454,6 +475,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfqq->pos_root = NULL;
+ /*
+ * bfqq cannot be merged any longer (see comments in
+ * bfq_setup_cooperator): no point in adding bfqq into the
+ * position tree.
+ */
+ if (bfq_too_late_for_merging(bfqq))
+ return;
if (bfq_class_idle(bfqq))
if (!bfqq->next_rq)
@@ -1935,6 +1964,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
struct bfq_queue *new_bfqq)
+ if (bfq_too_late_for_merging(new_bfqq))
+ return false;
if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
(bfqq->ioprio_class != new_bfqq->ioprio_class))
return false;
@@ -2003,6 +2035,20 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_queue *in_service_bfqq, *new_bfqq;
+ /*
+ * Prevent bfqq from being merged if it has been created too
+ * long ago. The idea is that true cooperating processes, and
+ * thus their associated bfq_queues, are supposed to be
+ * created shortly after each other. This is the case, e.g.,
+ * for KVM/QEMU and dump I/O threads. Basing on this
+ * assumption, the following filtering greatly reduces the
+ * probability that two non-cooperating processes, which just
+ * happen to do close I/O for some short time interval, have
+ * their queues merged by mistake.
+ */
+ if (bfq_too_late_for_merging(bfqq))
+ return NULL;
if (bfqq->new_bfqq)
return bfqq->new_bfqq;
@@ -3003,17 +3049,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
- * Increase service_from_backlogged before next statement,
- * because the possible next invocation of
- * bfq_bfqq_charge_time would likely inflate
- * entity->service. In contrast, service_from_backlogged must
- * contain real service, to enable the soft real-time
- * heuristic to correctly compute the bandwidth consumed by
- * bfqq.
- */
- bfqq->service_from_backlogged += entity->service;
- /*
* As above explained, charge slow (typically seeky) and
* timed-out queues with the time and not the service
* received, to favor sequential workloads.
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 91c4390903a1..5d47b58d5fc8 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -344,6 +344,8 @@ struct bfq_queue {
unsigned long wr_start_at_switch_to_srt;
unsigned long split_time; /* time of last split */
+ unsigned long first_IO_time; /* time of first I/O for this queue */
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index e495d3f9b4b0..4456eda34e48 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -835,6 +835,10 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
struct bfq_entity *entity = &bfqq->entity;
struct bfq_service_tree *st;
+ if (!bfqq->service_from_backlogged)
+ bfqq->first_IO_time = jiffies;
+ bfqq->service_from_backlogged += served;
for_each_entity(entity) {
st = bfq_entity_service_tree(entity);