[linux-block.git] / block / blk-mq-sched.c

/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

/*
 * Release the scheduler data attached to every hardware context of @q.
 *
 * @q:    request queue whose hardware contexts are walked
 * @exit: optional callback; when non-NULL it is called for each hardware
 *        context that still has sched_data set, before that data is freed
 *
 * For every hardware context, sched_data is passed to kfree() and the
 * pointer is reset to NULL.
 */
void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int idx;

	queue_for_each_hw_ctx(q, hctx, idx) {
		if (hctx->sched_data && exit)
			exit(hctx);

		/* sched_data is re-read here, after the callback has run */
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);

/*
 * Attach an io_cq to @rq.
 *
 * The io_context is obtained from rq_ioc(@bio). An existing io_cq for that
 * context and the request's queue is looked up with the queue lock held; if
 * none is found one is created with GFP_ATOMIC. When creation fails the
 * request is left without an icq. On success a reference is taken on the
 * icq's io_context and the icq is stored in rq->elv.icq.
 */
void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
{
	struct io_context *ioc = rq_ioc(bio);
	struct request_queue *q = rq->q;
	struct io_cq *icq;

	spin_lock_irq(&q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(&q->queue_lock);

	/* Nothing cached for this (ioc, queue) pair yet: try to create one. */
	if (!icq)
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
	if (!icq)
		return;

	get_io_context(icq->ioc);
	rq->elv.icq = icq;
}

/*
 * Mark a hardware queue as needing a restart by setting
 * BLK_MQ_S_SCHED_RESTART in its state word. The bit is tested first so
 * that an already-marked queue is not written to again.
 */
static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}

/*
 * If @hctx has been marked with BLK_MQ_S_SCHED_RESTART, clear the mark and
 * run the hardware queue asynchronously. Does nothing when the bit is not
 * set.
 */
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

		blk_mq_run_hw_queue(hctx, true);
	}
}

/*
 * Pull requests out of the I/O scheduler one at a time and pass each one to
 * blk_mq_dispatch_rq_list(). The loop ends as soon as the scheduler reports
 * no work, no dispatch budget can be obtained, the scheduler hands back no
 * request, or blk_mq_dispatch_rq_list() returns false.
 *
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so the queue does not
 * need to be restarted here if .get_budget() returns BLK_STS_NO_RESOURCE.
 */
static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	LIST_HEAD(rq_list);

	for (;;) {
		struct request *rq;

		/* has_work is optional; without it a dispatch is always tried */
		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
			return;

		if (!blk_mq_get_dispatch_budget(hctx))
			return;

		rq = e->type->ops.dispatch_request(hctx);
		if (!rq) {
			/* no request to go with the budget: hand it back */
			blk_mq_put_dispatch_budget(hctx);
			return;
		}

		/*
		 * From here on the budget belongs to this rq and has to be
		 * released if the rq is not queued to the driver via
		 * .queue_rq() in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		if (!blk_mq_dispatch_rq_list(q, &rq_list, true))
			return;
	}
}

/*
 * Return the software context that follows @ctx in @hctx's ctxs[] array,
 * wrapping around to the first entry after the last one.
 */
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
					  struct blk_mq_ctx *ctx)
{
	unsigned short next = ctx->index_hw[hctx->type];

	next++;
	if (next == hctx->nr_ctx)
		next = 0;

	return hctx->ctxs[next];
}

/*
 * Dequeue requests from the software queues one at a time, starting at
 * hctx->dispatch_from and moving to the next software context after each
 * request, and pass each one to blk_mq_dispatch_rq_list(). The context to
 * resume from is written back to hctx->dispatch_from when the loop ends.
 *
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so the queue does not
 * need to be restarted here if .get_budget() returns BLK_STS_NO_RESOURCE.
 */
static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	LIST_HEAD(rq_list);
	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);

	for (;;) {
		struct request *rq;

		/* no software queue has anything pending */
		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;

		if (!blk_mq_get_dispatch_budget(hctx))
			break;

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			/* no request to go with the budget: hand it back */
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		/*
		 * From here on the budget belongs to this rq and has to be
		 * released if the rq is not queued to the driver via
		 * .queue_rq() in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		/* round robin across software queues for fair dispatch */
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

		if (!blk_mq_dispatch_rq_list(q, &rq_list, true))
			break;
	}

	WRITE_ONCE(hctx->dispatch_from, ctx);
}

/*
 * Run one dispatch pass for @hctx.
 *
 * Returns immediately if the hardware queue is stopped or the request queue
 * is quiesced. Otherwise requests left on hctx->dispatch are sent first;
 * depending on whether that list was empty, whether the elevator provides
 * ->dispatch_request, and whether hctx->dispatch_busy is set, further
 * requests are then taken from the scheduler, from the software queues one
 * by one, or from all busy software queues at once.
 */
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	/* true only when an elevator is attached and can hand out requests */
	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
	LIST_HEAD(rq_list);

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * If we have previous entries on our dispatch list, grab them first for
	 * more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		/* re-check under the lock before moving the list */
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests, if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 *
	 * We want to dispatch from the scheduler if there was nothing
	 * on the dispatch list or we were able to dispatch from the
	 * dispatch list.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		/* continue with new requests only if the leftovers were dispatched */
		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
			if (has_sched_dispatch)
				blk_mq_do_dispatch_sched(hctx);
			else
				blk_mq_do_dispatch_ctx(hctx);
		}
	} else if (has_sched_dispatch) {
		blk_mq_do_dispatch_sched(hctx);
	} else if (hctx->dispatch_busy) {
		/* dequeue request one by one from sw queue if queue is busy */
		blk_mq_do_dispatch_ctx(hctx);
	} else {
		/* not busy and no scheduler dispatch: take everything queued at once */
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list, false);
	}
}

/*
 * Try to merge @bio into a request chosen by elv_merge().
 *
 * @q:              request queue the bio is headed for
 * @bio:            bio to merge
 * @merged_request: for back and front merges, set to the result of the
 *                  follow-up attempt_back_merge()/attempt_front_merge()
 *                  call; not written on any other path
 *
 * Returns true if the bio was merged, false otherwise. After a successful
 * back or front merge for which the follow-up request merge produced
 * nothing, the elevator is notified through elv_merged_request().
 */
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
			    struct request **merged_request)
{
	struct request *rq;

	switch (elv_merge(q, &rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio) ||
		    !bio_attempt_back_merge(q, rq, bio))
			return false;

		*merged_request = attempt_back_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
		return true;

	case ELEVATOR_FRONT_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio) ||
		    !bio_attempt_front_merge(q, rq, bio))
			return false;

		*merged_request = attempt_front_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
		return true;

	case ELEVATOR_DISCARD_MERGE:
		return bio_attempt_discard_merge(q, rq, bio);

	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

/*
 * Walk @list backwards looking for a request that @bio can be merged into.
 *
 * At most 8 requests are examined. The first request for which
 * blk_try_merge() reports a back, front or discard merge decides the
 * outcome: the result of that single merge attempt is returned and no
 * further requests are tried. Returns false if no candidate was found.
 */
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
			   struct bio *bio)
{
	struct request *rq;
	int remaining = 8;

	list_for_each_entry_reverse(rq, list, queuelist) {
		if (remaining-- == 0)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		switch (blk_try_merge(rq, bio)) {
		case ELEVATOR_BACK_MERGE:
			return blk_mq_sched_allow_merge(q, rq, bio) &&
			       bio_attempt_back_merge(q, rq, bio);
		case ELEVATOR_FRONT_MERGE:
			return blk_mq_sched_allow_merge(q, rq, bio) &&
			       bio_attempt_front_merge(q, rq, bio);
		case ELEVATOR_DISCARD_MERGE:
			return bio_attempt_discard_merge(q, rq, bio);
		default:
			/* no merge possible with this request, try the next one */
			continue;
		}
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);

/*
 * Try to merge @bio into one of the requests queued on software queue
 * @ctx. The search itself is done by blk_mq_bio_list_merge() on
 * ctx->rq_list; on success ctx->rq_merged is incremented.
 *
 * Must be called with ctx->lock held.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	bool merged;

	lockdep_assert_held(&ctx->lock);

	merged = blk_mq_bio_list_merge(q, &ctx->rq_list, bio);
	if (merged)
		ctx->rq_merged++;

	return merged;
}

/*
 * Try to merge @bio into an already queued request.
 *
 * If an elevator with a ->bio_merge hook is attached, the software context
 * is released and the decision is delegated to that hook. Otherwise, when
 * the hardware queue has BLK_MQ_F_SHOULD_MERGE set and the current software
 * queue is not empty, a merge against that software queue is attempted
 * under ctx->lock.
 *
 * Returns true if the bio was merged.
 */
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu);
	bool merged = false;

	if (e && e->type->ops.bio_merge) {
		blk_mq_put_ctx(ctx);
		return e->type->ops.bio_merge(hctx, bio);
	}

	if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
		if (!list_empty_careful(&ctx->rq_list)) {
			/* default per sw-queue merge */
			spin_lock(&ctx->lock);
			merged = blk_mq_attempt_merge(q, ctx, bio);
			spin_unlock(&ctx->lock);
		}
	}

	blk_mq_put_ctx(ctx);
	return merged;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

/*
 * Emit the block_rq_insert trace event for @rq on its request queue.
 */
void blk_mq_sched_request_inserted(struct request *rq)
{
	trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);

/*
 * Decide whether @rq skips normal insertion.
 *
 * A request carrying RQF_FLUSH_SEQ is added to the head of hctx->dispatch
 * under hctx->lock and true is returned. Any other request is left for the
 * caller to insert (false is returned); it is tagged with RQF_SORTED first
 * when @has_sched is true.
 */
static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
				       bool has_sched,
				       struct request *rq)
{
	if (!(rq->rq_flags & RQF_FLUSH_SEQ)) {
		if (has_sched)
			rq->rq_flags |= RQF_SORTED;

		return false;
	}

	/* dispatch flush rq directly */
	spin_lock(&hctx->lock);
	list_add(&rq->queuelist, &hctx->dispatch);
	spin_unlock(&hctx->lock);

	return true;
}

/*
 * Insert a single request and optionally kick the hardware queue.
 *
 * @rq:        request to insert
 * @at_head:   passed through to the elevator's ->insert_requests hook or to
 *             __blk_mq_insert_request()
 * @run_queue: when true, blk_mq_run_hw_queue() is called after insertion
 * @async:     passed to blk_mq_run_hw_queue()
 *
 * Flush operations that are not yet part of a flush sequence are handed to
 * blk_insert_flush(). Requests that blk_mq_sched_bypass_insert() accepts go
 * straight to the dispatch list. Everything else goes to the elevator if it
 * provides ->insert_requests, or to the software queue under ctx->lock.
 */
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	/* flush rq in flush machinery need to be dispatched directly */
	if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
		blk_insert_flush(rq);
		goto run;
	}

	/* warn if an elevator is attached and the request already has a tag */
	WARN_ON(e && (rq->tag != -1));

	if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
		goto run;

	if (e && e->type->ops.insert_requests) {
		/* the hook takes a list, so wrap the single request in one */
		LIST_HEAD(list);

		list_add(&rq->queuelist, &list);
		e->type->ops.insert_requests(hctx, &list, at_head);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

run:
	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

/*
 * Insert a list of requests and run the hardware queue.
 *
 * @hctx:            hardware queue the requests belong to
 * @ctx:             software queue used when no elevator hook takes them
 * @list:            requests to insert
 * @run_queue_async: passed to blk_mq_run_hw_queue(); when true it also
 *                   disables the direct-issue attempt
 *
 * With an elevator providing ->insert_requests the whole list is handed to
 * it. Otherwise the requests go to the software queue, possibly after a
 * direct-issue attempt; if that attempt empties the list the function
 * returns without running the queue.
 */
void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async)
{
	struct elevator_queue *e = hctx->queue->elevator;

	if (e && e->type->ops.insert_requests) {
		e->type->ops.insert_requests(hctx, list, false);
		blk_mq_run_hw_queue(hctx, run_queue_async);
		return;
	}

	/*
	 * With the 'none' scheduler and a hw queue that isn't busy, try to
	 * issue the requests directly; this may save one extra enqueue &
	 * dequeue to the sw queue.
	 */
	if (!hctx->dispatch_busy && !e && !run_queue_async) {
		blk_mq_try_issue_list_directly(hctx, list);
		if (list_empty(list))
			return;
	}

	blk_mq_insert_requests(hctx, ctx, list);
	blk_mq_run_hw_queue(hctx, run_queue_async);
}

/*
 * Free the scheduler tags of @hctx, if it has any: first the requests, then
 * the tag map itself, and finally reset hctx->sched_tags to NULL.
 */
static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	if (!hctx->sched_tags)
		return;

	blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
	blk_mq_free_rq_map(hctx->sched_tags);
	hctx->sched_tags = NULL;
}

/*
 * Allocate scheduler tags for @hctx, sized by q->nr_requests.
 *
 * Returns 0 on success, -ENOMEM if the tag map cannot be allocated, or the
 * error from blk_mq_alloc_rqs(); in the latter case the tag map is freed
 * again before returning.
 */
static int blk_mq_sched_alloc_tags(struct request_queue *q,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	struct blk_mq_tag_set *set = q->tag_set;
	int err;

	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
					       set->reserved_tags);
	if (!hctx->sched_tags)
		return -ENOMEM;

	err = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
	if (!err)
		return 0;

	/* request allocation failed: undo the map allocation */
	blk_mq_sched_free_tags(set, hctx, hctx_idx);
	return err;
}

static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_sched_free_tags(set, hctx, i);
}

/*
 * Set up I/O scheduler @e on request queue @q.
 *
 * @q: request queue to attach the scheduler to
 * @e: elevator type to initialise, or NULL for no scheduler
 *
 * With @e == NULL the queue's elevator pointer is cleared and nr_requests
 * is set to the tag set's queue depth. Otherwise scheduler tags are
 * allocated for every hardware context, the elevator's init_sched hook is
 * called, and the optional init_hctx hook is run on each hardware context.
 *
 * Returns 0 on success, or the error reported by tag allocation, init_sched
 * or init_hctx.
 */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned int i;
	int ret;

	if (!e) {
		q->elevator = NULL;
		q->nr_requests = q->tag_set->queue_depth;
		return 0;
	}

	/*
	 * Default to double of smaller one between hw queue_depth and 128,
	 * since we don't split into sync/async like the old code did.
	 * Additionally, this is a per-hw queue depth.
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_MAX_RQ);

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_tags(q, hctx, i);
		if (ret)
			goto err;
	}

	ret = e->ops.init_sched(q, e);
	if (ret)
		goto err;

	blk_mq_debugfs_register_sched(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.init_hctx) {
			ret = e->ops.init_hctx(hctx, i);
			if (ret) {
				/*
				 * Full teardown via blk_mq_exit_sched(), then
				 * drop a reference on the elevator's kobject.
				 * NOTE(review): q->elevator is presumably set
				 * by init_sched above - confirm.
				 */
				eq = q->elevator;
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
		blk_mq_debugfs_register_sched_hctx(q, hctx);
	}

	return 0;

err:
	/* free the scheduler tags of every hardware context and detach */
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
	return ret;
}

/*
 * Tear down scheduler @e on request queue @q.
 *
 * For each hardware context the scheduler debugfs entries are removed and,
 * if the elevator has an exit_hctx hook and the context has sched_data,
 * the hook is called and sched_data is cleared. Afterwards the queue-level
 * debugfs entries are removed, the optional exit_sched hook is called, the
 * scheduler tags are freed and q->elevator is reset to NULL.
 */
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int idx;

	queue_for_each_hw_ctx(q, hctx, idx) {
		blk_mq_debugfs_unregister_sched_hctx(hctx);

		if (!e->type->ops.exit_hctx || !hctx->sched_data)
			continue;

		e->type->ops.exit_hctx(hctx, idx);
		hctx->sched_data = NULL;
	}

	blk_mq_debugfs_unregister_sched(q);

	if (e->type->ops.exit_sched)
		e->type->ops.exit_sched(e);

	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
}
Commit	Line	Data
bd166ef1 JA	1	/*
	2	* blk-mq scheduling framework
	3	*
	4	* Copyright (C) 2016 Jens Axboe
	5	*/
	6	#include <linux/kernel.h>
	7	#include <linux/module.h>
	8	#include <linux/blk-mq.h>
	9
	10	#include <trace/events/block.h>
	11
	12	#include "blk.h"
	13	#include "blk-mq.h"
d332ce09	14	#include "blk-mq-debugfs.h"
bd166ef1 JA	15	#include "blk-mq-sched.h"
	16	#include "blk-mq-tag.h"
	17	#include "blk-wbt.h"
	18
	19	void blk_mq_sched_free_hctx_data(struct request_queue *q,
	20	void (exit)(struct blk_mq_hw_ctx ))
	21	{
	22	struct blk_mq_hw_ctx *hctx;
	23	int i;
	24
	25	queue_for_each_hw_ctx(q, hctx, i) {
	26	if (exit && hctx->sched_data)
	27	exit(hctx);
	28	kfree(hctx->sched_data);
	29	hctx->sched_data = NULL;
	30	}
	31	}
	32	EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
	33
44e8c2bf	34	void blk_mq_sched_assign_ioc(struct request rq, struct bio bio)
bd166ef1	35	{
44e8c2bf CH	36	struct request_queue *q = rq->q;
44e8c2bf CH	37	struct io_context *ioc = rq_ioc(bio);
bd166ef1 JA	38	struct io_cq *icq;
bd166ef1 JA	39
0d945c1f	40	spin_lock_irq(&q->queue_lock);
bd166ef1	41	icq = ioc_lookup_icq(ioc, q);
0d945c1f	42	spin_unlock_irq(&q->queue_lock);
bd166ef1 JA	43
	44	if (!icq) {
	45	icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
	46	if (!icq)
	47	return;
	48	}
ea511e3c	49	get_io_context(icq->ioc);
44e8c2bf	50	rq->elv.icq = icq;
bd166ef1 JA	51	}
bd166ef1 JA	52
8e8320c9 JA	53	/*
	54	* Mark a hardware queue as needing a restart. For shared queues, maintain
	55	* a count of how many hardware queues are marked for restart.
	56	*/
	57	static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
	58	{
	59	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
	60	return;
	61
97889f9a	62	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
8e8320c9 JA	63	}
8e8320c9 JA	64
97889f9a	65	void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
8e8320c9 JA	66	{
8e8320c9 JA	67	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
97889f9a ML	68	return;
97889f9a ML	69	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
8e8320c9	70
97889f9a	71	blk_mq_run_hw_queue(hctx, true);
8e8320c9 JA	72	}
8e8320c9 JA	73
1f460b63 ML	74	/*
	75	* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
	76	* its queue by itself in its completion handler, so we don't need to
	77	* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
	78	*/
	79	static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
caf8eb0d ML	80	{
	81	struct request_queue *q = hctx->queue;
	82	struct elevator_queue *e = q->elevator;
	83	LIST_HEAD(rq_list);
	84
	85	do {
de148297	86	struct request *rq;
caf8eb0d	87
f9cd4bfe	88	if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
caf8eb0d	89	break;
de148297	90
88022d72	91	if (!blk_mq_get_dispatch_budget(hctx))
1f460b63	92	break;
de148297	93
f9cd4bfe	94	rq = e->type->ops.dispatch_request(hctx);
de148297 ML	95	if (!rq) {
	96	blk_mq_put_dispatch_budget(hctx);
	97	break;
de148297 ML	98	}
	99
	100	/*
	101	* Now this rq owns the budget which has to be released
	102	* if this rq won't be queued to driver via .queue_rq()
	103	* in blk_mq_dispatch_rq_list().
	104	*/
caf8eb0d	105	list_add(&rq->queuelist, &rq_list);
de148297	106	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
caf8eb0d ML	107	}
caf8eb0d ML	108
b347689f ML	109	static struct blk_mq_ctx blk_mq_next_ctx(struct blk_mq_hw_ctx hctx,
	110	struct blk_mq_ctx *ctx)
	111	{
f31967f0	112	unsigned short idx = ctx->index_hw[hctx->type];
b347689f ML	113
	114	if (++idx == hctx->nr_ctx)
	115	idx = 0;
	116
	117	return hctx->ctxs[idx];
	118	}
	119
1f460b63 ML	120	/*
	121	* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
	122	* its queue by itself in its completion handler, so we don't need to
	123	* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
	124	*/
	125	static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
b347689f ML	126	{
	127	struct request_queue *q = hctx->queue;
	128	LIST_HEAD(rq_list);
	129	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
	130
	131	do {
	132	struct request *rq;
b347689f ML	133
	134	if (!sbitmap_any_bit_set(&hctx->ctx_map))
	135	break;
	136
88022d72	137	if (!blk_mq_get_dispatch_budget(hctx))
1f460b63	138	break;
b347689f ML	139
	140	rq = blk_mq_dequeue_from_ctx(hctx, ctx);
	141	if (!rq) {
	142	blk_mq_put_dispatch_budget(hctx);
	143	break;
b347689f ML	144	}
	145
	146	/*
	147	* Now this rq owns the budget which has to be released
	148	* if this rq won't be queued to driver via .queue_rq()
	149	* in blk_mq_dispatch_rq_list().
	150	*/
	151	list_add(&rq->queuelist, &rq_list);
	152
	153	/* round robin for fair dispatch */
	154	ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
	155
	156	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
	157
	158	WRITE_ONCE(hctx->dispatch_from, ctx);
b347689f ML	159	}
b347689f ML	160
1f460b63	161	void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
bd166ef1	162	{
81380ca1 OS	163	struct request_queue *q = hctx->queue;
81380ca1 OS	164	struct elevator_queue *e = q->elevator;
f9cd4bfe	165	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
bd166ef1 JA	166	LIST_HEAD(rq_list);
bd166ef1 JA	167
f4560ffe ML	168	/* RCU or SRCU read lock is needed before checking quiesced flag */
f4560ffe ML	169	if (unlikely(blk_mq_hctx_stopped(hctx) \|\| blk_queue_quiesced(q)))
1f460b63	170	return;
bd166ef1 JA	171
	172	hctx->run++;
	173
	174	/*
	175	* If we have previous entries on our dispatch list, grab them first for
	176	* more fair dispatch.
	177	*/
	178	if (!list_empty_careful(&hctx->dispatch)) {
	179	spin_lock(&hctx->lock);
	180	if (!list_empty(&hctx->dispatch))
	181	list_splice_init(&hctx->dispatch, &rq_list);
	182	spin_unlock(&hctx->lock);
	183	}
	184
	185	/*
	186	* Only ask the scheduler for requests, if we didn't have residual
	187	* requests from the dispatch list. This is to avoid the case where
	188	* we only ever dispatch a fraction of the requests available because
	189	* of low device queue depth. Once we pull requests out of the IO
	190	* scheduler, we can no longer merge or sort them. So it's best to
	191	* leave them there for as long as we can. Mark the hw queue as
	192	* needing a restart in that case.
caf8eb0d ML	193	*
	194	* We want to dispatch from the scheduler if there was nothing
	195	* on the dispatch list or we were able to dispatch from the
	196	* dispatch list.
bd166ef1	197	*/
c13660a0	198	if (!list_empty(&rq_list)) {
d38d3515	199	blk_mq_sched_mark_restart_hctx(hctx);
b347689f ML	200	if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
b347689f ML	201	if (has_sched_dispatch)
1f460b63	202	blk_mq_do_dispatch_sched(hctx);
b347689f	203	else
1f460b63	204	blk_mq_do_dispatch_ctx(hctx);
b347689f	205	}
caf8eb0d	206	} else if (has_sched_dispatch) {
1f460b63	207	blk_mq_do_dispatch_sched(hctx);
6e768717 ML	208	} else if (hctx->dispatch_busy) {
6e768717 ML	209	/* dequeue request one by one from sw queue if queue is busy */
1f460b63	210	blk_mq_do_dispatch_ctx(hctx);
caf8eb0d	211	} else {
c13660a0	212	blk_mq_flush_busy_ctxs(hctx, &rq_list);
de148297	213	blk_mq_dispatch_rq_list(q, &rq_list, false);
64765a75	214	}
bd166ef1 JA	215	}
bd166ef1 JA	216
e4d750c9 JA	217	bool blk_mq_sched_try_merge(struct request_queue q, struct bio bio,
e4d750c9 JA	218	struct request **merged_request)
bd166ef1 JA	219	{
bd166ef1 JA	220	struct request *rq;
bd166ef1	221
34fe7c05 CH	222	switch (elv_merge(q, &rq, bio)) {
34fe7c05 CH	223	case ELEVATOR_BACK_MERGE:
bd166ef1 JA	224	if (!blk_mq_sched_allow_merge(q, rq, bio))
bd166ef1 JA	225	return false;
34fe7c05 CH	226	if (!bio_attempt_back_merge(q, rq, bio))
	227	return false;
	228	*merged_request = attempt_back_merge(q, rq);
	229	if (!*merged_request)
	230	elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
	231	return true;
	232	case ELEVATOR_FRONT_MERGE:
bd166ef1 JA	233	if (!blk_mq_sched_allow_merge(q, rq, bio))
bd166ef1 JA	234	return false;
34fe7c05 CH	235	if (!bio_attempt_front_merge(q, rq, bio))
	236	return false;
	237	*merged_request = attempt_front_merge(q, rq);
	238	if (!*merged_request)
	239	elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
	240	return true;
bea99a50 KB	241	case ELEVATOR_DISCARD_MERGE:
bea99a50 KB	242	return bio_attempt_discard_merge(q, rq, bio);
34fe7c05 CH	243	default:
34fe7c05 CH	244	return false;
bd166ef1	245	}
bd166ef1 JA	246	}
	247	EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
	248
9bddeb2a	249	/*
9c558734 JA	250	* Iterate list of requests and see if we can merge this bio with any
9c558734 JA	251	* of them.
9bddeb2a	252	*/
9c558734 JA	253	bool blk_mq_bio_list_merge(struct request_queue q, struct list_head list,
9c558734 JA	254	struct bio *bio)
9bddeb2a ML	255	{
	256	struct request *rq;
	257	int checked = 8;
	258
9c558734	259	list_for_each_entry_reverse(rq, list, queuelist) {
9bddeb2a ML	260	bool merged = false;
	261
	262	if (!checked--)
	263	break;
	264
	265	if (!blk_rq_merge_ok(rq, bio))
	266	continue;
	267
	268	switch (blk_try_merge(rq, bio)) {
	269	case ELEVATOR_BACK_MERGE:
	270	if (blk_mq_sched_allow_merge(q, rq, bio))
	271	merged = bio_attempt_back_merge(q, rq, bio);
	272	break;
	273	case ELEVATOR_FRONT_MERGE:
	274	if (blk_mq_sched_allow_merge(q, rq, bio))
	275	merged = bio_attempt_front_merge(q, rq, bio);
	276	break;
	277	case ELEVATOR_DISCARD_MERGE:
	278	merged = bio_attempt_discard_merge(q, rq, bio);
	279	break;
	280	default:
	281	continue;
	282	}
	283
9bddeb2a ML	284	return merged;
	285	}
	286
	287	return false;
	288	}
9c558734 JA	289	EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
	290
	291	/*
	292	* Reverse check our software queue for entries that we could potentially
	293	* merge with. Currently includes a hand-wavy stop count of 8, to not spend
	294	* too much time checking for merges.
	295	*/
	296	static bool blk_mq_attempt_merge(struct request_queue *q,
	297	struct blk_mq_ctx ctx, struct bio bio)
	298	{
	299	lockdep_assert_held(&ctx->lock);
	300
	301	if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) {
	302	ctx->rq_merged++;
	303	return true;
	304	}
	305
	306	return false;
	307	}
9bddeb2a	308
bd166ef1 JA	309	bool __blk_mq_sched_bio_merge(struct request_queue q, struct bio bio)
	310	{
	311	struct elevator_queue *e = q->elevator;
9bddeb2a	312	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
f9afca4d	313	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu);
9bddeb2a	314	bool ret = false;
bd166ef1	315
f9cd4bfe	316	if (e && e->type->ops.bio_merge) {
bd166ef1	317	blk_mq_put_ctx(ctx);
f9cd4bfe	318	return e->type->ops.bio_merge(hctx, bio);
bd166ef1 JA	319	}
bd166ef1 JA	320
b04f50ab ML	321	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
b04f50ab ML	322	!list_empty_careful(&ctx->rq_list)) {
9bddeb2a ML	323	/* default per sw-queue merge */
	324	spin_lock(&ctx->lock);
	325	ret = blk_mq_attempt_merge(q, ctx, bio);
	326	spin_unlock(&ctx->lock);
	327	}
	328
	329	blk_mq_put_ctx(ctx);
	330	return ret;
bd166ef1 JA	331	}
	332
	333	bool blk_mq_sched_try_insert_merge(struct request_queue q, struct request rq)
	334	{
	335	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
	336	}
	337	EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
	338
	339	void blk_mq_sched_request_inserted(struct request *rq)
	340	{
	341	trace_block_rq_insert(rq->q, rq);
	342	}
	343	EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
	344
0cacba6c	345	static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
a6a252e6	346	bool has_sched,
0cacba6c	347	struct request *rq)
bd166ef1	348	{
a6a252e6 ML	349	/* dispatch flush rq directly */
	350	if (rq->rq_flags & RQF_FLUSH_SEQ) {
	351	spin_lock(&hctx->lock);
	352	list_add(&rq->queuelist, &hctx->dispatch);
	353	spin_unlock(&hctx->lock);
	354	return true;
	355	}
	356
923218f6	357	if (has_sched)
bd166ef1	358	rq->rq_flags \|= RQF_SORTED;
bd166ef1	359
a6a252e6	360	return false;
bd166ef1	361	}
bd166ef1	362
bd6737f1	363	void blk_mq_sched_insert_request(struct request *rq, bool at_head,
9e97d295	364	bool run_queue, bool async)
bd6737f1 JA	365	{
	366	struct request_queue *q = rq->q;
	367	struct elevator_queue *e = q->elevator;
	368	struct blk_mq_ctx *ctx = rq->mq_ctx;
ea4f995e	369	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
bd6737f1	370
a6a252e6 ML	371	/* flush rq in flush machinery need to be dispatched directly */
a6a252e6 ML	372	if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
923218f6 ML	373	blk_insert_flush(rq);
923218f6 ML	374	goto run;
bd6737f1 JA	375	}
bd6737f1 JA	376
923218f6 ML	377	WARN_ON(e && (rq->tag != -1));
923218f6 ML	378
a6a252e6	379	if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
0cacba6c OS	380	goto run;
0cacba6c OS	381
f9cd4bfe	382	if (e && e->type->ops.insert_requests) {
bd6737f1 JA	383	LIST_HEAD(list);
	384
	385	list_add(&rq->queuelist, &list);
f9cd4bfe	386	e->type->ops.insert_requests(hctx, &list, at_head);
bd6737f1 JA	387	} else {
	388	spin_lock(&ctx->lock);
	389	__blk_mq_insert_request(hctx, rq, at_head);
	390	spin_unlock(&ctx->lock);
	391	}
	392
0cacba6c	393	run:
bd6737f1 JA	394	if (run_queue)
	395	blk_mq_run_hw_queue(hctx, async);
	396	}
	397
67cae4c9	398	void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
bd6737f1 JA	399	struct blk_mq_ctx *ctx,
	400	struct list_head *list, bool run_queue_async)
	401	{
f9afca4d	402	struct elevator_queue *e;
bd6737f1	403
f9afca4d	404	e = hctx->queue->elevator;
f9cd4bfe JA	405	if (e && e->type->ops.insert_requests)
f9cd4bfe JA	406	e->type->ops.insert_requests(hctx, list, false);
6ce3dd6e ML	407	else {
	408	/*
	409	* try to issue requests directly if the hw queue isn't
	410	* busy in case of 'none' scheduler, and this way may save
	411	* us one extra enqueue & dequeue to sw queue.
	412	*/
	413	if (!hctx->dispatch_busy && !e && !run_queue_async) {
	414	blk_mq_try_issue_list_directly(hctx, list);
	415	if (list_empty(list))
	416	return;
	417	}
bd6737f1	418	blk_mq_insert_requests(hctx, ctx, list);
6ce3dd6e	419	}
bd6737f1 JA	420
	421	blk_mq_run_hw_queue(hctx, run_queue_async);
	422	}
	423
bd166ef1 JA	424	static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
	425	struct blk_mq_hw_ctx *hctx,
	426	unsigned int hctx_idx)
	427	{
	428	if (hctx->sched_tags) {
	429	blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
	430	blk_mq_free_rq_map(hctx->sched_tags);
	431	hctx->sched_tags = NULL;
	432	}
	433	}
	434
6917ff0b OS	435	static int blk_mq_sched_alloc_tags(struct request_queue *q,
	436	struct blk_mq_hw_ctx *hctx,
	437	unsigned int hctx_idx)
	438	{
	439	struct blk_mq_tag_set *set = q->tag_set;
	440	int ret;
	441
	442	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
	443	set->reserved_tags);
	444	if (!hctx->sched_tags)
	445	return -ENOMEM;
	446
	447	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
	448	if (ret)
	449	blk_mq_sched_free_tags(set, hctx, hctx_idx);
	450
	451	return ret;
	452	}
	453
54d5329d	454	static void blk_mq_sched_tags_teardown(struct request_queue *q)
bd166ef1 JA	455	{
	456	struct blk_mq_tag_set *set = q->tag_set;
	457	struct blk_mq_hw_ctx *hctx;
6917ff0b OS	458	int i;
	459
	460	queue_for_each_hw_ctx(q, hctx, i)
	461	blk_mq_sched_free_tags(set, hctx, i);
	462	}
	463
	464	int blk_mq_init_sched(struct request_queue q, struct elevator_type e)
	465	{
	466	struct blk_mq_hw_ctx *hctx;
ee056f98	467	struct elevator_queue *eq;
6917ff0b OS	468	unsigned int i;
	469	int ret;
	470
	471	if (!e) {
	472	q->elevator = NULL;
32a50fab	473	q->nr_requests = q->tag_set->queue_depth;
6917ff0b OS	474	return 0;
6917ff0b OS	475	}
bd166ef1 JA	476
bd166ef1 JA	477	/*
32825c45 ML	478	* Default to double of smaller one between hw queue_depth and 128,
	479	* since we don't split into sync/async like the old code did.
	480	* Additionally, this is a per-hw queue depth.
bd166ef1	481	*/
32825c45 ML	482	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
32825c45 ML	483	BLKDEV_MAX_RQ);
bd166ef1	484
bd166ef1	485	queue_for_each_hw_ctx(q, hctx, i) {
6917ff0b	486	ret = blk_mq_sched_alloc_tags(q, hctx, i);
bd166ef1	487	if (ret)
6917ff0b	488	goto err;
bd166ef1 JA	489	}
bd166ef1 JA	490
f9cd4bfe	491	ret = e->ops.init_sched(q, e);
6917ff0b OS	492	if (ret)
6917ff0b OS	493	goto err;
bd166ef1	494
d332ce09 OS	495	blk_mq_debugfs_register_sched(q);
	496
	497	queue_for_each_hw_ctx(q, hctx, i) {
f9cd4bfe JA	498	if (e->ops.init_hctx) {
f9cd4bfe JA	499	ret = e->ops.init_hctx(hctx, i);
ee056f98 OS	500	if (ret) {
	501	eq = q->elevator;
	502	blk_mq_exit_sched(q, eq);
	503	kobject_put(&eq->kobj);
	504	return ret;
	505	}
	506	}
d332ce09	507	blk_mq_debugfs_register_sched_hctx(q, hctx);
ee056f98 OS	508	}
ee056f98 OS	509
bd166ef1	510	return 0;
bd166ef1	511
6917ff0b	512	err:
54d5329d OS	513	blk_mq_sched_tags_teardown(q);
54d5329d OS	514	q->elevator = NULL;
6917ff0b	515	return ret;
bd166ef1	516	}
d3484991	517
54d5329d OS	518	void blk_mq_exit_sched(struct request_queue q, struct elevator_queue e)
54d5329d OS	519	{
ee056f98 OS	520	struct blk_mq_hw_ctx *hctx;
	521	unsigned int i;
	522
d332ce09 OS	523	queue_for_each_hw_ctx(q, hctx, i) {
d332ce09 OS	524	blk_mq_debugfs_unregister_sched_hctx(hctx);
f9cd4bfe JA	525	if (e->type->ops.exit_hctx && hctx->sched_data) {
f9cd4bfe JA	526	e->type->ops.exit_hctx(hctx, i);
d332ce09	527	hctx->sched_data = NULL;
ee056f98 OS	528	}
ee056f98 OS	529	}
d332ce09	530	blk_mq_debugfs_unregister_sched(q);
f9cd4bfe JA	531	if (e->type->ops.exit_sched)
f9cd4bfe JA	532	e->type->ops.exit_sched(e);
54d5329d OS	533	blk_mq_sched_tags_teardown(q);
	534	q->elevator = NULL;
	535	}