block/blk-mq.c (linux-block.git, tree as of "sbitmap: add __sbitmap_queue_get_batch()")
// SPDX-License-Identifier: GPL-2.0
/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"

static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);

static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);

static int blk_mq_poll_stats_bkt(const struct request *rq)
{
	int ddir, sectors, bucket;

	ddir = rq_data_dir(rq);
	sectors = blk_rq_stats_sectors(rq);

	bucket = ddir + 2 * ilog2(sectors);

	if (bucket < 0)
		return -1;
	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;

	return bucket;
}
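
/*
 * Worked example of the bucketing above (illustrative): a completed 4KiB
 * read (8 sectors) maps to bucket READ + 2 * ilog2(8) = 6, the matching
 * write to bucket 7; anything at or past BLK_MQ_POLL_STATS_BKTS is clamped
 * into the last read/write pair of buckets.
 */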

/*
 * Check if any of the ctx, dispatch list or elevator
 * have pending work in this hardware queue.
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	return !list_empty_careful(&hctx->dispatch) ||
		sbitmap_any_bit_set(&hctx->ctx_map) ||
			blk_mq_sched_has_work(hctx);
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
		sbitmap_set_bit(&hctx->ctx_map, bit);
}

static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	sbitmap_clear_bit(&hctx->ctx_map, bit);
}

struct mq_inflight {
	struct block_device *part;
	unsigned int inflight[2];
};

static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
				  struct request *rq, void *priv,
				  bool reserved)
{
	struct mq_inflight *mi = priv;

	if ((!mi->part->bd_partno || rq->part == mi->part) &&
	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
		mi->inflight[rq_data_dir(rq)]++;

	return true;
}

unsigned int blk_mq_in_flight(struct request_queue *q,
		struct block_device *part)
{
	struct mq_inflight mi = { .part = part };

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);

	return mi.inflight[0] + mi.inflight[1];
}

void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
		unsigned int inflight[2])
{
	struct mq_inflight mi = { .part = part };

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
	inflight[0] = mi.inflight[0];
	inflight[1] = mi.inflight[1];
}

void blk_freeze_queue_start(struct request_queue *q)
{
	mutex_lock(&q->mq_freeze_lock);
	if (++q->mq_freeze_depth == 1) {
		percpu_ref_kill(&q->q_usage_counter);
		mutex_unlock(&q->mq_freeze_lock);
		if (queue_is_mq(q))
			blk_mq_run_hw_queues(q, false);
	} else {
		mutex_unlock(&q->mq_freeze_lock);
	}
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);

void blk_mq_freeze_queue_wait(struct request_queue *q)
{
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);

int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout)
{
	return wait_event_timeout(q->mq_freeze_wq,
					percpu_ref_is_zero(&q->q_usage_counter),
					timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
void blk_freeze_queue(struct request_queue *q)
{
	/*
	 * In the !blk_mq case we are only calling this to kill the
	 * q_usage_counter, otherwise this increases the freeze depth
	 * and waits for it to return to zero. For this reason there is
	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
	 * exported to drivers as the only user for unfreeze is blk_mq.
	 */
	blk_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}

void blk_mq_freeze_queue(struct request_queue *q)
{
	/*
	 * ...just an alias to keep freeze and unfreeze actions balanced
	 * in the blk_mq_* namespace
	 */
	blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
	mutex_lock(&q->mq_freeze_lock);
	if (force_atomic)
		q->q_usage_counter.data->force_atomic = true;
	q->mq_freeze_depth--;
	WARN_ON_ONCE(q->mq_freeze_depth < 0);
	if (!q->mq_freeze_depth) {
		percpu_ref_resurrect(&q->q_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}
	mutex_unlock(&q->mq_freeze_lock);
}

void blk_mq_unfreeze_queue(struct request_queue *q)
{
	__blk_mq_unfreeze_queue(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
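
/*
 * Illustrative usage (sketch, not taken from a particular driver): updates
 * that must not race with in-flight requests are usually bracketed by a
 * freeze/unfreeze pair:
 *
 *	blk_mq_freeze_queue(q);
 *	... update queue limits or per-queue driver state ...
 *	blk_mq_unfreeze_queue(q);
 *
 * Since freezing waits for q_usage_counter to drop to zero, it must not be
 * called from a context that itself holds a queue usage reference.
 */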

/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
	blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Note: this function does not prevent the struct request end_io()
 * callback function from being invoked. Once this function returns, we
 * make sure no dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;
	bool rcu = false;

	blk_mq_quiesce_queue_nowait(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->flags & BLK_MQ_F_BLOCKING)
			synchronize_srcu(hctx->srcu);
		else
			rcu = true;
	}
	if (rcu)
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

/*
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * This function recovers the queue to the state it was in before it was
 * quiesced by blk_mq_quiesce_queue().
 */
void blk_mq_unquiesce_queue(struct request_queue *q)
{
	blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);

	/* dispatch requests which are inserted during quiescing */
	blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
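
/*
 * Illustrative pairing (sketch): a driver that must stop new dispatches, but
 * does not need to wait for outstanding requests to complete, can do:
 *
 *	blk_mq_quiesce_queue(q);
 *	... reconfigure hardware or switch driver state ...
 *	blk_mq_unquiesce_queue(q);
 *
 * Unlike freezing, quiescing only guarantees that no new ->queue_rq() calls
 * happen after it returns; already dispatched requests may still complete.
 */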

void blk_mq_wake_waiters(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		if (blk_mq_hw_queue_mapped(hctx))
			blk_mq_tag_wakeup_all(hctx->tags, true);
}

/*
 * Only need start/end time stamping if we have iostat or
 * blk stats enabled, or using an IO scheduler.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
}

static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
		unsigned int tag, u64 alloc_time_ns)
{
	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
	struct request *rq = tags->static_rqs[tag];

	if (data->q->elevator) {
		rq->tag = BLK_MQ_NO_TAG;
		rq->internal_tag = tag;
	} else {
		rq->tag = tag;
		rq->internal_tag = BLK_MQ_NO_TAG;
	}

	/* csd/requeue_work/fifo_time is initialized before use */
	rq->q = data->q;
	rq->mq_ctx = data->ctx;
	rq->mq_hctx = data->hctx;
	rq->rq_flags = 0;
	rq->cmd_flags = data->cmd_flags;
	if (data->flags & BLK_MQ_REQ_PM)
		rq->rq_flags |= RQF_PM;
	if (blk_queue_io_stat(data->q))
		rq->rq_flags |= RQF_IO_STAT;
	INIT_LIST_HEAD(&rq->queuelist);
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->rq_disk = NULL;
	rq->part = NULL;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	rq->alloc_time_ns = alloc_time_ns;
#endif
	if (blk_mq_need_time_stamp(rq))
		rq->start_time_ns = ktime_get_ns();
	else
		rq->start_time_ns = 0;
	rq->io_start_time_ns = 0;
	rq->stats_sectors = 0;
	rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	rq->nr_integrity_segments = 0;
#endif
	blk_crypto_rq_set_defaults(rq);
	/* tag was already set */
	WRITE_ONCE(rq->deadline, 0);

	rq->timeout = 0;

	rq->end_io = NULL;
	rq->end_io_data = NULL;

	data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
	refcount_set(&rq->ref, 1);

	if (!op_is_flush(data->cmd_flags)) {
		struct elevator_queue *e = data->q->elevator;

		rq->elv.icq = NULL;
		if (e && e->type->ops.prepare_request) {
			if (e->type->icq_cache)
				blk_mq_sched_assign_ioc(rq);

			e->type->ops.prepare_request(rq);
			rq->rq_flags |= RQF_ELVPRIV;
		}
	}

	data->hctx->queued++;
	return rq;
}

static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
{
	struct request_queue *q = data->q;
	struct elevator_queue *e = q->elevator;
	u64 alloc_time_ns = 0;
	struct request *rq;
	unsigned int tag;

	/* alloc_time includes depth and tag waits */
	if (blk_queue_rq_alloc_time(q))
		alloc_time_ns = ktime_get_ns();

	if (data->cmd_flags & REQ_NOWAIT)
		data->flags |= BLK_MQ_REQ_NOWAIT;

	if (e) {
		/*
		 * Flush/passthrough requests are special and go directly to
		 * the dispatch list. Don't include reserved tags in the
		 * limiting, as it isn't useful.
		 */
		if (!op_is_flush(data->cmd_flags) &&
		    !blk_op_is_passthrough(data->cmd_flags) &&
		    e->type->ops.limit_depth &&
		    !(data->flags & BLK_MQ_REQ_RESERVED))
			e->type->ops.limit_depth(data->cmd_flags, data);
	}

retry:
	data->ctx = blk_mq_get_ctx(q);
	data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
	if (!e)
		blk_mq_tag_busy(data->hctx);

	/*
	 * Waiting allocations only fail because of an inactive hctx. In that
	 * case just retry the hctx assignment and tag allocation as CPU hotplug
	 * should have migrated us to an online CPU by now.
	 */
	do {
		tag = blk_mq_get_tag(data);
		if (tag == BLK_MQ_NO_TAG) {
			if (data->flags & BLK_MQ_REQ_NOWAIT)
				break;
			/*
			 * Give up the CPU and sleep for a random short time to
			 * ensure that threads using a realtime scheduling class
			 * are migrated off the CPU, and thus off the hctx that
			 * is going away.
			 */
			msleep(3);
			goto retry;
		}

		rq = blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
		if (!--data->nr_tags || e ||
		    (data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
			return rq;

		/* link into the cached list */
		rq->rq_next = *data->cached_rq;
		*data->cached_rq = rq;
		data->flags |= BLK_MQ_REQ_NOWAIT;
	} while (1);

	if (!data->cached_rq)
		return NULL;

	rq = *data->cached_rq;
	*data->cached_rq = rq->rq_next;
	return rq;
}

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags)
{
	struct blk_mq_alloc_data data = {
		.q		= q,
		.flags		= flags,
		.cmd_flags	= op,
		.nr_tags	= 1,
	};
	struct request *rq;
	int ret;

	ret = blk_queue_enter(q, flags);
	if (ret)
		return ERR_PTR(ret);

	rq = __blk_mq_alloc_requests(&data);
	if (!rq)
		goto out_queue_exit;
	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
	return rq;
out_queue_exit:
	blk_queue_exit(q);
	return ERR_PTR(-EWOULDBLOCK);
}
EXPORT_SYMBOL(blk_mq_alloc_request);
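
/*
 * Example (sketch, not taken from a real driver): allocating a passthrough
 * style request outside the normal bio path and releasing it again:
 *
 *	struct request *rq;
 *
 *	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_NOWAIT);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	... set up rq->timeout, end_io and the payload as needed ...
 *	blk_mq_free_request(rq);
 */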

struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
	unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
	struct blk_mq_alloc_data data = {
		.q		= q,
		.flags		= flags,
		.cmd_flags	= op,
		.nr_tags	= 1,
	};
	u64 alloc_time_ns = 0;
	unsigned int cpu;
	unsigned int tag;
	int ret;

	/* alloc_time includes depth and tag waits */
	if (blk_queue_rq_alloc_time(q))
		alloc_time_ns = ktime_get_ns();

	/*
	 * If the tag allocator sleeps we could get an allocation for a
	 * different hardware context. No need to complicate the low level
	 * allocator for this for the rare use case of a command tied to
	 * a specific queue.
	 */
	if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
		return ERR_PTR(-EINVAL);

	if (hctx_idx >= q->nr_hw_queues)
		return ERR_PTR(-EIO);

	ret = blk_queue_enter(q, flags);
	if (ret)
		return ERR_PTR(ret);

	/*
	 * Check if the hardware context is actually mapped to anything.
	 * If not tell the caller that it should skip this queue.
	 */
	ret = -EXDEV;
	data.hctx = q->queue_hw_ctx[hctx_idx];
	if (!blk_mq_hw_queue_mapped(data.hctx))
		goto out_queue_exit;
	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
	data.ctx = __blk_mq_get_ctx(q, cpu);

	if (!q->elevator)
		blk_mq_tag_busy(data.hctx);

	ret = -EWOULDBLOCK;
	tag = blk_mq_get_tag(&data);
	if (tag == BLK_MQ_NO_TAG)
		goto out_queue_exit;
	return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);

out_queue_exit:
	blk_queue_exit(q);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

static void __blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	const int sched_tag = rq->internal_tag;

	blk_crypto_free_request(rq);
	blk_pm_mark_last_busy(rq);
	rq->mq_hctx = NULL;
	if (rq->tag != BLK_MQ_NO_TAG)
		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
	if (sched_tag != BLK_MQ_NO_TAG)
		blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
	blk_mq_sched_restart(hctx);
	blk_queue_exit(q);
}

void blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	if (rq->rq_flags & RQF_ELVPRIV) {
		if (e && e->type->ops.finish_request)
			e->type->ops.finish_request(rq);
		if (rq->elv.icq) {
			put_io_context(rq->elv.icq->ioc);
			rq->elv.icq = NULL;
		}
	}

	ctx->rq_completed[rq_is_sync(rq)]++;
	if (rq->rq_flags & RQF_MQ_INFLIGHT)
		__blk_mq_dec_active_requests(hctx);

	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
		laptop_io_completion(q->disk->bdi);

	rq_qos_done(q, rq);

	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
	if (refcount_dec_and_test(&rq->ref))
		__blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
	while (plug->cached_rq) {
		struct request *rq;

		rq = plug->cached_rq;
		plug->cached_rq = rq->rq_next;
		percpu_ref_get(&rq->q->q_usage_counter);
		blk_mq_free_request(rq);
	}
}

inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
	if (blk_mq_need_time_stamp(rq)) {
		u64 now = ktime_get_ns();

		if (rq->rq_flags & RQF_STATS) {
			blk_mq_poll_stats_start(rq->q);
			blk_stat_add(rq, now);
		}

		blk_mq_sched_completed_request(rq, now);
		blk_account_io_done(rq, now);
	}

	if (rq->end_io) {
		rq_qos_done(rq->q, rq);
		rq->end_io(rq, error);
	} else {
		blk_mq_free_request(rq);
	}
}
EXPORT_SYMBOL(__blk_mq_end_request);

void blk_mq_end_request(struct request *rq, blk_status_t error)
{
	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
		BUG();
	__blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);

static void blk_complete_reqs(struct llist_head *list)
{
	struct llist_node *entry = llist_reverse_order(llist_del_all(list));
	struct request *rq, *next;

	llist_for_each_entry_safe(rq, next, entry, ipi_list)
		rq->q->mq_ops->complete(rq);
}

static __latent_entropy void blk_done_softirq(struct softirq_action *h)
{
	blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
}

static int blk_softirq_cpu_dead(unsigned int cpu)
{
	blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
	return 0;
}

static void __blk_mq_complete_request_remote(void *data)
{
	__raise_softirq_irqoff(BLOCK_SOFTIRQ);
}

static inline bool blk_mq_complete_need_ipi(struct request *rq)
{
	int cpu = raw_smp_processor_id();

	if (!IS_ENABLED(CONFIG_SMP) ||
	    !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
		return false;
	/*
	 * With force threaded interrupts enabled, raising softirq from an SMP
	 * function call will always result in waking the ksoftirqd thread.
	 * This is probably worse than completing the request on a different
	 * cache domain.
	 */
	if (force_irqthreads())
		return false;

	/* same CPU or cache domain? Complete locally */
	if (cpu == rq->mq_ctx->cpu ||
	    (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
	     cpus_share_cache(cpu, rq->mq_ctx->cpu)))
		return false;

	/* don't try to IPI to an offline CPU */
	return cpu_online(rq->mq_ctx->cpu);
}
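
/*
 * Example of the decision above (illustrative): with QUEUE_FLAG_SAME_COMP set
 * and QUEUE_FLAG_SAME_FORCE clear, a request submitted from CPU 2 but
 * completing on CPU 12 in a different cache domain is redirected back to
 * CPU 2 via IPI, while a completion on CPU 3 that shares CPU 2's cache is
 * handled locally.
 */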

static void blk_mq_complete_send_ipi(struct request *rq)
{
	struct llist_head *list;
	unsigned int cpu;

	cpu = rq->mq_ctx->cpu;
	list = &per_cpu(blk_cpu_done, cpu);
	if (llist_add(&rq->ipi_list, list)) {
		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
		smp_call_function_single_async(cpu, &rq->csd);
	}
}

static void blk_mq_raise_softirq(struct request *rq)
{
	struct llist_head *list;

	preempt_disable();
	list = this_cpu_ptr(&blk_cpu_done);
	if (llist_add(&rq->ipi_list, list))
		raise_softirq(BLOCK_SOFTIRQ);
	preempt_enable();
}

bool blk_mq_complete_request_remote(struct request *rq)
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);

	/*
	 * For a polled request, always complete locally, it's pointless
	 * to redirect the completion.
	 */
	if (rq->cmd_flags & REQ_HIPRI)
		return false;

	if (blk_mq_complete_need_ipi(rq)) {
		blk_mq_complete_send_ipi(rq);
		return true;
	}

	if (rq->q->nr_hw_queues == 1) {
		blk_mq_raise_softirq(rq);
		return true;
	}
	return false;
}
EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:	the request being processed
 *
 * Description:
 *	Complete a request by scheduling the ->complete_rq operation.
 **/
void blk_mq_complete_request(struct request *rq)
{
	if (!blk_mq_complete_request_remote(rq))
		rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
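
/*
 * Illustrative completion flow (the mydrv_* names are hypothetical): a
 * driver's interrupt handler records the hardware status and calls
 * blk_mq_complete_request(rq); the queue's ->complete() callback then runs,
 * possibly via IPI or softirq on the submitting CPU, and ends the request:
 *
 *	static void mydrv_complete_rq(struct request *rq)
 *	{
 *		blk_mq_end_request(rq, mydrv_result_to_blk_status(rq));
 *	}
 */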

static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
	__releases(hctx->srcu)
{
	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
		rcu_read_unlock();
	else
		srcu_read_unlock(hctx->srcu, srcu_idx);
}

static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
	__acquires(hctx->srcu)
{
	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
		/* shut up gcc false positive */
		*srcu_idx = 0;
		rcu_read_lock();
	} else
		*srcu_idx = srcu_read_lock(hctx->srcu);
}

/**
 * blk_mq_start_request - Start processing a request
 * @rq: Pointer to request to be started
 *
 * Function used by device drivers to notify the block layer that a request
 * is going to be processed now, so the block layer can do proper
 * initializations such as starting the timeout timer.
 */
void blk_mq_start_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_issue(rq);

	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
		u64 start_time;
#ifdef CONFIG_BLK_CGROUP
		if (rq->bio)
			start_time = bio_issue_time(&rq->bio->bi_issue);
		else
#endif
			start_time = ktime_get_ns();
		rq->io_start_time_ns = start_time;
		rq->stats_sectors = blk_rq_sectors(rq);
		rq->rq_flags |= RQF_STATS;
		rq_qos_issue(q, rq);
	}

	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);

	blk_add_timer(rq);
	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);

#ifdef CONFIG_BLK_DEV_INTEGRITY
	if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
		q->integrity.profile->prepare_fn(rq);
#endif
}
EXPORT_SYMBOL(blk_mq_start_request);

static void __blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	blk_mq_put_driver_tag(rq);

	trace_block_rq_requeue(rq);
	rq_qos_requeue(q, rq);

	if (blk_mq_request_started(rq)) {
		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
		rq->rq_flags &= ~RQF_TIMED_OUT;
	}
}

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
	__blk_mq_requeue_request(rq);

	/* this request will be re-inserted to io scheduler queue */
	blk_mq_sched_requeue_request(rq);

	BUG_ON(!list_empty(&rq->queuelist));
	blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);

static void blk_mq_requeue_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, requeue_work.work);
	LIST_HEAD(rq_list);
	struct request *rq, *next;

	spin_lock_irq(&q->requeue_lock);
	list_splice_init(&q->requeue_list, &rq_list);
	spin_unlock_irq(&q->requeue_lock);

	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
		if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
			continue;

		rq->rq_flags &= ~RQF_SOFTBARRIER;
		list_del_init(&rq->queuelist);
		/*
		 * If RQF_DONTPREP is set, rq already contains driver specific
		 * data, so insert it into the hctx dispatch list to avoid any
		 * merge.
		 */
		if (rq->rq_flags & RQF_DONTPREP)
			blk_mq_request_bypass_insert(rq, false, false);
		else
			blk_mq_sched_insert_request(rq, true, false, false);
	}

	while (!list_empty(&rq_list)) {
		rq = list_entry(rq_list.next, struct request, queuelist);
		list_del_init(&rq->queuelist);
		blk_mq_sched_insert_request(rq, false, false, false);
	}

	blk_mq_run_hw_queues(q, false);
}

void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
				bool kick_requeue_list)
{
	struct request_queue *q = rq->q;
	unsigned long flags;

	/*
	 * We abuse this flag that is otherwise used by the I/O scheduler to
	 * request head insertion from the workqueue.
	 */
	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);

	spin_lock_irqsave(&q->requeue_lock, flags);
	if (at_head) {
		rq->rq_flags |= RQF_SOFTBARRIER;
		list_add(&rq->queuelist, &q->requeue_list);
	} else {
		list_add_tail(&rq->queuelist, &q->requeue_list);
	}
	spin_unlock_irqrestore(&q->requeue_lock, flags);

	if (kick_requeue_list)
		blk_mq_kick_requeue_list(q);
}

void blk_mq_kick_requeue_list(struct request_queue *q)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);

void blk_mq_delay_kick_requeue_list(struct request_queue *q,
				    unsigned long msecs)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
				    msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);

struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}

	return NULL;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);

static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       void *priv, bool reserved)
{
	/*
	 * If we find a request that isn't idle and the queue matches,
	 * we know the queue is busy. Return false to stop the iteration.
	 */
	if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
		bool *busy = priv;

		*busy = true;
		return false;
	}

	return true;
}

bool blk_mq_queue_inflight(struct request_queue *q)
{
	bool busy = false;

	blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
	return busy;
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);

static void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
	req->rq_flags |= RQF_TIMED_OUT;
	if (req->q->mq_ops->timeout) {
		enum blk_eh_timer_return ret;

		ret = req->q->mq_ops->timeout(req, reserved);
		if (ret == BLK_EH_DONE)
			return;
		WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
	}

	blk_add_timer(req);
}

static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
{
	unsigned long deadline;

	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
		return false;
	if (rq->rq_flags & RQF_TIMED_OUT)
		return false;

	deadline = READ_ONCE(rq->deadline);
	if (time_after_eq(jiffies, deadline))
		return true;

	if (*next == 0)
		*next = deadline;
	else if (time_after(*next, deadline))
		*next = deadline;
	return false;
}

void blk_mq_put_rq_ref(struct request *rq)
{
	if (is_flush_rq(rq))
		rq->end_io(rq, 0);
	else if (refcount_dec_and_test(&rq->ref))
		__blk_mq_free_request(rq);
}

static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
		struct request *rq, void *priv, bool reserved)
{
	unsigned long *next = priv;

	/*
	 * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
	 * be reallocated underneath the timeout handler's processing, then
	 * the expire check is reliable. If the request is not expired, then
	 * it was completed and reallocated as a new request after returning
	 * from blk_mq_check_expired().
	 */
	if (blk_mq_req_expired(rq, next))
		blk_mq_rq_timed_out(rq, reserved);
	return true;
}

static void blk_mq_timeout_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, timeout_work);
	unsigned long next = 0;
	struct blk_mq_hw_ctx *hctx;
	int i;

	/* A deadlock might occur if a request is stuck requiring a
	 * timeout at the same time a queue freeze is waiting
	 * completion, since the timeout code would not be able to
	 * acquire the queue reference here.
	 *
	 * That's why we don't use blk_queue_enter here; instead, we use
	 * percpu_ref_tryget directly, because we need to be able to
	 * obtain a reference even in the short window between the queue
	 * starting to freeze, by dropping the first reference in
	 * blk_freeze_queue_start, and the moment the last request is
	 * consumed, marked by the instant q_usage_counter reaches
	 * zero.
	 */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);

	if (next != 0) {
		mod_timer(&q->timeout, next);
	} else {
		/*
		 * Request timeouts are handled as a forward rolling timer. If
		 * we end up here it means that no requests are pending and
		 * also that no request has been pending for a while. Mark
		 * each hctx as idle.
		 */
		queue_for_each_hw_ctx(q, hctx, i) {
			/* the hctx may be unmapped, so check it here */
			if (blk_mq_hw_queue_mapped(hctx))
				blk_mq_tag_idle(hctx);
		}
	}
	blk_queue_exit(q);
}

struct flush_busy_ctx_data {
	struct blk_mq_hw_ctx *hctx;
	struct list_head *list;
};

static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_busy_ctx_data *flush_data = data;
	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
	enum hctx_type type = hctx->type;

	spin_lock(&ctx->lock);
	list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&ctx->lock);
	return true;
}

/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch list.
 */
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
	struct flush_busy_ctx_data data = {
		.hctx = hctx,
		.list = list,
	};

	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);

struct dispatch_rq_data {
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
};

static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
		void *data)
{
	struct dispatch_rq_data *dispatch_data = data;
	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
	enum hctx_type type = hctx->type;

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_lists[type])) {
		dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
		list_del_init(&dispatch_data->rq->queuelist);
		if (list_empty(&ctx->rq_lists[type]))
			sbitmap_clear_bit(sb, bitnr);
	}
	spin_unlock(&ctx->lock);

	return !dispatch_data->rq;
}

struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
					struct blk_mq_ctx *start)
{
	unsigned off = start ? start->index_hw[hctx->type] : 0;
	struct dispatch_rq_data data = {
		.hctx = hctx,
		.rq   = NULL,
	};

	__sbitmap_for_each_set(&hctx->ctx_map, off,
			       dispatch_rq_from_ctx, &data);

	return data.rq;
}

static inline unsigned int queued_to_index(unsigned int queued)
{
	if (!queued)
		return 0;

	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
ML
1110static bool __blk_mq_get_driver_tag(struct request *rq)
1111{
ae0f1a73 1112 struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
570e9b73 1113 unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
570e9b73
ML
1114 int tag;
1115
568f2700
ML
1116 blk_mq_tag_busy(rq->mq_hctx);
1117
570e9b73 1118 if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
ae0f1a73 1119 bt = &rq->mq_hctx->tags->breserved_tags;
570e9b73 1120 tag_offset = 0;
28500850
ML
1121 } else {
1122 if (!hctx_may_queue(rq->mq_hctx, bt))
1123 return false;
570e9b73
ML
1124 }
1125
570e9b73
ML
1126 tag = __sbitmap_queue_get(bt);
1127 if (tag == BLK_MQ_NO_TAG)
1128 return false;
1129
1130 rq->tag = tag + tag_offset;
570e9b73
ML
1131 return true;
1132}
1133
61347154 1134bool blk_mq_get_driver_tag(struct request *rq)
570e9b73 1135{
568f2700
ML
1136 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
1137
1138 if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
1139 return false;
1140
51db1c37 1141 if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
568f2700
ML
1142 !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
1143 rq->rq_flags |= RQF_MQ_INFLIGHT;
bccf5e26 1144 __blk_mq_inc_active_requests(hctx);
568f2700
ML
1145 }
1146 hctx->tags->rqs[rq->tag] = rq;
1147 return true;
570e9b73
ML
1148}
1149
eb619fdb
JA
1150static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1151 int flags, void *key)
da55f2cc
OS
1152{
1153 struct blk_mq_hw_ctx *hctx;
1154
1155 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
1156
5815839b 1157 spin_lock(&hctx->dispatch_wait_lock);
e8618575
JA
1158 if (!list_empty(&wait->entry)) {
1159 struct sbitmap_queue *sbq;
1160
1161 list_del_init(&wait->entry);
ae0f1a73 1162 sbq = &hctx->tags->bitmap_tags;
e8618575
JA
1163 atomic_dec(&sbq->ws_active);
1164 }
5815839b
ML
1165 spin_unlock(&hctx->dispatch_wait_lock);
1166
da55f2cc
OS
1167 blk_mq_run_hw_queue(hctx, true);
1168 return 1;
1169}
1170
f906a6a0
JA
1171/*
1172 * Mark us waiting for a tag. For shared tags, this involves hooking us into
ee3e4de5
BVA
1173 * the tag wakeups. For non-shared tags, we can simply mark us needing a
1174 * restart. For both cases, take care to check the condition again after
f906a6a0
JA
1175 * marking us as waiting.
1176 */
2278d69f 1177static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
f906a6a0 1178 struct request *rq)
da55f2cc 1179{
ae0f1a73 1180 struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
5815839b 1181 struct wait_queue_head *wq;
f906a6a0
JA
1182 wait_queue_entry_t *wait;
1183 bool ret;
da55f2cc 1184
51db1c37 1185 if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
684b7324 1186 blk_mq_sched_mark_restart_hctx(hctx);
f906a6a0 1187
c27d53fb
BVA
1188 /*
1189 * It's possible that a tag was freed in the window between the
1190 * allocation failure and adding the hardware queue to the wait
1191 * queue.
1192 *
1193 * Don't clear RESTART here, someone else could have set it.
1194 * At most this will cost an extra queue run.
1195 */
8ab6bb9e 1196 return blk_mq_get_driver_tag(rq);
eb619fdb 1197 }
eb619fdb 1198
2278d69f 1199 wait = &hctx->dispatch_wait;
c27d53fb
BVA
1200 if (!list_empty_careful(&wait->entry))
1201 return false;
1202
e8618575 1203 wq = &bt_wait_ptr(sbq, hctx)->wait;
5815839b
ML
1204
1205 spin_lock_irq(&wq->lock);
1206 spin_lock(&hctx->dispatch_wait_lock);
c27d53fb 1207 if (!list_empty(&wait->entry)) {
5815839b
ML
1208 spin_unlock(&hctx->dispatch_wait_lock);
1209 spin_unlock_irq(&wq->lock);
c27d53fb 1210 return false;
eb619fdb
JA
1211 }
1212
e8618575 1213 atomic_inc(&sbq->ws_active);
5815839b
ML
1214 wait->flags &= ~WQ_FLAG_EXCLUSIVE;
1215 __add_wait_queue(wq, wait);
c27d53fb 1216
da55f2cc 1217 /*
eb619fdb
JA
1218 * It's possible that a tag was freed in the window between the
1219 * allocation failure and adding the hardware queue to the wait
1220 * queue.
da55f2cc 1221 */
8ab6bb9e 1222 ret = blk_mq_get_driver_tag(rq);
c27d53fb 1223 if (!ret) {
5815839b
ML
1224 spin_unlock(&hctx->dispatch_wait_lock);
1225 spin_unlock_irq(&wq->lock);
c27d53fb 1226 return false;
eb619fdb 1227 }
c27d53fb
BVA
1228
1229 /*
1230 * We got a tag, remove ourselves from the wait queue to ensure
1231 * someone else gets the wakeup.
1232 */
c27d53fb 1233 list_del_init(&wait->entry);
e8618575 1234 atomic_dec(&sbq->ws_active);
5815839b
ML
1235 spin_unlock(&hctx->dispatch_wait_lock);
1236 spin_unlock_irq(&wq->lock);
c27d53fb
BVA
1237
1238 return true;
da55f2cc
OS
1239}
1240
6e768717
ML
1241#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
1242#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
1243/*
1244 * Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
1245 * - EWMA is one simple way to compute running average value
1246 * - weight(7/8 and 1/8) is applied so that it can decrease exponentially
1247 * - take 4 as factor for avoiding to get too small(0) result, and this
1248 * factor doesn't matter because EWMA decreases exponentially
1249 */
1250static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
1251{
1252 unsigned int ewma;
1253
6e768717
ML
1254 ewma = hctx->dispatch_busy;
1255
1256 if (!ewma && !busy)
1257 return;
1258
1259 ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
1260 if (busy)
1261 ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
1262 ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
1263
1264 hctx->dispatch_busy = ewma;
1265}
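
/*
 * Worked example with the constants above: starting from dispatch_busy == 0,
 * one busy update gives (0 * 7 + 16) / 8 = 2, repeated busy updates converge
 * towards 16, and each idle update decays the value by roughly 1/8.
 */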

#define BLK_MQ_RESOURCE_DELAY	3		/* ms units */

static void blk_mq_handle_dev_resource(struct request *rq,
				       struct list_head *list)
{
	struct request *next =
		list_first_entry_or_null(list, struct request, queuelist);

	/*
	 * If an I/O scheduler has been configured and we got a driver tag for
	 * the next request already, free it.
	 */
	if (next)
		blk_mq_put_driver_tag(next);

	list_add(&rq->queuelist, list);
	__blk_mq_requeue_request(rq);
}

static void blk_mq_handle_zone_resource(struct request *rq,
					struct list_head *zone_list)
{
	/*
	 * If we end up here it is because we cannot dispatch a request to a
	 * specific zone due to LLD level zone-write locking or other zone
	 * related resource not being available. In this case, set the request
	 * aside in zone_list for retrying it later.
	 */
	list_add(&rq->queuelist, zone_list);
	__blk_mq_requeue_request(rq);
}

enum prep_dispatch {
	PREP_DISPATCH_OK,
	PREP_DISPATCH_NO_TAG,
	PREP_DISPATCH_NO_BUDGET,
};

static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
						  bool need_budget)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	int budget_token = -1;

	if (need_budget) {
		budget_token = blk_mq_get_dispatch_budget(rq->q);
		if (budget_token < 0) {
			blk_mq_put_driver_tag(rq);
			return PREP_DISPATCH_NO_BUDGET;
		}
		blk_mq_set_rq_budget_token(rq, budget_token);
	}

	if (!blk_mq_get_driver_tag(rq)) {
		/*
		 * The initial allocation attempt failed, so we need to
		 * rerun the hardware queue when a tag is freed. The
		 * waitqueue takes care of that. If the queue is run
		 * before we add this entry back on the dispatch list,
		 * we'll re-run it below.
		 */
		if (!blk_mq_mark_tag_wait(hctx, rq)) {
			/*
			 * All budgets not obtained in this function are
			 * released together while handling the partial
			 * dispatch.
			 */
			if (need_budget)
				blk_mq_put_dispatch_budget(rq->q, budget_token);
			return PREP_DISPATCH_NO_TAG;
		}
	}

	return PREP_DISPATCH_OK;
}

/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
static void blk_mq_release_budgets(struct request_queue *q,
		struct list_head *list)
{
	struct request *rq;

	list_for_each_entry(rq, list, queuelist) {
		int budget_token = blk_mq_get_rq_budget_token(rq);

		if (budget_token >= 0)
			blk_mq_put_dispatch_budget(q, budget_token);
	}
}

/*
 * Returns true if we did some work AND can potentially do more.
 */
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
			     unsigned int nr_budgets)
{
	enum prep_dispatch prep;
	struct request_queue *q = hctx->queue;
	struct request *rq, *nxt;
	int errors, queued;
	blk_status_t ret = BLK_STS_OK;
	LIST_HEAD(zone_list);

	if (list_empty(list))
		return false;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	errors = queued = 0;
	do {
		struct blk_mq_queue_data bd;

		rq = list_first_entry(list, struct request, queuelist);

		WARN_ON_ONCE(hctx != rq->mq_hctx);
		prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
		if (prep != PREP_DISPATCH_OK)
			break;

		list_del_init(&rq->queuelist);

		bd.rq = rq;

		/*
		 * Flag last if we have no more requests, or if we have more
		 * but can't assign a driver tag to it.
		 */
		if (list_empty(list))
			bd.last = true;
		else {
			nxt = list_first_entry(list, struct request, queuelist);
			bd.last = !blk_mq_get_driver_tag(nxt);
		}

		/*
		 * once the request is queued to lld, no need to cover the
		 * budget any more
		 */
		if (nr_budgets)
			nr_budgets--;
		ret = q->mq_ops->queue_rq(hctx, &bd);
		switch (ret) {
		case BLK_STS_OK:
			queued++;
			break;
		case BLK_STS_RESOURCE:
		case BLK_STS_DEV_RESOURCE:
			blk_mq_handle_dev_resource(rq, list);
			goto out;
		case BLK_STS_ZONE_RESOURCE:
			/*
			 * Move the request to zone_list and keep going through
			 * the dispatch list to find more requests the drive can
			 * accept.
			 */
			blk_mq_handle_zone_resource(rq, &zone_list);
			break;
		default:
			errors++;
			blk_mq_end_request(rq, ret);
		}
	} while (!list_empty(list));
out:
	if (!list_empty(&zone_list))
		list_splice_tail_init(&zone_list, list);

	hctx->dispatched[queued_to_index(queued)]++;

	/* If we didn't flush the entire list, we could have told the driver
	 * there was more coming, but that turned out to be a lie.
	 */
	if ((!list_empty(list) || errors) && q->mq_ops->commit_rqs && queued)
		q->mq_ops->commit_rqs(hctx);
	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
	if (!list_empty(list)) {
		bool needs_restart;
		/* For non-shared tags, the RESTART check will suffice */
		bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
			(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
		bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;

		if (nr_budgets)
			blk_mq_release_budgets(q, list);

		spin_lock(&hctx->lock);
		list_splice_tail_init(list, &hctx->dispatch);
		spin_unlock(&hctx->lock);

		/*
		 * Order adding requests to hctx->dispatch and checking
		 * SCHED_RESTART flag. The pair of this smp_mb() is the one
		 * in blk_mq_sched_restart(). This prevents the restart code
		 * path from missing the newly added requests on
		 * hctx->dispatch while SCHED_RESTART is observed here.
		 */
		smp_mb();

		/*
		 * If SCHED_RESTART was set by the caller of this function and
		 * it is no longer set that means that it was cleared by another
		 * thread and hence that a queue rerun is needed.
		 *
		 * If 'no_tag' is set, that means that we failed getting
		 * a driver tag with an I/O scheduler attached. If our dispatch
		 * waitqueue is no longer active, ensure that we run the queue
		 * AFTER adding our entries back to the list.
		 *
		 * If no I/O scheduler has been configured it is possible that
		 * the hardware queue got stopped and restarted before requests
		 * were pushed back onto the dispatch list. Rerun the queue to
		 * avoid starvation. Notes:
		 * - blk_mq_run_hw_queue() checks whether or not a queue has
		 *   been stopped before rerunning a queue.
		 * - Some but not all block drivers stop a queue before
		 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
		 *   and dm-rq.
		 *
		 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
		 * bit is set, run queue after a delay to avoid IO stalls
		 * that could otherwise occur if the queue is idle.  We'll do
		 * similar if we couldn't get budget and SCHED_RESTART is set.
		 */
		needs_restart = blk_mq_sched_needs_restart(hctx);
		if (!needs_restart ||
		    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
			blk_mq_run_hw_queue(hctx, true);
		else if (needs_restart && (ret == BLK_STS_RESOURCE ||
					   no_budget_avail))
			blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);

		blk_mq_update_dispatch_busy(hctx, true);
		return false;
	} else
		blk_mq_update_dispatch_busy(hctx, false);

	return (queued + errors) != 0;
}

/**
 * __blk_mq_run_hw_queue - Run a hardware queue.
 * @hctx: Pointer to the hardware queue to run.
 *
 * Send pending requests to the hardware.
 */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	int srcu_idx;

	/*
	 * We can't run the queue inline with ints disabled. Ensure that
	 * we catch bad users of this early.
	 */
	WARN_ON_ONCE(in_interrupt());

	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);

	hctx_lock(hctx, &srcu_idx);
	blk_mq_sched_dispatch_requests(hctx);
	hctx_unlock(hctx, srcu_idx);
}

static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
{
	int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);

	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(hctx->cpumask);
	return cpu;
}

/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 */
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
	bool tried = false;
	int next_cpu = hctx->next_cpu;

	if (hctx->queue->nr_hw_queues == 1)
		return WORK_CPU_UNBOUND;

	if (--hctx->next_cpu_batch <= 0) {
select_cpu:
		next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
				cpu_online_mask);
		if (next_cpu >= nr_cpu_ids)
			next_cpu = blk_mq_first_mapped_cpu(hctx);
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
	}

	/*
	 * Do unbound schedule if we can't find an online CPU for this hctx,
	 * and it should only happen in the path of handling CPU DEAD.
	 */
	if (!cpu_online(next_cpu)) {
		if (!tried) {
			tried = true;
			goto select_cpu;
		}

		/*
		 * Make sure to re-select CPU next time once after CPUs
		 * in hctx->cpumask become online again.
		 */
		hctx->next_cpu = next_cpu;
		hctx->next_cpu_batch = 1;
		return WORK_CPU_UNBOUND;
	}

	hctx->next_cpu = next_cpu;
	return next_cpu;
}
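
/*
 * Example: with hctx->cpumask spanning CPUs 2 and 3, work is queued on CPU 2
 * for BLK_MQ_CPU_WORK_BATCH runs, then on CPU 3, and so on; if no CPU in the
 * mask is online (CPU DEAD handling), the work falls back to WORK_CPU_UNBOUND.
 */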
1585
105663f7
AA
1586/**
1587 * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
1588 * @hctx: Pointer to the hardware queue to run.
1589 * @async: If we want to run the queue asynchronously.
fa94ba8a 1590 * @msecs: Milliseconds of delay to wait before running the queue.
105663f7
AA
1591 *
1592 * If !@async, try to run the queue now. Else, run the queue asynchronously and
1593 * with a delay of @msecs.
1594 */
7587a5ae
BVA
1595static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1596 unsigned long msecs)
320ae51f 1597{
5435c023 1598 if (unlikely(blk_mq_hctx_stopped(hctx)))
320ae51f
JA
1599 return;
1600
1b792f2f 1601 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
2a90d4aa
PB
1602 int cpu = get_cpu();
1603 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
398205b8 1604 __blk_mq_run_hw_queue(hctx);
2a90d4aa 1605 put_cpu();
398205b8
PB
1606 return;
1607 }
e4043dcf 1608
2a90d4aa 1609 put_cpu();
e4043dcf 1610 }
398205b8 1611
ae943d20
BVA
1612 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
1613 msecs_to_jiffies(msecs));
7587a5ae
BVA
1614}
1615
105663f7
AA
1616/**
1617 * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
1618 * @hctx: Pointer to the hardware queue to run.
fa94ba8a 1619 * @msecs: Milliseconds of delay to wait before running the queue.
105663f7
AA
1620 *
1621 * Run a hardware queue asynchronously with a delay of @msecs.
1622 */
7587a5ae
BVA
1623void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1624{
1625 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
1626}
1627EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1628
105663f7
AA
1629/**
1630 * blk_mq_run_hw_queue - Start to run a hardware queue.
1631 * @hctx: Pointer to the hardware queue to run.
1632 * @async: If we want to run the queue asynchronously.
1633 *
1634 * Check if the request queue is not in a quiesced state and if there are
1635 * pending requests to be sent. If this is true, run the queue to send requests
1636 * to hardware.
1637 */
626fb735 1638void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
7587a5ae 1639{
24f5a90f
ML
1640 int srcu_idx;
1641 bool need_run;
1642
1643 /*
1644 * When queue is quiesced, we may be switching io scheduler, or
1645 * updating nr_hw_queues, or other things, and we can't run queue
1646 * any more, even __blk_mq_hctx_has_pending() can't be called safely.
1647 *
1648 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
1649 * quiesced.
1650 */
04ced159
JA
1651 hctx_lock(hctx, &srcu_idx);
1652 need_run = !blk_queue_quiesced(hctx->queue) &&
1653 blk_mq_hctx_has_pending(hctx);
1654 hctx_unlock(hctx, srcu_idx);
24f5a90f 1655
626fb735 1656 if (need_run)
79f720a7 1657 __blk_mq_delay_run_hw_queue(hctx, async, 0);
320ae51f 1658}
5b727272 1659EXPORT_SYMBOL(blk_mq_run_hw_queue);
320ae51f 1660
b6e68ee8
JK
1661/*
1662 * Is the request queue handled by an IO scheduler that does not respect
1663 * hardware queues when dispatching?
1664 */
1665static bool blk_mq_has_sqsched(struct request_queue *q)
1666{
1667 struct elevator_queue *e = q->elevator;
1668
1669 if (e && e->type->ops.dispatch_request &&
1670 !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
1671 return true;
1672 return false;
1673}
1674
1675/*
1676 * Return prefered queue to dispatch from (if any) for non-mq aware IO
1677 * scheduler.
1678 */
1679static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
1680{
1681 struct blk_mq_hw_ctx *hctx;
1682
1683 /*
1684 * If the IO scheduler does not respect hardware queues when
1685 * dispatching, we just don't bother with multiple HW queues and
1686 * dispatch from hctx for the current CPU since running multiple queues
1687 * just causes lock contention inside the scheduler and pointless cache
1688 * bouncing.
1689 */
1690 hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT,
1691 raw_smp_processor_id());
1692 if (!blk_mq_hctx_stopped(hctx))
1693 return hctx;
1694 return NULL;
1695}

/**
 * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
 * @q: Pointer to the request queue to run.
 * @async: If we want to run the queue asynchronously.
 */
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx, *sq_hctx;
	int i;

	sq_hctx = NULL;
	if (blk_mq_has_sqsched(q))
		sq_hctx = blk_mq_get_sq_hctx(q);
	queue_for_each_hw_ctx(q, hctx, i) {
		if (blk_mq_hctx_stopped(hctx))
			continue;
		/*
		 * Dispatch from this hctx either if there's no hctx preferred
		 * by IO scheduler or if it has requests that bypass the
		 * scheduler.
		 */
		if (!sq_hctx || sq_hctx == hctx ||
		    !list_empty_careful(&hctx->dispatch))
			blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL(blk_mq_run_hw_queues);

/**
 * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
 * @q: Pointer to the request queue to run.
 * @msecs: Milliseconds of delay to wait before running the queues.
 */
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
	struct blk_mq_hw_ctx *hctx, *sq_hctx;
	int i;

	sq_hctx = NULL;
	if (blk_mq_has_sqsched(q))
		sq_hctx = blk_mq_get_sq_hctx(q);
	queue_for_each_hw_ctx(q, hctx, i) {
		if (blk_mq_hctx_stopped(hctx))
			continue;
		/*
		 * Dispatch from this hctx either if there's no hctx preferred
		 * by IO scheduler or if it has requests that bypass the
		 * scheduler.
		 */
		if (!sq_hctx || sq_hctx == hctx ||
		    !list_empty_careful(&hctx->dispatch))
			blk_mq_delay_run_hw_queue(hctx, msecs);
	}
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);

/**
 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
 * @q: request queue.
 *
 * The caller is responsible for serializing this function against
 * blk_mq_{start,stop}_hw_queue().
 */
bool blk_mq_queue_stopped(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		if (blk_mq_hctx_stopped(hctx))
			return true;

	return false;
}
EXPORT_SYMBOL(blk_mq_queue_stopped);

/*
 * This function is often used by drivers to pause .queue_rq() when there
 * aren't enough resources or some condition isn't satisfied, in which case
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queue() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	cancel_delayed_work(&hctx->run_work);

	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
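
/*
 * Illustrative sketch (not part of blk-mq core): the usual pattern is to stop
 * the hardware queue when ->queue_rq() finds the device full, and to restart
 * it from the completion path once room frees up (see
 * blk_mq_start_stopped_hw_queues() below). mydrv_* names are hypothetical.
 *
 *	static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					   const struct blk_mq_queue_data *bd)
 *	{
 *		if (mydrv_ring_full(hctx->driver_data)) {
 *			blk_mq_stop_hw_queue(hctx);
 *			// the driver guarantees a restart, hence DEV_RESOURCE
 *			return BLK_STS_DEV_RESOURCE;
 *		}
 *		return mydrv_issue(bd->rq);
 *	}
 */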

/*
 * This function is often used by drivers to pause .queue_rq() when there
 * aren't enough resources or some condition isn't satisfied, in which case
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queues() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);

void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);

	blk_mq_run_hw_queue(hctx, false);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);

void blk_mq_start_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_start_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);

void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	if (!blk_mq_hctx_stopped(hctx))
		return;

	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
	blk_mq_run_hw_queue(hctx, async);
}
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);

void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_start_stopped_hw_queue(hctx, async);
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
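
/*
 * Illustrative sketch (not part of blk-mq core): the counterpart of the
 * stop-on-busy pattern above. Once the device has drained enough work, the
 * driver restarts whatever it stopped; passing async == true avoids
 * dispatching from the completion context itself. mydrv_* names are
 * hypothetical.
 *
 *	static void mydrv_complete(struct mydrv_dev *dev)
 *	{
 *		mydrv_reap_completions(dev);
 *		if (mydrv_ring_has_room(dev))
 *			blk_mq_start_stopped_hw_queues(dev->queue, true);
 *	}
 */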
1846
70f4db63 1847static void blk_mq_run_work_fn(struct work_struct *work)
320ae51f
JA
1848{
1849 struct blk_mq_hw_ctx *hctx;
1850
9f993737 1851 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
320ae51f 1852
21c6e939 1853 /*
15fe8a90 1854 * If we are stopped, don't run the queue.
21c6e939 1855 */
0841031a 1856 if (blk_mq_hctx_stopped(hctx))
0196d6b4 1857 return;
7587a5ae
BVA
1858
1859 __blk_mq_run_hw_queue(hctx);
1860}
1861
cfd0c552 1862static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
cfd0c552
ML
1863 struct request *rq,
1864 bool at_head)
320ae51f 1865{
e57690fe 1866 struct blk_mq_ctx *ctx = rq->mq_ctx;
c16d6b5a 1867 enum hctx_type type = hctx->type;
e57690fe 1868
7b607814
BVA
1869 lockdep_assert_held(&ctx->lock);
1870
a54895fa 1871 trace_block_rq_insert(rq);
01b983c9 1872
72a0a36e 1873 if (at_head)
c16d6b5a 1874 list_add(&rq->queuelist, &ctx->rq_lists[type]);
72a0a36e 1875 else
c16d6b5a 1876 list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
cfd0c552 1877}
4bb659b1 1878
2c3ad667
JA
1879void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1880 bool at_head)
cfd0c552
ML
1881{
1882 struct blk_mq_ctx *ctx = rq->mq_ctx;
1883
7b607814
BVA
1884 lockdep_assert_held(&ctx->lock);
1885
e57690fe 1886 __blk_mq_insert_req_list(hctx, rq, at_head);
320ae51f 1887 blk_mq_hctx_mark_pending(hctx, ctx);
320ae51f
JA
1888}
1889
105663f7
AA
1890/**
1891 * blk_mq_request_bypass_insert - Insert a request at dispatch list.
1892 * @rq: Pointer to request to be inserted.
26bfeb26 1893 * @at_head: true if the request should be inserted at the head of the list.
105663f7
AA
1894 * @run_queue: If we should run the hardware queue after inserting the request.
1895 *
157f377b
JA
1896 * Should only be used carefully, when the caller knows we want to
1897 * bypass a potential IO scheduler on the target device.
1898 */
01e99aec
ML
1899void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
1900 bool run_queue)
157f377b 1901{
ea4f995e 1902 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
157f377b
JA
1903
1904 spin_lock(&hctx->lock);
01e99aec
ML
1905 if (at_head)
1906 list_add(&rq->queuelist, &hctx->dispatch);
1907 else
1908 list_add_tail(&rq->queuelist, &hctx->dispatch);
157f377b
JA
1909 spin_unlock(&hctx->lock);
1910
b0850297
ML
1911 if (run_queue)
1912 blk_mq_run_hw_queue(hctx, false);
157f377b
JA
1913}
1914
bd166ef1
JA
1915void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1916 struct list_head *list)
320ae51f
JA
1917
1918{
3f0cedc7 1919 struct request *rq;
c16d6b5a 1920 enum hctx_type type = hctx->type;
3f0cedc7 1921
320ae51f
JA
1922 /*
1923 * preemption doesn't flush plug list, so it's possible ctx->cpu is
1924 * offline now
1925 */
3f0cedc7 1926 list_for_each_entry(rq, list, queuelist) {
e57690fe 1927 BUG_ON(rq->mq_ctx != ctx);
a54895fa 1928 trace_block_rq_insert(rq);
320ae51f 1929 }
3f0cedc7
ML
1930
1931 spin_lock(&ctx->lock);
c16d6b5a 1932 list_splice_tail_init(list, &ctx->rq_lists[type]);
cfd0c552 1933 blk_mq_hctx_mark_pending(hctx, ctx);
320ae51f 1934 spin_unlock(&ctx->lock);
320ae51f
JA
1935}
1936
4f0f586b
ST
1937static int plug_rq_cmp(void *priv, const struct list_head *a,
1938 const struct list_head *b)
320ae51f
JA
1939{
1940 struct request *rqa = container_of(a, struct request, queuelist);
1941 struct request *rqb = container_of(b, struct request, queuelist);
1942
7d30a621
PB
1943 if (rqa->mq_ctx != rqb->mq_ctx)
1944 return rqa->mq_ctx > rqb->mq_ctx;
1945 if (rqa->mq_hctx != rqb->mq_hctx)
1946 return rqa->mq_hctx > rqb->mq_hctx;
3110fc79
JA
1947
1948 return blk_rq_pos(rqa) > blk_rq_pos(rqb);
320ae51f
JA
1949}
1950
1951void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1952{
320ae51f 1953 LIST_HEAD(list);
320ae51f 1954
95ed0c5b
PB
1955 if (list_empty(&plug->mq_list))
1956 return;
320ae51f
JA
1957 list_splice_init(&plug->mq_list, &list);
1958
ce5b009c
JA
1959 if (plug->rq_count > 2 && plug->multiple_queues)
1960 list_sort(NULL, &list, plug_rq_cmp);
320ae51f 1961
bcc816df
DZ
1962 plug->rq_count = 0;
1963
95ed0c5b
PB
1964 do {
1965 struct list_head rq_list;
1966 struct request *rq, *head_rq = list_entry_rq(list.next);
1967 struct list_head *pos = &head_rq->queuelist; /* skip first */
1968 struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
1969 struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
1970 unsigned int depth = 1;
1971
1972 list_for_each_continue(pos, &list) {
1973 rq = list_entry_rq(pos);
1974 BUG_ON(!rq->q);
1975 if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
1976 break;
1977 depth++;
320ae51f
JA
1978 }
1979
95ed0c5b
PB
1980 list_cut_before(&rq_list, &list, pos);
1981 trace_block_unplug(head_rq->q, depth, !from_schedule);
67cae4c9 1982 blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
bd166ef1 1983 from_schedule);
95ed0c5b 1984 } while(!list_empty(&list));
320ae51f
JA
1985}
1986
14ccb66b
CH
1987static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
1988 unsigned int nr_segs)
320ae51f 1989{
93f221ae
EB
1990 int err;
1991
f924cdde
CH
1992 if (bio->bi_opf & REQ_RAHEAD)
1993 rq->cmd_flags |= REQ_FAILFAST_MASK;
1994
1995 rq->__sector = bio->bi_iter.bi_sector;
1996 rq->write_hint = bio->bi_write_hint;
14ccb66b 1997 blk_rq_bio_prep(rq, bio, nr_segs);
93f221ae
EB
1998
1999 /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
2000 err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
2001 WARN_ON_ONCE(err);
4b570521 2002
b5af37ab 2003 blk_account_io_start(rq);
320ae51f
JA
2004}
2005
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
					    struct request *rq,
					    blk_qc_t *cookie, bool last)
{
	struct request_queue *q = rq->q;
	struct blk_mq_queue_data bd = {
		.rq = rq,
		.last = last,
	};
	blk_qc_t new_cookie;
	blk_status_t ret;

	new_cookie = request_to_qc_t(hctx, rq);

	/*
	 * For OK queue, we are done. For error, caller may kill it.
	 * Any other error (busy), just add it to our list as we
	 * previously would have done.
	 */
	ret = q->mq_ops->queue_rq(hctx, &bd);
	switch (ret) {
	case BLK_STS_OK:
		blk_mq_update_dispatch_busy(hctx, false);
		*cookie = new_cookie;
		break;
	case BLK_STS_RESOURCE:
	case BLK_STS_DEV_RESOURCE:
		blk_mq_update_dispatch_busy(hctx, true);
		__blk_mq_requeue_request(rq);
		break;
	default:
		blk_mq_update_dispatch_busy(hctx, false);
		*cookie = BLK_QC_T_NONE;
		break;
	}

	return ret;
}
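
/*
 * Illustrative sketch (not part of blk-mq core): bd.last filled in above lets
 * a driver batch doorbell writes, and ->commit_rqs() is the backstop used
 * when the core indicated more requests were coming but then could not
 * deliver them. mydrv_* names are hypothetical.
 *
 *	static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					   const struct blk_mq_queue_data *bd)
 *	{
 *		mydrv_post_to_ring(hctx->driver_data, bd->rq);
 *		if (bd->last)
 *			mydrv_ring_doorbell(hctx->driver_data);
 *		return BLK_STS_OK;
 *	}
 *
 *	static void mydrv_commit_rqs(struct blk_mq_hw_ctx *hctx)
 *	{
 *		mydrv_ring_doorbell(hctx->driver_data);
 *	}
 */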
2044
fd9c40f6 2045static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
0f95549c 2046 struct request *rq,
396eaf21 2047 blk_qc_t *cookie,
fd9c40f6 2048 bool bypass_insert, bool last)
0f95549c
MS
2049{
2050 struct request_queue *q = rq->q;
d964f04a 2051 bool run_queue = true;
2a5a24aa 2052 int budget_token;
d964f04a 2053
23d4ee19 2054 /*
fd9c40f6 2055 * RCU or SRCU read lock is needed before checking quiesced flag.
23d4ee19 2056 *
fd9c40f6
BVA
2057 * When queue is stopped or quiesced, ignore 'bypass_insert' from
2058 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
2059 * and avoid driver to try to dispatch again.
23d4ee19 2060 */
fd9c40f6 2061 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
d964f04a 2062 run_queue = false;
fd9c40f6
BVA
2063 bypass_insert = false;
2064 goto insert;
d964f04a 2065 }
f984df1f 2066
fd9c40f6
BVA
2067 if (q->elevator && !bypass_insert)
2068 goto insert;
2253efc8 2069
2a5a24aa
ML
2070 budget_token = blk_mq_get_dispatch_budget(q);
2071 if (budget_token < 0)
fd9c40f6 2072 goto insert;
bd166ef1 2073
2a5a24aa
ML
2074 blk_mq_set_rq_budget_token(rq, budget_token);
2075
8ab6bb9e 2076 if (!blk_mq_get_driver_tag(rq)) {
2a5a24aa 2077 blk_mq_put_dispatch_budget(q, budget_token);
fd9c40f6 2078 goto insert;
88022d72 2079 }
de148297 2080
fd9c40f6
BVA
2081 return __blk_mq_issue_directly(hctx, rq, cookie, last);
2082insert:
2083 if (bypass_insert)
2084 return BLK_STS_RESOURCE;
2085
db03f88f
ML
2086 blk_mq_sched_insert_request(rq, false, run_queue, false);
2087
fd9c40f6
BVA
2088 return BLK_STS_OK;
2089}
2090
105663f7
AA
2091/**
2092 * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2093 * @hctx: Pointer of the associated hardware queue.
2094 * @rq: Pointer to request to be sent.
2095 * @cookie: Request queue cookie.
2096 *
2097 * If the device has enough resources to accept a new request now, send the
2098 * request directly to device driver. Else, insert at hctx->dispatch queue, so
2099 * we can try send it another time in the future. Requests inserted at this
2100 * queue have higher priority.
2101 */
fd9c40f6
BVA
2102static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
2103 struct request *rq, blk_qc_t *cookie)
2104{
2105 blk_status_t ret;
2106 int srcu_idx;
2107
2108 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
2109
2110 hctx_lock(hctx, &srcu_idx);
2111
2112 ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
2113 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
01e99aec 2114 blk_mq_request_bypass_insert(rq, false, true);
fd9c40f6
BVA
2115 else if (ret != BLK_STS_OK)
2116 blk_mq_end_request(rq, ret);
2117
2118 hctx_unlock(hctx, srcu_idx);
2119}
2120
2121blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
2122{
2123 blk_status_t ret;
2124 int srcu_idx;
2125 blk_qc_t unused_cookie;
2126 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
2127
2128 hctx_lock(hctx, &srcu_idx);
2129 ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
04ced159 2130 hctx_unlock(hctx, srcu_idx);
7f556a44
JW
2131
2132 return ret;
5eb6126e
CH
2133}
2134
6ce3dd6e
ML
2135void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
2136 struct list_head *list)
2137{
536167d4 2138 int queued = 0;
632bfb63 2139 int errors = 0;
536167d4 2140
6ce3dd6e 2141 while (!list_empty(list)) {
fd9c40f6 2142 blk_status_t ret;
6ce3dd6e
ML
2143 struct request *rq = list_first_entry(list, struct request,
2144 queuelist);
2145
2146 list_del_init(&rq->queuelist);
fd9c40f6
BVA
2147 ret = blk_mq_request_issue_directly(rq, list_empty(list));
2148 if (ret != BLK_STS_OK) {
2149 if (ret == BLK_STS_RESOURCE ||
2150 ret == BLK_STS_DEV_RESOURCE) {
01e99aec 2151 blk_mq_request_bypass_insert(rq, false,
c616cbee 2152 list_empty(list));
fd9c40f6
BVA
2153 break;
2154 }
2155 blk_mq_end_request(rq, ret);
632bfb63 2156 errors++;
536167d4
KB
2157 } else
2158 queued++;
6ce3dd6e 2159 }
d666ba98
JA
2160
2161 /*
2162 * If we didn't flush the entire list, we could have told
2163 * the driver there was more coming, but that turned out to
2164 * be a lie.
2165 */
632bfb63 2166 if ((!list_empty(list) || errors) &&
2167 hctx->queue->mq_ops->commit_rqs && queued)
d666ba98 2168 hctx->queue->mq_ops->commit_rqs(hctx);
6ce3dd6e
ML
2169}
2170
static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
{
	list_add_tail(&rq->queuelist, &plug->mq_list);
	plug->rq_count++;
	if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
		struct request *tmp;

		tmp = list_first_entry(&plug->mq_list, struct request,
				       queuelist);
		if (tmp->q != rq->q)
			plug->multiple_queues = true;
	}
}
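
/*
 * Illustrative sketch (not part of blk-mq core): plugging is driven by the
 * submitter. Anything submitting a batch of bios can open a plug so that the
 * requests built below are gathered on plug->mq_list and flushed together:
 *
 *	struct blk_plug plug;
 *
 *	blk_start_plug(&plug);
 *	for (i = 0; i < nr_bios; i++)
 *		submit_bio(bios[i]);
 *	blk_finish_plug(&plug);	// flushes via blk_mq_flush_plug_list()
 */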

/*
 * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
 * queues. This is important for md arrays to benefit from merging
 * requests.
 */
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
	if (plug->multiple_queues)
		return BLK_MAX_REQUEST_COUNT * 2;
	return BLK_MAX_REQUEST_COUNT;
}
2196
/**
 * blk_mq_submit_bio - Create and send a request to block device.
 * @bio: Bio pointer.
 *
 * Builds up a request structure from @q and @bio and sends it to the device.
 * The request may not be queued directly to hardware if:
 * * This request can be merged with another one
 * * We want to place the request on the plug queue for possible future merging
 * * There is an IO scheduler active on this queue
 *
 * It will not queue the request if there is an error with the bio, or at
 * request creation.
 *
 * Returns: Request queue cookie.
 */
c62b37d9 2212blk_qc_t blk_mq_submit_bio(struct bio *bio)
07068d5b 2213{
309dca30 2214 struct request_queue *q = bio->bi_bdev->bd_disk->queue;
ef295ecf 2215 const int is_sync = op_is_sync(bio->bi_opf);
f73f44eb 2216 const int is_flush_fua = op_is_flush(bio->bi_opf);
07068d5b 2217 struct request *rq;
f984df1f 2218 struct blk_plug *plug;
5b3f341f 2219 struct request *same_queue_rq = NULL;
14ccb66b 2220 unsigned int nr_segs;
7b371636 2221 blk_qc_t cookie;
a892c8d5 2222 blk_status_t ret;
cc29e1bf 2223 bool hipri;
07068d5b
JA
2224
2225 blk_queue_bounce(q, &bio);
f695ca38 2226 __blk_queue_split(&bio, &nr_segs);
f36ea50c 2227
e23947bd 2228 if (!bio_integrity_prep(bio))
ac7c5675 2229 goto queue_exit;
07068d5b 2230
87c279e6 2231 if (!is_flush_fua && !blk_queue_nomerges(q) &&
14ccb66b 2232 blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
ac7c5675 2233 goto queue_exit;
f984df1f 2234
14ccb66b 2235 if (blk_mq_sched_bio_merge(q, bio, nr_segs))
ac7c5675 2236 goto queue_exit;
bd166ef1 2237
d5337560 2238 rq_qos_throttle(q, bio);
87760e5e 2239
cc29e1bf
JX
2240 hipri = bio->bi_opf & REQ_HIPRI;
2241
47c122e3
JA
2242 plug = blk_mq_plug(q, bio);
2243 if (plug && plug->cached_rq) {
2244 rq = plug->cached_rq;
2245 plug->cached_rq = rq->rq_next;
2246 INIT_LIST_HEAD(&rq->queuelist);
47c122e3 2247 } else {
0f38d766
CH
2248 struct blk_mq_alloc_data data = {
2249 .q = q,
2250 .nr_tags = 1,
2251 .cmd_flags = bio->bi_opf,
2252 };
2253
47c122e3
JA
2254 if (plug) {
2255 data.nr_tags = plug->nr_ios;
2256 plug->nr_ios = 1;
2257 data.cached_rq = &plug->cached_rq;
2258 }
b90cfaed 2259 rq = __blk_mq_alloc_requests(&data);
47c122e3
JA
2260 if (unlikely(!rq)) {
2261 rq_qos_cleanup(q, bio);
2262 if (bio->bi_opf & REQ_NOWAIT)
2263 bio_wouldblock_error(bio);
2264 goto queue_exit;
2265 }
87760e5e
JA
2266 }
2267
e8a676d6 2268 trace_block_getrq(bio);
d6f1dda2 2269
c1c80384 2270 rq_qos_track(q, rq, bio);
07068d5b 2271
0f38d766 2272 cookie = request_to_qc_t(rq->mq_hctx, rq);
07068d5b 2273
970d168d
BVA
2274 blk_mq_bio_to_request(rq, bio, nr_segs);
2275
a892c8d5
ST
2276 ret = blk_crypto_init_request(rq);
2277 if (ret != BLK_STS_OK) {
2278 bio->bi_status = ret;
2279 bio_endio(bio);
2280 blk_mq_free_request(rq);
2281 return BLK_QC_T_NONE;
2282 }
2283
07068d5b 2284 if (unlikely(is_flush_fua)) {
4a60f360 2285 struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
105663f7 2286 /* Bypass scheduler for flush requests */
923218f6 2287 blk_insert_flush(rq);
4a60f360 2288 blk_mq_run_hw_queue(hctx, true);
03f26d8f 2289 } else if (plug && (q->nr_hw_queues == 1 ||
079a2e3e 2290 blk_mq_is_shared_tags(rq->mq_hctx->flags) ||
03f26d8f 2291 q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) {
b2c5d16b
JA
2292 /*
2293 * Use plugging if we have a ->commit_rqs() hook as well, as
2294 * we know the driver uses bd->last in a smart fashion.
3154df26
ML
2295 *
2296 * Use normal plugging if this disk is slow HDD, as sequential
2297 * IO may benefit a lot from plug merging.
b2c5d16b 2298 */
5f0ed774 2299 unsigned int request_count = plug->rq_count;
600271d9
SL
2300 struct request *last = NULL;
2301
676d0607 2302 if (!request_count)
e6c4438b 2303 trace_block_plug(q);
600271d9
SL
2304 else
2305 last = list_entry_rq(plug->mq_list.prev);
b094f89c 2306
7f2a6a69 2307 if (request_count >= blk_plug_max_rq_count(plug) || (last &&
600271d9 2308 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
e6c4438b
JM
2309 blk_flush_plug_list(plug, false);
2310 trace_block_plug(q);
320ae51f 2311 }
b094f89c 2312
ce5b009c 2313 blk_add_rq_to_plug(plug, rq);
a12de1d4 2314 } else if (q->elevator) {
105663f7 2315 /* Insert the request at the IO scheduler queue */
a12de1d4 2316 blk_mq_sched_insert_request(rq, false, true, true);
2299722c 2317 } else if (plug && !blk_queue_nomerges(q)) {
07068d5b 2318 /*
6a83e74d 2319 * We do limited plugging. If the bio can be merged, do that.
f984df1f
SL
2320 * Otherwise the existing request in the plug list will be
2321 * issued. So the plug list will have one request at most
2299722c
CH
2322 * The plug list might get flushed before this. If that happens,
2323 * the plug list is empty, and same_queue_rq is invalid.
07068d5b 2324 */
2299722c
CH
2325 if (list_empty(&plug->mq_list))
2326 same_queue_rq = NULL;
4711b573 2327 if (same_queue_rq) {
2299722c 2328 list_del_init(&same_queue_rq->queuelist);
4711b573
JA
2329 plug->rq_count--;
2330 }
ce5b009c 2331 blk_add_rq_to_plug(plug, rq);
ff3b74b8 2332 trace_block_plug(q);
2299722c 2333
dad7a3be 2334 if (same_queue_rq) {
ff3b74b8 2335 trace_block_unplug(q, 1, true);
0f38d766
CH
2336 blk_mq_try_issue_directly(same_queue_rq->mq_hctx,
2337 same_queue_rq, &cookie);
dad7a3be 2338 }
a12de1d4 2339 } else if ((q->nr_hw_queues > 1 && is_sync) ||
0f38d766 2340 !rq->mq_hctx->dispatch_busy) {
105663f7
AA
2341 /*
2342 * There is no scheduler and we can try to send directly
2343 * to the hardware.
2344 */
0f38d766 2345 blk_mq_try_issue_directly(rq->mq_hctx, rq, &cookie);
ab42f35d 2346 } else {
105663f7 2347 /* Default case. */
8fa9f556 2348 blk_mq_sched_insert_request(rq, false, true, true);
ab42f35d 2349 }
320ae51f 2350
cc29e1bf
JX
2351 if (!hipri)
2352 return BLK_QC_T_NONE;
7b371636 2353 return cookie;
ac7c5675
CH
2354queue_exit:
2355 blk_queue_exit(q);
2356 return BLK_QC_T_NONE;
320ae51f
JA
2357}
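
/*
 * Illustrative sketch (not part of blk-mq core), assuming the cookie-based
 * polling interface of this era of the code: the cookie returned above is
 * what a direct-I/O style caller feeds back into blk_poll() when the bio was
 * submitted with REQ_HIPRI and the caller wants to poll for completion
 * instead of sleeping on an interrupt. "done" is a hypothetical completion
 * flag set by the caller's bio end_io handler.
 *
 *	blk_qc_t cookie = submit_bio(bio);
 *
 *	while (!done)
 *		blk_poll(bdev_get_queue(bio->bi_bdev), cookie, true);
 */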
2358
bd63141d
ML
2359static size_t order_to_size(unsigned int order)
2360{
2361 return (size_t)PAGE_SIZE << order;
2362}
2363
2364/* called before freeing request pool in @tags */
f32e4eaf
JG
2365static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
2366 struct blk_mq_tags *tags)
bd63141d 2367{
bd63141d
ML
2368 struct page *page;
2369 unsigned long flags;
2370
4f245d5b
JG
2371 /* There is no need to clear a driver tags own mapping */
2372 if (drv_tags == tags)
2373 return;
2374
bd63141d
ML
2375 list_for_each_entry(page, &tags->page_list, lru) {
2376 unsigned long start = (unsigned long)page_address(page);
2377 unsigned long end = start + order_to_size(page->private);
2378 int i;
2379
f32e4eaf 2380 for (i = 0; i < drv_tags->nr_tags; i++) {
bd63141d
ML
2381 struct request *rq = drv_tags->rqs[i];
2382 unsigned long rq_addr = (unsigned long)rq;
2383
2384 if (rq_addr >= start && rq_addr < end) {
2385 WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
2386 cmpxchg(&drv_tags->rqs[i], rq, NULL);
2387 }
2388 }
2389 }
2390
2391 /*
2392 * Wait until all pending iteration is done.
2393 *
2394 * Request reference is cleared and it is guaranteed to be observed
2395 * after the ->lock is released.
2396 */
2397 spin_lock_irqsave(&drv_tags->lock, flags);
2398 spin_unlock_irqrestore(&drv_tags->lock, flags);
2399}
2400
cc71a6f4
JA
2401void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
2402 unsigned int hctx_idx)
95363efd 2403{
f32e4eaf 2404 struct blk_mq_tags *drv_tags;
e9b267d9 2405 struct page *page;
320ae51f 2406
079a2e3e
JG
2407 if (blk_mq_is_shared_tags(set->flags))
2408 drv_tags = set->shared_tags;
e155b0c2
JG
2409 else
2410 drv_tags = set->tags[hctx_idx];
f32e4eaf 2411
65de57bb 2412 if (tags->static_rqs && set->ops->exit_request) {
e9b267d9 2413 int i;
320ae51f 2414
24d2f903 2415 for (i = 0; i < tags->nr_tags; i++) {
2af8cbe3
JA
2416 struct request *rq = tags->static_rqs[i];
2417
2418 if (!rq)
e9b267d9 2419 continue;
d6296d39 2420 set->ops->exit_request(set, rq, hctx_idx);
2af8cbe3 2421 tags->static_rqs[i] = NULL;
e9b267d9 2422 }
320ae51f 2423 }
320ae51f 2424
f32e4eaf 2425 blk_mq_clear_rq_mapping(drv_tags, tags);
bd63141d 2426
24d2f903
CH
2427 while (!list_empty(&tags->page_list)) {
2428 page = list_first_entry(&tags->page_list, struct page, lru);
6753471c 2429 list_del_init(&page->lru);
f75782e4
CM
2430 /*
2431 * Remove kmemleak object previously allocated in
273938bf 2432 * blk_mq_alloc_rqs().
f75782e4
CM
2433 */
2434 kmemleak_free(page_address(page));
320ae51f
JA
2435 __free_pages(page, page->private);
2436 }
cc71a6f4 2437}
320ae51f 2438
e155b0c2 2439void blk_mq_free_rq_map(struct blk_mq_tags *tags)
cc71a6f4 2440{
24d2f903 2441 kfree(tags->rqs);
cc71a6f4 2442 tags->rqs = NULL;
2af8cbe3
JA
2443 kfree(tags->static_rqs);
2444 tags->static_rqs = NULL;
320ae51f 2445
e155b0c2 2446 blk_mq_free_tags(tags);
320ae51f
JA
2447}
2448
63064be1
JG
2449static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
2450 unsigned int hctx_idx,
2451 unsigned int nr_tags,
e155b0c2 2452 unsigned int reserved_tags)
320ae51f 2453{
24d2f903 2454 struct blk_mq_tags *tags;
59f082e4 2455 int node;
320ae51f 2456
7d76f856 2457 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
59f082e4
SL
2458 if (node == NUMA_NO_NODE)
2459 node = set->numa_node;
2460
e155b0c2
JG
2461 tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
2462 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
24d2f903
CH
2463 if (!tags)
2464 return NULL;
320ae51f 2465
590b5b7d 2466 tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
36e1f3d1 2467 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
59f082e4 2468 node);
24d2f903 2469 if (!tags->rqs) {
e155b0c2 2470 blk_mq_free_tags(tags);
24d2f903
CH
2471 return NULL;
2472 }
320ae51f 2473
590b5b7d
KC
2474 tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
2475 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
2476 node);
2af8cbe3
JA
2477 if (!tags->static_rqs) {
2478 kfree(tags->rqs);
e155b0c2 2479 blk_mq_free_tags(tags);
2af8cbe3
JA
2480 return NULL;
2481 }
2482
cc71a6f4
JA
2483 return tags;
2484}
2485
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
			       unsigned int hctx_idx, int node)
{
	int ret;

	if (set->ops->init_request) {
		ret = set->ops->init_request(set, rq, hctx_idx, node);
		if (ret)
			return ret;
	}

	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
	return 0;
}
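
/*
 * Illustrative sketch (not part of blk-mq core): ->init_request() runs once
 * per request when the tag map is populated, so drivers use it to set up the
 * per-command payload that lives behind the request (set->cmd_size bytes).
 * mydrv_* names are hypothetical.
 *
 *	static int mydrv_init_request(struct blk_mq_tag_set *set,
 *				      struct request *rq,
 *				      unsigned int hctx_idx,
 *				      unsigned int numa_node)
 *	{
 *		struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *
 *		cmd->sg = mydrv_alloc_sg_table(set->driver_data, numa_node);
 *		return cmd->sg ? 0 : -ENOMEM;
 *	}
 */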
2500
63064be1
JG
2501static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
2502 struct blk_mq_tags *tags,
2503 unsigned int hctx_idx, unsigned int depth)
cc71a6f4
JA
2504{
2505 unsigned int i, j, entries_per_page, max_order = 4;
2506 size_t rq_size, left;
59f082e4
SL
2507 int node;
2508
7d76f856 2509 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
59f082e4
SL
2510 if (node == NUMA_NO_NODE)
2511 node = set->numa_node;
cc71a6f4
JA
2512
2513 INIT_LIST_HEAD(&tags->page_list);
2514
320ae51f
JA
2515 /*
2516 * rq_size is the size of the request plus driver payload, rounded
2517 * to the cacheline size
2518 */
24d2f903 2519 rq_size = round_up(sizeof(struct request) + set->cmd_size,
320ae51f 2520 cache_line_size());
cc71a6f4 2521 left = rq_size * depth;
320ae51f 2522
cc71a6f4 2523 for (i = 0; i < depth; ) {
320ae51f
JA
2524 int this_order = max_order;
2525 struct page *page;
2526 int to_do;
2527 void *p;
2528
b3a834b1 2529 while (this_order && left < order_to_size(this_order - 1))
320ae51f
JA
2530 this_order--;
2531
2532 do {
59f082e4 2533 page = alloc_pages_node(node,
36e1f3d1 2534 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
a5164405 2535 this_order);
320ae51f
JA
2536 if (page)
2537 break;
2538 if (!this_order--)
2539 break;
2540 if (order_to_size(this_order) < rq_size)
2541 break;
2542 } while (1);
2543
2544 if (!page)
24d2f903 2545 goto fail;
320ae51f
JA
2546
2547 page->private = this_order;
24d2f903 2548 list_add_tail(&page->lru, &tags->page_list);
320ae51f
JA
2549
2550 p = page_address(page);
f75782e4
CM
2551 /*
2552 * Allow kmemleak to scan these pages as they contain pointers
2553 * to additional allocations like via ops->init_request().
2554 */
36e1f3d1 2555 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
320ae51f 2556 entries_per_page = order_to_size(this_order) / rq_size;
cc71a6f4 2557 to_do = min(entries_per_page, depth - i);
320ae51f
JA
2558 left -= to_do * rq_size;
2559 for (j = 0; j < to_do; j++) {
2af8cbe3
JA
2560 struct request *rq = p;
2561
2562 tags->static_rqs[i] = rq;
1d9bd516
TH
2563 if (blk_mq_init_request(set, rq, hctx_idx, node)) {
2564 tags->static_rqs[i] = NULL;
2565 goto fail;
e9b267d9
CH
2566 }
2567
320ae51f
JA
2568 p += rq_size;
2569 i++;
2570 }
2571 }
cc71a6f4 2572 return 0;
320ae51f 2573
24d2f903 2574fail:
cc71a6f4
JA
2575 blk_mq_free_rqs(set, tags, hctx_idx);
2576 return -ENOMEM;
320ae51f
JA
2577}
2578
bf0beec0
ML
2579struct rq_iter_data {
2580 struct blk_mq_hw_ctx *hctx;
2581 bool has_rq;
2582};
2583
2584static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
2585{
2586 struct rq_iter_data *iter_data = data;
2587
2588 if (rq->mq_hctx != iter_data->hctx)
2589 return true;
2590 iter_data->has_rq = true;
2591 return false;
2592}
2593
2594static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
2595{
2596 struct blk_mq_tags *tags = hctx->sched_tags ?
2597 hctx->sched_tags : hctx->tags;
2598 struct rq_iter_data data = {
2599 .hctx = hctx,
2600 };
2601
2602 blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
2603 return data.has_rq;
2604}
2605
2606static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
2607 struct blk_mq_hw_ctx *hctx)
2608{
2609 if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
2610 return false;
2611 if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
2612 return false;
2613 return true;
2614}
2615
2616static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
2617{
2618 struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2619 struct blk_mq_hw_ctx, cpuhp_online);
2620
2621 if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
2622 !blk_mq_last_cpu_in_hctx(cpu, hctx))
2623 return 0;
2624
2625 /*
2626 * Prevent new request from being allocated on the current hctx.
2627 *
2628 * The smp_mb__after_atomic() Pairs with the implied barrier in
2629 * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is
2630 * seen once we return from the tag allocator.
2631 */
2632 set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2633 smp_mb__after_atomic();
2634
2635 /*
2636 * Try to grab a reference to the queue and wait for any outstanding
2637 * requests. If we could not grab a reference the queue has been
2638 * frozen and there are no requests.
2639 */
2640 if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
2641 while (blk_mq_hctx_has_requests(hctx))
2642 msleep(5);
2643 percpu_ref_put(&hctx->queue->q_usage_counter);
2644 }
2645
2646 return 0;
2647}
2648
2649static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
2650{
2651 struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
2652 struct blk_mq_hw_ctx, cpuhp_online);
2653
2654 if (cpumask_test_cpu(cpu, hctx->cpumask))
2655 clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
2656 return 0;
2657}
2658
e57690fe
JA
2659/*
2660 * 'cpu' is going away. splice any existing rq_list entries from this
2661 * software queue to the hw queue dispatch list, and ensure that it
2662 * gets run.
2663 */
9467f859 2664static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
484b4061 2665{
9467f859 2666 struct blk_mq_hw_ctx *hctx;
484b4061
JA
2667 struct blk_mq_ctx *ctx;
2668 LIST_HEAD(tmp);
c16d6b5a 2669 enum hctx_type type;
484b4061 2670
9467f859 2671 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
bf0beec0
ML
2672 if (!cpumask_test_cpu(cpu, hctx->cpumask))
2673 return 0;
2674
e57690fe 2675 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
c16d6b5a 2676 type = hctx->type;
484b4061
JA
2677
2678 spin_lock(&ctx->lock);
c16d6b5a
ML
2679 if (!list_empty(&ctx->rq_lists[type])) {
2680 list_splice_init(&ctx->rq_lists[type], &tmp);
484b4061
JA
2681 blk_mq_hctx_clear_pending(hctx, ctx);
2682 }
2683 spin_unlock(&ctx->lock);
2684
2685 if (list_empty(&tmp))
9467f859 2686 return 0;
484b4061 2687
e57690fe
JA
2688 spin_lock(&hctx->lock);
2689 list_splice_tail_init(&tmp, &hctx->dispatch);
2690 spin_unlock(&hctx->lock);
484b4061
JA
2691
2692 blk_mq_run_hw_queue(hctx, true);
9467f859 2693 return 0;
484b4061
JA
2694}
2695
9467f859 2696static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
484b4061 2697{
bf0beec0
ML
2698 if (!(hctx->flags & BLK_MQ_F_STACKING))
2699 cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2700 &hctx->cpuhp_online);
9467f859
TG
2701 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
2702 &hctx->cpuhp_dead);
484b4061
JA
2703}
2704
364b6181
ML
2705/*
2706 * Before freeing hw queue, clearing the flush request reference in
2707 * tags->rqs[] for avoiding potential UAF.
2708 */
2709static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
2710 unsigned int queue_depth, struct request *flush_rq)
2711{
2712 int i;
2713 unsigned long flags;
2714
2715 /* The hw queue may not be mapped yet */
2716 if (!tags)
2717 return;
2718
2719 WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
2720
2721 for (i = 0; i < queue_depth; i++)
2722 cmpxchg(&tags->rqs[i], flush_rq, NULL);
2723
2724 /*
2725 * Wait until all pending iteration is done.
2726 *
2727 * Request reference is cleared and it is guaranteed to be observed
2728 * after the ->lock is released.
2729 */
2730 spin_lock_irqsave(&tags->lock, flags);
2731 spin_unlock_irqrestore(&tags->lock, flags);
2732}
2733
c3b4afca 2734/* hctx->ctxs will be freed in queue's release handler */
08e98fc6
ML
2735static void blk_mq_exit_hctx(struct request_queue *q,
2736 struct blk_mq_tag_set *set,
2737 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
2738{
364b6181
ML
2739 struct request *flush_rq = hctx->fq->flush_rq;
2740
8ab0b7dc
ML
2741 if (blk_mq_hw_queue_mapped(hctx))
2742 blk_mq_tag_idle(hctx);
08e98fc6 2743
364b6181
ML
2744 blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
2745 set->queue_depth, flush_rq);
f70ced09 2746 if (set->ops->exit_request)
364b6181 2747 set->ops->exit_request(set, flush_rq, hctx_idx);
f70ced09 2748
08e98fc6
ML
2749 if (set->ops->exit_hctx)
2750 set->ops->exit_hctx(hctx, hctx_idx);
2751
9467f859 2752 blk_mq_remove_cpuhp(hctx);
2f8f1336
ML
2753
2754 spin_lock(&q->unused_hctx_lock);
2755 list_add(&hctx->hctx_list, &q->unused_hctx_list);
2756 spin_unlock(&q->unused_hctx_lock);
08e98fc6
ML
2757}
2758
624dbe47
ML
2759static void blk_mq_exit_hw_queues(struct request_queue *q,
2760 struct blk_mq_tag_set *set, int nr_queue)
2761{
2762 struct blk_mq_hw_ctx *hctx;
2763 unsigned int i;
2764
2765 queue_for_each_hw_ctx(q, hctx, i) {
2766 if (i == nr_queue)
2767 break;
477e19de 2768 blk_mq_debugfs_unregister_hctx(hctx);
08e98fc6 2769 blk_mq_exit_hctx(q, set, hctx, i);
624dbe47 2770 }
624dbe47
ML
2771}
2772
7c6c5b7c
ML
2773static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2774{
2775 int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2776
2777 BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
2778 __alignof__(struct blk_mq_hw_ctx)) !=
2779 sizeof(struct blk_mq_hw_ctx));
2780
2781 if (tag_set->flags & BLK_MQ_F_BLOCKING)
2782 hw_ctx_size += sizeof(struct srcu_struct);
2783
2784 return hw_ctx_size;
2785}
2786
08e98fc6
ML
2787static int blk_mq_init_hctx(struct request_queue *q,
2788 struct blk_mq_tag_set *set,
2789 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
320ae51f 2790{
7c6c5b7c
ML
2791 hctx->queue_num = hctx_idx;
2792
bf0beec0
ML
2793 if (!(hctx->flags & BLK_MQ_F_STACKING))
2794 cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
2795 &hctx->cpuhp_online);
7c6c5b7c
ML
2796 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2797
2798 hctx->tags = set->tags[hctx_idx];
2799
2800 if (set->ops->init_hctx &&
2801 set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2802 goto unregister_cpu_notifier;
08e98fc6 2803
7c6c5b7c
ML
2804 if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
2805 hctx->numa_node))
2806 goto exit_hctx;
2807 return 0;
2808
2809 exit_hctx:
2810 if (set->ops->exit_hctx)
2811 set->ops->exit_hctx(hctx, hctx_idx);
2812 unregister_cpu_notifier:
2813 blk_mq_remove_cpuhp(hctx);
2814 return -1;
2815}
2816
2817static struct blk_mq_hw_ctx *
2818blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
2819 int node)
2820{
2821 struct blk_mq_hw_ctx *hctx;
2822 gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
2823
2824 hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
2825 if (!hctx)
2826 goto fail_alloc_hctx;
2827
2828 if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
2829 goto free_hctx;
2830
2831 atomic_set(&hctx->nr_active, 0);
08e98fc6 2832 if (node == NUMA_NO_NODE)
7c6c5b7c
ML
2833 node = set->numa_node;
2834 hctx->numa_node = node;
08e98fc6 2835
9f993737 2836 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
08e98fc6
ML
2837 spin_lock_init(&hctx->lock);
2838 INIT_LIST_HEAD(&hctx->dispatch);
2839 hctx->queue = q;
51db1c37 2840 hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
08e98fc6 2841
2f8f1336
ML
2842 INIT_LIST_HEAD(&hctx->hctx_list);
2843
320ae51f 2844 /*
08e98fc6
ML
2845 * Allocate space for all possible cpus to avoid allocation at
2846 * runtime
320ae51f 2847 */
d904bfa7 2848 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
7c6c5b7c 2849 gfp, node);
08e98fc6 2850 if (!hctx->ctxs)
7c6c5b7c 2851 goto free_cpumask;
320ae51f 2852
5b202853 2853 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
c548e62b 2854 gfp, node, false, false))
08e98fc6 2855 goto free_ctxs;
08e98fc6 2856 hctx->nr_ctx = 0;
320ae51f 2857
5815839b 2858 spin_lock_init(&hctx->dispatch_wait_lock);
eb619fdb
JA
2859 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
2860 INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
2861
754a1572 2862 hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
f70ced09 2863 if (!hctx->fq)
7c6c5b7c 2864 goto free_bitmap;
320ae51f 2865
6a83e74d 2866 if (hctx->flags & BLK_MQ_F_BLOCKING)
05707b64 2867 init_srcu_struct(hctx->srcu);
7c6c5b7c 2868 blk_mq_hctx_kobj_init(hctx);
6a83e74d 2869
7c6c5b7c 2870 return hctx;
320ae51f 2871
08e98fc6 2872 free_bitmap:
88459642 2873 sbitmap_free(&hctx->ctx_map);
08e98fc6
ML
2874 free_ctxs:
2875 kfree(hctx->ctxs);
7c6c5b7c
ML
2876 free_cpumask:
2877 free_cpumask_var(hctx->cpumask);
2878 free_hctx:
2879 kfree(hctx);
2880 fail_alloc_hctx:
2881 return NULL;
08e98fc6 2882}
320ae51f 2883
320ae51f
JA
2884static void blk_mq_init_cpu_queues(struct request_queue *q,
2885 unsigned int nr_hw_queues)
2886{
b3c661b1
JA
2887 struct blk_mq_tag_set *set = q->tag_set;
2888 unsigned int i, j;
320ae51f
JA
2889
2890 for_each_possible_cpu(i) {
2891 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
2892 struct blk_mq_hw_ctx *hctx;
c16d6b5a 2893 int k;
320ae51f 2894
320ae51f
JA
2895 __ctx->cpu = i;
2896 spin_lock_init(&__ctx->lock);
c16d6b5a
ML
2897 for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
2898 INIT_LIST_HEAD(&__ctx->rq_lists[k]);
2899
320ae51f
JA
2900 __ctx->queue = q;
2901
320ae51f
JA
2902 /*
2903 * Set local node, IFF we have more than one hw queue. If
2904 * not, we remain on the home node of the device
2905 */
b3c661b1
JA
2906 for (j = 0; j < set->nr_maps; j++) {
2907 hctx = blk_mq_map_queue_type(q, j, i);
2908 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
576e85c5 2909 hctx->numa_node = cpu_to_node(i);
b3c661b1 2910 }
320ae51f
JA
2911 }
2912}
2913
63064be1
JG
2914struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
2915 unsigned int hctx_idx,
2916 unsigned int depth)
cc71a6f4 2917{
63064be1
JG
2918 struct blk_mq_tags *tags;
2919 int ret;
cc71a6f4 2920
e155b0c2 2921 tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
63064be1
JG
2922 if (!tags)
2923 return NULL;
cc71a6f4 2924
63064be1
JG
2925 ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
2926 if (ret) {
e155b0c2 2927 blk_mq_free_rq_map(tags);
63064be1
JG
2928 return NULL;
2929 }
cc71a6f4 2930
63064be1
JG
2931 return tags;
2932}
2933
2934static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
2935 int hctx_idx)
2936{
079a2e3e
JG
2937 if (blk_mq_is_shared_tags(set->flags)) {
2938 set->tags[hctx_idx] = set->shared_tags;
e155b0c2
JG
2939
2940 return true;
2941 }
2942
63064be1
JG
2943 set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
2944 set->queue_depth);
2945
2946 return set->tags[hctx_idx];
cc71a6f4
JA
2947}
2948
645db34e
JG
2949void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
2950 struct blk_mq_tags *tags,
2951 unsigned int hctx_idx)
cc71a6f4 2952{
645db34e
JG
2953 if (tags) {
2954 blk_mq_free_rqs(set, tags, hctx_idx);
e155b0c2 2955 blk_mq_free_rq_map(tags);
bd166ef1 2956 }
cc71a6f4
JA
2957}
2958
e155b0c2
JG
2959static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
2960 unsigned int hctx_idx)
2961{
079a2e3e 2962 if (!blk_mq_is_shared_tags(set->flags))
e155b0c2
JG
2963 blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);
2964
2965 set->tags[hctx_idx] = NULL;
2966}
2967
4b855ad3 2968static void blk_mq_map_swqueue(struct request_queue *q)
320ae51f 2969{
b3c661b1 2970 unsigned int i, j, hctx_idx;
320ae51f
JA
2971 struct blk_mq_hw_ctx *hctx;
2972 struct blk_mq_ctx *ctx;
2a34c087 2973 struct blk_mq_tag_set *set = q->tag_set;
320ae51f
JA
2974
2975 queue_for_each_hw_ctx(q, hctx, i) {
e4043dcf 2976 cpumask_clear(hctx->cpumask);
320ae51f 2977 hctx->nr_ctx = 0;
d416c92c 2978 hctx->dispatch_from = NULL;
320ae51f
JA
2979 }
2980
2981 /*
4b855ad3 2982 * Map software to hardware queues.
4412efec
ML
2983 *
2984 * If the cpu isn't present, the cpu is mapped to first hctx.
320ae51f 2985 */
20e4d813 2986 for_each_possible_cpu(i) {
4412efec 2987
897bb0c7 2988 ctx = per_cpu_ptr(q->queue_ctx, i);
b3c661b1 2989 for (j = 0; j < set->nr_maps; j++) {
bb94aea1
JW
2990 if (!set->map[j].nr_queues) {
2991 ctx->hctxs[j] = blk_mq_map_queue_type(q,
2992 HCTX_TYPE_DEFAULT, i);
e5edd5f2 2993 continue;
bb94aea1 2994 }
fd689871
ML
2995 hctx_idx = set->map[j].mq_map[i];
2996 /* unmapped hw queue can be remapped after CPU topo changed */
2997 if (!set->tags[hctx_idx] &&
63064be1 2998 !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
fd689871
ML
2999 /*
3000 * If tags initialization fail for some hctx,
3001 * that hctx won't be brought online. In this
3002 * case, remap the current ctx to hctx[0] which
3003 * is guaranteed to always have tags allocated
3004 */
3005 set->map[j].mq_map[i] = 0;
3006 }
e5edd5f2 3007
b3c661b1 3008 hctx = blk_mq_map_queue_type(q, j, i);
8ccdf4a3 3009 ctx->hctxs[j] = hctx;
b3c661b1
JA
3010 /*
3011 * If the CPU is already set in the mask, then we've
3012 * mapped this one already. This can happen if
3013 * devices share queues across queue maps.
3014 */
3015 if (cpumask_test_cpu(i, hctx->cpumask))
3016 continue;
3017
3018 cpumask_set_cpu(i, hctx->cpumask);
3019 hctx->type = j;
3020 ctx->index_hw[hctx->type] = hctx->nr_ctx;
3021 hctx->ctxs[hctx->nr_ctx++] = ctx;
3022
3023 /*
3024 * If the nr_ctx type overflows, we have exceeded the
3025 * amount of sw queues we can support.
3026 */
3027 BUG_ON(!hctx->nr_ctx);
3028 }
bb94aea1
JW
3029
3030 for (; j < HCTX_MAX_TYPES; j++)
3031 ctx->hctxs[j] = blk_mq_map_queue_type(q,
3032 HCTX_TYPE_DEFAULT, i);
320ae51f 3033 }
506e931f
JA
3034
3035 queue_for_each_hw_ctx(q, hctx, i) {
4412efec
ML
3036 /*
3037 * If no software queues are mapped to this hardware queue,
3038 * disable it and free the request entries.
3039 */
3040 if (!hctx->nr_ctx) {
3041 /* Never unmap queue 0. We need it as a
3042 * fallback in case of a new remap fails
3043 * allocation
3044 */
e155b0c2
JG
3045 if (i)
3046 __blk_mq_free_map_and_rqs(set, i);
4412efec
ML
3047
3048 hctx->tags = NULL;
3049 continue;
3050 }
484b4061 3051
2a34c087
ML
3052 hctx->tags = set->tags[i];
3053 WARN_ON(!hctx->tags);
3054
889fa31f
CY
3055 /*
3056 * Set the map size to the number of mapped software queues.
3057 * This is more accurate and more efficient than looping
3058 * over all possibly mapped software queues.
3059 */
88459642 3060 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
889fa31f 3061
484b4061
JA
3062 /*
3063 * Initialize batch roundrobin counts
3064 */
f82ddf19 3065 hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
506e931f
JA
3066 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
3067 }
320ae51f
JA
3068}
3069
8e8320c9
JA
3070/*
3071 * Caller needs to ensure that we're either frozen/quiesced, or that
3072 * the queue isn't live yet.
3073 */
2404e607 3074static void queue_set_hctx_shared(struct request_queue *q, bool shared)
0d2602ca
JA
3075{
3076 struct blk_mq_hw_ctx *hctx;
0d2602ca
JA
3077 int i;
3078
2404e607 3079 queue_for_each_hw_ctx(q, hctx, i) {
454bb677 3080 if (shared) {
51db1c37 3081 hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
454bb677
YK
3082 } else {
3083 blk_mq_tag_idle(hctx);
51db1c37 3084 hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
454bb677 3085 }
2404e607
JM
3086 }
3087}
3088
655ac300
HR
3089static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
3090 bool shared)
2404e607
JM
3091{
3092 struct request_queue *q;
0d2602ca 3093
705cda97
BVA
3094 lockdep_assert_held(&set->tag_list_lock);
3095
0d2602ca
JA
3096 list_for_each_entry(q, &set->tag_list, tag_set_list) {
3097 blk_mq_freeze_queue(q);
2404e607 3098 queue_set_hctx_shared(q, shared);
0d2602ca
JA
3099 blk_mq_unfreeze_queue(q);
3100 }
3101}
3102
3103static void blk_mq_del_queue_tag_set(struct request_queue *q)
3104{
3105 struct blk_mq_tag_set *set = q->tag_set;
3106
0d2602ca 3107 mutex_lock(&set->tag_list_lock);
08c875cb 3108 list_del(&q->tag_set_list);
2404e607
JM
3109 if (list_is_singular(&set->tag_list)) {
3110 /* just transitioned to unshared */
51db1c37 3111 set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
2404e607 3112 /* update existing queue */
655ac300 3113 blk_mq_update_tag_set_shared(set, false);
2404e607 3114 }
0d2602ca 3115 mutex_unlock(&set->tag_list_lock);
a347c7ad 3116 INIT_LIST_HEAD(&q->tag_set_list);
0d2602ca
JA
3117}
3118
3119static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
3120 struct request_queue *q)
3121{
0d2602ca 3122 mutex_lock(&set->tag_list_lock);
2404e607 3123
ff821d27
JA
3124 /*
3125 * Check to see if we're transitioning to shared (from 1 to 2 queues).
3126 */
3127 if (!list_empty(&set->tag_list) &&
51db1c37
ML
3128 !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
3129 set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
2404e607 3130 /* update existing queue */
655ac300 3131 blk_mq_update_tag_set_shared(set, true);
2404e607 3132 }
51db1c37 3133 if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
2404e607 3134 queue_set_hctx_shared(q, true);
08c875cb 3135 list_add_tail(&q->tag_set_list, &set->tag_list);
2404e607 3136
0d2602ca
JA
3137 mutex_unlock(&set->tag_list_lock);
3138}
3139
1db4909e
ML
3140/* All allocations will be freed in release handler of q->mq_kobj */
3141static int blk_mq_alloc_ctxs(struct request_queue *q)
3142{
3143 struct blk_mq_ctxs *ctxs;
3144 int cpu;
3145
3146 ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
3147 if (!ctxs)
3148 return -ENOMEM;
3149
3150 ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
3151 if (!ctxs->queue_ctx)
3152 goto fail;
3153
3154 for_each_possible_cpu(cpu) {
3155 struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
3156 ctx->ctxs = ctxs;
3157 }
3158
3159 q->mq_kobj = &ctxs->kobj;
3160 q->queue_ctx = ctxs->queue_ctx;
3161
3162 return 0;
3163 fail:
3164 kfree(ctxs);
3165 return -ENOMEM;
3166}
3167
e09aae7e
ML
3168/*
3169 * It is the actual release handler for mq, but we do it from
3170 * request queue's release handler for avoiding use-after-free
3171 * and headache because q->mq_kobj shouldn't have been introduced,
3172 * but we can't group ctx/kctx kobj without it.
3173 */
3174void blk_mq_release(struct request_queue *q)
3175{
2f8f1336
ML
3176 struct blk_mq_hw_ctx *hctx, *next;
3177 int i;
e09aae7e 3178
2f8f1336
ML
3179 queue_for_each_hw_ctx(q, hctx, i)
3180 WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
3181
3182 /* all hctx are in .unused_hctx_list now */
3183 list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
3184 list_del_init(&hctx->hctx_list);
6c8b232e 3185 kobject_put(&hctx->kobj);
c3b4afca 3186 }
e09aae7e
ML
3187
3188 kfree(q->queue_hw_ctx);
3189
7ea5fe31
ML
3190 /*
3191 * release .mq_kobj and sw queue's kobject now because
3192 * both share lifetime with request queue.
3193 */
3194 blk_mq_sysfs_deinit(q);
e09aae7e
ML
3195}
3196
5ec780a6 3197static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
2f227bb9 3198 void *queuedata)
b62c21b7 3199{
26a9750a
CH
3200 struct request_queue *q;
3201 int ret;
b62c21b7 3202
26a9750a
CH
3203 q = blk_alloc_queue(set->numa_node);
3204 if (!q)
b62c21b7 3205 return ERR_PTR(-ENOMEM);
26a9750a
CH
3206 q->queuedata = queuedata;
3207 ret = blk_mq_init_allocated_queue(set, q);
3208 if (ret) {
3209 blk_cleanup_queue(q);
3210 return ERR_PTR(ret);
3211 }
b62c21b7
MS
3212 return q;
3213}
2f227bb9
CH
3214
3215struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
3216{
3217 return blk_mq_init_queue_data(set, NULL);
3218}
b62c21b7
MS
3219EXPORT_SYMBOL(blk_mq_init_queue);
3220
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
		struct lock_class_key *lkclass)
{
	struct request_queue *q;
	struct gendisk *disk;

	q = blk_mq_init_queue_data(set, queuedata);
	if (IS_ERR(q))
		return ERR_CAST(q);

	disk = __alloc_disk_node(q, set->numa_node, lkclass);
	if (!disk) {
		blk_cleanup_queue(q);
		return ERR_PTR(-ENOMEM);
	}
	return disk;
}
EXPORT_SYMBOL(__blk_mq_alloc_disk);
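
/*
 * Illustrative sketch (not part of blk-mq core): the usual driver-side
 * sequence that ends up here is to populate a tag set and then let blk-mq
 * allocate the queue together with the gendisk. mydrv_* names are
 * hypothetical and error unwinding is omitted.
 *
 *	set->ops		= &mydrv_mq_ops;
 *	set->nr_hw_queues	= mydrv_nr_queues(dev);
 *	set->queue_depth	= 128;
 *	set->numa_node		= dev_to_node(&pdev->dev);
 *	set->cmd_size		= sizeof(struct mydrv_cmd);
 *	set->flags		= BLK_MQ_F_SHOULD_MERGE;
 *
 *	if (blk_mq_alloc_tag_set(set))
 *		goto out;
 *	disk = blk_mq_alloc_disk(set, dev);
 *	if (IS_ERR(disk))
 *		goto out_free_tag_set;
 */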
9316a9ed 3239
34d11ffa
JW
3240static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
3241 struct blk_mq_tag_set *set, struct request_queue *q,
3242 int hctx_idx, int node)
3243{
2f8f1336 3244 struct blk_mq_hw_ctx *hctx = NULL, *tmp;
34d11ffa 3245
2f8f1336
ML
3246 /* reuse dead hctx first */
3247 spin_lock(&q->unused_hctx_lock);
3248 list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
3249 if (tmp->numa_node == node) {
3250 hctx = tmp;
3251 break;
3252 }
3253 }
3254 if (hctx)
3255 list_del_init(&hctx->hctx_list);
3256 spin_unlock(&q->unused_hctx_lock);
3257
3258 if (!hctx)
3259 hctx = blk_mq_alloc_hctx(q, set, node);
34d11ffa 3260 if (!hctx)
7c6c5b7c 3261 goto fail;
34d11ffa 3262
7c6c5b7c
ML
3263 if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
3264 goto free_hctx;
34d11ffa
JW
3265
3266 return hctx;
7c6c5b7c
ML
3267
3268 free_hctx:
3269 kobject_put(&hctx->kobj);
3270 fail:
3271 return NULL;
34d11ffa
JW
3272}
3273
868f2f0b
KB
3274static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
3275 struct request_queue *q)
320ae51f 3276{
e01ad46d 3277 int i, j, end;
868f2f0b 3278 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
f14bbe77 3279
ac0d6b92
BVA
3280 if (q->nr_hw_queues < set->nr_hw_queues) {
3281 struct blk_mq_hw_ctx **new_hctxs;
3282
3283 new_hctxs = kcalloc_node(set->nr_hw_queues,
3284 sizeof(*new_hctxs), GFP_KERNEL,
3285 set->numa_node);
3286 if (!new_hctxs)
3287 return;
3288 if (hctxs)
3289 memcpy(new_hctxs, hctxs, q->nr_hw_queues *
3290 sizeof(*hctxs));
3291 q->queue_hw_ctx = new_hctxs;
ac0d6b92
BVA
3292 kfree(hctxs);
3293 hctxs = new_hctxs;
3294 }
3295
fb350e0a
ML
3296 /* protect against switching io scheduler */
3297 mutex_lock(&q->sysfs_lock);
24d2f903 3298 for (i = 0; i < set->nr_hw_queues; i++) {
868f2f0b 3299 int node;
34d11ffa 3300 struct blk_mq_hw_ctx *hctx;
868f2f0b 3301
7d76f856 3302 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
34d11ffa
JW
3303 /*
3304 * If the hw queue has been mapped to another numa node,
3305 * we need to realloc the hctx. If allocation fails, fallback
3306 * to use the previous one.
3307 */
3308 if (hctxs[i] && (hctxs[i]->numa_node == node))
3309 continue;
868f2f0b 3310
34d11ffa
JW
3311 hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
3312 if (hctx) {
2f8f1336 3313 if (hctxs[i])
34d11ffa 3314 blk_mq_exit_hctx(q, set, hctxs[i], i);
34d11ffa
JW
3315 hctxs[i] = hctx;
3316 } else {
3317 if (hctxs[i])
3318 pr_warn("Allocate new hctx on node %d fails,\
3319 fallback to previous one on node %d\n",
3320 node, hctxs[i]->numa_node);
3321 else
3322 break;
868f2f0b 3323 }
320ae51f 3324 }
e01ad46d
JW
3325 /*
3326 * Increasing nr_hw_queues fails. Free the newly allocated
3327 * hctxs and keep the previous q->nr_hw_queues.
3328 */
3329 if (i != set->nr_hw_queues) {
3330 j = q->nr_hw_queues;
3331 end = i;
3332 } else {
3333 j = i;
3334 end = q->nr_hw_queues;
3335 q->nr_hw_queues = set->nr_hw_queues;
3336 }
34d11ffa 3337
e01ad46d 3338 for (; j < end; j++) {
868f2f0b
KB
3339 struct blk_mq_hw_ctx *hctx = hctxs[j];
3340
3341 if (hctx) {
e155b0c2 3342 __blk_mq_free_map_and_rqs(set, j);
868f2f0b 3343 blk_mq_exit_hctx(q, set, hctx, j);
868f2f0b 3344 hctxs[j] = NULL;
868f2f0b
KB
3345 }
3346 }
fb350e0a 3347 mutex_unlock(&q->sysfs_lock);
868f2f0b
KB
3348}
3349
26a9750a
CH
3350int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
3351 struct request_queue *q)
868f2f0b 3352{
66841672
ML
3353 /* mark the queue as mq asap */
3354 q->mq_ops = set->ops;
3355
34dbad5d 3356 q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
720b8ccc
SB
3357 blk_mq_poll_stats_bkt,
3358 BLK_MQ_POLL_STATS_BKTS, q);
34dbad5d
OS
3359 if (!q->poll_cb)
3360 goto err_exit;
3361
1db4909e 3362 if (blk_mq_alloc_ctxs(q))
41de54c6 3363 goto err_poll;
868f2f0b 3364
737f98cf
ML
3365 /* init q->mq_kobj and sw queues' kobjects */
3366 blk_mq_sysfs_init(q);
3367
2f8f1336
ML
3368 INIT_LIST_HEAD(&q->unused_hctx_list);
3369 spin_lock_init(&q->unused_hctx_lock);
3370
868f2f0b
KB
3371 blk_mq_realloc_hw_ctxs(set, q);
3372 if (!q->nr_hw_queues)
3373 goto err_hctxs;
320ae51f 3374
287922eb 3375 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
e56f698b 3376 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
320ae51f 3377
a8908939 3378 q->tag_set = set;
320ae51f 3379
94eddfbe 3380 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
cd19181b
ML
3381 if (set->nr_maps > HCTX_TYPE_POLL &&
3382 set->map[HCTX_TYPE_POLL].nr_queues)
6544d229 3383 blk_queue_flag_set(QUEUE_FLAG_POLL, q);
320ae51f 3384
2849450a 3385 INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
6fca6a61
CH
3386 INIT_LIST_HEAD(&q->requeue_list);
3387 spin_lock_init(&q->requeue_lock);
3388
eba71768
JA
3389 q->nr_requests = set->queue_depth;
3390
64f1c21e
JA
3391 /*
3392 * Default to classic polling
3393 */
29ece8b4 3394 q->poll_nsec = BLK_MQ_POLL_CLASSIC;
64f1c21e 3395
24d2f903 3396 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
0d2602ca 3397 blk_mq_add_queue_tag_set(set, q);
4b855ad3 3398 blk_mq_map_swqueue(q);
26a9750a 3399 return 0;
18741986 3400
320ae51f 3401err_hctxs:
868f2f0b 3402 kfree(q->queue_hw_ctx);
73d9c8d4 3403 q->nr_hw_queues = 0;
1db4909e 3404 blk_mq_sysfs_deinit(q);
41de54c6
JS
3405err_poll:
3406 blk_stat_free_callback(q->poll_cb);
3407 q->poll_cb = NULL;
c7de5726
ML
3408err_exit:
3409 q->mq_ops = NULL;
26a9750a 3410 return -ENOMEM;
320ae51f 3411}
b62c21b7 3412EXPORT_SYMBOL(blk_mq_init_allocated_queue);
320ae51f 3413
c7e2d94b
ML
3414/* tags can _not_ be used after returning from blk_mq_exit_queue */
3415void blk_mq_exit_queue(struct request_queue *q)
320ae51f 3416{
630ef623 3417 struct blk_mq_tag_set *set = q->tag_set;
320ae51f 3418
630ef623 3419 /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */
624dbe47 3420 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
630ef623
BVA
3421 /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */
3422 blk_mq_del_queue_tag_set(q);
320ae51f 3423}
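
/*
 * Illustrative sketch (not part of blk-mq core): from a driver's point of
 * view the teardown that eventually reaches blk_mq_exit_queue() is the mirror
 * image of the allocation sequence. mydrv_* names are hypothetical.
 *
 *	del_gendisk(dev->disk);
 *	blk_cleanup_disk(dev->disk);	// tears down the queue, ends up here
 *	blk_mq_free_tag_set(&dev->tag_set);
 */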
320ae51f 3424
a5164405
JA
3425static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
3426{
3427 int i;
3428
079a2e3e
JG
3429 if (blk_mq_is_shared_tags(set->flags)) {
3430 set->shared_tags = blk_mq_alloc_map_and_rqs(set,
e155b0c2
JG
3431 BLK_MQ_NO_HCTX_IDX,
3432 set->queue_depth);
079a2e3e 3433 if (!set->shared_tags)
e155b0c2
JG
3434 return -ENOMEM;
3435 }
3436
8229cca8 3437 for (i = 0; i < set->nr_hw_queues; i++) {
63064be1 3438 if (!__blk_mq_alloc_map_and_rqs(set, i))
a5164405 3439 goto out_unwind;
8229cca8
XT
3440 cond_resched();
3441 }
a5164405
JA
3442
3443 return 0;
3444
3445out_unwind:
e155b0c2
JG
3446 while (--i >= 0)
3447 __blk_mq_free_map_and_rqs(set, i);
3448
079a2e3e
JG
3449 if (blk_mq_is_shared_tags(set->flags)) {
3450 blk_mq_free_map_and_rqs(set, set->shared_tags,
e155b0c2 3451 BLK_MQ_NO_HCTX_IDX);
645db34e 3452 }
a5164405 3453
a5164405
JA
3454 return -ENOMEM;
3455}
3456
3457/*
3458 * Allocate the request maps associated with this tag_set. Note that this
3459 * may reduce the depth asked for, if memory is tight. set->queue_depth
3460 * will be updated to reflect the allocated depth.
3461 */
63064be1 3462static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
a5164405
JA
3463{
3464 unsigned int depth;
3465 int err;
3466
3467 depth = set->queue_depth;
3468 do {
3469 err = __blk_mq_alloc_rq_maps(set);
3470 if (!err)
3471 break;
3472
3473 set->queue_depth >>= 1;
3474 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
3475 err = -ENOMEM;
3476 break;
3477 }
3478 } while (set->queue_depth);
3479
3480 if (!set->queue_depth || err) {
3481 pr_err("blk-mq: failed to allocate request map\n");
3482 return -ENOMEM;
3483 }
3484
3485 if (depth != set->queue_depth)
3486 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
3487 depth, set->queue_depth);
3488
3489 return 0;
3490}
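/*
 * Editorial note, not in the original source: with a requested queue_depth of
 * 256 and no reserved tags, two failed allocation rounds followed by a
 * successful one leave set->queue_depth at 64, and the code above then prints
 * "blk-mq: reduced tag depth (256 -> 64)". The loop gives up entirely once
 * the depth would drop below set->reserved_tags + BLK_MQ_TAG_MIN.
 */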
3491
ebe8bddb
OS
3492static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
3493{
6e66b493
BVA
3494 /*
3495 * blk_mq_map_queues() and multiple .map_queues() implementations
3496 * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
3497 * number of hardware queues.
3498 */
3499 if (set->nr_maps == 1)
3500 set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
3501
59388702 3502 if (set->ops->map_queues && !is_kdump_kernel()) {
b3c661b1
JA
3503 int i;
3504
7d4901a9
ML
3505 /*
3506 * transport .map_queues is usually done in the following
3507 * way:
3508 *
3509 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
3510 * mask = get_cpu_mask(queue)
3511 * for_each_cpu(cpu, mask)
b3c661b1 3512 * set->map[x].mq_map[cpu] = queue;
7d4901a9
ML
3513 * }
3514 *
3515	 * When we need to remap, the table has to be cleared first
3516	 * to kill stale mappings, since a CPU may end up not mapped
3517	 * to any hw queue.
3518 */
b3c661b1
JA
3519 for (i = 0; i < set->nr_maps; i++)
3520 blk_mq_clear_mq_map(&set->map[i]);
7d4901a9 3521
ebe8bddb 3522 return set->ops->map_queues(set);
b3c661b1
JA
3523 } else {
3524 BUG_ON(set->nr_maps > 1);
7d76f856 3525 return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
b3c661b1 3526 }
ebe8bddb
OS
3527}
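/*
 * Editorial sketch, not part of this file: the simplest driver ->map_queues
 * callback that feeds the helper above just spreads CPUs over the default
 * map, mirroring the fallback branch; "example_map_queues" is a hypothetical
 * name.
 *
 *	static int example_map_queues(struct blk_mq_tag_set *set)
 *	{
 *		return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
 *	}
 */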
3528
f7e76dbc
BVA
3529static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
3530 int cur_nr_hw_queues, int new_nr_hw_queues)
3531{
3532 struct blk_mq_tags **new_tags;
3533
3534 if (cur_nr_hw_queues >= new_nr_hw_queues)
3535 return 0;
3536
3537 new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
3538 GFP_KERNEL, set->numa_node);
3539 if (!new_tags)
3540 return -ENOMEM;
3541
3542 if (set->tags)
3543 memcpy(new_tags, set->tags, cur_nr_hw_queues *
3544 sizeof(*set->tags));
3545 kfree(set->tags);
3546 set->tags = new_tags;
3547 set->nr_hw_queues = new_nr_hw_queues;
3548
3549 return 0;
3550}
3551
91cdf265
MI
3552static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
3553 int new_nr_hw_queues)
3554{
3555 return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
3556}
3557
a4391c64
JA
3558/*
3559 * Alloc a tag set to be associated with one or more request queues.
3560 * May fail with EINVAL for various error conditions. May adjust the
c018c84f 3561 * requested depth down, if it's too large. In that case, the set
a4391c64
JA
3562 * value will be stored in set->queue_depth.
3563 */
24d2f903
CH
3564int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
3565{
b3c661b1 3566 int i, ret;
da695ba2 3567
205fb5f5
BVA
3568 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
3569
24d2f903
CH
3570 if (!set->nr_hw_queues)
3571 return -EINVAL;
a4391c64 3572 if (!set->queue_depth)
24d2f903
CH
3573 return -EINVAL;
3574 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
3575 return -EINVAL;
3576
7d7e0f90 3577 if (!set->ops->queue_rq)
24d2f903
CH
3578 return -EINVAL;
3579
de148297
ML
3580 if (!set->ops->get_budget ^ !set->ops->put_budget)
3581 return -EINVAL;
3582
a4391c64
JA
3583 if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
3584 pr_info("blk-mq: reduced tag depth to %u\n",
3585 BLK_MQ_MAX_DEPTH);
3586 set->queue_depth = BLK_MQ_MAX_DEPTH;
3587 }
24d2f903 3588
b3c661b1
JA
3589 if (!set->nr_maps)
3590 set->nr_maps = 1;
3591 else if (set->nr_maps > HCTX_MAX_TYPES)
3592 return -EINVAL;
3593
6637fadf
SL
3594 /*
3595 * If a crashdump is active, then we are potentially in a very
3596 * memory constrained environment. Limit us to 1 queue and
3597 * 64 tags to prevent using too much memory.
3598 */
3599 if (is_kdump_kernel()) {
3600 set->nr_hw_queues = 1;
59388702 3601 set->nr_maps = 1;
6637fadf
SL
3602 set->queue_depth = min(64U, set->queue_depth);
3603 }
868f2f0b 3604 /*
392546ae
JA
3605 * There is no use for more h/w queues than cpus if we just have
3606 * a single map
868f2f0b 3607 */
392546ae 3608 if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
868f2f0b 3609 set->nr_hw_queues = nr_cpu_ids;
6637fadf 3610
91cdf265 3611 if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
a5164405 3612 return -ENOMEM;
24d2f903 3613
da695ba2 3614 ret = -ENOMEM;
b3c661b1
JA
3615 for (i = 0; i < set->nr_maps; i++) {
3616 set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
07b35eb5 3617 sizeof(set->map[i].mq_map[0]),
b3c661b1
JA
3618 GFP_KERNEL, set->numa_node);
3619 if (!set->map[i].mq_map)
3620 goto out_free_mq_map;
59388702 3621 set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
b3c661b1 3622 }
bdd17e75 3623
ebe8bddb 3624 ret = blk_mq_update_queue_map(set);
da695ba2
CH
3625 if (ret)
3626 goto out_free_mq_map;
3627
63064be1 3628 ret = blk_mq_alloc_set_map_and_rqs(set);
da695ba2 3629 if (ret)
bdd17e75 3630 goto out_free_mq_map;
24d2f903 3631
0d2602ca
JA
3632 mutex_init(&set->tag_list_lock);
3633 INIT_LIST_HEAD(&set->tag_list);
3634
24d2f903 3635 return 0;
bdd17e75
CH
3636
3637out_free_mq_map:
b3c661b1
JA
3638 for (i = 0; i < set->nr_maps; i++) {
3639 kfree(set->map[i].mq_map);
3640 set->map[i].mq_map = NULL;
3641 }
5676e7b6
RE
3642 kfree(set->tags);
3643 set->tags = NULL;
da695ba2 3644 return ret;
24d2f903
CH
3645}
3646EXPORT_SYMBOL(blk_mq_alloc_tag_set);
3647
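/*
 * Editorial sketch, not part of this file: typical driver-side setup before
 * calling blk_mq_alloc_tag_set(). "example_set", "example_mq_ops" and
 * "struct example_cmd" are hypothetical driver names.
 *
 *	static struct blk_mq_tag_set example_set;
 *
 *	memset(&example_set, 0, sizeof(example_set));
 *	example_set.ops = &example_mq_ops;
 *	example_set.nr_hw_queues = 4;
 *	example_set.queue_depth = 128;
 *	example_set.numa_node = NUMA_NO_NODE;
 *	example_set.cmd_size = sizeof(struct example_cmd);
 *	example_set.flags = BLK_MQ_F_SHOULD_MERGE;
 *
 *	ret = blk_mq_alloc_tag_set(&example_set);
 */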
cdb14e0f
CH
3648/* allocate and initialize a tagset for a simple single-queue device */
3649int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
3650 const struct blk_mq_ops *ops, unsigned int queue_depth,
3651 unsigned int set_flags)
3652{
3653 memset(set, 0, sizeof(*set));
3654 set->ops = ops;
3655 set->nr_hw_queues = 1;
3656 set->nr_maps = 1;
3657 set->queue_depth = queue_depth;
3658 set->numa_node = NUMA_NO_NODE;
3659 set->flags = set_flags;
3660 return blk_mq_alloc_tag_set(set);
3661}
3662EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
3663
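/*
 * Editorial sketch, not part of this file: the helper above collapses the
 * blk_mq_alloc_tag_set() boilerplate for single-queue devices, e.g.
 *
 *	ret = blk_mq_alloc_sq_tag_set(&example_set, &example_mq_ops, 64,
 *				      BLK_MQ_F_SHOULD_MERGE);
 *
 * where "example_set" and "example_mq_ops" are hypothetical driver names.
 */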
24d2f903
CH
3664void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
3665{
b3c661b1 3666 int i, j;
24d2f903 3667
e155b0c2
JG
3668 for (i = 0; i < set->nr_hw_queues; i++)
3669 __blk_mq_free_map_and_rqs(set, i);
484b4061 3670
079a2e3e
JG
3671 if (blk_mq_is_shared_tags(set->flags)) {
3672 blk_mq_free_map_and_rqs(set, set->shared_tags,
e155b0c2
JG
3673 BLK_MQ_NO_HCTX_IDX);
3674 }
32bc15af 3675
b3c661b1
JA
3676 for (j = 0; j < set->nr_maps; j++) {
3677 kfree(set->map[j].mq_map);
3678 set->map[j].mq_map = NULL;
3679 }
bdd17e75 3680
981bd189 3681 kfree(set->tags);
5676e7b6 3682 set->tags = NULL;
24d2f903
CH
3683}
3684EXPORT_SYMBOL(blk_mq_free_tag_set);
3685
e3a2b3f9
JA
3686int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
3687{
3688 struct blk_mq_tag_set *set = q->tag_set;
3689 struct blk_mq_hw_ctx *hctx;
3690 int i, ret;
3691
bd166ef1 3692 if (!set)
e3a2b3f9
JA
3693 return -EINVAL;
3694
e5fa8140
AZ
3695 if (q->nr_requests == nr)
3696 return 0;
3697
70f36b60 3698 blk_mq_freeze_queue(q);
24f5a90f 3699 blk_mq_quiesce_queue(q);
70f36b60 3700
e3a2b3f9
JA
3701 ret = 0;
3702 queue_for_each_hw_ctx(q, hctx, i) {
e9137d4b
KB
3703 if (!hctx->tags)
3704 continue;
bd166ef1
JA
3705 /*
3706 * If we're using an MQ scheduler, just update the scheduler
3707 * queue depth. This is similar to what the old code would do.
3708 */
f6adcef5 3709 if (hctx->sched_tags) {
70f36b60 3710 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
f6adcef5 3711 nr, true);
f6adcef5
JG
3712 } else {
3713 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
3714 false);
70f36b60 3715 }
e3a2b3f9
JA
3716 if (ret)
3717 break;
77f1e0a5
JA
3718 if (q->elevator && q->elevator->type->ops.depth_updated)
3719 q->elevator->type->ops.depth_updated(hctx);
e3a2b3f9 3720 }
d97e594c 3721 if (!ret) {
e3a2b3f9 3722 q->nr_requests = nr;
079a2e3e 3723 if (blk_mq_is_shared_tags(set->flags)) {
8fa04464 3724 if (q->elevator)
079a2e3e 3725 blk_mq_tag_update_sched_shared_tags(q);
8fa04464 3726 else
079a2e3e 3727 blk_mq_tag_resize_shared_tags(set, nr);
8fa04464 3728 }
d97e594c 3729 }
e3a2b3f9 3730
24f5a90f 3731 blk_mq_unquiesce_queue(q);
70f36b60 3732 blk_mq_unfreeze_queue(q);
70f36b60 3733
e3a2b3f9
JA
3734 return ret;
3735}
3736
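/*
 * Editorial note, not in the original source: this is the path behind the
 * per-queue sysfs knob, e.g. "echo 64 > /sys/block/<dev>/queue/nr_requests",
 * which reaches blk_mq_update_nr_requests() from the store handler in
 * blk-sysfs.c. The queue is frozen and quiesced for the duration, so
 * in-flight requests drain before the tag depth changes.
 */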
d48ece20
JW
3737/*
3738 * request_queue and elevator_type pair.
3739 * It is just used by __blk_mq_update_nr_hw_queues to cache
3740 * the elevator_type associated with a request_queue.
3741 */
3742struct blk_mq_qe_pair {
3743 struct list_head node;
3744 struct request_queue *q;
3745 struct elevator_type *type;
3746};
3747
3748/*
3749 * Cache the elevator_type in the qe pair list and switch the
3750 * io scheduler to 'none'
3751 */
3752static bool blk_mq_elv_switch_none(struct list_head *head,
3753 struct request_queue *q)
3754{
3755 struct blk_mq_qe_pair *qe;
3756
3757 if (!q->elevator)
3758 return true;
3759
3760 qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
3761 if (!qe)
3762 return false;
3763
3764 INIT_LIST_HEAD(&qe->node);
3765 qe->q = q;
3766 qe->type = q->elevator->type;
3767 list_add(&qe->node, head);
3768
3769 mutex_lock(&q->sysfs_lock);
3770 /*
3771 * After elevator_switch_mq, the previous elevator_queue will be
3772	 * released by elevator_release. The reference on the io scheduler
3773	 * module taken by elevator_get will also be put. So we need to take
3774	 * an extra reference on the io scheduler module here to prevent it
3775	 * from being removed.
3776 */
3777 __module_get(qe->type->elevator_owner);
3778 elevator_switch_mq(q, NULL);
3779 mutex_unlock(&q->sysfs_lock);
3780
3781 return true;
3782}
3783
3784static void blk_mq_elv_switch_back(struct list_head *head,
3785 struct request_queue *q)
3786{
3787 struct blk_mq_qe_pair *qe;
3788 struct elevator_type *t = NULL;
3789
3790 list_for_each_entry(qe, head, node)
3791 if (qe->q == q) {
3792 t = qe->type;
3793 break;
3794 }
3795
3796 if (!t)
3797 return;
3798
3799 list_del(&qe->node);
3800 kfree(qe);
3801
3802 mutex_lock(&q->sysfs_lock);
3803 elevator_switch_mq(q, t);
3804 mutex_unlock(&q->sysfs_lock);
3805}
3806
e4dc2b32
KB
3807static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
3808 int nr_hw_queues)
868f2f0b
KB
3809{
3810 struct request_queue *q;
d48ece20 3811 LIST_HEAD(head);
e01ad46d 3812 int prev_nr_hw_queues;
868f2f0b 3813
705cda97
BVA
3814 lockdep_assert_held(&set->tag_list_lock);
3815
392546ae 3816 if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
868f2f0b 3817 nr_hw_queues = nr_cpu_ids;
fe35ec58
WZ
3818 if (nr_hw_queues < 1)
3819 return;
3820 if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
868f2f0b
KB
3821 return;
3822
3823 list_for_each_entry(q, &set->tag_list, tag_set_list)
3824 blk_mq_freeze_queue(q);
d48ece20
JW
3825 /*
3826 * Switch IO scheduler to 'none', cleaning up the data associated
3827 * with the previous scheduler. We will switch back once we are done
3828 * updating the new sw to hw queue mappings.
3829 */
3830 list_for_each_entry(q, &set->tag_list, tag_set_list)
3831 if (!blk_mq_elv_switch_none(&head, q))
3832 goto switch_back;
868f2f0b 3833
477e19de
JW
3834 list_for_each_entry(q, &set->tag_list, tag_set_list) {
3835 blk_mq_debugfs_unregister_hctxs(q);
3836 blk_mq_sysfs_unregister(q);
3837 }
3838
a2584e43 3839 prev_nr_hw_queues = set->nr_hw_queues;
f7e76dbc
BVA
3840 if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
3841 0)
3842 goto reregister;
3843
868f2f0b 3844 set->nr_hw_queues = nr_hw_queues;
e01ad46d 3845fallback:
aa880ad6 3846 blk_mq_update_queue_map(set);
868f2f0b
KB
3847 list_for_each_entry(q, &set->tag_list, tag_set_list) {
3848 blk_mq_realloc_hw_ctxs(set, q);
e01ad46d
JW
3849 if (q->nr_hw_queues != set->nr_hw_queues) {
3850 pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
3851 nr_hw_queues, prev_nr_hw_queues);
3852 set->nr_hw_queues = prev_nr_hw_queues;
7d76f856 3853 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
e01ad46d
JW
3854 goto fallback;
3855 }
477e19de
JW
3856 blk_mq_map_swqueue(q);
3857 }
3858
f7e76dbc 3859reregister:
477e19de
JW
3860 list_for_each_entry(q, &set->tag_list, tag_set_list) {
3861 blk_mq_sysfs_register(q);
3862 blk_mq_debugfs_register_hctxs(q);
868f2f0b
KB
3863 }
3864
d48ece20
JW
3865switch_back:
3866 list_for_each_entry(q, &set->tag_list, tag_set_list)
3867 blk_mq_elv_switch_back(&head, q);
3868
868f2f0b
KB
3869 list_for_each_entry(q, &set->tag_list, tag_set_list)
3870 blk_mq_unfreeze_queue(q);
3871}
e4dc2b32
KB
3872
3873void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
3874{
3875 mutex_lock(&set->tag_list_lock);
3876 __blk_mq_update_nr_hw_queues(set, nr_hw_queues);
3877 mutex_unlock(&set->tag_list_lock);
3878}
868f2f0b
KB
3879EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
3880
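/*
 * Editorial sketch, not part of this file: drivers call the helper above when
 * the number of usable hardware queues changes, e.g. after a controller
 * reset; "ctrl" is a hypothetical driver structure embedding the tag set.
 *
 *	blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_usable_queues);
 */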
34dbad5d
OS
3881/* Enable polling stats and return whether they were already enabled. */
3882static bool blk_poll_stats_enable(struct request_queue *q)
3883{
3884 if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
7dfdbc73 3885 blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
34dbad5d
OS
3886 return true;
3887 blk_stat_add_callback(q, q->poll_cb);
3888 return false;
3889}
3890
3891static void blk_mq_poll_stats_start(struct request_queue *q)
3892{
3893 /*
3894 * We don't arm the callback if polling stats are not enabled or the
3895 * callback is already active.
3896 */
3897 if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
3898 blk_stat_is_active(q->poll_cb))
3899 return;
3900
3901 blk_stat_activate_msecs(q->poll_cb, 100);
3902}
3903
3904static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
3905{
3906 struct request_queue *q = cb->data;
720b8ccc 3907 int bucket;
34dbad5d 3908
720b8ccc
SB
3909 for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
3910 if (cb->stat[bucket].nr_samples)
3911 q->poll_stat[bucket] = cb->stat[bucket];
3912 }
34dbad5d
OS
3913}
3914
64f1c21e 3915static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
64f1c21e
JA
3916 struct request *rq)
3917{
64f1c21e 3918 unsigned long ret = 0;
720b8ccc 3919 int bucket;
64f1c21e
JA
3920
3921 /*
3922 * If stats collection isn't on, don't sleep but turn it on for
3923 * future users
3924 */
34dbad5d 3925 if (!blk_poll_stats_enable(q))
64f1c21e
JA
3926 return 0;
3927
64f1c21e
JA
3928 /*
3929 * As an optimistic guess, use half of the mean service time
3930 * for this type of request. We can (and should) make this smarter.
3931 * For instance, if the completion latencies are tight, we can
3932 * get closer than just half the mean. This is especially
3933 * important on devices where the completion latencies are longer
720b8ccc
SB
3934	 * than ~10 usec. We do use the stats for the relevant IO size,
3935	 * if available, which does lead to better estimates.
64f1c21e 3936 */
720b8ccc
SB
3937 bucket = blk_mq_poll_stats_bkt(rq);
3938 if (bucket < 0)
3939 return ret;
3940
3941 if (q->poll_stat[bucket].nr_samples)
3942 ret = (q->poll_stat[bucket].mean + 1) / 2;
64f1c21e
JA
3943
3944 return ret;
3945}
3946
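/*
 * Editorial note, not in the original source: a worked example of the
 * estimate above. If the poll-stat bucket for this request reports a mean
 * completion time of 20000 ns, the function returns (20000 + 1) / 2 ns, so
 * the hybrid poll path sleeps for roughly 10 us before it starts busy
 * polling.
 */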
06426adf
JA
3947static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
3948 struct request *rq)
3949{
3950 struct hrtimer_sleeper hs;
3951 enum hrtimer_mode mode;
64f1c21e 3952 unsigned int nsecs;
06426adf
JA
3953 ktime_t kt;
3954
76a86f9d 3955 if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
64f1c21e
JA
3956 return false;
3957
3958 /*
1052b8ac 3959 * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
64f1c21e 3960 *
64f1c21e
JA
3961 * 0: use half of prev avg
3962 * >0: use this specific value
3963 */
1052b8ac 3964 if (q->poll_nsec > 0)
64f1c21e
JA
3965 nsecs = q->poll_nsec;
3966 else
cae740a0 3967 nsecs = blk_mq_poll_nsecs(q, rq);
64f1c21e
JA
3968
3969 if (!nsecs)
06426adf
JA
3970 return false;
3971
76a86f9d 3972 rq->rq_flags |= RQF_MQ_POLL_SLEPT;
06426adf
JA
3973
3974 /*
3975 * This will be replaced with the stats tracking code, using
3976 * 'avg_completion_time / 2' as the pre-sleep target.
3977 */
8b0e1953 3978 kt = nsecs;
06426adf
JA
3979
3980 mode = HRTIMER_MODE_REL;
dbc1625f 3981 hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
06426adf
JA
3982 hrtimer_set_expires(&hs.timer, kt);
3983
06426adf 3984 do {
5a61c363 3985 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
06426adf
JA
3986 break;
3987 set_current_state(TASK_UNINTERRUPTIBLE);
9dd8813e 3988 hrtimer_sleeper_start_expires(&hs, mode);
06426adf
JA
3989 if (hs.task)
3990 io_schedule();
3991 hrtimer_cancel(&hs.timer);
3992 mode = HRTIMER_MODE_ABS;
3993 } while (hs.task && !signal_pending(current));
3994
3995 __set_current_state(TASK_RUNNING);
3996 destroy_hrtimer_on_stack(&hs.timer);
3997 return true;
3998}
3999
1052b8ac
JA
4000static bool blk_mq_poll_hybrid(struct request_queue *q,
4001 struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
bbd7bb70 4002{
1052b8ac
JA
4003 struct request *rq;
4004
29ece8b4 4005 if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
1052b8ac
JA
4006 return false;
4007
4008 if (!blk_qc_t_is_internal(cookie))
4009 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
4010 else {
4011 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
4012 /*
4013 * With scheduling, if the request has completed, we'll
4014 * get a NULL return here, as we clear the sched tag when
4015 * that happens. The request still remains valid, like always,
4016 * so we should be safe with just the NULL check.
4017 */
4018 if (!rq)
4019 return false;
4020 }
4021
cae740a0 4022 return blk_mq_poll_hybrid_sleep(q, rq);
1052b8ac
JA
4023}
4024
529262d5
CH
4025/**
4026 * blk_poll - poll for IO completions
4027 * @q: the queue
4028 * @cookie: cookie passed back at IO submission time
4029 * @spin: whether to spin for completions
4030 *
4031 * Description:
4032 * Poll for completions on the passed in queue. Returns number of
4033 * completed entries found. If @spin is true, then blk_poll will continue
4034 * looping until at least one completion is found, unless the task is
4035 * otherwise marked running (or we need to reschedule).
4036 */
4037int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
1052b8ac
JA
4038{
4039 struct blk_mq_hw_ctx *hctx;
2f064a59 4040 unsigned int state;
bbd7bb70 4041
529262d5
CH
4042 if (!blk_qc_t_valid(cookie) ||
4043 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
1052b8ac
JA
4044 return 0;
4045
529262d5
CH
4046 if (current->plug)
4047 blk_flush_plug_list(current->plug, false);
4048
1052b8ac
JA
4049 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
4050
06426adf
JA
4051 /*
4052 * If we sleep, have the caller restart the poll loop to reset
4053 * the state. Like for the other success return cases, the
4054 * caller is responsible for checking if the IO completed. If
4055 * the IO isn't complete, we'll get called again and will go
f6f371f7
PB
4056 * straight to the busy poll loop. If specified not to spin,
4057 * we also should not sleep.
06426adf 4058 */
f6f371f7 4059 if (spin && blk_mq_poll_hybrid(q, hctx, cookie))
85f4d4b6 4060 return 1;
06426adf 4061
bbd7bb70
JA
4062 hctx->poll_considered++;
4063
d6c23bb3 4064 state = get_current_state();
aa61bec3 4065 do {
bbd7bb70
JA
4066 int ret;
4067
4068 hctx->poll_invoked++;
4069
9743139c 4070 ret = q->mq_ops->poll(hctx);
bbd7bb70
JA
4071 if (ret > 0) {
4072 hctx->poll_success++;
849a3700 4073 __set_current_state(TASK_RUNNING);
85f4d4b6 4074 return ret;
bbd7bb70
JA
4075 }
4076
4077 if (signal_pending_state(state, current))
849a3700 4078 __set_current_state(TASK_RUNNING);
bbd7bb70 4079
b03fbd4f 4080 if (task_is_running(current))
85f4d4b6 4081 return 1;
0a1b8b87 4082 if (ret < 0 || !spin)
bbd7bb70
JA
4083 break;
4084 cpu_relax();
aa61bec3 4085 } while (!need_resched());
bbd7bb70 4086
67b4110f 4087 __set_current_state(TASK_RUNNING);
85f4d4b6 4088 return 0;
bbd7bb70 4089}
529262d5 4090EXPORT_SYMBOL_GPL(blk_poll);
bbd7bb70 4091
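/*
 * Editorial sketch, not part of this file: in this snapshot submission still
 * hands back a blk_qc_t cookie, so a simplified synchronous caller can busy
 * poll roughly as follows ("done" is a hypothetical completion flag set by
 * the driver's completion path).
 *
 *	blk_qc_t cookie = submit_bio(bio);
 *
 *	while (!READ_ONCE(done))
 *		blk_poll(q, cookie, true);
 */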
9cf2bab6
JA
4092unsigned int blk_mq_rq_cpu(struct request *rq)
4093{
4094 return rq->mq_ctx->cpu;
4095}
4096EXPORT_SYMBOL(blk_mq_rq_cpu);
4097
320ae51f
JA
4098static int __init blk_mq_init(void)
4099{
c3077b5d
CH
4100 int i;
4101
4102 for_each_possible_cpu(i)
f9ab4918 4103 init_llist_head(&per_cpu(blk_cpu_done, i));
c3077b5d
CH
4104 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
4105
4106 cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
4107 "block/softirq:dead", NULL,
4108 blk_softirq_cpu_dead);
9467f859
TG
4109 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
4110 blk_mq_hctx_notify_dead);
bf0beec0
ML
4111 cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
4112 blk_mq_hctx_notify_online,
4113 blk_mq_hctx_notify_offline);
320ae51f
JA
4114 return 0;
4115}
4116subsys_initcall(blk_mq_init);