block/blk-mq-sched.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * blk-mq scheduling framework
   4  *
   5  * Copyright (C) 2016 Jens Axboe
   6  */
   7 #include <linux/kernel.h>
   8 #include <linux/module.h>
   9 #include <linux/list_sort.h>
  10
  11 #include <trace/events/block.h>
  12
  13 #include "blk.h"
  14 #include "blk-mq.h"
  15 #include "blk-mq-debugfs.h"
  16 #include "blk-mq-sched.h"
  17 #include "blk-wbt.h"
  18
  19 /*
  20  * Mark a hardware queue as needing a restart.
  21  */
  22 void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
  23 {
  24         if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
  25                 return;
  26
  27         set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
  28 }
  29 EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
  30
/*
 * Clear BLK_MQ_S_SCHED_RESTART on @hctx and then kick a run of the hardware
 * queue (second argument to blk_mq_run_hw_queue() is true).
 */
void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
        clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

        /*
         * Full barrier: orders the clearing of SCHED_RESTART above against
         * the list_empty_careful(&hctx->dispatch) check performed in
         * blk_mq_run_hw_queue().  It pairs with the barrier in
         * blk_mq_dispatch_rq_list().  Without it, the dispatch code could
         * fail to see SCHED_RESTART while, at the same time, a request newly
         * added to hctx->dispatch is missed by the check in
         * blk_mq_run_hw_queue().
         */
        smp_mb();

        blk_mq_run_hw_queue(hctx, true);
}
  46
  47 static int sched_rq_cmp(void *priv, const struct list_head *a,
  48                         const struct list_head *b)
  49 {
  50         struct request *rqa = container_of(a, struct request, queuelist);
  51         struct request *rqb = container_of(b, struct request, queuelist);
  52
  53         return rqa->mq_hctx > rqb->mq_hctx;
  54 }
  55
  56 static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
  57 {
  58         struct blk_mq_hw_ctx *hctx =
  59                 list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
  60         struct request *rq;
  61         LIST_HEAD(hctx_list);
  62         unsigned int count = 0;
  63
  64         list_for_each_entry(rq, rq_list, queuelist) {
  65                 if (rq->mq_hctx != hctx) {
  66                         list_cut_before(&hctx_list, rq_list, &rq->queuelist);
  67                         goto dispatch;
  68                 }
  69                 count++;
  70         }
  71         list_splice_tail_init(rq_list, &hctx_list);
  72
  73 dispatch:
  74         return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
  75 }
  76
  77 #define BLK_MQ_BUDGET_DELAY     3               /* ms units */
  78
  79 /*
  80  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  81  * its queue by itself in its completion handler, so we don't need to
  82  * restart queue if .get_budget() fails to get the budget.
  83  *
  84  * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
  85  * be run again.  This is necessary to avoid starving flushes.
  86  */
  87 static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
  88 {
  89         struct request_queue *q = hctx->queue;
  90         struct elevator_queue *e = q->elevator;
  91         bool multi_hctxs = false, run_queue = false;
  92         bool dispatched = false, busy = false;
  93         unsigned int max_dispatch;
  94         LIST_HEAD(rq_list);
  95         int count = 0;
  96
  97         if (hctx->dispatch_busy)
  98                 max_dispatch = 1;
  99         else
 100                 max_dispatch = hctx->queue->nr_requests;
 101
 102         do {
 103                 struct request *rq;
 104                 int budget_token;
 105
 106                 if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 107                         break;
 108
 109                 if (!list_empty_careful(&hctx->dispatch)) {
 110                         busy = true;
 111                         break;
 112                 }
 113
 114                 budget_token = blk_mq_get_dispatch_budget(q);
 115                 if (budget_token < 0)
 116                         break;
 117
 118                 rq = e->type->ops.dispatch_request(hctx);
 119                 if (!rq) {
 120                         blk_mq_put_dispatch_budget(q, budget_token);
 121                         /*
 122                          * We're releasing without dispatching. Holding the
 123                          * budget could have blocked any "hctx"s with the
 124                          * same queue and if we didn't dispatch then there's
 125                          * no guarantee anyone will kick the queue.  Kick it
 126                          * ourselves.
 127                          */
 128                         run_queue = true;
 129                         break;
 130                 }
 131
 132                 blk_mq_set_rq_budget_token(rq, budget_token);
 133
 134                 /*
 135                  * Now this rq owns the budget which has to be released
 136                  * if this rq won't be queued to driver via .queue_rq()
 137                  * in blk_mq_dispatch_rq_list().
 138                  */
 139                 list_add_tail(&rq->queuelist, &rq_list);
 140                 count++;
 141                 if (rq->mq_hctx != hctx)
 142                         multi_hctxs = true;
 143
 144                 /*
 145                  * If we cannot get tag for the request, stop dequeueing
 146                  * requests from the IO scheduler. We are unlikely to be able
 147                  * to submit them anyway and it creates false impression for
 148                  * scheduling heuristics that the device can take more IO.
 149                  */
 150                 if (!blk_mq_get_driver_tag(rq))
 151                         break;
 152         } while (count < max_dispatch);
 153
 154         if (!count) {
 155                 if (run_queue)
 156                         blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
 157         } else if (multi_hctxs) {
 158                 /*
 159                  * Requests from different hctx may be dequeued from some
 160                  * schedulers, such as bfq and deadline.
 161                  *
 162                  * Sort the requests in the list according to their hctx,
 163                  * dispatch batching requests from same hctx at a time.
 164                  */
 165                 list_sort(NULL, &rq_list, sched_rq_cmp);
 166                 do {
 167                         dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
 168                 } while (!list_empty(&rq_list));
 169         } else {
 170                 dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
 171         }
 172
 173         if (busy)
 174                 return -EAGAIN;
 175         return !!dispatched;
 176 }
 177
 178 static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 179 {
 180         unsigned long end = jiffies + HZ;
 181         int ret;
 182
 183         do {
 184                 ret = __blk_mq_do_dispatch_sched(hctx);
 185                 if (ret != 1)
 186                         break;
 187                 if (need_resched() || time_is_before_jiffies(end)) {
 188                         blk_mq_delay_run_hw_queue(hctx, 0);
 189                         break;
 190                 }
 191         } while (1);
 192
 193         return ret;
 194 }
 195
 196 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
 197                                           struct blk_mq_ctx *ctx)
 198 {
 199         unsigned short idx = ctx->index_hw[hctx->type];
 200
 201         if (++idx == hctx->nr_ctx)
 202                 idx = 0;
 203
 204         return hctx->ctxs[idx];
 205 }
 206
 207 /*
 208  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 209  * its queue by itself in its completion handler, so we don't need to
 210  * restart queue if .get_budget() fails to get the budget.
 211  *
 212  * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 213  * be run again.  This is necessary to avoid starving flushes.
 214  */
 215 static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 216 {
 217         struct request_queue *q = hctx->queue;
 218         LIST_HEAD(rq_list);
 219         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
 220         int ret = 0;
 221         struct request *rq;
 222
 223         do {
 224                 int budget_token;
 225
 226                 if (!list_empty_careful(&hctx->dispatch)) {
 227                         ret = -EAGAIN;
 228                         break;
 229                 }
 230
 231                 if (!sbitmap_any_bit_set(&hctx->ctx_map))
 232                         break;
 233
 234                 budget_token = blk_mq_get_dispatch_budget(q);
 235                 if (budget_token < 0)
 236                         break;
 237
 238                 rq = blk_mq_dequeue_from_ctx(hctx, ctx);
 239                 if (!rq) {
 240                         blk_mq_put_dispatch_budget(q, budget_token);
 241                         /*
 242                          * We're releasing without dispatching. Holding the
 243                          * budget could have blocked any "hctx"s with the
 244                          * same queue and if we didn't dispatch then there's
 245                          * no guarantee anyone will kick the queue.  Kick it
 246                          * ourselves.
 247                          */
 248                         blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
 249                         break;
 250                 }
 251
 252                 blk_mq_set_rq_budget_token(rq, budget_token);
 253
 254                 /*
 255                  * Now this rq owns the budget which has to be released
 256                  * if this rq won't be queued to driver via .queue_rq()
 257                  * in blk_mq_dispatch_rq_list().
 258                  */
 259                 list_add(&rq->queuelist, &rq_list);
 260
 261                 /* round robin for fair dispatch */
 262                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
 263
 264         } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));
 265
 266         WRITE_ONCE(hctx->dispatch_from, ctx);
 267         return ret;
 268 }
 269
 270 static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 271 {
 272         struct request_queue *q = hctx->queue;
 273         const bool has_sched = q->elevator;
 274         int ret = 0;
 275         LIST_HEAD(rq_list);
 276
 277         /*
 278          * If we have previous entries on our dispatch list, grab them first for
 279          * more fair dispatch.
 280          */
 281         if (!list_empty_careful(&hctx->dispatch)) {
 282                 spin_lock(&hctx->lock);
 283                 if (!list_empty(&hctx->dispatch))
 284                         list_splice_init(&hctx->dispatch, &rq_list);
 285                 spin_unlock(&hctx->lock);
 286         }
 287
 288         /*
 289          * Only ask the scheduler for requests, if we didn't have residual
 290          * requests from the dispatch list. This is to avoid the case where
 291          * we only ever dispatch a fraction of the requests available because
 292          * of low device queue depth. Once we pull requests out of the IO
 293          * scheduler, we can no longer merge or sort them. So it's best to
 294          * leave them there for as long as we can. Mark the hw queue as
 295          * needing a restart in that case.
 296          *
 297          * We want to dispatch from the scheduler if there was nothing
 298          * on the dispatch list or we were able to dispatch from the
 299          * dispatch list.
 300          */
 301         if (!list_empty(&rq_list)) {
 302                 blk_mq_sched_mark_restart_hctx(hctx);
 303                 if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) {
 304                         if (has_sched)
 305                                 ret = blk_mq_do_dispatch_sched(hctx);
 306                         else
 307                                 ret = blk_mq_do_dispatch_ctx(hctx);
 308                 }
 309         } else if (has_sched) {
 310                 ret = blk_mq_do_dispatch_sched(hctx);
 311         } else if (hctx->dispatch_busy) {
 312                 /* dequeue request one by one from sw queue if queue is busy */
 313                 ret = blk_mq_do_dispatch_ctx(hctx);
 314         } else {
 315                 blk_mq_flush_busy_ctxs(hctx, &rq_list);
 316                 blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
 317         }
 318
 319         return ret;
 320 }
 321
 322 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 323 {
 324         struct request_queue *q = hctx->queue;
 325
 326         /* RCU or SRCU read lock is needed before checking quiesced flag */
 327         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
 328                 return;
 329
 330         hctx->run++;
 331
 332         /*
 333          * A return of -EAGAIN is an indication that hctx->dispatch is not
 334          * empty and we must run again in order to avoid starving flushes.
 335          */
 336         if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
 337                 if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
 338                         blk_mq_run_hw_queue(hctx, true);
 339         }
 340 }
 341
 342 bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
 343                 unsigned int nr_segs)
 344 {
 345         struct elevator_queue *e = q->elevator;
 346         struct blk_mq_ctx *ctx;
 347         struct blk_mq_hw_ctx *hctx;
 348         bool ret = false;
 349         enum hctx_type type;
 350
 351         if (e && e->type->ops.bio_merge) {
 352                 ret = e->type->ops.bio_merge(q, bio, nr_segs);
 353                 goto out_put;
 354         }
 355
 356         ctx = blk_mq_get_ctx(q);
 357         hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
 358         type = hctx->type;
 359         if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
 360             list_empty_careful(&ctx->rq_lists[type]))
 361                 goto out_put;
 362
 363         /* default per sw-queue merge */
 364         spin_lock(&ctx->lock);
 365         /*
 366          * Reverse check our software queue for entries that we could
 367          * potentially merge with. Currently includes a hand-wavy stop
 368          * count of 8, to not spend too much time checking for merges.
 369          */
 370         if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
 371                 ret = true;
 372
 373         spin_unlock(&ctx->lock);
 374 out_put:
 375         return ret;
 376 }
 377
 378 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
 379                                    struct list_head *free)
 380 {
 381         return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
 382 }
 383 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
 384
 385 static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
 386                                        struct request *rq)
 387 {
 388         /*
 389          * dispatch flush and passthrough rq directly
 390          *
 391          * passthrough request has to be added to hctx->dispatch directly.
 392          * For some reason, device may be in one situation which can't
 393          * handle FS request, so STS_RESOURCE is always returned and the
 394          * FS request will be added to hctx->dispatch. However passthrough
 395          * request may be required at that time for fixing the problem. If
 396          * passthrough request is added to scheduler queue, there isn't any
 397          * chance to dispatch it given we prioritize requests in hctx->dispatch.
 398          */
 399         if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
 400                 return true;
 401
 402         return false;
 403 }
 404
 405 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
 406                                  bool run_queue, bool async)
 407 {
 408         struct request_queue *q = rq->q;
 409         struct elevator_queue *e = q->elevator;
 410         struct blk_mq_ctx *ctx = rq->mq_ctx;
 411         struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 412
 413         WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));
 414
 415         if (blk_mq_sched_bypass_insert(hctx, rq)) {
 416                 /*
 417                  * Firstly normal IO request is inserted to scheduler queue or
 418                  * sw queue, meantime we add flush request to dispatch queue(
 419                  * hctx->dispatch) directly and there is at most one in-flight
 420                  * flush request for each hw queue, so it doesn't matter to add
 421                  * flush request to tail or front of the dispatch queue.
 422                  *
 423                  * Secondly in case of NCQ, flush request belongs to non-NCQ
 424                  * command, and queueing it will fail when there is any
 425                  * in-flight normal IO request(NCQ command). When adding flush
 426                  * rq to the front of hctx->dispatch, it is easier to introduce
 427                  * extra time to flush rq's latency because of S_SCHED_RESTART
 428                  * compared with adding to the tail of dispatch queue, then
 429                  * chance of flush merge is increased, and less flush requests
 430                  * will be issued to controller. It is observed that ~10% time
 431                  * is saved in blktests block/004 on disk attached to AHCI/NCQ
 432                  * drive when adding flush rq to the front of hctx->dispatch.
 433                  *
 434                  * Simply queue flush rq to the front of hctx->dispatch so that
 435                  * intensive flush workloads can benefit in case of NCQ HW.
 436                  */
 437                 at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head;
 438                 blk_mq_request_bypass_insert(rq, at_head, false);
 439                 goto run;
 440         }
 441
 442         if (e) {
 443                 LIST_HEAD(list);
 444
 445                 list_add(&rq->queuelist, &list);
 446                 e->type->ops.insert_requests(hctx, &list, at_head);
 447         } else {
 448                 spin_lock(&ctx->lock);
 449                 __blk_mq_insert_request(hctx, rq, at_head);
 450                 spin_unlock(&ctx->lock);
 451         }
 452
 453 run:
 454         if (run_queue)
 455                 blk_mq_run_hw_queue(hctx, async);
 456 }
 457
 458 static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
 459                                           struct blk_mq_hw_ctx *hctx,
 460                                           unsigned int hctx_idx)
 461 {
 462         if (blk_mq_is_shared_tags(q->tag_set->flags)) {
 463                 hctx->sched_tags = q->sched_shared_tags;
 464                 return 0;
 465         }
 466
 467         hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
 468                                                     q->nr_requests);
 469
 470         if (!hctx->sched_tags)
 471                 return -ENOMEM;
 472         return 0;
 473 }
 474
/*
 * Release the queue-wide shared scheduler tag map of @queue via
 * blk_mq_free_rq_map() and clear the pointer so it cannot be reused.
 */
static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
        blk_mq_free_rq_map(queue->sched_shared_tags);
        queue->sched_shared_tags = NULL;
}
 480
 481 /* called in queue's release handler, tagset has gone away */
 482 static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
 483 {
 484         struct blk_mq_hw_ctx *hctx;
 485         unsigned long i;
 486
 487         queue_for_each_hw_ctx(q, hctx, i) {
 488                 if (hctx->sched_tags) {
 489                         if (!blk_mq_is_shared_tags(flags))
 490                                 blk_mq_free_rq_map(hctx->sched_tags);
 491                         hctx->sched_tags = NULL;
 492                 }
 493         }
 494
 495         if (blk_mq_is_shared_tags(flags))
 496                 blk_mq_exit_sched_shared_tags(q);
 497 }
 498
 499 static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
 500 {
 501         struct blk_mq_tag_set *set = queue->tag_set;
 502
 503         /*
 504          * Set initial depth at max so that we don't need to reallocate for
 505          * updating nr_requests.
 506          */
 507         queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
 508                                                 BLK_MQ_NO_HCTX_IDX,
 509                                                 MAX_SCHED_RQ);
 510         if (!queue->sched_shared_tags)
 511                 return -ENOMEM;
 512
 513         blk_mq_tag_update_sched_shared_tags(queue);
 514
 515         return 0;
 516 }
 517
 518 /* caller must have a reference to @e, will grab another one if successful */
 519 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 520 {
 521         unsigned int flags = q->tag_set->flags;
 522         struct blk_mq_hw_ctx *hctx;
 523         struct elevator_queue *eq;
 524         unsigned long i;
 525         int ret;
 526
 527         /*
 528          * Default to double of smaller one between hw queue_depth and 128,
 529          * since we don't split into sync/async like the old code did.
 530          * Additionally, this is a per-hw queue depth.
 531          */
 532         q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
 533                                    BLKDEV_DEFAULT_RQ);
 534
 535         if (blk_mq_is_shared_tags(flags)) {
 536                 ret = blk_mq_init_sched_shared_tags(q);
 537                 if (ret)
 538                         return ret;
 539         }
 540
 541         queue_for_each_hw_ctx(q, hctx, i) {
 542                 ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
 543                 if (ret)
 544                         goto err_free_map_and_rqs;
 545         }
 546
 547         ret = e->ops.init_sched(q, e);
 548         if (ret)
 549                 goto err_free_map_and_rqs;
 550
 551         mutex_lock(&q->debugfs_mutex);
 552         blk_mq_debugfs_register_sched(q);
 553         mutex_unlock(&q->debugfs_mutex);
 554
 555         queue_for_each_hw_ctx(q, hctx, i) {
 556                 if (e->ops.init_hctx) {
 557                         ret = e->ops.init_hctx(hctx, i);
 558                         if (ret) {
 559                                 eq = q->elevator;
 560                                 blk_mq_sched_free_rqs(q);
 561                                 blk_mq_exit_sched(q, eq);
 562                                 kobject_put(&eq->kobj);
 563                                 return ret;
 564                         }
 565                 }
 566                 mutex_lock(&q->debugfs_mutex);
 567                 blk_mq_debugfs_register_sched_hctx(q, hctx);
 568                 mutex_unlock(&q->debugfs_mutex);
 569         }
 570
 571         return 0;
 572
 573 err_free_map_and_rqs:
 574         blk_mq_sched_free_rqs(q);
 575         blk_mq_sched_tags_teardown(q, flags);
 576
 577         q->elevator = NULL;
 578         return ret;
 579 }
 580
 581 /*
 582  * called in either blk_queue_cleanup or elevator_switch, tagset
 583  * is required for freeing requests
 584  */
 585 void blk_mq_sched_free_rqs(struct request_queue *q)
 586 {
 587         struct blk_mq_hw_ctx *hctx;
 588         unsigned long i;
 589
 590         if (blk_mq_is_shared_tags(q->tag_set->flags)) {
 591                 blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
 592                                 BLK_MQ_NO_HCTX_IDX);
 593         } else {
 594                 queue_for_each_hw_ctx(q, hctx, i) {
 595                         if (hctx->sched_tags)
 596                                 blk_mq_free_rqs(q->tag_set,
 597                                                 hctx->sched_tags, i);
 598                 }
 599         }
 600 }
 601
 602 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 603 {
 604         struct blk_mq_hw_ctx *hctx;
 605         unsigned long i;
 606         unsigned int flags = 0;
 607
 608         queue_for_each_hw_ctx(q, hctx, i) {
 609                 mutex_lock(&q->debugfs_mutex);
 610                 blk_mq_debugfs_unregister_sched_hctx(hctx);
 611                 mutex_unlock(&q->debugfs_mutex);
 612
 613                 if (e->type->ops.exit_hctx && hctx->sched_data) {
 614                         e->type->ops.exit_hctx(hctx, i);
 615                         hctx->sched_data = NULL;
 616                 }
 617                 flags = hctx->flags;
 618         }
 619
 620         mutex_lock(&q->debugfs_mutex);
 621         blk_mq_debugfs_unregister_sched(q);
 622         mutex_unlock(&q->debugfs_mutex);
 623
 624         if (e->type->ops.exit_sched)
 625                 e->type->ops.exit_sched(e);
 626         blk_mq_sched_tags_teardown(q, flags);
 627         q->elevator = NULL;
 628 }