block/blk-mq-sched.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * blk-mq scheduling framework
   4  *
   5  * Copyright (C) 2016 Jens Axboe
   6  */
   7 #include <linux/kernel.h>
   8 #include <linux/module.h>
   9 #include <linux/list_sort.h>
  10
  11 #include <trace/events/block.h>
  12
  13 #include "blk.h"
  14 #include "blk-mq.h"
  15 #include "blk-mq-debugfs.h"
  16 #include "blk-mq-sched.h"
  17 #include "blk-wbt.h"
  18
  19 /*
  20  * Mark a hardware queue as needing a restart.
  21  */
  22 void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
  23 {
  24         if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
  25                 return;
  26
  27         set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
  28 }
  29 EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
  30
  31 void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
  32 {
  33         clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
  34
  35         /*
  36          * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch)
  37          * in blk_mq_run_hw_queue(). Its pair is the barrier in
  38          * blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART,
  39          * meantime new request added to hctx->dispatch is missed to check in
  40          * blk_mq_run_hw_queue().
  41          */
  42         smp_mb();
  43
  44         blk_mq_run_hw_queue(hctx, true);
  45 }
  46
  47 static int sched_rq_cmp(void *priv, const struct list_head *a,
  48                         const struct list_head *b)
  49 {
  50         struct request *rqa = container_of(a, struct request, queuelist);
  51         struct request *rqb = container_of(b, struct request, queuelist);
  52
  53         return rqa->mq_hctx > rqb->mq_hctx;
  54 }
  55
  56 static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
  57 {
  58         struct blk_mq_hw_ctx *hctx =
  59                 list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
  60         struct request *rq;
  61         LIST_HEAD(hctx_list);
  62
  63         list_for_each_entry(rq, rq_list, queuelist) {
  64                 if (rq->mq_hctx != hctx) {
  65                         list_cut_before(&hctx_list, rq_list, &rq->queuelist);
  66                         goto dispatch;
  67                 }
  68         }
  69         list_splice_tail_init(rq_list, &hctx_list);
  70
  71 dispatch:
  72         return blk_mq_dispatch_rq_list(hctx, &hctx_list, false);
  73 }
  74
  75 #define BLK_MQ_BUDGET_DELAY     3               /* ms units */
  76
  77 /*
  78  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  79  * its queue by itself in its completion handler, so we don't need to
  80  * restart queue if .get_budget() fails to get the budget.
  81  *
  82  * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
  83  * be run again.  This is necessary to avoid starving flushes.
  84  */
  85 static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
  86 {
  87         struct request_queue *q = hctx->queue;
  88         struct elevator_queue *e = q->elevator;
  89         bool multi_hctxs = false, run_queue = false;
  90         bool dispatched = false, busy = false;
  91         unsigned int max_dispatch;
  92         LIST_HEAD(rq_list);
  93         int count = 0;
  94
  95         if (hctx->dispatch_busy)
  96                 max_dispatch = 1;
  97         else
  98                 max_dispatch = hctx->queue->nr_requests;
  99
 100         do {
 101                 struct request *rq;
 102                 int budget_token;
 103
 104                 if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 105                         break;
 106
 107                 if (!list_empty_careful(&hctx->dispatch)) {
 108                         busy = true;
 109                         break;
 110                 }
 111
 112                 budget_token = blk_mq_get_dispatch_budget(q);
 113                 if (budget_token < 0)
 114                         break;
 115
 116                 rq = e->type->ops.dispatch_request(hctx);
 117                 if (!rq) {
 118                         blk_mq_put_dispatch_budget(q, budget_token);
 119                         /*
 120                          * We're releasing without dispatching. Holding the
 121                          * budget could have blocked any "hctx"s with the
 122                          * same queue and if we didn't dispatch then there's
 123                          * no guarantee anyone will kick the queue.  Kick it
 124                          * ourselves.
 125                          */
 126                         run_queue = true;
 127                         break;
 128                 }
 129
 130                 blk_mq_set_rq_budget_token(rq, budget_token);
 131
 132                 /*
 133                  * Now this rq owns the budget which has to be released
 134                  * if this rq won't be queued to driver via .queue_rq()
 135                  * in blk_mq_dispatch_rq_list().
 136                  */
 137                 list_add_tail(&rq->queuelist, &rq_list);
 138                 count++;
 139                 if (rq->mq_hctx != hctx)
 140                         multi_hctxs = true;
 141
 142                 /*
 143                  * If we cannot get tag for the request, stop dequeueing
 144                  * requests from the IO scheduler. We are unlikely to be able
 145                  * to submit them anyway and it creates false impression for
 146                  * scheduling heuristics that the device can take more IO.
 147                  */
 148                 if (!blk_mq_get_driver_tag(rq))
 149                         break;
 150         } while (count < max_dispatch);
 151
 152         if (!count) {
 153                 if (run_queue)
 154                         blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
 155         } else if (multi_hctxs) {
 156                 /*
 157                  * Requests from different hctx may be dequeued from some
 158                  * schedulers, such as bfq and deadline.
 159                  *
 160                  * Sort the requests in the list according to their hctx,
 161                  * dispatch batching requests from same hctx at a time.
 162                  */
 163                 list_sort(NULL, &rq_list, sched_rq_cmp);
 164                 do {
 165                         dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
 166                 } while (!list_empty(&rq_list));
 167         } else {
 168                 dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, false);
 169         }
 170
 171         if (busy)
 172                 return -EAGAIN;
 173         return !!dispatched;
 174 }
 175
 176 static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 177 {
 178         unsigned long end = jiffies + HZ;
 179         int ret;
 180
 181         do {
 182                 ret = __blk_mq_do_dispatch_sched(hctx);
 183                 if (ret != 1)
 184                         break;
 185                 if (need_resched() || time_is_before_jiffies(end)) {
 186                         blk_mq_delay_run_hw_queue(hctx, 0);
 187                         break;
 188                 }
 189         } while (1);
 190
 191         return ret;
 192 }
 193
 194 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
 195                                           struct blk_mq_ctx *ctx)
 196 {
 197         unsigned short idx = ctx->index_hw[hctx->type];
 198
 199         if (++idx == hctx->nr_ctx)
 200                 idx = 0;
 201
 202         return hctx->ctxs[idx];
 203 }
 204
 205 /*
 206  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 207  * its queue by itself in its completion handler, so we don't need to
 208  * restart queue if .get_budget() fails to get the budget.
 209  *
 210  * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
 211  * be run again.  This is necessary to avoid starving flushes.
 212  */
 213 static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 214 {
 215         struct request_queue *q = hctx->queue;
 216         LIST_HEAD(rq_list);
 217         struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
 218         int ret = 0;
 219         struct request *rq;
 220
 221         do {
 222                 int budget_token;
 223
 224                 if (!list_empty_careful(&hctx->dispatch)) {
 225                         ret = -EAGAIN;
 226                         break;
 227                 }
 228
 229                 if (!sbitmap_any_bit_set(&hctx->ctx_map))
 230                         break;
 231
 232                 budget_token = blk_mq_get_dispatch_budget(q);
 233                 if (budget_token < 0)
 234                         break;
 235
 236                 rq = blk_mq_dequeue_from_ctx(hctx, ctx);
 237                 if (!rq) {
 238                         blk_mq_put_dispatch_budget(q, budget_token);
 239                         /*
 240                          * We're releasing without dispatching. Holding the
 241                          * budget could have blocked any "hctx"s with the
 242                          * same queue and if we didn't dispatch then there's
 243                          * no guarantee anyone will kick the queue.  Kick it
 244                          * ourselves.
 245                          */
 246                         blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
 247                         break;
 248                 }
 249
 250                 blk_mq_set_rq_budget_token(rq, budget_token);
 251
 252                 /*
 253                  * Now this rq owns the budget which has to be released
 254                  * if this rq won't be queued to driver via .queue_rq()
 255                  * in blk_mq_dispatch_rq_list().
 256                  */
 257                 list_add(&rq->queuelist, &rq_list);
 258
 259                 /* round robin for fair dispatch */
 260                 ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
 261
 262         } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, false));
 263
 264         WRITE_ONCE(hctx->dispatch_from, ctx);
 265         return ret;
 266 }
 267
 268 static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 269 {
 270         bool need_dispatch = false;
 271         LIST_HEAD(rq_list);
 272
 273         /*
 274          * If we have previous entries on our dispatch list, grab them first for
 275          * more fair dispatch.
 276          */
 277         if (!list_empty_careful(&hctx->dispatch)) {
 278                 spin_lock(&hctx->lock);
 279                 if (!list_empty(&hctx->dispatch))
 280                         list_splice_init(&hctx->dispatch, &rq_list);
 281                 spin_unlock(&hctx->lock);
 282         }
 283
 284         /*
 285          * Only ask the scheduler for requests, if we didn't have residual
 286          * requests from the dispatch list. This is to avoid the case where
 287          * we only ever dispatch a fraction of the requests available because
 288          * of low device queue depth. Once we pull requests out of the IO
 289          * scheduler, we can no longer merge or sort them. So it's best to
 290          * leave them there for as long as we can. Mark the hw queue as
 291          * needing a restart in that case.
 292          *
 293          * We want to dispatch from the scheduler if there was nothing
 294          * on the dispatch list or we were able to dispatch from the
 295          * dispatch list.
 296          */
 297         if (!list_empty(&rq_list)) {
 298                 blk_mq_sched_mark_restart_hctx(hctx);
 299                 if (!blk_mq_dispatch_rq_list(hctx, &rq_list, true))
 300                         return 0;
 301                 need_dispatch = true;
 302         } else {
 303                 need_dispatch = hctx->dispatch_busy;
 304         }
 305
 306         if (hctx->queue->elevator)
 307                 return blk_mq_do_dispatch_sched(hctx);
 308
 309         /* dequeue request one by one from sw queue if queue is busy */
 310         if (need_dispatch)
 311                 return blk_mq_do_dispatch_ctx(hctx);
 312         blk_mq_flush_busy_ctxs(hctx, &rq_list);
 313         blk_mq_dispatch_rq_list(hctx, &rq_list, true);
 314         return 0;
 315 }
 316
 317 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 318 {
 319         struct request_queue *q = hctx->queue;
 320
 321         /* RCU or SRCU read lock is needed before checking quiesced flag */
 322         if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
 323                 return;
 324
 325         /*
 326          * A return of -EAGAIN is an indication that hctx->dispatch is not
 327          * empty and we must run again in order to avoid starving flushes.
 328          */
 329         if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
 330                 if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
 331                         blk_mq_run_hw_queue(hctx, true);
 332         }
 333 }
 334
 335 bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
 336                 unsigned int nr_segs)
 337 {
 338         struct elevator_queue *e = q->elevator;
 339         struct blk_mq_ctx *ctx;
 340         struct blk_mq_hw_ctx *hctx;
 341         bool ret = false;
 342         enum hctx_type type;
 343
 344         if (e && e->type->ops.bio_merge) {
 345                 ret = e->type->ops.bio_merge(q, bio, nr_segs);
 346                 goto out_put;
 347         }
 348
 349         ctx = blk_mq_get_ctx(q);
 350         hctx = blk_mq_map_queue(bio->bi_opf, ctx);
 351         type = hctx->type;
 352         if (list_empty_careful(&ctx->rq_lists[type]))
 353                 goto out_put;
 354
 355         /* default per sw-queue merge */
 356         spin_lock(&ctx->lock);
 357         /*
 358          * Reverse check our software queue for entries that we could
 359          * potentially merge with. Currently includes a hand-wavy stop
 360          * count of 8, to not spend too much time checking for merges.
 361          */
 362         if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
 363                 ret = true;
 364
 365         spin_unlock(&ctx->lock);
 366 out_put:
 367         return ret;
 368 }
 369
 370 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
 371                                    struct list_head *free)
 372 {
 373         return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
 374 }
 375 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
 376
 377 static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
 378                                           struct blk_mq_hw_ctx *hctx,
 379                                           unsigned int hctx_idx)
 380 {
 381         if (blk_mq_is_shared_tags(q->tag_set->flags)) {
 382                 hctx->sched_tags = q->sched_shared_tags;
 383                 return 0;
 384         }
 385
 386         hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
 387                                                     q->nr_requests);
 388
 389         if (!hctx->sched_tags)
 390                 return -ENOMEM;
 391         return 0;
 392 }
 393
 394 static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
 395 {
 396         blk_mq_free_rq_map(queue->sched_shared_tags);
 397         queue->sched_shared_tags = NULL;
 398 }
 399
 400 /* called in queue's release handler, tagset has gone away */
 401 static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
 402 {
 403         struct blk_mq_hw_ctx *hctx;
 404         unsigned long i;
 405
 406         queue_for_each_hw_ctx(q, hctx, i) {
 407                 if (hctx->sched_tags) {
 408                         if (!blk_mq_is_shared_tags(flags))
 409                                 blk_mq_free_rq_map(hctx->sched_tags);
 410                         hctx->sched_tags = NULL;
 411                 }
 412         }
 413
 414         if (blk_mq_is_shared_tags(flags))
 415                 blk_mq_exit_sched_shared_tags(q);
 416 }
 417
 418 static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
 419 {
 420         struct blk_mq_tag_set *set = queue->tag_set;
 421
 422         /*
 423          * Set initial depth at max so that we don't need to reallocate for
 424          * updating nr_requests.
 425          */
 426         queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
 427                                                 BLK_MQ_NO_HCTX_IDX,
 428                                                 MAX_SCHED_RQ);
 429         if (!queue->sched_shared_tags)
 430                 return -ENOMEM;
 431
 432         blk_mq_tag_update_sched_shared_tags(queue);
 433
 434         return 0;
 435 }
 436
 437 void blk_mq_sched_reg_debugfs(struct request_queue *q)
 438 {
 439         struct blk_mq_hw_ctx *hctx;
 440         unsigned long i;
 441
 442         mutex_lock(&q->debugfs_mutex);
 443         blk_mq_debugfs_register_sched(q);
 444         queue_for_each_hw_ctx(q, hctx, i)
 445                 blk_mq_debugfs_register_sched_hctx(q, hctx);
 446         mutex_unlock(&q->debugfs_mutex);
 447 }
 448
 449 void blk_mq_sched_unreg_debugfs(struct request_queue *q)
 450 {
 451         struct blk_mq_hw_ctx *hctx;
 452         unsigned long i;
 453
 454         mutex_lock(&q->debugfs_mutex);
 455         queue_for_each_hw_ctx(q, hctx, i)
 456                 blk_mq_debugfs_unregister_sched_hctx(hctx);
 457         blk_mq_debugfs_unregister_sched(q);
 458         mutex_unlock(&q->debugfs_mutex);
 459 }
 460
 461 /* caller must have a reference to @e, will grab another one if successful */
 462 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 463 {
 464         unsigned int flags = q->tag_set->flags;
 465         struct blk_mq_hw_ctx *hctx;
 466         struct elevator_queue *eq;
 467         unsigned long i;
 468         int ret;
 469
 470         /*
 471          * Default to double of smaller one between hw queue_depth and 128,
 472          * since we don't split into sync/async like the old code did.
 473          * Additionally, this is a per-hw queue depth.
 474          */
 475         q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
 476                                    BLKDEV_DEFAULT_RQ);
 477
 478         if (blk_mq_is_shared_tags(flags)) {
 479                 ret = blk_mq_init_sched_shared_tags(q);
 480                 if (ret)
 481                         return ret;
 482         }
 483
 484         queue_for_each_hw_ctx(q, hctx, i) {
 485                 ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
 486                 if (ret)
 487                         goto err_free_map_and_rqs;
 488         }
 489
 490         ret = e->ops.init_sched(q, e);
 491         if (ret)
 492                 goto err_free_map_and_rqs;
 493
 494         queue_for_each_hw_ctx(q, hctx, i) {
 495                 if (e->ops.init_hctx) {
 496                         ret = e->ops.init_hctx(hctx, i);
 497                         if (ret) {
 498                                 eq = q->elevator;
 499                                 blk_mq_sched_free_rqs(q);
 500                                 blk_mq_exit_sched(q, eq);
 501                                 kobject_put(&eq->kobj);
 502                                 return ret;
 503                         }
 504                 }
 505         }
 506         return 0;
 507
 508 err_free_map_and_rqs:
 509         blk_mq_sched_free_rqs(q);
 510         blk_mq_sched_tags_teardown(q, flags);
 511
 512         q->elevator = NULL;
 513         return ret;
 514 }
 515
 516 /*
 517  * called in either blk_queue_cleanup or elevator_switch, tagset
 518  * is required for freeing requests
 519  */
 520 void blk_mq_sched_free_rqs(struct request_queue *q)
 521 {
 522         struct blk_mq_hw_ctx *hctx;
 523         unsigned long i;
 524
 525         if (blk_mq_is_shared_tags(q->tag_set->flags)) {
 526                 blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
 527                                 BLK_MQ_NO_HCTX_IDX);
 528         } else {
 529                 queue_for_each_hw_ctx(q, hctx, i) {
 530                         if (hctx->sched_tags)
 531                                 blk_mq_free_rqs(q->tag_set,
 532                                                 hctx->sched_tags, i);
 533                 }
 534         }
 535 }
 536
 537 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 538 {
 539         struct blk_mq_hw_ctx *hctx;
 540         unsigned long i;
 541         unsigned int flags = 0;
 542
 543         queue_for_each_hw_ctx(q, hctx, i) {
 544                 if (e->type->ops.exit_hctx && hctx->sched_data) {
 545                         e->type->ops.exit_hctx(hctx, i);
 546                         hctx->sched_data = NULL;
 547                 }
 548                 flags = hctx->flags;
 549         }
 550
 551         if (e->type->ops.exit_sched)
 552                 e->type->ops.exit_sched(e);
 553         blk_mq_sched_tags_teardown(q, flags);
 554         set_bit(ELEVATOR_FLAG_DYING, &q->elevator->flags);
 555         q->elevator = NULL;
 556 }