block/kyber-iosched.c

   1 /*
   2  * The Kyber I/O scheduler. Controls latency by throttling queue depths using
   3  * scalable techniques.
   4  *
   5  * Copyright (C) 2017 Facebook
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public
   9  * License v2 as published by the Free Software Foundation.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  18  */
  19
  20 #include <linux/kernel.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/blk-mq.h>
  23 #include <linux/elevator.h>
  24 #include <linux/module.h>
  25 #include <linux/sbitmap.h>
  26
  27 #include "blk.h"
  28 #include "blk-mq.h"
  29 #include "blk-mq-debugfs.h"
  30 #include "blk-mq-sched.h"
  31 #include "blk-mq-tag.h"
  32 #include "blk-stat.h"
  33
  34 /* Scheduling domains. */
  35 enum {
  36         KYBER_READ,
  37         KYBER_SYNC_WRITE,
  38         KYBER_OTHER, /* Async writes, discard, etc. */
  39         KYBER_NUM_DOMAINS,
  40 };
  41
  42 enum {
  43         /*
  44          * In order to prevent starvation of synchronous requests by a flood of
  45          * asynchronous requests, we reserve 25% of requests for synchronous
  46          * operations.
  47          */
  48         KYBER_ASYNC_PERCENT = 75,
  49 };
  50
  51 /*
  52  * Initial device-wide depths for each scheduling domain.
  53  *
  54  * Even for fast devices with lots of tags like NVMe, you can saturate
  55  * the device with only a fraction of the maximum possible queue depth.
  56  * So, we cap these to a reasonable value.
  57  */
  58 static const unsigned int kyber_depth[] = {
  59         [KYBER_READ] = 256,
  60         [KYBER_SYNC_WRITE] = 128,
  61         [KYBER_OTHER] = 64,
  62 };
  63
  64 /*
  65  * Scheduling domain batch sizes. We favor reads.
  66  */
  67 static const unsigned int kyber_batch_size[] = {
  68         [KYBER_READ] = 16,
  69         [KYBER_SYNC_WRITE] = 8,
  70         [KYBER_OTHER] = 8,
  71 };
  72
/*
 * Per-software-queue request staging area. kcqs map to a khd the same way
 * ctxs map to an hctx, so request->mq_ctx->index_hw is used to index the kcq
 * within its khd.
 */
struct kyber_ctx_queue {
        /*
         * Used to ensure that operations on rq_list and kcq_map are atomic.
         * Also protects the rqs on rq_list during merges.
         */
        spinlock_t lock;
        /* Pending requests, one list per scheduling domain. */
        struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;
  85
  86 struct kyber_queue_data {
  87         struct request_queue *q;
  88
  89         struct blk_stat_callback *cb;
  90
  91         /*
  92          * The device is divided into multiple scheduling domains based on the
  93          * request type. Each domain has a fixed number of in-flight requests of
  94          * that type device-wide, limited by these tokens.
  95          */
  96         struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
  97
  98         /*
  99          * Async request percentage, converted to per-word depth for
 100          * sbitmap_get_shallow().
 101          */
 102         unsigned int async_depth;
 103
 104         /* Target latencies in nanoseconds. */
 105         u64 read_lat_nsec, write_lat_nsec;
 106 };
 107
/* Per-hardware-queue scheduler state, stored in hctx->sched_data. */
struct kyber_hctx_data {
        /* Held by kyber_dispatch_request() for the whole dispatch attempt. */
        spinlock_t lock;
        /* Requests already flushed out of the kcqs, one list per domain. */
        struct list_head rqs[KYBER_NUM_DOMAINS];
        /* Domain the current batch is being dispatched from. */
        unsigned int cur_domain;
        /* Number of requests dispatched so far in the current batch. */
        unsigned int batching;
        /* Array of hctx->nr_ctx per-software-queue staging queues. */
        struct kyber_ctx_queue *kcqs;
        /* Per domain: bit N is set when kcqs[N] had requests inserted. */
        struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
        /* Wait entries that rerun the hardware queue via kyber_domain_wake(). */
        wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
        /* Wait state each domain_wait entry was most recently queued on. */
        struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
        /* Per-domain index handed to sbq_wait_ptr() when picking a wait state. */
        atomic_t wait_index[KYBER_NUM_DOMAINS];
};
 119
 120 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 121                              void *key);
 122
 123 static unsigned int kyber_sched_domain(unsigned int op)
 124 {
 125         if ((op & REQ_OP_MASK) == REQ_OP_READ)
 126                 return KYBER_READ;
 127         else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
 128                 return KYBER_SYNC_WRITE;
 129         else
 130                 return KYBER_OTHER;
 131 }
 132
 133 enum {
 134         NONE = 0,
 135         GOOD = 1,
 136         GREAT = 2,
 137         BAD = -1,
 138         AWFUL = -2,
 139 };
 140
 141 #define IS_GOOD(status) ((status) > 0)
 142 #define IS_BAD(status) ((status) < 0)
 143
 144 static int kyber_lat_status(struct blk_stat_callback *cb,
 145                             unsigned int sched_domain, u64 target)
 146 {
 147         u64 latency;
 148
 149         if (!cb->stat[sched_domain].nr_samples)
 150                 return NONE;
 151
 152         latency = cb->stat[sched_domain].mean;
 153         if (latency >= 2 * target)
 154                 return AWFUL;
 155         else if (latency > target)
 156                 return BAD;
 157         else if (latency <= target / 2)
 158                 return GREAT;
 159         else /* (latency <= target) */
 160                 return GOOD;
 161 }
 162
 163 /*
 164  * Adjust the read or synchronous write depth given the status of reads and
 165  * writes. The goal is that the latencies of the two domains are fair (i.e., if
 166  * one is good, then the other is good).
 167  */
 168 static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
 169                                   unsigned int sched_domain, int this_status,
 170                                   int other_status)
 171 {
 172         unsigned int orig_depth, depth;
 173
 174         /*
 175          * If this domain had no samples, or reads and writes are both good or
 176          * both bad, don't adjust the depth.
 177          */
 178         if (this_status == NONE ||
 179             (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
 180             (IS_BAD(this_status) && IS_BAD(other_status)))
 181                 return;
 182
 183         orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
 184
 185         if (other_status == NONE) {
 186                 depth++;
 187         } else {
 188                 switch (this_status) {
 189                 case GOOD:
 190                         if (other_status == AWFUL)
 191                                 depth -= max(depth / 4, 1U);
 192                         else
 193                                 depth -= max(depth / 8, 1U);
 194                         break;
 195                 case GREAT:
 196                         if (other_status == AWFUL)
 197                                 depth /= 2;
 198                         else
 199                                 depth -= max(depth / 4, 1U);
 200                         break;
 201                 case BAD:
 202                         depth++;
 203                         break;
 204                 case AWFUL:
 205                         if (other_status == GREAT)
 206                                 depth += 2;
 207                         else
 208                                 depth++;
 209                         break;
 210                 }
 211         }
 212
 213         depth = clamp(depth, 1U, kyber_depth[sched_domain]);
 214         if (depth != orig_depth)
 215                 sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
 216 }
 217
 218 /*
 219  * Adjust the depth of other requests given the status of reads and synchronous
 220  * writes. As long as either domain is doing fine, we don't throttle, but if
 221  * both domains are doing badly, we throttle heavily.
 222  */
 223 static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
 224                                      int read_status, int write_status,
 225                                      bool have_samples)
 226 {
 227         unsigned int orig_depth, depth;
 228         int status;
 229
 230         orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
 231
 232         if (read_status == NONE && write_status == NONE) {
 233                 depth += 2;
 234         } else if (have_samples) {
 235                 if (read_status == NONE)
 236                         status = write_status;
 237                 else if (write_status == NONE)
 238                         status = read_status;
 239                 else
 240                         status = max(read_status, write_status);
 241                 switch (status) {
 242                 case GREAT:
 243                         depth += 2;
 244                         break;
 245                 case GOOD:
 246                         depth++;
 247                         break;
 248                 case BAD:
 249                         depth -= max(depth / 4, 1U);
 250                         break;
 251                 case AWFUL:
 252                         depth /= 2;
 253                         break;
 254                 }
 255         }
 256
 257         depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
 258         if (depth != orig_depth)
 259                 sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
 260 }
 261
 262 /*
 263  * Apply heuristics for limiting queue depths based on gathered latency
 264  * statistics.
 265  */
 266 static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
 267 {
 268         struct kyber_queue_data *kqd = cb->data;
 269         int read_status, write_status;
 270
 271         read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
 272         write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
 273
 274         kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
 275         kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
 276         kyber_adjust_other_depth(kqd, read_status, write_status,
 277                                  cb->stat[KYBER_OTHER].nr_samples != 0);
 278
 279         /*
 280          * Continue monitoring latencies if we aren't hitting the targets or
 281          * we're still throttling other requests.
 282          */
 283         if (!blk_stat_is_active(kqd->cb) &&
 284             ((IS_BAD(read_status) || IS_BAD(write_status) ||
 285               kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
 286                 blk_stat_activate_msecs(kqd->cb, 100);
 287 }
 288
 289 static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
 290 {
 291         /*
 292          * All of the hardware queues have the same depth, so we can just grab
 293          * the shift of the first one.
 294          */
 295         return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 296 }
 297
 298 static int kyber_bucket_fn(const struct request *rq)
 299 {
 300         return kyber_sched_domain(rq->cmd_flags);
 301 }
 302
 303 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 304 {
 305         struct kyber_queue_data *kqd;
 306         unsigned int shift;
 307         int ret = -ENOMEM;
 308         int i;
 309
 310         kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
 311         if (!kqd)
 312                 goto err;
 313         kqd->q = q;
 314
 315         kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
 316                                           KYBER_NUM_DOMAINS, kqd);
 317         if (!kqd->cb)
 318                 goto err_kqd;
 319
 320         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 321                 WARN_ON(!kyber_depth[i]);
 322                 WARN_ON(!kyber_batch_size[i]);
 323                 ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
 324                                               kyber_depth[i], -1, false,
 325                                               GFP_KERNEL, q->node);
 326                 if (ret) {
 327                         while (--i >= 0)
 328                                 sbitmap_queue_free(&kqd->domain_tokens[i]);
 329                         goto err_cb;
 330                 }
 331         }
 332
 333         shift = kyber_sched_tags_shift(kqd);
 334         kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 335
 336         kqd->read_lat_nsec = 2000000ULL;
 337         kqd->write_lat_nsec = 10000000ULL;
 338
 339         return kqd;
 340
 341 err_cb:
 342         blk_stat_free_callback(kqd->cb);
 343 err_kqd:
 344         kfree(kqd);
 345 err:
 346         return ERR_PTR(ret);
 347 }
 348
 349 static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
 350 {
 351         struct kyber_queue_data *kqd;
 352         struct elevator_queue *eq;
 353
 354         eq = elevator_alloc(q, e);
 355         if (!eq)
 356                 return -ENOMEM;
 357
 358         kqd = kyber_queue_data_alloc(q);
 359         if (IS_ERR(kqd)) {
 360                 kobject_put(&eq->kobj);
 361                 return PTR_ERR(kqd);
 362         }
 363
 364         eq->elevator_data = kqd;
 365         q->elevator = eq;
 366
 367         blk_stat_add_callback(q, kqd->cb);
 368
 369         return 0;
 370 }
 371
 372 static void kyber_exit_sched(struct elevator_queue *e)
 373 {
 374         struct kyber_queue_data *kqd = e->elevator_data;
 375         struct request_queue *q = kqd->q;
 376         int i;
 377
 378         blk_stat_remove_callback(q, kqd->cb);
 379
 380         for (i = 0; i < KYBER_NUM_DOMAINS; i++)
 381                 sbitmap_queue_free(&kqd->domain_tokens[i]);
 382         blk_stat_free_callback(kqd->cb);
 383         kfree(kqd);
 384 }
 385
 386 static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
 387 {
 388         unsigned int i;
 389
 390         spin_lock_init(&kcq->lock);
 391         for (i = 0; i < KYBER_NUM_DOMAINS; i++)
 392                 INIT_LIST_HEAD(&kcq->rq_list[i]);
 393 }
 394
 395 static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 396 {
 397         struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 398         struct kyber_hctx_data *khd;
 399         int i;
 400
 401         khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
 402         if (!khd)
 403                 return -ENOMEM;
 404
 405         khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
 406                                        sizeof(struct kyber_ctx_queue),
 407                                        GFP_KERNEL, hctx->numa_node);
 408         if (!khd->kcqs)
 409                 goto err_khd;
 410
 411         for (i = 0; i < hctx->nr_ctx; i++)
 412                 kyber_ctx_queue_init(&khd->kcqs[i]);
 413
 414         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 415                 if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
 416                                       ilog2(8), GFP_KERNEL, hctx->numa_node)) {
 417                         while (--i >= 0)
 418                                 sbitmap_free(&khd->kcq_map[i]);
 419                         goto err_kcqs;
 420                 }
 421         }
 422
 423         spin_lock_init(&khd->lock);
 424
 425         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 426                 INIT_LIST_HEAD(&khd->rqs[i]);
 427                 init_waitqueue_func_entry(&khd->domain_wait[i],
 428                                           kyber_domain_wake);
 429                 khd->domain_wait[i].private = hctx;
 430                 INIT_LIST_HEAD(&khd->domain_wait[i].entry);
 431                 atomic_set(&khd->wait_index[i], 0);
 432         }
 433
 434         khd->cur_domain = 0;
 435         khd->batching = 0;
 436
 437         hctx->sched_data = khd;
 438         sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
 439                                         kqd->async_depth);
 440
 441         return 0;
 442
 443 err_kcqs:
 444         kfree(khd->kcqs);
 445 err_khd:
 446         kfree(khd);
 447         return -ENOMEM;
 448 }
 449
 450 static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 451 {
 452         struct kyber_hctx_data *khd = hctx->sched_data;
 453         int i;
 454
 455         for (i = 0; i < KYBER_NUM_DOMAINS; i++)
 456                 sbitmap_free(&khd->kcq_map[i]);
 457         kfree(khd->kcqs);
 458         kfree(hctx->sched_data);
 459 }
 460
 461 static int rq_get_domain_token(struct request *rq)
 462 {
 463         return (long)rq->elv.priv[0];
 464 }
 465
 466 static void rq_set_domain_token(struct request *rq, int token)
 467 {
 468         rq->elv.priv[0] = (void *)(long)token;
 469 }
 470
 471 static void rq_clear_domain_token(struct kyber_queue_data *kqd,
 472                                   struct request *rq)
 473 {
 474         unsigned int sched_domain;
 475         int nr;
 476
 477         nr = rq_get_domain_token(rq);
 478         if (nr != -1) {
 479                 sched_domain = kyber_sched_domain(rq->cmd_flags);
 480                 sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
 481                                     rq->mq_ctx->cpu);
 482         }
 483 }
 484
 485 static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
 486 {
 487         /*
 488          * We use the scheduler tags as per-hardware queue queueing tokens.
 489          * Async requests can be limited at this stage.
 490          */
 491         if (!op_is_sync(op)) {
 492                 struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
 493
 494                 data->shallow_depth = kqd->async_depth;
 495         }
 496 }
 497
 498 static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
 499 {
 500         struct kyber_hctx_data *khd = hctx->sched_data;
 501         struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
 502         struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
 503         unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
 504         struct list_head *rq_list = &kcq->rq_list[sched_domain];
 505         bool merged;
 506
 507         spin_lock(&kcq->lock);
 508         merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
 509         spin_unlock(&kcq->lock);
 510         blk_mq_put_ctx(ctx);
 511
 512         return merged;
 513 }
 514
 515 static void kyber_prepare_request(struct request *rq, struct bio *bio)
 516 {
 517         rq_set_domain_token(rq, -1);
 518 }
 519
 520 static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
 521                                   struct list_head *rq_list, bool at_head)
 522 {
 523         struct kyber_hctx_data *khd = hctx->sched_data;
 524         struct request *rq, *next;
 525
 526         list_for_each_entry_safe(rq, next, rq_list, queuelist) {
 527                 unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
 528                 struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
 529                 struct list_head *head = &kcq->rq_list[sched_domain];
 530
 531                 spin_lock(&kcq->lock);
 532                 if (at_head)
 533                         list_move(&rq->queuelist, head);
 534                 else
 535                         list_move_tail(&rq->queuelist, head);
 536                 sbitmap_set_bit(&khd->kcq_map[sched_domain],
 537                                 rq->mq_ctx->index_hw);
 538                 blk_mq_sched_request_inserted(rq);
 539                 spin_unlock(&kcq->lock);
 540         }
 541 }
 542
 543 static void kyber_finish_request(struct request *rq)
 544 {
 545         struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
 546
 547         rq_clear_domain_token(kqd, rq);
 548 }
 549
 550 static void kyber_completed_request(struct request *rq, u64 now)
 551 {
 552         struct request_queue *q = rq->q;
 553         struct kyber_queue_data *kqd = q->elevator->elevator_data;
 554         unsigned int sched_domain;
 555         u64 latency, target;
 556
 557         /*
 558          * Check if this request met our latency goal. If not, quickly gather
 559          * some statistics and start throttling.
 560          */
 561         sched_domain = kyber_sched_domain(rq->cmd_flags);
 562         switch (sched_domain) {
 563         case KYBER_READ:
 564                 target = kqd->read_lat_nsec;
 565                 break;
 566         case KYBER_SYNC_WRITE:
 567                 target = kqd->write_lat_nsec;
 568                 break;
 569         default:
 570                 return;
 571         }
 572
 573         /* If we are already monitoring latencies, don't check again. */
 574         if (blk_stat_is_active(kqd->cb))
 575                 return;
 576
 577         if (now < rq->io_start_time_ns)
 578                 return;
 579
 580         latency = now - rq->io_start_time_ns;
 581
 582         if (latency > target)
 583                 blk_stat_activate_msecs(kqd->cb, 10);
 584 }
 585
/* Arguments handed to flush_busy_kcq() through sbitmap_for_each_set(). */
struct flush_kcq_data {
        /* Hardware queue data whose kcqs are being flushed. */
        struct kyber_hctx_data *khd;
        /* Scheduling domain to flush. */
        unsigned int sched_domain;
        /* Destination list the flushed requests are appended to. */
        struct list_head *list;
};
 591
 592 static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
 593 {
 594         struct flush_kcq_data *flush_data = data;
 595         struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
 596
 597         spin_lock(&kcq->lock);
 598         list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
 599                               flush_data->list);
 600         sbitmap_clear_bit(sb, bitnr);
 601         spin_unlock(&kcq->lock);
 602
 603         return true;
 604 }
 605
 606 static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
 607                                   unsigned int sched_domain,
 608                                   struct list_head *list)
 609 {
 610         struct flush_kcq_data data = {
 611                 .khd = khd,
 612                 .sched_domain = sched_domain,
 613                 .list = list,
 614         };
 615
 616         sbitmap_for_each_set(&khd->kcq_map[sched_domain],
 617                              flush_busy_kcq, &data);
 618 }
 619
 620 static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 621                              void *key)
 622 {
 623         struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);
 624
 625         list_del_init(&wait->entry);
 626         blk_mq_run_hw_queue(hctx, true);
 627         return 1;
 628 }
 629
 630 static int kyber_get_domain_token(struct kyber_queue_data *kqd,
 631                                   struct kyber_hctx_data *khd,
 632                                   struct blk_mq_hw_ctx *hctx)
 633 {
 634         unsigned int sched_domain = khd->cur_domain;
 635         struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
 636         wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
 637         struct sbq_wait_state *ws;
 638         int nr;
 639
 640         nr = __sbitmap_queue_get(domain_tokens);
 641
 642         /*
 643          * If we failed to get a domain token, make sure the hardware queue is
 644          * run when one becomes available. Note that this is serialized on
 645          * khd->lock, but we still need to be careful about the waker.
 646          */
 647         if (nr < 0 && list_empty_careful(&wait->entry)) {
 648                 ws = sbq_wait_ptr(domain_tokens,
 649                                   &khd->wait_index[sched_domain]);
 650                 khd->domain_ws[sched_domain] = ws;
 651                 add_wait_queue(&ws->wait, wait);
 652
 653                 /*
 654                  * Try again in case a token was freed before we got on the wait
 655                  * queue.
 656                  */
 657                 nr = __sbitmap_queue_get(domain_tokens);
 658         }
 659
 660         /*
 661          * If we got a token while we were on the wait queue, remove ourselves
 662          * from the wait queue to ensure that all wake ups make forward
 663          * progress. It's possible that the waker already deleted the entry
 664          * between the !list_empty_careful() check and us grabbing the lock, but
 665          * list_del_init() is okay with that.
 666          */
 667         if (nr >= 0 && !list_empty_careful(&wait->entry)) {
 668                 ws = khd->domain_ws[sched_domain];
 669                 spin_lock_irq(&ws->wait.lock);
 670                 list_del_init(&wait->entry);
 671                 spin_unlock_irq(&ws->wait.lock);
 672         }
 673
 674         return nr;
 675 }
 676
 677 static struct request *
 678 kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
 679                           struct kyber_hctx_data *khd,
 680                           struct blk_mq_hw_ctx *hctx)
 681 {
 682         struct list_head *rqs;
 683         struct request *rq;
 684         int nr;
 685
 686         rqs = &khd->rqs[khd->cur_domain];
 687
 688         /*
 689          * If we already have a flushed request, then we just need to get a
 690          * token for it. Otherwise, if there are pending requests in the kcqs,
 691          * flush the kcqs, but only if we can get a token. If not, we should
 692          * leave the requests in the kcqs so that they can be merged. Note that
 693          * khd->lock serializes the flushes, so if we observed any bit set in
 694          * the kcq_map, we will always get a request.
 695          */
 696         rq = list_first_entry_or_null(rqs, struct request, queuelist);
 697         if (rq) {
 698                 nr = kyber_get_domain_token(kqd, khd, hctx);
 699                 if (nr >= 0) {
 700                         khd->batching++;
 701                         rq_set_domain_token(rq, nr);
 702                         list_del_init(&rq->queuelist);
 703                         return rq;
 704                 }
 705         } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
 706                 nr = kyber_get_domain_token(kqd, khd, hctx);
 707                 if (nr >= 0) {
 708                         kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
 709                         rq = list_first_entry(rqs, struct request, queuelist);
 710                         khd->batching++;
 711                         rq_set_domain_token(rq, nr);
 712                         list_del_init(&rq->queuelist);
 713                         return rq;
 714                 }
 715         }
 716
 717         /* There were either no pending requests or no tokens. */
 718         return NULL;
 719 }
 720
 721 static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
 722 {
 723         struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 724         struct kyber_hctx_data *khd = hctx->sched_data;
 725         struct request *rq;
 726         int i;
 727
 728         spin_lock(&khd->lock);
 729
 730         /*
 731          * First, if we are still entitled to batch, try to dispatch a request
 732          * from the batch.
 733          */
 734         if (khd->batching < kyber_batch_size[khd->cur_domain]) {
 735                 rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
 736                 if (rq)
 737                         goto out;
 738         }
 739
 740         /*
 741          * Either,
 742          * 1. We were no longer entitled to a batch.
 743          * 2. The domain we were batching didn't have any requests.
 744          * 3. The domain we were batching was out of tokens.
 745          *
 746          * Start another batch. Note that this wraps back around to the original
 747          * domain if no other domains have requests or tokens.
 748          */
 749         khd->batching = 0;
 750         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 751                 if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
 752                         khd->cur_domain = 0;
 753                 else
 754                         khd->cur_domain++;
 755
 756                 rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
 757                 if (rq)
 758                         goto out;
 759         }
 760
 761         rq = NULL;
 762 out:
 763         spin_unlock(&khd->lock);
 764         return rq;
 765 }
 766
 767 static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
 768 {
 769         struct kyber_hctx_data *khd = hctx->sched_data;
 770         int i;
 771
 772         for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
 773                 if (!list_empty_careful(&khd->rqs[i]) ||
 774                     sbitmap_any_bit_set(&khd->kcq_map[i]))
 775                         return true;
 776         }
 777
 778         return false;
 779 }
 780
 781 #define KYBER_LAT_SHOW_STORE(op)                                        \
 782 static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,          \
 783                                      char *page)                        \
 784 {                                                                       \
 785         struct kyber_queue_data *kqd = e->elevator_data;                \
 786                                                                         \
 787         return sprintf(page, "%llu\n", kqd->op##_lat_nsec);             \
 788 }                                                                       \
 789                                                                         \
 790 static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,         \
 791                                       const char *page, size_t count)   \
 792 {                                                                       \
 793         struct kyber_queue_data *kqd = e->elevator_data;                \
 794         unsigned long long nsec;                                        \
 795         int ret;                                                        \
 796                                                                         \
 797         ret = kstrtoull(page, 10, &nsec);                               \
 798         if (ret)                                                        \
 799                 return ret;                                             \
 800                                                                         \
 801         kqd->op##_lat_nsec = nsec;                                      \
 802                                                                         \
 803         return count;                                                   \
 804 }
 805 KYBER_LAT_SHOW_STORE(read);
 806 KYBER_LAT_SHOW_STORE(write);
 807 #undef KYBER_LAT_SHOW_STORE
 808
 809 #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
 810 static struct elv_fs_entry kyber_sched_attrs[] = {
 811         KYBER_LAT_ATTR(read),
 812         KYBER_LAT_ATTR(write),
 813         __ATTR_NULL
 814 };
 815 #undef KYBER_LAT_ATTR
 816
 817 #ifdef CONFIG_BLK_DEBUG_FS
 818 #define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)                        \
 819 static int kyber_##name##_tokens_show(void *data, struct seq_file *m)   \
 820 {                                                                       \
 821         struct request_queue *q = data;                                 \
 822         struct kyber_queue_data *kqd = q->elevator->elevator_data;      \
 823                                                                         \
 824         sbitmap_queue_show(&kqd->domain_tokens[domain], m);             \
 825         return 0;                                                       \
 826 }                                                                       \
 827                                                                         \
 828 static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)  \
 829         __acquires(&khd->lock)                                          \
 830 {                                                                       \
 831         struct blk_mq_hw_ctx *hctx = m->private;                        \
 832         struct kyber_hctx_data *khd = hctx->sched_data;                 \
 833                                                                         \
 834         spin_lock(&khd->lock);                                          \
 835         return seq_list_start(&khd->rqs[domain], *pos);                 \
 836 }                                                                       \
 837                                                                         \
 838 static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,       \
 839                                      loff_t *pos)                       \
 840 {                                                                       \
 841         struct blk_mq_hw_ctx *hctx = m->private;                        \
 842         struct kyber_hctx_data *khd = hctx->sched_data;                 \
 843                                                                         \
 844         return seq_list_next(v, &khd->rqs[domain], pos);                \
 845 }                                                                       \
 846                                                                         \
 847 static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)        \
 848         __releases(&khd->lock)                                          \
 849 {                                                                       \
 850         struct blk_mq_hw_ctx *hctx = m->private;                        \
 851         struct kyber_hctx_data *khd = hctx->sched_data;                 \
 852                                                                         \
 853         spin_unlock(&khd->lock);                                        \
 854 }                                                                       \
 855                                                                         \
 856 static const struct seq_operations kyber_##name##_rqs_seq_ops = {       \
 857         .start  = kyber_##name##_rqs_start,                             \
 858         .next   = kyber_##name##_rqs_next,                              \
 859         .stop   = kyber_##name##_rqs_stop,                              \
 860         .show   = blk_mq_debugfs_rq_show,                               \
 861 };                                                                      \
 862                                                                         \
 863 static int kyber_##name##_waiting_show(void *data, struct seq_file *m)  \
 864 {                                                                       \
 865         struct blk_mq_hw_ctx *hctx = data;                              \
 866         struct kyber_hctx_data *khd = hctx->sched_data;                 \
 867         wait_queue_entry_t *wait = &khd->domain_wait[domain];           \
 868                                                                         \
 869         seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));       \
 870         return 0;                                                       \
 871 }
 872 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
 873 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
 874 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
 875 #undef KYBER_DEBUGFS_DOMAIN_ATTRS
 876
 877 static int kyber_async_depth_show(void *data, struct seq_file *m)
 878 {
 879         struct request_queue *q = data;
 880         struct kyber_queue_data *kqd = q->elevator->elevator_data;
 881
 882         seq_printf(m, "%u\n", kqd->async_depth);
 883         return 0;
 884 }
 885
 886 static int kyber_cur_domain_show(void *data, struct seq_file *m)
 887 {
 888         struct blk_mq_hw_ctx *hctx = data;
 889         struct kyber_hctx_data *khd = hctx->sched_data;
 890
 891         switch (khd->cur_domain) {
 892         case KYBER_READ:
 893                 seq_puts(m, "READ\n");
 894                 break;
 895         case KYBER_SYNC_WRITE:
 896                 seq_puts(m, "SYNC_WRITE\n");
 897                 break;
 898         case KYBER_OTHER:
 899                 seq_puts(m, "OTHER\n");
 900                 break;
 901         default:
 902                 seq_printf(m, "%u\n", khd->cur_domain);
 903                 break;
 904         }
 905         return 0;
 906 }
 907
 908 static int kyber_batching_show(void *data, struct seq_file *m)
 909 {
 910         struct blk_mq_hw_ctx *hctx = data;
 911         struct kyber_hctx_data *khd = hctx->sched_data;
 912
 913         seq_printf(m, "%u\n", khd->batching);
 914         return 0;
 915 }
 916
 917 #define KYBER_QUEUE_DOMAIN_ATTRS(name)  \
 918         {#name "_tokens", 0400, kyber_##name##_tokens_show}
 919 static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
 920         KYBER_QUEUE_DOMAIN_ATTRS(read),
 921         KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
 922         KYBER_QUEUE_DOMAIN_ATTRS(other),
 923         {"async_depth", 0400, kyber_async_depth_show},
 924         {},
 925 };
 926 #undef KYBER_QUEUE_DOMAIN_ATTRS
 927
 928 #define KYBER_HCTX_DOMAIN_ATTRS(name)                                   \
 929         {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},   \
 930         {#name "_waiting", 0400, kyber_##name##_waiting_show}
 931 static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
 932         KYBER_HCTX_DOMAIN_ATTRS(read),
 933         KYBER_HCTX_DOMAIN_ATTRS(sync_write),
 934         KYBER_HCTX_DOMAIN_ATTRS(other),
 935         {"cur_domain", 0400, kyber_cur_domain_show},
 936         {"batching", 0400, kyber_batching_show},
 937         {},
 938 };
 939 #undef KYBER_HCTX_DOMAIN_ATTRS
 940 #endif
 941
 942 static struct elevator_type kyber_sched = {
 943         .ops.mq = {
 944                 .init_sched = kyber_init_sched,
 945                 .exit_sched = kyber_exit_sched,
 946                 .init_hctx = kyber_init_hctx,
 947                 .exit_hctx = kyber_exit_hctx,
 948                 .limit_depth = kyber_limit_depth,
 949                 .bio_merge = kyber_bio_merge,
 950                 .prepare_request = kyber_prepare_request,
 951                 .insert_requests = kyber_insert_requests,
 952                 .finish_request = kyber_finish_request,
 953                 .requeue_request = kyber_finish_request,
 954                 .completed_request = kyber_completed_request,
 955                 .dispatch_request = kyber_dispatch_request,
 956                 .has_work = kyber_has_work,
 957         },
 958         .uses_mq = true,
 959 #ifdef CONFIG_BLK_DEBUG_FS
 960         .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
 961         .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
 962 #endif
 963         .elevator_attrs = kyber_sched_attrs,
 964         .elevator_name = "kyber",
 965         .elevator_owner = THIS_MODULE,
 966 };
 967
 968 static int __init kyber_init(void)
 969 {
 970         return elv_register(&kyber_sched);
 971 }
 972
 973 static void __exit kyber_exit(void)
 974 {
 975         elv_unregister(&kyber_sched);
 976 }
 977
 978 module_init(kyber_init);
 979 module_exit(kyber_exit);
 980
 981 MODULE_AUTHOR("Omar Sandoval");
 982 MODULE_LICENSE("GPL");
 983 MODULE_DESCRIPTION("Kyber I/O scheduler");