include/linux/blk-mq.h

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 #ifndef BLK_MQ_H
   3 #define BLK_MQ_H
   4
   5 #include <linux/blkdev.h>
   6 #include <linux/sbitmap.h>
   7 #include <linux/srcu.h>
   8 #include <linux/lockdep.h>
   9 #include <linux/scatterlist.h>
  10 #include <linux/prefetch.h>
  11
  12 struct blk_mq_tags;
  13 struct blk_flush_queue;
  14
  15 #define BLKDEV_MIN_RQ   4
  16 #define BLKDEV_DEFAULT_RQ       128
  17
  18 typedef void (rq_end_io_fn)(struct request *, blk_status_t);
  19
  20 /*
  21  * request flags */
  22 typedef __u32 __bitwise req_flags_t;
  23
  24 /* the drive may already have started this one */
  25 #define RQF_STARTED             ((__force req_flags_t)(1 << 1))
  26 /* may not be passed by the I/O scheduler */
  27 #define RQF_SOFTBARRIER         ((__force req_flags_t)(1 << 3))
  28 /* request for flush sequence */
  29 #define RQF_FLUSH_SEQ           ((__force req_flags_t)(1 << 4))
  30 /* merge of different types, fail separately */
  31 #define RQF_MIXED_MERGE         ((__force req_flags_t)(1 << 5))
  32 /* track inflight for MQ */
  33 #define RQF_MQ_INFLIGHT         ((__force req_flags_t)(1 << 6))
  34 /* don't call prep for this one */
  35 #define RQF_DONTPREP            ((__force req_flags_t)(1 << 7))
  36 /* vaguely specified driver internal error.  Ignored by the block layer */
  37 #define RQF_FAILED              ((__force req_flags_t)(1 << 10))
  38 /* don't warn about errors */
  39 #define RQF_QUIET               ((__force req_flags_t)(1 << 11))
  40 /* elevator private data attached */
  41 #define RQF_ELVPRIV             ((__force req_flags_t)(1 << 12))
  42 /* account into disk and partition IO statistics */
  43 #define RQF_IO_STAT             ((__force req_flags_t)(1 << 13))
  44 /* runtime pm request */
  45 #define RQF_PM                  ((__force req_flags_t)(1 << 15))
  46 /* on IO scheduler merge hash */
  47 #define RQF_HASHED              ((__force req_flags_t)(1 << 16))
  48 /* track IO completion time */
  49 #define RQF_STATS               ((__force req_flags_t)(1 << 17))
  50 /* Look at ->special_vec for the actual data payload instead of the
  51    bio chain. */
  52 #define RQF_SPECIAL_PAYLOAD     ((__force req_flags_t)(1 << 18))
  53 /* The per-zone write lock is held for this request */
  54 #define RQF_ZONE_WRITE_LOCKED   ((__force req_flags_t)(1 << 19))
  55 /* already slept for hybrid poll */
  56 #define RQF_MQ_POLL_SLEPT       ((__force req_flags_t)(1 << 20))
  57 /* ->timeout has been called, don't expire again */
  58 #define RQF_TIMED_OUT           ((__force req_flags_t)(1 << 21))
  59 /* queue has elevator attached */
  60 #define RQF_ELV                 ((__force req_flags_t)(1 << 22))
  61
  62 /* flags that prevent us from merging requests: */
  63 #define RQF_NOMERGE_FLAGS \
  64         (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
  65
  66 enum mq_rq_state {
             /* Values of request.state; read through blk_mq_rq_state(). */
  67         MQ_RQ_IDLE              = 0,    /* blk_mq_request_started() is false */
  68         MQ_RQ_IN_FLIGHT         = 1,
  69         MQ_RQ_COMPLETE          = 2,    /* blk_mq_request_completed() is true */
  70 };
  71
  72 /*
  73  * Try to put the fields that are referenced together in the same cacheline.
  74  *
  75  * If you modify this structure, make sure to update blk_rq_init() and
  76  * especially blk_mq_rq_ctx_init() to take care of the added fields.
  77  */
  78 struct request {
  79         struct request_queue *q;
  80         struct blk_mq_ctx *mq_ctx;
  81         struct blk_mq_hw_ctx *mq_hctx;
  82
  83         unsigned int cmd_flags;         /* op and common flags, see req_op() */
  84         req_flags_t rq_flags;           /* RQF_* flags */
  85
  86         int tag;
  87         int internal_tag;
  88
  89         unsigned int timeout;
  90
  91         /* the following two fields are internal, NEVER access directly */
  92         unsigned int __data_len;        /* total data len */
  93         sector_t __sector;              /* sector cursor */
  94
  95         struct bio *bio;
  96         struct bio *biotail;
  97
  98         union {
  99                 struct list_head queuelist;
 100                 struct request *rq_next;
 101         };
 102
 103         struct gendisk *rq_disk;
 104         struct block_device *part;
 105 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
 106         /* Time that the first bio started allocating this request. */
 107         u64 alloc_time_ns;
 108 #endif
 109         /* Time that this request was allocated for this IO. */
 110         u64 start_time_ns;
 111         /* Time that I/O was submitted to the device. */
 112         u64 io_start_time_ns;
 113
 114 #ifdef CONFIG_BLK_WBT
 115         unsigned short wbt_flags;
 116 #endif
 117         /*
 118          * rq sectors used for blk stats. It has the same value
 119          * as blk_rq_sectors(rq), except that it is never zeroed
 120          * by completion.
 121          */
 122         unsigned short stats_sectors;
 123
 124         /*
 125          * Number of scatter-gather DMA addr+len pairs after
 126          * physical address coalescing is performed.
 127          */
 128         unsigned short nr_phys_segments;
 129
 130 #ifdef CONFIG_BLK_DEV_INTEGRITY
 131         unsigned short nr_integrity_segments;
 132 #endif
 133
 134 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 135         struct bio_crypt_ctx *crypt_ctx;
 136         struct blk_crypto_keyslot *crypt_keyslot;
 137 #endif
 138
 139         unsigned short write_hint;
 140         unsigned short ioprio;          /* returned by req_get_ioprio() */
 141
 142         enum mq_rq_state state;         /* MQ_RQ_*, see blk_mq_rq_state() */
 143         refcount_t ref;
 144
 145         unsigned long deadline;
 146
 147         /*
 148          * The hash is used inside the scheduler, and killed once the
 149          * request reaches the dispatch list. The ipi_list is only used
 150          * to queue the request for softirq completion, which is long
 151          * after the request has been unhashed (and even removed from
 152          * the dispatch list).
 153          */
 154         union {
 155                 struct hlist_node hash; /* merge hash */
 156                 struct llist_node ipi_list;
 157         };
 158
 159         /*
 160          * The rb_node is only used inside the io scheduler, requests
 161          * are pruned when moved to the dispatch queue. So let the
 162          * completion_data share space with the rb_node.
 163          */
 164         union {
 165                 struct rb_node rb_node; /* sort/lookup */
 166                 struct bio_vec special_vec;
 167                 void *completion_data;
 168                 int error_count; /* for legacy drivers, don't use */
 169         };
 170
 171
 172         /*
 173          * Three pointers are available for the IO schedulers, if they need
 174          * more they have to dynamically allocate it.  Flush requests are
 175          * never put on the IO scheduler. So let the flush fields share
 176          * space with the elevator data.
 177          */
 178         union {
 179                 struct {
 180                         struct io_cq            *icq;
 181                         void                    *priv[2];
 182                 } elv;
 183
 184                 struct {
 185                         unsigned int            seq;
 186                         struct list_head        list;
 187                         rq_end_io_fn            *saved_end_io;
 188                 } flush;
 189         };
 190
 191         union {
 192                 struct __call_single_data csd;
 193                 u64 fifo_time;
 194         };
 195
 196         /*
 197          * completion callback.
 198          */
 199         rq_end_io_fn *end_io;
 200         void *end_io_data;
 201 };
 202
 203 #define req_op(req) \
 204         ((req)->cmd_flags & REQ_OP_MASK)        /* cmd_flags masked with REQ_OP_MASK */
 205
 206 static inline bool blk_rq_is_passthrough(struct request *rq)
 207 {
             /* Decided solely by the request's operation, as given by req_op(). */
 208         return blk_op_is_passthrough(req_op(rq));
 209 }
 210
 211 static inline unsigned short req_get_ioprio(struct request *req)
 212 {
             /* Plain accessor: returns the request's ioprio field unchanged. */
 213         return req->ioprio;
 214 }
 215
 216 #define rq_data_dir(rq)         (op_is_write(req_op(rq)) ? WRITE : READ)
     /*
      * rq_data_dir() above yields WRITE or READ; rq_dma_dir() below yields
      * DMA_TO_DEVICE or DMA_FROM_DEVICE. Both select on op_is_write(req_op(rq)).
      */
 217
 218 #define rq_dma_dir(rq) \
 219         (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
 220
 221 enum blk_eh_timer_return {
             /* Return values of blk_mq_ops.timeout(). */
 222         BLK_EH_DONE,            /* driver has completed the command */
 223         BLK_EH_RESET_TIMER,     /* reset timer and try again */
 224 };
 225
 226 #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
 227 #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
 228
 229 /**
 230  * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 231  * block device
 232  */
 233 struct blk_mq_hw_ctx {
 234         struct {
 235                 /** @lock: Protects the dispatch list. */
 236                 spinlock_t              lock;
 237                 /**
 238                  * @dispatch: Used for requests that are ready to be
 239                  * dispatched to the hardware but for some reason (e.g. lack of
 240                  * resources) could not be sent to the hardware. As soon as the
 241                  * driver can send new requests, requests on this list will
 242                  * be sent first for a fairer dispatch.
 243                  */
 244                 struct list_head        dispatch;
 245                  /**
 246                   * @state: BLK_MQ_S_* flags. Defines the state of the hw
 247                   * queue (active, scheduled to restart, stopped).
 248                   */
 249                 unsigned long           state;
 250         } ____cacheline_aligned_in_smp;
 251
 252         /**
 253          * @run_work: Used for scheduling a hardware queue run at a later time.
 254          */
 255         struct delayed_work     run_work;
 256         /** @cpumask: Map of available CPUs where this hctx can run. */
 257         cpumask_var_t           cpumask;
 258         /**
 259          * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
 260          * selection from @cpumask.
 261          */
 262         int                     next_cpu;
 263         /**
 264          * @next_cpu_batch: Number of work items left in the batch before
 265          * changing to the next CPU.
 266          */
 267         int                     next_cpu_batch;
 268
 269         /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
 270         unsigned long           flags;
 271
 272         /**
 273          * @sched_data: Pointer owned by the IO scheduler attached to a request
 274          * queue. It's up to the IO scheduler how to use this pointer.
 275          */
 276         void                    *sched_data;
 277         /**
 278          * @queue: Pointer to the request queue that owns this hardware context.
 279          */
 280         struct request_queue    *queue;
 281         /** @fq: Queue of requests that need to perform a flush operation. */
 282         struct blk_flush_queue  *fq;
 283
 284         /**
 285          * @driver_data: Pointer to data owned by the block driver that created
 286          * this hctx
 287          */
 288         void                    *driver_data;
 289
 290         /**
 291          * @ctx_map: Bitmap for each software queue. If bit is on, there is a
 292          * pending request in that software queue.
 293          */
 294         struct sbitmap          ctx_map;
 295
 296         /**
 297          * @dispatch_from: Software queue to be used when no scheduler was
 298          * selected.
 299          */
 300         struct blk_mq_ctx       *dispatch_from;
 301         /**
 302          * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
 303          * decide if the hw_queue is busy using Exponential Weighted Moving
 304          * Average algorithm.
 305          */
 306         unsigned int            dispatch_busy;
 307
 308         /** @type: HCTX_TYPE_* flags. Type of hardware queue. */
 309         unsigned short          type;
 310         /** @nr_ctx: Number of software queues. */
 311         unsigned short          nr_ctx;
 312         /** @ctxs: Array of software queues. */
 313         struct blk_mq_ctx       **ctxs;
 314
 315         /** @dispatch_wait_lock: Lock for dispatch_wait queue. */
 316         spinlock_t              dispatch_wait_lock;
 317         /**
 318          * @dispatch_wait: Waitqueue to put requests when there is no tag
 319          * available at the moment, to wait for another try in the future.
 320          */
 321         wait_queue_entry_t      dispatch_wait;
 322
 323         /**
 324          * @wait_index: Index of next available dispatch_wait queue to insert
 325          * requests.
 326          */
 327         atomic_t                wait_index;
 328
 329         /**
 330          * @tags: Tags owned by the block driver. A tag in this set is only
 331          * assigned when a request is dispatched from a hardware queue.
 332          */
 333         struct blk_mq_tags      *tags;
 334         /**
 335          * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
 336          * scheduler associated with a request queue, a tag is assigned when
 337          * that request is allocated. Else, this member is not used.
 338          */
 339         struct blk_mq_tags      *sched_tags;
 340
 341         /** @queued: Number of queued requests. */
 342         unsigned long           queued;
 343         /** @run: Number of dispatched requests. */
 344         unsigned long           run;
 345
 346         /** @numa_node: NUMA node the storage adapter has been connected to. */
 347         unsigned int            numa_node;
 348         /** @queue_num: Index of this hardware queue. */
 349         unsigned int            queue_num;
 350
 351         /**
 352          * @nr_active: Number of active requests. Only used when a tag set is
 353          * shared across request queues.
 354          */
 355         atomic_t                nr_active;
 356
 357         /** @cpuhp_online: List to store request if a CPU is going to die. */
 358         struct hlist_node       cpuhp_online;
 359         /** @cpuhp_dead: List to store request if some CPU dies. */
 360         struct hlist_node       cpuhp_dead;
 361         /** @kobj: Kernel object for sysfs. */
 362         struct kobject          kobj;
 363
 364 #ifdef CONFIG_BLK_DEBUG_FS
 365         /**
 366          * @debugfs_dir: debugfs directory for this hardware queue. Named
 367          * as cpu<cpu_number>.
 368          */
 369         struct dentry           *debugfs_dir;
 370         /** @sched_debugfs_dir: debugfs directory for the scheduler. */
 371         struct dentry           *sched_debugfs_dir;
 372 #endif
 373
 374         /**
 375          * @hctx_list: if this hctx is not in use, this is an entry in
 376          * q->unused_hctx_list.
 377          */
 378         struct list_head        hctx_list;
 379
 380         /**
 381          * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
 382          * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
 383          * blk_mq_hw_ctx_size().
 384          */
 385         struct srcu_struct      srcu[];
 386 };
 387
 388 /**
 389  * struct blk_mq_queue_map - Map software queues to hardware queues
 390  * @mq_map:       CPU ID to hardware queue index map. This is an array
 391  *      with nr_cpu_ids elements. Each element has a value in the range
 392  *      [@queue_offset, @queue_offset + @nr_queues).
 393  * @nr_queues:    Number of hardware queues to map CPU IDs onto.
 394  * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 395  *      driver to map each hardware queue type (enum hctx_type) onto a distinct
 396  *      set of hardware queues.
 397  */
 398 struct blk_mq_queue_map {
 399         unsigned int *mq_map;           /* indexed by CPU ID */
 400         unsigned int nr_queues;
 401         unsigned int queue_offset;      /* lowest value stored in mq_map[] */
 402 };
 403
 404 /**
 405  * enum hctx_type - Type of hardware queue
 406  * @HCTX_TYPE_DEFAULT:  All I/O not otherwise accounted for.
 407  * @HCTX_TYPE_READ:     Just for READ I/O.
 408  * @HCTX_TYPE_POLL:     Polled I/O of any kind.
 409  * @HCTX_MAX_TYPES:     Number of types of hctx.
 410  */
 411 enum hctx_type {
 412         HCTX_TYPE_DEFAULT,
 413         HCTX_TYPE_READ,
 414         HCTX_TYPE_POLL,
 415
 416         HCTX_MAX_TYPES,         /* keep last: sizes blk_mq_tag_set.map[] */
 417 };
 418
 419 /**
 420  * struct blk_mq_tag_set - tag set that can be shared between request queues
 421  * @map:           One or more ctx -> hctx mappings. One map exists for each
 422  *                 hardware queue type (enum hctx_type) that the driver wishes
 423  *                 to support. There are no restrictions on maps being of the
 424  *                 same size, and it's perfectly legal to share maps between
 425  *                 types.
 426  * @nr_maps:       Number of elements in the @map array. A number in the range
 427  *                 [1, HCTX_MAX_TYPES].
 428  * @ops:           Pointers to functions that implement block driver behavior.
 429  * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 430  *                 owns this data structure.
 431  * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 432  * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 433  *                 allocations.
 434  * @cmd_size:      Number of additional bytes to allocate per request. The block
 435  *                 driver owns these additional bytes.
 436  * @numa_node:     NUMA node the storage adapter has been connected to.
 437  * @timeout:       Request processing timeout in jiffies.
 438  * @flags:         Zero or more BLK_MQ_F_* flags.
 439  * @driver_data:   Pointer to data owned by the block driver that created this
 440  *                 tag set.
 441  * @tags:          Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 442  *                 elements.
 443  * @shared_tags:
 444  *                 Shared set of tags. Has @nr_hw_queues elements. If set,
 445  *                 shared by all @tags.
 446  * @tag_list_lock: Serializes tag_list accesses.
 447  * @tag_list:      List of the request queues that use this tag set. See also
 448  *                 request_queue.tag_set_list.
 449  */
 450 struct blk_mq_tag_set {
 451         struct blk_mq_queue_map map[HCTX_MAX_TYPES];    /* nr_maps entries in use */
 452         unsigned int            nr_maps;
 453         const struct blk_mq_ops *ops;
 454         unsigned int            nr_hw_queues;
 455         unsigned int            queue_depth;
 456         unsigned int            reserved_tags;
 457         unsigned int            cmd_size;
 458         int                     numa_node;
 459         unsigned int            timeout;        /* in jiffies */
 460         unsigned int            flags;          /* BLK_MQ_F_* */
 461         void                    *driver_data;
 462
 463         struct blk_mq_tags      **tags;         /* nr_hw_queues elements */
 464
 465         struct blk_mq_tags      *shared_tags;
 466
 467         struct mutex            tag_list_lock;  /* serializes tag_list accesses */
 468         struct list_head        tag_list;
 469 };
 470
 471 /**
 472  * struct blk_mq_queue_data - Data about a request inserted in a queue
 473  *
 474  * @rq:   Request pointer.
 475  * @last: If it is the last request in the queue.
 476  */
 477 struct blk_mq_queue_data {
             /* Passed (const) as the second argument of blk_mq_ops.queue_rq(). */
 478         struct request *rq;
 479         bool last;              /* see blk_mq_ops.commit_rqs */
 480 };
 481
 482 typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 483                 bool);
 484 typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
     /* busy_tag_iter_fn is the callback type taken by blk_mq_tagset_busy_iter(). */
 485
 486 /**
 487  * struct blk_mq_ops - Callback functions that implement block driver
 488  * behaviour.
 489  */
 490 struct blk_mq_ops {
 491         /**
 492          * @queue_rq: Queue a new request from block IO.
 493          */
 494         blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
 495                                  const struct blk_mq_queue_data *);
 496
 497         /**
 498          * @commit_rqs: If a driver uses bd->last to judge when to submit
 499          * requests to hardware, it must define this function. In case of errors
 500          * that make us stop issuing further requests, this hook serves the
 501          * purpose of kicking the hardware (which the last request otherwise
 502          * would have done).
 503          */
 504         void (*commit_rqs)(struct blk_mq_hw_ctx *);
 505
 506         /**
 507          * @get_budget: Reserve budget before queueing a request. Once
 508          * .queue_rq is run, it is the driver's responsibility to release
 509          * the reserved budget. The failure case of .get_budget must also
 510          * be handled to avoid I/O deadlock.
 511          */
 512         int (*get_budget)(struct request_queue *);
 513
 514         /**
 515          * @put_budget: Release the reserved budget.
 516          */
 517         void (*put_budget)(struct request_queue *, int);
 518
 519         /**
 520          * @set_rq_budget_token: store rq's budget token
 521          */
 522         void (*set_rq_budget_token)(struct request *, int);
 523         /**
 524          * @get_rq_budget_token: retrieve rq's budget token
 525          */
 526         int (*get_rq_budget_token)(struct request *);
 527
 528         /**
 529          * @timeout: Called on request timeout.
 530          */
 531         enum blk_eh_timer_return (*timeout)(struct request *, bool);
 532
 533         /**
 534          * @poll: Called to poll for completion of a specific tag.
 535          */
 536         int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);
 537
 538         /**
 539          * @complete: Mark the request as complete.
 540          */
 541         void (*complete)(struct request *);
 542
 543         /**
 544          * @init_hctx: Called when the block layer side of a hardware queue has
 545          * been set up, allowing the driver to allocate/init matching
 546          * structures.
 547          */
 548         int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
 549         /**
 550          * @exit_hctx: Ditto for exit/teardown.
 551          */
 552         void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
 553
 554         /**
 555          * @init_request: Called for every command allocated by the block layer
 556          * to allow the driver to set up driver specific data.
 557          *
 558          * Tag greater than or equal to queue_depth is for setting up
 559          * flush request.
 560          */
 561         int (*init_request)(struct blk_mq_tag_set *set, struct request *,
 562                             unsigned int, unsigned int);
 563         /**
 564          * @exit_request: Ditto for exit/teardown.
 565          */
 566         void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
 567                              unsigned int);
 568
 569         /**
 570          * @initialize_rq_fn: Called from inside blk_get_request().
 571          */
 572         void (*initialize_rq_fn)(struct request *rq);
 573
 574         /**
 575          * @cleanup_rq: Called before freeing one request which isn't completed
 576          * yet, and usually for freeing the driver private data.
 577          */
 578         void (*cleanup_rq)(struct request *);
 579
 580         /**
 581          * @busy: If set, returns whether or not this queue currently is busy.
 582          */
 583         bool (*busy)(struct request_queue *);
 584
 585         /**
 586          * @map_queues: This allows drivers to specify their own queue mapping
 587          * by overriding the setup-time function that builds the mq_map.
 588          */
 589         int (*map_queues)(struct blk_mq_tag_set *set);
 590
 591 #ifdef CONFIG_BLK_DEBUG_FS
 592         /**
 593          * @show_rq: Used by the debugfs implementation to show driver-specific
 594          * information about a request.
 595          */
 596         void (*show_rq)(struct seq_file *m, struct request *rq);
 597 #endif
 598 };
 599
 600 enum {
 601         BLK_MQ_F_SHOULD_MERGE   = 1 << 0,
 602         BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
 603         /*
 604          * Set when this device requires underlying blk-mq device for
 605          * completing IO:
 606          */
 607         BLK_MQ_F_STACKING       = 1 << 2,
 608         BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
 609         BLK_MQ_F_BLOCKING       = 1 << 5,
 610         /* Do not allow an I/O scheduler to be configured. */
 611         BLK_MQ_F_NO_SCHED       = 1 << 6,
 612         /*
 613          * Select 'none' during queue registration in case of a single hwq
 614          * or shared hwqs instead of 'mq-deadline'.
 615          */
 616         BLK_MQ_F_NO_SCHED_BY_DEFAULT    = 1 << 7,
 617         BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,    /* shift used by BLK_MQ_FLAG_TO_ALLOC_POLICY() */
 618         BLK_MQ_F_ALLOC_POLICY_BITS = 1,         /* width of the allocation policy field */
 619
             /*
              * BLK_MQ_S_*: values for blk_mq_hw_ctx.state. NOTE(review): these
              * look like bit numbers rather than masks - confirm against users.
              */
 620         BLK_MQ_S_STOPPED        = 0,
 621         BLK_MQ_S_TAG_ACTIVE     = 1,
 622         BLK_MQ_S_SCHED_RESTART  = 2,
 623
 624         /* hw queue is inactive after all its CPUs become offline */
 625         BLK_MQ_S_INACTIVE       = 3,
 626
 627         BLK_MQ_MAX_DEPTH        = 10240,
 628
 629         BLK_MQ_CPU_WORK_BATCH   = 8,
 630 };
 631 #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
 632         (((flags) >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
 633                 ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
     /*
      * BLK_MQ_FLAG_TO_ALLOC_POLICY() extracts the allocation policy field
      * (BLK_MQ_F_ALLOC_POLICY_BITS wide, starting at bit
      * BLK_MQ_F_ALLOC_POLICY_START_BIT) from a flags word, and
      * BLK_ALLOC_POLICY_TO_MQ_FLAG() encodes a policy value into that field.
      * The policy is presumably one of BLK_TAG_ALLOC_* - confirm against callers.
      *
      * The macro arguments are parenthesized so that compound expressions such
      * as "a | b" are evaluated as a unit before the shift or mask is applied.
      */
 634 #define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
 635         (((policy) & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
 636                 << BLK_MQ_F_ALLOC_POLICY_START_BIT)
 637
 638 #define BLK_MQ_NO_HCTX_IDX      (-1U)
 639
 640 struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
 641                 struct lock_class_key *lkclass);
     /*
      * blk_mq_alloc_disk() wraps __blk_mq_alloc_disk(), passing the address of a
      * static struct lock_class_key declared inside the macro expansion, so each
      * expansion site supplies its own key. As a statement expression it
      * evaluates to the value returned by __blk_mq_alloc_disk().
      */
 642 #define blk_mq_alloc_disk(set, queuedata)                               \
 643 ({                                                                      \
 644         static struct lock_class_key __key;                             \
 645                                                                         \
 646         __blk_mq_alloc_disk(set, queuedata, &__key);                    \
 647 })
 648 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
 649 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 650                 struct request_queue *q);
 651 void blk_mq_unregister_dev(struct device *, struct request_queue *);
 652
 653 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
 654 int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
 655                 const struct blk_mq_ops *ops, unsigned int queue_depth,
 656                 unsigned int set_flags);
 657 void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 658
 659 void blk_mq_free_request(struct request *rq);
 660
 661 bool blk_mq_queue_inflight(struct request_queue *q);
 662
 663 enum {
             /*
              * Values of type blk_mq_req_flags_t, the type of the @flags argument
              * of blk_mq_alloc_request() and blk_mq_alloc_request_hctx().
              */
 664         /* return when out of requests */
 665         BLK_MQ_REQ_NOWAIT       = (__force blk_mq_req_flags_t)(1 << 0),
 666         /* allocate from reserved pool */
 667         BLK_MQ_REQ_RESERVED     = (__force blk_mq_req_flags_t)(1 << 1),
 668         /* set RQF_PM */
 669         BLK_MQ_REQ_PM           = (__force blk_mq_req_flags_t)(1 << 2),
 670 };
 671
 672 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 673                 blk_mq_req_flags_t flags);
 674 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 675                 unsigned int op, blk_mq_req_flags_t flags,
 676                 unsigned int hctx_idx);
 677
 678 /*
 679  * Tag address space map.
 680  */
 681 struct blk_mq_tags {
 682         unsigned int nr_tags;           /* blk_mq_tag_to_rq() rejects tag >= nr_tags */
 683         unsigned int nr_reserved_tags;
 684
 685         atomic_t active_queues;
 686
 687         struct sbitmap_queue bitmap_tags;
 688         struct sbitmap_queue breserved_tags;
 689
 690         struct request **rqs;           /* indexed by tag, see blk_mq_tag_to_rq() */
 691         struct request **static_rqs;
 692         struct list_head page_list;
 693
 694         /*
 695          * used to clear request reference in rqs[] before freeing one
 696          * request pool
 697          */
 698         spinlock_t lock;
 699 };
 700
 701 static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
 702                                                unsigned int tag)
 703 {
             /* Map @tag to the entry of @tags->rqs[]; NULL when out of range. */
 704         if (tag >= tags->nr_tags)
 705                 return NULL;
 706
 707         /* Issue a prefetch for the request before handing it back. */
 708         prefetch(tags->rqs[tag]);
 709         return tags->rqs[tag];
 710 }
 711
 712 enum {
             /*
              * Layout of the u32 "unique tag": the low BLK_MQ_UNIQUE_TAG_BITS bits
              * are read by blk_mq_unique_tag_to_tag(), the bits above them by
              * blk_mq_unique_tag_to_hwq().
              */
 713         BLK_MQ_UNIQUE_TAG_BITS = 16,
 714         BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
 715 };
 716
 717 u32 blk_mq_unique_tag(struct request *rq);
 718
 719 static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
 720 {
             /* The hardware queue part sits above the low BLK_MQ_UNIQUE_TAG_BITS bits. */
 721         return (u16)(unique_tag >> BLK_MQ_UNIQUE_TAG_BITS);
 722 }
 723
 724 static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
 725 {
             /* The tag part is the low BLK_MQ_UNIQUE_TAG_BITS bits of the unique tag. */
 726         return (u16)(unique_tag & BLK_MQ_UNIQUE_TAG_MASK);
 727 }
 728
 729 /**
 730  * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 731  * @rq: target request.
 732  */
 733 static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
 734 {
             /*
              * Read with READ_ONCE(); blk_mq_set_request_complete() updates the
              * same field with WRITE_ONCE().
              */
 735         return READ_ONCE(rq->state);
 736 }
 737
 738 static inline int blk_mq_request_started(struct request *rq)
 739 {
             /*
              * Non-zero for every state other than MQ_RQ_IDLE, so a request in
              * MQ_RQ_COMPLETE also counts as started.
              */
 740         return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
 741 }
 742
 743 static inline int blk_mq_request_completed(struct request *rq)
 744 {
             /* Non-zero only while the request state is MQ_RQ_COMPLETE. */
 745         return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
 746 }
 747
 748 /*
 749  *
 750  * Set the state to complete when completing a request from inside ->queue_rq.
 751  * This is used by drivers that want to ensure special complete actions that
 752  * need access to the request are called on failure, e.g. by nvme for
 753  * multipathing.
 754  */
 755 static inline void blk_mq_set_request_complete(struct request *rq)
 756 {
             /*
              * Only the state field is written; after this,
              * blk_mq_request_completed() reports the request as completed.
              */
 757         WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
 758 }
 759
 760 void blk_mq_start_request(struct request *rq);
 761 void blk_mq_end_request(struct request *rq, blk_status_t error);
 762 void __blk_mq_end_request(struct request *rq, blk_status_t error);
 763 void blk_mq_end_request_batch(struct io_comp_batch *ib);
 764
 765 /*
 766  * Only need start/end time stamping if we have iostat or
 767  * blk stats enabled, or using an IO scheduler.
 768  */
 769 static inline bool blk_mq_need_time_stamp(struct request *rq)
 770 {
             /* Any one of these request flags is enough to require time stamps. */
 771         return !!(rq->rq_flags & (RQF_STATS | RQF_IO_STAT | RQF_ELV));
 772 }
 773
 774 /*
 775  * Batched completions only work when there is no I/O error and no special
 776  * ->end_io handler.
 777  */
 778 static inline bool blk_mq_add_to_batch(struct request *req,
 779                                        struct io_comp_batch *iob, int ioerror,
 780                                        void (*complete)(struct io_comp_batch *))
 781 {
             /*
              * Returns true when @req was added to @iob's request list, false when
              * it cannot be batched: no batch, an I/O error, a private ->end_io
              * handler, or RQF_ELV set on the request.
              */
 782         if (!iob || ioerror || req->end_io || (req->rq_flags & RQF_ELV))
 783                 return false;
             /* A batch carries a single completion callback; refuse a different one. */
 784         if (iob->complete && iob->complete != complete)
 785                 return false;
 786         if (!iob->complete)
 787                 iob->complete = complete;
 788         iob->need_ts |= blk_mq_need_time_stamp(req);
 789         rq_list_add(&iob->req_list, req);
 790         return true;
 791 }
 792
/*
 * Out-of-line blk-mq entry points.  Only the prototypes appear in this
 * header.  The group headings below are derived from the function names
 * alone -- confirm against the definitions before relying on them.
 */

/* requeue handling */
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);

/* request completion */
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);

/* stopping, starting, quiescing and running hardware queues */
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);

/* tag set iteration */
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);

/* queue freezing */
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout);

/* queue mapping and hardware queue count */
int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);
 827
 828 bool __blk_should_fake_timeout(struct request_queue *q);
 829 static inline bool blk_should_fake_timeout(struct request_queue *q)
 830 {
 831         if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
 832             test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
 833                 return __blk_should_fake_timeout(q);
 834         return false;
 835 }
 836
 837 /**
 838  * blk_mq_rq_from_pdu - cast a PDU to a request
 839  * @pdu: the PDU (Protocol Data Unit) to be casted
 840  *
 841  * Return: request
 842  *
 843  * Driver command data is immediately after the request. So subtract request
 844  * size to get back to the original request.
 845  */
 846 static inline struct request *blk_mq_rq_from_pdu(void *pdu)
 847 {
 848         return pdu - sizeof(struct request);
 849 }
 850
 851 /**
 852  * blk_mq_rq_to_pdu - cast a request to a PDU
 853  * @rq: the request to be casted
 854  *
 855  * Return: pointer to the PDU
 856  *
 857  * Driver command data is immediately after the request. So add request to get
 858  * the PDU.
 859  */
 860 static inline void *blk_mq_rq_to_pdu(struct request *rq)
 861 {
 862         return rq + 1;
 863 }
 864
 865 #define queue_for_each_hw_ctx(q, hctx, i)                               \
 866         for ((i) = 0; (i) < (q)->nr_hw_queues &&                        \
 867              ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
 868
 869 #define hctx_for_each_ctx(hctx, ctx, i)                                 \
 870         for ((i) = 0; (i) < (hctx)->nr_ctx &&                           \
 871              ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
 872
 873 static inline void blk_mq_cleanup_rq(struct request *rq)
 874 {
 875         if (rq->q->mq_ops->cleanup_rq)
 876                 rq->q->mq_ops->cleanup_rq(rq);
 877 }
 878
 879 static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
 880                 unsigned int nr_segs)
 881 {
 882         rq->nr_phys_segments = nr_segs;
 883         rq->__data_len = bio->bi_iter.bi_size;
 884         rq->bio = rq->biotail = bio;
 885         rq->ioprio = bio_prio(bio);
 886
 887         if (bio->bi_bdev)
 888                 rq->rq_disk = bio->bi_bdev->bd_disk;
 889 }
 890
/*
 * Declared here, defined elsewhere.  Going by the name and signature this
 * associates lockdep class @key with @hctx's flush queue lock -- TODO confirm
 * against the definition.
 */
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key);
 893
 894 static inline bool rq_is_sync(struct request *rq)
 895 {
 896         return op_is_sync(rq->cmd_flags);
 897 }
 898
/*
 * Request initialisation, allocation/release and cloning helpers.  Only the
 * prototypes appear in this header; the definitions are provided elsewhere.
 */
void blk_rq_init(struct request_queue *q, struct request *rq);
void blk_put_request(struct request *rq);
struct request *blk_get_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags);
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
		struct bio_set *bs, gfp_t gfp_mask,
		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
void blk_rq_unprep_clone(struct request *rq);
blk_status_t blk_insert_cloned_request(struct request_queue *q,
		struct request *rq);
 909
/*
 * Optional mapping description accepted by blk_rq_map_user() and
 * blk_rq_map_user_iov() (see their prototypes in this header).
 *
 * NOTE(review): the per-field notes are inferred from the field names only;
 * confirm against the mapping code before relying on them.
 */
struct rq_map_data {
	struct page **pages;	/* presumably an array of pages to map into */
	int page_order;		/* presumably the allocation order of each entry */
	int nr_entries;		/* presumably the number of entries in @pages */
	unsigned long offset;	/* presumably a starting offset into @pages */
	int null_mapped;	/* flag; meaning not visible from this header */
	int from_user;		/* flag; meaning not visible from this header */
};
 918
 919 int blk_rq_map_user(struct request_queue *, struct request *,
 920                 struct rq_map_data *, void __user *, unsigned long, gfp_t);
 921 int blk_rq_map_user_iov(struct request_queue *, struct request *,
 922                 struct rq_map_data *, const struct iov_iter *, gfp_t);
 923 int blk_rq_unmap_user(struct bio *);
 924 int blk_rq_map_kern(struct request_queue *, struct request *, void *,
 925                 unsigned int, gfp_t);
 926 int blk_rq_append_bio(struct request *rq, struct bio *bio);
 927 void blk_execute_rq_nowait(struct gendisk *, struct request *, int,
 928                 rq_end_io_fn *);
 929 blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
 930                 int at_head);
 931
/*
 * Iteration state for rq_for_each_segment() and rq_for_each_bvec(): those
 * macros access the .bio and .iter members of their _iter argument.
 */
struct req_iterator {
	struct bvec_iter iter;	/* passed to bio_for_each_segment()/bio_for_each_bvec() */
	struct bio *bio;	/* bio currently being walked by __rq_for_each_bio() */
};
 936
 937 #define __rq_for_each_bio(_bio, rq)     \
 938         if ((rq->bio))                  \
 939                 for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
 940
 941 #define rq_for_each_segment(bvl, _rq, _iter)                    \
 942         __rq_for_each_bio(_iter.bio, _rq)                       \
 943                 bio_for_each_segment(bvl, _iter.bio, _iter.iter)
 944
 945 #define rq_for_each_bvec(bvl, _rq, _iter)                       \
 946         __rq_for_each_bio(_iter.bio, _rq)                       \
 947                 bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
 948
 949 #define rq_iter_last(bvec, _iter)                               \
 950                 (_iter.bio->bi_next == NULL &&                  \
 951                  bio_iter_last(bvec, _iter.iter))
 952
 953 /*
 954  * blk_rq_pos()                 : the current sector
 955  * blk_rq_bytes()               : bytes left in the entire request
 956  * blk_rq_cur_bytes()           : bytes left in the current segment
 957  * blk_rq_err_bytes()           : bytes left till the next error boundary
 958  * blk_rq_sectors()             : sectors left in the entire request
 959  * blk_rq_cur_sectors()         : sectors left in the current segment
 960  * blk_rq_stats_sectors()       : sectors of the entire request used for stats
 961  */
 962 static inline sector_t blk_rq_pos(const struct request *rq)
 963 {
 964         return rq->__sector;
 965 }
 966
 967 static inline unsigned int blk_rq_bytes(const struct request *rq)
 968 {
 969         return rq->__data_len;
 970 }
 971
 972 static inline int blk_rq_cur_bytes(const struct request *rq)
 973 {
 974         if (!rq->bio)
 975                 return 0;
 976         if (!bio_has_data(rq->bio))     /* dataless requests such as discard */
 977                 return rq->bio->bi_iter.bi_size;
 978         return bio_iovec(rq->bio).bv_len;
 979 }
 980
/* Out-of-line: only the prototype appears in this header. */
unsigned int blk_rq_err_bytes(const struct request *rq);
 982
 983 static inline unsigned int blk_rq_sectors(const struct request *rq)
 984 {
 985         return blk_rq_bytes(rq) >> SECTOR_SHIFT;
 986 }
 987
 988 static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 989 {
 990         return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
 991 }
 992
 993 static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
 994 {
 995         return rq->stats_sectors;
 996 }
 997
 998 /*
 999  * Some commands like WRITE SAME have a payload or data transfer size which
1000  * is different from the size of the request.  Any driver that supports such
1001  * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
1002  * calculate the data transfer size.
1003  */
1004 static inline unsigned int blk_rq_payload_bytes(struct request *rq)
1005 {
1006         if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
1007                 return rq->special_vec.bv_len;
1008         return blk_rq_bytes(rq);
1009 }
1010
1011 /*
1012  * Return the first full biovec in the request.  The caller needs to check that
1013  * there are any bvecs before calling this helper.
1014  */
1015 static inline struct bio_vec req_bvec(struct request *rq)
1016 {
1017         if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
1018                 return rq->special_vec;
1019         return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
1020 }
1021
1022 static inline unsigned int blk_rq_count_bios(struct request *rq)
1023 {
1024         unsigned int nr_bios = 0;
1025         struct bio *bio;
1026
1027         __rq_for_each_bio(bio, rq)
1028                 nr_bios++;
1029
1030         return nr_bios;
1031 }
1032
/* Out-of-line: only the prototype appears in this header. */
void blk_steal_bios(struct bio_list *list, struct request *rq);
1034
1035 /*
1036  * Request completion related functions.
1037  *
1038  * blk_update_request() completes given number of bytes and updates
1039  * the request without completing it.
1040  */
1041 bool blk_update_request(struct request *rq, blk_status_t error,
1042                                unsigned int nr_bytes);
1043 void blk_abort_request(struct request *);
1044
1045 /*
1046  * Number of physical segments as sent to the device.
1047  *
1048  * Normally this is the number of discontiguous data segments sent by the
1049  * submitter.  But for data-less command like discard we might have no
1050  * actual data segments submitted, but the driver might have to add it's
1051  * own special payload.  In that case we still return 1 here so that this
1052  * special payload will be mapped.
1053  */
1054 static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
1055 {
1056         if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
1057                 return 1;
1058         return rq->nr_phys_segments;
1059 }
1060
1061 /*
1062  * Number of discard segments (or ranges) the driver needs to fill in.
1063  * Each discard bio merged into a request is counted as one segment.
1064  */
1065 static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
1066 {
1067         return max_t(unsigned short, rq->nr_phys_segments, 1);
1068 }
1069
int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist, struct scatterlist **last_sg);

/*
 * Convenience wrapper around __blk_rq_map_sg() for callers that do not need
 * the last scatterlist entry back: a local pointer, initialised to NULL, is
 * passed as @last_sg and then discarded.
 *
 * Return: whatever __blk_rq_map_sg() returns.
 */
static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist)
{
	struct scatterlist *tail = NULL;

	return __blk_rq_map_sg(q, rq, sglist, &tail);
}
/*
 * Out-of-line: only the prototype appears in this header.
 * NOTE(review): parameter names were added for readability; confirm them
 * against the definition.
 */
void blk_dump_rq_flags(struct request *rq, char *msg);
1080
1081 #ifdef CONFIG_BLK_DEV_ZONED
1082 static inline unsigned int blk_rq_zone_no(struct request *rq)
1083 {
1084         return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
1085 }
1086
1087 static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
1088 {
1089         return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
1090 }
1091
/*
 * Out-of-line zone write locking primitives.  The inline wrappers that
 * follow call the __-prefixed variants only after checking
 * blk_req_needs_zone_write_lock() (lock) or RQF_ZONE_WRITE_LOCKED (unlock).
 */
bool blk_req_needs_zone_write_lock(struct request *rq);
bool blk_req_zone_write_trylock(struct request *rq);
void __blk_req_zone_write_lock(struct request *rq);
void __blk_req_zone_write_unlock(struct request *rq);
1096
/*
 * Call __blk_req_zone_write_lock() for @rq, but only when
 * blk_req_needs_zone_write_lock() says the request needs it.
 */
static inline void blk_req_zone_write_lock(struct request *rq)
{
	if (!blk_req_needs_zone_write_lock(rq))
		return;

	__blk_req_zone_write_lock(rq);
}
1102
1103 static inline void blk_req_zone_write_unlock(struct request *rq)
1104 {
1105         if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
1106                 __blk_req_zone_write_unlock(rq);
1107 }
1108
1109 static inline bool blk_req_zone_is_write_locked(struct request *rq)
1110 {
1111         return rq->q->seq_zones_wlock &&
1112                 test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
1113 }
1114
/*
 * Return: true when @rq does not need the zone write lock, or when it does
 * and its target zone is not currently write locked.
 */
static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
	return !blk_req_needs_zone_write_lock(rq) ||
	       !blk_req_zone_is_write_locked(rq);
}
1121 #else /* CONFIG_BLK_DEV_ZONED */
/*
 * Stubs used when CONFIG_BLK_DEV_ZONED is not set: no request ever needs or
 * holds a zone write lock, locking and unlocking do nothing, and dispatch is
 * always allowed.
 */
static inline bool blk_req_needs_zone_write_lock(struct request *rq)
{
	return false;
}

/* No-op without zoned block device support. */
static inline void blk_req_zone_write_lock(struct request *rq)
{
}

/* No-op without zoned block device support. */
static inline void blk_req_zone_write_unlock(struct request *rq)
{
}

/* Always false without zoned block device support. */
static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
	return false;
}

/* Always true without zoned block device support. */
static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
	return true;
}
1143 #endif /* CONFIG_BLK_DEV_ZONED */
1144
/*
 * rq_flush_dcache_pages() depends on ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.
 * The macro must be defined (the build fails otherwise).  When it is
 * non-zero only the prototype is declared here and the definition is
 * expected elsewhere; when it is zero the function is an empty inline stub.
 */
#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
#endif
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
void rq_flush_dcache_pages(struct request *rq);
#else
/* Nothing to flush on this configuration. */
static inline void rq_flush_dcache_pages(struct request *rq)
{
}
#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */
1155 #endif /* BLK_MQ_H */