/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/srcu.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>

struct blk_mq_tags;
struct blk_flush_queue;

#define BLKDEV_MIN_RQ	4
#define BLKDEV_MAX_RQ	128	/* Default maximum */

typedef void (rq_end_io_fn)(struct request *, blk_status_t);

/* request flags */
typedef __u32 __bitwise req_flags_t;

/* drive already may have started this one */
#define RQF_STARTED		((__force req_flags_t)(1 << 1))
/* may not be passed by ioscheduler */
#define RQF_SOFTBARRIER		((__force req_flags_t)(1 << 3))
/* request for flush sequence */
#define RQF_FLUSH_SEQ		((__force req_flags_t)(1 << 4))
/* merge of different types, fail separately */
#define RQF_MIXED_MERGE		((__force req_flags_t)(1 << 5))
/* track inflight for MQ */
#define RQF_MQ_INFLIGHT		((__force req_flags_t)(1 << 6))
/* don't call prep for this one */
#define RQF_DONTPREP		((__force req_flags_t)(1 << 7))
/* vaguely specified driver internal error. Ignored by the block layer */
#define RQF_FAILED		((__force req_flags_t)(1 << 10))
/* don't warn about errors */
#define RQF_QUIET		((__force req_flags_t)(1 << 11))
/* elevator private data attached */
#define RQF_ELVPRIV		((__force req_flags_t)(1 << 12))
/* account into disk and partition IO statistics */
#define RQF_IO_STAT		((__force req_flags_t)(1 << 13))
/* runtime pm request */
#define RQF_PM			((__force req_flags_t)(1 << 15))
/* on IO scheduler merge hash */
#define RQF_HASHED		((__force req_flags_t)(1 << 16))
/* track IO completion time */
#define RQF_STATS		((__force req_flags_t)(1 << 17))
/* Look at ->special_vec for the actual data payload instead of the
   bio chain. */
#define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
/* The per-zone write lock is held for this request */
#define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
/* already slept for hybrid poll */
#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 20))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))

/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)

enum mq_rq_state {
	MQ_RQ_IDLE		= 0,
	MQ_RQ_IN_FLIGHT		= 1,
	MQ_RQ_COMPLETE		= 2,
};

/*
 * Try to put the fields that are referenced together in the same cacheline.
 *
 * If you modify this structure, make sure to update blk_rq_init() and
 * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
struct request {
	struct request_queue *q;
	struct blk_mq_ctx *mq_ctx;
	struct blk_mq_hw_ctx *mq_hctx;

	unsigned int cmd_flags;		/* op and common flags */
	req_flags_t rq_flags;

	int tag;
	int internal_tag;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	struct bio *bio;
	struct bio *biotail;

	struct list_head queuelist;

	/*
	 * The hash is used inside the scheduler, and killed once the
	 * request reaches the dispatch list. The ipi_list is only used
	 * to queue the request for softirq completion, which is long
	 * after the request has been unhashed (and even removed from
	 * the dispatch list).
	 */
	union {
		struct hlist_node hash;	/* merge hash */
		struct llist_node ipi_list;
	};

	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. So let the
	 * completion_data share space with the rb_node.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		struct bio_vec special_vec;
		void *completion_data;
		int error_count;	/* for legacy drivers, don't use */
	};

	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it. Flush requests are
	 * never put on the IO scheduler. So let the flush fields share
	 * space with the elevator data.
	 */
	union {
		struct {
			struct io_cq *icq;
			void *priv[2];
		} elv;

		struct {
			unsigned int seq;
			struct list_head list;
			rq_end_io_fn *saved_end_io;
		} flush;
	};

	struct gendisk *rq_disk;
	struct block_device *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	/* Time that the first bio started allocating this request. */
	u64 alloc_time_ns;
#endif
	/* Time that this request was allocated for this IO. */
	u64 start_time_ns;
	/* Time that I/O was submitted to the device. */
	u64 io_start_time_ns;

#ifdef CONFIG_BLK_WBT
	unsigned short wbt_flags;
#endif
	/*
	 * rq sectors used for blk stats. It has the same value as
	 * blk_rq_sectors(rq), except that it is never zeroed by
	 * completion.
	 */
	unsigned short stats_sectors;

	/*
	 * Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;

#ifdef CONFIG_BLK_DEV_INTEGRITY
	unsigned short nr_integrity_segments;
#endif

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
	struct bio_crypt_ctx *crypt_ctx;
	struct blk_ksm_keyslot *crypt_keyslot;
#endif

	unsigned short write_hint;
	unsigned short ioprio;

	enum mq_rq_state state;
	refcount_t ref;

	unsigned int timeout;
	unsigned long deadline;

	union {
		struct __call_single_data csd;
		u64 fifo_time;
	};

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;
};

#define req_op(req) \
	((req)->cmd_flags & REQ_OP_MASK)

static inline bool blk_rq_is_passthrough(struct request *rq)
{
	return blk_op_is_passthrough(req_op(rq));
}

static inline unsigned short req_get_ioprio(struct request *req)
{
	return req->ioprio;
}

#define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)

#define rq_dma_dir(rq) \
	(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)

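/*
 * Example (illustrative sketch, not part of this header): the accessors above
 * are the intended way to look at a request's operation; the "example_" name
 * below is made up.
 */
static inline bool example_rq_is_fs_read(struct request *rq)
{
	return !blk_rq_is_passthrough(rq) && rq_data_dir(rq) == READ;
}
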
enum blk_eh_timer_return {
	BLK_EH_DONE,		/* driver has completed the command */
	BLK_EH_RESET_TIMER,	/* reset timer and try again */
};

#define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
#define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests on this list will
		 * be sent first for a fairer dispatch.
		 */
		struct list_head dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long state;
	} ____cacheline_aligned_in_smp;

	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t cpumask;
	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many queue runs are left in the
	 * batch before changing to the next CPU.
	 */
	int next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long flags;

	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void *sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware context.
	 */
	struct request_queue *queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue *fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx.
	 */
	void *driver_data;

	/**
	 * @ctx_map: Bitmap for each software queue. If bit is on, there is a
	 * pending request in that software queue.
	 */
	struct sbitmap ctx_map;

	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx *dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide if the hw_queue is busy using Exponential Weighted Moving
	 * Average algorithm.
	 */
	unsigned int dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short type;
	/** @nr_ctx: Number of software queues. */
	unsigned short nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx **ctxs;

	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue to put requests when there is no tag
	 * available at the moment, to wait for another try in the future.
	 */
	wait_queue_entry_t dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag in this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags *tags;
	/**
	 * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
	 * scheduler associated with a request queue, a tag is assigned when
	 * that request is allocated. Else, this member is not used.
	 */
	struct blk_mq_tags *sched_tags;

	/** @queued: Number of queued requests. */
	unsigned long queued;
	/** @run: Number of dispatched requests. */
	unsigned long run;
#define BLK_MQ_MAX_DISPATCH_ORDER 7
	/** @dispatched: Number of dispatch requests by queue. */
	unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int queue_num;

	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t nr_active;

	/** @cpuhp_online: List to store request if a CPU is going to die. */
	struct hlist_node cpuhp_online;
	/** @cpuhp_dead: List to store request if some CPU dies. */
	struct hlist_node cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject kobj;

	/** @poll_considered: Count times blk_poll() was called. */
	unsigned long poll_considered;
	/** @poll_invoked: Count how many requests blk_poll() polled. */
	unsigned long poll_invoked;
	/** @poll_success: Count how many polled requests were completed. */
	unsigned long poll_success;

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * as cpu<cpu_number>.
	 */
	struct dentry *debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry *sched_debugfs_dir;
#endif

	/**
	 * @hctx_list: if this hctx is not in use, this is an entry in
	 * q->unused_hctx_list.
	 */
	struct list_head hctx_list;

	/**
	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
	 * blk_mq_hw_ctx_size().
	 */
	struct srcu_struct srcu[];
};

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map: CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues: Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:	Just for READ I/O.
 * @HCTX_TYPE_POLL:	Polled I/O of any kind.
 * @HCTX_MAX_TYPES:	Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @map: One or more ctx -> hctx mappings. One map exists for each
 *	hardware queue type (enum hctx_type) that the driver wishes
 *	to support. There are no restrictions on maps being of the
 *	same size, and it's perfectly legal to share maps between
 *	types.
 * @nr_maps: Number of elements in the @map array. A number in the range
 *	[1, HCTX_MAX_TYPES].
 * @ops: Pointers to functions that implement block driver behavior.
 * @nr_hw_queues: Number of hardware queues supported by the block driver that
 *	owns this data structure.
 * @queue_depth: Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *	allocations.
 * @cmd_size: Number of additional bytes to allocate per request. The block
 *	driver owns these additional bytes.
 * @numa_node: NUMA node the storage adapter has been connected to.
 * @timeout: Request processing timeout in jiffies.
 * @flags: Zero or more BLK_MQ_F_* flags.
 * @driver_data: Pointer to data owned by the block driver that created this
 *	tag set.
 * @active_queues_shared_sbitmap:
 *	number of active request queues per tag set.
 * @__bitmap_tags: A shared tags sbitmap, used over all hctx's
 * @__breserved_tags:
 *	A shared reserved tags sbitmap, used over all hctx's
 * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *	elements.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list: List of the request queues that use this tag set. See also
 *	request_queue.tag_set_list.
 */
struct blk_mq_tag_set {
	struct blk_mq_queue_map map[HCTX_MAX_TYPES];
	unsigned int nr_maps;
	const struct blk_mq_ops *ops;
	unsigned int nr_hw_queues;
	unsigned int queue_depth;
	unsigned int reserved_tags;
	unsigned int cmd_size;
	int numa_node;
	unsigned int timeout;
	unsigned int flags;
	void *driver_data;
	atomic_t active_queues_shared_sbitmap;

	struct sbitmap_queue __bitmap_tags;
	struct sbitmap_queue __breserved_tags;
	struct blk_mq_tags **tags;

	struct mutex tag_list_lock;
	struct list_head tag_list;
};

/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq:   Request pointer.
 * @last: If it is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};

typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
		bool);
typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
				 const struct blk_mq_queue_data *);

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. In case of errors
	 * that make us stop issuing further requests, this hook serves the
	 * purpose of kicking the hardware (which the last request otherwise
	 * would have done).
	 */
	void (*commit_rqs)(struct blk_mq_hw_ctx *);

	/**
	 * @get_budget: Reserve budget before queueing a request; once
	 * .queue_rq is run, it is the driver's responsibility to release the
	 * reserved budget. The failure case of .get_budget must also be
	 * handled to avoid I/O deadlocks.
	 */
	int (*get_budget)(struct request_queue *);

	/**
	 * @put_budget: Release the reserved budget.
	 */
	void (*put_budget)(struct request_queue *, int);

	/**
	 * @set_rq_budget_token: store rq's budget token
	 */
	void (*set_rq_budget_token)(struct request *, int);
	/**
	 * @get_rq_budget_token: retrieve rq's budget token
	 */
	int (*get_rq_budget_token)(struct request *);

	/**
	 * @timeout: Called on request timeout.
	 */
	enum blk_eh_timer_return (*timeout)(struct request *, bool);

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	int (*poll)(struct blk_mq_hw_ctx *);

	/**
	 * @complete: Mark the request as complete.
	 */
	void (*complete)(struct request *);

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * Tag greater than or equal to queue_depth is for setting up
	 * flush request.
	 */
	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
			    unsigned int, unsigned int);
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
			     unsigned int);

	/**
	 * @initialize_rq_fn: Called from inside blk_get_request().
	 */
	void (*initialize_rq_fn)(struct request *rq);

	/**
	 * @cleanup_rq: Called before freeing one request which isn't completed
	 * yet, usually to free the driver-private data.
	 */
	void (*cleanup_rq)(struct request *);

	/**
	 * @busy: If set, returns whether or not this queue currently is busy.
	 */
	bool (*busy)(struct request_queue *);

	/**
	 * @map_queues: This allows drivers to specify their own queue mapping
	 * by overriding the setup-time function that builds the mq_map.
	 */
	int (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};

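/*
 * Example (illustrative sketch): a minimal ops table a hypothetical driver
 * might register. Only .queue_rq is mandatory; the "example_" callbacks are
 * made-up names (see the ->queue_rq(), ->timeout() and ->map_queues()
 * sketches further below).
 *
 *	static const struct blk_mq_ops example_mq_ops = {
 *		.queue_rq	= example_queue_rq,
 *		.timeout	= example_timeout_rq,
 *		.map_queues	= example_map_queues,
 *	};
 */
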
enum {
	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
	BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
	/*
	 * Set when this device requires an underlying blk-mq device for
	 * completing IO.
	 */
	BLK_MQ_F_STACKING	= 1 << 2,
32bc15af 613 BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
1b792f2f 614 BLK_MQ_F_BLOCKING = 1 << 5,
90b71980 615 /* Do not allow an I/O scheduler to be configured. */
d3484991 616 BLK_MQ_F_NO_SCHED = 1 << 6,
90b71980
BVA
617 /*
618 * Select 'none' during queue registration in case of a single hwq
619 * or shared hwqs instead of 'mq-deadline'.
620 */
621 BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7,
24391c0d
SL
622 BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
623 BLK_MQ_F_ALLOC_POLICY_BITS = 1,
320ae51f 624
5d12f905 625 BLK_MQ_S_STOPPED = 0,
0d2602ca 626 BLK_MQ_S_TAG_ACTIVE = 1,
bd166ef1 627 BLK_MQ_S_SCHED_RESTART = 2,
320ae51f 628
bf0beec0
ML
629 /* hw queue is inactive after all its CPUs become offline */
630 BLK_MQ_S_INACTIVE = 3,
631
a4391c64 632 BLK_MQ_MAX_DEPTH = 10240,
506e931f
JA
633
634 BLK_MQ_CPU_WORK_BATCH = 8,
320ae51f 635};
24391c0d
SL
636#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
637 ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
638 ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
639#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
640 ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
641 << BLK_MQ_F_ALLOC_POLICY_START_BIT)
320ae51f 642
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
		struct lock_class_key *lkclass);
#define blk_mq_alloc_disk(set, queuedata)			\
({								\
	static struct lock_class_key __key;			\
								\
	__blk_mq_alloc_disk(set, queuedata, &__key);		\
})
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
		struct request_queue *q);
void blk_mq_unregister_dev(struct device *, struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int queue_depth,
		unsigned int set_flags);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

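/*
 * Example (illustrative sketch): the probe-time sequence a hypothetical
 * driver might use to describe a single-hw-queue device and create a disk
 * for it. Every "example_" name and every number is made up; a real driver
 * sizes the tag set from its hardware and fills in the gendisk before
 * add_disk().
 */
static inline int example_probe(struct blk_mq_tag_set *set,
				const struct blk_mq_ops *example_ops,
				void *example_drvdata)
{
	struct gendisk *disk;
	int ret;

	/* assumes *set was zero-initialized by the caller */
	set->ops = example_ops;
	set->nr_hw_queues = 1;			/* one hardware queue */
	set->queue_depth = 64;			/* tags per hardware queue */
	set->numa_node = NUMA_NO_NODE;
	set->cmd_size = 64;			/* per-request driver PDU bytes */
	set->flags = BLK_MQ_F_SHOULD_MERGE;

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	disk = blk_mq_alloc_disk(set, example_drvdata);
	if (IS_ERR(disk)) {
		blk_mq_free_tag_set(set);
		return PTR_ERR(disk);
	}

	/* ... a real driver would now set up and add_disk() the gendisk ... */
	return 0;
}
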
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

void blk_mq_free_request(struct request *rq);

bool blk_mq_queue_inflight(struct request_queue *q);

enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
	/* set RQF_PM */
	BLK_MQ_REQ_PM		= (__force blk_mq_req_flags_t)(1 << 2),
};

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		unsigned int op, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);

enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}

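/*
 * Example (illustrative sketch): a hypothetical ->timeout() handler built on
 * the helpers above. It gives the hardware one more interval unless the
 * request already completed; the "example_" name is made up.
 */
static inline enum blk_eh_timer_return example_timeout_rq(struct request *rq,
							  bool reserved)
{
	if (blk_mq_request_completed(rq))
		return BLK_EH_DONE;	/* raced with a normal completion */

	return BLK_EH_RESET_TIMER;	/* re-arm the timer and keep waiting */
}
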
/*
 * Set the state to complete when completing a request from inside ->queue_rq.
 * This is used by drivers that want to ensure special complete actions that
 * need access to the request are called on failure, e.g. by nvme for
 * multipathing.
 */
static inline void blk_mq_set_request_complete(struct request *rq)
{
	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
}

void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);

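/*
 * Example (illustrative sketch): the minimal shape of a ->queue_rq()
 * implementation for a hypothetical memory-backed driver that completes
 * requests inline. Real drivers usually hand the request to hardware here
 * and complete it later, e.g. from their IRQ handler via
 * blk_mq_complete_request().
 */
static inline blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
					    const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;

	blk_mq_start_request(rq);

	/* ... transfer blk_rq_bytes(rq) starting at sector blk_rq_pos(rq) ... */

	blk_mq_end_request(rq, BLK_STS_OK);
	return BLK_STS_OK;
}
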
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
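
/*
 * Example (illustrative sketch): the kind of callback a driver's error
 * handler might pass to blk_mq_tagset_busy_iter(), here just counting the
 * requests the driver currently owns. The "example_" name is made up.
 */
static inline bool example_count_inflight_rq(struct request *rq, void *data,
					     bool reserved)
{
	unsigned int *count = data;

	(*count)++;
	return true;	/* keep iterating over the tag set */
}
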
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout);

int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

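/*
 * Example (illustrative sketch): the simplest possible ->map_queues()
 * implementation for a hypothetical driver that does not use managed
 * interrupt affinity; it just spreads all CPUs over the default map.
 */
static inline int example_map_queues(struct blk_mq_tag_set *set)
{
	return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}
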
void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
	if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
	    test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
		return __blk_should_fake_timeout(q);
	return false;
}

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be cast
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract request
 * size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So add request to get
 * the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}

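/*
 * Example (illustrative sketch): the round trip between a request and a
 * hypothetical driver's per-command data. "struct example_cmd" is not part
 * of this header; its size would be what that driver puts in
 * blk_mq_tag_set.cmd_size.
 */
struct example_cmd {
	int example_status;
};

static inline struct example_cmd *example_rq_to_cmd(struct request *rq)
{
	return blk_mq_rq_to_pdu(rq);	/* PDU sits directly after the request */
}

static inline struct request *example_cmd_to_rq(struct example_cmd *cmd)
{
	return blk_mq_rq_from_pdu(cmd);
}
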
#define queue_for_each_hw_ctx(q, hctx, i)				\
	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)

#define hctx_for_each_ctx(hctx, ctx, i)					\
	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

SG
816static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
817 struct request *rq)
818{
819 if (rq->tag != -1)
820 return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);
821
822 return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
823 BLK_QC_T_INTERNAL;
824}
825
226b4fc7
ML
826static inline void blk_mq_cleanup_rq(struct request *rq)
827{
828 if (rq->q->mq_ops->cleanup_rq)
829 rq->q->mq_ops->cleanup_rq(rq);
830}
831
static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
		unsigned int nr_segs)
{
	rq->nr_phys_segments = nr_segs;
	rq->__data_len = bio->bi_iter.bi_size;
	rq->bio = rq->biotail = bio;
	rq->ioprio = bio_prio(bio);

	if (bio->bi_bdev)
		rq->rq_disk = bio->bi_bdev->bd_disk;
}

blk_qc_t blk_mq_submit_bio(struct bio *bio);
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key);

static inline bool rq_is_sync(struct request *rq)
{
	return op_is_sync(rq->cmd_flags);
}

void blk_rq_init(struct request_queue *q, struct request *rq);
void blk_put_request(struct request *rq);
struct request *blk_get_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags);
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
		struct bio_set *bs, gfp_t gfp_mask,
		int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
void blk_rq_unprep_clone(struct request *rq);
blk_status_t blk_insert_cloned_request(struct request_queue *q,
		struct request *rq);

struct rq_map_data {
	struct page **pages;
	int page_order;
	int nr_entries;
	unsigned long offset;
	int null_mapped;
	int from_user;
};

int blk_rq_map_user(struct request_queue *, struct request *,
		struct rq_map_data *, void __user *, unsigned long, gfp_t);
int blk_rq_map_user_iov(struct request_queue *, struct request *,
		struct rq_map_data *, const struct iov_iter *, gfp_t);
int blk_rq_unmap_user(struct bio *);
int blk_rq_map_kern(struct request_queue *, struct request *, void *,
		unsigned int, gfp_t);
int blk_rq_append_bio(struct request *rq, struct bio *bio);
void blk_execute_rq_nowait(struct gendisk *, struct request *, int,
		rq_end_io_fn *);
blk_status_t blk_execute_rq(struct gendisk *bd_disk, struct request *rq,
		int at_head);

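/*
 * Example (illustrative sketch): issuing a driver-private (passthrough)
 * request synchronously with the helpers above. The "example_" names are
 * made up; a real caller would fill in the command PDU before executing it
 * and pick a more specific error than BLK_STS_RESOURCE.
 */
static inline blk_status_t example_exec_passthrough(struct gendisk *disk,
						    struct request_queue *q)
{
	struct request *rq;
	blk_status_t status;

	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return BLK_STS_RESOURCE;

	/* ... set up the driver PDU via blk_mq_rq_to_pdu(rq) ... */

	status = blk_execute_rq(disk, rq, 0);	/* 0: do not insert at head */
	blk_put_request(rq);
	return status;
}
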
struct req_iterator {
	struct bvec_iter iter;
	struct bio *bio;
};

#define __rq_for_each_bio(_bio, rq)	\
	if ((rq->bio))			\
		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)

#define rq_for_each_segment(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_segment(bvl, _iter.bio, _iter.iter)

#define rq_for_each_bvec(bvl, _rq, _iter)			\
	__rq_for_each_bio(_iter.bio, _rq)			\
		bio_for_each_bvec(bvl, _iter.bio, _iter.iter)

#define rq_iter_last(bvec, _iter)				\
		(_iter.bio->bi_next == NULL &&			\
		 bio_iter_last(bvec, _iter.iter))

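/*
 * Example (illustrative sketch): walking a request's data segment by segment,
 * e.g. to copy it into a hypothetical memory-backed device. The "example_"
 * name is made up.
 */
static inline unsigned int example_rq_count_payload(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec bvec;
	unsigned int bytes = 0;

	rq_for_each_segment(bvec, rq, iter)
		bytes += bvec.bv_len;	/* data is at bvec.bv_page + bvec.bv_offset */
	return bytes;
}
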
/*
 * blk_rq_pos()			: the current sector
 * blk_rq_bytes()		: bytes left in the entire request
 * blk_rq_cur_bytes()		: bytes left in the current segment
 * blk_rq_err_bytes()		: bytes left till the next error boundary
 * blk_rq_sectors()		: sectors left in the entire request
 * blk_rq_cur_sectors()		: sectors left in the current segment
 * blk_rq_stats_sectors()	: sectors of the entire request used for stats
 */
static inline sector_t blk_rq_pos(const struct request *rq)
{
	return rq->__sector;
}

static inline unsigned int blk_rq_bytes(const struct request *rq)
{
	return rq->__data_len;
}

static inline int blk_rq_cur_bytes(const struct request *rq)
{
	return rq->bio ? bio_cur_bytes(rq->bio) : 0;
}

unsigned int blk_rq_err_bytes(const struct request *rq);

static inline unsigned int blk_rq_sectors(const struct request *rq)
{
	return blk_rq_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
{
	return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT;
}

static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
{
	return rq->stats_sectors;
}

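/*
 * Example (illustrative sketch): the sector just past the end of a request,
 * as a driver or scheduler might compute for contiguity checks. The
 * "example_" name is made up.
 */
static inline sector_t example_rq_end_sector(const struct request *rq)
{
	return blk_rq_pos(rq) + blk_rq_sectors(rq);
}
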
/*
 * Some commands like WRITE SAME have a payload or data transfer size which
 * is different from the size of the request. Any driver that supports such
 * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
 * calculate the data transfer size.
 */
static inline unsigned int blk_rq_payload_bytes(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec.bv_len;
	return blk_rq_bytes(rq);
}

/*
 * Return the first full biovec in the request. The caller needs to check that
 * there are any bvecs before calling this helper.
 */
static inline struct bio_vec req_bvec(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return rq->special_vec;
	return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);
}

static inline unsigned int blk_rq_count_bios(struct request *rq)
{
	unsigned int nr_bios = 0;
	struct bio *bio;

	__rq_for_each_bio(bio, rq)
		nr_bios++;

	return nr_bios;
}

void blk_steal_bios(struct bio_list *list, struct request *rq);

/*
 * Request completion related functions.
 *
 * blk_update_request() completes given number of bytes and updates
 * the request without completing it.
 */
bool blk_update_request(struct request *rq, blk_status_t error,
		unsigned int nr_bytes);
void blk_abort_request(struct request *);

/*
 * Number of physical segments as sent to the device.
 *
 * Normally this is the number of discontiguous data segments sent by the
 * submitter. But for data-less commands like discard we might have no
 * actual data segments submitted, but the driver might have to add its
 * own special payload. In that case we still return 1 here so that this
 * special payload will be mapped.
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
		return 1;
	return rq->nr_phys_segments;
}

/*
 * Number of discard segments (or ranges) the driver needs to fill in.
 * Each discard bio merged into a request is counted as one segment.
 */
static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
{
	return max_t(unsigned short, rq->nr_phys_segments, 1);
}

int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist, struct scatterlist **last_sg);
static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
		struct scatterlist *sglist)
{
	struct scatterlist *last_sg = NULL;

	return __blk_rq_map_sg(q, rq, sglist, &last_sg);
}
void blk_dump_rq_flags(struct request *, char *);

#ifdef CONFIG_BLK_DEV_ZONED
static inline unsigned int blk_rq_zone_no(struct request *rq)
{
	return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
}

static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
{
	return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
}

bool blk_req_needs_zone_write_lock(struct request *rq);
bool blk_req_zone_write_trylock(struct request *rq);
void __blk_req_zone_write_lock(struct request *rq);
void __blk_req_zone_write_unlock(struct request *rq);

static inline void blk_req_zone_write_lock(struct request *rq)
{
	if (blk_req_needs_zone_write_lock(rq))
		__blk_req_zone_write_lock(rq);
}

static inline void blk_req_zone_write_unlock(struct request *rq)
{
	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
		__blk_req_zone_write_unlock(rq);
}

static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
	return rq->q->seq_zones_wlock &&
		test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
}

static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
	if (!blk_req_needs_zone_write_lock(rq))
		return true;
	return !blk_req_zone_is_write_locked(rq);
}
#else /* CONFIG_BLK_DEV_ZONED */
static inline bool blk_req_needs_zone_write_lock(struct request *rq)
{
	return false;
}

static inline void blk_req_zone_write_lock(struct request *rq)
{
}

static inline void blk_req_zone_write_unlock(struct request *rq)
{
}
static inline bool blk_req_zone_is_write_locked(struct request *rq)
{
	return false;
}

static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
{
	return true;
}
#endif /* CONFIG_BLK_DEV_ZONED */

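/*
 * Example (illustrative sketch): how the zone write lock is typically taken
 * around dispatching a write to a sequential zone; the lock is dropped again
 * via blk_req_zone_write_unlock() on completion or requeue. The helpers
 * compile to no-ops without CONFIG_BLK_DEV_ZONED. The "example_" name is
 * made up.
 */
static inline bool example_prep_zoned_dispatch(struct request *rq)
{
	if (!blk_req_can_dispatch_to_zone(rq))
		return false;		/* another write owns the target zone */

	blk_req_zone_write_lock(rq);
	return true;
}
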
#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
# error	"You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
#endif
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
void rq_flush_dcache_pages(struct request *rq);
#else
static inline void rq_flush_dcache_pages(struct request *rq)
{
}
#endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */
#endif /* BLK_MQ_H */