block: remove generic_{start,end}_io_acct
[linux-block.git] / block / blk-core.c
3dcf60bc 1// SPDX-License-Identifier: GPL-2.0
1da177e4 2/*
1da177e4
LT
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
5 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
6 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
6728cb0e
JA
7 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
8 * - July 2000
1da177e4
LT
9 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
10 */
11
12/*
13 * This handles all read/write requests to block devices
14 */
1da177e4
LT
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/backing-dev.h>
18#include <linux/bio.h>
19#include <linux/blkdev.h>
320ae51f 20#include <linux/blk-mq.h>
1da177e4
LT
21#include <linux/highmem.h>
22#include <linux/mm.h>
23#include <linux/kernel_stat.h>
24#include <linux/string.h>
25#include <linux/init.h>
1da177e4
LT
26#include <linux/completion.h>
27#include <linux/slab.h>
28#include <linux/swap.h>
29#include <linux/writeback.h>
faccbd4b 30#include <linux/task_io_accounting_ops.h>
c17bb495 31#include <linux/fault-inject.h>
73c10101 32#include <linux/list_sort.h>
e3c78ca5 33#include <linux/delay.h>
aaf7c680 34#include <linux/ratelimit.h>
6c954667 35#include <linux/pm_runtime.h>
eea8f41c 36#include <linux/blk-cgroup.h>
54d4e6ab 37#include <linux/t10-pi.h>
18fbda91 38#include <linux/debugfs.h>
30abb3a6 39#include <linux/bpf.h>
b8e24a93 40#include <linux/psi.h>
71ac860a 41#include <linux/sched/sysctl.h>
a892c8d5 42#include <linux/blk-crypto.h>
55782138
LZ
43
44#define CREATE_TRACE_POINTS
45#include <trace/events/block.h>
1da177e4 46
8324aa91 47#include "blk.h"
43a5e4e2 48#include "blk-mq.h"
bd166ef1 49#include "blk-mq-sched.h"
bca6b067 50#include "blk-pm.h"
c1c80384 51#include "blk-rq-qos.h"
8324aa91 52
18fbda91
OS
53#ifdef CONFIG_DEBUG_FS
54struct dentry *blk_debugfs_root;
55#endif
56
d07335e5 57EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
b0da3f0d 58EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
0a82a8d1 59EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
3291fa57 60EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
cbae8d45 61EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
0bfc2455 62
a73f730d
TH
63DEFINE_IDA(blk_queue_ida);
64
1da177e4
LT
65/*
66 * For queue allocation
67 */
6728cb0e 68struct kmem_cache *blk_requestq_cachep;
1da177e4 69
1da177e4
LT
70/*
71 * Controlling structure to kblockd
72 */
ff856bad 73static struct workqueue_struct *kblockd_workqueue;
1da177e4 74
8814ce8a
BVA
75/**
76 * blk_queue_flag_set - atomically set a queue flag
77 * @flag: flag to be set
78 * @q: request queue
79 */
80void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
81{
57d74df9 82 set_bit(flag, &q->queue_flags);
8814ce8a
BVA
83}
84EXPORT_SYMBOL(blk_queue_flag_set);
85
86/**
87 * blk_queue_flag_clear - atomically clear a queue flag
88 * @flag: flag to be cleared
89 * @q: request queue
90 */
91void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
92{
57d74df9 93 clear_bit(flag, &q->queue_flags);
8814ce8a
BVA
94}
95EXPORT_SYMBOL(blk_queue_flag_clear);
96
97/**
98 * blk_queue_flag_test_and_set - atomically test and set a queue flag
99 * @flag: flag to be set
100 * @q: request queue
101 *
102 * Returns the previous value of @flag - 0 if the flag was not set and 1 if
103 * the flag was already set.
104 */
105bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
106{
57d74df9 107 return test_and_set_bit(flag, &q->queue_flags);
8814ce8a
BVA
108}
109EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
110
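/*
 * A minimal usage sketch of the queue-flag helpers above; the function
 * name is hypothetical, but QUEUE_FLAG_NOMERGES/NOXMERGES are real flags.
 */
static inline void example_disable_merging(struct request_queue *q)
{
	/* only do the extra work the first time the flag is set */
	if (!blk_queue_flag_test_and_set(QUEUE_FLAG_NOMERGES, q))
		blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
}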
2a4aa30c 111void blk_rq_init(struct request_queue *q, struct request *rq)
1da177e4 112{
1afb20f3
FT
113 memset(rq, 0, sizeof(*rq));
114
1da177e4 115 INIT_LIST_HEAD(&rq->queuelist);
63a71386 116 rq->q = q;
a2dec7b3 117 rq->__sector = (sector_t) -1;
2e662b65
JA
118 INIT_HLIST_NODE(&rq->hash);
119 RB_CLEAR_NODE(&rq->rb_node);
63a71386 120 rq->tag = -1;
bd166ef1 121 rq->internal_tag = -1;
522a7775 122 rq->start_time_ns = ktime_get_ns();
09e099d4 123 rq->part = NULL;
b554db14 124 refcount_set(&rq->ref, 1);
a892c8d5 125 blk_crypto_rq_set_defaults(rq);
1da177e4 126}
2a4aa30c 127EXPORT_SYMBOL(blk_rq_init);
1da177e4 128
e47bc4ed
CK
129#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
130static const char *const blk_op_name[] = {
131 REQ_OP_NAME(READ),
132 REQ_OP_NAME(WRITE),
133 REQ_OP_NAME(FLUSH),
134 REQ_OP_NAME(DISCARD),
135 REQ_OP_NAME(SECURE_ERASE),
136 REQ_OP_NAME(ZONE_RESET),
6e33dbf2 137 REQ_OP_NAME(ZONE_RESET_ALL),
6c1b1da5
AJ
138 REQ_OP_NAME(ZONE_OPEN),
139 REQ_OP_NAME(ZONE_CLOSE),
140 REQ_OP_NAME(ZONE_FINISH),
0512a75b 141 REQ_OP_NAME(ZONE_APPEND),
e47bc4ed
CK
142 REQ_OP_NAME(WRITE_SAME),
143 REQ_OP_NAME(WRITE_ZEROES),
144 REQ_OP_NAME(SCSI_IN),
145 REQ_OP_NAME(SCSI_OUT),
146 REQ_OP_NAME(DRV_IN),
147 REQ_OP_NAME(DRV_OUT),
148};
149#undef REQ_OP_NAME
150
151/**
152 * blk_op_str - Return the string XXX in REQ_OP_XXX.
153 * @op: REQ_OP_XXX.
154 *
155 * Description: Centralized block layer function to convert REQ_OP_XXX into
156 * string format. Useful for debugging and tracing bios or requests. For an
157 * invalid REQ_OP_XXX it returns the string "UNKNOWN".
158 */
159inline const char *blk_op_str(unsigned int op)
160{
161 const char *op_str = "UNKNOWN";
162
163 if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
164 op_str = blk_op_name[op];
165
166 return op_str;
167}
168EXPORT_SYMBOL_GPL(blk_op_str);
169
2a842aca
CH
170static const struct {
171 int errno;
172 const char *name;
173} blk_errors[] = {
174 [BLK_STS_OK] = { 0, "" },
175 [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" },
176 [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" },
177 [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" },
178 [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" },
179 [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" },
180 [BLK_STS_NEXUS] = { -EBADE, "critical nexus" },
181 [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
182 [BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
183 [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
86ff7c2a 184 [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
03a07c92 185 [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
2a842aca 186
4e4cbee9
CH
187 /* device mapper special case, should not leak out: */
188 [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
189
2a842aca
CH
190 /* everything else not covered above: */
191 [BLK_STS_IOERR] = { -EIO, "I/O" },
192};
193
194blk_status_t errno_to_blk_status(int errno)
195{
196 int i;
197
198 for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
199 if (blk_errors[i].errno == errno)
200 return (__force blk_status_t)i;
201 }
202
203 return BLK_STS_IOERR;
204}
205EXPORT_SYMBOL_GPL(errno_to_blk_status);
206
207int blk_status_to_errno(blk_status_t status)
208{
209 int idx = (__force int)status;
210
34bd9c1c 211 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
2a842aca
CH
212 return -EIO;
213 return blk_errors[idx].errno;
214}
215EXPORT_SYMBOL_GPL(blk_status_to_errno);
216
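/*
 * A minimal usage sketch (hypothetical helper, not a kernel symbol): a
 * driver translating an errno from its transport layer into a
 * blk_status_t before completing a request.  Errnos without an entry in
 * blk_errors[] fall back to BLK_STS_IOERR.
 */
static inline void example_complete_rq(struct request *rq, int err)
{
	/* e.g. -ETIMEDOUT maps to BLK_STS_TIMEOUT, 0 to BLK_STS_OK */
	blk_mq_end_request(rq, errno_to_blk_status(err));
}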
178cc590
CH
217static void print_req_error(struct request *req, blk_status_t status,
218 const char *caller)
2a842aca
CH
219{
220 int idx = (__force int)status;
221
34bd9c1c 222 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
2a842aca
CH
223 return;
224
178cc590 225 printk_ratelimited(KERN_ERR
b0e5168a
CK
226 "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
227 "phys_seg %u prio class %u\n",
178cc590 228 caller, blk_errors[idx].name,
b0e5168a
CK
229 req->rq_disk ? req->rq_disk->disk_name : "?",
230 blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
231 req->cmd_flags & ~REQ_OP_MASK,
232 req->nr_phys_segments,
233 IOPRIO_PRIO_CLASS(req->ioprio));
2a842aca
CH
234}
235
5bb23a68 236static void req_bio_endio(struct request *rq, struct bio *bio,
2a842aca 237 unsigned int nbytes, blk_status_t error)
1da177e4 238{
78d8e58a 239 if (error)
4e4cbee9 240 bio->bi_status = error;
797e7dbb 241
e8064021 242 if (unlikely(rq->rq_flags & RQF_QUIET))
b7c44ed9 243 bio_set_flag(bio, BIO_QUIET);
08bafc03 244
f79ea416 245 bio_advance(bio, nbytes);
7ba1ba12 246
0512a75b
KB
247 if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
248 /*
249 * Partial zone append completions cannot be supported as the
250 * BIO fragments may end up not being written sequentially.
251 */
252 if (bio->bi_iter.bi_size)
253 bio->bi_status = BLK_STS_IOERR;
254 else
255 bio->bi_iter.bi_sector = rq->__sector;
256 }
257
143a87f4 258 /* don't actually finish bio if it's part of flush sequence */
e8064021 259 if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
4246a0b6 260 bio_endio(bio);
1da177e4 261}
1da177e4 262
1da177e4
LT
263void blk_dump_rq_flags(struct request *rq, char *msg)
264{
aebf526b
CH
265 printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
266 rq->rq_disk ? rq->rq_disk->disk_name : "?",
5953316d 267 (unsigned long long) rq->cmd_flags);
1da177e4 268
83096ebf
TH
269 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
270 (unsigned long long)blk_rq_pos(rq),
271 blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
b4f42e28
JA
272 printk(KERN_INFO " bio %p, biotail %p, len %u\n",
273 rq->bio, rq->biotail, blk_rq_bytes(rq));
1da177e4 274}
1da177e4
LT
275EXPORT_SYMBOL(blk_dump_rq_flags);
276
1da177e4
LT
277/**
278 * blk_sync_queue - cancel any pending callbacks on a queue
279 * @q: the queue
280 *
281 * Description:
282 * The block layer may perform asynchronous callback activity
283 * on a queue, such as calling the unplug function after a timeout.
284 * A block device may call blk_sync_queue to ensure that any
285 * such activity is cancelled, thus allowing it to release resources
59c51591 286 * that the callbacks might use. The caller must already have made sure
1da177e4
LT
287 * that its ->make_request_fn will not re-add plugging prior to calling
288 * this function.
289 *
da527770 290 * This function does not cancel any asynchronous activity arising
da3dae54 291 * out of elevator or throttling code. That would require elevator_exit()
5efd6113 292 * and blkcg_exit_queue() to be called with queue lock initialized.
da527770 293 *
1da177e4
LT
294 */
295void blk_sync_queue(struct request_queue *q)
296{
70ed28b9 297 del_timer_sync(&q->timeout);
4e9b6f20 298 cancel_work_sync(&q->timeout_work);
1da177e4
LT
299}
300EXPORT_SYMBOL(blk_sync_queue);
301
c9254f2d 302/**
cd84a62e 303 * blk_set_pm_only - increment pm_only counter
c9254f2d 304 * @q: request queue pointer
c9254f2d 305 */
cd84a62e 306void blk_set_pm_only(struct request_queue *q)
c9254f2d 307{
cd84a62e 308 atomic_inc(&q->pm_only);
c9254f2d 309}
cd84a62e 310EXPORT_SYMBOL_GPL(blk_set_pm_only);
c9254f2d 311
cd84a62e 312void blk_clear_pm_only(struct request_queue *q)
c9254f2d 313{
cd84a62e
BVA
314 int pm_only;
315
316 pm_only = atomic_dec_return(&q->pm_only);
317 WARN_ON_ONCE(pm_only < 0);
318 if (pm_only == 0)
319 wake_up_all(&q->mq_freeze_wq);
c9254f2d 320}
cd84a62e 321EXPORT_SYMBOL_GPL(blk_clear_pm_only);
c9254f2d 322
165125e1 323void blk_put_queue(struct request_queue *q)
483f4afc
AV
324{
325 kobject_put(&q->kobj);
326}
d86e0e83 327EXPORT_SYMBOL(blk_put_queue);
483f4afc 328
aed3ea94
JA
329void blk_set_queue_dying(struct request_queue *q)
330{
8814ce8a 331 blk_queue_flag_set(QUEUE_FLAG_DYING, q);
aed3ea94 332
d3cfb2a0
ML
333 /*
334 * When the queue DYING flag is set, we need to block new requests
335 * from entering the queue, so call blk_freeze_queue_start() to
336 * prevent I/O from crossing blk_queue_enter().
337 */
338 blk_freeze_queue_start(q);
339
344e9ffc 340 if (queue_is_mq(q))
aed3ea94 341 blk_mq_wake_waiters(q);
055f6e18
ML
342
343 /* Make blk_queue_enter() reexamine the DYING flag. */
344 wake_up_all(&q->mq_freeze_wq);
aed3ea94
JA
345}
346EXPORT_SYMBOL_GPL(blk_set_queue_dying);
347
c9a929dd
TH
348/**
349 * blk_cleanup_queue - shutdown a request queue
350 * @q: request queue to shutdown
351 *
c246e80d
BVA
352 * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
353 * put it. All future requests will be failed immediately with -ENODEV.
c94a96ac 354 */
6728cb0e 355void blk_cleanup_queue(struct request_queue *q)
483f4afc 356{
bae85c15
BVA
357 WARN_ON_ONCE(blk_queue_registered(q));
358
3f3299d5 359 /* mark @q DYING, no new request or merges will be allowed afterwards */
aed3ea94 360 blk_set_queue_dying(q);
6ecf23af 361
57d74df9
CH
362 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
363 blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
c9a929dd 364
c246e80d
BVA
365 /*
366 * Drain all requests queued before DYING marking. Set DEAD flag to
67ed8b73
BVA
367 * prevent blk_mq_run_hw_queues() from accessing the hardware queues
368 * after draining has finished.
c246e80d 369 */
3ef28e83 370 blk_freeze_queue(q);
c57cdf7a
ML
371
372 rq_qos_exit(q);
373
57d74df9 374 blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
c9a929dd 375
5a48fc14
DW
376 /* for synchronous bio-based driver finish in-flight integrity i/o */
377 blk_flush_integrity();
378
c9a929dd 379 /* @q won't process any more request, flush async actions */
dc3b17cc 380 del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
c9a929dd
TH
381 blk_sync_queue(q);
382
344e9ffc 383 if (queue_is_mq(q))
c7e2d94b 384 blk_mq_exit_queue(q);
a1ce35fa 385
c3e22192
ML
386 /*
387 * In theory, the request pool of sched_tags belongs to the request queue.
388 * However, the current implementation requires tag_set for freeing
389 * requests, so free the pool now.
390 *
391 * The queue has become frozen; there can't be any in-queue requests, so
392 * it is safe to free requests now.
393 */
394 mutex_lock(&q->sysfs_lock);
395 if (q->elevator)
396 blk_mq_sched_free_requests(q);
397 mutex_unlock(&q->sysfs_lock);
398
3ef28e83 399 percpu_ref_exit(&q->q_usage_counter);
45a9c9d9 400
c9a929dd 401 /* @q is and will stay empty, shutdown and put */
483f4afc
AV
402 blk_put_queue(q);
403}
1da177e4
LT
404EXPORT_SYMBOL(blk_cleanup_queue);
405
3a0a5299
BVA
406/**
407 * blk_queue_enter() - try to increase q->q_usage_counter
408 * @q: request queue pointer
409 * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
410 */
9a95e4ef 411int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
3ef28e83 412{
cd84a62e 413 const bool pm = flags & BLK_MQ_REQ_PREEMPT;
3a0a5299 414
3ef28e83 415 while (true) {
3a0a5299 416 bool success = false;
3ef28e83 417
818e0fa2 418 rcu_read_lock();
3a0a5299
BVA
419 if (percpu_ref_tryget_live(&q->q_usage_counter)) {
420 /*
cd84a62e
BVA
421 * The code that increments the pm_only counter is
422 * responsible for ensuring that that counter is
423 * globally visible before the queue is unfrozen.
3a0a5299 424 */
cd84a62e 425 if (pm || !blk_queue_pm_only(q)) {
3a0a5299
BVA
426 success = true;
427 } else {
428 percpu_ref_put(&q->q_usage_counter);
429 }
430 }
818e0fa2 431 rcu_read_unlock();
3a0a5299
BVA
432
433 if (success)
3ef28e83
DW
434 return 0;
435
3a0a5299 436 if (flags & BLK_MQ_REQ_NOWAIT)
3ef28e83
DW
437 return -EBUSY;
438
5ed61d3f 439 /*
1671d522 440 * This is the read pair of the barrier in blk_freeze_queue_start();
5ed61d3f 441 * we need to order reading __PERCPU_REF_DEAD flag of
d3cfb2a0
ML
442 * .q_usage_counter and reading .mq_freeze_depth or
443 * queue dying flag, otherwise the following wait may
444 * never return if the two reads are reordered.
5ed61d3f
ML
445 */
446 smp_rmb();
447
1dc3039b 448 wait_event(q->mq_freeze_wq,
7996a8b5 449 (!q->mq_freeze_depth &&
0d25bd07
BVA
450 (pm || (blk_pm_request_resume(q),
451 !blk_queue_pm_only(q)))) ||
1dc3039b 452 blk_queue_dying(q));
3ef28e83
DW
453 if (blk_queue_dying(q))
454 return -ENODEV;
3ef28e83
DW
455 }
456}
457
accea322
CH
458static inline int bio_queue_enter(struct bio *bio)
459{
460 struct request_queue *q = bio->bi_disk->queue;
461 bool nowait = bio->bi_opf & REQ_NOWAIT;
462 int ret;
463
464 ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0);
465 if (unlikely(ret)) {
466 if (nowait && !blk_queue_dying(q))
467 bio_wouldblock_error(bio);
468 else
469 bio_io_error(bio);
470 }
471
472 return ret;
473}
474
3ef28e83
DW
475void blk_queue_exit(struct request_queue *q)
476{
477 percpu_ref_put(&q->q_usage_counter);
478}
479
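/*
 * A minimal sketch of the usual enter/exit pairing around code that must
 * not race with queue freezing or teardown; the function name is
 * hypothetical.
 */
static int example_with_queue_ref(struct request_queue *q)
{
	int ret;

	ret = blk_queue_enter(q, BLK_MQ_REQ_NOWAIT);
	if (ret)
		return ret;	/* -EBUSY (frozen/pm_only) or -ENODEV (dying) */
	/* ... the queue cannot be frozen or torn down here ... */
	blk_queue_exit(q);
	return 0;
}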
480static void blk_queue_usage_counter_release(struct percpu_ref *ref)
481{
482 struct request_queue *q =
483 container_of(ref, struct request_queue, q_usage_counter);
484
485 wake_up_all(&q->mq_freeze_wq);
486}
487
bca237a5 488static void blk_rq_timed_out_timer(struct timer_list *t)
287922eb 489{
bca237a5 490 struct request_queue *q = from_timer(q, t, timeout);
287922eb
CH
491
492 kblockd_schedule_work(&q->timeout_work);
493}
494
2e3c18d0
TH
495static void blk_timeout_work(struct work_struct *work)
496{
497}
498
3d745ea5 499struct request_queue *__blk_alloc_queue(int node_id)
1946089a 500{
165125e1 501 struct request_queue *q;
338aa96d 502 int ret;
1946089a 503
8324aa91 504 q = kmem_cache_alloc_node(blk_requestq_cachep,
3d745ea5 505 GFP_KERNEL | __GFP_ZERO, node_id);
1da177e4
LT
506 if (!q)
507 return NULL;
508
cbf62af3 509 q->last_merge = NULL;
cbf62af3 510
3d745ea5 511 q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
a73f730d 512 if (q->id < 0)
3d2936f4 513 goto fail_q;
a73f730d 514
338aa96d
KO
515 ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
516 if (ret)
54efd50b
KO
517 goto fail_id;
518
aef33c2f 519 q->backing_dev_info = bdi_alloc(node_id);
d03f6cdc
JK
520 if (!q->backing_dev_info)
521 goto fail_split;
522
a83b576c
JA
523 q->stats = blk_alloc_queue_stats();
524 if (!q->stats)
525 goto fail_stats;
526
b5420237 527 q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
dc3b17cc 528 q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
5151412d 529 q->node = node_id;
0989a025 530
bca237a5
KC
531 timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
532 laptop_mode_timer_fn, 0);
533 timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
2e3c18d0 534 INIT_WORK(&q->timeout_work, blk_timeout_work);
a612fddf 535 INIT_LIST_HEAD(&q->icq_list);
4eef3049 536#ifdef CONFIG_BLK_CGROUP
e8989fae 537 INIT_LIST_HEAD(&q->blkg_list);
4eef3049 538#endif
483f4afc 539
8324aa91 540 kobject_init(&q->kobj, &blk_queue_ktype);
1da177e4 541
5acb3cc2
WL
542#ifdef CONFIG_BLK_DEV_IO_TRACE
543 mutex_init(&q->blk_trace_mutex);
544#endif
483f4afc 545 mutex_init(&q->sysfs_lock);
cecf5d87 546 mutex_init(&q->sysfs_dir_lock);
0d945c1f 547 spin_lock_init(&q->queue_lock);
c94a96ac 548
320ae51f 549 init_waitqueue_head(&q->mq_freeze_wq);
7996a8b5 550 mutex_init(&q->mq_freeze_lock);
320ae51f 551
3ef28e83
DW
552 /*
553 * Init percpu_ref in atomic mode so that it's faster to shutdown.
554 * See blk_register_queue() for details.
555 */
556 if (percpu_ref_init(&q->q_usage_counter,
557 blk_queue_usage_counter_release,
558 PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
fff4996b 559 goto fail_bdi;
f51b802c 560
3ef28e83
DW
561 if (blkcg_init_queue(q))
562 goto fail_ref;
563
3d745ea5
CH
564 blk_queue_dma_alignment(q, 511);
565 blk_set_default_limits(&q->limits);
566
1da177e4 567 return q;
a73f730d 568
3ef28e83
DW
569fail_ref:
570 percpu_ref_exit(&q->q_usage_counter);
fff4996b 571fail_bdi:
a83b576c
JA
572 blk_free_queue_stats(q->stats);
573fail_stats:
d03f6cdc 574 bdi_put(q->backing_dev_info);
54efd50b 575fail_split:
338aa96d 576 bioset_exit(&q->bio_split);
a73f730d
TH
577fail_id:
578 ida_simple_remove(&blk_queue_ida, q->id);
579fail_q:
580 kmem_cache_free(blk_requestq_cachep, q);
581 return NULL;
1da177e4 582}
3d745ea5
CH
583
584struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id)
585{
586 struct request_queue *q;
587
588 if (WARN_ON_ONCE(!make_request))
654a3667 589 return NULL;
3d745ea5
CH
590
591 q = __blk_alloc_queue(node_id);
592 if (!q)
593 return NULL;
594 q->make_request_fn = make_request;
595 q->nr_requests = BLKDEV_MAX_RQ;
596 return q;
597}
598EXPORT_SYMBOL(blk_alloc_queue);
1da177e4 599
09ac46c4 600bool blk_get_queue(struct request_queue *q)
1da177e4 601{
3f3299d5 602 if (likely(!blk_queue_dying(q))) {
09ac46c4
TH
603 __blk_get_queue(q);
604 return true;
1da177e4
LT
605 }
606
09ac46c4 607 return false;
1da177e4 608}
d86e0e83 609EXPORT_SYMBOL(blk_get_queue);
1da177e4 610
a1ce35fa
JA
611/**
612 * blk_get_request - allocate a request
613 * @q: request queue to allocate a request for
614 * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
615 * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
1da177e4 616 */
a1ce35fa
JA
617struct request *blk_get_request(struct request_queue *q, unsigned int op,
618 blk_mq_req_flags_t flags)
1da177e4 619{
a1ce35fa 620 struct request *req;
1da177e4 621
a1ce35fa
JA
622 WARN_ON_ONCE(op & REQ_NOWAIT);
623 WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
1da177e4 624
a1ce35fa
JA
625 req = blk_mq_alloc_request(q, op, flags);
626 if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
627 q->mq_ops->initialize_rq_fn(req);
1da177e4 628
a1ce35fa 629 return req;
1da177e4 630}
a1ce35fa 631EXPORT_SYMBOL(blk_get_request);
1da177e4 632
1da177e4
LT
633void blk_put_request(struct request *req)
634{
a1ce35fa 635 blk_mq_free_request(req);
1da177e4 636}
1da177e4
LT
637EXPORT_SYMBOL(blk_put_request);
638
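/*
 * A minimal sketch of allocating and freeing a passthrough request, e.g.
 * for a driver-private command; the function name is hypothetical.
 */
static int example_send_private_cmd(struct request_queue *q)
{
	struct request *rq;

	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);
	/* ... fill in the driver-private payload and execute it ... */
	blk_put_request(rq);
	return 0;
}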
14ccb66b
CH
639bool bio_attempt_back_merge(struct request *req, struct bio *bio,
640 unsigned int nr_segs)
73c10101 641{
1eff9d32 642 const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
73c10101 643
14ccb66b 644 if (!ll_back_merge_fn(req, bio, nr_segs))
73c10101
JA
645 return false;
646
14ccb66b 647 trace_block_bio_backmerge(req->q, req, bio);
d3e65fff 648 rq_qos_merge(req->q, req, bio);
73c10101
JA
649
650 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
651 blk_rq_set_mixed_merge(req);
652
653 req->biotail->bi_next = bio;
654 req->biotail = bio;
4f024f37 655 req->__data_len += bio->bi_iter.bi_size;
73c10101 656
a892c8d5
ST
657 bio_crypt_free_ctx(bio);
658
320ae51f 659 blk_account_io_start(req, false);
73c10101
JA
660 return true;
661}
662
14ccb66b
CH
663bool bio_attempt_front_merge(struct request *req, struct bio *bio,
664 unsigned int nr_segs)
73c10101 665{
1eff9d32 666 const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
73c10101 667
14ccb66b 668 if (!ll_front_merge_fn(req, bio, nr_segs))
73c10101
JA
669 return false;
670
14ccb66b 671 trace_block_bio_frontmerge(req->q, req, bio);
d3e65fff 672 rq_qos_merge(req->q, req, bio);
73c10101
JA
673
674 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
675 blk_rq_set_mixed_merge(req);
676
73c10101
JA
677 bio->bi_next = req->bio;
678 req->bio = bio;
679
4f024f37
KO
680 req->__sector = bio->bi_iter.bi_sector;
681 req->__data_len += bio->bi_iter.bi_size;
73c10101 682
a892c8d5
ST
683 bio_crypt_do_front_merge(req, bio);
684
320ae51f 685 blk_account_io_start(req, false);
73c10101
JA
686 return true;
687}
688
1e739730
CH
689bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
690 struct bio *bio)
691{
692 unsigned short segments = blk_rq_nr_discard_segments(req);
693
694 if (segments >= queue_max_discard_segments(q))
695 goto no_merge;
696 if (blk_rq_sectors(req) + bio_sectors(bio) >
697 blk_rq_get_max_sectors(req, blk_rq_pos(req)))
698 goto no_merge;
699
d3e65fff
TH
700 rq_qos_merge(q, req, bio);
701
1e739730
CH
702 req->biotail->bi_next = bio;
703 req->biotail = bio;
704 req->__data_len += bio->bi_iter.bi_size;
1e739730
CH
705 req->nr_phys_segments = segments + 1;
706
707 blk_account_io_start(req, false);
708 return true;
709no_merge:
710 req_set_nomerge(q, req);
711 return false;
712}
713
bd87b589 714/**
320ae51f 715 * blk_attempt_plug_merge - try to merge with %current's plugged list
bd87b589
TH
716 * @q: request_queue new bio is being queued at
717 * @bio: new bio being queued
14ccb66b 718 * @nr_segs: number of segments in @bio
ccc2600b
RD
719 * @same_queue_rq: pointer to &struct request that gets filled in when
720 * another request associated with @q is found on the plug list
721 * (optional, may be %NULL)
bd87b589
TH
722 *
723 * Determine whether @bio being queued on @q can be merged with a request
724 * on %current's plugged list. Returns %true if merge was successful,
725 * otherwise %false.
726 *
07c2bd37
TH
727 * Plugging coalesces IOs from the same issuer for the same purpose without
728 * going through @q->queue_lock. As such it's more of an issuing mechanism
729 * than scheduling, and the request, while it may have elvpriv data, is not
730 * added to the elevator at this point. In addition, we don't have
731 * reliable access to the elevator outside queue lock. Only check basic
732 * merging parameters without querying the elevator.
da41a589
RE
733 *
734 * Caller must ensure !blk_queue_nomerges(q) beforehand.
73c10101 735 */
320ae51f 736bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
14ccb66b 737 unsigned int nr_segs, struct request **same_queue_rq)
73c10101
JA
738{
739 struct blk_plug *plug;
740 struct request *rq;
92f399c7 741 struct list_head *plug_list;
73c10101 742
b49773e7 743 plug = blk_mq_plug(q, bio);
73c10101 744 if (!plug)
34fe7c05 745 return false;
73c10101 746
a1ce35fa 747 plug_list = &plug->mq_list;
92f399c7
SL
748
749 list_for_each_entry_reverse(rq, plug_list, queuelist) {
34fe7c05 750 bool merged = false;
73c10101 751
5f0ed774 752 if (rq->q == q && same_queue_rq) {
5b3f341f
SL
753 /*
754 * Only the blk-mq multiple hardware queues case checks for an
755 * rq in the same queue; there should be only one such
756 * rq in a queue.
757 */
5f0ed774 758 *same_queue_rq = rq;
5b3f341f 759 }
56ebdaf2 760
07c2bd37 761 if (rq->q != q || !blk_rq_merge_ok(rq, bio))
73c10101
JA
762 continue;
763
34fe7c05
CH
764 switch (blk_try_merge(rq, bio)) {
765 case ELEVATOR_BACK_MERGE:
14ccb66b 766 merged = bio_attempt_back_merge(rq, bio, nr_segs);
34fe7c05
CH
767 break;
768 case ELEVATOR_FRONT_MERGE:
14ccb66b 769 merged = bio_attempt_front_merge(rq, bio, nr_segs);
34fe7c05 770 break;
1e739730
CH
771 case ELEVATOR_DISCARD_MERGE:
772 merged = bio_attempt_discard_merge(q, rq, bio);
773 break;
34fe7c05
CH
774 default:
775 break;
73c10101 776 }
34fe7c05
CH
777
778 if (merged)
779 return true;
73c10101 780 }
34fe7c05
CH
781
782 return false;
73c10101
JA
783}
784
52c5e62d 785static void handle_bad_sector(struct bio *bio, sector_t maxsector)
1da177e4
LT
786{
787 char b[BDEVNAME_SIZE];
788
789 printk(KERN_INFO "attempt to access beyond end of device\n");
6296b960 790 printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n",
74d46992 791 bio_devname(bio, b), bio->bi_opf,
f73a1c7d 792 (unsigned long long)bio_end_sector(bio),
52c5e62d 793 (long long)maxsector);
1da177e4
LT
794}
795
c17bb495
AM
796#ifdef CONFIG_FAIL_MAKE_REQUEST
797
798static DECLARE_FAULT_ATTR(fail_make_request);
799
800static int __init setup_fail_make_request(char *str)
801{
802 return setup_fault_attr(&fail_make_request, str);
803}
804__setup("fail_make_request=", setup_fail_make_request);
805
b2c9cd37 806static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
c17bb495 807{
b2c9cd37 808 return part->make_it_fail && should_fail(&fail_make_request, bytes);
c17bb495
AM
809}
810
811static int __init fail_make_request_debugfs(void)
812{
dd48c085
AM
813 struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
814 NULL, &fail_make_request);
815
21f9fcd8 816 return PTR_ERR_OR_ZERO(dir);
c17bb495
AM
817}
818
819late_initcall(fail_make_request_debugfs);
820
821#else /* CONFIG_FAIL_MAKE_REQUEST */
822
b2c9cd37
AM
823static inline bool should_fail_request(struct hd_struct *part,
824 unsigned int bytes)
c17bb495 825{
b2c9cd37 826 return false;
c17bb495
AM
827}
828
829#endif /* CONFIG_FAIL_MAKE_REQUEST */
830
721c7fc7
ID
831static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
832{
b089cfd9
JA
833 const int op = bio_op(bio);
834
8b2ded1c 835 if (part->policy && op_is_write(op)) {
721c7fc7
ID
836 char b[BDEVNAME_SIZE];
837
8b2ded1c
MP
838 if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
839 return false;
840
a32e236e 841 WARN_ONCE(1,
721c7fc7
ID
842 "generic_make_request: Trying to write "
843 "to read-only block-device %s (partno %d)\n",
844 bio_devname(bio, b), part->partno);
a32e236e
LT
845 /* Older lvm-tools actually trigger this */
846 return false;
721c7fc7
ID
847 }
848
849 return false;
850}
851
30abb3a6
HM
852static noinline int should_fail_bio(struct bio *bio)
853{
854 if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
855 return -EIO;
856 return 0;
857}
858ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
859
52c5e62d
CH
860/*
861 * Check whether this bio extends beyond the end of the device or partition.
862 * This may well happen - the kernel calls bread() without checking the size of
863 * the device, e.g., when mounting a file system.
864 */
865static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
866{
867 unsigned int nr_sectors = bio_sectors(bio);
868
869 if (nr_sectors && maxsector &&
870 (nr_sectors > maxsector ||
871 bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
872 handle_bad_sector(bio, maxsector);
873 return -EIO;
874 }
875 return 0;
876}
877
74d46992
CH
878/*
879 * Remap block n of partition p to block n+start(p) of the disk.
880 */
881static inline int blk_partition_remap(struct bio *bio)
882{
883 struct hd_struct *p;
52c5e62d 884 int ret = -EIO;
74d46992 885
721c7fc7
ID
886 rcu_read_lock();
887 p = __disk_get_part(bio->bi_disk, bio->bi_partno);
52c5e62d
CH
888 if (unlikely(!p))
889 goto out;
890 if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
891 goto out;
892 if (unlikely(bio_check_ro(bio, p)))
721c7fc7 893 goto out;
721c7fc7 894
5eac3eb3 895 if (bio_sectors(bio)) {
52c5e62d
CH
896 if (bio_check_eod(bio, part_nr_sects_read(p)))
897 goto out;
898 bio->bi_iter.bi_sector += p->start_sect;
52c5e62d
CH
899 trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
900 bio->bi_iter.bi_sector - p->start_sect);
901 }
c04fa44b 902 bio->bi_partno = 0;
52c5e62d 903 ret = 0;
721c7fc7
ID
904out:
905 rcu_read_unlock();
74d46992
CH
906 return ret;
907}
908
0512a75b
KB
909/*
910 * Check write append to a zoned block device.
911 */
912static inline blk_status_t blk_check_zone_append(struct request_queue *q,
913 struct bio *bio)
914{
915 sector_t pos = bio->bi_iter.bi_sector;
916 int nr_sectors = bio_sectors(bio);
917
918 /* Only applicable to zoned block devices */
919 if (!blk_queue_is_zoned(q))
920 return BLK_STS_NOTSUPP;
921
922 /* The bio sector must point to the start of a sequential zone */
923 if (pos & (blk_queue_zone_sectors(q) - 1) ||
924 !blk_queue_zone_is_seq(q, pos))
925 return BLK_STS_IOERR;
926
927 /*
928 * Not allowed to cross zone boundaries. Otherwise, the BIO will be
929 * split and could result in non-contiguous sectors being written in
930 * different zones.
931 */
932 if (nr_sectors > q->limits.chunk_sectors)
933 return BLK_STS_IOERR;
934
935 /* Make sure the BIO is small enough and will not get split */
936 if (nr_sectors > q->limits.max_zone_append_sectors)
937 return BLK_STS_IOERR;
938
939 bio->bi_opf |= REQ_NOMERGE;
940
941 return BLK_STS_OK;
942}
943
27a84d54
CH
944static noinline_for_stack bool
945generic_make_request_checks(struct bio *bio)
1da177e4 946{
165125e1 947 struct request_queue *q;
5a7bbad2 948 int nr_sectors = bio_sectors(bio);
4e4cbee9 949 blk_status_t status = BLK_STS_IOERR;
5a7bbad2 950 char b[BDEVNAME_SIZE];
1da177e4
LT
951
952 might_sleep();
1da177e4 953
74d46992 954 q = bio->bi_disk->queue;
5a7bbad2
CH
955 if (unlikely(!q)) {
956 printk(KERN_ERR
957 "generic_make_request: Trying to access "
958 "nonexistent block-device %s (%Lu)\n",
74d46992 959 bio_devname(bio, b), (long long)bio->bi_iter.bi_sector);
5a7bbad2
CH
960 goto end_io;
961 }
c17bb495 962
03a07c92 963 /*
c58c1f83
RP
964 * Non-mq queues do not honor REQ_NOWAIT, so complete a bio
965 * with BLK_STS_AGAIN status in order to catch -EAGAIN and
966 * to give the caller a chance to repeat the request gracefully.
03a07c92 967 */
c58c1f83
RP
968 if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q)) {
969 status = BLK_STS_AGAIN;
970 goto end_io;
971 }
03a07c92 972
30abb3a6 973 if (should_fail_bio(bio))
5a7bbad2 974 goto end_io;
2056a782 975
52c5e62d
CH
976 if (bio->bi_partno) {
977 if (unlikely(blk_partition_remap(bio)))
721c7fc7
ID
978 goto end_io;
979 } else {
52c5e62d
CH
980 if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
981 goto end_io;
982 if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
721c7fc7
ID
983 goto end_io;
984 }
2056a782 985
5a7bbad2
CH
986 /*
987 * Filter flush bio's early so that make_request based
988 * drivers without flush support don't have to worry
989 * about them.
990 */
f3a8ab7d 991 if (op_is_flush(bio->bi_opf) &&
c888a8f9 992 !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
1eff9d32 993 bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
5a7bbad2 994 if (!nr_sectors) {
4e4cbee9 995 status = BLK_STS_OK;
51fd77bd
JA
996 goto end_io;
997 }
5a7bbad2 998 }
5ddfe969 999
d04c406f
CH
1000 if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
1001 bio->bi_opf &= ~REQ_HIPRI;
1002
288dab8a
CH
1003 switch (bio_op(bio)) {
1004 case REQ_OP_DISCARD:
1005 if (!blk_queue_discard(q))
1006 goto not_supported;
1007 break;
1008 case REQ_OP_SECURE_ERASE:
1009 if (!blk_queue_secure_erase(q))
1010 goto not_supported;
1011 break;
1012 case REQ_OP_WRITE_SAME:
74d46992 1013 if (!q->limits.max_write_same_sectors)
288dab8a 1014 goto not_supported;
58886785 1015 break;
0512a75b
KB
1016 case REQ_OP_ZONE_APPEND:
1017 status = blk_check_zone_append(q, bio);
1018 if (status != BLK_STS_OK)
1019 goto end_io;
1020 break;
2d253440 1021 case REQ_OP_ZONE_RESET:
6c1b1da5
AJ
1022 case REQ_OP_ZONE_OPEN:
1023 case REQ_OP_ZONE_CLOSE:
1024 case REQ_OP_ZONE_FINISH:
74d46992 1025 if (!blk_queue_is_zoned(q))
2d253440 1026 goto not_supported;
288dab8a 1027 break;
6e33dbf2
CK
1028 case REQ_OP_ZONE_RESET_ALL:
1029 if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
1030 goto not_supported;
1031 break;
a6f0788e 1032 case REQ_OP_WRITE_ZEROES:
74d46992 1033 if (!q->limits.max_write_zeroes_sectors)
a6f0788e
CK
1034 goto not_supported;
1035 break;
288dab8a
CH
1036 default:
1037 break;
5a7bbad2 1038 }
01edede4 1039
7f4b35d1 1040 /*
3e82c348
CH
1041 * Various block parts want %current->io_context, so allocate it up
1042 * front rather than dealing with lots of pain to allocate it only
1043 * where needed. This may fail and the block layer knows how to live
1044 * with it.
7f4b35d1 1045 */
3e82c348
CH
1046 if (unlikely(!current->io_context))
1047 create_task_io_context(current, GFP_ATOMIC, q->node);
7f4b35d1 1048
ae118896
TH
1049 if (!blkcg_bio_issue_check(q, bio))
1050 return false;
27a84d54 1051
fbbaf700
N
1052 if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
1053 trace_block_bio_queue(q, bio);
1054 /* Now that enqueuing has been traced, we need to trace
1055 * completion as well.
1056 */
1057 bio_set_flag(bio, BIO_TRACE_COMPLETION);
1058 }
27a84d54 1059 return true;
a7384677 1060
288dab8a 1061not_supported:
4e4cbee9 1062 status = BLK_STS_NOTSUPP;
a7384677 1063end_io:
4e4cbee9 1064 bio->bi_status = status;
4246a0b6 1065 bio_endio(bio);
27a84d54 1066 return false;
1da177e4
LT
1067}
1068
ac7c5675
CH
1069static blk_qc_t do_make_request(struct bio *bio)
1070{
1071 struct request_queue *q = bio->bi_disk->queue;
1072 blk_qc_t ret = BLK_QC_T_NONE;
1073
1074 if (blk_crypto_bio_prep(&bio)) {
1075 if (!q->make_request_fn)
1076 return blk_mq_make_request(q, bio);
1077 ret = q->make_request_fn(q, bio);
1078 }
1079 blk_queue_exit(q);
1080 return ret;
1081}
1082
27a84d54 1083/**
3fdd4086 1084 * generic_make_request - re-submit a bio to the block device layer for I/O
27a84d54
CH
1085 * @bio: The bio describing the location in memory and on the device.
1086 *
3fdd4086
CH
1087 * This is a version of submit_bio() that shall only be used for I/O that is
1088 * resubmitted to lower level drivers by stacking block drivers. All file
1089 * systems and other upper level users of the block layer should use
1090 * submit_bio() instead.
d89d8796 1091 */
dece1635 1092blk_qc_t generic_make_request(struct bio *bio)
d89d8796 1093{
f5fe1b51
N
1094 /*
1095 * bio_list_on_stack[0] contains bios submitted by the current
1096 * make_request_fn.
1097 * bio_list_on_stack[1] contains bios that were submitted before
1098 * the current make_request_fn, but that haven't been processed
1099 * yet.
1100 */
1101 struct bio_list bio_list_on_stack[2];
dece1635 1102 blk_qc_t ret = BLK_QC_T_NONE;
bddd87c7 1103
27a84d54 1104 if (!generic_make_request_checks(bio))
dece1635 1105 goto out;
27a84d54
CH
1106
1107 /*
1108 * We only want one ->make_request_fn to be active at a time, else
1109 * stack usage with stacked devices could be a problem. So use
1110 * current->bio_list to keep a list of requests submitted by a
1111 * make_request_fn function. current->bio_list is also used as a
1112 * flag to say if generic_make_request is currently active in this
1113 * task or not. If it is NULL, then no make_request is active. If
1114 * it is non-NULL, then a make_request is active, and new requests
1115 * should be added at the tail
1116 */
bddd87c7 1117 if (current->bio_list) {
f5fe1b51 1118 bio_list_add(&current->bio_list[0], bio);
dece1635 1119 goto out;
d89d8796 1120 }
27a84d54 1121
d89d8796
NB
1122 /* following loop may be a bit non-obvious, and so deserves some
1123 * explanation.
1124 * Before entering the loop, bio->bi_next is NULL (as all callers
1125 * ensure that) so we have a list with a single bio.
1126 * We pretend that we have just taken it off a longer list, so
bddd87c7
AM
1127 * we assign bio_list to a pointer to the bio_list_on_stack,
1128 * thus initialising the bio_list of new bios to be
27a84d54 1129 * added. ->make_request() may indeed add some more bios
d89d8796
NB
1130 * through a recursive call to generic_make_request. If it
1131 * did, we find a non-NULL value in bio_list and re-enter the loop
1132 * from the top. In this case we really did just take the bio
bddd87c7 1133 * of the top of the list (no pretending) and so remove it from
27a84d54 1134 * bio_list, and call into ->make_request() again.
d89d8796
NB
1135 */
1136 BUG_ON(bio->bi_next);
f5fe1b51
N
1137 bio_list_init(&bio_list_on_stack[0]);
1138 current->bio_list = bio_list_on_stack;
d89d8796 1139 do {
fe200864 1140 struct request_queue *q = bio->bi_disk->queue;
27a84d54 1141
accea322 1142 if (likely(bio_queue_enter(bio) == 0)) {
79bd9959
N
1143 struct bio_list lower, same;
1144
1145 /* Create a fresh bio_list for all subordinate requests */
f5fe1b51
N
1146 bio_list_on_stack[1] = bio_list_on_stack[0];
1147 bio_list_init(&bio_list_on_stack[0]);
ac7c5675 1148 ret = do_make_request(bio);
fe200864 1149
79bd9959
N
1150 /* sort new bios into those for a lower level
1151 * and those for the same level
1152 */
1153 bio_list_init(&lower);
1154 bio_list_init(&same);
f5fe1b51 1155 while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
74d46992 1156 if (q == bio->bi_disk->queue)
79bd9959
N
1157 bio_list_add(&same, bio);
1158 else
1159 bio_list_add(&lower, bio);
1160 /* now assemble so we handle the lowest level first */
f5fe1b51
N
1161 bio_list_merge(&bio_list_on_stack[0], &lower);
1162 bio_list_merge(&bio_list_on_stack[0], &same);
1163 bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
3ef28e83 1164 }
f5fe1b51 1165 bio = bio_list_pop(&bio_list_on_stack[0]);
d89d8796 1166 } while (bio);
bddd87c7 1167 current->bio_list = NULL; /* deactivate */
dece1635
JA
1168
1169out:
1170 return ret;
d89d8796 1171}
1da177e4
LT
1172EXPORT_SYMBOL(generic_make_request);
1173
f421e1d9
CH
1174/**
1175 * direct_make_request - hand a buffer directly to its device driver for I/O
1176 * @bio: The bio describing the location in memory and on the device.
1177 *
1178 * This function behaves like generic_make_request(), but does not protect
1179 * against recursion. Must only be used if the called driver is known
8cf7961d 1180 * to be blk-mq based.
f421e1d9
CH
1181 */
1182blk_qc_t direct_make_request(struct bio *bio)
1183{
1184 struct request_queue *q = bio->bi_disk->queue;
f421e1d9 1185
accea322
CH
1186 if (WARN_ON_ONCE(q->make_request_fn)) {
1187 bio_io_error(bio);
f421e1d9 1188 return BLK_QC_T_NONE;
f421e1d9 1189 }
accea322
CH
1190 if (!generic_make_request_checks(bio))
1191 return BLK_QC_T_NONE;
1192 if (unlikely(bio_queue_enter(bio)))
1193 return BLK_QC_T_NONE;
ac7c5675
CH
1194 if (!blk_crypto_bio_prep(&bio)) {
1195 blk_queue_exit(q);
1196 return BLK_QC_T_NONE;
1197 }
1198 return blk_mq_make_request(q, bio);
f421e1d9
CH
1199}
1200EXPORT_SYMBOL_GPL(direct_make_request);
1201
1da177e4 1202/**
710027a4 1203 * submit_bio - submit a bio to the block device layer for I/O
1da177e4
LT
1204 * @bio: The &struct bio which describes the I/O
1205 *
3fdd4086
CH
1206 * submit_bio() is used to submit I/O requests to block devices. It is passed a
1207 * fully set up &struct bio that describes the I/O that needs to be done. The
1208 * bio will be sent to the device described by the bi_disk and bi_partno fields.
1da177e4 1209 *
3fdd4086
CH
1210 * The success/failure status of the request, along with notification of
1211 * completion, is delivered asynchronously through the ->bi_end_io() callback
1212 * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
1213 * been called.
1da177e4 1214 */
4e49ea4a 1215blk_qc_t submit_bio(struct bio *bio)
1da177e4 1216{
d3f77dfd
TH
1217 if (blkcg_punt_bio_submit(bio))
1218 return BLK_QC_T_NONE;
1219
bf2de6f5
JA
1220 /*
1221 * If it's a regular read/write or a barrier with data attached,
1222 * go through the normal accounting stuff before submission.
1223 */
e2a60da7 1224 if (bio_has_data(bio)) {
4363ac7c
MP
1225 unsigned int count;
1226
95fe6c1a 1227 if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
7c5a0dcf 1228 count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
4363ac7c
MP
1229 else
1230 count = bio_sectors(bio);
1231
a8ebb056 1232 if (op_is_write(bio_op(bio))) {
bf2de6f5
JA
1233 count_vm_events(PGPGOUT, count);
1234 } else {
4f024f37 1235 task_io_account_read(bio->bi_iter.bi_size);
bf2de6f5
JA
1236 count_vm_events(PGPGIN, count);
1237 }
1238
1239 if (unlikely(block_dump)) {
1240 char b[BDEVNAME_SIZE];
8dcbdc74 1241 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
ba25f9dc 1242 current->comm, task_pid_nr(current),
a8ebb056 1243 op_is_write(bio_op(bio)) ? "WRITE" : "READ",
4f024f37 1244 (unsigned long long)bio->bi_iter.bi_sector,
74d46992 1245 bio_devname(bio, b), count);
bf2de6f5 1246 }
1da177e4
LT
1247 }
1248
b8e24a93 1249 /*
760f83ea
CH
1250 * If we're reading data that is part of the userspace workingset, count
1251 * submission time as memory stall. When the device is congested, or
1252 * the submitting cgroup IO-throttled, submission can be a significant
1253 * part of overall IO time.
b8e24a93 1254 */
760f83ea
CH
1255 if (unlikely(bio_op(bio) == REQ_OP_READ &&
1256 bio_flagged(bio, BIO_WORKINGSET))) {
1257 unsigned long pflags;
1258 blk_qc_t ret;
b8e24a93 1259
760f83ea
CH
1260 psi_memstall_enter(&pflags);
1261 ret = generic_make_request(bio);
b8e24a93
JW
1262 psi_memstall_leave(&pflags);
1263
760f83ea
CH
1264 return ret;
1265 }
1266
1267 return generic_make_request(bio);
1da177e4 1268}
1da177e4
LT
1269EXPORT_SYMBOL(submit_bio);
1270
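/*
 * A minimal sketch of building and submitting a bio to read one page from
 * @bdev; example_read_endio()/example_read_page() are hypothetical driver
 * helpers, not kernel symbols.
 */
static void example_read_endio(struct bio *bio)
{
	if (bio->bi_status)
		pr_err("example read failed: %d\n",
		       blk_status_to_errno(bio->bi_status));
	bio_put(bio);
}

static void example_read_page(struct block_device *bdev, struct page *page,
			      sector_t sector)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	bio_set_dev(bio, bdev);
	bio->bi_opf = REQ_OP_READ;
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = example_read_endio;
	submit_bio(bio);
}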
82124d60 1271/**
bf4e6b4e 1272 * blk_cloned_rq_check_limits - Helper function to check a cloned request
0d720318 1273 * for the new queue limits
82124d60
KU
1274 * @q: the queue
1275 * @rq: the request being checked
1276 *
1277 * Description:
1278 * @rq may have been made based on weaker limitations of upper-level queues
1279 * in request stacking drivers, and it may violate the limitation of @q.
1280 * Since the block layer and the underlying device driver trust @rq
1281 * after it is inserted to @q, it should be checked against @q before
1282 * the insertion using this generic function.
1283 *
82124d60 1284 * Request stacking drivers like request-based dm may change the queue
bf4e6b4e
HR
1285 * limits when retrying requests on other queues. Those requests need
1286 * to be checked against the new queue limits again during dispatch.
82124d60 1287 */
bf4e6b4e
HR
1288static int blk_cloned_rq_check_limits(struct request_queue *q,
1289 struct request *rq)
82124d60 1290{
8fe0d473 1291 if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) {
61939b12
JP
1292 printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
1293 __func__, blk_rq_sectors(rq),
1294 blk_queue_get_max_sectors(q, req_op(rq)));
82124d60
KU
1295 return -EIO;
1296 }
1297
1298 /*
1299 * queue's settings related to segment counting like q->bounce_pfn
1300 * may differ from that of other stacking queues.
1301 * Recalculate it to check the request correctly on this queue's
1302 * limitation.
1303 */
e9cd19c0 1304 rq->nr_phys_segments = blk_recalc_rq_segments(rq);
8a78362c 1305 if (rq->nr_phys_segments > queue_max_segments(q)) {
61939b12
JP
1306 printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
1307 __func__, rq->nr_phys_segments, queue_max_segments(q));
82124d60
KU
1308 return -EIO;
1309 }
1310
1311 return 0;
1312}
82124d60
KU
1313
1314/**
1315 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
1316 * @q: the queue to submit the request
1317 * @rq: the request being queued
1318 */
2a842aca 1319blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
82124d60 1320{
bf4e6b4e 1321 if (blk_cloned_rq_check_limits(q, rq))
2a842aca 1322 return BLK_STS_IOERR;
82124d60 1323
b2c9cd37
AM
1324 if (rq->rq_disk &&
1325 should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
2a842aca 1326 return BLK_STS_IOERR;
82124d60 1327
a892c8d5
ST
1328 if (blk_crypto_insert_cloned_request(rq))
1329 return BLK_STS_IOERR;
1330
a1ce35fa
JA
1331 if (blk_queue_io_stat(q))
1332 blk_account_io_start(rq, true);
82124d60
KU
1333
1334 /*
a1ce35fa
JA
1335 * Since we have a scheduler attached on the top device,
1336 * bypass a potential scheduler on the bottom device for
1337 * insert.
82124d60 1338 */
fd9c40f6 1339 return blk_mq_request_issue_directly(rq, true);
82124d60
KU
1340}
1341EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
1342
80a761fd
TH
1343/**
1344 * blk_rq_err_bytes - determine number of bytes till the next failure boundary
1345 * @rq: request to examine
1346 *
1347 * Description:
1348 * A request could be a merge of IOs which require different failure
1349 * handling. This function determines the number of bytes which
1350 * can be failed from the beginning of the request without
1351 * crossing into an area which needs to be retried further.
1352 *
1353 * Return:
1354 * The number of bytes to fail.
80a761fd
TH
1355 */
1356unsigned int blk_rq_err_bytes(const struct request *rq)
1357{
1358 unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
1359 unsigned int bytes = 0;
1360 struct bio *bio;
1361
e8064021 1362 if (!(rq->rq_flags & RQF_MIXED_MERGE))
80a761fd
TH
1363 return blk_rq_bytes(rq);
1364
1365 /*
1366 * Currently the only 'mixing' which can happen is between
1367 * different fastfail types. We can safely fail portions
1368 * which have all the failfast bits that the first one has -
1369 * the ones which are at least as eager to fail as the first
1370 * one.
1371 */
1372 for (bio = rq->bio; bio; bio = bio->bi_next) {
1eff9d32 1373 if ((bio->bi_opf & ff) != ff)
80a761fd 1374 break;
4f024f37 1375 bytes += bio->bi_iter.bi_size;
80a761fd
TH
1376 }
1377
1378 /* this could lead to infinite loop */
1379 BUG_ON(blk_rq_bytes(rq) && !bytes);
1380 return bytes;
1381}
1382EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
1383
f1394b79 1384static void blk_account_io_completion(struct request *req, unsigned int bytes)
bc58ba94 1385{
ecb6186c 1386 if (req->part && blk_do_io_stat(req)) {
ddcf35d3 1387 const int sgrp = op_stat_group(req_op(req));
bc58ba94 1388 struct hd_struct *part;
bc58ba94 1389
112f158f 1390 part_stat_lock();
09e099d4 1391 part = req->part;
112f158f 1392 part_stat_add(part, sectors[sgrp], bytes >> 9);
bc58ba94
JA
1393 part_stat_unlock();
1394 }
1395}
1396
522a7775 1397void blk_account_io_done(struct request *req, u64 now)
bc58ba94 1398{
bc58ba94 1399 /*
dd4c133f
TH
1400 * Account IO completion. flush_rq isn't accounted as a
1401 * normal IO on queueing nor completion. Accounting the
1402 * containing request is enough.
bc58ba94 1403 */
ecb6186c
LG
1404 if (req->part && blk_do_io_stat(req) &&
1405 !(req->rq_flags & RQF_FLUSH_SEQ)) {
ddcf35d3 1406 const int sgrp = op_stat_group(req_op(req));
bc58ba94 1407 struct hd_struct *part;
bc58ba94 1408
112f158f 1409 part_stat_lock();
09e099d4 1410 part = req->part;
bc58ba94 1411
2b8bd423 1412 update_io_ticks(part, jiffies, true);
112f158f
MS
1413 part_stat_inc(part, ios[sgrp]);
1414 part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
bc58ba94 1415
6c23a968 1416 hd_struct_put(part);
bc58ba94
JA
1417 part_stat_unlock();
1418 }
1419}
1420
320ae51f
JA
1421void blk_account_io_start(struct request *rq, bool new_io)
1422{
320ae51f
JA
1423 if (!blk_do_io_stat(rq))
1424 return;
1425
112f158f 1426 part_stat_lock();
76268f3a
CH
1427 if (!new_io)
1428 part_stat_inc(rq->part, merges[rq_data_dir(rq)]);
1429 else
1430 rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
1431 update_io_ticks(rq->part, jiffies, false);
320ae51f
JA
1432 part_stat_unlock();
1433}
1434
956d510e
CH
1435unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
1436 unsigned int op)
1437{
1438 struct hd_struct *part = &disk->part0;
1439 const int sgrp = op_stat_group(op);
1440 unsigned long now = READ_ONCE(jiffies);
1441
1442 part_stat_lock();
1443 update_io_ticks(part, now, false);
1444 part_stat_inc(part, ios[sgrp]);
1445 part_stat_add(part, sectors[sgrp], sectors);
1446 part_stat_local_inc(part, in_flight[op_is_write(op)]);
1447 part_stat_unlock();
1448
1449 return now;
1450}
1451EXPORT_SYMBOL(disk_start_io_acct);
1452
1453void disk_end_io_acct(struct gendisk *disk, unsigned int op,
1454 unsigned long start_time)
1455{
1456 struct hd_struct *part = &disk->part0;
1457 const int sgrp = op_stat_group(op);
1458 unsigned long now = READ_ONCE(jiffies);
1459 unsigned long duration = now - start_time;
1460
1461 part_stat_lock();
1462 update_io_ticks(part, now, true);
1463 part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
1464 part_stat_local_dec(part, in_flight[op_is_write(op)]);
1465 part_stat_unlock();
1466}
1467EXPORT_SYMBOL(disk_end_io_acct);
1468
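/*
 * A minimal sketch of how a bio-based driver accounts an I/O with the two
 * helpers above (the replacements for generic_{start,end}_io_acct()); the
 * function name is hypothetical.
 */
static void example_account_bio(struct gendisk *disk, struct bio *bio)
{
	unsigned long start;

	start = disk_start_io_acct(disk, bio_sectors(bio), bio_op(bio));
	/* ... actually perform the I/O ... */
	disk_end_io_acct(disk, bio_op(bio), start);
}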
ef71de8b
CH
1469/*
1470 * Steal bios from a request and add them to a bio list.
1471 * The request must not have been partially completed before.
1472 */
1473void blk_steal_bios(struct bio_list *list, struct request *rq)
1474{
1475 if (rq->bio) {
1476 if (list->tail)
1477 list->tail->bi_next = rq->bio;
1478 else
1479 list->head = rq->bio;
1480 list->tail = rq->biotail;
1481
1482 rq->bio = NULL;
1483 rq->biotail = NULL;
1484 }
1485
1486 rq->__data_len = 0;
1487}
1488EXPORT_SYMBOL_GPL(blk_steal_bios);
1489
3bcddeac 1490/**
2e60e022 1491 * blk_update_request - Special helper function for request stacking drivers
8ebf9756 1492 * @req: the request being processed
2a842aca 1493 * @error: block status code
8ebf9756 1494 * @nr_bytes: number of bytes to complete @req
3bcddeac
KU
1495 *
1496 * Description:
8ebf9756
RD
1497 * Ends I/O on a number of bytes attached to @req, but doesn't complete
1498 * the request structure even if @req doesn't have leftover.
1499 * If @req has leftover, sets it up for the next range of segments.
2e60e022
TH
1500 *
1501 * This special helper function is only for request stacking drivers
1502 * (e.g. request-based dm) so that they can handle partial completion.
3a211b71 1503 * Actual device drivers should use blk_mq_end_request instead.
2e60e022
TH
1504 *
1505 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
1506 * %false return from this function.
3bcddeac 1507 *
1954e9a9
BVA
1508 * Note:
1509 * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
1510 * blk_rq_bytes() and in blk_update_request().
1511 *
3bcddeac 1512 * Return:
2e60e022
TH
1513 * %false - this request doesn't have any more data
1514 * %true - this request has more data
3bcddeac 1515 **/
2a842aca
CH
1516bool blk_update_request(struct request *req, blk_status_t error,
1517 unsigned int nr_bytes)
1da177e4 1518{
f79ea416 1519 int total_bytes;
1da177e4 1520
2a842aca 1521 trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);
4a0efdc9 1522
2e60e022
TH
1523 if (!req->bio)
1524 return false;
1525
54d4e6ab
MG
1526#ifdef CONFIG_BLK_DEV_INTEGRITY
1527 if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
1528 error == BLK_STS_OK)
1529 req->q->integrity.profile->complete_fn(req, nr_bytes);
1530#endif
1531
2a842aca
CH
1532 if (unlikely(error && !blk_rq_is_passthrough(req) &&
1533 !(req->rq_flags & RQF_QUIET)))
178cc590 1534 print_req_error(req, error, __func__);
1da177e4 1535
bc58ba94 1536 blk_account_io_completion(req, nr_bytes);
d72d904a 1537
f79ea416
KO
1538 total_bytes = 0;
1539 while (req->bio) {
1540 struct bio *bio = req->bio;
4f024f37 1541 unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
1da177e4 1542
9c24c10a 1543 if (bio_bytes == bio->bi_iter.bi_size)
1da177e4 1544 req->bio = bio->bi_next;
1da177e4 1545
fbbaf700
N
1546 /* Completion has already been traced */
1547 bio_clear_flag(bio, BIO_TRACE_COMPLETION);
f79ea416 1548 req_bio_endio(req, bio, bio_bytes, error);
1da177e4 1549
f79ea416
KO
1550 total_bytes += bio_bytes;
1551 nr_bytes -= bio_bytes;
1da177e4 1552
f79ea416
KO
1553 if (!nr_bytes)
1554 break;
1da177e4
LT
1555 }
1556
1557 /*
1558 * completely done
1559 */
2e60e022
TH
1560 if (!req->bio) {
1561 /*
1562 * Reset counters so that the request stacking driver
1563 * can find how many bytes remain in the request
1564 * later.
1565 */
a2dec7b3 1566 req->__data_len = 0;
2e60e022
TH
1567 return false;
1568 }
1da177e4 1569
a2dec7b3 1570 req->__data_len -= total_bytes;
2e46e8b2
TH
1571
1572 /* update sector only for requests with clear definition of sector */
57292b58 1573 if (!blk_rq_is_passthrough(req))
a2dec7b3 1574 req->__sector += total_bytes >> 9;
2e46e8b2 1575
80a761fd 1576 /* mixed attributes always follow the first bio */
e8064021 1577 if (req->rq_flags & RQF_MIXED_MERGE) {
80a761fd 1578 req->cmd_flags &= ~REQ_FAILFAST_MASK;
1eff9d32 1579 req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
80a761fd
TH
1580 }
1581
ed6565e7
CH
1582 if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
1583 /*
1584 * If total number of sectors is less than the first segment
1585 * size, something has gone terribly wrong.
1586 */
1587 if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
1588 blk_dump_rq_flags(req, "request botched");
1589 req->__data_len = blk_rq_cur_bytes(req);
1590 }
2e46e8b2 1591
ed6565e7 1592 /* recalculate the number of segments */
e9cd19c0 1593 req->nr_phys_segments = blk_recalc_rq_segments(req);
ed6565e7 1594 }
2e46e8b2 1595
2e60e022 1596 return true;
1da177e4 1597}
2e60e022 1598EXPORT_SYMBOL_GPL(blk_update_request);
1da177e4 1599
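/*
 * A minimal sketch of the partial-completion pattern for a request
 * stacking driver; the function name is hypothetical and @done_bytes is
 * the number of bytes the lower device finished.
 */
static bool example_end_partial(struct request *rq, blk_status_t error,
				unsigned int done_bytes)
{
	if (blk_update_request(rq, error, done_bytes))
		return true;	/* bios for @done_bytes are done, more remain */
	__blk_mq_end_request(rq, error);
	return false;
}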
2d4dc890
IL
1600#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
1601/**
1602 * rq_flush_dcache_pages - Helper function to flush all pages in a request
1603 * @rq: the request to be flushed
1604 *
1605 * Description:
1606 * Flush all pages in @rq.
1607 */
1608void rq_flush_dcache_pages(struct request *rq)
1609{
1610 struct req_iterator iter;
7988613b 1611 struct bio_vec bvec;
2d4dc890
IL
1612
1613 rq_for_each_segment(bvec, rq, iter)
7988613b 1614 flush_dcache_page(bvec.bv_page);
2d4dc890
IL
1615}
1616EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
1617#endif
1618
ef9e3fac
KU
1619/**
1620 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
1621 * @q : the queue of the device being checked
1622 *
1623 * Description:
1624 * Check if underlying low-level drivers of a device are busy.
1625 * If the drivers want to export their busy state, they must set own
1626 * exporting function using blk_queue_lld_busy() first.
1627 *
1628 * Basically, this function is used only by request stacking drivers
1629 * to stop dispatching requests to underlying devices when underlying
1630 * devices are busy. This behavior helps more I/O merging on the queue
1631 * of the request stacking driver and prevents I/O throughput regression
1632 * on burst I/O load.
1633 *
1634 * Return:
1635 * 0 - Not busy (The request stacking driver should dispatch request)
1636 * 1 - Busy (The request stacking driver should stop dispatching request)
1637 */
1638int blk_lld_busy(struct request_queue *q)
1639{
344e9ffc 1640 if (queue_is_mq(q) && q->mq_ops->busy)
9ba20527 1641 return q->mq_ops->busy(q);
ef9e3fac
KU
1642
1643 return 0;
1644}
1645EXPORT_SYMBOL_GPL(blk_lld_busy);
1646
78d8e58a
MS
1647/**
1648 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
1649 * @rq: the clone request to be cleaned up
1650 *
1651 * Description:
1652 * Free all bios in @rq for a cloned request.
1653 */
1654void blk_rq_unprep_clone(struct request *rq)
1655{
1656 struct bio *bio;
1657
1658 while ((bio = rq->bio) != NULL) {
1659 rq->bio = bio->bi_next;
1660
1661 bio_put(bio);
1662 }
1663}
1664EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
1665
78d8e58a
MS
1666/**
1667 * blk_rq_prep_clone - Helper function to setup clone request
1668 * @rq: the request to be setup
1669 * @rq_src: original request to be cloned
1670 * @bs: bio_set that bios for clone are allocated from
1671 * @gfp_mask: memory allocation mask for bio
1672 * @bio_ctr: setup function to be called for each clone bio.
 1673 * Returns %0 for success, non-zero for failure.
1674 * @data: private data to be passed to @bio_ctr
1675 *
1676 * Description:
1677 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
78d8e58a
MS
 1678 * Also, the pages which the original bios point to are not copied;
 1679 * the cloned bios simply point to the same pages.
 1680 * Therefore the cloned bios must be completed before the original bios,
 1681 * which means the caller must complete @rq before @rq_src.
1682 */
1683int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
1684 struct bio_set *bs, gfp_t gfp_mask,
1685 int (*bio_ctr)(struct bio *, struct bio *, void *),
1686 void *data)
1687{
1688 struct bio *bio, *bio_src;
1689
1690 if (!bs)
f4f8154a 1691 bs = &fs_bio_set;
78d8e58a
MS
1692
1693 __rq_for_each_bio(bio_src, rq_src) {
1694 bio = bio_clone_fast(bio_src, gfp_mask, bs);
1695 if (!bio)
1696 goto free_and_out;
1697
1698 if (bio_ctr && bio_ctr(bio, bio_src, data))
1699 goto free_and_out;
1700
1701 if (rq->bio) {
1702 rq->biotail->bi_next = bio;
1703 rq->biotail = bio;
1704 } else
1705 rq->bio = rq->biotail = bio;
1706 }
1707
361301a2
GJ
1708 /* Copy attributes of the original request to the clone request. */
1709 rq->__sector = blk_rq_pos(rq_src);
1710 rq->__data_len = blk_rq_bytes(rq_src);
1711 if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
1712 rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
1713 rq->special_vec = rq_src->special_vec;
1714 }
1715 rq->nr_phys_segments = rq_src->nr_phys_segments;
1716 rq->ioprio = rq_src->ioprio;
78d8e58a 1717
a892c8d5
ST
1718 if (rq->bio)
1719 blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask);
1720
78d8e58a
MS
1721 return 0;
1722
1723free_and_out:
1724 if (bio)
1725 bio_put(bio);
1726 blk_rq_unprep_clone(rq);
1727
1728 return -ENOMEM;
b0fd271d
KU
1729}
1730EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
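/*
 * Illustrative sketch (not part of blk-core.c): a request stacking driver
 * cloning an incoming request before handing it to a lower device.  The
 * names foo_io, foo_bio_ctr() and foo_prep_clone() are hypothetical; the
 * bio_set is assumed to be owned by the stacking driver.
 */
struct foo_io {
	struct request *orig;	/* per-I/O context of the stacking driver */
};

static int foo_bio_ctr(struct bio *clone_bio, struct bio *orig_bio, void *data)
{
	struct foo_io *io = data;

	/*
	 * Route the clone's completion back to our per-I/O context.  A real
	 * driver would typically also set ->bi_end_io here.
	 */
	clone_bio->bi_private = io;
	return 0;
}

static int foo_prep_clone(struct request *clone, struct request *orig,
			  struct foo_io *io, struct bio_set *bs)
{
	/*
	 * On failure blk_rq_prep_clone() frees any clone bios it already
	 * attached, so no extra cleanup is needed here.  Because the clone
	 * bios share the original pages, @clone must complete before @orig.
	 */
	return blk_rq_prep_clone(clone, orig, bs, GFP_ATOMIC, foo_bio_ctr, io);
}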
1731
59c3d45e 1732int kblockd_schedule_work(struct work_struct *work)
1da177e4
LT
1733{
1734 return queue_work(kblockd_workqueue, work);
1735}
1da177e4
LT
1736EXPORT_SYMBOL(kblockd_schedule_work);
1737
818cd1cb
JA
1738int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
1739 unsigned long delay)
1740{
1741 return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
1742}
1743EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
1744
75df7136
SJ
1745/**
1746 * blk_start_plug - initialize blk_plug and track it inside the task_struct
1747 * @plug: The &struct blk_plug that needs to be initialized
1748 *
1749 * Description:
40405851
JM
1750 * blk_start_plug() indicates to the block layer an intent by the caller
1751 * to submit multiple I/O requests in a batch. The block layer may use
1752 * this hint to defer submitting I/Os from the caller until blk_finish_plug()
1753 * is called. However, the block layer may choose to submit requests
1754 * before a call to blk_finish_plug() if the number of queued I/Os
1755 * exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
1756 * %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if
1757 * the task schedules (see below).
1758 *
75df7136
SJ
1759 * Tracking blk_plug inside the task_struct will help with auto-flushing the
1760 * pending I/O should the task end up blocking between blk_start_plug() and
1761 * blk_finish_plug(). This is important from a performance perspective, but
1762 * also ensures that we don't deadlock. For instance, if the task is blocking
1763 * for a memory allocation, memory reclaim could end up wanting to free a
1764 * page belonging to that request that is currently residing in our private
1765 * plug. By flushing the pending I/O when the process goes to sleep, we avoid
1766 * this kind of deadlock.
1767 */
73c10101
JA
1768void blk_start_plug(struct blk_plug *plug)
1769{
1770 struct task_struct *tsk = current;
1771
dd6cf3e1
SL
1772 /*
1773 * If this is a nested plug, don't actually assign it.
1774 */
1775 if (tsk->plug)
1776 return;
1777
320ae51f 1778 INIT_LIST_HEAD(&plug->mq_list);
048c9374 1779 INIT_LIST_HEAD(&plug->cb_list);
5f0ed774 1780 plug->rq_count = 0;
ce5b009c 1781 plug->multiple_queues = false;
5f0ed774 1782
73c10101 1783 /*
dd6cf3e1
SL
1784 * Store ordering should not be needed here, since a potential
 1785 * preemption will imply a full memory barrier.
73c10101 1786 */
dd6cf3e1 1787 tsk->plug = plug;
73c10101
JA
1788}
1789EXPORT_SYMBOL(blk_start_plug);
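/*
 * Illustrative sketch (not part of blk-core.c): pairing blk_start_plug()
 * with blk_finish_plug() around a burst of submissions so the block layer
 * can batch them.  foo_submit_batch() and its arguments are hypothetical.
 */
static void foo_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);
	/* Anything still held in the plug is flushed to the drivers here. */
	blk_finish_plug(&plug);
}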
1790
74018dc3 1791static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
048c9374
N
1792{
1793 LIST_HEAD(callbacks);
1794
2a7d5559
SL
1795 while (!list_empty(&plug->cb_list)) {
1796 list_splice_init(&plug->cb_list, &callbacks);
048c9374 1797
2a7d5559
SL
1798 while (!list_empty(&callbacks)) {
1799 struct blk_plug_cb *cb = list_first_entry(&callbacks,
048c9374
N
1800 struct blk_plug_cb,
1801 list);
2a7d5559 1802 list_del(&cb->list);
74018dc3 1803 cb->callback(cb, from_schedule);
2a7d5559 1804 }
048c9374
N
1805 }
1806}
1807
9cbb1750
N
1808struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
1809 int size)
1810{
1811 struct blk_plug *plug = current->plug;
1812 struct blk_plug_cb *cb;
1813
1814 if (!plug)
1815 return NULL;
1816
1817 list_for_each_entry(cb, &plug->cb_list, list)
1818 if (cb->callback == unplug && cb->data == data)
1819 return cb;
1820
1821 /* Not currently on the callback list */
1822 BUG_ON(size < sizeof(*cb));
1823 cb = kzalloc(size, GFP_ATOMIC);
1824 if (cb) {
1825 cb->data = data;
1826 cb->callback = unplug;
1827 list_add(&cb->list, &plug->cb_list);
1828 }
1829 return cb;
1830}
1831EXPORT_SYMBOL(blk_check_plugged);
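/*
 * Illustrative sketch (not part of blk-core.c): embedding a blk_plug_cb in
 * a larger, per-plug structure (as the md/raid drivers do) so bios can be
 * gathered while a plug is active and dispatched in one go when the plug
 * is flushed.  struct foo_plug_cb, foo_unplug() and foo_queue_bio() are
 * hypothetical.
 */
struct foo_plug_cb {
	struct blk_plug_cb cb;		/* must be the first member */
	struct bio_list pending;	/* bios gathered while plugged */
};

static void foo_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct foo_plug_cb *fcb = container_of(cb, struct foo_plug_cb, cb);
	struct bio *bio;

	/* Invoked from flush_plug_callbacks(); drain what we gathered. */
	while ((bio = bio_list_pop(&fcb->pending)))
		submit_bio_noacct(bio);
}

static void foo_queue_bio(struct bio *bio)
{
	struct blk_plug_cb *cb;
	struct foo_plug_cb *fcb;

	/*
	 * Returns an existing callback for this (unplug, data) pair, or a
	 * zero-initialized allocation of the requested size, or NULL when
	 * no plug is active.
	 */
	cb = blk_check_plugged(foo_unplug, NULL, sizeof(*fcb));
	if (cb) {
		fcb = container_of(cb, struct foo_plug_cb, cb);
		bio_list_add(&fcb->pending, bio);
		return;
	}
	submit_bio_noacct(bio);	/* no plug: dispatch immediately */
}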
1832
49cac01e 1833void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
73c10101 1834{
74018dc3 1835 flush_plug_callbacks(plug, from_schedule);
320ae51f
JA
1836
1837 if (!list_empty(&plug->mq_list))
1838 blk_mq_flush_plug_list(plug, from_schedule);
73c10101 1839}
73c10101 1840
40405851
JM
1841/**
1842 * blk_finish_plug - mark the end of a batch of submitted I/O
1843 * @plug: The &struct blk_plug passed to blk_start_plug()
1844 *
1845 * Description:
1846 * Indicate that a batch of I/O submissions is complete. This function
1847 * must be paired with an initial call to blk_start_plug(). The intent
1848 * is to allow the block layer to optimize I/O submission. See the
1849 * documentation for blk_start_plug() for more information.
1850 */
73c10101
JA
1851void blk_finish_plug(struct blk_plug *plug)
1852{
dd6cf3e1
SL
1853 if (plug != current->plug)
1854 return;
f6603783 1855 blk_flush_plug_list(plug, false);
73c10101 1856
dd6cf3e1 1857 current->plug = NULL;
73c10101 1858}
88b996cd 1859EXPORT_SYMBOL(blk_finish_plug);
73c10101 1860
71ac860a
ML
1861void blk_io_schedule(void)
1862{
1863 /* Prevent hang_check timer from firing at us during very long I/O */
1864 unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
1865
1866 if (timeout)
1867 io_schedule_timeout(timeout);
1868 else
1869 io_schedule();
1870}
1871EXPORT_SYMBOL_GPL(blk_io_schedule);
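/*
 * Illustrative sketch (not part of blk-core.c): waiting for an I/O
 * completion flag without tripping the hung-task watchdog.  The @done
 * flag is hypothetical and would be set by the bio's end_io handler.
 */
static void foo_wait_for_io(bool *done)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(*done))
			break;
		blk_io_schedule();
	}
	__set_current_state(TASK_RUNNING);
}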
1872
1da177e4
LT
1873int __init blk_dev_init(void)
1874{
ef295ecf
CH
1875 BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
1876 BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
c593642c 1877 sizeof_field(struct request, cmd_flags));
ef295ecf 1878 BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
c593642c 1879 sizeof_field(struct bio, bi_opf));
9eb55b03 1880
89b90be2
TH
 1881 /* Used for unplugging; affects I/O latency/throughput, hence WQ_HIGHPRI */
1882 kblockd_workqueue = alloc_workqueue("kblockd",
28747fcd 1883 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
1da177e4
LT
1884 if (!kblockd_workqueue)
1885 panic("Failed to create kblockd\n");
1886
c2789bd4 1887 blk_requestq_cachep = kmem_cache_create("request_queue",
165125e1 1888 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
1da177e4 1889
18fbda91
OS
1890#ifdef CONFIG_DEBUG_FS
1891 blk_debugfs_root = debugfs_create_dir("block", NULL);
1892#endif
1893
d38ecf93 1894 return 0;
1da177e4 1895}