block/bio.c (linux-block.git), annotated source at commit "block: initialize the target bio in __bio_clone_fast"
8c16567d 1// SPDX-License-Identifier: GPL-2.0
1da177e4 2/*
0fe23479 3 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
1da177e4 4 */
5#include <linux/mm.h>
6#include <linux/swap.h>
7#include <linux/bio.h>
8#include <linux/blkdev.h>
a27bb332 9#include <linux/uio.h>
852c788f 10#include <linux/iocontext.h>
1da177e4 11#include <linux/slab.h>
12#include <linux/init.h>
13#include <linux/kernel.h>
630d9c47 14#include <linux/export.h>
1da177e4 15#include <linux/mempool.h>
16#include <linux/workqueue.h>
852c788f 17#include <linux/cgroup.h>
08e18eab 18#include <linux/blk-cgroup.h>
b4c5875d 19#include <linux/highmem.h>
de6a78b6 20#include <linux/sched/sysctl.h>
a892c8d5 21#include <linux/blk-crypto.h>
49d1ec85 22#include <linux/xarray.h>
1da177e4 23
55782138 24#include <trace/events/block.h>
9e234eea 25#include "blk.h"
67b42d0b 26#include "blk-rq-qos.h"
0bfc2455 27
be4d234d 28struct bio_alloc_cache {
fcade2ce 29 struct bio *free_list;
be4d234d 30 unsigned int nr;
31};
32
de76fd89 33static struct biovec_slab {
6ac0b715 34 int nr_vecs;
35 char *name;
36 struct kmem_cache *slab;
de76fd89 37} bvec_slabs[] __read_mostly = {
38 { .nr_vecs = 16, .name = "biovec-16" },
39 { .nr_vecs = 64, .name = "biovec-64" },
40 { .nr_vecs = 128, .name = "biovec-128" },
a8affc03 41 { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" },
1da177e4 42};
6ac0b715 43
7a800a20 44static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
45{
46 switch (nr_vecs) {
47 /* smaller bios use inline vecs */
48 case 5 ... 16:
49 return &bvec_slabs[0];
50 case 17 ... 64:
51 return &bvec_slabs[1];
52 case 65 ... 128:
53 return &bvec_slabs[2];
a8affc03 54 case 129 ... BIO_MAX_VECS:
7a800a20 55 return &bvec_slabs[3];
56 default:
57 BUG();
58 return NULL;
59 }
60}
1da177e4 61
1da177e4 62/*
63 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
64 * IO code that does not need private memory pools.
65 */
f4f8154a 66struct bio_set fs_bio_set;
3f86a82a 67EXPORT_SYMBOL(fs_bio_set);
1da177e4 68
bb799ca0 69/*
70 * Our slab pool management
71 */
72struct bio_slab {
73 struct kmem_cache *slab;
74 unsigned int slab_ref;
75 unsigned int slab_size;
76 char name[8];
77};
78static DEFINE_MUTEX(bio_slab_lock);
49d1ec85 79static DEFINE_XARRAY(bio_slabs);
bb799ca0 80
49d1ec85 81static struct bio_slab *create_bio_slab(unsigned int size)
bb799ca0 82{
49d1ec85 83 struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL);
bb799ca0 84
49d1ec85
ML
85 if (!bslab)
86 return NULL;
bb799ca0 87
49d1ec85
ML
88 snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
89 bslab->slab = kmem_cache_create(bslab->name, size,
1a7e76e4
CH
90 ARCH_KMALLOC_MINALIGN,
91 SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
49d1ec85
ML
92 if (!bslab->slab)
93 goto fail_alloc_slab;
bb799ca0 94
49d1ec85
ML
95 bslab->slab_ref = 1;
96 bslab->slab_size = size;
bb799ca0 97
49d1ec85
ML
98 if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL)))
99 return bslab;
bb799ca0 100
49d1ec85 101 kmem_cache_destroy(bslab->slab);
bb799ca0 102
49d1ec85
ML
103fail_alloc_slab:
104 kfree(bslab);
105 return NULL;
106}
bb799ca0 107
49d1ec85
ML
108static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
109{
9f180e31 110 return bs->front_pad + sizeof(struct bio) + bs->back_pad;
49d1ec85 111}
bb799ca0 112
49d1ec85
ML
113static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
114{
115 unsigned int size = bs_bio_slab_size(bs);
116 struct bio_slab *bslab;
bb799ca0 117
49d1ec85
ML
118 mutex_lock(&bio_slab_lock);
119 bslab = xa_load(&bio_slabs, size);
120 if (bslab)
121 bslab->slab_ref++;
122 else
123 bslab = create_bio_slab(size);
bb799ca0 124 mutex_unlock(&bio_slab_lock);
49d1ec85
ML
125
126 if (bslab)
127 return bslab->slab;
128 return NULL;
bb799ca0
JA
129}
130
131static void bio_put_slab(struct bio_set *bs)
132{
133 struct bio_slab *bslab = NULL;
49d1ec85 134 unsigned int slab_size = bs_bio_slab_size(bs);
bb799ca0
JA
135
136 mutex_lock(&bio_slab_lock);
137
49d1ec85 138 bslab = xa_load(&bio_slabs, slab_size);
bb799ca0
JA
139 if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
140 goto out;
141
49d1ec85
ML
142 WARN_ON_ONCE(bslab->slab != bs->bio_slab);
143
bb799ca0
JA
144 WARN_ON(!bslab->slab_ref);
145
146 if (--bslab->slab_ref)
147 goto out;
148
49d1ec85
ML
149 xa_erase(&bio_slabs, slab_size);
150
bb799ca0 151 kmem_cache_destroy(bslab->slab);
49d1ec85 152 kfree(bslab);
bb799ca0
JA
153
154out:
155 mutex_unlock(&bio_slab_lock);
156}
157
7a800a20 158void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
7ba1ba12 159{
9e8c0d0d 160 BUG_ON(nr_vecs > BIO_MAX_VECS);
ed996a52 161
a8affc03 162 if (nr_vecs == BIO_MAX_VECS)
9f060e22 163 mempool_free(bv, pool);
7a800a20
CH
164 else if (nr_vecs > BIO_INLINE_VECS)
165 kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
bb799ca0 166}
bb799ca0 167
f2c3eb9b
CH
168/*
169 * Make the first allocation restricted and don't dump info on allocation
170 * failures, since we'll fall back to the mempool in case of failure.
171 */
172static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
173{
174 return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) |
175 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
bb799ca0
JA
176}
177
7a800a20
CH
178struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
179 gfp_t gfp_mask)
1da177e4 180{
7a800a20 181 struct biovec_slab *bvs = biovec_slab(*nr_vecs);
1da177e4 182
7a800a20 183 if (WARN_ON_ONCE(!bvs))
7ff9345f 184 return NULL;
7ff9345f
JA
185
186 /*
7a800a20
CH
187 * Upgrade the nr_vecs request to take full advantage of the allocation.
188 * We also rely on this in the bvec_free path.
7ff9345f 189 */
7a800a20 190 *nr_vecs = bvs->nr_vecs;
7ff9345f 191
7ff9345f 192 /*
f007a3d6
CH
193 * Try a slab allocation first for all smaller allocations. If that
194 * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
a8affc03 195 * The mempool is sized to handle up to BIO_MAX_VECS entries.
7ff9345f 196 */
a8affc03 197 if (*nr_vecs < BIO_MAX_VECS) {
f007a3d6 198 struct bio_vec *bvl;
1da177e4 199
f2c3eb9b 200 bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
7a800a20 201 if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
f007a3d6 202 return bvl;
a8affc03 203 *nr_vecs = BIO_MAX_VECS;
7ff9345f
JA
204 }
205
f007a3d6 206 return mempool_alloc(pool, gfp_mask);
1da177e4
LT
207}
208
9ae3b3f5 209void bio_uninit(struct bio *bio)
1da177e4 210{
db9819c7
CH
211#ifdef CONFIG_BLK_CGROUP
212 if (bio->bi_blkg) {
213 blkg_put(bio->bi_blkg);
214 bio->bi_blkg = NULL;
215 }
216#endif
ece841ab
JT
217 if (bio_integrity(bio))
218 bio_integrity_free(bio);
a892c8d5
ST
219
220 bio_crypt_free_ctx(bio);
4254bba1 221}
9ae3b3f5 222EXPORT_SYMBOL(bio_uninit);
7ba1ba12 223
4254bba1
KO
224static void bio_free(struct bio *bio)
225{
226 struct bio_set *bs = bio->bi_pool;
227 void *p;
228
9ae3b3f5 229 bio_uninit(bio);
4254bba1
KO
230
231 if (bs) {
7a800a20 232 bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
4254bba1
KO
233
234 /*
235 * If we have front padding, adjust the bio pointer before freeing
236 */
237 p = bio;
bb799ca0
JA
238 p -= bs->front_pad;
239
8aa6ba2f 240 mempool_free(p, &bs->bio_pool);
4254bba1
KO
241 } else {
242 /* Bio was allocated by bio_kmalloc() */
243 kfree(bio);
244 }
3676347a
PO
245}
246
9ae3b3f5 247/*
248 * Users of this function have their own bio allocation. Subsequently,
249 * they must remember to pair any call to bio_init() with bio_uninit()
250 * when IO has completed, or when the bio is released.
251 */
49add496 252void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
253 unsigned short max_vecs, unsigned int opf)
1da177e4 254{
da521626 255 bio->bi_next = NULL;
49add496
CH
256 bio->bi_bdev = bdev;
257 bio->bi_opf = opf;
da521626
JA
258 bio->bi_flags = 0;
259 bio->bi_ioprio = 0;
260 bio->bi_write_hint = 0;
261 bio->bi_status = 0;
262 bio->bi_iter.bi_sector = 0;
263 bio->bi_iter.bi_size = 0;
264 bio->bi_iter.bi_idx = 0;
265 bio->bi_iter.bi_bvec_done = 0;
266 bio->bi_end_io = NULL;
267 bio->bi_private = NULL;
268#ifdef CONFIG_BLK_CGROUP
269 bio->bi_blkg = NULL;
270 bio->bi_issue.value = 0;
49add496
CH
271 if (bdev)
272 bio_associate_blkg(bio);
da521626
JA
273#ifdef CONFIG_BLK_CGROUP_IOCOST
274 bio->bi_iocost_cost = 0;
275#endif
276#endif
277#ifdef CONFIG_BLK_INLINE_ENCRYPTION
278 bio->bi_crypt_context = NULL;
279#endif
280#ifdef CONFIG_BLK_DEV_INTEGRITY
281 bio->bi_integrity = NULL;
282#endif
283 bio->bi_vcnt = 0;
284
c4cf5261 285 atomic_set(&bio->__bi_remaining, 1);
dac56212 286 atomic_set(&bio->__bi_cnt, 1);
3e08773c 287 bio->bi_cookie = BLK_QC_T_NONE;
3a83f467 288
3a83f467 289 bio->bi_max_vecs = max_vecs;
da521626
JA
290 bio->bi_io_vec = table;
291 bio->bi_pool = NULL;
1da177e4 292}
a112a71d 293EXPORT_SYMBOL(bio_init);
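
/*
 * Usage sketch (illustrative, not part of this file): the on-stack pattern
 * the comment above describes, pairing bio_init() with bio_uninit() once the
 * I/O is done. "bdev", "sector" and "page" stand in for whatever the caller
 * already has.
 */
static int example_read_one_page(struct block_device *bdev, sector_t sector,
				 struct page *page)
{
	struct bio_vec bvec;
	struct bio bio;
	int ret;

	bio_init(&bio, bdev, &bvec, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = sector;
	__bio_add_page(&bio, page, PAGE_SIZE, 0);

	ret = submit_bio_wait(&bio);	/* synchronous, so uninit right after */
	bio_uninit(&bio);
	return ret;
}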
1da177e4 294
f44b48c7 295/**
296 * bio_reset - reinitialize a bio
297 * @bio: bio to reset
a7c50c94 298 * @bdev: block device to use the bio for
299 * @opf: operation and flags for bio
f44b48c7 300 *
301 * Description:
302 * After calling bio_reset(), @bio will be in the same state as a freshly
303 * allocated bio returned by bio_alloc_bioset() - the only fields that are
304 * preserved are the ones that are initialized by bio_alloc_bioset(). See
305 * comment in struct bio.
306 */
a7c50c94 307void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf)
f44b48c7 308{
9ae3b3f5 309 bio_uninit(bio);
f44b48c7 310 memset(bio, 0, BIO_RESET_BYTES);
c4cf5261 311 atomic_set(&bio->__bi_remaining, 1);
a7c50c94 312 bio->bi_bdev = bdev;
78e34374
CH
313 if (bio->bi_bdev)
314 bio_associate_blkg(bio);
a7c50c94 315 bio->bi_opf = opf;
f44b48c7
KO
316}
317EXPORT_SYMBOL(bio_reset);
318
38f8baae 319static struct bio *__bio_chain_endio(struct bio *bio)
196d38bc 320{
4246a0b6
CH
321 struct bio *parent = bio->bi_private;
322
3edf5346 323 if (bio->bi_status && !parent->bi_status)
4e4cbee9 324 parent->bi_status = bio->bi_status;
196d38bc 325 bio_put(bio);
38f8baae
CH
326 return parent;
327}
328
329static void bio_chain_endio(struct bio *bio)
330{
331 bio_endio(__bio_chain_endio(bio));
196d38bc
KO
332}
333
334/**
335 * bio_chain - chain bio completions
1051a902 336 * @bio: the target bio
5b874af6 337 * @parent: the parent bio of @bio
196d38bc 338 *
339 * The caller won't have a bi_end_io called when @bio completes - instead,
340 * @parent's bi_end_io won't be called until both @parent and @bio have
341 * completed; the chained bio will also be freed when it completes.
342 *
343 * The caller must not set bi_private or bi_end_io in @bio.
344 */
345void bio_chain(struct bio *bio, struct bio *parent)
346{
347 BUG_ON(bio->bi_private || bio->bi_end_io);
348
349 bio->bi_private = parent;
350 bio->bi_end_io = bio_chain_endio;
c4cf5261 351 bio_inc_remaining(parent);
196d38bc 352}
353EXPORT_SYMBOL(bio_chain);
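
/*
 * Usage sketch (illustrative only): fold an extra bio's completion into a
 * parent received from an upper layer, so the parent's ->bi_end_io runs only
 * once both bios have completed. "extra_page" and the callers are assumptions
 * of the example; note that no bi_end_io/bi_private is set on the chained bio.
 */
static void example_queue_extra_io(struct bio *parent, struct block_device *bdev,
				   sector_t sector, struct page *extra_page)
{
	struct bio *extra = bio_alloc(bdev, 1, bio_op(parent), GFP_NOIO);

	extra->bi_iter.bi_sector = sector;
	__bio_add_page(extra, extra_page, PAGE_SIZE, 0);
	bio_chain(extra, parent);	/* parent's end_io now waits for both */
	submit_bio(extra);
}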
354
0a3140ea
CK
355struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
356 unsigned int nr_pages, unsigned int opf, gfp_t gfp)
3b005bf6 357{
07888c66 358 struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
0a3140ea 359
3b005bf6
CH
360 if (bio) {
361 bio_chain(bio, new);
362 submit_bio(bio);
363 }
364
365 return new;
366}
367EXPORT_SYMBOL_GPL(blk_next_bio);
368
df2cb6da
KO
369static void bio_alloc_rescue(struct work_struct *work)
370{
371 struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
372 struct bio *bio;
373
374 while (1) {
375 spin_lock(&bs->rescue_lock);
376 bio = bio_list_pop(&bs->rescue_list);
377 spin_unlock(&bs->rescue_lock);
378
379 if (!bio)
380 break;
381
ed00aabd 382 submit_bio_noacct(bio);
df2cb6da
KO
383 }
384}
385
386static void punt_bios_to_rescuer(struct bio_set *bs)
387{
388 struct bio_list punt, nopunt;
389 struct bio *bio;
390
47e0fb46
N
391 if (WARN_ON_ONCE(!bs->rescue_workqueue))
392 return;
df2cb6da
KO
393 /*
394 * In order to guarantee forward progress we must punt only bios that
395 * were allocated from this bio_set; otherwise, if there was a bio on
396 * there for a stacking driver higher up in the stack, processing it
397 * could require allocating bios from this bio_set, and doing that from
398 * our own rescuer would be bad.
399 *
400 * Since bio lists are singly linked, pop them all instead of trying to
401 * remove from the middle of the list:
402 */
403
404 bio_list_init(&punt);
405 bio_list_init(&nopunt);
406
f5fe1b51 407 while ((bio = bio_list_pop(&current->bio_list[0])))
df2cb6da 408 bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
f5fe1b51 409 current->bio_list[0] = nopunt;
df2cb6da 410
f5fe1b51
N
411 bio_list_init(&nopunt);
412 while ((bio = bio_list_pop(&current->bio_list[1])))
413 bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
414 current->bio_list[1] = nopunt;
df2cb6da
KO
415
416 spin_lock(&bs->rescue_lock);
417 bio_list_merge(&bs->rescue_list, &punt);
418 spin_unlock(&bs->rescue_lock);
419
420 queue_work(bs->rescue_workqueue, &bs->rescue_work);
421}
422
1da177e4
LT
423/**
424 * bio_alloc_bioset - allocate a bio for I/O
609be106 425 * @bdev: block device to allocate the bio for (can be %NULL)
426 * @nr_vecs: number of bvecs to pre-allocate
427 * @opf: operation and flags for bio
519c8e9f 428 * @gfp_mask: the GFP_* mask given to the slab allocator
db18efac 429 * @bs: the bio_set to allocate from.
1da177e4 430 *
3175199a 431 * Allocate a bio from the mempools in @bs.
3f86a82a 432 *
3175199a 433 * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to
434 * allocate a bio. This is due to the mempool guarantees. To make this work,
435 * callers must never allocate more than 1 bio at a time from the general pool.
436 * Callers that need to allocate more than 1 bio must always submit the
437 * previously allocated bio for IO before attempting to allocate a new one.
438 * Failure to do so can cause deadlocks under memory pressure.
3f86a82a 439 *
3175199a 440 * Note that when running under submit_bio_noacct() (i.e. any block driver),
441 * bios are not submitted until after you return - see the code in
442 * submit_bio_noacct() that converts recursion into iteration, to prevent
443 * stack overflows.
df2cb6da 444 *
3175199a 445 * This would normally mean allocating multiple bios under submit_bio_noacct()
446 * would be susceptible to deadlocks, but we have
447 * deadlock avoidance code that resubmits any blocked bios from a rescuer
448 * thread.
df2cb6da 449 *
3175199a 450 * However, we do not guarantee forward progress for allocations from other
451 * mempools. Doing multiple allocations from the same mempool under
452 * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
453 * for per bio allocations.
df2cb6da 454 *
3175199a 455 * Returns: Pointer to new bio on success, NULL on failure.
3f86a82a 456 */
609be106 457struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
458 unsigned int opf, gfp_t gfp_mask,
7a88fa19 459 struct bio_set *bs)
1da177e4 460{
df2cb6da 461 gfp_t saved_gfp = gfp_mask;
451a9ebf
TH
462 struct bio *bio;
463 void *p;
464
609be106
CH
465 /* should not use nobvec bioset for nr_vecs > 0 */
466 if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0))
3175199a 467 return NULL;
df2cb6da 468
3175199a
CH
469 /*
470 * submit_bio_noacct() converts recursion to iteration; this means if
471 * we're running beneath it, any bios we allocate and submit will not be
472 * submitted (and thus freed) until after we return.
473 *
474 * This exposes us to a potential deadlock if we allocate multiple bios
475 * from the same bio_set() while running underneath submit_bio_noacct().
476 * If we were to allocate multiple bios (say a stacking block driver
477 * that was splitting bios), we would deadlock if we exhausted the
478 * mempool's reserve.
479 *
480 * We solve this, and guarantee forward progress, with a rescuer
481 * workqueue per bio_set. If we go to allocate and there are bios on
482 * current->bio_list, we first try the allocation without
483 * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
484 * blocking to the rescuer workqueue before we retry with the original
485 * gfp_flags.
486 */
487 if (current->bio_list &&
488 (!bio_list_empty(&current->bio_list[0]) ||
489 !bio_list_empty(&current->bio_list[1])) &&
490 bs->rescue_workqueue)
491 gfp_mask &= ~__GFP_DIRECT_RECLAIM;
492
493 p = mempool_alloc(&bs->bio_pool, gfp_mask);
494 if (!p && gfp_mask != saved_gfp) {
495 punt_bios_to_rescuer(bs);
496 gfp_mask = saved_gfp;
8aa6ba2f 497 p = mempool_alloc(&bs->bio_pool, gfp_mask);
3f86a82a 498 }
451a9ebf
TH
499 if (unlikely(!p))
500 return NULL;
1da177e4 501
3175199a 502 bio = p + bs->front_pad;
609be106 503 if (nr_vecs > BIO_INLINE_VECS) {
3175199a 504 struct bio_vec *bvl = NULL;
34053979 505
609be106 506 bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
df2cb6da
KO
507 if (!bvl && gfp_mask != saved_gfp) {
508 punt_bios_to_rescuer(bs);
509 gfp_mask = saved_gfp;
609be106 510 bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
df2cb6da 511 }
34053979
IM
512 if (unlikely(!bvl))
513 goto err_free;
a38352e0 514
49add496 515 bio_init(bio, bdev, bvl, nr_vecs, opf);
609be106 516 } else if (nr_vecs) {
49add496 517 bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf);
3175199a 518 } else {
49add496 519 bio_init(bio, bdev, NULL, 0, opf);
1da177e4 520 }
3f86a82a
KO
521
522 bio->bi_pool = bs;
1da177e4 523 return bio;
34053979
IM
524
525err_free:
8aa6ba2f 526 mempool_free(p, &bs->bio_pool);
34053979 527 return NULL;
1da177e4 528}
a112a71d 529EXPORT_SYMBOL(bio_alloc_bioset);
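
/*
 * Usage sketch (illustrative only): a driver allocating from its own bio_set.
 * As the comment above requires, each bio is submitted before the next one is
 * allocated from the same set, so the mempool guarantee holds under memory
 * pressure. "my_bio_set", the page array and the completion handler are
 * assumptions of the example.
 */
static void example_write_endio(struct bio *bio)
{
	/* ... per-bio completion handling would go here ... */
	bio_put(bio);
}

static void example_write_chunks(struct block_device *bdev, struct bio_set *my_bio_set,
				 struct page **pages, unsigned int nr_pages,
				 sector_t sector)
{
	unsigned int i;

	for (i = 0; i < nr_pages; i++) {
		/* GFP_NOIO includes __GFP_DIRECT_RECLAIM, so this cannot fail */
		struct bio *bio = bio_alloc_bioset(bdev, 1, REQ_OP_WRITE,
						   GFP_NOIO, my_bio_set);

		bio->bi_iter.bi_sector = sector + (i << (PAGE_SHIFT - 9));
		bio->bi_end_io = example_write_endio;
		__bio_add_page(bio, pages[i], PAGE_SIZE, 0);
		submit_bio(bio);	/* submit before allocating the next bio */
	}
}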
1da177e4 530
3175199a
CH
531/**
532 * bio_kmalloc - kmalloc a bio for I/O
533 * @gfp_mask: the GFP_* mask given to the slab allocator
534 * @nr_iovecs: number of iovecs to pre-allocate
535 *
536 * Use kmalloc to allocate and initialize a bio.
537 *
538 * Returns: Pointer to new bio on success, NULL on failure.
539 */
0f2e6ab8 540struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs)
3175199a
CH
541{
542 struct bio *bio;
543
544 if (nr_iovecs > UIO_MAXIOV)
545 return NULL;
546
547 bio = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
548 if (unlikely(!bio))
549 return NULL;
49add496
CH
550 bio_init(bio, NULL, nr_iovecs ? bio->bi_inline_vecs : NULL, nr_iovecs,
551 0);
3175199a
CH
552 bio->bi_pool = NULL;
553 return bio;
554}
555EXPORT_SYMBOL(bio_kmalloc);
556
6f822e1b 557void zero_fill_bio(struct bio *bio)
1da177e4 558{
7988613b
KO
559 struct bio_vec bv;
560 struct bvec_iter iter;
1da177e4 561
ab6c340e
CH
562 bio_for_each_segment(bv, bio, iter)
563 memzero_bvec(&bv);
1da177e4 564}
6f822e1b 565EXPORT_SYMBOL(zero_fill_bio);
1da177e4 566
83c9c547 567/**
568 * bio_truncate - truncate the bio down to @new_size
569 * @bio: the bio to be truncated
570 * @new_size: new size for truncating the bio
571 *
572 * Description:
573 * Truncate the bio to the new size @new_size. If bio_op(bio) is
574 * REQ_OP_READ, zero the truncated part. This function should only
575 * be used for handling corner cases, such as bio eod.
576 */
4f7ab09a 577static void bio_truncate(struct bio *bio, unsigned new_size)
85a8ce62
ML
578{
579 struct bio_vec bv;
580 struct bvec_iter iter;
581 unsigned int done = 0;
582 bool truncated = false;
583
584 if (new_size >= bio->bi_iter.bi_size)
585 return;
586
83c9c547 587 if (bio_op(bio) != REQ_OP_READ)
85a8ce62
ML
588 goto exit;
589
590 bio_for_each_segment(bv, bio, iter) {
591 if (done + bv.bv_len > new_size) {
592 unsigned offset;
593
594 if (!truncated)
595 offset = new_size - done;
596 else
597 offset = 0;
3ee859e3
OH
598 zero_user(bv.bv_page, bv.bv_offset + offset,
599 bv.bv_len - offset);
85a8ce62
ML
600 truncated = true;
601 }
602 done += bv.bv_len;
603 }
604
605 exit:
606 /*
607 * Don't touch bvec table here and make it really immutable, since
608 * fs bio user has to retrieve all pages via bio_for_each_segment_all
609 * in its .end_bio() callback.
610 *
611 * It is enough to truncate bio by updating .bi_size since we can make
612 * correct bvec with the updated .bi_size for drivers.
613 */
614 bio->bi_iter.bi_size = new_size;
615}
616
29125ed6
CH
617/**
618 * guard_bio_eod - truncate a BIO to fit the block device
619 * @bio: bio to truncate
620 *
621 * This allows us to do IO even on the odd last sectors of a device, even if the
622 * block size is some multiple of the physical sector size.
623 *
624 * We'll just truncate the bio to the size of the device, and clear the end of
625 * the buffer head manually. Truly out-of-range accesses will turn into actual
626 * I/O errors, this only handles the "we need to be able to do I/O at the final
627 * sector" case.
628 */
629void guard_bio_eod(struct bio *bio)
630{
309dca30 631 sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
29125ed6
CH
632
633 if (!maxsector)
634 return;
635
636 /*
637 * If the *whole* IO is past the end of the device,
638 * let it through, and the IO layer will turn it into
639 * an EIO.
640 */
641 if (unlikely(bio->bi_iter.bi_sector >= maxsector))
642 return;
643
644 maxsector -= bio->bi_iter.bi_sector;
645 if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
646 return;
647
648 bio_truncate(bio, maxsector << 9);
649}
650
be4d234d
JA
651#define ALLOC_CACHE_MAX 512
652#define ALLOC_CACHE_SLACK 64
653
654static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
655 unsigned int nr)
656{
657 unsigned int i = 0;
658 struct bio *bio;
659
fcade2ce
JA
660 while ((bio = cache->free_list) != NULL) {
661 cache->free_list = bio->bi_next;
be4d234d
JA
662 cache->nr--;
663 bio_free(bio);
664 if (++i == nr)
665 break;
666 }
667}
668
669static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
670{
671 struct bio_set *bs;
672
673 bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead);
674 if (bs->cache) {
675 struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu);
676
677 bio_alloc_cache_prune(cache, -1U);
678 }
679 return 0;
680}
681
682static void bio_alloc_cache_destroy(struct bio_set *bs)
683{
684 int cpu;
685
686 if (!bs->cache)
687 return;
688
689 cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
690 for_each_possible_cpu(cpu) {
691 struct bio_alloc_cache *cache;
692
693 cache = per_cpu_ptr(bs->cache, cpu);
694 bio_alloc_cache_prune(cache, -1U);
695 }
696 free_percpu(bs->cache);
697}
698
1da177e4
LT
699/**
700 * bio_put - release a reference to a bio
701 * @bio: bio to release reference to
702 *
703 * Description:
704 * Put a reference to a &struct bio, either one you have gotten with
9b10f6a9 705 * bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it.
1da177e4
LT
706 **/
707void bio_put(struct bio *bio)
708{
be4d234d 709 if (unlikely(bio_flagged(bio, BIO_REFFED))) {
9e8c0d0d 710 BUG_ON(!atomic_read(&bio->__bi_cnt));
be4d234d
JA
711 if (!atomic_dec_and_test(&bio->__bi_cnt))
712 return;
713 }
dac56212 714
be4d234d
JA
715 if (bio_flagged(bio, BIO_PERCPU_CACHE)) {
716 struct bio_alloc_cache *cache;
717
718 bio_uninit(bio);
719 cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
fcade2ce
JA
720 bio->bi_next = cache->free_list;
721 cache->free_list = bio;
be4d234d
JA
722 if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
723 bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
724 put_cpu();
725 } else {
726 bio_free(bio);
dac56212 727 }
1da177e4 728}
a112a71d 729EXPORT_SYMBOL(bio_put);
1da177e4 730
a0e8de79 731static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
59d276fe 732{
b7c44ed9 733 bio_set_flag(bio, BIO_CLONED);
111be883
SL
734 if (bio_flagged(bio_src, BIO_THROTTLED))
735 bio_set_flag(bio, BIO_THROTTLED);
46bbf653
CH
736 if (bio_flagged(bio_src, BIO_REMAPPED))
737 bio_set_flag(bio, BIO_REMAPPED);
ca474b73 738 bio->bi_ioprio = bio_src->bi_ioprio;
cb6934f8 739 bio->bi_write_hint = bio_src->bi_write_hint;
59d276fe 740 bio->bi_iter = bio_src->bi_iter;
20bd723e 741
db6638d7 742 bio_clone_blkg_association(bio, bio_src);
e439bedf 743 blkcg_bio_issue_init(bio);
56b4b5ab
CH
744
745 if (bio_crypt_clone(bio, bio_src, gfp) < 0)
746 return -ENOMEM;
747 if (bio_integrity(bio_src) &&
748 bio_integrity_clone(bio, bio_src, gfp) < 0)
749 return -ENOMEM;
750 return 0;
59d276fe 751}
59d276fe
KO
752
753/**
a0e8de79 754 * bio_clone_fast - clone a bio that shares the original bio's biovec
755 * @bio_src: bio to clone from
756 * @gfp: allocation priority
757 * @bs: bio_set to allocate from
59d276fe 758 *
a0e8de79 759 * Allocate a new bio that is a clone of @bio_src. The caller owns the returned
760 * bio, but not the actual data it points to.
761 *
762 * The caller must ensure that the return bio is not freed before @bio_src.
59d276fe 763 */
a0e8de79 764struct bio *bio_clone_fast(struct bio *bio_src, gfp_t gfp, struct bio_set *bs)
59d276fe 765{
a0e8de79 766 struct bio *bio;
59d276fe 767
a0e8de79
CH
768 bio = bio_alloc_bioset(bio_src->bi_bdev, 0, bio_src->bi_opf, gfp, bs);
769 if (!bio)
59d276fe
KO
770 return NULL;
771
a0e8de79
CH
772 if (__bio_clone(bio, bio_src, gfp) < 0) {
773 bio_put(bio);
56b4b5ab
CH
774 return NULL;
775 }
a0e8de79 776 bio->bi_io_vec = bio_src->bi_io_vec;
59d276fe 777
a0e8de79 778 return bio;
59d276fe
KO
779}
780EXPORT_SYMBOL(bio_clone_fast);
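
/*
 * Usage sketch (illustrative only): the usual stacking-driver shape. The
 * clone borrows @bio's biovec, is redirected at a lower device and
 * resubmitted; the original bio is completed only from the clone's end_io.
 * "lower_bdev", the remapped sector and the bio_set are assumptions.
 */
static void example_clone_endio(struct bio *clone)
{
	struct bio *orig = clone->bi_private;

	if (clone->bi_status && !orig->bi_status)
		orig->bi_status = clone->bi_status;
	bio_put(clone);
	bio_endio(orig);
}

static void example_remap_and_submit(struct bio *bio, struct block_device *lower_bdev,
				     sector_t sector, struct bio_set *bs)
{
	struct bio *clone = bio_clone_fast(bio, GFP_NOIO, bs);

	if (!clone) {
		bio_io_error(bio);
		return;
	}
	bio_set_dev(clone, lower_bdev);
	clone->bi_iter.bi_sector = sector;
	clone->bi_end_io = example_clone_endio;
	clone->bi_private = bio;
	submit_bio_noacct(clone);
}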
781
a0e8de79 782/**
783 * __bio_clone_fast - clone a bio that shares the original bio's biovec
784 * @bio: bio to clone into
785 * @bio_src: bio to clone from
786 * @gfp: allocation priority
787 *
788 * Initialize a new bio in caller provided memory that is a clone of @bio_src.
789 * The caller owns the returned bio, but not the actual data it points to.
790 *
791 * The caller must ensure that @bio_src is not freed before @bio.
792 */
793int __bio_clone_fast(struct bio *bio, struct bio *bio_src, gfp_t gfp)
794{
795 int ret;
796
797 bio_init(bio, bio_src->bi_bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf);
798 ret = __bio_clone(bio, bio_src, gfp);
799 if (ret)
800 bio_uninit(bio);
801 return ret;
802}
803EXPORT_SYMBOL(__bio_clone_fast);
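
/*
 * Usage sketch (illustrative only): cloning into caller-owned memory, for
 * example a bio embedded in a per-I/O structure. "struct example_io" and its
 * lifetime handling are assumptions; the caller installs its own completion
 * handler, submits the clone, and eventually frees the containing structure.
 * The clone's own resources are dropped through the normal bio_endio() path.
 */
struct example_io {
	void		*driver_data;
	struct bio	clone;		/* caller-provided memory for the clone */
};

static int example_setup_clone(struct example_io *io, struct bio *bio_src)
{
	int ret;

	ret = __bio_clone_fast(&io->clone, bio_src, GFP_NOIO);
	if (ret)
		return ret;	/* the half-built clone was already uninitialized */

	io->clone.bi_private = io;
	/* the caller sets io->clone.bi_end_io and submits io->clone */
	return 0;
}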
804
5cbd28e3
CH
805const char *bio_devname(struct bio *bio, char *buf)
806{
309dca30 807 return bdevname(bio->bi_bdev, buf);
5cbd28e3
CH
808}
809EXPORT_SYMBOL(bio_devname);
810
9a6083be
CH
811/**
812 * bio_full - check if the bio is full
813 * @bio: bio to check
814 * @len: length of one segment to be added
815 *
816 * Return true if @bio is full and one segment with @len bytes can't be
817 * added to the bio, otherwise return false
818 */
819static inline bool bio_full(struct bio *bio, unsigned len)
820{
821 if (bio->bi_vcnt >= bio->bi_max_vecs)
822 return true;
823 if (bio->bi_iter.bi_size > UINT_MAX - len)
824 return true;
825 return false;
826}
827
5919482e
ML
828static inline bool page_is_mergeable(const struct bio_vec *bv,
829 struct page *page, unsigned int len, unsigned int off,
ff896738 830 bool *same_page)
5919482e 831{
d8166519
MWO
832 size_t bv_end = bv->bv_offset + bv->bv_len;
833 phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
5919482e
ML
834 phys_addr_t page_addr = page_to_phys(page);
835
836 if (vec_end_addr + 1 != page_addr + off)
837 return false;
838 if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
839 return false;
52d52d1c 840
ff896738 841 *same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
d8166519
MWO
842 if (*same_page)
843 return true;
844 return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
5919482e
ML
845}
846
9774b391
CH
847/**
848 * __bio_try_merge_page - try appending data to an existing bvec.
849 * @bio: destination bio
850 * @page: start page to add
851 * @len: length of the data to add
852 * @off: offset of the data relative to @page
853 * @same_page: return if the segment has been merged inside the same page
854 *
855 * Try to add the data at @page + @off to the last bvec of @bio. This is a
856 * useful optimisation for file systems with a block size smaller than the
857 * page size.
858 *
859 * Warn if (@len, @off) crosses pages in case that @same_page is true.
860 *
861 * Return %true on success or %false on failure.
862 */
863static bool __bio_try_merge_page(struct bio *bio, struct page *page,
864 unsigned int len, unsigned int off, bool *same_page)
865{
866 if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
867 return false;
868
869 if (bio->bi_vcnt > 0) {
870 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
871
872 if (page_is_mergeable(bv, page, len, off, same_page)) {
873 if (bio->bi_iter.bi_size > UINT_MAX - len) {
874 *same_page = false;
875 return false;
876 }
877 bv->bv_len += len;
878 bio->bi_iter.bi_size += len;
879 return true;
880 }
881 }
882 return false;
883}
884
e4581105
CH
885/*
886 * Try to merge a page into a segment, while obeying the hardware segment
887 * size limit. This is not for normal read/write bios, but for passthrough
888 * or Zone Append operations that we can't split.
889 */
890static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
891 struct page *page, unsigned len,
892 unsigned offset, bool *same_page)
489fbbcb 893{
384209cd 894 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
489fbbcb
ML
895 unsigned long mask = queue_segment_boundary(q);
896 phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
897 phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
898
899 if ((addr1 | mask) != (addr2 | mask))
900 return false;
489fbbcb
ML
901 if (bv->bv_len + len > queue_max_segment_size(q))
902 return false;
384209cd 903 return __bio_try_merge_page(bio, page, len, offset, same_page);
489fbbcb
ML
904}
905
1da177e4 906/**
e4581105
CH
907 * bio_add_hw_page - attempt to add a page to a bio with hw constraints
908 * @q: the target queue
909 * @bio: destination bio
910 * @page: page to add
911 * @len: vec entry length
912 * @offset: vec entry offset
913 * @max_sectors: maximum number of sectors that can be added
914 * @same_page: return if the segment has been merged inside the same page
c66a14d0 915 *
e4581105
CH
916 * Add a page to a bio while respecting the hardware max_sectors, max_segment
917 * and gap limitations.
1da177e4 918 */
e4581105 919int bio_add_hw_page(struct request_queue *q, struct bio *bio,
19047087 920 struct page *page, unsigned int len, unsigned int offset,
e4581105 921 unsigned int max_sectors, bool *same_page)
1da177e4 922{
1da177e4
LT
923 struct bio_vec *bvec;
924
e4581105 925 if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
1da177e4
LT
926 return 0;
927
e4581105 928 if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
1da177e4
LT
929 return 0;
930
80cfd548 931 if (bio->bi_vcnt > 0) {
e4581105 932 if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page))
384209cd 933 return len;
320ea869
CH
934
935 /*
936 * If the queue doesn't support SG gaps and adding this segment
937 * would create a gap, disallow it.
938 */
384209cd 939 bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
320ea869
CH
940 if (bvec_gap_to_prev(q, bvec, offset))
941 return 0;
80cfd548
JA
942 }
943
79d08f89 944 if (bio_full(bio, len))
1da177e4
LT
945 return 0;
946
14ccb66b 947 if (bio->bi_vcnt >= queue_max_segments(q))
489fbbcb
ML
948 return 0;
949
fcbf6a08
ML
950 bvec = &bio->bi_io_vec[bio->bi_vcnt];
951 bvec->bv_page = page;
952 bvec->bv_len = len;
953 bvec->bv_offset = offset;
954 bio->bi_vcnt++;
dcdca753 955 bio->bi_iter.bi_size += len;
1da177e4
LT
956 return len;
957}
19047087 958
e4581105
CH
959/**
960 * bio_add_pc_page - attempt to add page to passthrough bio
961 * @q: the target queue
962 * @bio: destination bio
963 * @page: page to add
964 * @len: vec entry length
965 * @offset: vec entry offset
966 *
967 * Attempt to add a page to the bio_vec maplist. This can fail for a
968 * number of reasons, such as the bio being full or target block device
969 * limitations. The target block device must allow bio's up to PAGE_SIZE,
970 * so it is always possible to add a single page to an empty bio.
971 *
972 * This should only be used by passthrough bios.
973 */
19047087
ML
974int bio_add_pc_page(struct request_queue *q, struct bio *bio,
975 struct page *page, unsigned int len, unsigned int offset)
976{
d1916c86 977 bool same_page = false;
e4581105
CH
978 return bio_add_hw_page(q, bio, page, len, offset,
979 queue_max_hw_sectors(q), &same_page);
19047087 980}
a112a71d 981EXPORT_SYMBOL(bio_add_pc_page);
6e68af66 982
ae29333f
JT
983/**
984 * bio_add_zone_append_page - attempt to add page to zone-append bio
985 * @bio: destination bio
986 * @page: page to add
987 * @len: vec entry length
988 * @offset: vec entry offset
989 *
990 * Attempt to add a page to the bio_vec maplist of a bio that will be submitted
991 * for a zone-append request. This can fail for a number of reasons, such as the
992 * bio being full or the target block device is not a zoned block device or
993 * other limitations of the target block device. The target block device must
994 * allow bio's up to PAGE_SIZE, so it is always possible to add a single page
995 * to an empty bio.
996 *
997 * Returns: number of bytes added to the bio, or 0 in case of a failure.
998 */
999int bio_add_zone_append_page(struct bio *bio, struct page *page,
1000 unsigned int len, unsigned int offset)
1001{
3caee463 1002 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
ae29333f
JT
1003 bool same_page = false;
1004
1005 if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
1006 return 0;
1007
1008 if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
1009 return 0;
1010
1011 return bio_add_hw_page(q, bio, page, len, offset,
1012 queue_max_zone_append_sectors(q), &same_page);
1013}
1014EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
1015
0aa69fd3 1016/**
551879a4 1017 * __bio_add_page - add page(s) to a bio in a new segment
0aa69fd3 1018 * @bio: destination bio
551879a4
ML
1019 * @page: start page to add
1020 * @len: length of the data to add, may cross pages
1021 * @off: offset of the data relative to @page, may cross pages
0aa69fd3
CH
1022 *
1023 * Add the data at @page + @off to @bio as a new bvec. The caller must ensure
1024 * that @bio has space for another bvec.
1025 */
1026void __bio_add_page(struct bio *bio, struct page *page,
1027 unsigned int len, unsigned int off)
1028{
1029 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
c66a14d0 1030
0aa69fd3 1031 WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
79d08f89 1032 WARN_ON_ONCE(bio_full(bio, len));
0aa69fd3
CH
1033
1034 bv->bv_page = page;
1035 bv->bv_offset = off;
1036 bv->bv_len = len;
c66a14d0 1037
c66a14d0 1038 bio->bi_iter.bi_size += len;
0aa69fd3 1039 bio->bi_vcnt++;
b8e24a93
JW
1040
1041 if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
1042 bio_set_flag(bio, BIO_WORKINGSET);
0aa69fd3
CH
1043}
1044EXPORT_SYMBOL_GPL(__bio_add_page);
1045
1046/**
551879a4 1047 * bio_add_page - attempt to add page(s) to bio
0aa69fd3 1048 * @bio: destination bio
551879a4 1049 * @page: start page to add
1050 * @len: vec entry length, may cross pages
1051 * @offset: vec entry offset relative to @page, may cross pages
0aa69fd3 1052 *
551879a4 1053 * Attempt to add page(s) to the bio_vec maplist. This will only fail
0aa69fd3 1054 * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio.
1055 */
1056int bio_add_page(struct bio *bio, struct page *page,
1057 unsigned int len, unsigned int offset)
1058{
ff896738
CH
1059 bool same_page = false;
1060
1061 if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
79d08f89 1062 if (bio_full(bio, len))
0aa69fd3
CH
1063 return 0;
1064 __bio_add_page(bio, page, len, offset);
1065 }
c66a14d0 1066 return len;
1da177e4 1067}
a112a71d 1068EXPORT_SYMBOL(bio_add_page);
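
/*
 * Usage sketch (illustrative only): filling a bio page by page and, when
 * bio_add_page() reports the bio is full, chaining it to a fresh bio so the
 * final bio's completion covers the whole range. The page source and the
 * caller that submits the last bio are assumptions of the example.
 */
static struct bio *example_add_pages(struct bio *bio, struct page **pages,
				     unsigned int nr_pages)
{
	unsigned int i;

	for (i = 0; i < nr_pages; i++) {
		struct bio *new;

		if (bio_add_page(bio, pages[i], PAGE_SIZE, 0) == PAGE_SIZE)
			continue;

		/* bio is full: chain it to a fresh one and send it on its way */
		new = bio_alloc(bio->bi_bdev, nr_pages - i, bio->bi_opf, GFP_NOIO);
		new->bi_iter.bi_sector = bio_end_sector(bio);
		bio_chain(bio, new);
		submit_bio(bio);

		bio = new;
		__bio_add_page(bio, pages[i], PAGE_SIZE, 0);
	}
	return bio;	/* the caller submits this final bio with its own end_io */
}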
1da177e4 1069
85f5a74c
MWO
1070/**
1071 * bio_add_folio - Attempt to add part of a folio to a bio.
1072 * @bio: BIO to add to.
1073 * @folio: Folio to add.
1074 * @len: How many bytes from the folio to add.
1075 * @off: First byte in this folio to add.
1076 *
1077 * Filesystems that use folios can call this function instead of calling
1078 * bio_add_page() for each page in the folio. If @off is bigger than
1079 * PAGE_SIZE, this function can create a bio_vec that starts in a page
1080 * after the bv_page. BIOs do not support folios that are 4GiB or larger.
1081 *
1082 * Return: Whether the addition was successful.
1083 */
1084bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
1085 size_t off)
1086{
1087 if (len > UINT_MAX || off > UINT_MAX)
455a844d 1088 return false;
85f5a74c
MWO
1089 return bio_add_page(bio, &folio->page, len, off) > 0;
1090}
1091
c809084a 1092void __bio_release_pages(struct bio *bio, bool mark_dirty)
7321ecbf
CH
1093{
1094 struct bvec_iter_all iter_all;
1095 struct bio_vec *bvec;
7321ecbf 1096
d241a95f
CH
1097 bio_for_each_segment_all(bvec, bio, iter_all) {
1098 if (mark_dirty && !PageCompound(bvec->bv_page))
1099 set_page_dirty_lock(bvec->bv_page);
7321ecbf 1100 put_page(bvec->bv_page);
d241a95f 1101 }
7321ecbf 1102}
c809084a 1103EXPORT_SYMBOL_GPL(__bio_release_pages);
7321ecbf 1104
1bb6b810 1105void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
6d0c48ae 1106{
fa5fa8ec
PB
1107 size_t size = iov_iter_count(iter);
1108
7a800a20 1109 WARN_ON_ONCE(bio->bi_max_vecs);
c42bca92 1110
fa5fa8ec
PB
1111 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1112 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
1113 size_t max_sectors = queue_max_zone_append_sectors(q);
1114
1115 size = min(size, max_sectors << SECTOR_SHIFT);
1116 }
1117
c42bca92 1118 bio->bi_vcnt = iter->nr_segs;
c42bca92
PB
1119 bio->bi_io_vec = (struct bio_vec *)iter->bvec;
1120 bio->bi_iter.bi_bvec_done = iter->iov_offset;
fa5fa8ec 1121 bio->bi_iter.bi_size = size;
ed97ce5e 1122 bio_set_flag(bio, BIO_NO_PAGE_REF);
977be012 1123 bio_set_flag(bio, BIO_CLONED);
7de55b7d 1124}
c42bca92 1125
d9cf3bd5
PB
1126static void bio_put_pages(struct page **pages, size_t size, size_t off)
1127{
1128 size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
1129
1130 for (i = 0; i < nr; i++)
1131 put_page(pages[i]);
1132}
1133
576ed913
CH
1134#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
1135
2cefe4db 1136/**
17d51b10 1137 * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
2cefe4db
KO
1138 * @bio: bio to add pages to
1139 * @iter: iov iterator describing the region to be mapped
1140 *
17d51b10 1141 * Pins pages from *iter and appends them to @bio's bvec array. The
2cefe4db 1142 * pages will have to be released using put_page() when done.
17d51b10 1143 * For multi-segment *iter, this function only adds pages from the
3cf14889 1144 * next non-empty segment of the iov iterator.
2cefe4db 1145 */
17d51b10 1146static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
2cefe4db 1147{
576ed913
CH
1148 unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
1149 unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
2cefe4db
KO
1150 struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
1151 struct page **pages = (struct page **)bv;
45691804 1152 bool same_page = false;
576ed913
CH
1153 ssize_t size, left;
1154 unsigned len, i;
b403ea24 1155 size_t offset;
576ed913
CH
1156
1157 /*
1158 * Move page array up in the allocated memory for the bio vecs as far as
1159 * possible so that we can start filling biovecs from the beginning
1160 * without overwriting the temporary page array.
1161 */
1162 BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
1163 pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
2cefe4db 1164
35c820e7 1165 size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
2cefe4db
KO
1166 if (unlikely(size <= 0))
1167 return size ? size : -EFAULT;
2cefe4db 1168
576ed913
CH
1169 for (left = size, i = 0; left > 0; left -= len, i++) {
1170 struct page *page = pages[i];
2cefe4db 1171
576ed913 1172 len = min_t(size_t, PAGE_SIZE - offset, left);
45691804
CH
1173
1174 if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
1175 if (same_page)
1176 put_page(page);
1177 } else {
d9cf3bd5
PB
1178 if (WARN_ON_ONCE(bio_full(bio, len))) {
1179 bio_put_pages(pages + i, left, offset);
1180 return -EINVAL;
1181 }
45691804
CH
1182 __bio_add_page(bio, page, len, offset);
1183 }
576ed913 1184 offset = 0;
2cefe4db
KO
1185 }
1186
2cefe4db
KO
1187 iov_iter_advance(iter, size);
1188 return 0;
1189}
17d51b10 1190
0512a75b
KB
1191static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
1192{
1193 unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
1194 unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
3caee463 1195 struct request_queue *q = bdev_get_queue(bio->bi_bdev);
0512a75b
KB
1196 unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
1197 struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
1198 struct page **pages = (struct page **)bv;
1199 ssize_t size, left;
1200 unsigned len, i;
1201 size_t offset;
4977d121 1202 int ret = 0;
0512a75b
KB
1203
1204 if (WARN_ON_ONCE(!max_append_sectors))
1205 return 0;
1206
1207 /*
1208 * Move page array up in the allocated memory for the bio vecs as far as
1209 * possible so that we can start filling biovecs from the beginning
1210 * without overwriting the temporary page array.
1211 */
1212 BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
1213 pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
1214
1215 size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
1216 if (unlikely(size <= 0))
1217 return size ? size : -EFAULT;
1218
1219 for (left = size, i = 0; left > 0; left -= len, i++) {
1220 struct page *page = pages[i];
1221 bool same_page = false;
1222
1223 len = min_t(size_t, PAGE_SIZE - offset, left);
1224 if (bio_add_hw_page(q, bio, page, len, offset,
4977d121 1225 max_append_sectors, &same_page) != len) {
d9cf3bd5 1226 bio_put_pages(pages + i, left, offset);
4977d121
NA
1227 ret = -EINVAL;
1228 break;
1229 }
0512a75b
KB
1230 if (same_page)
1231 put_page(page);
1232 offset = 0;
1233 }
1234
4977d121
NA
1235 iov_iter_advance(iter, size - left);
1236 return ret;
0512a75b
KB
1237}
1238
17d51b10 1239/**
6d0c48ae 1240 * bio_iov_iter_get_pages - add user or kernel pages to a bio
17d51b10 1241 * @bio: bio to add pages to
6d0c48ae 1242 * @iter: iov iterator describing the region to be added
1243 *
1244 * This takes either an iterator pointing to user memory, or one pointing to
1245 * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
1246 * map them into the kernel. On IO completion, the caller should put those
c42bca92 1247 * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided
1248 * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs
1249 * to ensure the bvecs and pages stay referenced until the submitted I/O is
1250 * completed by a call to ->ki_complete() or returns with an error other than
1251 * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF
1252 * on IO completion. If it isn't, then pages should be released.
17d51b10 1253 *
17d51b10 1254 * The function tries, but does not guarantee, to pin as many pages as
5cd3ddc1 1255 * fit into the bio, or are requested in @iter, whatever is smaller. If
6d0c48ae
JA
1256 * MM encounters an error pinning the requested pages, it stops. Error
1257 * is returned only if 0 pages could be pinned.
0cf41e5e
PB
1258 *
1259 * It's intended for direct IO, so doesn't do PSI tracking, the caller is
1260 * responsible for setting BIO_WORKINGSET if necessary.
17d51b10 1261 */
1262int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
1263{
c42bca92 1264 int ret = 0;
14eacf12 1265
c42bca92 1266 if (iov_iter_is_bvec(iter)) {
fa5fa8ec
PB
1267 bio_iov_bvec_set(bio, iter);
1268 iov_iter_advance(iter, bio->bi_iter.bi_size);
1269 return 0;
c42bca92 1270 }
17d51b10
MW
1271
1272 do {
86004515 1273 if (bio_op(bio) == REQ_OP_ZONE_APPEND)
0512a75b 1274 ret = __bio_iov_append_get_pages(bio, iter);
86004515
CH
1275 else
1276 ret = __bio_iov_iter_get_pages(bio, iter);
79d08f89 1277 } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
17d51b10 1278
0cf41e5e
PB
1279 /* don't account direct I/O as memory stall */
1280 bio_clear_flag(bio, BIO_WORKINGSET);
14eacf12 1281 return bio->bi_vcnt ? 0 : ret;
17d51b10 1282}
29b2a3aa 1283EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
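
/*
 * Usage sketch (illustrative only): the direct-I/O shape this helper is meant
 * for. The iov_iter describes the user buffer; pages pinned here are released
 * on completion with bio_release_pages() unless BIO_NO_PAGE_REF was set (the
 * bvec-iterator case described above). The completion handler and error
 * reporting are assumptions of the example.
 */
static void example_dio_endio(struct bio *bio)
{
	if (!bio_flagged(bio, BIO_NO_PAGE_REF))
		bio_release_pages(bio, bio_data_dir(bio) == READ);
	/* ... report bio->bi_status to the waiting kiocb here ... */
	bio_put(bio);
}

static int example_dio_read(struct block_device *bdev, loff_t pos,
			    struct iov_iter *iter)
{
	struct bio *bio;
	int ret;

	bio = bio_alloc(bdev, iov_iter_npages(iter, BIO_MAX_VECS),
			REQ_OP_READ, GFP_KERNEL);
	bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
	bio->bi_end_io = example_dio_endio;

	ret = bio_iov_iter_get_pages(bio, iter);
	if (ret) {
		bio_put(bio);
		return ret;
	}
	submit_bio(bio);
	return 0;
}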
2cefe4db 1284
4246a0b6 1285static void submit_bio_wait_endio(struct bio *bio)
9e882242 1286{
65e53aab 1287 complete(bio->bi_private);
9e882242
KO
1288}
1289
1290/**
1291 * submit_bio_wait - submit a bio, and wait until it completes
9e882242
KO
1292 * @bio: The &struct bio which describes the I/O
1293 *
1294 * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
1295 * bio_endio() on failure.
3d289d68 1296 *
1297 * WARNING: Unlike how submit_bio() is usually used, this function does not
1298 * consume the bio reference. The caller must drop the reference on their own.
9e882242 1300 */
4e49ea4a 1301int submit_bio_wait(struct bio *bio)
9e882242 1302{
309dca30
CH
1303 DECLARE_COMPLETION_ONSTACK_MAP(done,
1304 bio->bi_bdev->bd_disk->lockdep_map);
de6a78b6 1305 unsigned long hang_check;
9e882242 1306
65e53aab 1307 bio->bi_private = &done;
9e882242 1308 bio->bi_end_io = submit_bio_wait_endio;
1eff9d32 1309 bio->bi_opf |= REQ_SYNC;
4e49ea4a 1310 submit_bio(bio);
de6a78b6
ML
1311
1312 /* Prevent hang_check timer from firing at us during very long I/O */
1313 hang_check = sysctl_hung_task_timeout_secs;
1314 if (hang_check)
1315 while (!wait_for_completion_io_timeout(&done,
1316 hang_check * (HZ/2)))
1317 ;
1318 else
1319 wait_for_completion_io(&done);
9e882242 1320
65e53aab 1321 return blk_status_to_errno(bio->bi_status);
9e882242
KO
1322}
1323EXPORT_SYMBOL(submit_bio_wait);
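
/*
 * Usage sketch (illustrative only): a synchronous flush, showing the point
 * made in the warning above: submit_bio_wait() does not drop the caller's
 * reference, so the bio is still ours to bio_put() afterwards.
 */
static int example_issue_flush(struct block_device *bdev)
{
	struct bio *bio = bio_alloc(bdev, 0, REQ_OP_WRITE | REQ_PREFLUSH, GFP_KERNEL);
	int ret;

	ret = submit_bio_wait(bio);
	bio_put(bio);
	return ret;
}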
1324
d4aa57a1 1325void __bio_advance(struct bio *bio, unsigned bytes)
054bdf64
KO
1326{
1327 if (bio_integrity(bio))
1328 bio_integrity_advance(bio, bytes);
1329
a892c8d5 1330 bio_crypt_advance(bio, bytes);
4550dd6c 1331 bio_advance_iter(bio, &bio->bi_iter, bytes);
054bdf64 1332}
d4aa57a1 1333EXPORT_SYMBOL(__bio_advance);
054bdf64 1334
45db54d5
KO
1335void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
1336 struct bio *src, struct bvec_iter *src_iter)
16ac3d63 1337{
45db54d5 1338 while (src_iter->bi_size && dst_iter->bi_size) {
f8b679a0
CH
1339 struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
1340 struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
1341 unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
1342 void *src_buf;
1343
1344 src_buf = bvec_kmap_local(&src_bv);
1345 memcpy_to_bvec(&dst_bv, src_buf);
1346 kunmap_local(src_buf);
6e6e811d 1347
22b56c29
PB
1348 bio_advance_iter_single(src, src_iter, bytes);
1349 bio_advance_iter_single(dst, dst_iter, bytes);
16ac3d63
KO
1350 }
1351}
38a72dac
KO
1352EXPORT_SYMBOL(bio_copy_data_iter);
1353
1354/**
45db54d5
KO
1355 * bio_copy_data - copy contents of data buffers from one bio to another
1356 * @src: source bio
1357 * @dst: destination bio
38a72dac
KO
1358 *
1359 * Stops when it reaches the end of either @src or @dst - that is, copies
1360 * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
1361 */
1362void bio_copy_data(struct bio *dst, struct bio *src)
1363{
45db54d5
KO
1364 struct bvec_iter src_iter = src->bi_iter;
1365 struct bvec_iter dst_iter = dst->bi_iter;
1366
1367 bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
38a72dac 1368}
16ac3d63
KO
1369EXPORT_SYMBOL(bio_copy_data);
1370
491221f8 1371void bio_free_pages(struct bio *bio)
1dfa0f68
CH
1372{
1373 struct bio_vec *bvec;
6dc4f100 1374 struct bvec_iter_all iter_all;
1dfa0f68 1375
2b070cfe 1376 bio_for_each_segment_all(bvec, bio, iter_all)
1dfa0f68
CH
1377 __free_page(bvec->bv_page);
1378}
491221f8 1379EXPORT_SYMBOL(bio_free_pages);
1dfa0f68 1380
1da177e4
LT
1381/*
1382 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
1383 * for performing direct-IO in BIOs.
1384 *
1385 * The problem is that we cannot run set_page_dirty() from interrupt context
1386 * because the required locks are not interrupt-safe. So what we can do is to
1387 * mark the pages dirty _before_ performing IO. And in interrupt context,
1388 * check that the pages are still dirty. If so, fine. If not, redirty them
1389 * in process context.
1390 *
1391 * We special-case compound pages here: normally this means reads into hugetlb
1392 * pages. The logic in here doesn't really work right for compound pages
1393 * because the VM does not uniformly chase down the head page in all cases.
1394 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
1395 * handle them at all. So we skip compound pages here at an early stage.
1396 *
1397 * Note that this code is very hard to test under normal circumstances because
1398 * direct-io pins the pages with get_user_pages(). This makes
1399 * is_page_cache_freeable return false, and the VM will not clean the pages.
0d5c3eba 1400 * But other code (eg, flusher threads) could clean the pages if they are mapped
1da177e4
LT
1401 * pagecache.
1402 *
1403 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
1404 * deferred bio dirtying paths.
1405 */
1406
1407/*
1408 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
1409 */
1410void bio_set_pages_dirty(struct bio *bio)
1411{
cb34e057 1412 struct bio_vec *bvec;
6dc4f100 1413 struct bvec_iter_all iter_all;
1da177e4 1414
2b070cfe 1415 bio_for_each_segment_all(bvec, bio, iter_all) {
3bb50983
CH
1416 if (!PageCompound(bvec->bv_page))
1417 set_page_dirty_lock(bvec->bv_page);
1da177e4
LT
1418 }
1419}
1420
1da177e4
LT
1421/*
1422 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
1423 * If they are, then fine. If, however, some pages are clean then they must
1424 * have been written out during the direct-IO read. So we take another ref on
24d5493f 1425 * the BIO and re-dirty the pages in process context.
1da177e4
LT
1426 *
1427 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
ea1754a0
KS
1428 * here on. It will run one put_page() against each page and will run one
1429 * bio_put() against the BIO.
1da177e4
LT
1430 */
1431
65f27f38 1432static void bio_dirty_fn(struct work_struct *work);
1da177e4 1433
65f27f38 1434static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
1da177e4
LT
1435static DEFINE_SPINLOCK(bio_dirty_lock);
1436static struct bio *bio_dirty_list;
1437
1438/*
1439 * This runs in process context
1440 */
65f27f38 1441static void bio_dirty_fn(struct work_struct *work)
1da177e4 1442{
24d5493f 1443 struct bio *bio, *next;
1da177e4 1444
24d5493f
CH
1445 spin_lock_irq(&bio_dirty_lock);
1446 next = bio_dirty_list;
1da177e4 1447 bio_dirty_list = NULL;
24d5493f 1448 spin_unlock_irq(&bio_dirty_lock);
1da177e4 1449
24d5493f
CH
1450 while ((bio = next) != NULL) {
1451 next = bio->bi_private;
1da177e4 1452
d241a95f 1453 bio_release_pages(bio, true);
1da177e4 1454 bio_put(bio);
1da177e4
LT
1455 }
1456}
1457
1458void bio_check_pages_dirty(struct bio *bio)
1459{
cb34e057 1460 struct bio_vec *bvec;
24d5493f 1461 unsigned long flags;
6dc4f100 1462 struct bvec_iter_all iter_all;
1da177e4 1463
2b070cfe 1464 bio_for_each_segment_all(bvec, bio, iter_all) {
24d5493f
CH
1465 if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
1466 goto defer;
1da177e4
LT
1467 }
1468
d241a95f 1469 bio_release_pages(bio, false);
24d5493f
CH
1470 bio_put(bio);
1471 return;
1472defer:
1473 spin_lock_irqsave(&bio_dirty_lock, flags);
1474 bio->bi_private = bio_dirty_list;
1475 bio_dirty_list = bio;
1476 spin_unlock_irqrestore(&bio_dirty_lock, flags);
1477 schedule_work(&bio_dirty_work);
1da177e4
LT
1478}
1479
c4cf5261
JA
1480static inline bool bio_remaining_done(struct bio *bio)
1481{
1482 /*
1483 * If we're not chaining, then ->__bi_remaining is always 1 and
1484 * we always end io on the first invocation.
1485 */
1486 if (!bio_flagged(bio, BIO_CHAIN))
1487 return true;
1488
1489 BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
1490
326e1dbb 1491 if (atomic_dec_and_test(&bio->__bi_remaining)) {
b7c44ed9 1492 bio_clear_flag(bio, BIO_CHAIN);
c4cf5261 1493 return true;
326e1dbb 1494 }
c4cf5261
JA
1495
1496 return false;
1497}
1498
1da177e4 1499/**
1500 * bio_endio - end I/O on a bio
1501 * @bio: bio
1da177e4
LT
1502 *
1503 * Description:
4246a0b6
CH
1504 * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
1505 * way to end I/O on a bio. No one should call bi_end_io() directly on a
1506 * bio unless they own it and thus know that it has an end_io function.
fbbaf700
N
1507 *
1508 * bio_endio() can be called several times on a bio that has been chained
1509 * using bio_chain(). The ->bi_end_io() function will only be called the
60b6a7e6 1510 * last time.
1da177e4 1511 **/
4246a0b6 1512void bio_endio(struct bio *bio)
1da177e4 1513{
ba8c6967 1514again:
2b885517 1515 if (!bio_remaining_done(bio))
ba8c6967 1516 return;
7c20f116
CH
1517 if (!bio_integrity_endio(bio))
1518 return;
1da177e4 1519
a647a524 1520 if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
3caee463 1521 rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
67b42d0b 1522
60b6a7e6 1523 if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
3caee463 1524 trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
60b6a7e6
EH
1525 bio_clear_flag(bio, BIO_TRACE_COMPLETION);
1526 }
1527
ba8c6967
CH
1528 /*
1529 * Need to have a real endio function for chained bios, otherwise
1530 * various corner cases will break (like stacking block devices that
1531 * save/restore bi_end_io) - however, we want to avoid unbounded
1532 * recursion and blowing the stack. Tail call optimization would
1533 * handle this, but compiling with frame pointers also disables
1534 * gcc's sibling call optimization.
1535 */
1536 if (bio->bi_end_io == bio_chain_endio) {
1537 bio = __bio_chain_endio(bio);
1538 goto again;
196d38bc 1539 }
ba8c6967 1540
9e234eea 1541 blk_throtl_bio_endio(bio);
b222dd2f
SL
1542 /* release cgroup info */
1543 bio_uninit(bio);
ba8c6967
CH
1544 if (bio->bi_end_io)
1545 bio->bi_end_io(bio);
1da177e4 1546}
a112a71d 1547EXPORT_SYMBOL(bio_endio);
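
/*
 * Usage sketch (illustrative only): a driver completion path. The bio's owner
 * set ->bi_end_io before submitting; the driver just records a status and
 * calls bio_endio(), which also resolves any bio_chain() relationships.
 * "struct example_cmd" is an assumption of the example.
 */
struct example_cmd {
	struct bio	*bio;
	/* ... hardware-specific fields ... */
};

static void example_complete_cmd(struct example_cmd *cmd, blk_status_t status)
{
	struct bio *bio = cmd->bio;

	if (status && !bio->bi_status)
		bio->bi_status = status;
	bio_endio(bio);
}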
1da177e4 1548
20d0189b 1549/**
1550 * bio_split - split a bio
1551 * @bio: bio to split
1552 * @sectors: number of sectors to split from the front of @bio
1553 * @gfp: gfp mask
1554 * @bs: bio set to allocate from
1555 *
1556 * Allocates and returns a new bio which represents @sectors from the start of
1557 * @bio, and updates @bio to represent the remaining sectors.
1558 *
f3f5da62 1559 * Unless this is a discard request the newly allocated bio will point
dad77584 1560 * to @bio's bi_io_vec. It is the caller's responsibility to ensure that
1561 * neither @bio nor @bs are freed before the split bio.
20d0189b
KO
1562 */
1563struct bio *bio_split(struct bio *bio, int sectors,
1564 gfp_t gfp, struct bio_set *bs)
1565{
f341a4d3 1566 struct bio *split;
20d0189b
KO
1567
1568 BUG_ON(sectors <= 0);
1569 BUG_ON(sectors >= bio_sectors(bio));
1570
0512a75b
KB
1571 /* Zone append commands cannot be split */
1572 if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
1573 return NULL;
1574
f9d03f96 1575 split = bio_clone_fast(bio, gfp, bs);
20d0189b
KO
1576 if (!split)
1577 return NULL;
1578
1579 split->bi_iter.bi_size = sectors << 9;
1580
1581 if (bio_integrity(split))
fbd08e76 1582 bio_integrity_trim(split);
20d0189b
KO
1583
1584 bio_advance(bio, split->bi_iter.bi_size);
1585
fbbaf700 1586 if (bio_flagged(bio, BIO_TRACE_COMPLETION))
20d59023 1587 bio_set_flag(split, BIO_TRACE_COMPLETION);
fbbaf700 1588
20d0189b
KO
1589 return split;
1590}
1591EXPORT_SYMBOL(bio_split);
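
/*
 * Usage sketch (illustrative only): the split-and-chain loop used when a bio
 * is too large for the device. Each split fragment is chained back to the
 * remainder, so the original completion fires once every piece is done. The
 * chunk size and bio_set are assumptions of the example.
 */
static void example_submit_in_chunks(struct bio *bio, unsigned int max_sectors,
				     struct bio_set *bs)
{
	while (bio_sectors(bio) > max_sectors) {
		struct bio *split = bio_split(bio, max_sectors, GFP_NOIO, bs);

		bio_chain(split, bio);		/* @bio keeps the original end_io */
		submit_bio_noacct(split);
	}
	submit_bio_noacct(bio);
}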
1592
6678d83f
KO
1593/**
1594 * bio_trim - trim a bio
1595 * @bio: bio to trim
1596 * @offset: number of sectors to trim from the front of @bio
1597 * @size: size we want to trim @bio to, in sectors
e83502ca
CK
1598 *
1599 * This function is typically used for bios that are cloned and submitted
1600 * to the underlying device in parts.
6678d83f 1601 */
e83502ca 1602void bio_trim(struct bio *bio, sector_t offset, sector_t size)
6678d83f 1603{
e83502ca
CK
1604 if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
1605 offset + size > bio->bi_iter.bi_size))
1606 return;
6678d83f
KO
1607
1608 size <<= 9;
4f024f37 1609 if (offset == 0 && size == bio->bi_iter.bi_size)
6678d83f
KO
1610 return;
1611
6678d83f 1612 bio_advance(bio, offset << 9);
4f024f37 1613 bio->bi_iter.bi_size = size;
376a78ab
DM
1614
1615 if (bio_integrity(bio))
fbd08e76 1616 bio_integrity_trim(bio);
6678d83f
KO
1617}
1618EXPORT_SYMBOL_GPL(bio_trim);
1619
1da177e4
LT
1620/*
1621 * create memory pools for biovec's in a bio_set.
1622 * use the global biovec slabs created for general use.
1623 */
8aa6ba2f 1624int biovec_init_pool(mempool_t *pool, int pool_entries)
1da177e4 1625{
7a800a20 1626 struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
1da177e4 1627
8aa6ba2f 1628 return mempool_init_slab_pool(pool, pool_entries, bp->slab);
1da177e4
LT
1629}
1630
917a38c7
KO
1631/*
1632 * bioset_exit - exit a bioset initialized with bioset_init()
1633 *
1634 * May be called on a zeroed but uninitialized bioset (i.e. allocated with
1635 * kzalloc()).
1636 */
1637void bioset_exit(struct bio_set *bs)
1da177e4 1638{
be4d234d 1639 bio_alloc_cache_destroy(bs);
df2cb6da
KO
1640 if (bs->rescue_workqueue)
1641 destroy_workqueue(bs->rescue_workqueue);
917a38c7 1642 bs->rescue_workqueue = NULL;
df2cb6da 1643
8aa6ba2f
KO
1644 mempool_exit(&bs->bio_pool);
1645 mempool_exit(&bs->bvec_pool);
9f060e22 1646
7878cba9 1647 bioset_integrity_free(bs);
917a38c7
KO
1648 if (bs->bio_slab)
1649 bio_put_slab(bs);
1650 bs->bio_slab = NULL;
1651}
1652EXPORT_SYMBOL(bioset_exit);
1da177e4 1653
917a38c7 1654/**
1655 * bioset_init - Initialize a bio_set
dad08527 1656 * @bs: pool to initialize
917a38c7
KO
1657 * @pool_size: Number of bio and bio_vecs to cache in the mempool
1658 * @front_pad: Number of bytes to allocate in front of the returned bio
1659 * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS
1660 * and %BIOSET_NEED_RESCUER
1661 *
dad08527
KO
1662 * Description:
1663 * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
1664 * to ask for a number of bytes to be allocated in front of the bio.
1665 * Front pad allocation is useful for embedding the bio inside
1666 * another structure, to avoid allocating extra data to go with the bio.
1667 * Note that the bio must be embedded at the END of that structure always,
1668 * or things will break badly.
1669 * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
1670 * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast().
1671 * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to
1672 * dispatch queued requests when the mempool runs out of space.
1673 *
917a38c7
KO
1674 */
1675int bioset_init(struct bio_set *bs,
1676 unsigned int pool_size,
1677 unsigned int front_pad,
1678 int flags)
1679{
917a38c7 1680 bs->front_pad = front_pad;
9f180e31
ML
1681 if (flags & BIOSET_NEED_BVECS)
1682 bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
1683 else
1684 bs->back_pad = 0;
917a38c7
KO
1685
1686 spin_lock_init(&bs->rescue_lock);
1687 bio_list_init(&bs->rescue_list);
1688 INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
1689
49d1ec85 1690 bs->bio_slab = bio_find_or_create_slab(bs);
917a38c7
KO
1691 if (!bs->bio_slab)
1692 return -ENOMEM;
1693
1694 if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab))
1695 goto bad;
1696
1697 if ((flags & BIOSET_NEED_BVECS) &&
1698 biovec_init_pool(&bs->bvec_pool, pool_size))
1699 goto bad;
1700
be4d234d
JA
1701 if (flags & BIOSET_NEED_RESCUER) {
1702 bs->rescue_workqueue = alloc_workqueue("bioset",
1703 WQ_MEM_RECLAIM, 0);
1704 if (!bs->rescue_workqueue)
1705 goto bad;
1706 }
1707 if (flags & BIOSET_PERCPU_CACHE) {
1708 bs->cache = alloc_percpu(struct bio_alloc_cache);
1709 if (!bs->cache)
1710 goto bad;
1711 cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
1712 }
917a38c7
KO
1713
1714 return 0;
1715bad:
1716 bioset_exit(bs);
1717 return -ENOMEM;
1718}
1719EXPORT_SYMBOL(bioset_init);
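
/*
 * Usage sketch (illustrative only): a driver-private bio_set whose front_pad
 * lets a per-I/O structure be carved out of the same allocation as the bio.
 * The structure layout and pool size are assumptions; note the bio really
 * must be the last member, as the description above insists. Pair with
 * bioset_exit() on teardown.
 */
struct example_io_pad {
	void		*driver_data;
	struct bio	bio;		/* must be the last member */
};

static struct bio_set example_bio_set;

static int __init example_init_bioset(void)
{
	return bioset_init(&example_bio_set, BIO_POOL_SIZE,
			   offsetof(struct example_io_pad, bio),
			   BIOSET_NEED_BVECS);
}

/* A bio allocated from this set can then recover its padding: */
static inline struct example_io_pad *example_io_from_bio(struct bio *bio)
{
	return container_of(bio, struct example_io_pad, bio);
}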
1720
28e89fd9
JA
1721/*
1722 * Initialize and setup a new bio_set, based on the settings from
1723 * another bio_set.
1724 */
1725int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
1726{
1727 int flags;
1728
1729 flags = 0;
1730 if (src->bvec_pool.min_nr)
1731 flags |= BIOSET_NEED_BVECS;
1732 if (src->rescue_workqueue)
1733 flags |= BIOSET_NEED_RESCUER;
1734
1735 return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags);
1736}
1737EXPORT_SYMBOL(bioset_init_from_src);
1738
be4d234d
JA
1739/**
1740 * bio_alloc_kiocb - Allocate a bio from bio_set based on kiocb
1741 * @kiocb: kiocb describing the IO
b77c88c2 1742 * @bdev: block device to allocate the bio for (can be %NULL)
0ef47db1 1743 * @nr_vecs: number of iovecs to pre-allocate
b77c88c2 1744 * @opf: operation and flags for bio
be4d234d
JA
1745 * @bs: bio_set to allocate from
1746 *
1747 * Description:
1748 * Like @bio_alloc_bioset, but pass in the kiocb. The kiocb is only
1749 * used to check if we should dip into the per-cpu bio_set allocation
3d5b3fbe
JA
1750 * cache. The allocation uses GFP_KERNEL internally. On return, the
1751 * bio is marked BIO_PERCPU_CACHEABLE, and the final put of the bio
1752 * MUST be done from process context, not hard/soft IRQ.
be4d234d
JA
1753 *
1754 */
b77c88c2
CH
1755struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev,
1756 unsigned short nr_vecs, unsigned int opf, struct bio_set *bs)
be4d234d
JA
1757{
1758 struct bio_alloc_cache *cache;
1759 struct bio *bio;
1760
1761 if (!(kiocb->ki_flags & IOCB_ALLOC_CACHE) || nr_vecs > BIO_INLINE_VECS)
b77c88c2 1762 return bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs);
be4d234d
JA
1763
1764 cache = per_cpu_ptr(bs->cache, get_cpu());
fcade2ce
JA
1765 if (cache->free_list) {
1766 bio = cache->free_list;
1767 cache->free_list = bio->bi_next;
be4d234d
JA
1768 cache->nr--;
1769 put_cpu();
49add496
CH
1770 bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL,
1771 nr_vecs, opf);
be4d234d
JA
1772 bio->bi_pool = bs;
1773 bio_set_flag(bio, BIO_PERCPU_CACHE);
1774 return bio;
1775 }
1776 put_cpu();
b77c88c2 1777 bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_KERNEL, bs);
be4d234d
JA
1778 bio_set_flag(bio, BIO_PERCPU_CACHE);
1779 return bio;
1780}
1781EXPORT_SYMBOL_GPL(bio_alloc_kiocb);
1782
de76fd89 1783static int __init init_bio(void)
1da177e4
LT
1784{
1785 int i;
1786
7878cba9 1787 bio_integrity_init();
1da177e4 1788
de76fd89
CH
1789 for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
1790 struct biovec_slab *bvs = bvec_slabs + i;
a7fcd37c 1791
de76fd89
CH
1792 bvs->slab = kmem_cache_create(bvs->name,
1793 bvs->nr_vecs * sizeof(struct bio_vec), 0,
1794 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
1da177e4 1795 }
1da177e4 1796
be4d234d
JA
1797 cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
1798 bio_cpu_dead);
1799
f4f8154a 1800 if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
1da177e4
LT
1801 panic("bio: can't allocate bios\n");
1802
f4f8154a 1803 if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
a91a2785
MP
1804 panic("bio: can't create integrity pool\n");
1805
1da177e4
LT
1806 return 0;
1807}
1da177e4 1808subsys_initcall(init_bio);