dm: change "unsigned" to "unsigned int"
[linux-2.6-block.git] / drivers / md / dm.c
3bd94003 1// SPDX-License-Identifier: GPL-2.0-only
1da177e4
LT
2/*
3 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
784aae73 4 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
1da177e4
LT
5 *
6 * This file is released under the GPL.
7 */
8
4cc96131
MS
9#include "dm-core.h"
10#include "dm-rq.h"
51e5b2bd 11#include "dm-uevent.h"
91ccbbac 12#include "dm-ima.h"
1da177e4
LT
13
14#include <linux/init.h>
15#include <linux/module.h>
48c9c27b 16#include <linux/mutex.h>
6958c1c6 17#include <linux/sched/mm.h>
174cd4b1 18#include <linux/sched/signal.h>
1da177e4
LT
19#include <linux/blkpg.h>
20#include <linux/bio.h>
1da177e4 21#include <linux/mempool.h>
f26c5719 22#include <linux/dax.h>
1da177e4
LT
23#include <linux/slab.h>
24#include <linux/idr.h>
7e026c8c 25#include <linux/uio.h>
3ac51e74 26#include <linux/hdreg.h>
3f77316d 27#include <linux/delay.h>
ffcc3936 28#include <linux/wait.h>
71cdb697 29#include <linux/pr.h>
b0b4d7c6 30#include <linux/refcount.h>
c6a564ff 31#include <linux/part_stat.h>
a892c8d5 32#include <linux/blk-crypto.h>
1e8d44bd 33#include <linux/blk-crypto-profile.h>
55782138 34
72d94861
AK
35#define DM_MSG_PREFIX "core"
36
60935eb2
MB
37/*
38 * Cookies are numeric values sent with CHANGE and REMOVE
39 * uevents while resuming, removing or renaming the device.
40 */
41#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
42#define DM_COOKIE_LENGTH 24
43
b99fdcdc
ML
44/*
45 * For REQ_POLLED fs bio, this flag is set if we link mapped underlying
46 * dm_io into one list, and reuse bio->bi_private as the list head. Before
47 * ending this fs bio, we will recover its ->bi_private.
48 */
49#define REQ_DM_POLL_LIST REQ_DRV
50
1da177e4
LT
51static const char *_name = DM_NAME;
52
53static unsigned int major = 0;
54static unsigned int _major = 0;
55
d15b774c
AK
56static DEFINE_IDR(_minor_idr);
57
f32c10b0 58static DEFINE_SPINLOCK(_minor_lock);
2c140a24
MP
59
60static void do_deferred_remove(struct work_struct *w);
61
62static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
63
acfe0ad7
MP
64static struct workqueue_struct *deferred_remove_workqueue;
65
93e6442c
MP
66atomic_t dm_global_event_nr = ATOMIC_INIT(0);
67DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
68
62e08243
MP
69void dm_issue_global_event(void)
70{
71 atomic_inc(&dm_global_event_nr);
72 wake_up(&dm_global_eventq);
73}
74
442761fd
MS
75DEFINE_STATIC_KEY_FALSE(stats_enabled);
76DEFINE_STATIC_KEY_FALSE(swap_bios_enabled);
77DEFINE_STATIC_KEY_FALSE(zoned_enabled);
78
1da177e4 79/*
64f52b0e 80 * One of these is allocated (on-stack) per original bio.
1da177e4 81 */
64f52b0e 82struct clone_info {
64f52b0e
MS
83 struct dm_table *map;
84 struct bio *bio;
85 struct dm_io *io;
86 sector_t sector;
86a3238c 87 unsigned int sector_count;
4edadf6d
MS
88 bool is_abnormal_io:1;
89 bool submit_as_polled:1;
64f52b0e
MS
90};
91
6c23f0bd
CH
92static inline struct dm_target_io *clone_to_tio(struct bio *clone)
93{
94 return container_of(clone, struct dm_target_io, clone);
95}
96
64f52b0e
MS
97void *dm_per_bio_data(struct bio *bio, size_t data_size)
98{
655f3aad 99 if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO))
62f26317
JX
100 return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
101 return (char *)bio - DM_IO_BIO_OFFSET - data_size;
64f52b0e
MS
102}
103EXPORT_SYMBOL_GPL(dm_per_bio_data);
104
105struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
106{
107 struct dm_io *io = (struct dm_io *)((char *)data + data_size);
108 if (io->magic == DM_IO_MAGIC)
62f26317 109 return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
64f52b0e 110 BUG_ON(io->magic != DM_TIO_MAGIC);
62f26317 111 return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
64f52b0e
MS
112}
113EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
114
86a3238c 115unsigned int dm_bio_get_target_bio_nr(const struct bio *bio)
64f52b0e
MS
116{
117 return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
118}
119EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
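/*
 * Illustrative sketch, not part of dm.c: how a hypothetical bio-based
 * target could use dm_per_bio_data().  The target sizes its per-bio
 * context via ti->per_io_data_size in .ctr; dm_per_bio_data() then
 * returns that context for the clone passed to .map/.end_io.  All
 * "example_*" names below are invented for illustration.
 */
struct example_per_bio_data {
	unsigned long start_jiffies;
};

static int example_map(struct dm_target *ti, struct bio *bio)
{
	struct dm_dev *dev = ti->private;	/* assumed set up in .ctr */
	struct example_per_bio_data *pb =
		dm_per_bio_data(bio, sizeof(struct example_per_bio_data));

	pb->start_jiffies = jiffies;		/* stamp for later .end_io use */
	bio_set_dev(bio, dev->bdev);
	return DM_MAPIO_REMAPPED;
}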
120
ba61fdd1
JM
121#define MINOR_ALLOCED ((void *)-1)
122
115485e8 123#define DM_NUMA_NODE NUMA_NO_NODE
115485e8 124static int dm_numa_node = DM_NUMA_NODE;
faad87df 125
a666e5c0
MP
126#define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE)
127static int swap_bios = DEFAULT_SWAP_BIOS;
128static int get_swap_bios(void)
129{
130 int latch = READ_ONCE(swap_bios);
131 if (unlikely(latch <= 0))
132 latch = DEFAULT_SWAP_BIOS;
133 return latch;
134}
135
86f1152b
BM
136struct table_device {
137 struct list_head list;
b0b4d7c6 138 refcount_t count;
86f1152b
BM
139 struct dm_dev dm_dev;
140};
141
e8603136
MS
142/*
143 * Bio-based DM's mempools' reserved IOs set by the user.
144 */
4cc96131 145#define RESERVED_BIO_BASED_IOS 16
86a3238c 146static unsigned int reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
e8603136 147
115485e8
MS
148static int __dm_get_module_param_int(int *module_param, int min, int max)
149{
6aa7de05 150 int param = READ_ONCE(*module_param);
115485e8
MS
151 int modified_param = 0;
152 bool modified = true;
153
154 if (param < min)
155 modified_param = min;
156 else if (param > max)
157 modified_param = max;
158 else
159 modified = false;
160
161 if (modified) {
162 (void)cmpxchg(module_param, param, modified_param);
163 param = modified_param;
164 }
165
166 return param;
167}
168
86a3238c 169unsigned int __dm_get_module_param(unsigned int *module_param, unsigned int def, unsigned int max)
f4790826 170{
86a3238c
HM
171 unsigned int param = READ_ONCE(*module_param);
172 unsigned int modified_param = 0;
f4790826 173
09c2d531
MS
174 if (!param)
175 modified_param = def;
176 else if (param > max)
177 modified_param = max;
f4790826 178
09c2d531
MS
179 if (modified_param) {
180 (void)cmpxchg(module_param, param, modified_param);
181 param = modified_param;
f4790826
MS
182 }
183
09c2d531 184 return param;
f4790826
MS
185}
186
86a3238c 187unsigned int dm_get_reserved_bio_based_ios(void)
e8603136 188{
09c2d531 189 return __dm_get_module_param(&reserved_bio_based_ios,
4cc96131 190 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
e8603136
MS
191}
192EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
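/*
 * Illustrative note, not part of dm.c: __dm_get_module_param() reads the
 * knob once and clamps it, so writing 0 to reserved_bio_based_ios makes
 * the next read return the default (RESERVED_BIO_BASED_IOS == 16) and
 * cmpxchg() writes that value back.  Such a knob is typically exposed
 * with the usual module_param() pattern (sketch below; the permission
 * bits here are an assumption):
 *
 *	module_param(reserved_bio_based_ios, uint, 0644);
 *	MODULE_PARM_DESC(reserved_bio_based_ios,
 *			 "Reserved IOs in bio-based mempools");
 */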
193
86a3238c 194static unsigned int dm_get_numa_node(void)
115485e8
MS
195{
196 return __dm_get_module_param_int(&dm_numa_node,
197 DM_NUMA_NODE, num_online_nodes() - 1);
198}
199
1da177e4
LT
200static int __init local_init(void)
201{
e689fbab 202 int r;
1ae49ea2 203
51e5b2bd 204 r = dm_uevent_init();
51157b4a 205 if (r)
e689fbab 206 return r;
51e5b2bd 207
acfe0ad7
MP
208 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
209 if (!deferred_remove_workqueue) {
210 r = -ENOMEM;
211 goto out_uevent_exit;
212 }
213
1da177e4
LT
214 _major = major;
215 r = register_blkdev(_major, _name);
51157b4a 216 if (r < 0)
acfe0ad7 217 goto out_free_workqueue;
1da177e4
LT
218
219 if (!_major)
220 _major = r;
221
222 return 0;
51157b4a 223
acfe0ad7
MP
224out_free_workqueue:
225 destroy_workqueue(deferred_remove_workqueue);
51157b4a
KU
226out_uevent_exit:
227 dm_uevent_exit();
51157b4a
KU
228
229 return r;
1da177e4
LT
230}
231
232static void local_exit(void)
233{
2c140a24 234 flush_scheduled_work();
acfe0ad7 235 destroy_workqueue(deferred_remove_workqueue);
2c140a24 236
00d59405 237 unregister_blkdev(_major, _name);
51e5b2bd 238 dm_uevent_exit();
1da177e4
LT
239
240 _major = 0;
241
242 DMINFO("cleaned up");
243}
244
b9249e55 245static int (*_inits[])(void) __initdata = {
1da177e4
LT
246 local_init,
247 dm_target_init,
248 dm_linear_init,
249 dm_stripe_init,
952b3557 250 dm_io_init,
945fa4d2 251 dm_kcopyd_init,
1da177e4 252 dm_interface_init,
fd2ed4d2 253 dm_statistics_init,
1da177e4
LT
254};
255
b9249e55 256static void (*_exits[])(void) = {
1da177e4
LT
257 local_exit,
258 dm_target_exit,
259 dm_linear_exit,
260 dm_stripe_exit,
952b3557 261 dm_io_exit,
945fa4d2 262 dm_kcopyd_exit,
1da177e4 263 dm_interface_exit,
fd2ed4d2 264 dm_statistics_exit,
1da177e4
LT
265};
266
267static int __init dm_init(void)
268{
269 const int count = ARRAY_SIZE(_inits);
1da177e4
LT
270 int r, i;
271
f1cd6cb2
TS
272#if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE))
273 DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled."
274 " Duplicate IMA measurements will not be recorded in the IMA log.");
275#endif
276
1da177e4
LT
277 for (i = 0; i < count; i++) {
278 r = _inits[i]();
279 if (r)
280 goto bad;
281 }
282
283 return 0;
f1cd6cb2 284bad:
1da177e4
LT
285 while (i--)
286 _exits[i]();
287
288 return r;
289}
290
291static void __exit dm_exit(void)
292{
293 int i = ARRAY_SIZE(_exits);
294
295 while (i--)
296 _exits[i]();
d15b774c
AK
297
298 /*
299 * Should be empty by this point.
300 */
d15b774c 301 idr_destroy(&_minor_idr);
1da177e4
LT
302}
303
304/*
305 * Block device functions
306 */
432a212c
MA
307int dm_deleting_md(struct mapped_device *md)
308{
309 return test_bit(DMF_DELETING, &md->flags);
310}
311
fe5f9f2c 312static int dm_blk_open(struct block_device *bdev, fmode_t mode)
1da177e4
LT
313{
314 struct mapped_device *md;
315
fba9f90e
JM
316 spin_lock(&_minor_lock);
317
fe5f9f2c 318 md = bdev->bd_disk->private_data;
fba9f90e
JM
319 if (!md)
320 goto out;
321
5c6bd75d 322 if (test_bit(DMF_FREEING, &md->flags) ||
432a212c 323 dm_deleting_md(md)) {
fba9f90e
JM
324 md = NULL;
325 goto out;
326 }
327
1da177e4 328 dm_get(md);
5c6bd75d 329 atomic_inc(&md->open_count);
fba9f90e
JM
330out:
331 spin_unlock(&_minor_lock);
332
333 return md ? 0 : -ENXIO;
1da177e4
LT
334}
335
db2a144b 336static void dm_blk_close(struct gendisk *disk, fmode_t mode)
1da177e4 337{
63a4f065 338 struct mapped_device *md;
6e9624b8 339
4a1aeb98
MB
340 spin_lock(&_minor_lock);
341
63a4f065
MS
342 md = disk->private_data;
343 if (WARN_ON(!md))
344 goto out;
345
2c140a24
MP
346 if (atomic_dec_and_test(&md->open_count) &&
347 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
acfe0ad7 348 queue_work(deferred_remove_workqueue, &deferred_remove_work);
2c140a24 349
1da177e4 350 dm_put(md);
63a4f065 351out:
4a1aeb98 352 spin_unlock(&_minor_lock);
1da177e4
LT
353}
354
5c6bd75d
AK
355int dm_open_count(struct mapped_device *md)
356{
357 return atomic_read(&md->open_count);
358}
359
360/*
361 * Guarantees nothing is using the device before it's deleted.
362 */
2c140a24 363int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
5c6bd75d
AK
364{
365 int r = 0;
366
367 spin_lock(&_minor_lock);
368
2c140a24 369 if (dm_open_count(md)) {
5c6bd75d 370 r = -EBUSY;
2c140a24
MP
371 if (mark_deferred)
372 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
373 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
374 r = -EEXIST;
5c6bd75d
AK
375 else
376 set_bit(DMF_DELETING, &md->flags);
377
378 spin_unlock(&_minor_lock);
379
380 return r;
381}
382
2c140a24
MP
383int dm_cancel_deferred_remove(struct mapped_device *md)
384{
385 int r = 0;
386
387 spin_lock(&_minor_lock);
388
389 if (test_bit(DMF_DELETING, &md->flags))
390 r = -EBUSY;
391 else
392 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
393
394 spin_unlock(&_minor_lock);
395
396 return r;
397}
398
399static void do_deferred_remove(struct work_struct *w)
400{
401 dm_deferred_remove();
402}
403
3ac51e74
DW
404static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
405{
406 struct mapped_device *md = bdev->bd_disk->private_data;
407
408 return dm_get_geometry(md, geo);
409}
410
971888c4 411static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
5bd5e8d8 412 struct block_device **bdev)
aa129a22 413{
564b5c54 414 struct dm_target *ti;
6c182cd8 415 struct dm_table *map;
971888c4 416 int r;
aa129a22 417
6c182cd8 418retry:
e56f81e0 419 r = -ENOTTY;
971888c4 420 map = dm_get_live_table(md, srcu_idx);
aa129a22 421 if (!map || !dm_table_get_size(map))
971888c4 422 return r;
aa129a22
MB
423
424 /* We only support devices that have a single target */
2aec377a 425 if (map->num_targets != 1)
971888c4 426 return r;
aa129a22 427
564b5c54
MS
428 ti = dm_table_get_target(map, 0);
429 if (!ti->type->prepare_ioctl)
971888c4 430 return r;
519049af 431
971888c4
MS
432 if (dm_suspended_md(md))
433 return -EAGAIN;
aa129a22 434
564b5c54 435 r = ti->type->prepare_ioctl(ti, bdev);
5bbbfdf6 436 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
971888c4 437 dm_put_live_table(md, *srcu_idx);
238d991f 438 fsleep(10000);
6c182cd8
HR
439 goto retry;
440 }
971888c4 441
e56f81e0
CH
442 return r;
443}
444
971888c4 445static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
971888c4
MS
446{
447 dm_put_live_table(md, srcu_idx);
448}
449
e56f81e0
CH
450static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
451 unsigned int cmd, unsigned long arg)
452{
453 struct mapped_device *md = bdev->bd_disk->private_data;
971888c4 454 int r, srcu_idx;
e56f81e0 455
5bd5e8d8 456 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
e56f81e0 457 if (r < 0)
971888c4 458 goto out;
6c182cd8 459
e56f81e0
CH
460 if (r > 0) {
461 /*
e980f623
CH
462 * Target determined this ioctl is being issued against a
463 * subset of the parent bdev; require extra privileges.
e56f81e0 464 */
e980f623 465 if (!capable(CAP_SYS_RAWIO)) {
0378c625 466 DMDEBUG_LIMIT(
e980f623
CH
467 "%s: sending ioctl %x to DM device without required privilege.",
468 current->comm, cmd);
469 r = -ENOIOCTLCMD;
e56f81e0 470 goto out;
e980f623 471 }
e56f81e0 472 }
6c182cd8 473
a7cb3d2f
CH
474 if (!bdev->bd_disk->fops->ioctl)
475 r = -ENOTTY;
476 else
477 r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
e56f81e0 478out:
971888c4 479 dm_unprepare_ioctl(md, srcu_idx);
aa129a22
MB
480 return r;
481}
482
7465d7ac
MS
483u64 dm_start_time_ns_from_clone(struct bio *bio)
484{
6c23f0bd 485 return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
7465d7ac
MS
486}
487EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
488
8d394bc4 489static bool bio_is_flush_with_data(struct bio *bio)
7465d7ac 490{
8d394bc4
MS
491 return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
492}
493
e6926ad0 494static void dm_io_acct(struct dm_io *io, bool end)
8d394bc4 495{
e6926ad0
ML
496 struct dm_stats_aux *stats_aux = &io->stats_aux;
497 unsigned long start_time = io->start_time;
498 struct mapped_device *md = io->md;
499 struct bio *bio = io->orig_bio;
d3de6d12
ML
500 unsigned int sectors;
501
502 /*
503 * If REQ_PREFLUSH set, don't account payload, it will be
504 * submitted (and accounted) after this flush completes.
505 */
506 if (bio_is_flush_with_data(bio))
507 sectors = 0;
7dd76d1f 508 else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT))))
d3de6d12 509 sectors = bio_sectors(bio);
7dd76d1f
ML
510 else
511 sectors = io->sectors;
8d394bc4
MS
512
513 if (!end)
d3de6d12
ML
514 bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio),
515 start_time);
8d394bc4 516 else
d3de6d12 517 bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time);
7465d7ac 518
442761fd 519 if (static_branch_unlikely(&stats_enabled) &&
7dd76d1f
ML
520 unlikely(dm_stats_used(&md->stats))) {
521 sector_t sector;
522
523 if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT)))
524 sector = bio->bi_iter.bi_sector;
525 else
526 sector = bio_end_sector(bio) - io->sector_offset;
527
7465d7ac 528 dm_stats_account_io(&md->stats, bio_data_dir(bio),
7dd76d1f 529 sector, sectors,
8d394bc4 530 end, start_time, stats_aux);
7dd76d1f 531 }
8d394bc4
MS
532}
533
b992b40d 534static void __dm_start_io_acct(struct dm_io *io)
8d394bc4 535{
e6926ad0 536 dm_io_acct(io, false);
7465d7ac
MS
537}
538
0fbb4d93 539static void dm_start_io_acct(struct dm_io *io, struct bio *clone)
7465d7ac 540{
0fbb4d93
MS
541 /*
542 * Ensure IO accounting is only ever started once.
0fbb4d93 543 */
3b03f7c1
MS
544 if (dm_io_flagged(io, DM_IO_ACCOUNTED))
545 return;
546
547 /* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */
548 if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) {
82f6cdcc
MS
549 dm_io_set_flag(io, DM_IO_ACCOUNTED);
550 } else {
551 unsigned long flags;
655f3aad 552 /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */
4d7bca13 553 spin_lock_irqsave(&io->lock, flags);
10eb3a0d
BM
554 if (dm_io_flagged(io, DM_IO_ACCOUNTED)) {
555 spin_unlock_irqrestore(&io->lock, flags);
556 return;
557 }
82f6cdcc 558 dm_io_set_flag(io, DM_IO_ACCOUNTED);
4d7bca13 559 spin_unlock_irqrestore(&io->lock, flags);
82f6cdcc 560 }
7465d7ac 561
b992b40d 562 __dm_start_io_acct(io);
0fbb4d93 563}
7465d7ac 564
b992b40d 565static void dm_end_io_acct(struct dm_io *io)
0fbb4d93 566{
e6926ad0 567 dm_io_acct(io, true);
7465d7ac 568}
978e51ba
MS
569
570static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
1da177e4 571{
64f52b0e
MS
572 struct dm_io *io;
573 struct dm_target_io *tio;
574 struct bio *clone;
575
29dec90a 576 clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->mempools->io_bs);
6c23f0bd 577 tio = clone_to_tio(clone);
655f3aad
MS
578 tio->flags = 0;
579 dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO);
64f52b0e
MS
580 tio->io = NULL;
581
582 io = container_of(tio, struct dm_io, tio);
583 io->magic = DM_IO_MAGIC;
84b98f4c 584 io->status = BLK_STS_OK;
0f14d60a
ML
585
586 /* one ref is for submission, the other is for completion */
587 atomic_set(&io->io_count, 2);
9f6dc633 588 this_cpu_inc(*md->pending_io);
7dd76d1f 589 io->orig_bio = bio;
978e51ba 590 io->md = md;
4d7bca13 591 spin_lock_init(&io->lock);
b879f915 592 io->start_time = jiffies;
82f6cdcc 593 io->flags = 0;
64f52b0e 594
442761fd
MS
595 if (static_branch_unlikely(&stats_enabled))
596 dm_stats_record_start(&md->stats, &io->stats_aux);
64f52b0e
MS
597
598 return io;
1da177e4
LT
599}
600
0119ab14 601static void free_io(struct dm_io *io)
1da177e4 602{
64f52b0e
MS
603 bio_put(&io->tio.clone);
604}
605
1d1068ce 606static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
86a3238c 607 unsigned int target_bio_nr, unsigned int *len, gfp_t gfp_mask)
64f52b0e 608{
9dd1cd32 609 struct mapped_device *md = ci->io->md;
64f52b0e 610 struct dm_target_io *tio;
018b05eb 611 struct bio *clone;
64f52b0e
MS
612
613 if (!ci->io->tio.io) {
614 /* the dm_target_io embedded in ci->io is available */
615 tio = &ci->io->tio;
018b05eb
MS
616 /* alloc_io() already initialized embedded clone */
617 clone = &tio->clone;
64f52b0e 618 } else {
29dec90a
CH
619 clone = bio_alloc_clone(NULL, ci->bio, gfp_mask,
620 &md->mempools->bs);
64f52b0e
MS
621 if (!clone)
622 return NULL;
623
b99fdcdc
ML
624 /* REQ_DM_POLL_LIST shouldn't be inherited */
625 clone->bi_opf &= ~REQ_DM_POLL_LIST;
626
6c23f0bd 627 tio = clone_to_tio(clone);
655f3aad 628 tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */
64f52b0e
MS
629 }
630
631 tio->magic = DM_TIO_MAGIC;
632 tio->io = ci->io;
633 tio->ti = ti;
634 tio->target_bio_nr = target_bio_nr;
dc8e2021 635 tio->len_ptr = len;
743598f0 636 tio->old_sector = 0;
64f52b0e 637
9dd1cd32
MS
638 /* Set default bdev, but target must bio_set_dev() before issuing IO */
639 clone->bi_bdev = md->disk->part0;
640 if (unlikely(ti->needs_bio_set_dev))
641 bio_set_dev(clone, md->disk->part0);
642
018b05eb
MS
643 if (len) {
644 clone->bi_iter.bi_size = to_bytes(*len);
645 if (bio_integrity(clone))
646 bio_integrity_trim(clone);
647 }
64f52b0e 648
018b05eb 649 return clone;
1da177e4
LT
650}
651
1d1068ce 652static void free_tio(struct bio *clone)
1da177e4 653{
655f3aad 654 if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO))
64f52b0e 655 return;
1d1068ce 656 bio_put(clone);
1da177e4
LT
657}
658
659/*
660 * Add the bio to the list of deferred io.
661 */
92c63902 662static void queue_io(struct mapped_device *md, struct bio *bio)
1da177e4 663{
05447420 664 unsigned long flags;
1da177e4 665
05447420 666 spin_lock_irqsave(&md->deferred_lock, flags);
1da177e4 667 bio_list_add(&md->deferred, bio);
05447420 668 spin_unlock_irqrestore(&md->deferred_lock, flags);
6a8736d1 669 queue_work(md->wq, &md->work);
1da177e4
LT
670}
671
672/*
673 * Everyone (including functions in this file) should use this
674 * function to access the md->map field, and make sure they call
83d5e5b0 675 * dm_put_live_table() when finished.
1da177e4 676 */
563a225c
MS
677struct dm_table *dm_get_live_table(struct mapped_device *md,
678 int *srcu_idx) __acquires(md->io_barrier)
1da177e4 679{
83d5e5b0
MP
680 *srcu_idx = srcu_read_lock(&md->io_barrier);
681
682 return srcu_dereference(md->map, &md->io_barrier);
683}
1da177e4 684
563a225c
MS
685void dm_put_live_table(struct mapped_device *md,
686 int srcu_idx) __releases(md->io_barrier)
83d5e5b0
MP
687{
688 srcu_read_unlock(&md->io_barrier, srcu_idx);
689}
690
691void dm_sync_table(struct mapped_device *md)
692{
693 synchronize_srcu(&md->io_barrier);
694 synchronize_rcu_expedited();
695}
696
697/*
698 * A fast alternative to dm_get_live_table/dm_put_live_table.
699 * The caller must not block between these two functions.
700 */
701static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
702{
703 rcu_read_lock();
704 return rcu_dereference(md->map);
705}
1da177e4 706
83d5e5b0
MP
707static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
708{
709 rcu_read_unlock();
1da177e4
LT
710}
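/*
 * Illustrative sketch, not part of dm.c: the canonical pattern for
 * readers of the live table.  Every dm_get_live_table() must be paired
 * with dm_put_live_table() using the same srcu_idx; the fast variants
 * above are only for short, non-blocking sections.  The function name
 * is invented for illustration.
 */
static sector_t example_live_table_size(struct mapped_device *md)
{
	struct dm_table *map;
	sector_t size = 0;
	int srcu_idx;

	map = dm_get_live_table(md, &srcu_idx);
	if (map)
		size = dm_table_get_size(map);
	dm_put_live_table(md, srcu_idx);

	return size;
}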
711
563a225c 712static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md,
a3282b43 713 int *srcu_idx, blk_opf_t bio_opf)
563a225c 714{
5d7362d0 715 if (bio_opf & REQ_NOWAIT)
563a225c
MS
716 return dm_get_live_table_fast(md);
717 else
718 return dm_get_live_table(md, srcu_idx);
719}
720
721static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx,
a3282b43 722 blk_opf_t bio_opf)
563a225c 723{
5d7362d0 724 if (bio_opf & REQ_NOWAIT)
563a225c
MS
725 dm_put_live_table_fast(md);
726 else
727 dm_put_live_table(md, srcu_idx);
728}
729
971888c4
MS
730static char *_dm_claim_ptr = "I belong to device-mapper";
731
86f1152b
BM
732/*
733 * Open a table device so we can use it as a map destination.
734 */
b9a785d2
CH
735static struct table_device *open_table_device(struct mapped_device *md,
736 dev_t dev, fmode_t mode)
86f1152b 737{
b9a785d2 738 struct table_device *td;
86f1152b 739 struct block_device *bdev;
cd913c76 740 u64 part_off;
86f1152b
BM
741 int r;
742
b9a785d2
CH
743 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
744 if (!td)
745 return ERR_PTR(-ENOMEM);
746 refcount_set(&td->count, 1);
86f1152b 747
b9a785d2
CH
748 bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr);
749 if (IS_ERR(bdev)) {
750 r = PTR_ERR(bdev);
751 goto out_free_td;
752 }
86f1152b 753
1a581b72
CH
754 /*
755 * We can be called before the dm disk is added. In that case we can't
756 * register the holder relation here. It will be done once add_disk has
757 * been called.
758 */
759 if (md->disk->slave_dir) {
760 r = bd_link_disk_holder(bdev, md->disk);
761 if (r)
762 goto out_blkdev_put;
763 }
86f1152b 764
b9a785d2 765 td->dm_dev.mode = mode;
86f1152b 766 td->dm_dev.bdev = bdev;
8012b866 767 td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
b9a785d2
CH
768 format_dev_t(td->dm_dev.name, dev);
769 list_add(&td->list, &md->table_devices);
770 return td;
771
772out_blkdev_put:
773 blkdev_put(bdev, mode | FMODE_EXCL);
774out_free_td:
775 kfree(td);
776 return ERR_PTR(r);
86f1152b
BM
777}
778
779/*
780 * Close a table device that we've been using.
781 */
782static void close_table_device(struct table_device *td, struct mapped_device *md)
783{
1a581b72
CH
784 if (md->disk->slave_dir)
785 bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
86f1152b 786 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
817bf402 787 put_dax(td->dm_dev.dax_dev);
7b586583
CH
788 list_del(&td->list);
789 kfree(td);
86f1152b
BM
790}
791
792static struct table_device *find_table_device(struct list_head *l, dev_t dev,
8454fca4
SS
793 fmode_t mode)
794{
86f1152b
BM
795 struct table_device *td;
796
797 list_for_each_entry(td, l, list)
798 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
799 return td;
800
801 return NULL;
802}
803
804int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
8454fca4
SS
805 struct dm_dev **result)
806{
86f1152b
BM
807 struct table_device *td;
808
809 mutex_lock(&md->table_devices_lock);
810 td = find_table_device(&md->table_devices, dev, mode);
811 if (!td) {
b9a785d2
CH
812 td = open_table_device(md, dev, mode);
813 if (IS_ERR(td)) {
86f1152b 814 mutex_unlock(&md->table_devices_lock);
b9a785d2 815 return PTR_ERR(td);
86f1152b 816 }
b0b4d7c6
ER
817 } else {
818 refcount_inc(&td->count);
86f1152b 819 }
86f1152b
BM
820 mutex_unlock(&md->table_devices_lock);
821
822 *result = &td->dm_dev;
823 return 0;
824}
86f1152b
BM
825
826void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
827{
828 struct table_device *td = container_of(d, struct table_device, dm_dev);
829
830 mutex_lock(&md->table_devices_lock);
7b586583 831 if (refcount_dec_and_test(&td->count))
86f1152b 832 close_table_device(td, md);
86f1152b
BM
833 mutex_unlock(&md->table_devices_lock);
834}
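/*
 * Illustrative sketch, not part of dm.c: dm_get_table_device() and
 * dm_put_table_device() are reference counted per (dev_t, mode) pair and
 * are normally reached through dm_get_device()/dm_put_device() from a
 * target's .ctr/.dtr.  A direct, hypothetical pairing would look like:
 */
static int example_probe_table_device(struct mapped_device *md, dev_t devt)
{
	struct dm_dev *d;
	int r;

	r = dm_get_table_device(md, devt, FMODE_READ, &d);
	if (r)
		return r;

	/* ... inspect d->bdev / d->dax_dev here ... */

	dm_put_table_device(md, d);
	return 0;
}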
86f1152b 835
3ac51e74
DW
836/*
837 * Get the geometry associated with a dm device
838 */
839int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
840{
841 *geo = md->geometry;
842
843 return 0;
844}
845
846/*
847 * Set the geometry of a device.
848 */
849int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
850{
851 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
852
853 if (geo->start > sz) {
43e6c111 854 DMERR("Start sector is beyond the geometry limits.");
3ac51e74
DW
855 return -EINVAL;
856 }
857
858 md->geometry = *geo;
859
860 return 0;
861}
862
2e93ccc1
KU
863static int __noflush_suspending(struct mapped_device *md)
864{
865 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
866}
867
8b211aac 868static void dm_requeue_add_io(struct dm_io *io, bool first_stage)
1da177e4 869{
b35f8caa 870 struct mapped_device *md = io->md;
1da177e4 871
8b211aac
ML
872 if (first_stage) {
873 struct dm_io *next = md->requeue_list;
874
875 md->requeue_list = io;
876 io->next = next;
877 } else {
878 bio_list_add_head(&md->deferred, io->orig_bio);
879 }
880}
881
882static void dm_kick_requeue(struct mapped_device *md, bool first_stage)
883{
884 if (first_stage)
885 queue_work(md->wq, &md->requeue_work);
886 else
887 queue_work(md->wq, &md->work);
888}
889
444fe04f
ML
890/*
891 * Return true if the dm_io's original bio is requeued.
892 * io->status is updated with error if requeue disallowed.
893 */
8b211aac 894static bool dm_handle_requeue(struct dm_io *io, bool first_stage)
1da177e4 895{
8b211aac 896 struct bio *bio = io->orig_bio;
444fe04f
ML
897 bool handle_requeue = (io->status == BLK_STS_DM_REQUEUE);
898 bool handle_polled_eagain = ((io->status == BLK_STS_AGAIN) &&
899 (bio->bi_opf & REQ_POLLED));
900 struct mapped_device *md = io->md;
901 bool requeued = false;
1da177e4 902
444fe04f 903 if (handle_requeue || handle_polled_eagain) {
e2736347 904 unsigned long flags;
444fe04f
ML
905
906 if (bio->bi_opf & REQ_POLLED) {
907 /*
908 * Upper layer won't help us poll split bio
909 * (io->orig_bio may only reflect a subset of the
910 * pre-split original) so clear REQ_POLLED.
911 */
912 bio_clear_polled(bio);
913 }
914
e2736347 915 /*
444fe04f
ML
916 * Target requested pushing back the I/O or
917 * polled IO hit BLK_STS_AGAIN.
e2736347
MS
918 */
919 spin_lock_irqsave(&md->deferred_lock, flags);
444fe04f
ML
920 if ((__noflush_suspending(md) &&
921 !WARN_ON_ONCE(dm_is_zone_write(md, bio))) ||
8b211aac
ML
922 handle_polled_eagain || first_stage) {
923 dm_requeue_add_io(io, first_stage);
444fe04f 924 requeued = true;
e2736347 925 } else {
2e93ccc1 926 /*
e2736347
MS
927 * noflush suspend was interrupted or this is
928 * a write to a zoned target.
2e93ccc1 929 */
e2736347 930 io->status = BLK_STS_IOERR;
2e93ccc1 931 }
e2736347
MS
932 spin_unlock_irqrestore(&md->deferred_lock, flags);
933 }
2e93ccc1 934
444fe04f 935 if (requeued)
8b211aac 936 dm_kick_requeue(md, first_stage);
444fe04f
ML
937
938 return requeued;
939}
940
8b211aac 941static void __dm_io_complete(struct dm_io *io, bool first_stage)
444fe04f 942{
8b211aac 943 struct bio *bio = io->orig_bio;
444fe04f
ML
944 struct mapped_device *md = io->md;
945 blk_status_t io_error;
946 bool requeued;
947
8b211aac
ML
948 requeued = dm_handle_requeue(io, first_stage);
949 if (requeued && first_stage)
950 return;
444fe04f 951
e2736347 952 io_error = io->status;
82f6cdcc 953 if (dm_io_flagged(io, DM_IO_ACCOUNTED))
b992b40d 954 dm_end_io_acct(io);
e2736347
MS
955 else if (!io_error) {
956 /*
957 * Must handle a target that returned DM_MAPIO_SUBMITTED only to
958 * then call bio_endio() rather than dm_submit_bio_remap()
959 */
b992b40d
ML
960 __dm_start_io_acct(io);
961 dm_end_io_acct(io);
e2736347
MS
962 }
963 free_io(io);
964 smp_wmb();
965 this_cpu_dec(*md->pending_io);
6a8736d1 966
e2736347
MS
967 /* nudge anyone waiting on suspend queue */
968 if (unlikely(wq_has_sleeper(&md->wait)))
969 wake_up(&md->wait);
2e93ccc1 970
444fe04f
ML
971 /* Return early if the original bio was requeued */
972 if (requeued)
973 return;
e2736347
MS
974
975 if (bio_is_flush_with_data(bio)) {
976 /*
977 * Preflush done for flush with data, reissue
978 * without REQ_PREFLUSH.
979 */
980 bio->bi_opf &= ~REQ_PREFLUSH;
981 queue_io(md, bio);
982 } else {
983 /* done with normal IO or empty flush */
984 if (io_error)
985 bio->bi_status = io_error;
986 bio_endio(bio);
987 }
988}
989
8b211aac
ML
990static void dm_wq_requeue_work(struct work_struct *work)
991{
992 struct mapped_device *md = container_of(work, struct mapped_device,
993 requeue_work);
994 unsigned long flags;
995 struct dm_io *io;
996
997 /* reuse deferred lock to simplify dm_handle_requeue */
998 spin_lock_irqsave(&md->deferred_lock, flags);
999 io = md->requeue_list;
1000 md->requeue_list = NULL;
1001 spin_unlock_irqrestore(&md->deferred_lock, flags);
1002
1003 while (io) {
1004 struct dm_io *next = io->next;
1005
46754bd0 1006 dm_io_rewind(io, &md->disk->bio_split);
8b211aac
ML
1007
1008 io->next = NULL;
1009 __dm_io_complete(io, false);
1010 io = next;
1011 }
1012}
1013
1014/*
1015 * Two-stage requeue:
1016 *
1017 * 1) io->orig_bio points to the real original bio, and only the part mapped
1018 * to this io must be requeued, not the other parts of the original bio.
1019 *
1020 * 2) io->orig_bio points to a new cloned bio which matches the requeued dm_io.
1021 */
1022static void dm_io_complete(struct dm_io *io)
1023{
1024 bool first_requeue;
1025
1026 /*
1027 * Only a dm_io that has been split needs the two-stage requeue; otherwise
1028 * we may run into a long bio clone chain during suspend and OOM could
1029 * be triggered.
1030 *
1031 * A flush-with-data dm_io also won't be marked as DM_IO_WAS_SPLIT, so it
1032 * isn't handled via the first-stage requeue either.
1033 */
1034 if (dm_io_flagged(io, DM_IO_WAS_SPLIT))
1035 first_requeue = true;
1036 else
1037 first_requeue = false;
1038
1039 __dm_io_complete(io, first_requeue);
1040}
1041
1da177e4
LT
1042/*
1043 * Decrements the number of outstanding ios that a bio has been
1044 * cloned into, completing the original io if necessary.
1045 */
84b98f4c
MS
1046static inline void __dm_io_dec_pending(struct dm_io *io)
1047{
1048 if (atomic_dec_and_test(&io->io_count))
1049 dm_io_complete(io);
1050}
1051
1052static void dm_io_set_error(struct dm_io *io, blk_status_t error)
1da177e4 1053{
84b98f4c
MS
1054 unsigned long flags;
1055
2e93ccc1 1056 /* Push-back supersedes any I/O errors */
84b98f4c
MS
1057 spin_lock_irqsave(&io->lock, flags);
1058 if (!(io->status == BLK_STS_DM_REQUEUE &&
1059 __noflush_suspending(io->md))) {
1060 io->status = error;
1da177e4 1061 }
84b98f4c
MS
1062 spin_unlock_irqrestore(&io->lock, flags);
1063}
1da177e4 1064
2e803cd9 1065static void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
84b98f4c
MS
1066{
1067 if (unlikely(error))
1068 dm_io_set_error(io, error);
1069
1070 __dm_io_dec_pending(io);
1da177e4
LT
1071}
1072
bcb44433
MS
1073void disable_discard(struct mapped_device *md)
1074{
1075 struct queue_limits *limits = dm_get_queue_limits(md);
1076
1077 /* device doesn't really support DISCARD, disable it */
1078 limits->max_discard_sectors = 0;
bcb44433
MS
1079}
1080
ac62d620
CH
1081void disable_write_zeroes(struct mapped_device *md)
1082{
1083 struct queue_limits *limits = dm_get_queue_limits(md);
1084
1085 /* device doesn't really support WRITE ZEROES, disable it */
1086 limits->max_write_zeroes_sectors = 0;
1087}
1088
a666e5c0
MP
1089static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
1090{
1091 return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
1092}
1093
4246a0b6 1094static void clone_endio(struct bio *bio)
1da177e4 1095{
4e4cbee9 1096 blk_status_t error = bio->bi_status;
6c23f0bd 1097 struct dm_target_io *tio = clone_to_tio(bio);
6cbce280
MS
1098 struct dm_target *ti = tio->ti;
1099 dm_endio_fn endio = ti->type->end_io;
b35f8caa 1100 struct dm_io *io = tio->io;
6cbce280 1101 struct mapped_device *md = io->md;
1da177e4 1102
dddf3056
MS
1103 if (unlikely(error == BLK_STS_TARGET)) {
1104 if (bio_op(bio) == REQ_OP_DISCARD &&
1105 !bdev_max_discard_sectors(bio->bi_bdev))
1106 disable_discard(md);
1107 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
1108 !bdev_write_zeroes_sectors(bio->bi_bdev))
1109 disable_write_zeroes(md);
ca522482 1110 }
415c79e1 1111
dddf3056 1112 if (static_branch_unlikely(&zoned_enabled) &&
edd1dbc8 1113 unlikely(bdev_is_zoned(bio->bi_bdev)))
dddf3056
MS
1114 dm_zone_endio(io, bio);
1115
1be56909 1116 if (endio) {
6cbce280 1117 int r = endio(ti, bio, &error);
1be56909
CH
1118 switch (r) {
1119 case DM_ENDIO_REQUEUE:
442761fd
MS
1120 if (static_branch_unlikely(&zoned_enabled)) {
1121 /*
1122 * Requeuing writes to a sequential zone of a zoned
1123 * target will break the sequential write pattern:
1124 * fail such IO.
1125 */
1126 if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
1127 error = BLK_STS_IOERR;
1128 else
1129 error = BLK_STS_DM_REQUEUE;
1130 } else
bf14e2b2 1131 error = BLK_STS_DM_REQUEUE;
df561f66 1132 fallthrough;
1be56909
CH
1133 case DM_ENDIO_DONE:
1134 break;
1135 case DM_ENDIO_INCOMPLETE:
1136 /* The target will handle the io */
1137 return;
1138 default:
43e6c111 1139 DMCRIT("unimplemented target endio return value: %d", r);
1be56909
CH
1140 BUG();
1141 }
1142 }
1143
442761fd
MS
1144 if (static_branch_unlikely(&swap_bios_enabled) &&
1145 unlikely(swap_bios_limit(ti, bio)))
a666e5c0 1146 up(&md->swap_bios_semaphore);
a666e5c0 1147
1d1068ce 1148 free_tio(bio);
e2118b3c 1149 dm_io_dec_pending(io, error);
1da177e4
LT
1150}
1151
56a67df7
MS
1152/*
1153 * Return maximum size of I/O possible at the supplied sector up to the current
1154 * target boundary.
1155 */
3720281d
MS
1156static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
1157 sector_t target_offset)
56a67df7 1158{
56a67df7
MS
1159 return ti->len - target_offset;
1160}
1161
3720281d 1162static sector_t max_io_len(struct dm_target *ti, sector_t sector)
1da177e4 1163{
3720281d
MS
1164 sector_t target_offset = dm_target_offset(ti, sector);
1165 sector_t len = max_io_len_target_boundary(ti, target_offset);
1da177e4
LT
1166
1167 /*
3ee16db3
MS
1168 * Does the target need to split IO even further?
1169 * - varied (per target) IO splitting is a tenet of DM; this
1170 * explains why stacked chunk_sectors based splitting via
5a97806f 1171 * bio_split_to_limits() isn't possible here.
1da177e4 1172 */
c3949322
CH
1173 if (!ti->max_io_len)
1174 return len;
1175 return min_t(sector_t, len,
1176 min(queue_max_sectors(ti->table->md->queue),
1177 blk_chunk_sectors_left(target_offset, ti->max_io_len)));
1da177e4
LT
1178}
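/*
 * Worked example (illustration only): with ti->max_io_len == 128 and a
 * bio starting 100 sectors into the target, blk_chunk_sectors_left()
 * returns 28, so max_io_len() allows at most min(len, queue_max_sectors,
 * 28) sectors before the caller must split at the 128-sector boundary.
 */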
1179
542f9038
MS
1180int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1181{
1182 if (len > UINT_MAX) {
1183 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1184 (unsigned long long)len, UINT_MAX);
1185 ti->error = "Maximum size of target IO is too large";
1186 return -EINVAL;
1187 }
1188
75ae1936 1189 ti->max_io_len = (uint32_t) len;
542f9038
MS
1190
1191 return 0;
1192}
1193EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
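/*
 * Illustrative sketch, not part of dm.c: a hypothetical target's .ctr
 * capping per-IO size so max_io_len() splits at a 128-sector internal
 * boundary.  The function name and the boundary value are invented.
 */
static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;

	r = dm_set_target_max_io_len(ti, 128);
	if (r)
		return r;	/* ti->error already set by the helper */

	ti->num_flush_bios = 1;
	return 0;
}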
1194
f26c5719 1195static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
3d97c829
MS
1196 sector_t sector, int *srcu_idx)
1197 __acquires(md->io_barrier)
545ed20e 1198{
545ed20e
TK
1199 struct dm_table *map;
1200 struct dm_target *ti;
545ed20e 1201
f26c5719 1202 map = dm_get_live_table(md, srcu_idx);
545ed20e 1203 if (!map)
f26c5719 1204 return NULL;
545ed20e
TK
1205
1206 ti = dm_table_find_target(map, sector);
123d87d5 1207 if (!ti)
f26c5719 1208 return NULL;
545ed20e 1209
f26c5719
DW
1210 return ti;
1211}
545ed20e 1212
f26c5719 1213static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
e511c4a3
JC
1214 long nr_pages, enum dax_access_mode mode, void **kaddr,
1215 pfn_t *pfn)
f26c5719
DW
1216{
1217 struct mapped_device *md = dax_get_private(dax_dev);
1218 sector_t sector = pgoff * PAGE_SECTORS;
1219 struct dm_target *ti;
1220 long len, ret = -EIO;
1221 int srcu_idx;
545ed20e 1222
f26c5719 1223 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
545ed20e 1224
f26c5719
DW
1225 if (!ti)
1226 goto out;
1227 if (!ti->type->direct_access)
1228 goto out;
3720281d 1229 len = max_io_len(ti, sector) / PAGE_SECTORS;
f26c5719
DW
1230 if (len < 1)
1231 goto out;
1232 nr_pages = min(len, nr_pages);
e511c4a3 1233 ret = ti->type->direct_access(ti, pgoff, nr_pages, mode, kaddr, pfn);
817bf402 1234
f26c5719 1235 out:
545ed20e 1236 dm_put_live_table(md, srcu_idx);
f26c5719
DW
1237
1238 return ret;
545ed20e
TK
1239}
1240
cdf6cdcd
VG
1241static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1242 size_t nr_pages)
1243{
1244 struct mapped_device *md = dax_get_private(dax_dev);
1245 sector_t sector = pgoff * PAGE_SECTORS;
1246 struct dm_target *ti;
1247 int ret = -EIO;
1248 int srcu_idx;
1249
1250 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1251
1252 if (!ti)
1253 goto out;
1254 if (WARN_ON(!ti->type->dax_zero_page_range)) {
1255 /*
1256 * ->zero_page_range() is a mandatory dax operation. If we are
1257 * here, something is wrong.
1258 */
cdf6cdcd
VG
1259 goto out;
1260 }
1261 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
cdf6cdcd
VG
1262 out:
1263 dm_put_live_table(md, srcu_idx);
1264
1265 return ret;
1266}
1267
047218ec
JC
1268static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
1269 void *addr, size_t bytes, struct iov_iter *i)
1270{
1271 struct mapped_device *md = dax_get_private(dax_dev);
1272 sector_t sector = pgoff * PAGE_SECTORS;
1273 struct dm_target *ti;
1274 int srcu_idx;
1275 long ret = 0;
1276
1277 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1278 if (!ti || !ti->type->dax_recovery_write)
1279 goto out;
1280
1281 ret = ti->type->dax_recovery_write(ti, pgoff, addr, bytes, i);
1282out:
1283 dm_put_live_table(md, srcu_idx);
1284 return ret;
1285}
1286
1dd40c3e
MP
1287/*
1288 * A target may call dm_accept_partial_bio only from the map routine. It is
6842d264 1289 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
e6fc9f62
MS
1290 * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by
1291 * __send_duplicate_bios().
1dd40c3e
MP
1292 *
1293 * dm_accept_partial_bio informs the dm that the target only wants to process
1294 * additional n_sectors sectors of the bio and the rest of the data should be
1295 * sent in a next bio.
1296 *
1297 * A diagram that explains the arithmetics:
1298 * +--------------------+---------------+-------+
1299 * | 1 | 2 | 3 |
1300 * +--------------------+---------------+-------+
1301 *
1302 * <-------------- *tio->len_ptr --------------->
bdb34759 1303 * <----- bio_sectors ----->
1dd40c3e
MP
1304 * <-- n_sectors -->
1305 *
1306 * Region 1 was already iterated over with bio_advance or similar function.
1307 * (it may be empty if the target doesn't use bio_advance)
1308 * Region 2 is the remaining bio size that the target wants to process.
1309 * (it may be empty if region 1 is non-empty, although there is no reason
1310 * to make it empty)
1311 * The target requires that region 3 is to be sent in the next bio.
1312 *
1313 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1314 * the partially processed part (the sum of regions 1+2) must be the same for all
1315 * copies of the bio.
1316 */
86a3238c 1317void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors)
1dd40c3e 1318{
6c23f0bd 1319 struct dm_target_io *tio = clone_to_tio(bio);
8b211aac 1320 struct dm_io *io = tio->io;
86a3238c 1321 unsigned int bio_sectors = bio_sectors(bio);
6842d264 1322
655f3aad 1323 BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
6842d264
DLM
1324 BUG_ON(op_is_zone_mgmt(bio_op(bio)));
1325 BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
bdb34759
MS
1326 BUG_ON(bio_sectors > *tio->len_ptr);
1327 BUG_ON(n_sectors > bio_sectors);
6842d264 1328
bdb34759 1329 *tio->len_ptr -= bio_sectors - n_sectors;
1dd40c3e 1330 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
7dd76d1f
ML
1331
1332 /*
1333 * __split_and_process_bio() may have already saved mapped part
1334 * for accounting but it is being reduced so update accordingly.
1335 */
8b211aac
ML
1336 dm_io_set_flag(io, DM_IO_WAS_SPLIT);
1337 io->sectors = n_sectors;
1338 io->sector_offset = bio_sectors(io->orig_bio);
1dd40c3e
MP
1339}
1340EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
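/*
 * Illustrative sketch, not part of dm.c: a hypothetical .map that never
 * lets an IO cross a 64-sector internal boundary, asking DM core to
 * resubmit the remainder (region 3 in the diagram above) as a new bio.
 * All names and the boundary value are invented for illustration.
 */
static int example_boundary_map(struct dm_target *ti, struct bio *bio)
{
	struct dm_dev *dev = ti->private;	/* assumed set up in .ctr */
	sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
	unsigned int left = 64 - (offset & 63);

	if (bio_sectors(bio) > left)
		dm_accept_partial_bio(bio, left);

	bio_set_dev(bio, dev->bdev);
	bio->bi_iter.bi_sector = offset;	/* backing dev addressed from 0 */
	return DM_MAPIO_REMAPPED;
}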
1341
0fbb4d93
MS
1342/*
1343 * @clone: clone bio that DM core passed to target's .map function
1344 * @tgt_clone: clone of @clone bio that target needs submitted
0fbb4d93
MS
1345 *
1346 * Targets should use this interface to submit bios they take
1347 * ownership of when returning DM_MAPIO_SUBMITTED.
1348 *
1349 * Target should also enable ti->accounts_remapped_io
1350 */
b7f8dff0 1351void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
0fbb4d93
MS
1352{
1353 struct dm_target_io *tio = clone_to_tio(clone);
1354 struct dm_io *io = tio->io;
1355
1356 /* establish bio that will get submitted */
1357 if (!tgt_clone)
1358 tgt_clone = clone;
1359
1360 /*
1361 * Account io->orig_bio to DM dev on behalf of target
1362 * that took ownership of IO with DM_MAPIO_SUBMITTED.
1363 */
9d20653f 1364 dm_start_io_acct(io, clone);
0fbb4d93 1365
9d20653f 1366 trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk),
0fbb4d93 1367 tio->old_sector);
9d20653f 1368 submit_bio_noacct(tgt_clone);
0fbb4d93
MS
1369}
1370EXPORT_SYMBOL_GPL(dm_submit_bio_remap);
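/*
 * Illustrative sketch, not part of dm.c: a hypothetical target that
 * returns DM_MAPIO_SUBMITTED from .map, defers the clone to a workqueue
 * and submits it later via dm_submit_bio_remap().  Such a target must
 * set ti->accounts_remapped_io in .ctr.  All "example_*" names are
 * invented for illustration.
 */
struct example_deferred_io {
	struct work_struct work;
	struct bio *clone;	/* the bio DM core passed to .map */
};

static void example_deferred_submit(struct work_struct *work)
{
	struct example_deferred_io *dio =
		container_of(work, struct example_deferred_io, work);

	dm_submit_bio_remap(dio->clone, NULL);
	kfree(dio);
}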
1371
a666e5c0
MP
1372static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
1373{
1374 mutex_lock(&md->swap_bios_lock);
1375 while (latch < md->swap_bios) {
1376 cond_resched();
1377 down(&md->swap_bios_semaphore);
1378 md->swap_bios--;
1379 }
1380 while (latch > md->swap_bios) {
1381 cond_resched();
1382 up(&md->swap_bios_semaphore);
1383 md->swap_bios++;
1384 }
1385 mutex_unlock(&md->swap_bios_lock);
1386}
1387
1561b396 1388static void __map_bio(struct bio *clone)
1da177e4 1389{
1561b396 1390 struct dm_target_io *tio = clone_to_tio(clone);
bd2a49b8 1391 struct dm_target *ti = tio->ti;
6cbce280
MS
1392 struct dm_io *io = tio->io;
1393 struct mapped_device *md = io->md;
1394 int r;
1da177e4 1395
1da177e4 1396 clone->bi_end_io = clone_endio;
1da177e4
LT
1397
1398 /*
0fbb4d93 1399 * Map the clone.
1da177e4 1400 */
743598f0 1401 tio->old_sector = clone->bi_iter.bi_sector;
d67a5f4b 1402
442761fd
MS
1403 if (static_branch_unlikely(&swap_bios_enabled) &&
1404 unlikely(swap_bios_limit(ti, clone))) {
a666e5c0
MP
1405 int latch = get_swap_bios();
1406 if (unlikely(latch != md->swap_bios))
1407 __set_swap_bios_limit(md, latch);
1408 down(&md->swap_bios_semaphore);
1409 }
1410
442761fd
MS
1411 if (static_branch_unlikely(&zoned_enabled)) {
1412 /*
1413 * Check if the IO needs a special mapping due to zone append
1414 * emulation on zoned target. In this case, dm_zone_map_bio()
1415 * calls the target map operation.
1416 */
1417 if (unlikely(dm_emulate_zone_append(md)))
1418 r = dm_zone_map_bio(tio);
1419 else
1420 r = ti->type->map(ti, clone);
1421 } else
bb37d772
DLM
1422 r = ti->type->map(ti, clone);
1423
846785e6
CH
1424 switch (r) {
1425 case DM_MAPIO_SUBMITTED:
0fbb4d93
MS
1426 /* target has assumed ownership of this io */
1427 if (!ti->accounts_remapped_io)
9d20653f 1428 dm_start_io_acct(io, clone);
846785e6
CH
1429 break;
1430 case DM_MAPIO_REMAPPED:
9d20653f 1431 dm_submit_bio_remap(clone, NULL);
846785e6
CH
1432 break;
1433 case DM_MAPIO_KILL:
846785e6 1434 case DM_MAPIO_REQUEUE:
442761fd
MS
1435 if (static_branch_unlikely(&swap_bios_enabled) &&
1436 unlikely(swap_bios_limit(ti, clone)))
6cbce280 1437 up(&md->swap_bios_semaphore);
1d1068ce 1438 free_tio(clone);
90a2326e
MS
1439 if (r == DM_MAPIO_KILL)
1440 dm_io_dec_pending(io, BLK_STS_IOERR);
1441 else
1442 dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
846785e6
CH
1443 break;
1444 default:
43e6c111 1445 DMCRIT("unimplemented target map return value: %d", r);
45cbcd79 1446 BUG();
1da177e4
LT
1447 }
1448}
1da177e4 1449
86a3238c 1450static void setup_split_accounting(struct clone_info *ci, unsigned int len)
7dd76d1f
ML
1451{
1452 struct dm_io *io = ci->io;
1453
1454 if (ci->sector_count > len) {
1455 /*
1456 * Split needed, save the mapped part for accounting.
1457 * NOTE: dm_accept_partial_bio() will update accordingly.
1458 */
1459 dm_io_set_flag(io, DM_IO_WAS_SPLIT);
1460 io->sectors = len;
8b211aac 1461 io->sector_offset = bio_sectors(ci->bio);
7dd76d1f
ML
1462 }
1463}
1464
318716dd 1465static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
86a3238c 1466 struct dm_target *ti, unsigned int num_bios)
f9ab94ce 1467{
1d1068ce 1468 struct bio *bio;
318716dd 1469 int try;
dba14160 1470
318716dd
MS
1471 for (try = 0; try < 2; try++) {
1472 int bio_nr;
318716dd
MS
1473
1474 if (try)
bc02cdbe 1475 mutex_lock(&ci->io->md->table_devices_lock);
318716dd 1476 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
7dd06a25 1477 bio = alloc_tio(ci, ti, bio_nr, NULL,
dc8e2021 1478 try ? GFP_NOIO : GFP_NOWAIT);
1d1068ce 1479 if (!bio)
318716dd
MS
1480 break;
1481
1d1068ce 1482 bio_list_add(blist, bio);
318716dd
MS
1483 }
1484 if (try)
bc02cdbe 1485 mutex_unlock(&ci->io->md->table_devices_lock);
318716dd
MS
1486 if (bio_nr == num_bios)
1487 return;
1488
6c23f0bd 1489 while ((bio = bio_list_pop(blist)))
1d1068ce 1490 free_tio(bio);
318716dd 1491 }
9015df24
AK
1492}
1493
0f14d60a 1494static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
86a3238c 1495 unsigned int num_bios, unsigned int *len)
06a426ce 1496{
318716dd 1497 struct bio_list blist = BIO_EMPTY_LIST;
8eabf5d0 1498 struct bio *clone;
564b5c54 1499 unsigned int ret = 0;
06a426ce 1500
891fced6
CH
1501 switch (num_bios) {
1502 case 0:
1503 break;
1504 case 1:
7dd76d1f
ML
1505 if (len)
1506 setup_split_accounting(ci, *len);
891fced6 1507 clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
1561b396 1508 __map_bio(clone);
0f14d60a 1509 ret = 1;
891fced6
CH
1510 break;
1511 default:
7dd06a25
MS
1512 /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */
1513 alloc_multiple_bios(&blist, ci, ti, num_bios);
891fced6 1514 while ((clone = bio_list_pop(&blist))) {
655f3aad 1515 dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
891fced6 1516 __map_bio(clone);
0f14d60a 1517 ret += 1;
891fced6
CH
1518 }
1519 break;
318716dd 1520 }
0f14d60a
ML
1521
1522 return ret;
06a426ce
MS
1523}
1524
332f2b1e 1525static void __send_empty_flush(struct clone_info *ci)
f9ab94ce 1526{
564b5c54 1527 struct dm_table *t = ci->map;
828678b8
MS
1528 struct bio flush_bio;
1529
1530 /*
1531 * Use an on-stack bio for this, it's safe since we don't
1532 * need to reference it after submit. It's just used as
1533 * the basis for the clone(s).
1534 */
49add496
CH
1535 bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
1536 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
47d95102 1537
828678b8
MS
1538 ci->bio = &flush_bio;
1539 ci->sector_count = 0;
92b914e2 1540 ci->io->tio.clone.bi_iter.bi_size = 0;
f9ab94ce 1541
564b5c54
MS
1542 for (unsigned int i = 0; i < t->num_targets; i++) {
1543 unsigned int bios;
1544 struct dm_target *ti = dm_table_get_target(t, i);
0f14d60a
ML
1545
1546 atomic_add(ti->num_flush_bios, &ci->io->io_count);
1547 bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1548 atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count);
1549 }
1550
1551 /*
1552 * alloc_io() takes one extra reference for submission, so the
1553 * reference won't reach 0 without the following subtraction
1554 */
1555 atomic_sub(1, &ci->io->io_count);
828678b8
MS
1556
1557 bio_uninit(ci->bio);
f9ab94ce
MP
1558}
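/*
 * Worked example (illustration only): for an empty flush against a table
 * with two targets, each with num_flush_bios == 1, io_count starts at 2
 * (submission + completion), the loop above adds 1 per issued clone (4),
 * and the final atomic_sub(1) folds the pre-existing completion
 * reference into the per-clone ones (3).  dm_split_and_process_bio()
 * then drops the submission reference (2), and each clone's completion
 * drops one more, so dm_io_complete() runs after the second clone ends.
 */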
1559
e6fc9f62 1560static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
86a3238c 1561 unsigned int num_bios)
ba1cbad9 1562{
86a3238c 1563 unsigned int len, bios;
ba1cbad9 1564
3720281d
MS
1565 len = min_t(sector_t, ci->sector_count,
1566 max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
51b86f9a 1567
0f14d60a
ML
1568 atomic_add(num_bios, &ci->io->io_count);
1569 bios = __send_duplicate_bios(ci, ti, num_bios, &len);
1570 /*
1571 * alloc_io() takes one extra reference for submission, so the
1572 * reference won't reach 0 without the following (+1) subtraction
1573 */
1574 atomic_sub(num_bios - bios + 1, &ci->io->io_count);
7dd06a25 1575
3d7f4562
MS
1576 ci->sector += len;
1577 ci->sector_count -= len;
ba1cbad9
MS
1578}
1579
568c73a3
MS
1580static bool is_abnormal_io(struct bio *bio)
1581{
a3282b43 1582 enum req_op op = bio_op(bio);
568c73a3 1583
4edadf6d
MS
1584 if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) {
1585 switch (op) {
1586 case REQ_OP_DISCARD:
1587 case REQ_OP_SECURE_ERASE:
1588 case REQ_OP_WRITE_ZEROES:
1589 return true;
1590 default:
1591 break;
1592 }
568c73a3
MS
1593 }
1594
4edadf6d 1595 return false;
568c73a3
MS
1596}
1597
4edadf6d
MS
1598static blk_status_t __process_abnormal_io(struct clone_info *ci,
1599 struct dm_target *ti)
0519c71e 1600{
86a3238c 1601 unsigned int num_bios = 0;
0519c71e 1602
e6fc9f62 1603 switch (bio_op(ci->bio)) {
9679b5a7
MS
1604 case REQ_OP_DISCARD:
1605 num_bios = ti->num_discard_bios;
1606 break;
1607 case REQ_OP_SECURE_ERASE:
1608 num_bios = ti->num_secure_erase_bios;
1609 break;
9679b5a7
MS
1610 case REQ_OP_WRITE_ZEROES:
1611 num_bios = ti->num_write_zeroes_bios;
1612 break;
2d9b02be
BVA
1613 default:
1614 break;
9679b5a7 1615 }
0519c71e 1616
e6fc9f62
MS
1617 /*
1618 * Even though the device advertised support for this type of
1619 * request, that does not mean every target supports it, and
1620 * reconfiguration might also have changed that since the
1621 * check was performed.
1622 */
84b98f4c 1623 if (unlikely(!num_bios))
4edadf6d
MS
1624 return BLK_STS_NOTSUPP;
1625
1626 __send_changing_extent_only(ci, ti, num_bios);
1627 return BLK_STS_OK;
0519c71e
MS
1628}
1629
b99fdcdc 1630/*
ec211631 1631 * Reuse ->bi_private as dm_io list head for storing all dm_io instances
b99fdcdc
ML
1632 * associated with this bio, and this bio's bi_private needs to be
1633 * stored in dm_io->data before the reuse.
1634 *
1635 * bio->bi_private is owned by fs or upper layer, so block layer won't
1636 * touch it after splitting. Meanwhile it won't be changed by anyone after
1637 * bio is submitted. So this reuse is safe.
1638 */
ec211631 1639static inline struct dm_io **dm_poll_list_head(struct bio *bio)
b99fdcdc 1640{
ec211631 1641 return (struct dm_io **)&bio->bi_private;
b99fdcdc
ML
1642}
1643
1644static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
1645{
ec211631 1646 struct dm_io **head = dm_poll_list_head(bio);
b99fdcdc
ML
1647
1648 if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
1649 bio->bi_opf |= REQ_DM_POLL_LIST;
1650 /*
1651 * Save .bi_private into dm_io, so that we can reuse
ec211631 1652 * .bi_private as dm_io list head for storing dm_io list
b99fdcdc
ML
1653 */
1654 io->data = bio->bi_private;
1655
b99fdcdc
ML
1656 /* tell block layer to poll for completion */
1657 bio->bi_cookie = ~BLK_QC_T_NONE;
ec211631
ML
1658
1659 io->next = NULL;
b99fdcdc
ML
1660 } else {
1661 /*
1662 * bio recursed due to split, reuse original poll list,
1663 * and save bio->bi_private too.
1664 */
ec211631
ML
1665 io->data = (*head)->data;
1666 io->next = *head;
b99fdcdc
ML
1667 }
1668
ec211631 1669 *head = io;
b99fdcdc
ML
1670}
1671
e4c93811
AK
1672/*
1673 * Select the correct strategy for processing a non-flush bio.
1674 */
84b98f4c 1675static blk_status_t __split_and_process_bio(struct clone_info *ci)
0ce65797 1676{
66bdaa43 1677 struct bio *clone;
512875bd 1678 struct dm_target *ti;
86a3238c 1679 unsigned int len;
0ce65797 1680
512875bd 1681 ti = dm_table_find_target(ci->map, ci->sector);
4edadf6d
MS
1682 if (unlikely(!ti))
1683 return BLK_STS_IOERR;
1ee88de3
MP
1684
1685 if (unlikely((ci->bio->bi_opf & REQ_NOWAIT) != 0) &&
1686 unlikely(!dm_target_supports_nowait(ti->type)))
1687 return BLK_STS_NOTSUPP;
1688
1689 if (unlikely(ci->is_abnormal_io))
4edadf6d 1690 return __process_abnormal_io(ci, ti);
3d7f4562 1691
b99fdcdc
ML
1692 /*
1693 * Only support bio polling for normal IO, and the target io is
1694 * exactly inside the dm_io instance (verified in dm_poll_dm_io)
1695 */
a3282b43 1696 ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED);
0ce65797 1697
3720281d 1698 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
7dd76d1f 1699 setup_split_accounting(ci, len);
66bdaa43 1700 clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
66bdaa43 1701 __map_bio(clone);
0ce65797 1702
1c3b13e6
KO
1703 ci->sector += len;
1704 ci->sector_count -= len;
0ce65797 1705
84b98f4c 1706 return BLK_STS_OK;
0ce65797
MS
1707}
1708
978e51ba 1709static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
4edadf6d 1710 struct dm_table *map, struct bio *bio, bool is_abnormal)
978e51ba
MS
1711{
1712 ci->map = map;
1713 ci->io = alloc_io(md, bio);
d41e077a 1714 ci->bio = bio;
4edadf6d 1715 ci->is_abnormal_io = is_abnormal;
b99fdcdc 1716 ci->submit_as_polled = false;
978e51ba 1717 ci->sector = bio->bi_iter.bi_sector;
d41e077a
MS
1718 ci->sector_count = bio_sectors(bio);
1719
1720 /* Shouldn't happen but sector_count was being set to 0 so... */
442761fd
MS
1721 if (static_branch_unlikely(&zoned_enabled) &&
1722 WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
d41e077a 1723 ci->sector_count = 0;
978e51ba
MS
1724}
1725
1da177e4 1726/*
14fe594d 1727 * Entry point to split a bio into clones and submit them to the targets.
1da177e4 1728 */
96c9865c
MS
1729static void dm_split_and_process_bio(struct mapped_device *md,
1730 struct dm_table *map, struct bio *bio)
0ce65797 1731{
1da177e4 1732 struct clone_info ci;
4857abf6 1733 struct dm_io *io;
84b98f4c 1734 blk_status_t error = BLK_STS_OK;
4edadf6d
MS
1735 bool is_abnormal;
1736
1737 is_abnormal = is_abnormal_io(bio);
1738 if (unlikely(is_abnormal)) {
1739 /*
5a97806f 1740 * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc)
4edadf6d
MS
1741 * otherwise associated queue_limits won't be imposed.
1742 */
5a97806f 1743 bio = bio_split_to_limits(bio);
613b1488
JA
1744 if (!bio)
1745 return;
4edadf6d 1746 }
1da177e4 1747
4edadf6d 1748 init_clone_info(&ci, md, map, bio, is_abnormal);
4857abf6 1749 io = ci.io;
0ce65797 1750
1eff9d32 1751 if (bio->bi_opf & REQ_PREFLUSH) {
332f2b1e 1752 __send_empty_flush(&ci);
e2736347 1753 /* dm_io_complete submits any data associated with flush */
d41e077a 1754 goto out;
d87f4c14 1755 }
0ce65797 1756
d41e077a
MS
1757 error = __split_and_process_bio(&ci);
1758 if (error || !ci.sector_count)
1759 goto out;
d41e077a
MS
1760 /*
1761 * Remainder must be passed to submit_bio_noacct() so it gets handled
1762 * *after* bios already submitted have been completely processed.
d41e077a 1763 */
8b211aac
ML
1764 bio_trim(bio, io->sectors, ci.sector_count);
1765 trace_block_split(bio, bio->bi_iter.bi_sector);
1766 bio_inc_remaining(bio);
d41e077a
MS
1767 submit_bio_noacct(bio);
1768out:
b99fdcdc
ML
1769 /*
1770 * Drop the extra reference count for non-POLLED bio, and hold one
1771 * reference for POLLED bio, which will be released in dm_poll_bio
1772 *
ec211631
ML
1773 * Add every dm_io instance into the dm_io list head which is stored
1774 * in bio->bi_private, so that dm_poll_bio can poll them all.
b99fdcdc 1775 */
0f14d60a
ML
1776 if (error || !ci.submit_as_polled) {
1777 /*
1778 * In case of submission failure, the extra reference for
1779 * submitting io isn't consumed yet
1780 */
1781 if (error)
1782 atomic_dec(&io->io_count);
1783 dm_io_dec_pending(io, error);
1784 } else
4857abf6 1785 dm_queue_poll_io(bio, io);
0ce65797
MS
1786}
1787
3e08773c 1788static void dm_submit_bio(struct bio *bio)
cec47e3d 1789{
309dca30 1790 struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
83d5e5b0
MP
1791 int srcu_idx;
1792 struct dm_table *map;
a3282b43 1793 blk_opf_t bio_opf = bio->bi_opf;
cec47e3d 1794
5d7362d0 1795 map = dm_get_live_table_bio(md, &srcu_idx, bio_opf);
29e4013d 1796
fa247089
MS
1797 /* If suspended, or map not yet available, queue this IO for later */
1798 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
1799 unlikely(!map)) {
6abc4946
KK
1800 if (bio->bi_opf & REQ_NOWAIT)
1801 bio_wouldblock_error(bio);
b2abdb1b 1802 else if (bio->bi_opf & REQ_RAHEAD)
54d9a1b4 1803 bio_io_error(bio);
b2abdb1b
MS
1804 else
1805 queue_io(md, bio);
1806 goto out;
cec47e3d 1807 }
1da177e4 1808
96c9865c 1809 dm_split_and_process_bio(md, map, bio);
b2abdb1b 1810out:
5d7362d0 1811 dm_put_live_table_bio(md, srcu_idx, bio_opf);
978e51ba
MS
1812}
1813
b99fdcdc
ML
1814static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
1815 unsigned int flags)
1816{
655f3aad 1817 WARN_ON_ONCE(!dm_tio_is_normal(&io->tio));
b99fdcdc
ML
1818
1819 /* don't poll if the mapped io is done */
1820 if (atomic_read(&io->io_count) > 1)
1821 bio_poll(&io->tio.clone, iob, flags);
1822
1823 /* bio_poll holds the last reference */
1824 return atomic_read(&io->io_count) == 1;
1825}
1826
1827static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
1828 unsigned int flags)
1829{
ec211631
ML
1830 struct dm_io **head = dm_poll_list_head(bio);
1831 struct dm_io *list = *head;
1832 struct dm_io *tmp = NULL;
1833 struct dm_io *curr, *next;
b99fdcdc
ML
1834
1835 /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */
1836 if (!(bio->bi_opf & REQ_DM_POLL_LIST))
1837 return 0;
1838
ec211631 1839 WARN_ON_ONCE(!list);
b99fdcdc
ML
1840
1841 /*
1842 * Restore .bi_private before possibly completing dm_io.
1843 *
1844 * bio_poll() is only possible once @bio has been completely
1845 * submitted via submit_bio_noacct()'s depth-first submission.
1846 * So there is no dm_queue_poll_io() race associated with
1847 * clearing REQ_DM_POLL_LIST here.
1848 */
1849 bio->bi_opf &= ~REQ_DM_POLL_LIST;
ec211631 1850 bio->bi_private = list->data;
b99fdcdc 1851
ec211631
ML
1852 for (curr = list, next = curr->next; curr; curr = next, next =
1853 curr ? curr->next : NULL) {
1854 if (dm_poll_dm_io(curr, iob, flags)) {
b99fdcdc 1855 /*
84b98f4c
MS
1856 * clone_endio() has already occurred, so no
1857 * error handling is needed here.
b99fdcdc 1858 */
ec211631
ML
1859 __dm_io_dec_pending(curr);
1860 } else {
1861 curr->next = tmp;
1862 tmp = curr;
b99fdcdc
ML
1863 }
1864 }
1865
1866 /* Not done? */
ec211631 1867 if (tmp) {
b99fdcdc
ML
1868 bio->bi_opf |= REQ_DM_POLL_LIST;
1869 /* Reset bio->bi_private to dm_io list head */
ec211631 1870 *head = tmp;
b99fdcdc
ML
1871 return 0;
1872 }
1873 return 1;
1874}
1875
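/*
 * [Editor's sketch, not part of dm.c] dm_poll_bio() above walks an intrusive
 * singly linked list of dm_io structures whose head is stashed in
 * bio->bi_private, keeping only the entries that are still in flight. The
 * same push/prune pattern in isolation (demo_* names are hypothetical):
 */
#include <linux/types.h>

struct demo_node {
	struct demo_node *next;
	bool done;
};

static void demo_push(struct demo_node **head, struct demo_node *node)
{
	node->next = *head;	/* what dm_queue_poll_io() effectively does */
	*head = node;
}

static struct demo_node *demo_prune(struct demo_node *list)
{
	struct demo_node *kept = NULL, *curr, *next;

	for (curr = list; curr; curr = next) {
		next = curr->next;
		if (!curr->done)
			demo_push(&kept, curr);	/* rebuild list of pending entries */
	}
	return kept;		/* NULL means everything has completed */
}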
1da177e4
LT
1876/*-----------------------------------------------------------------
1877 * An IDR is used to keep track of allocated minor numbers.
1878 *---------------------------------------------------------------*/
2b06cfff 1879static void free_minor(int minor)
1da177e4 1880{
f32c10b0 1881 spin_lock(&_minor_lock);
1da177e4 1882 idr_remove(&_minor_idr, minor);
f32c10b0 1883 spin_unlock(&_minor_lock);
1da177e4
LT
1884}
1885
1886/*
1887 * See if the device with a specific minor # is free.
1888 */
cf13ab8e 1889static int specific_minor(int minor)
1da177e4 1890{
c9d76be6 1891 int r;
1da177e4
LT
1892
1893 if (minor >= (1 << MINORBITS))
1894 return -EINVAL;
1895
c9d76be6 1896 idr_preload(GFP_KERNEL);
f32c10b0 1897 spin_lock(&_minor_lock);
1da177e4 1898
c9d76be6 1899 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1da177e4 1900
f32c10b0 1901 spin_unlock(&_minor_lock);
c9d76be6
TH
1902 idr_preload_end();
1903 if (r < 0)
1904 return r == -ENOSPC ? -EBUSY : r;
1905 return 0;
1da177e4
LT
1906}
1907
cf13ab8e 1908static int next_free_minor(int *minor)
1da177e4 1909{
c9d76be6 1910 int r;
62f75c2f 1911
c9d76be6 1912 idr_preload(GFP_KERNEL);
f32c10b0 1913 spin_lock(&_minor_lock);
1da177e4 1914
c9d76be6 1915 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1da177e4 1916
f32c10b0 1917 spin_unlock(&_minor_lock);
c9d76be6
TH
1918 idr_preload_end();
1919 if (r < 0)
1920 return r;
1921 *minor = r;
1922 return 0;
1da177e4
LT
1923}
1924
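/*
 * [Editor's sketch, not part of dm.c] specific_minor() and next_free_minor()
 * above use the standard idr_preload()/idr_alloc() pairing: preallocate with
 * GFP_KERNEL while sleeping is still allowed, then allocate with GFP_NOWAIT
 * under the spinlock. The same pattern in isolation (demo_* names are
 * hypothetical):
 */
#include <linux/idr.h>
#include <linux/kdev_t.h>
#include <linux/spinlock.h>

static DEFINE_IDR(demo_idr);
static DEFINE_SPINLOCK(demo_lock);

static int demo_alloc_id(void *ptr)
{
	int id;

	idr_preload(GFP_KERNEL);	/* may sleep; done before taking the lock */
	spin_lock(&demo_lock);
	id = idr_alloc(&demo_idr, ptr, 0, 1 << MINORBITS, GFP_NOWAIT);
	spin_unlock(&demo_lock);
	idr_preload_end();

	return id;	/* >= 0 on success, negative errno (e.g. -ENOSPC) on failure */
}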
83d5cde4 1925static const struct block_device_operations dm_blk_dops;
681cc5e8 1926static const struct block_device_operations dm_rq_blk_dops;
f26c5719 1927static const struct dax_operations dm_dax_ops;
1da177e4 1928
53d5914f
MP
1929static void dm_wq_work(struct work_struct *work);
1930
aa6ce87a 1931#ifdef CONFIG_BLK_INLINE_ENCRYPTION
cb77cb5a 1932static void dm_queue_destroy_crypto_profile(struct request_queue *q)
aa6ce87a 1933{
cb77cb5a 1934 dm_destroy_crypto_profile(q->crypto_profile);
aa6ce87a
ST
1935}
1936
1937#else /* CONFIG_BLK_INLINE_ENCRYPTION */
1938
cb77cb5a 1939static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
aa6ce87a
ST
1940{
1941}
1942#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
1943
0f20972f
MS
1944static void cleanup_mapped_device(struct mapped_device *md)
1945{
0f20972f
MS
1946 if (md->wq)
1947 destroy_workqueue(md->wq);
29dec90a 1948 dm_free_md_mempools(md->mempools);
0f20972f 1949
f26c5719 1950 if (md->dax_dev) {
fb08a190 1951 dax_remove_host(md->disk);
f26c5719
DW
1952 kill_dax(md->dax_dev);
1953 put_dax(md->dax_dev);
1954 md->dax_dev = NULL;
1955 }
1956
588b7f5d 1957 dm_cleanup_zoned_dev(md);
0f20972f
MS
1958 if (md->disk) {
1959 spin_lock(&_minor_lock);
1960 md->disk->private_data = NULL;
1961 spin_unlock(&_minor_lock);
89f871af 1962 if (dm_get_md_type(md) != DM_TYPE_NONE) {
1a581b72
CH
1963 struct table_device *td;
1964
89f871af 1965 dm_sysfs_exit(md);
1a581b72
CH
1966 list_for_each_entry(td, &md->table_devices, list) {
1967 bd_unlink_disk_holder(td->dm_dev.bdev,
1968 md->disk);
1969 }
d563792c
YK
1970
1971 /*
 1972 * Hold the lock to make sure del_gendisk() won't run concurrently
1973 * with open/close_table_device().
1974 */
1975 mutex_lock(&md->table_devices_lock);
89f871af 1976 del_gendisk(md->disk);
d563792c 1977 mutex_unlock(&md->table_devices_lock);
89f871af 1978 }
cb77cb5a 1979 dm_queue_destroy_crypto_profile(md->queue);
8b9ab626 1980 put_disk(md->disk);
74a2b6ec 1981 }
0f20972f 1982
9f6dc633
MS
1983 if (md->pending_io) {
1984 free_percpu(md->pending_io);
1985 md->pending_io = NULL;
1986 }
1987
d09960b0
TE
1988 cleanup_srcu_struct(&md->io_barrier);
1989
d5ffebdd
MS
1990 mutex_destroy(&md->suspend_lock);
1991 mutex_destroy(&md->type_lock);
1992 mutex_destroy(&md->table_devices_lock);
a666e5c0 1993 mutex_destroy(&md->swap_bios_lock);
d5ffebdd 1994
4cc96131 1995 dm_mq_cleanup_mapped_device(md);
0f20972f
MS
1996}
1997
1da177e4
LT
1998/*
1999 * Allocate and initialise a blank device with a given minor.
2000 */
2b06cfff 2001static struct mapped_device *alloc_dev(int minor)
1da177e4 2002{
115485e8
MS
2003 int r, numa_node_id = dm_get_numa_node();
2004 struct mapped_device *md;
ba61fdd1 2005 void *old_md;
1da177e4 2006
856eb091 2007 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1da177e4 2008 if (!md) {
43e6c111 2009 DMERR("unable to allocate device, out of memory.");
1da177e4
LT
2010 return NULL;
2011 }
2012
10da4f79 2013 if (!try_module_get(THIS_MODULE))
6ed7ade8 2014 goto bad_module_get;
10da4f79 2015
1da177e4 2016 /* get a minor number for the dev */
2b06cfff 2017 if (minor == DM_ANY_MINOR)
cf13ab8e 2018 r = next_free_minor(&minor);
2b06cfff 2019 else
cf13ab8e 2020 r = specific_minor(minor);
1da177e4 2021 if (r < 0)
6ed7ade8 2022 goto bad_minor;
1da177e4 2023
83d5e5b0
MP
2024 r = init_srcu_struct(&md->io_barrier);
2025 if (r < 0)
2026 goto bad_io_barrier;
2027
115485e8 2028 md->numa_node_id = numa_node_id;
591ddcfc 2029 md->init_tio_pdu = false;
a5664dad 2030 md->type = DM_TYPE_NONE;
e61290a4 2031 mutex_init(&md->suspend_lock);
a5664dad 2032 mutex_init(&md->type_lock);
86f1152b 2033 mutex_init(&md->table_devices_lock);
022c2611 2034 spin_lock_init(&md->deferred_lock);
1da177e4 2035 atomic_set(&md->holders, 1);
5c6bd75d 2036 atomic_set(&md->open_count, 0);
1da177e4 2037 atomic_set(&md->event_nr, 0);
7a8c3d3b
MA
2038 atomic_set(&md->uevent_seq, 0);
2039 INIT_LIST_HEAD(&md->uevent_list);
86f1152b 2040 INIT_LIST_HEAD(&md->table_devices);
7a8c3d3b 2041 spin_lock_init(&md->uevent_lock);
1da177e4 2042
47ace7e0 2043 /*
c62b37d9
CH
 2044 * default to bio-based until a DM table is loaded and md->type is
 2045 * established. If a request-based table is loaded, blk-mq will
 2046 * override accordingly.
47ace7e0 2047 */
74fe6ba9 2048 md->disk = blk_alloc_disk(md->numa_node_id);
1da177e4 2049 if (!md->disk)
0f20972f 2050 goto bad;
74fe6ba9 2051 md->queue = md->disk->queue;
1da177e4 2052
f0b04115 2053 init_waitqueue_head(&md->wait);
53d5914f 2054 INIT_WORK(&md->work, dm_wq_work);
8b211aac 2055 INIT_WORK(&md->requeue_work, dm_wq_requeue_work);
f0b04115 2056 init_waitqueue_head(&md->eventq);
2995fa78 2057 init_completion(&md->kobj_holder.completion);
f0b04115 2058
8b211aac 2059 md->requeue_list = NULL;
a666e5c0
MP
2060 md->swap_bios = get_swap_bios();
2061 sema_init(&md->swap_bios_semaphore, md->swap_bios);
2062 mutex_init(&md->swap_bios_lock);
2063
1da177e4
LT
2064 md->disk->major = _major;
2065 md->disk->first_minor = minor;
74fe6ba9 2066 md->disk->minors = 1;
1ebe2e5f 2067 md->disk->flags |= GENHD_FL_NO_PART;
1da177e4 2068 md->disk->fops = &dm_blk_dops;
1da177e4
LT
2069 md->disk->private_data = md;
2070 sprintf(md->disk->disk_name, "dm-%d", minor);
f26c5719 2071
5d2a228b 2072 if (IS_ENABLED(CONFIG_FS_DAX)) {
30c6828a 2073 md->dax_dev = alloc_dax(md, &dm_dax_ops);
d7519392
CH
2074 if (IS_ERR(md->dax_dev)) {
2075 md->dax_dev = NULL;
976431b0 2076 goto bad;
d7519392 2077 }
7ac5360c
CH
2078 set_dax_nocache(md->dax_dev);
2079 set_dax_nomc(md->dax_dev);
fb08a190 2080 if (dax_add_host(md->dax_dev, md->disk))
976431b0
DW
2081 goto bad;
2082 }
f26c5719 2083
7e51f257 2084 format_dev_t(md->name, MKDEV(_major, minor));
1da177e4 2085
c7c879ee 2086 md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name);
304f3f6a 2087 if (!md->wq)
0f20972f 2088 goto bad;
304f3f6a 2089
9f6dc633
MS
2090 md->pending_io = alloc_percpu(unsigned long);
2091 if (!md->pending_io)
2092 goto bad;
2093
fd2ed4d2
MP
2094 dm_stats_init(&md->stats);
2095
ba61fdd1 2096 /* Populate the mapping, nobody knows we exist yet */
f32c10b0 2097 spin_lock(&_minor_lock);
ba61fdd1 2098 old_md = idr_replace(&_minor_idr, md, minor);
f32c10b0 2099 spin_unlock(&_minor_lock);
ba61fdd1
JM
2100
2101 BUG_ON(old_md != MINOR_ALLOCED);
2102
1da177e4
LT
2103 return md;
2104
0f20972f
MS
2105bad:
2106 cleanup_mapped_device(md);
83d5e5b0 2107bad_io_barrier:
1da177e4 2108 free_minor(minor);
6ed7ade8 2109bad_minor:
10da4f79 2110 module_put(THIS_MODULE);
6ed7ade8 2111bad_module_get:
856eb091 2112 kvfree(md);
1da177e4
LT
2113 return NULL;
2114}
2115
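/*
 * [Editor's sketch, not part of dm.c] alloc_dev() above uses the kernel's
 * usual unwind-on-error style: each failure jumps to a label that releases
 * only what was already set up, in reverse order. A condensed, hypothetical
 * illustration of the same shape:
 */
#include <linux/module.h>
#include <linux/slab.h>

struct demo_dev {
	void *buf;
};

static struct demo_dev *demo_alloc_dev(void)
{
	struct demo_dev *d = kvzalloc(sizeof(*d), GFP_KERNEL);

	if (!d)
		return NULL;
	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;
	d->buf = kmalloc(64, GFP_KERNEL);
	if (!d->buf)
		goto bad_buf;
	return d;

bad_buf:
	module_put(THIS_MODULE);
bad_module_get:
	kvfree(d);
	return NULL;
}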
ae9da83f
JN
2116static void unlock_fs(struct mapped_device *md);
2117
1da177e4
LT
2118static void free_dev(struct mapped_device *md)
2119{
f331c029 2120 int minor = MINOR(disk_devt(md->disk));
63d94e48 2121
32a926da 2122 unlock_fs(md);
2eb6e1e3 2123
0f20972f 2124 cleanup_mapped_device(md);
63a4f065 2125
992ec6a9 2126 WARN_ON_ONCE(!list_empty(&md->table_devices));
63a4f065 2127 dm_stats_cleanup(&md->stats);
63a4f065
MS
2128 free_minor(minor);
2129
10da4f79 2130 module_put(THIS_MODULE);
856eb091 2131 kvfree(md);
1da177e4
LT
2132}
2133
2134/*
2135 * Bind a table to the device.
2136 */
2137static void event_callback(void *context)
2138{
7a8c3d3b
MA
2139 unsigned long flags;
2140 LIST_HEAD(uevents);
1da177e4
LT
2141 struct mapped_device *md = (struct mapped_device *) context;
2142
7a8c3d3b
MA
2143 spin_lock_irqsave(&md->uevent_lock, flags);
2144 list_splice_init(&md->uevent_list, &uevents);
2145 spin_unlock_irqrestore(&md->uevent_lock, flags);
2146
ed9e1982 2147 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
7a8c3d3b 2148
1da177e4
LT
2149 atomic_inc(&md->event_nr);
2150 wake_up(&md->eventq);
62e08243 2151 dm_issue_global_event();
1da177e4
LT
2152}
2153
042d2a9b
AK
2154/*
2155 * Returns old map, which caller must destroy.
2156 */
2157static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2158 struct queue_limits *limits)
1da177e4 2159{
042d2a9b 2160 struct dm_table *old_map;
1da177e4 2161 sector_t size;
2a2a4c51 2162 int ret;
1da177e4 2163
5a8f1f80
BVA
2164 lockdep_assert_held(&md->suspend_lock);
2165
1da177e4 2166 size = dm_table_get_size(t);
3ac51e74
DW
2167
2168 /*
2169 * Wipe any geometry if the size of the table changed.
2170 */
fd2ed4d2 2171 if (size != dm_get_size(md))
3ac51e74
DW
2172 memset(&md->geometry, 0, sizeof(md->geometry));
2173
7533afa1 2174 set_capacity(md->disk, size);
d5816876 2175
2ca3310e
AK
2176 dm_table_event_callback(t, event_callback, md);
2177
f5b4aee1 2178 if (dm_table_request_based(t)) {
16f12266 2179 /*
9c37de29
MS
2180 * Leverage the fact that request-based DM targets are
2181 * immutable singletons - used to optimize dm_mq_queue_rq.
16f12266
MS
2182 */
2183 md->immutable_target = dm_table_get_immutable_target(t);
e6ee8c0b 2184
29dec90a
CH
2185 /*
2186 * There is no need to reload with request-based dm because the
2187 * size of front_pad doesn't change.
2188 *
 2189 * Note for the future: if the bioset is ever reloaded, prep-ed
 2190 * requests in the queue may still refer to bios from the old bioset,
 2191 * so the queue must be walked to unprep them.
2192 */
2193 if (!md->mempools) {
2194 md->mempools = t->mempools;
2195 t->mempools = NULL;
2196 }
2197 } else {
2198 /*
2199 * The md may already have mempools that need changing.
2200 * If so, reload bioset because front_pad may have changed
2201 * because a different table was loaded.
2202 */
2203 dm_free_md_mempools(md->mempools);
2204 md->mempools = t->mempools;
2205 t->mempools = NULL;
2a2a4c51 2206 }
e6ee8c0b 2207
f5b4aee1 2208 ret = dm_table_set_restrictions(t, md->queue, limits);
bb37d772
DLM
2209 if (ret) {
2210 old_map = ERR_PTR(ret);
2211 goto out;
2212 }
2213
a12f5d48 2214 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
1d3aa6f6 2215 rcu_assign_pointer(md->map, (void *)t);
36a0456f
AK
2216 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2217
41abc4e1
HR
2218 if (old_map)
2219 dm_sync_table(md);
2a2a4c51 2220out:
042d2a9b 2221 return old_map;
1da177e4
LT
2222}
2223
a7940155
AK
2224/*
2225 * Returns unbound table for the caller to free.
2226 */
2227static struct dm_table *__unbind(struct mapped_device *md)
1da177e4 2228{
a12f5d48 2229 struct dm_table *map = rcu_dereference_protected(md->map, 1);
1da177e4
LT
2230
2231 if (!map)
a7940155 2232 return NULL;
1da177e4
LT
2233
2234 dm_table_event_callback(map, NULL, NULL);
9cdb8520 2235 RCU_INIT_POINTER(md->map, NULL);
83d5e5b0 2236 dm_sync_table(md);
a7940155
AK
2237
2238 return map;
1da177e4
LT
2239}
2240
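/*
 * [Editor's sketch, not part of dm.c] __bind()/__unbind() above publish the
 * live table with rcu_assign_pointer() and wait out readers with
 * dm_sync_table() (SRCU on md->io_barrier, per the synchronize_srcu() calls
 * elsewhere in this file). A minimal publish/retire sketch, assuming readers
 * dereference the pointer inside srcu_read_lock()/srcu_read_unlock() on the
 * same srcu_struct (demo_* names are hypothetical):
 */
#include <linux/rcupdate.h>
#include <linux/srcu.h>

struct demo_cfg {
	int value;
};

static struct demo_cfg __rcu *demo_cfg_ptr;
DEFINE_STATIC_SRCU(demo_srcu);

static struct demo_cfg *demo_swap_cfg(struct demo_cfg *new_cfg)
{
	struct demo_cfg *old_cfg;

	old_cfg = rcu_dereference_protected(demo_cfg_ptr, 1);	/* updater-side access */
	rcu_assign_pointer(demo_cfg_ptr, new_cfg);		/* publish */
	synchronize_srcu(&demo_srcu);				/* wait for old readers */

	return old_cfg;		/* caller may now free the retired config */
}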
2241/*
2242 * Constructor for a new device.
2243 */
2b06cfff 2244int dm_create(int minor, struct mapped_device **result)
1da177e4
LT
2245{
2246 struct mapped_device *md;
2247
2b06cfff 2248 md = alloc_dev(minor);
1da177e4
LT
2249 if (!md)
2250 return -ENXIO;
2251
91ccbbac
TS
2252 dm_ima_reset_data(md);
2253
1da177e4
LT
2254 *result = md;
2255 return 0;
2256}
2257
a5664dad
MS
2258/*
2259 * Functions to manage md->type.
2260 * All are required to hold md->type_lock.
2261 */
2262void dm_lock_md_type(struct mapped_device *md)
2263{
2264 mutex_lock(&md->type_lock);
2265}
2266
2267void dm_unlock_md_type(struct mapped_device *md)
2268{
2269 mutex_unlock(&md->type_lock);
2270}
2271
7e0d574f 2272void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
a5664dad 2273{
00c4fc3b 2274 BUG_ON(!mutex_is_locked(&md->type_lock));
a5664dad
MS
2275 md->type = type;
2276}
2277
7e0d574f 2278enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
a5664dad
MS
2279{
2280 return md->type;
2281}
2282
36a0456f
AK
2283struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2284{
2285 return md->immutable_target_type;
2286}
2287
f84cb8a4
MS
2288/*
2289 * The queue_limits are only valid as long as you have a reference
2290 * count on 'md'.
2291 */
2292struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2293{
2294 BUG_ON(!atomic_read(&md->holders));
2295 return &md->queue->limits;
2296}
2297EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2298
4a0b4ddf
MS
2299/*
2300 * Setup the DM device's queue based on md's type
2301 */
591ddcfc 2302int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
4a0b4ddf 2303{
ba305859 2304 enum dm_queue_mode type = dm_table_get_type(t);
c100ec49 2305 struct queue_limits limits;
1a581b72 2306 struct table_device *td;
ba305859 2307 int r;
bfebd1cd 2308
545ed20e 2309 switch (type) {
bfebd1cd 2310 case DM_TYPE_REQUEST_BASED:
681cc5e8 2311 md->disk->fops = &dm_rq_blk_dops;
e83068a5 2312 r = dm_mq_init_request_queue(md, t);
bfebd1cd 2313 if (r) {
681cc5e8 2314 DMERR("Cannot initialize queue for request-based dm mapped device");
bfebd1cd
MS
2315 return r;
2316 }
2317 break;
2318 case DM_TYPE_BIO_BASED:
545ed20e 2319 case DM_TYPE_DAX_BIO_BASED:
bfebd1cd 2320 break;
7e0d574f
BVA
2321 case DM_TYPE_NONE:
2322 WARN_ON_ONCE(true);
2323 break;
4a0b4ddf
MS
2324 }
2325
c100ec49
MS
2326 r = dm_calculate_queue_limits(t, &limits);
2327 if (r) {
2328 DMERR("Cannot calculate initial queue limits");
2329 return r;
2330 }
bb37d772
DLM
2331 r = dm_table_set_restrictions(t, md->queue, &limits);
2332 if (r)
2333 return r;
2334
d563792c
YK
2335 /*
 2336 * Hold the lock to make sure add_disk() and del_gendisk() won't run concurrently
2337 * with open_table_device() and close_table_device().
2338 */
2339 mutex_lock(&md->table_devices_lock);
e7089f65 2340 r = add_disk(md->disk);
d563792c 2341 mutex_unlock(&md->table_devices_lock);
e7089f65
LC
2342 if (r)
2343 return r;
c100ec49 2344
1a581b72
CH
2345 /*
2346 * Register the holder relationship for devices added before the disk
2347 * was live.
2348 */
2349 list_for_each_entry(td, &md->table_devices, list) {
2350 r = bd_link_disk_holder(td->dm_dev.bdev, md->disk);
2351 if (r)
2352 goto out_undo_holders;
89f871af 2353 }
d563792c 2354
1a581b72
CH
2355 r = dm_sysfs_init(md);
2356 if (r)
2357 goto out_undo_holders;
2358
89f871af 2359 md->type = type;
4a0b4ddf 2360 return 0;
1a581b72
CH
2361
2362out_undo_holders:
2363 list_for_each_entry_continue_reverse(td, &md->table_devices, list)
2364 bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
2365 mutex_lock(&md->table_devices_lock);
2366 del_gendisk(md->disk);
2367 mutex_unlock(&md->table_devices_lock);
2368 return r;
4a0b4ddf
MS
2369}
2370
2bec1f4a 2371struct mapped_device *dm_get_md(dev_t dev)
1da177e4
LT
2372{
2373 struct mapped_device *md;
86a3238c 2374 unsigned int minor = MINOR(dev);
1da177e4
LT
2375
2376 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2377 return NULL;
2378
f32c10b0 2379 spin_lock(&_minor_lock);
1da177e4
LT
2380
2381 md = idr_find(&_minor_idr, minor);
49de5769
MS
2382 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2383 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2384 md = NULL;
2385 goto out;
fba9f90e 2386 }
49de5769 2387 dm_get(md);
fba9f90e 2388out:
f32c10b0 2389 spin_unlock(&_minor_lock);
1da177e4 2390
637842cf
DT
2391 return md;
2392}
3cf2e4ba 2393EXPORT_SYMBOL_GPL(dm_get_md);
d229a958 2394
9ade92a9 2395void *dm_get_mdptr(struct mapped_device *md)
637842cf 2396{
9ade92a9 2397 return md->interface_ptr;
1da177e4
LT
2398}
2399
2400void dm_set_mdptr(struct mapped_device *md, void *ptr)
2401{
2402 md->interface_ptr = ptr;
2403}
2404
2405void dm_get(struct mapped_device *md)
2406{
2407 atomic_inc(&md->holders);
3f77316d 2408 BUG_ON(test_bit(DMF_FREEING, &md->flags));
1da177e4
LT
2409}
2410
09ee96b2
MP
2411int dm_hold(struct mapped_device *md)
2412{
2413 spin_lock(&_minor_lock);
2414 if (test_bit(DMF_FREEING, &md->flags)) {
2415 spin_unlock(&_minor_lock);
2416 return -EBUSY;
2417 }
2418 dm_get(md);
2419 spin_unlock(&_minor_lock);
2420 return 0;
2421}
2422EXPORT_SYMBOL_GPL(dm_hold);
2423
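/*
 * [Editor's sketch, not part of dm.c] Usage shape for the reference helpers
 * above: dm_hold() refuses to pin a mapped_device that is already being
 * freed, so a caller that reached md without holding a reference would
 * typically do something like the following (demo function is hypothetical):
 */
static int demo_use_md(struct mapped_device *md)
{
	if (dm_hold(md))		/* -EBUSY if DMF_FREEING is already set */
		return -EBUSY;

	/* ... safe to use md here, e.g. dm_device_name(md) ... */

	dm_put(md);
	return 0;
}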
72d94861
AK
2424const char *dm_device_name(struct mapped_device *md)
2425{
2426 return md->name;
2427}
2428EXPORT_SYMBOL_GPL(dm_device_name);
2429
3f77316d 2430static void __dm_destroy(struct mapped_device *md, bool wait)
1da177e4 2431{
1134e5ae 2432 struct dm_table *map;
83d5e5b0 2433 int srcu_idx;
1da177e4 2434
3f77316d 2435 might_sleep();
fba9f90e 2436
63a4f065 2437 spin_lock(&_minor_lock);
3f77316d
KU
2438 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2439 set_bit(DMF_FREEING, &md->flags);
2440 spin_unlock(&_minor_lock);
3b785fbc 2441
7a5428dc 2442 blk_mark_disk_dead(md->disk);
3f77316d 2443
ab7c7bb6
MP
2444 /*
2445 * Take suspend_lock so that presuspend and postsuspend methods
2446 * do not race with internal suspend.
2447 */
2448 mutex_lock(&md->suspend_lock);
2a708cff 2449 map = dm_get_live_table(md, &srcu_idx);
3f77316d
KU
2450 if (!dm_suspended_md(md)) {
2451 dm_table_presuspend_targets(map);
adc0daad 2452 set_bit(DMF_SUSPENDED, &md->flags);
5df96f2b 2453 set_bit(DMF_POST_SUSPENDING, &md->flags);
3f77316d 2454 dm_table_postsuspend_targets(map);
1da177e4 2455 }
238d991f 2456 /* dm_put_live_table must be before fsleep, otherwise deadlock is possible */
83d5e5b0 2457 dm_put_live_table(md, srcu_idx);
2a708cff 2458 mutex_unlock(&md->suspend_lock);
83d5e5b0 2459
3f77316d
KU
2460 /*
 2461 * Rare, but there may be I/O requests still completing, for example.
 2462 * Wait for all references to disappear.
 2463 * No one should increment the reference count of the mapped_device
 2464 * after its state becomes DMF_FREEING.
2465 */
2466 if (wait)
2467 while (atomic_read(&md->holders))
238d991f 2468 fsleep(1000);
3f77316d
KU
2469 else if (atomic_read(&md->holders))
2470 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2471 dm_device_name(md), atomic_read(&md->holders));
2472
3f77316d
KU
2473 dm_table_destroy(__unbind(md));
2474 free_dev(md);
2475}
2476
2477void dm_destroy(struct mapped_device *md)
2478{
2479 __dm_destroy(md, true);
2480}
2481
2482void dm_destroy_immediate(struct mapped_device *md)
2483{
2484 __dm_destroy(md, false);
2485}
2486
2487void dm_put(struct mapped_device *md)
2488{
2489 atomic_dec(&md->holders);
1da177e4 2490}
79eb885c 2491EXPORT_SYMBOL_GPL(dm_put);
1da177e4 2492
9f6dc633 2493static bool dm_in_flight_bios(struct mapped_device *md)
85067747
ML
2494{
2495 int cpu;
9f6dc633 2496 unsigned long sum = 0;
85067747 2497
9f6dc633
MS
2498 for_each_possible_cpu(cpu)
2499 sum += *per_cpu_ptr(md->pending_io, cpu);
85067747
ML
2500
2501 return sum != 0;
2502}
2503
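/*
 * [Editor's sketch, not part of dm.c] md->pending_io is a per-CPU counter:
 * writers bump their local copy cheaply, and dm_in_flight_bios() above sums
 * all CPUs on the (rare) read side. The same pattern in isolation; the
 * increment side here assumes this_cpu_inc(), matching the style of the I/O
 * accounting code earlier in this file (demo_* names are hypothetical):
 */
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/types.h>

static unsigned long __percpu *demo_pending;

static int demo_init(void)
{
	demo_pending = alloc_percpu(unsigned long);
	return demo_pending ? 0 : -ENOMEM;
}

static void demo_start_io(void)
{
	this_cpu_inc(*demo_pending);	/* lock-free per-CPU fast path */
}

static bool demo_in_flight(void)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += *per_cpu_ptr(demo_pending, cpu);
	return sum != 0;
}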
2f064a59 2504static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
46125c1c
MB
2505{
2506 int r = 0;
9f4c3f87 2507 DEFINE_WAIT(wait);
46125c1c 2508
85067747 2509 while (true) {
9f4c3f87 2510 prepare_to_wait(&md->wait, &wait, task_state);
46125c1c 2511
9f6dc633 2512 if (!dm_in_flight_bios(md))
46125c1c
MB
2513 break;
2514
e3fabdfd 2515 if (signal_pending_state(task_state, current)) {
46125c1c
MB
2516 r = -EINTR;
2517 break;
2518 }
2519
2520 io_schedule();
2521 }
9f4c3f87 2522 finish_wait(&md->wait, &wait);
b44ebeb0 2523
9f6dc633
MS
2524 smp_rmb();
2525
46125c1c
MB
2526 return r;
2527}
2528
2f064a59 2529static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
85067747
ML
2530{
2531 int r = 0;
2532
2533 if (!queue_is_mq(md->queue))
2534 return dm_wait_for_bios_completion(md, task_state);
2535
2536 while (true) {
2537 if (!blk_mq_queue_inflight(md->queue))
2538 break;
2539
2540 if (signal_pending_state(task_state, current)) {
2541 r = -EINTR;
2542 break;
2543 }
2544
238d991f 2545 fsleep(5000);
85067747
ML
2546 }
2547
2548 return r;
2549}
2550
1da177e4
LT
2551/*
2552 * Process the deferred bios
2553 */
ef208587 2554static void dm_wq_work(struct work_struct *work)
1da177e4 2555{
0c2915b8
MS
2556 struct mapped_device *md = container_of(work, struct mapped_device, work);
2557 struct bio *bio;
ef208587 2558
3b00b203 2559 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
df12ee99 2560 spin_lock_irq(&md->deferred_lock);
0c2915b8 2561 bio = bio_list_pop(&md->deferred);
df12ee99
AK
2562 spin_unlock_irq(&md->deferred_lock);
2563
0c2915b8 2564 if (!bio)
df12ee99 2565 break;
022c2611 2566
0c2915b8 2567 submit_bio_noacct(bio);
022c2611 2568 }
1da177e4
LT
2569}
2570
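/*
 * [Editor's sketch, not part of dm.c] dm_wq_work() above is the consumer half
 * of a spinlock-protected bio_list handoff. A hypothetical producer-side
 * counterpart is sketched below; in dm.c that role is played by queue_io(),
 * defined earlier in the file (demo_* names and fields are illustrative only):
 */
#include <linux/bio.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct demo_ctx {
	spinlock_t lock;
	struct bio_list deferred;
	struct workqueue_struct *wq;
	struct work_struct work;
};

static void demo_defer_bio(struct demo_ctx *ctx, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->lock, flags);
	bio_list_add(&ctx->deferred, bio);	/* FIFO append */
	spin_unlock_irqrestore(&ctx->lock, flags);

	queue_work(ctx->wq, &ctx->work);	/* kick the consumer work item */
}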
9a1fb464 2571static void dm_queue_flush(struct mapped_device *md)
304f3f6a 2572{
3b00b203 2573 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
4e857c58 2574 smp_mb__after_atomic();
53d5914f 2575 queue_work(md->wq, &md->work);
304f3f6a
MB
2576}
2577
1da177e4 2578/*
042d2a9b 2579 * Swap in a new table, returning the old one for the caller to destroy.
1da177e4 2580 */
042d2a9b 2581struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
1da177e4 2582{
87eb5b21 2583 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
754c5fc7 2584 struct queue_limits limits;
042d2a9b 2585 int r;
1da177e4 2586
e61290a4 2587 mutex_lock(&md->suspend_lock);
1da177e4
LT
2588
2589 /* device must be suspended */
4f186f8b 2590 if (!dm_suspended_md(md))
93c534ae 2591 goto out;
1da177e4 2592
3ae70656
MS
2593 /*
2594 * If the new table has no data devices, retain the existing limits.
 2595 * This helps multipath with queue_if_no_path: if all paths disappear,
2596 * then new I/O is queued based on these limits, and then some paths
2597 * reappear.
2598 */
2599 if (dm_table_has_no_data_devices(table)) {
83d5e5b0 2600 live_map = dm_get_live_table_fast(md);
3ae70656
MS
2601 if (live_map)
2602 limits = md->queue->limits;
83d5e5b0 2603 dm_put_live_table_fast(md);
3ae70656
MS
2604 }
2605
87eb5b21
MC
2606 if (!live_map) {
2607 r = dm_calculate_queue_limits(table, &limits);
2608 if (r) {
2609 map = ERR_PTR(r);
2610 goto out;
2611 }
042d2a9b 2612 }
754c5fc7 2613
042d2a9b 2614 map = __bind(md, table, &limits);
62e08243 2615 dm_issue_global_event();
1da177e4 2616
93c534ae 2617out:
e61290a4 2618 mutex_unlock(&md->suspend_lock);
042d2a9b 2619 return map;
1da177e4
LT
2620}
2621
2622/*
2623 * Functions to lock and unlock any filesystem running on the
2624 * device.
2625 */
2ca3310e 2626static int lock_fs(struct mapped_device *md)
1da177e4 2627{
e39e2e95 2628 int r;
1da177e4 2629
040f04bd 2630 WARN_ON(test_bit(DMF_FROZEN, &md->flags));
aa8d7c2f 2631
977115c0 2632 r = freeze_bdev(md->disk->part0);
040f04bd
CH
2633 if (!r)
2634 set_bit(DMF_FROZEN, &md->flags);
2635 return r;
1da177e4
LT
2636}
2637
2ca3310e 2638static void unlock_fs(struct mapped_device *md)
1da177e4 2639{
aa8d7c2f
AK
2640 if (!test_bit(DMF_FROZEN, &md->flags))
2641 return;
977115c0 2642 thaw_bdev(md->disk->part0);
aa8d7c2f 2643 clear_bit(DMF_FROZEN, &md->flags);
1da177e4
LT
2644}
2645
2646/*
b48633f8
BVA
2647 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2648 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2649 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2650 *
ffcc3936
MS
2651 * If __dm_suspend returns 0, the device is completely quiescent
2652 * now. There is no request-processing activity. All new requests
2653 * are being added to md->deferred list.
cec47e3d 2654 */
ffcc3936 2655static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
86a3238c 2656 unsigned int suspend_flags, unsigned int task_state,
eaf9a736 2657 int dmf_suspended_flag)
1da177e4 2658{
ffcc3936
MS
2659 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2660 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2661 int r;
1da177e4 2662
5a8f1f80
BVA
2663 lockdep_assert_held(&md->suspend_lock);
2664
2e93ccc1
KU
2665 /*
2666 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2667 * This flag is cleared before dm_suspend returns.
2668 */
2669 if (noflush)
2670 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
86331f39 2671 else
ac75b09f 2672 DMDEBUG("%s: suspending with flush", dm_device_name(md));
2e93ccc1 2673
d67ee213
MS
2674 /*
2675 * This gets reverted if there's an error later and the targets
2676 * provide the .presuspend_undo hook.
2677 */
cf222b37
AK
2678 dm_table_presuspend_targets(map);
2679
32a926da 2680 /*
9f518b27
KU
2681 * Flush I/O to the device.
2682 * Any I/O submitted after lock_fs() may not be flushed.
2683 * noflush takes precedence over do_lockfs.
2684 * (lock_fs() flushes I/Os and waits for them to complete.)
32a926da
MP
2685 */
2686 if (!noflush && do_lockfs) {
2687 r = lock_fs(md);
d67ee213
MS
2688 if (r) {
2689 dm_table_presuspend_undo_targets(map);
ffcc3936 2690 return r;
d67ee213 2691 }
aa8d7c2f 2692 }
1da177e4
LT
2693
2694 /*
3b00b203
MP
2695 * Here we must make sure that no processes are submitting requests
2696 * to target drivers i.e. no one may be executing
96c9865c 2697 * dm_split_and_process_bio from dm_submit_bio.
3b00b203 2698 *
96c9865c 2699 * To get all processes out of dm_split_and_process_bio in dm_submit_bio,
3b00b203 2700 * we take the write lock. To prevent any process from reentering
96c9865c 2701 * dm_split_and_process_bio from dm_submit_bio and quiesce the thread
0cede372 2702 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
6a8736d1 2703 * flush_workqueue(md->wq).
1da177e4 2704 */
1eb787ec 2705 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
41abc4e1
HR
2706 if (map)
2707 synchronize_srcu(&md->io_barrier);
1da177e4 2708
d0bcb878 2709 /*
29e4013d
TH
2710 * Stop md->queue before flushing md->wq in case request-based
2711 * dm defers requests to md->wq from md->queue.
d0bcb878 2712 */
6a23e05c 2713 if (dm_request_based(md))
eca7ee6d 2714 dm_stop_queue(md->queue);
cec47e3d 2715
d0bcb878
KU
2716 flush_workqueue(md->wq);
2717
1da177e4 2718 /*
3b00b203
MP
2719 * At this point no more requests are entering target request routines.
2720 * We call dm_wait_for_completion to wait for all existing requests
2721 * to finish.
1da177e4 2722 */
b48633f8 2723 r = dm_wait_for_completion(md, task_state);
eaf9a736
MS
2724 if (!r)
2725 set_bit(dmf_suspended_flag, &md->flags);
1da177e4 2726
6d6f10df 2727 if (noflush)
022c2611 2728 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
41abc4e1
HR
2729 if (map)
2730 synchronize_srcu(&md->io_barrier);
2e93ccc1 2731
1da177e4 2732 /* were we interrupted ? */
46125c1c 2733 if (r < 0) {
9a1fb464 2734 dm_queue_flush(md);
73d410c0 2735
cec47e3d 2736 if (dm_request_based(md))
eca7ee6d 2737 dm_start_queue(md->queue);
cec47e3d 2738
2ca3310e 2739 unlock_fs(md);
d67ee213 2740 dm_table_presuspend_undo_targets(map);
ffcc3936 2741 /* pushback list is already flushed, so skip flush */
2ca3310e 2742 }
1da177e4 2743
ffcc3936
MS
2744 return r;
2745}
2746
2747/*
2748 * We need to be able to change a mapping table under a mounted
 2749 * filesystem. For example, we might want to move some data in
 2750 * the background. Before the table can be swapped with
 2751 * dm_bind_table, dm_suspend must be called to flush any in-flight
 2752 * bios and ensure that any further I/O gets deferred.
2753 */
2754/*
2755 * Suspend mechanism in request-based dm.
2756 *
2757 * 1. Flush all I/Os by lock_fs() if needed.
2758 * 2. Stop dispatching any I/O by stopping the request_queue.
2759 * 3. Wait for all in-flight I/Os to be completed or requeued.
2760 *
2761 * To abort suspend, start the request_queue.
2762 */
86a3238c 2763int dm_suspend(struct mapped_device *md, unsigned int suspend_flags)
ffcc3936
MS
2764{
2765 struct dm_table *map = NULL;
2766 int r = 0;
2767
2768retry:
2769 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2770
2771 if (dm_suspended_md(md)) {
2772 r = -EINVAL;
2773 goto out_unlock;
2774 }
2775
2776 if (dm_suspended_internally_md(md)) {
2777 /* already internally suspended, wait for internal resume */
2778 mutex_unlock(&md->suspend_lock);
2779 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2780 if (r)
2781 return r;
2782 goto retry;
2783 }
2784
a12f5d48 2785 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
ffcc3936 2786
eaf9a736 2787 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
ffcc3936
MS
2788 if (r)
2789 goto out_unlock;
3b00b203 2790
5df96f2b 2791 set_bit(DMF_POST_SUSPENDING, &md->flags);
4d4471cb 2792 dm_table_postsuspend_targets(map);
5df96f2b 2793 clear_bit(DMF_POST_SUSPENDING, &md->flags);
4d4471cb 2794
d287483d 2795out_unlock:
e61290a4 2796 mutex_unlock(&md->suspend_lock);
cf222b37 2797 return r;
1da177e4
LT
2798}
2799
ffcc3936
MS
2800static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2801{
2802 if (map) {
2803 int r = dm_table_resume_targets(map);
2804 if (r)
2805 return r;
2806 }
2807
2808 dm_queue_flush(md);
2809
2810 /*
2811 * Flushing deferred I/Os must be done after targets are resumed
2812 * so that mapping of targets can work correctly.
2813 * Request-based dm is queueing the deferred I/Os in its request_queue.
2814 */
2815 if (dm_request_based(md))
eca7ee6d 2816 dm_start_queue(md->queue);
ffcc3936
MS
2817
2818 unlock_fs(md);
2819
2820 return 0;
2821}
2822
1da177e4
LT
2823int dm_resume(struct mapped_device *md)
2824{
8dc23658 2825 int r;
cf222b37 2826 struct dm_table *map = NULL;
1da177e4 2827
ffcc3936 2828retry:
8dc23658 2829 r = -EINVAL;
ffcc3936
MS
2830 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2831
4f186f8b 2832 if (!dm_suspended_md(md))
cf222b37 2833 goto out;
cf222b37 2834
ffcc3936
MS
2835 if (dm_suspended_internally_md(md)) {
2836 /* already internally suspended, wait for internal resume */
2837 mutex_unlock(&md->suspend_lock);
2838 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2839 if (r)
2840 return r;
2841 goto retry;
2842 }
2843
a12f5d48 2844 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2ca3310e 2845 if (!map || !dm_table_get_size(map))
cf222b37 2846 goto out;
1da177e4 2847
ffcc3936 2848 r = __dm_resume(md, map);
8757b776
MB
2849 if (r)
2850 goto out;
2ca3310e 2851
2ca3310e 2852 clear_bit(DMF_SUSPENDED, &md->flags);
cf222b37 2853out:
e61290a4 2854 mutex_unlock(&md->suspend_lock);
2ca3310e 2855
cf222b37 2856 return r;
1da177e4
LT
2857}
2858
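/*
 * [Editor's sketch, not part of dm.c] Rough shape of how these entry points
 * compose when a table is replaced (the dm-ioctl layer drives a sequence of
 * roughly this kind); error handling is reduced to the minimum and the
 * helper name is hypothetical:
 */
static int demo_replace_table(struct mapped_device *md, struct dm_table *t)
{
	struct dm_table *old_map;
	int r;

	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
	if (r)
		return r;

	old_map = dm_swap_table(md, t);		/* device must be suspended */
	if (IS_ERR(old_map)) {
		dm_resume(md);
		return PTR_ERR(old_map);
	}

	r = dm_resume(md);
	if (old_map)
		dm_table_destroy(old_map);	/* old table is now the caller's to free */
	return r;
}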
fd2ed4d2
MP
2859/*
2860 * Internal suspend/resume works like userspace-driven suspend. It waits
2861 * until all bios finish and prevents issuing new bios to the target drivers.
2862 * It may be used only from the kernel.
fd2ed4d2
MP
2863 */
2864
86a3238c 2865static void __dm_internal_suspend(struct mapped_device *md, unsigned int suspend_flags)
fd2ed4d2 2866{
ffcc3936
MS
2867 struct dm_table *map = NULL;
2868
1ea0654e
BVA
2869 lockdep_assert_held(&md->suspend_lock);
2870
96b26c8c 2871 if (md->internal_suspend_count++)
ffcc3936
MS
2872 return; /* nested internal suspend */
2873
2874 if (dm_suspended_md(md)) {
2875 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2876 return; /* nest suspend */
2877 }
2878
a12f5d48 2879 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
ffcc3936
MS
2880
2881 /*
2882 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2883 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
2884 * would require changing .presuspend to return an error -- avoid this
2885 * until there is a need for more elaborate variants of internal suspend.
2886 */
eaf9a736
MS
2887 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2888 DMF_SUSPENDED_INTERNALLY);
ffcc3936 2889
5df96f2b 2890 set_bit(DMF_POST_SUSPENDING, &md->flags);
ffcc3936 2891 dm_table_postsuspend_targets(map);
5df96f2b 2892 clear_bit(DMF_POST_SUSPENDING, &md->flags);
ffcc3936
MS
2893}
2894
2895static void __dm_internal_resume(struct mapped_device *md)
2896{
96b26c8c
MP
2897 BUG_ON(!md->internal_suspend_count);
2898
2899 if (--md->internal_suspend_count)
ffcc3936
MS
2900 return; /* resume from nested internal suspend */
2901
fd2ed4d2 2902 if (dm_suspended_md(md))
ffcc3936
MS
2903 goto done; /* resume from nested suspend */
2904
2905 /*
2906 * NOTE: existing callers don't need to call dm_table_resume_targets
2907 * (which may fail -- so best to avoid it for now by passing NULL map)
2908 */
2909 (void) __dm_resume(md, NULL);
2910
2911done:
2912 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2913 smp_mb__after_atomic();
2914 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2915}
2916
2917void dm_internal_suspend_noflush(struct mapped_device *md)
2918{
2919 mutex_lock(&md->suspend_lock);
2920 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2921 mutex_unlock(&md->suspend_lock);
2922}
2923EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2924
2925void dm_internal_resume(struct mapped_device *md)
2926{
2927 mutex_lock(&md->suspend_lock);
2928 __dm_internal_resume(md);
2929 mutex_unlock(&md->suspend_lock);
2930}
2931EXPORT_SYMBOL_GPL(dm_internal_resume);
2932
2933/*
2934 * Fast variants of internal suspend/resume hold md->suspend_lock,
2935 * which prevents interaction with userspace-driven suspend.
2936 */
2937
2938void dm_internal_suspend_fast(struct mapped_device *md)
2939{
2940 mutex_lock(&md->suspend_lock);
2941 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
fd2ed4d2
MP
2942 return;
2943
2944 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2945 synchronize_srcu(&md->io_barrier);
2946 flush_workqueue(md->wq);
2947 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2948}
b735fede 2949EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
fd2ed4d2 2950
ffcc3936 2951void dm_internal_resume_fast(struct mapped_device *md)
fd2ed4d2 2952{
ffcc3936 2953 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
fd2ed4d2
MP
2954 goto done;
2955
2956 dm_queue_flush(md);
2957
2958done:
2959 mutex_unlock(&md->suspend_lock);
2960}
b735fede 2961EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
fd2ed4d2 2962
1da177e4
LT
2963/*-----------------------------------------------------------------
2964 * Event notification.
2965 *---------------------------------------------------------------*/
3abf85b5 2966int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
86a3238c 2967 unsigned int cookie, bool need_resize_uevent)
69267a30 2968{
6958c1c6 2969 int r;
86a3238c 2970 unsigned int noio_flag;
60935eb2 2971 char udev_cookie[DM_COOKIE_LENGTH];
7533afa1
MP
2972 char *envp[3] = { NULL, NULL, NULL };
2973 char **envpp = envp;
2974 if (cookie) {
60935eb2
MB
2975 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2976 DM_COOKIE_ENV_VAR_NAME, cookie);
7533afa1 2977 *envpp++ = udev_cookie;
60935eb2 2978 }
7533afa1
MP
2979 if (need_resize_uevent) {
2980 *envpp++ = "RESIZE=1";
2981 }
2982
2983 noio_flag = memalloc_noio_save();
2984
2985 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
6958c1c6
MP
2986
2987 memalloc_noio_restore(noio_flag);
2988
2989 return r;
69267a30
AK
2990}
2991
7a8c3d3b
MA
2992uint32_t dm_next_uevent_seq(struct mapped_device *md)
2993{
2994 return atomic_add_return(1, &md->uevent_seq);
2995}
2996
1da177e4
LT
2997uint32_t dm_get_event_nr(struct mapped_device *md)
2998{
2999 return atomic_read(&md->event_nr);
3000}
3001
3002int dm_wait_event(struct mapped_device *md, int event_nr)
3003{
3004 return wait_event_interruptible(md->eventq,
3005 (event_nr != atomic_read(&md->event_nr)));
3006}
3007
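/*
 * [Editor's sketch, not part of dm.c] Usage shape for the event counter
 * above: a waiter samples the current event number and then blocks until it
 * changes (the DM_DEV_WAIT ioctl path follows roughly this shape; the demo
 * function is hypothetical):
 */
static int demo_wait_for_next_event(struct mapped_device *md)
{
	uint32_t last = dm_get_event_nr(md);

	/* returns 0 once the event number changes, -ERESTARTSYS on signal */
	return dm_wait_event(md, last);
}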
7a8c3d3b
MA
3008void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
3009{
3010 unsigned long flags;
3011
3012 spin_lock_irqsave(&md->uevent_lock, flags);
3013 list_add(elist, &md->uevent_list);
3014 spin_unlock_irqrestore(&md->uevent_lock, flags);
3015}
3016
1da177e4
LT
3017/*
3018 * The gendisk is only valid as long as you have a reference
3019 * count on 'md'.
3020 */
3021struct gendisk *dm_disk(struct mapped_device *md)
3022{
3023 return md->disk;
3024}
65ff5b7d 3025EXPORT_SYMBOL_GPL(dm_disk);
1da177e4 3026
784aae73
MB
3027struct kobject *dm_kobject(struct mapped_device *md)
3028{
2995fa78 3029 return &md->kobj_holder.kobj;
784aae73
MB
3030}
3031
784aae73
MB
3032struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
3033{
3034 struct mapped_device *md;
3035
2995fa78 3036 md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
784aae73 3037
b9a41d21
HT
3038 spin_lock(&_minor_lock);
3039 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
3040 md = NULL;
3041 goto out;
3042 }
784aae73 3043 dm_get(md);
b9a41d21
HT
3044out:
3045 spin_unlock(&_minor_lock);
3046
784aae73
MB
3047 return md;
3048}
3049
4f186f8b 3050int dm_suspended_md(struct mapped_device *md)
1da177e4
LT
3051{
3052 return test_bit(DMF_SUSPENDED, &md->flags);
3053}
3054
5df96f2b
MP
3055static int dm_post_suspending_md(struct mapped_device *md)
3056{
3057 return test_bit(DMF_POST_SUSPENDING, &md->flags);
3058}
3059
ffcc3936
MS
3060int dm_suspended_internally_md(struct mapped_device *md)
3061{
3062 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3063}
3064
2c140a24
MP
3065int dm_test_deferred_remove_flag(struct mapped_device *md)
3066{
3067 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
3068}
3069
64dbce58
KU
3070int dm_suspended(struct dm_target *ti)
3071{
33bd6f06 3072 return dm_suspended_md(ti->table->md);
64dbce58
KU
3073}
3074EXPORT_SYMBOL_GPL(dm_suspended);
3075
5df96f2b
MP
3076int dm_post_suspending(struct dm_target *ti)
3077{
33bd6f06 3078 return dm_post_suspending_md(ti->table->md);
5df96f2b
MP
3079}
3080EXPORT_SYMBOL_GPL(dm_post_suspending);
3081
2e93ccc1
KU
3082int dm_noflush_suspending(struct dm_target *ti)
3083{
33bd6f06 3084 return __noflush_suspending(ti->table->md);
2e93ccc1
KU
3085}
3086EXPORT_SYMBOL_GPL(dm_noflush_suspending);
3087
e6ee8c0b
KU
3088void dm_free_md_mempools(struct dm_md_mempools *pools)
3089{
3090 if (!pools)
3091 return;
3092
6f1c819c
KO
3093 bioset_exit(&pools->bs);
3094 bioset_exit(&pools->io_bs);
e6ee8c0b
KU
3095
3096 kfree(pools);
3097}
3098
9c72bad1
CH
3099struct dm_pr {
3100 u64 old_key;
3101 u64 new_key;
3102 u32 flags;
c6adada5 3103 bool abort;
9c72bad1 3104 bool fail_early;
8dd87f3c 3105 int ret;
70151087 3106 enum pr_type type;
9c72bad1
CH
3107};
3108
3109static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
8dd87f3c 3110 struct dm_pr *pr)
71cdb697
CH
3111{
3112 struct mapped_device *md = bdev->bd_disk->private_data;
9c72bad1
CH
3113 struct dm_table *table;
3114 struct dm_target *ti;
3115 int ret = -ENOTTY, srcu_idx;
71cdb697 3116
9c72bad1
CH
3117 table = dm_get_live_table(md, &srcu_idx);
3118 if (!table || !dm_table_get_size(table))
3119 goto out;
71cdb697 3120
9c72bad1 3121 /* We only support devices that have a single target */
2aec377a 3122 if (table->num_targets != 1)
9c72bad1
CH
3123 goto out;
3124 ti = dm_table_get_target(table, 0);
71cdb697 3125
e120a5f1
MS
3126 if (dm_suspended_md(md)) {
3127 ret = -EAGAIN;
3128 goto out;
3129 }
3130
9c72bad1
CH
3131 ret = -EINVAL;
3132 if (!ti->type->iterate_devices)
3133 goto out;
3134
8dd87f3c
MC
3135 ti->type->iterate_devices(ti, fn, pr);
3136 ret = 0;
9c72bad1
CH
3137out:
3138 dm_put_live_table(md, srcu_idx);
3139 return ret;
3140}
3141
3142/*
3143 * For register / unregister we need to manually call out to every path.
3144 */
3145static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
3146 sector_t start, sector_t len, void *data)
3147{
3148 struct dm_pr *pr = data;
3149 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
8dd87f3c
MC
3150 int ret;
3151
3152 if (!ops || !ops->pr_register) {
3153 pr->ret = -EOPNOTSUPP;
3154 return -1;
3155 }
9c72bad1 3156
8dd87f3c
MC
3157 ret = ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
3158 if (!ret)
3159 return 0;
3160
3161 if (!pr->ret)
3162 pr->ret = ret;
9c72bad1 3163
8dd87f3c
MC
3164 if (pr->fail_early)
3165 return -1;
3166
3167 return 0;
9c72bad1
CH
3168}
3169
3170static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3171 u32 flags)
3172{
3173 struct dm_pr pr = {
3174 .old_key = old_key,
3175 .new_key = new_key,
3176 .flags = flags,
3177 .fail_early = true,
8dd87f3c 3178 .ret = 0,
9c72bad1
CH
3179 };
3180 int ret;
3181
3182 ret = dm_call_pr(bdev, __dm_pr_register, &pr);
8dd87f3c
MC
3183 if (ret) {
3184 /* Didn't even get to register a path */
3185 return ret;
9c72bad1
CH
3186 }
3187
8dd87f3c
MC
3188 if (!pr.ret)
3189 return 0;
3190 ret = pr.ret;
3191
3192 if (!new_key)
3193 return ret;
3194
3195 /* unregister all paths if we failed to register any path */
3196 pr.old_key = new_key;
3197 pr.new_key = 0;
3198 pr.flags = 0;
3199 pr.fail_early = false;
3200 (void) dm_call_pr(bdev, __dm_pr_register, &pr);
9c72bad1 3201 return ret;
71cdb697
CH
3202}
3203
70151087
MC
3204
3205static int __dm_pr_reserve(struct dm_target *ti, struct dm_dev *dev,
3206 sector_t start, sector_t len, void *data)
3207{
3208 struct dm_pr *pr = data;
3209 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3210
3211 if (!ops || !ops->pr_reserve) {
3212 pr->ret = -EOPNOTSUPP;
3213 return -1;
3214 }
3215
3216 pr->ret = ops->pr_reserve(dev->bdev, pr->old_key, pr->type, pr->flags);
3217 if (!pr->ret)
3218 return -1;
3219
3220 return 0;
3221}
3222
71cdb697 3223static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
956a4025 3224 u32 flags)
71cdb697 3225{
70151087
MC
3226 struct dm_pr pr = {
3227 .old_key = key,
3228 .flags = flags,
3229 .type = type,
3230 .fail_early = false,
3231 .ret = 0,
3232 };
3233 int ret;
71cdb697 3234
70151087
MC
3235 ret = dm_call_pr(bdev, __dm_pr_reserve, &pr);
3236 if (ret)
3237 return ret;
71cdb697 3238
70151087 3239 return pr.ret;
71cdb697
CH
3240}
3241
08a3c338
MC
3242/*
3243 * If there is a non-All Registrants type of reservation, the release must be
 3244 * sent down the holding path. In the cases where there is no reservation, or
 3245 * the path is not the holder, the device will also return success, so we must
 3246 * try each path to make sure we hit the correct one.
3247 */
3248static int __dm_pr_release(struct dm_target *ti, struct dm_dev *dev,
3249 sector_t start, sector_t len, void *data)
3250{
3251 struct dm_pr *pr = data;
3252 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3253
3254 if (!ops || !ops->pr_release) {
3255 pr->ret = -EOPNOTSUPP;
3256 return -1;
3257 }
3258
3259 pr->ret = ops->pr_release(dev->bdev, pr->old_key, pr->type);
3260 if (pr->ret)
3261 return -1;
3262
3263 return 0;
71cdb697
CH
3264}
3265
3266static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3267{
08a3c338
MC
3268 struct dm_pr pr = {
3269 .old_key = key,
3270 .type = type,
3271 .fail_early = false,
3272 };
3273 int ret;
71cdb697 3274
08a3c338
MC
3275 ret = dm_call_pr(bdev, __dm_pr_release, &pr);
3276 if (ret)
3277 return ret;
71cdb697 3278
08a3c338 3279 return pr.ret;
71cdb697
CH
3280}
3281
c6adada5
MC
3282static int __dm_pr_preempt(struct dm_target *ti, struct dm_dev *dev,
3283 sector_t start, sector_t len, void *data)
3284{
3285 struct dm_pr *pr = data;
3286 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3287
3288 if (!ops || !ops->pr_preempt) {
3289 pr->ret = -EOPNOTSUPP;
3290 return -1;
3291 }
3292
3293 pr->ret = ops->pr_preempt(dev->bdev, pr->old_key, pr->new_key, pr->type,
3294 pr->abort);
3295 if (!pr->ret)
3296 return -1;
3297
3298 return 0;
71cdb697
CH
3299}
3300
3301static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
956a4025 3302 enum pr_type type, bool abort)
71cdb697 3303{
c6adada5
MC
3304 struct dm_pr pr = {
3305 .new_key = new_key,
3306 .old_key = old_key,
3307 .type = type,
3308 .fail_early = false,
3309 };
3310 int ret;
71cdb697 3311
c6adada5
MC
3312 ret = dm_call_pr(bdev, __dm_pr_preempt, &pr);
3313 if (ret)
3314 return ret;
71cdb697 3315
c6adada5 3316 return pr.ret;
71cdb697
CH
3317}
3318
3319static int dm_pr_clear(struct block_device *bdev, u64 key)
3320{
3321 struct mapped_device *md = bdev->bd_disk->private_data;
3322 const struct pr_ops *ops;
971888c4 3323 int r, srcu_idx;
71cdb697 3324
5bd5e8d8 3325 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
71cdb697 3326 if (r < 0)
971888c4 3327 goto out;
71cdb697
CH
3328
3329 ops = bdev->bd_disk->fops->pr_ops;
3330 if (ops && ops->pr_clear)
3331 r = ops->pr_clear(bdev, key);
3332 else
3333 r = -EOPNOTSUPP;
971888c4
MS
3334out:
3335 dm_unprepare_ioctl(md, srcu_idx);
71cdb697
CH
3336 return r;
3337}
3338
3339static const struct pr_ops dm_pr_ops = {
3340 .pr_register = dm_pr_register,
3341 .pr_reserve = dm_pr_reserve,
3342 .pr_release = dm_pr_release,
3343 .pr_preempt = dm_pr_preempt,
3344 .pr_clear = dm_pr_clear,
3345};
3346
83d5cde4 3347static const struct block_device_operations dm_blk_dops = {
c62b37d9 3348 .submit_bio = dm_submit_bio,
b99fdcdc 3349 .poll_bio = dm_poll_bio,
1da177e4
LT
3350 .open = dm_blk_open,
3351 .release = dm_blk_close,
aa129a22 3352 .ioctl = dm_blk_ioctl,
3ac51e74 3353 .getgeo = dm_blk_getgeo,
e76239a3 3354 .report_zones = dm_blk_report_zones,
71cdb697 3355 .pr_ops = &dm_pr_ops,
1da177e4
LT
3356 .owner = THIS_MODULE
3357};
3358
681cc5e8
MS
3359static const struct block_device_operations dm_rq_blk_dops = {
3360 .open = dm_blk_open,
3361 .release = dm_blk_close,
3362 .ioctl = dm_blk_ioctl,
3363 .getgeo = dm_blk_getgeo,
3364 .pr_ops = &dm_pr_ops,
3365 .owner = THIS_MODULE
3366};
3367
f26c5719
DW
3368static const struct dax_operations dm_dax_ops = {
3369 .direct_access = dm_dax_direct_access,
cdf6cdcd 3370 .zero_page_range = dm_dax_zero_page_range,
047218ec 3371 .recovery_write = dm_dax_recovery_write,
f26c5719
DW
3372};
3373
1da177e4
LT
3374/*
3375 * module hooks
3376 */
3377module_init(dm_init);
3378module_exit(dm_exit);
3379
3380module_param(major, uint, 0);
3381MODULE_PARM_DESC(major, "The major number of the device mapper");
f4790826 3382
e8603136
MS
3383module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3384MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3385
115485e8
MS
3386module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3387MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3388
a666e5c0
MP
3389module_param(swap_bios, int, S_IRUGO | S_IWUSR);
3390MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
3391
1da177e4
LT
3392MODULE_DESCRIPTION(DM_NAME " driver");
3393MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3394MODULE_LICENSE("GPL");