dm: return the clone bio from alloc_tio
[linux-block.git] / drivers / md / dm.c
1da177e4
LT
1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
784aae73 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
1da177e4
LT
4 *
5 * This file is released under the GPL.
6 */
7
4cc96131
MS
8#include "dm-core.h"
9#include "dm-rq.h"
51e5b2bd 10#include "dm-uevent.h"
91ccbbac 11#include "dm-ima.h"
1da177e4
LT
12
13#include <linux/init.h>
14#include <linux/module.h>
48c9c27b 15#include <linux/mutex.h>
6958c1c6 16#include <linux/sched/mm.h>
174cd4b1 17#include <linux/sched/signal.h>
1da177e4
LT
18#include <linux/blkpg.h>
19#include <linux/bio.h>
1da177e4 20#include <linux/mempool.h>
f26c5719 21#include <linux/dax.h>
1da177e4
LT
22#include <linux/slab.h>
23#include <linux/idr.h>
7e026c8c 24#include <linux/uio.h>
3ac51e74 25#include <linux/hdreg.h>
3f77316d 26#include <linux/delay.h>
ffcc3936 27#include <linux/wait.h>
71cdb697 28#include <linux/pr.h>
b0b4d7c6 29#include <linux/refcount.h>
c6a564ff 30#include <linux/part_stat.h>
a892c8d5 31#include <linux/blk-crypto.h>
1e8d44bd 32#include <linux/blk-crypto-profile.h>
55782138 33
72d94861
AK
34#define DM_MSG_PREFIX "core"
35
60935eb2
MB
36/*
37 * Cookies are numeric values sent with CHANGE and REMOVE
38 * uevents while resuming, removing or renaming the device.
39 */
40#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
41#define DM_COOKIE_LENGTH 24
42
1da177e4
LT
43static const char *_name = DM_NAME;
44
45static unsigned int major = 0;
46static unsigned int _major = 0;
47
d15b774c
AK
48static DEFINE_IDR(_minor_idr);
49
f32c10b0 50static DEFINE_SPINLOCK(_minor_lock);
2c140a24
MP
51
52static void do_deferred_remove(struct work_struct *w);
53
54static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
55
acfe0ad7
MP
56static struct workqueue_struct *deferred_remove_workqueue;
57
93e6442c
MP
58atomic_t dm_global_event_nr = ATOMIC_INIT(0);
59DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
60
62e08243
MP
61void dm_issue_global_event(void)
62{
63 atomic_inc(&dm_global_event_nr);
64 wake_up(&dm_global_eventq);
65}
66
1da177e4 67/*
64f52b0e 68 * One of these is allocated (on-stack) per original bio.
1da177e4 69 */
64f52b0e 70struct clone_info {
64f52b0e
MS
71 struct dm_table *map;
72 struct bio *bio;
73 struct dm_io *io;
74 sector_t sector;
75 unsigned sector_count;
76};
77
62f26317
JX
78#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
79#define DM_IO_BIO_OFFSET \
80 (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))
81
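/*
 * Layout implied by the offsets above (illustrative summary, not from the
 * original source).  For the clone embedded in struct dm_io
 * (tio->inside_dm_io) the allocation is roughly:
 *
 *	[ per-bio data ][ struct dm_io { ... struct dm_target_io tio { ... clone } } ]
 *
 * and for extra clones allocated from the md->bs bio_set:
 *
 *	[ per-bio data ][ struct dm_target_io { ... clone } ]
 *
 * dm_per_bio_data() below walks backwards from the clone bio by the matching
 * offset plus the requested data size.
 */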
6c23f0bd
CH
82static inline struct dm_target_io *clone_to_tio(struct bio *clone)
83{
84 return container_of(clone, struct dm_target_io, clone);
85}
86
64f52b0e
MS
87void *dm_per_bio_data(struct bio *bio, size_t data_size)
88{
6c23f0bd 89 if (!clone_to_tio(bio)->inside_dm_io)
62f26317
JX
90 return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
91 return (char *)bio - DM_IO_BIO_OFFSET - data_size;
64f52b0e
MS
92}
93EXPORT_SYMBOL_GPL(dm_per_bio_data);
94
95struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
96{
97 struct dm_io *io = (struct dm_io *)((char *)data + data_size);
98 if (io->magic == DM_IO_MAGIC)
62f26317 99 return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
64f52b0e 100 BUG_ON(io->magic != DM_TIO_MAGIC);
62f26317 101 return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
64f52b0e
MS
102}
103EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
104
105unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
106{
107 return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
108}
109EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
110
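/*
 * A minimal usage sketch of the helpers above (illustrative, not part of
 * dm.c): struct my_pb and my_map() are hypothetical, and the target's
 * constructor is assumed to have set ti->per_io_data_size = sizeof(struct my_pb).
 */
struct my_pb {
	sector_t orig_sector;
};

static int my_map(struct dm_target *ti, struct bio *bio)
{
	struct my_pb *pb = dm_per_bio_data(bio, sizeof(struct my_pb));

	pb->orig_sector = bio->bi_iter.bi_sector;
	/* The mapping is reversible, e.g. from a completion path: */
	WARN_ON_ONCE(dm_bio_from_per_bio_data(pb, sizeof(struct my_pb)) != bio);

	/* ... remap the bio to an underlying device here ... */
	return DM_MAPIO_REMAPPED;
}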
ba61fdd1
JM
111#define MINOR_ALLOCED ((void *)-1)
112
115485e8 113#define DM_NUMA_NODE NUMA_NO_NODE
115485e8 114static int dm_numa_node = DM_NUMA_NODE;
faad87df 115
a666e5c0
MP
116#define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE)
117static int swap_bios = DEFAULT_SWAP_BIOS;
118static int get_swap_bios(void)
119{
120 int latch = READ_ONCE(swap_bios);
121 if (unlikely(latch <= 0))
122 latch = DEFAULT_SWAP_BIOS;
123 return latch;
124}
125
e6ee8c0b
KU
126/*
127 * For mempools pre-allocation at the table loading time.
128 */
129struct dm_md_mempools {
6f1c819c
KO
130 struct bio_set bs;
131 struct bio_set io_bs;
e6ee8c0b
KU
132};
133
86f1152b
BM
134struct table_device {
135 struct list_head list;
b0b4d7c6 136 refcount_t count;
86f1152b
BM
137 struct dm_dev dm_dev;
138};
139
e8603136
MS
140/*
141 * Bio-based DM's mempools' reserved IOs set by the user.
142 */
4cc96131 143#define RESERVED_BIO_BASED_IOS 16
e8603136
MS
144static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
145
115485e8
MS
146static int __dm_get_module_param_int(int *module_param, int min, int max)
147{
6aa7de05 148 int param = READ_ONCE(*module_param);
115485e8
MS
149 int modified_param = 0;
150 bool modified = true;
151
152 if (param < min)
153 modified_param = min;
154 else if (param > max)
155 modified_param = max;
156 else
157 modified = false;
158
159 if (modified) {
160 (void)cmpxchg(module_param, param, modified_param);
161 param = modified_param;
162 }
163
164 return param;
165}
166
4cc96131
MS
167unsigned __dm_get_module_param(unsigned *module_param,
168 unsigned def, unsigned max)
f4790826 169{
6aa7de05 170 unsigned param = READ_ONCE(*module_param);
09c2d531 171 unsigned modified_param = 0;
f4790826 172
09c2d531
MS
173 if (!param)
174 modified_param = def;
175 else if (param > max)
176 modified_param = max;
f4790826 177
09c2d531
MS
178 if (modified_param) {
179 (void)cmpxchg(module_param, param, modified_param);
180 param = modified_param;
f4790826
MS
181 }
182
09c2d531 183 return param;
f4790826
MS
184}
185
e8603136
MS
186unsigned dm_get_reserved_bio_based_ios(void)
187{
09c2d531 188 return __dm_get_module_param(&reserved_bio_based_ios,
4cc96131 189 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
e8603136
MS
190}
191EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
192
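/*
 * Sketch of how a new tunable would typically be wired through
 * __dm_get_module_param() (illustrative only; "example_ios" and its bounds
 * do not exist in dm.c):
 */
static unsigned example_ios = 32;

static unsigned dm_get_example_ios(void)
{
	/* 0 falls back to the default of 32; values above 1024 are clamped. */
	return __dm_get_module_param(&example_ios, 32, 1024);
}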
115485e8
MS
193static unsigned dm_get_numa_node(void)
194{
195 return __dm_get_module_param_int(&dm_numa_node,
196 DM_NUMA_NODE, num_online_nodes() - 1);
197}
198
1da177e4
LT
199static int __init local_init(void)
200{
e689fbab 201 int r;
1ae49ea2 202
51e5b2bd 203 r = dm_uevent_init();
51157b4a 204 if (r)
e689fbab 205 return r;
51e5b2bd 206
acfe0ad7
MP
207 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
208 if (!deferred_remove_workqueue) {
209 r = -ENOMEM;
210 goto out_uevent_exit;
211 }
212
1da177e4
LT
213 _major = major;
214 r = register_blkdev(_major, _name);
51157b4a 215 if (r < 0)
acfe0ad7 216 goto out_free_workqueue;
1da177e4
LT
217
218 if (!_major)
219 _major = r;
220
221 return 0;
51157b4a 222
acfe0ad7
MP
223out_free_workqueue:
224 destroy_workqueue(deferred_remove_workqueue);
51157b4a
KU
225out_uevent_exit:
226 dm_uevent_exit();
51157b4a
KU
227
228 return r;
1da177e4
LT
229}
230
231static void local_exit(void)
232{
2c140a24 233 flush_scheduled_work();
acfe0ad7 234 destroy_workqueue(deferred_remove_workqueue);
2c140a24 235
00d59405 236 unregister_blkdev(_major, _name);
51e5b2bd 237 dm_uevent_exit();
1da177e4
LT
238
239 _major = 0;
240
241 DMINFO("cleaned up");
242}
243
b9249e55 244static int (*_inits[])(void) __initdata = {
1da177e4
LT
245 local_init,
246 dm_target_init,
247 dm_linear_init,
248 dm_stripe_init,
952b3557 249 dm_io_init,
945fa4d2 250 dm_kcopyd_init,
1da177e4 251 dm_interface_init,
fd2ed4d2 252 dm_statistics_init,
1da177e4
LT
253};
254
b9249e55 255static void (*_exits[])(void) = {
1da177e4
LT
256 local_exit,
257 dm_target_exit,
258 dm_linear_exit,
259 dm_stripe_exit,
952b3557 260 dm_io_exit,
945fa4d2 261 dm_kcopyd_exit,
1da177e4 262 dm_interface_exit,
fd2ed4d2 263 dm_statistics_exit,
1da177e4
LT
264};
265
266static int __init dm_init(void)
267{
268 const int count = ARRAY_SIZE(_inits);
1da177e4
LT
269 int r, i;
270
f1cd6cb2
TS
271#if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE))
272 DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled."
273 " Duplicate IMA measurements will not be recorded in the IMA log.");
274#endif
275
1da177e4
LT
276 for (i = 0; i < count; i++) {
277 r = _inits[i]();
278 if (r)
279 goto bad;
280 }
281
282 return 0;
f1cd6cb2 283bad:
1da177e4
LT
284 while (i--)
285 _exits[i]();
286
287 return r;
288}
289
290static void __exit dm_exit(void)
291{
292 int i = ARRAY_SIZE(_exits);
293
294 while (i--)
295 _exits[i]();
d15b774c
AK
296
297 /*
298 * Should be empty by this point.
299 */
d15b774c 300 idr_destroy(&_minor_idr);
1da177e4
LT
301}
302
303/*
304 * Block device functions
305 */
432a212c
MA
306int dm_deleting_md(struct mapped_device *md)
307{
308 return test_bit(DMF_DELETING, &md->flags);
309}
310
fe5f9f2c 311static int dm_blk_open(struct block_device *bdev, fmode_t mode)
1da177e4
LT
312{
313 struct mapped_device *md;
314
fba9f90e
JM
315 spin_lock(&_minor_lock);
316
fe5f9f2c 317 md = bdev->bd_disk->private_data;
fba9f90e
JM
318 if (!md)
319 goto out;
320
5c6bd75d 321 if (test_bit(DMF_FREEING, &md->flags) ||
432a212c 322 dm_deleting_md(md)) {
fba9f90e
JM
323 md = NULL;
324 goto out;
325 }
326
1da177e4 327 dm_get(md);
5c6bd75d 328 atomic_inc(&md->open_count);
fba9f90e
JM
329out:
330 spin_unlock(&_minor_lock);
331
332 return md ? 0 : -ENXIO;
1da177e4
LT
333}
334
db2a144b 335static void dm_blk_close(struct gendisk *disk, fmode_t mode)
1da177e4 336{
63a4f065 337 struct mapped_device *md;
6e9624b8 338
4a1aeb98
MB
339 spin_lock(&_minor_lock);
340
63a4f065
MS
341 md = disk->private_data;
342 if (WARN_ON(!md))
343 goto out;
344
2c140a24
MP
345 if (atomic_dec_and_test(&md->open_count) &&
346 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
acfe0ad7 347 queue_work(deferred_remove_workqueue, &deferred_remove_work);
2c140a24 348
1da177e4 349 dm_put(md);
63a4f065 350out:
4a1aeb98 351 spin_unlock(&_minor_lock);
1da177e4
LT
352}
353
5c6bd75d
AK
354int dm_open_count(struct mapped_device *md)
355{
356 return atomic_read(&md->open_count);
357}
358
359/*
360 * Guarantees nothing is using the device before it's deleted.
361 */
2c140a24 362int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
5c6bd75d
AK
363{
364 int r = 0;
365
366 spin_lock(&_minor_lock);
367
2c140a24 368 if (dm_open_count(md)) {
5c6bd75d 369 r = -EBUSY;
2c140a24
MP
370 if (mark_deferred)
371 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
372 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
373 r = -EEXIST;
5c6bd75d
AK
374 else
375 set_bit(DMF_DELETING, &md->flags);
376
377 spin_unlock(&_minor_lock);
378
379 return r;
380}
381
2c140a24
MP
382int dm_cancel_deferred_remove(struct mapped_device *md)
383{
384 int r = 0;
385
386 spin_lock(&_minor_lock);
387
388 if (test_bit(DMF_DELETING, &md->flags))
389 r = -EBUSY;
390 else
391 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
392
393 spin_unlock(&_minor_lock);
394
395 return r;
396}
397
398static void do_deferred_remove(struct work_struct *w)
399{
400 dm_deferred_remove();
401}
402
3ac51e74
DW
403static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
404{
405 struct mapped_device *md = bdev->bd_disk->private_data;
406
407 return dm_get_geometry(md, geo);
408}
409
971888c4 410static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
5bd5e8d8 411 struct block_device **bdev)
aa129a22 412{
66482026 413 struct dm_target *tgt;
6c182cd8 414 struct dm_table *map;
971888c4 415 int r;
aa129a22 416
6c182cd8 417retry:
e56f81e0 418 r = -ENOTTY;
971888c4 419 map = dm_get_live_table(md, srcu_idx);
aa129a22 420 if (!map || !dm_table_get_size(map))
971888c4 421 return r;
aa129a22
MB
422
423 /* We only support devices that have a single target */
424 if (dm_table_get_num_targets(map) != 1)
971888c4 425 return r;
aa129a22 426
66482026
MS
427 tgt = dm_table_get_target(map, 0);
428 if (!tgt->type->prepare_ioctl)
971888c4 429 return r;
519049af 430
971888c4
MS
431 if (dm_suspended_md(md))
432 return -EAGAIN;
aa129a22 433
5bd5e8d8 434 r = tgt->type->prepare_ioctl(tgt, bdev);
5bbbfdf6 435 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
971888c4 436 dm_put_live_table(md, *srcu_idx);
6c182cd8
HR
437 msleep(10);
438 goto retry;
439 }
971888c4 440
e56f81e0
CH
441 return r;
442}
443
971888c4 444static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
971888c4
MS
445{
446 dm_put_live_table(md, srcu_idx);
447}
448
e56f81e0
CH
449static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
450 unsigned int cmd, unsigned long arg)
451{
452 struct mapped_device *md = bdev->bd_disk->private_data;
971888c4 453 int r, srcu_idx;
e56f81e0 454
5bd5e8d8 455 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
e56f81e0 456 if (r < 0)
971888c4 457 goto out;
6c182cd8 458
e56f81e0
CH
459 if (r > 0) {
460 /*
e980f623
CH
461 * Target determined this ioctl is being issued against a
462 * subset of the parent bdev; require extra privileges.
e56f81e0 463 */
e980f623 464 if (!capable(CAP_SYS_RAWIO)) {
0378c625 465 DMDEBUG_LIMIT(
e980f623
CH
466 "%s: sending ioctl %x to DM device without required privilege.",
467 current->comm, cmd);
468 r = -ENOIOCTLCMD;
e56f81e0 469 goto out;
e980f623 470 }
e56f81e0 471 }
6c182cd8 472
a7cb3d2f
CH
473 if (!bdev->bd_disk->fops->ioctl)
474 r = -ENOTTY;
475 else
476 r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
e56f81e0 477out:
971888c4 478 dm_unprepare_ioctl(md, srcu_idx);
aa129a22
MB
479 return r;
480}
481
7465d7ac
MS
482u64 dm_start_time_ns_from_clone(struct bio *bio)
483{
6c23f0bd 484 return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
7465d7ac
MS
485}
486EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
487
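/*
 * Illustrative sketch (not part of dm.c): a hypothetical target end_io hook,
 * my_end_io(), using the helper above to measure per-bio latency relative to
 * DM's accounting start time.
 */
static int my_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
{
	u64 elapsed_ns = jiffies_to_nsecs(jiffies) - dm_start_time_ns_from_clone(bio);

	if (elapsed_ns > NSEC_PER_SEC)	/* flag bios that took longer than a second */
		DMWARN_LIMIT("slow bio: %llu ns", (unsigned long long)elapsed_ns);
	return DM_ENDIO_DONE;
}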
488static void start_io_acct(struct dm_io *io)
489{
490 struct mapped_device *md = io->md;
491 struct bio *bio = io->orig_bio;
492
b879f915 493 bio_start_io_acct_time(bio, io->start_time);
7465d7ac
MS
494 if (unlikely(dm_stats_used(&md->stats)))
495 dm_stats_account_io(&md->stats, bio_data_dir(bio),
496 bio->bi_iter.bi_sector, bio_sectors(bio),
497 false, 0, &io->stats_aux);
498}
499
d208b894
JL
500static void end_io_acct(struct mapped_device *md, struct bio *bio,
501 unsigned long start_time, struct dm_stats_aux *stats_aux)
7465d7ac 502{
d208b894 503 unsigned long duration = jiffies - start_time;
7465d7ac 504
d208b894 505 bio_end_io_acct(bio, start_time);
7465d7ac
MS
506
507 if (unlikely(dm_stats_used(&md->stats)))
508 dm_stats_account_io(&md->stats, bio_data_dir(bio),
509 bio->bi_iter.bi_sector, bio_sectors(bio),
d208b894 510 true, duration, stats_aux);
7465d7ac
MS
511
512 /* nudge anyone waiting on suspend queue */
513 if (unlikely(wq_has_sleeper(&md->wait)))
514 wake_up(&md->wait);
515}
978e51ba
MS
516
517static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
1da177e4 518{
64f52b0e
MS
519 struct dm_io *io;
520 struct dm_target_io *tio;
521 struct bio *clone;
522
609be106 523 clone = bio_alloc_bioset(NULL, 0, 0, GFP_NOIO, &md->io_bs);
64f52b0e 524
6c23f0bd 525 tio = clone_to_tio(clone);
64f52b0e
MS
526 tio->inside_dm_io = true;
527 tio->io = NULL;
528
529 io = container_of(tio, struct dm_io, tio);
530 io->magic = DM_IO_MAGIC;
978e51ba
MS
531 io->status = 0;
532 atomic_set(&io->io_count, 1);
533 io->orig_bio = bio;
534 io->md = md;
535 spin_lock_init(&io->endio_lock);
536
b879f915 537 io->start_time = jiffies;
64f52b0e
MS
538
539 return io;
1da177e4
LT
540}
541
028867ac 542static void free_io(struct mapped_device *md, struct dm_io *io)
1da177e4 543{
64f52b0e
MS
544 bio_put(&io->tio.clone);
545}
546
1d1068ce 547static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
dc8e2021 548 unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
64f52b0e
MS
549{
550 struct dm_target_io *tio;
551
552 if (!ci->io->tio.io) {
553 /* the dm_target_io embedded in ci->io is available */
554 tio = &ci->io->tio;
555 } else {
609be106
CH
556 struct bio *clone = bio_alloc_bioset(NULL, 0, 0, gfp_mask,
557 &ci->io->md->bs);
64f52b0e
MS
558 if (!clone)
559 return NULL;
560
6c23f0bd 561 tio = clone_to_tio(clone);
64f52b0e
MS
562 tio->inside_dm_io = false;
563 }
dc8e2021 564 __bio_clone_fast(&tio->clone, ci->bio);
64f52b0e
MS
565
566 tio->magic = DM_TIO_MAGIC;
567 tio->io = ci->io;
568 tio->ti = ti;
569 tio->target_bio_nr = target_bio_nr;
dc8e2021 570 tio->len_ptr = len;
64f52b0e 571
1d1068ce 572 return &tio->clone;
1da177e4
LT
573}
574
1d1068ce 575static void free_tio(struct bio *clone)
1da177e4 576{
1d1068ce 577 if (clone_to_tio(clone)->inside_dm_io)
64f52b0e 578 return;
1d1068ce 579 bio_put(clone);
1da177e4
LT
580}
581
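/*
 * Per the commit this listing belongs to ("dm: return the clone bio from
 * alloc_tio"), alloc_tio() hands back the clone bio (&tio->clone) rather
 * than the dm_target_io itself; callers such as __clone_and_map_data_bio()
 * and alloc_multiple_bios() recover the tio with clone_to_tio() when needed,
 * and free_tio() above likewise takes the clone.
 */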
582/*
583 * Add the bio to the list of deferred io.
584 */
92c63902 585static void queue_io(struct mapped_device *md, struct bio *bio)
1da177e4 586{
05447420 587 unsigned long flags;
1da177e4 588
05447420 589 spin_lock_irqsave(&md->deferred_lock, flags);
1da177e4 590 bio_list_add(&md->deferred, bio);
05447420 591 spin_unlock_irqrestore(&md->deferred_lock, flags);
6a8736d1 592 queue_work(md->wq, &md->work);
1da177e4
LT
593}
594
595/*
596 * Everyone (including functions in this file), should use this
597 * function to access the md->map field, and make sure they call
83d5e5b0 598 * dm_put_live_table() when finished.
1da177e4 599 */
83d5e5b0 600struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
1da177e4 601{
83d5e5b0
MP
602 *srcu_idx = srcu_read_lock(&md->io_barrier);
603
604 return srcu_dereference(md->map, &md->io_barrier);
605}
1da177e4 606
83d5e5b0
MP
607void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
608{
609 srcu_read_unlock(&md->io_barrier, srcu_idx);
610}
611
612void dm_sync_table(struct mapped_device *md)
613{
614 synchronize_srcu(&md->io_barrier);
615 synchronize_rcu_expedited();
616}
617
618/*
619 * A fast alternative to dm_get_live_table/dm_put_live_table.
620 * The caller must not block between these two functions.
621 */
622static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
623{
624 rcu_read_lock();
625 return rcu_dereference(md->map);
626}
1da177e4 627
83d5e5b0
MP
628static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
629{
630 rcu_read_unlock();
1da177e4
LT
631}
632
971888c4
MS
633static char *_dm_claim_ptr = "I belong to device-mapper";
634
86f1152b
BM
635/*
636 * Open a table device so we can use it as a map destination.
637 */
638static int open_table_device(struct table_device *td, dev_t dev,
639 struct mapped_device *md)
640{
86f1152b 641 struct block_device *bdev;
cd913c76 642 u64 part_off;
86f1152b
BM
643 int r;
644
645 BUG_ON(td->dm_dev.bdev);
646
519049af 647 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
86f1152b
BM
648 if (IS_ERR(bdev))
649 return PTR_ERR(bdev);
650
651 r = bd_link_disk_holder(bdev, dm_disk(md));
652 if (r) {
653 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
654 return r;
655 }
656
657 td->dm_dev.bdev = bdev;
cd913c76 658 td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
86f1152b
BM
659 return 0;
660}
661
662/*
663 * Close a table device that we've been using.
664 */
665static void close_table_device(struct table_device *td, struct mapped_device *md)
666{
667 if (!td->dm_dev.bdev)
668 return;
669
670 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
671 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
817bf402 672 put_dax(td->dm_dev.dax_dev);
86f1152b 673 td->dm_dev.bdev = NULL;
817bf402 674 td->dm_dev.dax_dev = NULL;
86f1152b
BM
675}
676
677static struct table_device *find_table_device(struct list_head *l, dev_t dev,
8454fca4
SS
678 fmode_t mode)
679{
86f1152b
BM
680 struct table_device *td;
681
682 list_for_each_entry(td, l, list)
683 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
684 return td;
685
686 return NULL;
687}
688
689int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
8454fca4
SS
690 struct dm_dev **result)
691{
86f1152b
BM
692 int r;
693 struct table_device *td;
694
695 mutex_lock(&md->table_devices_lock);
696 td = find_table_device(&md->table_devices, dev, mode);
697 if (!td) {
115485e8 698 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
86f1152b
BM
699 if (!td) {
700 mutex_unlock(&md->table_devices_lock);
701 return -ENOMEM;
702 }
703
704 td->dm_dev.mode = mode;
705 td->dm_dev.bdev = NULL;
706
707 if ((r = open_table_device(td, dev, md))) {
708 mutex_unlock(&md->table_devices_lock);
709 kfree(td);
710 return r;
711 }
712
713 format_dev_t(td->dm_dev.name, dev);
714
b0b4d7c6 715 refcount_set(&td->count, 1);
86f1152b 716 list_add(&td->list, &md->table_devices);
b0b4d7c6
ER
717 } else {
718 refcount_inc(&td->count);
86f1152b 719 }
86f1152b
BM
720 mutex_unlock(&md->table_devices_lock);
721
722 *result = &td->dm_dev;
723 return 0;
724}
86f1152b
BM
725
726void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
727{
728 struct table_device *td = container_of(d, struct table_device, dm_dev);
729
730 mutex_lock(&md->table_devices_lock);
b0b4d7c6 731 if (refcount_dec_and_test(&td->count)) {
86f1152b
BM
732 close_table_device(td, md);
733 list_del(&td->list);
734 kfree(td);
735 }
736 mutex_unlock(&md->table_devices_lock);
737}
86f1152b
BM
738
739static void free_table_devices(struct list_head *devices)
740{
741 struct list_head *tmp, *next;
742
743 list_for_each_safe(tmp, next, devices) {
744 struct table_device *td = list_entry(tmp, struct table_device, list);
745
746 DMWARN("dm_destroy: %s still exists with %d references",
b0b4d7c6 747 td->dm_dev.name, refcount_read(&td->count));
86f1152b
BM
748 kfree(td);
749 }
750}
751
3ac51e74
DW
752/*
753 * Get the geometry associated with a dm device
754 */
755int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
756{
757 *geo = md->geometry;
758
759 return 0;
760}
761
762/*
763 * Set the geometry of a device.
764 */
765int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
766{
767 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
768
769 if (geo->start > sz) {
770 DMWARN("Start sector is beyond the geometry limits.");
771 return -EINVAL;
772 }
773
774 md->geometry = *geo;
775
776 return 0;
777}
778
2e93ccc1
KU
779static int __noflush_suspending(struct mapped_device *md)
780{
781 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
782}
783
1da177e4
LT
784/*
785 * Decrements the number of outstanding ios that a bio has been
786 * cloned into, completing the original io if necessary.
787 */
e2118b3c 788void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
1da177e4 789{
2e93ccc1 790 unsigned long flags;
4e4cbee9 791 blk_status_t io_error;
b35f8caa
MB
792 struct bio *bio;
793 struct mapped_device *md = io->md;
d208b894
JL
794 unsigned long start_time = 0;
795 struct dm_stats_aux stats_aux;
2e93ccc1
KU
796
797 /* Push-back supersedes any I/O errors */
f88fb981
KU
798 if (unlikely(error)) {
799 spin_lock_irqsave(&io->endio_lock, flags);
745dc570 800 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
4e4cbee9 801 io->status = error;
f88fb981
KU
802 spin_unlock_irqrestore(&io->endio_lock, flags);
803 }
1da177e4
LT
804
805 if (atomic_dec_and_test(&io->io_count)) {
bf14e2b2 806 bio = io->orig_bio;
4e4cbee9 807 if (io->status == BLK_STS_DM_REQUEUE) {
2e93ccc1
KU
808 /*
809 * Target requested pushing back the I/O.
2e93ccc1 810 */
022c2611 811 spin_lock_irqsave(&md->deferred_lock, flags);
bf14e2b2
DLM
812 if (__noflush_suspending(md) &&
813 !WARN_ON_ONCE(dm_is_zone_write(md, bio))) {
745dc570 814 /* NOTE early return due to BLK_STS_DM_REQUEUE below */
bf14e2b2
DLM
815 bio_list_add_head(&md->deferred, bio);
816 } else {
817 /*
818 * noflush suspend was interrupted or this is
819 * a write to a zoned target.
820 */
4e4cbee9 821 io->status = BLK_STS_IOERR;
bf14e2b2 822 }
022c2611 823 spin_unlock_irqrestore(&md->deferred_lock, flags);
2e93ccc1
KU
824 }
825
4e4cbee9 826 io_error = io->status;
d208b894
JL
827 start_time = io->start_time;
828 stats_aux = io->stats_aux;
6a8736d1 829 free_io(md, io);
d208b894 830 end_io_acct(md, bio, start_time, &stats_aux);
6a8736d1 831
4e4cbee9 832 if (io_error == BLK_STS_DM_REQUEUE)
6a8736d1 833 return;
2e93ccc1 834
1eff9d32 835 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
af7e466a 836 /*
6a8736d1 837 * Preflush done for flush with data, reissue
28a8f0d3 838 * without REQ_PREFLUSH.
af7e466a 839 */
1eff9d32 840 bio->bi_opf &= ~REQ_PREFLUSH;
6a8736d1 841 queue_io(md, bio);
af7e466a 842 } else {
b372d360 843 /* done with normal IO or empty flush */
8dd601fa
N
844 if (io_error)
845 bio->bi_status = io_error;
4246a0b6 846 bio_endio(bio);
b35f8caa 847 }
1da177e4
LT
848 }
849}
850
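/*
 * Reference pairing (summary): alloc_io() starts io_count at 1, __map_bio()
 * calls dm_io_inc_pending() for every clone it issues, and clone_endio()
 * drops that reference via dm_io_dec_pending(); __split_and_process_bio()
 * drops the initial reference once all clones are submitted, so the original
 * bio completes only after the last clone finishes.
 */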
bcb44433
MS
851void disable_discard(struct mapped_device *md)
852{
853 struct queue_limits *limits = dm_get_queue_limits(md);
854
855 /* device doesn't really support DISCARD, disable it */
856 limits->max_discard_sectors = 0;
857 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
858}
859
4cc96131 860void disable_write_same(struct mapped_device *md)
7eee4ae2
MS
861{
862 struct queue_limits *limits = dm_get_queue_limits(md);
863
864 /* device doesn't really support WRITE SAME, disable it */
865 limits->max_write_same_sectors = 0;
866}
867
ac62d620
CH
868void disable_write_zeroes(struct mapped_device *md)
869{
870 struct queue_limits *limits = dm_get_queue_limits(md);
871
872 /* device doesn't really support WRITE ZEROES, disable it */
873 limits->max_write_zeroes_sectors = 0;
874}
875
a666e5c0
MP
876static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
877{
878 return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
879}
880
4246a0b6 881static void clone_endio(struct bio *bio)
1da177e4 882{
4e4cbee9 883 blk_status_t error = bio->bi_status;
6c23f0bd 884 struct dm_target_io *tio = clone_to_tio(bio);
b35f8caa 885 struct dm_io *io = tio->io;
9faf400f 886 struct mapped_device *md = tio->io->md;
1da177e4 887 dm_endio_fn endio = tio->ti->type->end_io;
309dca30 888 struct request_queue *q = bio->bi_bdev->bd_disk->queue;
1da177e4 889
9c37de29 890 if (unlikely(error == BLK_STS_TARGET)) {
bcb44433 891 if (bio_op(bio) == REQ_OP_DISCARD &&
309dca30 892 !q->limits.max_discard_sectors)
bcb44433
MS
893 disable_discard(md);
894 else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
309dca30 895 !q->limits.max_write_same_sectors)
ac62d620 896 disable_write_same(md);
bcb44433 897 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
309dca30 898 !q->limits.max_write_zeroes_sectors)
ac62d620
CH
899 disable_write_zeroes(md);
900 }
7eee4ae2 901
bb37d772
DLM
902 if (blk_queue_is_zoned(q))
903 dm_zone_endio(io, bio);
415c79e1 904
1be56909 905 if (endio) {
4e4cbee9 906 int r = endio(tio->ti, bio, &error);
1be56909
CH
907 switch (r) {
908 case DM_ENDIO_REQUEUE:
bf14e2b2
DLM
909 /*
910 * Requeuing writes to a sequential zone of a zoned
911 * target will break the sequential write pattern:
912 * fail such IO.
913 */
914 if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
915 error = BLK_STS_IOERR;
916 else
917 error = BLK_STS_DM_REQUEUE;
df561f66 918 fallthrough;
1be56909
CH
919 case DM_ENDIO_DONE:
920 break;
921 case DM_ENDIO_INCOMPLETE:
922 /* The target will handle the io */
923 return;
924 default:
925 DMWARN("unimplemented target endio return value: %d", r);
926 BUG();
927 }
928 }
929
a666e5c0
MP
930 if (unlikely(swap_bios_limit(tio->ti, bio))) {
931 struct mapped_device *md = io->md;
932 up(&md->swap_bios_semaphore);
933 }
934
1d1068ce 935 free_tio(bio);
e2118b3c 936 dm_io_dec_pending(io, error);
1da177e4
LT
937}
938
56a67df7
MS
939/*
940 * Return maximum size of I/O possible at the supplied sector up to the current
941 * target boundary.
942 */
3720281d
MS
943static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
944 sector_t target_offset)
56a67df7 945{
56a67df7
MS
946 return ti->len - target_offset;
947}
948
3720281d 949static sector_t max_io_len(struct dm_target *ti, sector_t sector)
1da177e4 950{
3720281d
MS
951 sector_t target_offset = dm_target_offset(ti, sector);
952 sector_t len = max_io_len_target_boundary(ti, target_offset);
5091cdec 953 sector_t max_len;
1da177e4
LT
954
955 /*
3ee16db3
MS
956 * Does the target need to split IO even further?
957 * - varied (per target) IO splitting is a tenet of DM; this
958 * explains why stacked chunk_sectors based splitting via
959 * blk_max_size_offset() isn't possible here. So pass in
960 * ti->max_io_len to override stacked chunk_sectors.
1da177e4 961 */
3ee16db3
MS
962 if (ti->max_io_len) {
963 max_len = blk_max_size_offset(ti->table->md->queue,
964 target_offset, ti->max_io_len);
965 if (len > max_len)
966 len = max_len;
967 }
1da177e4
LT
968
969 return len;
970}
971
542f9038
MS
972int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
973{
974 if (len > UINT_MAX) {
975 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
976 (unsigned long long)len, UINT_MAX);
977 ti->error = "Maximum size of target IO is too large";
978 return -EINVAL;
979 }
980
75ae1936 981 ti->max_io_len = (uint32_t) len;
542f9038
MS
982
983 return 0;
984}
985EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
986
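/*
 * Illustrative constructor fragment (not part of dm.c): my_ctr() and the
 * 8-sector limit are hypothetical; real targets usually derive the limit
 * from an on-disk chunk or region size.
 */
static int my_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = dm_set_target_max_io_len(ti, 8);	/* split I/O on 4 KiB boundaries */

	if (r)
		return r;

	ti->num_flush_bios = 1;
	return 0;
}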
f26c5719 987static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
3d97c829
MS
988 sector_t sector, int *srcu_idx)
989 __acquires(md->io_barrier)
545ed20e 990{
545ed20e
TK
991 struct dm_table *map;
992 struct dm_target *ti;
545ed20e 993
f26c5719 994 map = dm_get_live_table(md, srcu_idx);
545ed20e 995 if (!map)
f26c5719 996 return NULL;
545ed20e
TK
997
998 ti = dm_table_find_target(map, sector);
123d87d5 999 if (!ti)
f26c5719 1000 return NULL;
545ed20e 1001
f26c5719
DW
1002 return ti;
1003}
545ed20e 1004
f26c5719 1005static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
3d97c829 1006 long nr_pages, void **kaddr, pfn_t *pfn)
f26c5719
DW
1007{
1008 struct mapped_device *md = dax_get_private(dax_dev);
1009 sector_t sector = pgoff * PAGE_SECTORS;
1010 struct dm_target *ti;
1011 long len, ret = -EIO;
1012 int srcu_idx;
545ed20e 1013
f26c5719 1014 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
545ed20e 1015
f26c5719
DW
1016 if (!ti)
1017 goto out;
1018 if (!ti->type->direct_access)
1019 goto out;
3720281d 1020 len = max_io_len(ti, sector) / PAGE_SECTORS;
f26c5719
DW
1021 if (len < 1)
1022 goto out;
1023 nr_pages = min(len, nr_pages);
dbc62659 1024 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
817bf402 1025
f26c5719 1026 out:
545ed20e 1027 dm_put_live_table(md, srcu_idx);
f26c5719
DW
1028
1029 return ret;
545ed20e
TK
1030}
1031
cdf6cdcd
VG
1032static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1033 size_t nr_pages)
1034{
1035 struct mapped_device *md = dax_get_private(dax_dev);
1036 sector_t sector = pgoff * PAGE_SECTORS;
1037 struct dm_target *ti;
1038 int ret = -EIO;
1039 int srcu_idx;
1040
1041 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1042
1043 if (!ti)
1044 goto out;
1045 if (WARN_ON(!ti->type->dax_zero_page_range)) {
1046 /*
1047 * ->zero_page_range() is mandatory dax operation. If we are
1048 * here, something is wrong.
1049 */
cdf6cdcd
VG
1050 goto out;
1051 }
1052 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
cdf6cdcd
VG
1053 out:
1054 dm_put_live_table(md, srcu_idx);
1055
1056 return ret;
1057}
1058
1dd40c3e
MP
1059/*
1060 * A target may call dm_accept_partial_bio only from the map routine. It is
6842d264
DLM
1061 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
1062 * operations and REQ_OP_ZONE_APPEND (zone append writes).
1dd40c3e
MP
1063 *
1064 * dm_accept_partial_bio informs the dm that the target only wants to process
1065 * additional n_sectors sectors of the bio and the rest of the data should be
1066 * sent in a next bio.
1067 *
1068 * A diagram that explains the arithmetics:
1069 * +--------------------+---------------+-------+
1070 * | 1 | 2 | 3 |
1071 * +--------------------+---------------+-------+
1072 *
1073 * <-------------- *tio->len_ptr --------------->
1074 * <------- bi_size ------->
1075 * <-- n_sectors -->
1076 *
1077 * Region 1 was already iterated over with bio_advance or similar function.
1078 * (it may be empty if the target doesn't use bio_advance)
1079 * Region 2 is the remaining bio size that the target wants to process.
1080 * (it may be empty if region 1 is non-empty, although there is no reason
1081 * to make it empty)
1082 * The target requires that region 3 is to be sent in the next bio.
1083 *
1084 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1085 * the partially processed part (the sum of regions 1+2) must be the same for all
1086 * copies of the bio.
1087 */
1088void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1089{
6c23f0bd 1090 struct dm_target_io *tio = clone_to_tio(bio);
1dd40c3e 1091 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
6842d264 1092
1eff9d32 1093 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
6842d264
DLM
1094 BUG_ON(op_is_zone_mgmt(bio_op(bio)));
1095 BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
1dd40c3e
MP
1096 BUG_ON(bi_size > *tio->len_ptr);
1097 BUG_ON(n_sectors > bi_size);
6842d264 1098
1dd40c3e
MP
1099 *tio->len_ptr -= bi_size - n_sectors;
1100 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1101}
1102EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1103
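/*
 * A minimal sketch of the pattern documented above (illustrative, not part
 * of dm.c): my_chunked_map(), struct my_target and MY_CHUNK_SECTORS are
 * hypothetical.
 */
#define MY_CHUNK_SECTORS 128

struct my_target {
	struct dm_dev *dev;
};

static int my_chunked_map(struct dm_target *ti, struct bio *bio)
{
	struct my_target *mt = ti->private;
	sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
	unsigned remaining = MY_CHUNK_SECTORS - (offset & (MY_CHUNK_SECTORS - 1));

	/* Accept only up to the chunk boundary; DM resubmits the remainder. */
	if (bio_sectors(bio) > remaining)
		dm_accept_partial_bio(bio, remaining);

	bio_set_dev(bio, mt->dev->bdev);
	return DM_MAPIO_REMAPPED;
}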
a666e5c0
MP
1104static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
1105{
1106 mutex_lock(&md->swap_bios_lock);
1107 while (latch < md->swap_bios) {
1108 cond_resched();
1109 down(&md->swap_bios_semaphore);
1110 md->swap_bios--;
1111 }
1112 while (latch > md->swap_bios) {
1113 cond_resched();
1114 up(&md->swap_bios_semaphore);
1115 md->swap_bios++;
1116 }
1117 mutex_unlock(&md->swap_bios_lock);
1118}
1119
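/*
 * Throttling summary: when a target sets ti->limit_swap_bios, __map_bio()
 * below takes md->swap_bios_semaphore before mapping a REQ_SWAP bio and
 * clone_endio() releases it, so at most get_swap_bios() swap I/Os are in
 * flight per device; this helper resizes that limit when the module
 * parameter changes.
 */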
1561b396 1120static void __map_bio(struct bio *clone)
1da177e4 1121{
1561b396 1122 struct dm_target_io *tio = clone_to_tio(clone);
1da177e4 1123 int r;
2056a782 1124 sector_t sector;
64f52b0e 1125 struct dm_io *io = tio->io;
bd2a49b8 1126 struct dm_target *ti = tio->ti;
1da177e4 1127
1da177e4 1128 clone->bi_end_io = clone_endio;
1da177e4
LT
1129
1130 /*
1131 * Map the clone. If r == 0 we don't need to do
1132 * anything, the target has assumed ownership of
1133 * this io.
1134 */
e2118b3c 1135 dm_io_inc_pending(io);
4f024f37 1136 sector = clone->bi_iter.bi_sector;
d67a5f4b 1137
a666e5c0
MP
1138 if (unlikely(swap_bios_limit(ti, clone))) {
1139 struct mapped_device *md = io->md;
1140 int latch = get_swap_bios();
1141 if (unlikely(latch != md->swap_bios))
1142 __set_swap_bios_limit(md, latch);
1143 down(&md->swap_bios_semaphore);
1144 }
1145
bb37d772
DLM
1146 /*
1147 * Check if the IO needs a special mapping due to zone append emulation
1148 * on zoned target. In this case, dm_zone_map_bio() calls the target
1149 * map operation.
1150 */
1151 if (dm_emulate_zone_append(io->md))
1152 r = dm_zone_map_bio(tio);
1153 else
1154 r = ti->type->map(ti, clone);
1155
846785e6
CH
1156 switch (r) {
1157 case DM_MAPIO_SUBMITTED:
1158 break;
1159 case DM_MAPIO_REMAPPED:
1da177e4 1160 /* the bio has been remapped so dispatch it */
1c02fca6 1161 trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector);
3e08773c 1162 submit_bio_noacct(clone);
846785e6
CH
1163 break;
1164 case DM_MAPIO_KILL:
a666e5c0
MP
1165 if (unlikely(swap_bios_limit(ti, clone))) {
1166 struct mapped_device *md = io->md;
1167 up(&md->swap_bios_semaphore);
1168 }
1d1068ce 1169 free_tio(clone);
e2118b3c 1170 dm_io_dec_pending(io, BLK_STS_IOERR);
4e4cbee9 1171 break;
846785e6 1172 case DM_MAPIO_REQUEUE:
a666e5c0
MP
1173 if (unlikely(swap_bios_limit(ti, clone))) {
1174 struct mapped_device *md = io->md;
1175 up(&md->swap_bios_semaphore);
1176 }
1d1068ce 1177 free_tio(clone);
e2118b3c 1178 dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
846785e6
CH
1179 break;
1180 default:
45cbcd79
KU
1181 DMWARN("unimplemented target map return value: %d", r);
1182 BUG();
1da177e4
LT
1183 }
1184}
1da177e4 1185
e0d6609a 1186static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
bd2a49b8 1187{
4f024f37
KO
1188 bio->bi_iter.bi_sector = sector;
1189 bio->bi_iter.bi_size = to_bytes(len);
1da177e4
LT
1190}
1191
1192/*
1193 * Creates a bio that consists of range of complete bvecs.
1194 */
b1bee792
CH
1195static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1196 sector_t sector, unsigned *len)
1da177e4 1197{
b1bee792 1198 struct bio *bio = ci->bio, *clone;
07560151 1199 int r;
1da177e4 1200
1d1068ce 1201 clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
1c3b13e6 1202
07560151
EB
1203 r = bio_crypt_clone(clone, bio, GFP_NOIO);
1204 if (r < 0)
b1bee792 1205 goto free_tio;
a892c8d5 1206
57c36519 1207 if (bio_integrity(bio)) {
1d1068ce
CH
1208 struct dm_target_io *tio = clone_to_tio(clone);
1209
e2460f2a
MP
1210 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1211 !dm_target_passes_integrity(tio->ti->type))) {
1212 DMWARN("%s: the target %s doesn't support integrity data.",
1213 dm_device_name(tio->io->md),
1214 tio->ti->type->name);
b1bee792
CH
1215 r = -EIO;
1216 goto free_tio;
e2460f2a
MP
1217 }
1218
1219 r = bio_integrity_clone(clone, bio, GFP_NOIO);
c80914e8 1220 if (r < 0)
b1bee792 1221 goto free_tio;
c80914e8 1222 }
bd2a49b8 1223
fa8db494 1224 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
b1bee792 1225 clone->bi_iter.bi_size = to_bytes(*len);
fa8db494
MS
1226
1227 if (bio_integrity(bio))
1228 bio_integrity_trim(clone);
c80914e8 1229
1561b396 1230 __map_bio(clone);
c80914e8 1231 return 0;
b1bee792 1232free_tio:
1d1068ce 1233 free_tio(clone);
b1bee792 1234 return r;
1da177e4
LT
1235}
1236
318716dd 1237static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
dc8e2021
CH
1238 struct dm_target *ti, unsigned num_bios,
1239 unsigned *len)
f9ab94ce 1240{
1d1068ce 1241 struct bio *bio;
318716dd 1242 int try;
dba14160 1243
318716dd
MS
1244 if (!num_bios)
1245 return;
f9ab94ce 1246
318716dd 1247 if (num_bios == 1) {
1d1068ce
CH
1248 bio = alloc_tio(ci, ti, 0, len, GFP_NOIO);
1249 bio_list_add(blist, bio);
318716dd
MS
1250 return;
1251 }
9015df24 1252
318716dd
MS
1253 for (try = 0; try < 2; try++) {
1254 int bio_nr;
318716dd
MS
1255
1256 if (try)
bc02cdbe 1257 mutex_lock(&ci->io->md->table_devices_lock);
318716dd 1258 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1d1068ce 1259 bio = alloc_tio(ci, ti, bio_nr, len,
dc8e2021 1260 try ? GFP_NOIO : GFP_NOWAIT);
1d1068ce 1261 if (!bio)
318716dd
MS
1262 break;
1263
1d1068ce 1264 bio_list_add(blist, bio);
318716dd
MS
1265 }
1266 if (try)
bc02cdbe 1267 mutex_unlock(&ci->io->md->table_devices_lock);
318716dd
MS
1268 if (bio_nr == num_bios)
1269 return;
1270
6c23f0bd 1271 while ((bio = bio_list_pop(blist)))
1d1068ce 1272 free_tio(bio);
318716dd 1273 }
9015df24
AK
1274}
1275
14fe594d 1276static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1dd40c3e 1277 unsigned num_bios, unsigned *len)
06a426ce 1278{
318716dd 1279 struct bio_list blist = BIO_EMPTY_LIST;
8eabf5d0 1280 struct bio *clone;
318716dd 1281
dc8e2021 1282 alloc_multiple_bios(&blist, ci, ti, num_bios, len);
06a426ce 1283
8eabf5d0 1284 while ((clone = bio_list_pop(&blist))) {
8eabf5d0
CH
1285 if (len)
1286 bio_setup_sector(clone, ci->sector, *len);
1561b396 1287 __map_bio(clone);
8eabf5d0 1288 }
06a426ce
MS
1289}
1290
14fe594d 1291static int __send_empty_flush(struct clone_info *ci)
f9ab94ce 1292{
06a426ce 1293 unsigned target_nr = 0;
f9ab94ce 1294 struct dm_target *ti;
828678b8
MS
1295 struct bio flush_bio;
1296
1297 /*
1298 * Use an on-stack bio for this, it's safe since we don't
1299 * need to reference it after submit. It's just used as
1300 * the basis for the clone(s).
1301 */
49add496
CH
1302 bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
1303 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
47d95102 1304
828678b8
MS
1305 ci->bio = &flush_bio;
1306 ci->sector_count = 0;
f9ab94ce 1307
b372d360 1308 BUG_ON(bio_has_data(ci->bio));
f9ab94ce 1309 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1dd40c3e 1310 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
828678b8
MS
1311
1312 bio_uninit(ci->bio);
f9ab94ce
MP
1313 return 0;
1314}
1315
3d7f4562 1316static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
61697a6a 1317 unsigned num_bios)
ba1cbad9 1318{
51b86f9a 1319 unsigned len;
ba1cbad9 1320
3d7f4562
MS
1321 /*
1322 * Even though the device advertised support for this type of
1323 * request, that does not mean every target supports it, and
1324 * reconfiguration might also have changed that since the
1325 * check was performed.
1326 */
3d7f4562
MS
1327 if (!num_bios)
1328 return -EOPNOTSUPP;
ba1cbad9 1329
3720281d
MS
1330 len = min_t(sector_t, ci->sector_count,
1331 max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
51b86f9a 1332
3d7f4562 1333 __send_duplicate_bios(ci, ti, num_bios, &len);
e262f347 1334
3d7f4562
MS
1335 ci->sector += len;
1336 ci->sector_count -= len;
5ae89a87
MS
1337
1338 return 0;
ba1cbad9
MS
1339}
1340
568c73a3
MS
1341static bool is_abnormal_io(struct bio *bio)
1342{
1343 bool r = false;
1344
1345 switch (bio_op(bio)) {
1346 case REQ_OP_DISCARD:
1347 case REQ_OP_SECURE_ERASE:
1348 case REQ_OP_WRITE_SAME:
1349 case REQ_OP_WRITE_ZEROES:
1350 r = true;
1351 break;
1352 }
1353
1354 return r;
1355}
1356
0519c71e
MS
1357static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1358 int *result)
1359{
1360 struct bio *bio = ci->bio;
9679b5a7 1361 unsigned num_bios = 0;
0519c71e 1362
9679b5a7
MS
1363 switch (bio_op(bio)) {
1364 case REQ_OP_DISCARD:
1365 num_bios = ti->num_discard_bios;
1366 break;
1367 case REQ_OP_SECURE_ERASE:
1368 num_bios = ti->num_secure_erase_bios;
1369 break;
1370 case REQ_OP_WRITE_SAME:
1371 num_bios = ti->num_write_same_bios;
1372 break;
1373 case REQ_OP_WRITE_ZEROES:
1374 num_bios = ti->num_write_zeroes_bios;
1375 break;
1376 default:
0519c71e 1377 return false;
9679b5a7 1378 }
0519c71e 1379
9679b5a7 1380 *result = __send_changing_extent_only(ci, ti, num_bios);
0519c71e
MS
1381 return true;
1382}
1383
e4c93811
AK
1384/*
1385 * Select the correct strategy for processing a non-flush bio.
1386 */
14fe594d 1387static int __split_and_process_non_flush(struct clone_info *ci)
0ce65797 1388{
512875bd 1389 struct dm_target *ti;
1c3b13e6 1390 unsigned len;
c80914e8 1391 int r;
0ce65797 1392
512875bd 1393 ti = dm_table_find_target(ci->map, ci->sector);
123d87d5 1394 if (!ti)
512875bd
JN
1395 return -EIO;
1396
568c73a3 1397 if (__process_abnormal_io(ci, ti, &r))
0519c71e 1398 return r;
3d7f4562 1399
3720281d 1400 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
0ce65797 1401
c80914e8
MS
1402 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1403 if (r < 0)
1404 return r;
0ce65797 1405
1c3b13e6
KO
1406 ci->sector += len;
1407 ci->sector_count -= len;
0ce65797 1408
1c3b13e6 1409 return 0;
0ce65797
MS
1410}
1411
978e51ba
MS
1412static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1413 struct dm_table *map, struct bio *bio)
1414{
1415 ci->map = map;
1416 ci->io = alloc_io(md, bio);
1417 ci->sector = bio->bi_iter.bi_sector;
1418}
1419
1da177e4 1420/*
14fe594d 1421 * Entry point to split a bio into clones and submit them to the targets.
1da177e4 1422 */
3e08773c 1423static void __split_and_process_bio(struct mapped_device *md,
978e51ba 1424 struct dm_table *map, struct bio *bio)
0ce65797 1425{
1da177e4 1426 struct clone_info ci;
512875bd 1427 int error = 0;
1da177e4 1428
978e51ba 1429 init_clone_info(&ci, md, map, bio);
0ce65797 1430
1eff9d32 1431 if (bio->bi_opf & REQ_PREFLUSH) {
14fe594d 1432 error = __send_empty_flush(&ci);
e2118b3c 1433 /* dm_io_dec_pending submits any data associated with flush */
2e2d6f7e 1434 } else if (op_is_zone_mgmt(bio_op(bio))) {
a4aa5e56
DLM
1435 ci.bio = bio;
1436 ci.sector_count = 0;
1437 error = __split_and_process_non_flush(&ci);
b372d360 1438 } else {
6a8736d1 1439 ci.bio = bio;
d87f4c14 1440 ci.sector_count = bio_sectors(bio);
8615cb65
MP
1441 error = __split_and_process_non_flush(&ci);
1442 if (ci.sector_count && !error) {
1443 /*
1444 * Remainder must be passed to submit_bio_noacct()
1445 * so that it gets handled *after* bios already submitted
1446 * have been completely processed.
1447 * We take a clone of the original to store in
1448 * ci.io->orig_bio to be used by end_io_acct() and
1449 * for dec_pending to use for completion handling.
1450 */
1451 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1452 GFP_NOIO, &md->queue->bio_split);
1453 ci.io->orig_bio = b;
1454
8615cb65
MP
1455 bio_chain(b, bio);
1456 trace_block_split(b, bio->bi_iter.bi_sector);
3e08773c 1457 submit_bio_noacct(bio);
18a25da8 1458 }
d87f4c14 1459 }
b879f915 1460 start_io_acct(ci.io);
0ce65797 1461
1da177e4 1462 /* drop the extra reference count */
e2118b3c 1463 dm_io_dec_pending(ci.io, errno_to_blk_status(error));
0ce65797
MS
1464}
1465
3e08773c 1466static void dm_submit_bio(struct bio *bio)
cec47e3d 1467{
309dca30 1468 struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
83d5e5b0
MP
1469 int srcu_idx;
1470 struct dm_table *map;
cec47e3d 1471
83d5e5b0 1472 map = dm_get_live_table(md, &srcu_idx);
b2abdb1b
MS
1473 if (unlikely(!map)) {
1474 DMERR_LIMIT("%s: mapping table unavailable, erroring io",
1475 dm_device_name(md));
1476 bio_io_error(bio);
1477 goto out;
1478 }
29e4013d 1479
b2abdb1b 1480 /* If suspended, queue this IO for later */
6a8736d1 1481 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
6abc4946
KK
1482 if (bio->bi_opf & REQ_NOWAIT)
1483 bio_wouldblock_error(bio);
b2abdb1b 1484 else if (bio->bi_opf & REQ_RAHEAD)
54d9a1b4 1485 bio_io_error(bio);
b2abdb1b
MS
1486 else
1487 queue_io(md, bio);
1488 goto out;
cec47e3d 1489 }
1da177e4 1490
b2abdb1b
MS
1491 /*
1492 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
1493 * otherwise associated queue_limits won't be imposed.
1494 */
1495 if (is_abnormal_io(bio))
1496 blk_queue_split(&bio);
978e51ba 1497
3e08773c 1498 __split_and_process_bio(md, map, bio);
b2abdb1b 1499out:
83d5e5b0 1500 dm_put_live_table(md, srcu_idx);
978e51ba
MS
1501}
1502
1da177e4
LT
1503/*-----------------------------------------------------------------
1504 * An IDR is used to keep track of allocated minor numbers.
1505 *---------------------------------------------------------------*/
2b06cfff 1506static void free_minor(int minor)
1da177e4 1507{
f32c10b0 1508 spin_lock(&_minor_lock);
1da177e4 1509 idr_remove(&_minor_idr, minor);
f32c10b0 1510 spin_unlock(&_minor_lock);
1da177e4
LT
1511}
1512
1513/*
1514 * See if the device with a specific minor # is free.
1515 */
cf13ab8e 1516static int specific_minor(int minor)
1da177e4 1517{
c9d76be6 1518 int r;
1da177e4
LT
1519
1520 if (minor >= (1 << MINORBITS))
1521 return -EINVAL;
1522
c9d76be6 1523 idr_preload(GFP_KERNEL);
f32c10b0 1524 spin_lock(&_minor_lock);
1da177e4 1525
c9d76be6 1526 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1da177e4 1527
f32c10b0 1528 spin_unlock(&_minor_lock);
c9d76be6
TH
1529 idr_preload_end();
1530 if (r < 0)
1531 return r == -ENOSPC ? -EBUSY : r;
1532 return 0;
1da177e4
LT
1533}
1534
cf13ab8e 1535static int next_free_minor(int *minor)
1da177e4 1536{
c9d76be6 1537 int r;
62f75c2f 1538
c9d76be6 1539 idr_preload(GFP_KERNEL);
f32c10b0 1540 spin_lock(&_minor_lock);
1da177e4 1541
c9d76be6 1542 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1da177e4 1543
f32c10b0 1544 spin_unlock(&_minor_lock);
c9d76be6
TH
1545 idr_preload_end();
1546 if (r < 0)
1547 return r;
1548 *minor = r;
1549 return 0;
1da177e4
LT
1550}
1551
83d5cde4 1552static const struct block_device_operations dm_blk_dops;
681cc5e8 1553static const struct block_device_operations dm_rq_blk_dops;
f26c5719 1554static const struct dax_operations dm_dax_ops;
1da177e4 1555
53d5914f
MP
1556static void dm_wq_work(struct work_struct *work);
1557
aa6ce87a 1558#ifdef CONFIG_BLK_INLINE_ENCRYPTION
cb77cb5a 1559static void dm_queue_destroy_crypto_profile(struct request_queue *q)
aa6ce87a 1560{
cb77cb5a 1561 dm_destroy_crypto_profile(q->crypto_profile);
aa6ce87a
ST
1562}
1563
1564#else /* CONFIG_BLK_INLINE_ENCRYPTION */
1565
cb77cb5a 1566static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
aa6ce87a
ST
1567{
1568}
1569#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
1570
0f20972f
MS
1571static void cleanup_mapped_device(struct mapped_device *md)
1572{
0f20972f
MS
1573 if (md->wq)
1574 destroy_workqueue(md->wq);
6f1c819c
KO
1575 bioset_exit(&md->bs);
1576 bioset_exit(&md->io_bs);
0f20972f 1577
f26c5719 1578 if (md->dax_dev) {
fb08a190 1579 dax_remove_host(md->disk);
f26c5719
DW
1580 kill_dax(md->dax_dev);
1581 put_dax(md->dax_dev);
1582 md->dax_dev = NULL;
1583 }
1584
0f20972f
MS
1585 if (md->disk) {
1586 spin_lock(&_minor_lock);
1587 md->disk->private_data = NULL;
1588 spin_unlock(&_minor_lock);
89f871af
CH
1589 if (dm_get_md_type(md) != DM_TYPE_NONE) {
1590 dm_sysfs_exit(md);
1591 del_gendisk(md->disk);
1592 }
cb77cb5a 1593 dm_queue_destroy_crypto_profile(md->queue);
74fe6ba9 1594 blk_cleanup_disk(md->disk);
74a2b6ec 1595 }
0f20972f 1596
d09960b0
TE
1597 cleanup_srcu_struct(&md->io_barrier);
1598
d5ffebdd
MS
1599 mutex_destroy(&md->suspend_lock);
1600 mutex_destroy(&md->type_lock);
1601 mutex_destroy(&md->table_devices_lock);
a666e5c0 1602 mutex_destroy(&md->swap_bios_lock);
d5ffebdd 1603
4cc96131 1604 dm_mq_cleanup_mapped_device(md);
bb37d772 1605 dm_cleanup_zoned_dev(md);
0f20972f
MS
1606}
1607
1da177e4
LT
1608/*
1609 * Allocate and initialise a blank device with a given minor.
1610 */
2b06cfff 1611static struct mapped_device *alloc_dev(int minor)
1da177e4 1612{
115485e8
MS
1613 int r, numa_node_id = dm_get_numa_node();
1614 struct mapped_device *md;
ba61fdd1 1615 void *old_md;
1da177e4 1616
856eb091 1617 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1da177e4
LT
1618 if (!md) {
1619 DMWARN("unable to allocate device, out of memory.");
1620 return NULL;
1621 }
1622
10da4f79 1623 if (!try_module_get(THIS_MODULE))
6ed7ade8 1624 goto bad_module_get;
10da4f79 1625
1da177e4 1626 /* get a minor number for the dev */
2b06cfff 1627 if (minor == DM_ANY_MINOR)
cf13ab8e 1628 r = next_free_minor(&minor);
2b06cfff 1629 else
cf13ab8e 1630 r = specific_minor(minor);
1da177e4 1631 if (r < 0)
6ed7ade8 1632 goto bad_minor;
1da177e4 1633
83d5e5b0
MP
1634 r = init_srcu_struct(&md->io_barrier);
1635 if (r < 0)
1636 goto bad_io_barrier;
1637
115485e8 1638 md->numa_node_id = numa_node_id;
591ddcfc 1639 md->init_tio_pdu = false;
a5664dad 1640 md->type = DM_TYPE_NONE;
e61290a4 1641 mutex_init(&md->suspend_lock);
a5664dad 1642 mutex_init(&md->type_lock);
86f1152b 1643 mutex_init(&md->table_devices_lock);
022c2611 1644 spin_lock_init(&md->deferred_lock);
1da177e4 1645 atomic_set(&md->holders, 1);
5c6bd75d 1646 atomic_set(&md->open_count, 0);
1da177e4 1647 atomic_set(&md->event_nr, 0);
7a8c3d3b
MA
1648 atomic_set(&md->uevent_seq, 0);
1649 INIT_LIST_HEAD(&md->uevent_list);
86f1152b 1650 INIT_LIST_HEAD(&md->table_devices);
7a8c3d3b 1651 spin_lock_init(&md->uevent_lock);
1da177e4 1652
47ace7e0 1653 /*
c62b37d9
CH
1654 * default to bio-based until DM table is loaded and md->type
1655 * established. If request-based table is loaded: blk-mq will
1656 * override accordingly.
47ace7e0 1657 */
74fe6ba9 1658 md->disk = blk_alloc_disk(md->numa_node_id);
1da177e4 1659 if (!md->disk)
0f20972f 1660 goto bad;
74fe6ba9 1661 md->queue = md->disk->queue;
1da177e4 1662
f0b04115 1663 init_waitqueue_head(&md->wait);
53d5914f 1664 INIT_WORK(&md->work, dm_wq_work);
f0b04115 1665 init_waitqueue_head(&md->eventq);
2995fa78 1666 init_completion(&md->kobj_holder.completion);
f0b04115 1667
a666e5c0
MP
1668 md->swap_bios = get_swap_bios();
1669 sema_init(&md->swap_bios_semaphore, md->swap_bios);
1670 mutex_init(&md->swap_bios_lock);
1671
1da177e4
LT
1672 md->disk->major = _major;
1673 md->disk->first_minor = minor;
74fe6ba9 1674 md->disk->minors = 1;
1ebe2e5f 1675 md->disk->flags |= GENHD_FL_NO_PART;
1da177e4
LT
1676 md->disk->fops = &dm_blk_dops;
1677 md->disk->queue = md->queue;
1678 md->disk->private_data = md;
1679 sprintf(md->disk->disk_name, "dm-%d", minor);
f26c5719 1680
5d2a228b 1681 if (IS_ENABLED(CONFIG_FS_DAX)) {
30c6828a 1682 md->dax_dev = alloc_dax(md, &dm_dax_ops);
d7519392
CH
1683 if (IS_ERR(md->dax_dev)) {
1684 md->dax_dev = NULL;
976431b0 1685 goto bad;
d7519392 1686 }
7ac5360c
CH
1687 set_dax_nocache(md->dax_dev);
1688 set_dax_nomc(md->dax_dev);
fb08a190 1689 if (dax_add_host(md->dax_dev, md->disk))
976431b0
DW
1690 goto bad;
1691 }
f26c5719 1692
7e51f257 1693 format_dev_t(md->name, MKDEV(_major, minor));
1da177e4 1694
c7c879ee 1695 md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name);
304f3f6a 1696 if (!md->wq)
0f20972f 1697 goto bad;
304f3f6a 1698
fd2ed4d2
MP
1699 dm_stats_init(&md->stats);
1700
ba61fdd1 1701 /* Populate the mapping, nobody knows we exist yet */
f32c10b0 1702 spin_lock(&_minor_lock);
ba61fdd1 1703 old_md = idr_replace(&_minor_idr, md, minor);
f32c10b0 1704 spin_unlock(&_minor_lock);
ba61fdd1
JM
1705
1706 BUG_ON(old_md != MINOR_ALLOCED);
1707
1da177e4
LT
1708 return md;
1709
0f20972f
MS
1710bad:
1711 cleanup_mapped_device(md);
83d5e5b0 1712bad_io_barrier:
1da177e4 1713 free_minor(minor);
6ed7ade8 1714bad_minor:
10da4f79 1715 module_put(THIS_MODULE);
6ed7ade8 1716bad_module_get:
856eb091 1717 kvfree(md);
1da177e4
LT
1718 return NULL;
1719}
1720
ae9da83f
JN
1721static void unlock_fs(struct mapped_device *md);
1722
1da177e4
LT
1723static void free_dev(struct mapped_device *md)
1724{
f331c029 1725 int minor = MINOR(disk_devt(md->disk));
63d94e48 1726
32a926da 1727 unlock_fs(md);
2eb6e1e3 1728
0f20972f 1729 cleanup_mapped_device(md);
63a4f065 1730
86f1152b 1731 free_table_devices(&md->table_devices);
63a4f065 1732 dm_stats_cleanup(&md->stats);
63a4f065
MS
1733 free_minor(minor);
1734
10da4f79 1735 module_put(THIS_MODULE);
856eb091 1736 kvfree(md);
1da177e4
LT
1737}
1738
2a2a4c51 1739static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
e6ee8c0b 1740{
c0820cf5 1741 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2a2a4c51 1742 int ret = 0;
e6ee8c0b 1743
0776aa0e 1744 if (dm_table_bio_based(t)) {
64f52b0e
MS
1745 /*
1746 * The md may already have mempools that need changing.
1747 * If so, reload bioset because front_pad may have changed
1748 * because a different table was loaded.
1749 */
6f1c819c
KO
1750 bioset_exit(&md->bs);
1751 bioset_exit(&md->io_bs);
0776aa0e 1752
6f1c819c 1753 } else if (bioset_initialized(&md->bs)) {
4e6e36c3
MS
1754 /*
1755 * There's no need to reload with request-based dm
1756 * because the size of front_pad doesn't change.
1757 * Note for future: If you are to reload bioset,
1758 * prep-ed requests in the queue may refer
1759 * to bio from the old bioset, so you must walk
1760 * through the queue to unprep.
1761 */
1762 goto out;
c0820cf5 1763 }
e6ee8c0b 1764
6f1c819c
KO
1765 BUG_ON(!p ||
1766 bioset_initialized(&md->bs) ||
1767 bioset_initialized(&md->io_bs));
cbc4e3c1 1768
2a2a4c51
JA
1769 ret = bioset_init_from_src(&md->bs, &p->bs);
1770 if (ret)
1771 goto out;
1772 ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
1773 if (ret)
1774 bioset_exit(&md->bs);
e6ee8c0b 1775out:
02233342 1776 /* mempool bind completed, no longer need any mempools in the table */
e6ee8c0b 1777 dm_table_free_md_mempools(t);
2a2a4c51 1778 return ret;
e6ee8c0b
KU
1779}
1780
1da177e4
LT
1781/*
1782 * Bind a table to the device.
1783 */
1784static void event_callback(void *context)
1785{
7a8c3d3b
MA
1786 unsigned long flags;
1787 LIST_HEAD(uevents);
1da177e4
LT
1788 struct mapped_device *md = (struct mapped_device *) context;
1789
7a8c3d3b
MA
1790 spin_lock_irqsave(&md->uevent_lock, flags);
1791 list_splice_init(&md->uevent_list, &uevents);
1792 spin_unlock_irqrestore(&md->uevent_lock, flags);
1793
ed9e1982 1794 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
7a8c3d3b 1795
1da177e4
LT
1796 atomic_inc(&md->event_nr);
1797 wake_up(&md->eventq);
62e08243 1798 dm_issue_global_event();
1da177e4
LT
1799}
1800
042d2a9b
AK
1801/*
1802 * Returns old map, which caller must destroy.
1803 */
1804static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1805 struct queue_limits *limits)
1da177e4 1806{
042d2a9b 1807 struct dm_table *old_map;
165125e1 1808 struct request_queue *q = md->queue;
978e51ba 1809 bool request_based = dm_table_request_based(t);
1da177e4 1810 sector_t size;
2a2a4c51 1811 int ret;
1da177e4 1812
5a8f1f80
BVA
1813 lockdep_assert_held(&md->suspend_lock);
1814
1da177e4 1815 size = dm_table_get_size(t);
3ac51e74
DW
1816
1817 /*
1818 * Wipe any geometry if the size of the table changed.
1819 */
fd2ed4d2 1820 if (size != dm_get_size(md))
3ac51e74
DW
1821 memset(&md->geometry, 0, sizeof(md->geometry));
1822
5424a0b8
MP
1823 if (!get_capacity(md->disk))
1824 set_capacity(md->disk, size);
1825 else
1826 set_capacity_and_notify(md->disk, size);
d5816876 1827
2ca3310e
AK
1828 dm_table_event_callback(t, event_callback, md);
1829
9c37de29 1830 if (request_based) {
16f12266 1831 /*
9c37de29
MS
1832 * Leverage the fact that request-based DM targets are
1833 * immutable singletons - used to optimize dm_mq_queue_rq.
16f12266
MS
1834 */
1835 md->immutable_target = dm_table_get_immutable_target(t);
1836 }
e6ee8c0b 1837
2a2a4c51
JA
1838 ret = __bind_mempools(md, t);
1839 if (ret) {
1840 old_map = ERR_PTR(ret);
1841 goto out;
1842 }
e6ee8c0b 1843
bb37d772
DLM
1844 ret = dm_table_set_restrictions(t, q, limits);
1845 if (ret) {
1846 old_map = ERR_PTR(ret);
1847 goto out;
1848 }
1849
a12f5d48 1850 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
1d3aa6f6 1851 rcu_assign_pointer(md->map, (void *)t);
36a0456f
AK
1852 md->immutable_target_type = dm_table_get_immutable_target_type(t);
1853
41abc4e1
HR
1854 if (old_map)
1855 dm_sync_table(md);
1da177e4 1856
2a2a4c51 1857out:
042d2a9b 1858 return old_map;
1da177e4
LT
1859}
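
__bind() publishes the new table with the classic RCU publish/retire pattern: readers walk md->map under (S)RCU, the writer swaps the pointer under md->suspend_lock, and the old table is only destroyed after readers have drained. A generic sketch of that pattern with hypothetical names (dm itself uses SRCU via dm_sync_table() rather than plain synchronize_rcu()):

#include <linux/mutex.h>
#include <linux/rcupdate.h>

struct cfg { int value; };

static DEFINE_MUTEX(update_lock);
static struct cfg __rcu *active_cfg;

/* Writer: publish 'new' and hand the old pointer back for the caller
 * to free once the grace period guarantees no reader still sees it. */
static struct cfg *publish_cfg(struct cfg *new)
{
	struct cfg *old;

	mutex_lock(&update_lock);
	old = rcu_dereference_protected(active_cfg,
					lockdep_is_held(&update_lock));
	rcu_assign_pointer(active_cfg, new);
	mutex_unlock(&update_lock);

	synchronize_rcu();	/* dm waits with synchronize_srcu() instead */
	return old;
}

/* Reader: dereference only inside a read-side critical section. */
static int read_cfg_value(void)
{
	struct cfg *c;
	int v = -1;

	rcu_read_lock();
	c = rcu_dereference(active_cfg);
	if (c)
		v = c->value;
	rcu_read_unlock();
	return v;
}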
1860
a7940155
AK
1861/*
1862 * Returns unbound table for the caller to free.
1863 */
1864static struct dm_table *__unbind(struct mapped_device *md)
1da177e4 1865{
a12f5d48 1866 struct dm_table *map = rcu_dereference_protected(md->map, 1);
1da177e4
LT
1867
1868 if (!map)
a7940155 1869 return NULL;
1da177e4
LT
1870
1871 dm_table_event_callback(map, NULL, NULL);
9cdb8520 1872 RCU_INIT_POINTER(md->map, NULL);
83d5e5b0 1873 dm_sync_table(md);
a7940155
AK
1874
1875 return map;
1da177e4
LT
1876}
1877
1878/*
1879 * Constructor for a new device.
1880 */
2b06cfff 1881int dm_create(int minor, struct mapped_device **result)
1da177e4
LT
1882{
1883 struct mapped_device *md;
1884
2b06cfff 1885 md = alloc_dev(minor);
1da177e4
LT
1886 if (!md)
1887 return -ENXIO;
1888
91ccbbac
TS
1889 dm_ima_reset_data(md);
1890
1da177e4
LT
1891 *result = md;
1892 return 0;
1893}
1894
a5664dad
MS
1895/*
1896 * Functions to manage md->type.
1897 * All are required to hold md->type_lock.
1898 */
1899void dm_lock_md_type(struct mapped_device *md)
1900{
1901 mutex_lock(&md->type_lock);
1902}
1903
1904void dm_unlock_md_type(struct mapped_device *md)
1905{
1906 mutex_unlock(&md->type_lock);
1907}
1908
7e0d574f 1909void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
a5664dad 1910{
00c4fc3b 1911 BUG_ON(!mutex_is_locked(&md->type_lock));
a5664dad
MS
1912 md->type = type;
1913}
1914
7e0d574f 1915enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
a5664dad
MS
1916{
1917 return md->type;
1918}
1919
36a0456f
AK
1920struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
1921{
1922 return md->immutable_target_type;
1923}
1924
f84cb8a4
MS
1925/*
1926 * The queue_limits are only valid as long as you have a reference
1927 * count on 'md'.
1928 */
1929struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
1930{
1931 BUG_ON(!atomic_read(&md->holders));
1932 return &md->queue->limits;
1933}
1934EXPORT_SYMBOL_GPL(dm_get_queue_limits);
1935
4a0b4ddf
MS
1936/*
1937 * Setup the DM device's queue based on md's type
1938 */
591ddcfc 1939int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
4a0b4ddf 1940{
ba305859 1941 enum dm_queue_mode type = dm_table_get_type(t);
c100ec49 1942 struct queue_limits limits;
ba305859 1943 int r;
bfebd1cd 1944
545ed20e 1945 switch (type) {
bfebd1cd 1946 case DM_TYPE_REQUEST_BASED:
681cc5e8 1947 md->disk->fops = &dm_rq_blk_dops;
e83068a5 1948 r = dm_mq_init_request_queue(md, t);
bfebd1cd 1949 if (r) {
681cc5e8 1950 DMERR("Cannot initialize queue for request-based dm mapped device");
bfebd1cd
MS
1951 return r;
1952 }
1953 break;
1954 case DM_TYPE_BIO_BASED:
545ed20e 1955 case DM_TYPE_DAX_BIO_BASED:
bfebd1cd 1956 break;
7e0d574f
BVA
1957 case DM_TYPE_NONE:
1958 WARN_ON_ONCE(true);
1959 break;
4a0b4ddf
MS
1960 }
1961
c100ec49
MS
1962 r = dm_calculate_queue_limits(t, &limits);
1963 if (r) {
1964 DMERR("Cannot calculate initial queue limits");
1965 return r;
1966 }
bb37d772
DLM
1967 r = dm_table_set_restrictions(t, md->queue, &limits);
1968 if (r)
1969 return r;
1970
e7089f65
LC
1971 r = add_disk(md->disk);
1972 if (r)
1973 return r;
c100ec49 1974
89f871af
CH
1975 r = dm_sysfs_init(md);
1976 if (r) {
1977 del_gendisk(md->disk);
1978 return r;
1979 }
1980 md->type = type;
4a0b4ddf
MS
1981 return 0;
1982}
1983
2bec1f4a 1984struct mapped_device *dm_get_md(dev_t dev)
1da177e4
LT
1985{
1986 struct mapped_device *md;
1da177e4
LT
1987 unsigned minor = MINOR(dev);
1988
1989 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
1990 return NULL;
1991
f32c10b0 1992 spin_lock(&_minor_lock);
1da177e4
LT
1993
1994 md = idr_find(&_minor_idr, minor);
49de5769
MS
1995 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
1996 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
1997 md = NULL;
1998 goto out;
fba9f90e 1999 }
49de5769 2000 dm_get(md);
fba9f90e 2001out:
f32c10b0 2002 spin_unlock(&_minor_lock);
1da177e4 2003
637842cf
DT
2004 return md;
2005}
3cf2e4ba 2006EXPORT_SYMBOL_GPL(dm_get_md);
d229a958 2007
9ade92a9 2008void *dm_get_mdptr(struct mapped_device *md)
637842cf 2009{
9ade92a9 2010 return md->interface_ptr;
1da177e4
LT
2011}
2012
2013void dm_set_mdptr(struct mapped_device *md, void *ptr)
2014{
2015 md->interface_ptr = ptr;
2016}
2017
2018void dm_get(struct mapped_device *md)
2019{
2020 atomic_inc(&md->holders);
3f77316d 2021 BUG_ON(test_bit(DMF_FREEING, &md->flags));
1da177e4
LT
2022}
2023
09ee96b2
MP
2024int dm_hold(struct mapped_device *md)
2025{
2026 spin_lock(&_minor_lock);
2027 if (test_bit(DMF_FREEING, &md->flags)) {
2028 spin_unlock(&_minor_lock);
2029 return -EBUSY;
2030 }
2031 dm_get(md);
2032 spin_unlock(&_minor_lock);
2033 return 0;
2034}
2035EXPORT_SYMBOL_GPL(dm_hold);
2036
72d94861
AK
2037const char *dm_device_name(struct mapped_device *md)
2038{
2039 return md->name;
2040}
2041EXPORT_SYMBOL_GPL(dm_device_name);
2042
3f77316d 2043static void __dm_destroy(struct mapped_device *md, bool wait)
1da177e4 2044{
1134e5ae 2045 struct dm_table *map;
83d5e5b0 2046 int srcu_idx;
1da177e4 2047
3f77316d 2048 might_sleep();
fba9f90e 2049
63a4f065 2050 spin_lock(&_minor_lock);
3f77316d
KU
2051 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2052 set_bit(DMF_FREEING, &md->flags);
2053 spin_unlock(&_minor_lock);
3b785fbc 2054
c12c9a3c 2055 blk_set_queue_dying(md->queue);
3f77316d 2056
ab7c7bb6
MP
2057 /*
2058 * Take suspend_lock so that presuspend and postsuspend methods
2059 * do not race with internal suspend.
2060 */
2061 mutex_lock(&md->suspend_lock);
2a708cff 2062 map = dm_get_live_table(md, &srcu_idx);
3f77316d
KU
2063 if (!dm_suspended_md(md)) {
2064 dm_table_presuspend_targets(map);
adc0daad 2065 set_bit(DMF_SUSPENDED, &md->flags);
5df96f2b 2066 set_bit(DMF_POST_SUSPENDING, &md->flags);
3f77316d 2067 dm_table_postsuspend_targets(map);
1da177e4 2068 }
83d5e5b0
MP
2069 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2070 dm_put_live_table(md, srcu_idx);
2a708cff 2071 mutex_unlock(&md->suspend_lock);
83d5e5b0 2072
3f77316d
KU
2073 /*
 2074 * Rare, but there may still be I/O requests in flight that
 2075 * have yet to complete. Wait for all references to disappear.
 2076 * No one may increment the reference count of the mapped_device
 2077 * once its state becomes DMF_FREEING.
2078 */
2079 if (wait)
2080 while (atomic_read(&md->holders))
2081 msleep(1);
2082 else if (atomic_read(&md->holders))
2083 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2084 dm_device_name(md), atomic_read(&md->holders));
2085
3f77316d
KU
2086 dm_table_destroy(__unbind(md));
2087 free_dev(md);
2088}
2089
2090void dm_destroy(struct mapped_device *md)
2091{
2092 __dm_destroy(md, true);
2093}
2094
2095void dm_destroy_immediate(struct mapped_device *md)
2096{
2097 __dm_destroy(md, false);
2098}
2099
2100void dm_put(struct mapped_device *md)
2101{
2102 atomic_dec(&md->holders);
1da177e4 2103}
79eb885c 2104EXPORT_SYMBOL_GPL(dm_put);
1da177e4 2105
85067747
ML
2106static bool md_in_flight_bios(struct mapped_device *md)
2107{
2108 int cpu;
8446fe92 2109 struct block_device *part = dm_disk(md)->part0;
85067747
ML
2110 long sum = 0;
2111
2112 for_each_possible_cpu(cpu) {
2113 sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
2114 sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
2115 }
2116
2117 return sum != 0;
2118}
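
md_in_flight_bios() sums lock-free per-CPU in-flight counters instead of maintaining one shared atomic, so the hot path never bounces a cache line. A stand-alone sketch of that technique with hypothetical counters:

#include <linux/percpu.h>

static DEFINE_PER_CPU(long, inflight_reads);
static DEFINE_PER_CPU(long, inflight_writes);

/* The summation cost is paid only by the infrequent
 * "is anything still in flight?" check. */
static bool anything_in_flight(void)
{
	long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		sum += per_cpu(inflight_reads, cpu);
		sum += per_cpu(inflight_writes, cpu);
	}
	return sum != 0;
}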
2119
2f064a59 2120static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
46125c1c
MB
2121{
2122 int r = 0;
9f4c3f87 2123 DEFINE_WAIT(wait);
46125c1c 2124
85067747 2125 while (true) {
9f4c3f87 2126 prepare_to_wait(&md->wait, &wait, task_state);
46125c1c 2127
85067747 2128 if (!md_in_flight_bios(md))
46125c1c
MB
2129 break;
2130
e3fabdfd 2131 if (signal_pending_state(task_state, current)) {
46125c1c
MB
2132 r = -EINTR;
2133 break;
2134 }
2135
2136 io_schedule();
2137 }
9f4c3f87 2138 finish_wait(&md->wait, &wait);
b44ebeb0 2139
46125c1c
MB
2140 return r;
2141}
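
The loop above is the waiter half of an open-coded wait-queue handshake; the I/O completion path elsewhere in dm.c wakes md->wait when the in-flight count drops. A minimal two-sided sketch of the same pattern, using hypothetical names:

#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(drain_wq);
static atomic_t pending = ATOMIC_INIT(0);

/* Completion side: drop the count and wake any waiter. */
static void one_done(void)
{
	if (atomic_dec_and_test(&pending))
		wake_up(&drain_wq);
}

/* Waiter side: re-check the condition after every prepare_to_wait() so a
 * wake-up between the check and the sleep is never lost. */
static void wait_for_drain(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&drain_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (!atomic_read(&pending))
			break;
		io_schedule();
	}
	finish_wait(&drain_wq, &wait);
}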
2142
2f064a59 2143static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
85067747
ML
2144{
2145 int r = 0;
2146
2147 if (!queue_is_mq(md->queue))
2148 return dm_wait_for_bios_completion(md, task_state);
2149
2150 while (true) {
2151 if (!blk_mq_queue_inflight(md->queue))
2152 break;
2153
2154 if (signal_pending_state(task_state, current)) {
2155 r = -EINTR;
2156 break;
2157 }
2158
2159 msleep(5);
2160 }
2161
2162 return r;
2163}
2164
1da177e4
LT
2165/*
2166 * Process the deferred bios
2167 */
ef208587 2168static void dm_wq_work(struct work_struct *work)
1da177e4 2169{
0c2915b8
MS
2170 struct mapped_device *md = container_of(work, struct mapped_device, work);
2171 struct bio *bio;
ef208587 2172
3b00b203 2173 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
df12ee99 2174 spin_lock_irq(&md->deferred_lock);
0c2915b8 2175 bio = bio_list_pop(&md->deferred);
df12ee99
AK
2176 spin_unlock_irq(&md->deferred_lock);
2177
0c2915b8 2178 if (!bio)
df12ee99 2179 break;
022c2611 2180
0c2915b8 2181 submit_bio_noacct(bio);
022c2611 2182 }
1da177e4
LT
2183}
2184
9a1fb464 2185static void dm_queue_flush(struct mapped_device *md)
304f3f6a 2186{
3b00b203 2187 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
4e857c58 2188 smp_mb__after_atomic();
53d5914f 2189 queue_work(md->wq, &md->work);
304f3f6a
MB
2190}
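
dm_queue_flush() clears the blocking flag and only then kicks the worker; the barrier after the atomic ensures the worker (dm_wq_work above) cannot still observe the stale flag when its work item runs. A compressed sketch of that flag-then-kick pairing with hypothetical names and helper:

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/workqueue.h>

#define FLAG_BLOCKED 0

static unsigned long state;
static bool process_one_deferred_item(void);	/* hypothetical helper */

/* Worker: bails out as soon as the flag is set again. */
static void drain_fn(struct work_struct *work)
{
	while (!test_bit(FLAG_BLOCKED, &state))
		if (!process_one_deferred_item())
			break;
}
static DECLARE_WORK(drain_work, drain_fn);

/* Producer: allow processing again, then kick the worker.  The barrier
 * orders the flag clear before the enqueue that runs the worker. */
static void unblock_and_kick(void)
{
	clear_bit(FLAG_BLOCKED, &state);
	smp_mb__after_atomic();
	queue_work(system_wq, &drain_work);
}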
2191
1da177e4 2192/*
042d2a9b 2193 * Swap in a new table, returning the old one for the caller to destroy.
1da177e4 2194 */
042d2a9b 2195struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
1da177e4 2196{
87eb5b21 2197 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
754c5fc7 2198 struct queue_limits limits;
042d2a9b 2199 int r;
1da177e4 2200
e61290a4 2201 mutex_lock(&md->suspend_lock);
1da177e4
LT
2202
2203 /* device must be suspended */
4f186f8b 2204 if (!dm_suspended_md(md))
93c534ae 2205 goto out;
1da177e4 2206
3ae70656
MS
2207 /*
2208 * If the new table has no data devices, retain the existing limits.
 2209 * This helps multipath with queue_if_no_path: if all paths disappear,
 2210 * new I/O is still queued based on these retained limits until some
 2211 * paths reappear.
2212 */
2213 if (dm_table_has_no_data_devices(table)) {
83d5e5b0 2214 live_map = dm_get_live_table_fast(md);
3ae70656
MS
2215 if (live_map)
2216 limits = md->queue->limits;
83d5e5b0 2217 dm_put_live_table_fast(md);
3ae70656
MS
2218 }
2219
87eb5b21
MC
2220 if (!live_map) {
2221 r = dm_calculate_queue_limits(table, &limits);
2222 if (r) {
2223 map = ERR_PTR(r);
2224 goto out;
2225 }
042d2a9b 2226 }
754c5fc7 2227
042d2a9b 2228 map = __bind(md, table, &limits);
62e08243 2229 dm_issue_global_event();
1da177e4 2230
93c534ae 2231out:
e61290a4 2232 mutex_unlock(&md->suspend_lock);
042d2a9b 2233 return map;
1da177e4
LT
2234}
2235
2236/*
2237 * Functions to lock and unlock any filesystem running on the
2238 * device.
2239 */
2ca3310e 2240static int lock_fs(struct mapped_device *md)
1da177e4 2241{
e39e2e95 2242 int r;
1da177e4 2243
040f04bd 2244 WARN_ON(test_bit(DMF_FROZEN, &md->flags));
aa8d7c2f 2245
977115c0 2246 r = freeze_bdev(md->disk->part0);
040f04bd
CH
2247 if (!r)
2248 set_bit(DMF_FROZEN, &md->flags);
2249 return r;
1da177e4
LT
2250}
2251
2ca3310e 2252static void unlock_fs(struct mapped_device *md)
1da177e4 2253{
aa8d7c2f
AK
2254 if (!test_bit(DMF_FROZEN, &md->flags))
2255 return;
977115c0 2256 thaw_bdev(md->disk->part0);
aa8d7c2f 2257 clear_bit(DMF_FROZEN, &md->flags);
1da177e4
LT
2258}
2259
2260/*
b48633f8
BVA
2261 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2262 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2263 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2264 *
ffcc3936
MS
2265 * If __dm_suspend returns 0, the device is completely quiescent
2266 * now. There is no request-processing activity. All new requests
2267 * are being added to md->deferred list.
cec47e3d 2268 */
ffcc3936 2269static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2f064a59 2270 unsigned suspend_flags, unsigned int task_state,
eaf9a736 2271 int dmf_suspended_flag)
1da177e4 2272{
ffcc3936
MS
2273 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2274 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2275 int r;
1da177e4 2276
5a8f1f80
BVA
2277 lockdep_assert_held(&md->suspend_lock);
2278
2e93ccc1
KU
2279 /*
2280 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2281 * This flag is cleared before dm_suspend returns.
2282 */
2283 if (noflush)
2284 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
86331f39 2285 else
ac75b09f 2286 DMDEBUG("%s: suspending with flush", dm_device_name(md));
2e93ccc1 2287
d67ee213
MS
2288 /*
2289 * This gets reverted if there's an error later and the targets
2290 * provide the .presuspend_undo hook.
2291 */
cf222b37
AK
2292 dm_table_presuspend_targets(map);
2293
32a926da 2294 /*
9f518b27
KU
2295 * Flush I/O to the device.
2296 * Any I/O submitted after lock_fs() may not be flushed.
2297 * noflush takes precedence over do_lockfs.
2298 * (lock_fs() flushes I/Os and waits for them to complete.)
32a926da
MP
2299 */
2300 if (!noflush && do_lockfs) {
2301 r = lock_fs(md);
d67ee213
MS
2302 if (r) {
2303 dm_table_presuspend_undo_targets(map);
ffcc3936 2304 return r;
d67ee213 2305 }
aa8d7c2f 2306 }
1da177e4
LT
2307
2308 /*
3b00b203
MP
2309 * Here we must make sure that no processes are submitting requests
 2310 * to target drivers, i.e. no one may be executing
0cede372 2311 * __split_and_process_bio from dm_submit_bio.
3b00b203 2312 *
0cede372 2313 * To get all processes out of __split_and_process_bio in dm_submit_bio,
3b00b203 2314 * we take the write lock. To prevent any process from reentering
0cede372
MS
2315 * __split_and_process_bio from dm_submit_bio and quiesce the thread
2316 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
6a8736d1 2317 * flush_workqueue(md->wq).
1da177e4 2318 */
1eb787ec 2319 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
41abc4e1
HR
2320 if (map)
2321 synchronize_srcu(&md->io_barrier);
1da177e4 2322
d0bcb878 2323 /*
29e4013d
TH
2324 * Stop md->queue before flushing md->wq in case request-based
2325 * dm defers requests to md->wq from md->queue.
d0bcb878 2326 */
6a23e05c 2327 if (dm_request_based(md))
eca7ee6d 2328 dm_stop_queue(md->queue);
cec47e3d 2329
d0bcb878
KU
2330 flush_workqueue(md->wq);
2331
1da177e4 2332 /*
3b00b203
MP
2333 * At this point no more requests are entering target request routines.
2334 * We call dm_wait_for_completion to wait for all existing requests
2335 * to finish.
1da177e4 2336 */
b48633f8 2337 r = dm_wait_for_completion(md, task_state);
eaf9a736
MS
2338 if (!r)
2339 set_bit(dmf_suspended_flag, &md->flags);
1da177e4 2340
6d6f10df 2341 if (noflush)
022c2611 2342 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
41abc4e1
HR
2343 if (map)
2344 synchronize_srcu(&md->io_barrier);
2e93ccc1 2345
1da177e4 2346 /* were we interrupted ? */
46125c1c 2347 if (r < 0) {
9a1fb464 2348 dm_queue_flush(md);
73d410c0 2349
cec47e3d 2350 if (dm_request_based(md))
eca7ee6d 2351 dm_start_queue(md->queue);
cec47e3d 2352
2ca3310e 2353 unlock_fs(md);
d67ee213 2354 dm_table_presuspend_undo_targets(map);
ffcc3936 2355 /* pushback list is already flushed, so skip flush */
2ca3310e 2356 }
1da177e4 2357
ffcc3936
MS
2358 return r;
2359}
2360
2361/*
 2362 * We need to be able to change a mapping table under a mounted
 2363 * filesystem. For example, we might want to move some data in
 2364 * the background. Before the table can be swapped with
 2365 * dm_bind_table, dm_suspend must be called to flush any in-flight
 2366 * bios and ensure that any further I/O gets deferred.
2367 */
2368/*
2369 * Suspend mechanism in request-based dm.
2370 *
2371 * 1. Flush all I/Os by lock_fs() if needed.
2372 * 2. Stop dispatching any I/O by stopping the request_queue.
2373 * 3. Wait for all in-flight I/Os to be completed or requeued.
2374 *
2375 * To abort suspend, start the request_queue.
2376 */
2377int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2378{
2379 struct dm_table *map = NULL;
2380 int r = 0;
2381
2382retry:
2383 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2384
2385 if (dm_suspended_md(md)) {
2386 r = -EINVAL;
2387 goto out_unlock;
2388 }
2389
2390 if (dm_suspended_internally_md(md)) {
2391 /* already internally suspended, wait for internal resume */
2392 mutex_unlock(&md->suspend_lock);
2393 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2394 if (r)
2395 return r;
2396 goto retry;
2397 }
2398
a12f5d48 2399 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
ffcc3936 2400
eaf9a736 2401 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
ffcc3936
MS
2402 if (r)
2403 goto out_unlock;
3b00b203 2404
5df96f2b 2405 set_bit(DMF_POST_SUSPENDING, &md->flags);
4d4471cb 2406 dm_table_postsuspend_targets(map);
5df96f2b 2407 clear_bit(DMF_POST_SUSPENDING, &md->flags);
4d4471cb 2408
d287483d 2409out_unlock:
e61290a4 2410 mutex_unlock(&md->suspend_lock);
cf222b37 2411 return r;
1da177e4
LT
2412}
2413
ffcc3936
MS
2414static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2415{
2416 if (map) {
2417 int r = dm_table_resume_targets(map);
2418 if (r)
2419 return r;
2420 }
2421
2422 dm_queue_flush(md);
2423
2424 /*
2425 * Flushing deferred I/Os must be done after targets are resumed
2426 * so that mapping of targets can work correctly.
 2427 * Request-based dm queues the deferred I/Os in its request_queue.
2428 */
2429 if (dm_request_based(md))
eca7ee6d 2430 dm_start_queue(md->queue);
ffcc3936
MS
2431
2432 unlock_fs(md);
2433
2434 return 0;
2435}
2436
1da177e4
LT
2437int dm_resume(struct mapped_device *md)
2438{
8dc23658 2439 int r;
cf222b37 2440 struct dm_table *map = NULL;
1da177e4 2441
ffcc3936 2442retry:
8dc23658 2443 r = -EINVAL;
ffcc3936
MS
2444 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2445
4f186f8b 2446 if (!dm_suspended_md(md))
cf222b37 2447 goto out;
cf222b37 2448
ffcc3936
MS
2449 if (dm_suspended_internally_md(md)) {
2450 /* already internally suspended, wait for internal resume */
2451 mutex_unlock(&md->suspend_lock);
2452 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2453 if (r)
2454 return r;
2455 goto retry;
2456 }
2457
a12f5d48 2458 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2ca3310e 2459 if (!map || !dm_table_get_size(map))
cf222b37 2460 goto out;
1da177e4 2461
ffcc3936 2462 r = __dm_resume(md, map);
8757b776
MB
2463 if (r)
2464 goto out;
2ca3310e 2465
2ca3310e 2466 clear_bit(DMF_SUSPENDED, &md->flags);
cf222b37 2467out:
e61290a4 2468 mutex_unlock(&md->suspend_lock);
2ca3310e 2469
cf222b37 2470 return r;
1da177e4
LT
2471}
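
Taken together, dm_suspend(), dm_swap_table() and dm_resume() implement the reload protocol the comments above describe. A sketch of how an in-kernel caller might drive it, roughly what the ioctl path does; error handling is trimmed and 'md'/'new_table' are assumed to be set up elsewhere:

/* Illustrative sketch, not a copy of the ioctl path. */
static int reload_table(struct mapped_device *md, struct dm_table *new_table)
{
	struct dm_table *old;
	int r;

	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
	if (r)
		return r;

	old = dm_swap_table(md, new_table);
	if (IS_ERR(old)) {
		dm_resume(md);
		return PTR_ERR(old);
	}
	if (old)
		dm_table_destroy(old);	/* the old map is the caller's to destroy */

	return dm_resume(md);
}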
2472
fd2ed4d2
MP
2473/*
2474 * Internal suspend/resume works like userspace-driven suspend. It waits
2475 * until all bios finish and prevents issuing new bios to the target drivers.
2476 * It may be used only from the kernel.
fd2ed4d2
MP
2477 */
2478
ffcc3936 2479static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
fd2ed4d2 2480{
ffcc3936
MS
2481 struct dm_table *map = NULL;
2482
1ea0654e
BVA
2483 lockdep_assert_held(&md->suspend_lock);
2484
96b26c8c 2485 if (md->internal_suspend_count++)
ffcc3936
MS
2486 return; /* nested internal suspend */
2487
2488 if (dm_suspended_md(md)) {
2489 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2490 return; /* nest suspend */
2491 }
2492
a12f5d48 2493 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
ffcc3936
MS
2494
2495 /*
2496 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2497 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
2498 * would require changing .presuspend to return an error -- avoid this
2499 * until there is a need for more elaborate variants of internal suspend.
2500 */
eaf9a736
MS
2501 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2502 DMF_SUSPENDED_INTERNALLY);
ffcc3936 2503
5df96f2b 2504 set_bit(DMF_POST_SUSPENDING, &md->flags);
ffcc3936 2505 dm_table_postsuspend_targets(map);
5df96f2b 2506 clear_bit(DMF_POST_SUSPENDING, &md->flags);
ffcc3936
MS
2507}
2508
2509static void __dm_internal_resume(struct mapped_device *md)
2510{
96b26c8c
MP
2511 BUG_ON(!md->internal_suspend_count);
2512
2513 if (--md->internal_suspend_count)
ffcc3936
MS
2514 return; /* resume from nested internal suspend */
2515
fd2ed4d2 2516 if (dm_suspended_md(md))
ffcc3936
MS
2517 goto done; /* resume from nested suspend */
2518
2519 /*
2520 * NOTE: existing callers don't need to call dm_table_resume_targets
2521 * (which may fail -- so best to avoid it for now by passing NULL map)
2522 */
2523 (void) __dm_resume(md, NULL);
2524
2525done:
2526 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2527 smp_mb__after_atomic();
2528 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2529}
2530
2531void dm_internal_suspend_noflush(struct mapped_device *md)
2532{
2533 mutex_lock(&md->suspend_lock);
2534 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2535 mutex_unlock(&md->suspend_lock);
2536}
2537EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2538
2539void dm_internal_resume(struct mapped_device *md)
2540{
2541 mutex_lock(&md->suspend_lock);
2542 __dm_internal_resume(md);
2543 mutex_unlock(&md->suspend_lock);
2544}
2545EXPORT_SYMBOL_GPL(dm_internal_resume);
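
For in-kernel users the two exports above always come as a pair around whatever work must not race with live bios. A minimal sketch, assuming the caller already holds a reference on 'md':

static void do_quiesced_work(struct mapped_device *md)
{
	dm_internal_suspend_noflush(md);

	/* ... inspect or update state that must not race with I/O ... */

	dm_internal_resume(md);
}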
2546
2547/*
2548 * Fast variants of internal suspend/resume hold md->suspend_lock,
2549 * which prevents interaction with userspace-driven suspend.
2550 */
2551
2552void dm_internal_suspend_fast(struct mapped_device *md)
2553{
2554 mutex_lock(&md->suspend_lock);
2555 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
fd2ed4d2
MP
2556 return;
2557
2558 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2559 synchronize_srcu(&md->io_barrier);
2560 flush_workqueue(md->wq);
2561 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2562}
b735fede 2563EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
fd2ed4d2 2564
ffcc3936 2565void dm_internal_resume_fast(struct mapped_device *md)
fd2ed4d2 2566{
ffcc3936 2567 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
fd2ed4d2
MP
2568 goto done;
2569
2570 dm_queue_flush(md);
2571
2572done:
2573 mutex_unlock(&md->suspend_lock);
2574}
b735fede 2575EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
fd2ed4d2 2576
1da177e4
LT
2577/*-----------------------------------------------------------------
2578 * Event notification.
2579 *---------------------------------------------------------------*/
3abf85b5 2580int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
60935eb2 2581 unsigned cookie)
69267a30 2582{
6958c1c6
MP
2583 int r;
2584 unsigned noio_flag;
60935eb2
MB
2585 char udev_cookie[DM_COOKIE_LENGTH];
2586 char *envp[] = { udev_cookie, NULL };
2587
6958c1c6
MP
2588 noio_flag = memalloc_noio_save();
2589
60935eb2 2590 if (!cookie)
6958c1c6 2591 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
60935eb2
MB
2592 else {
2593 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2594 DM_COOKIE_ENV_VAR_NAME, cookie);
6958c1c6
MP
2595 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2596 action, envp);
60935eb2 2597 }
6958c1c6
MP
2598
2599 memalloc_noio_restore(noio_flag);
2600
2601 return r;
69267a30
AK
2602}
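
The memalloc_noio_save()/restore() pair above scopes every allocation made while sending the uevent to GFP_NOIO, which matters because this can run while the device is suspended. A generic sketch of that scoping pattern, with a hypothetical helper:

#include <linux/sched/mm.h>

static int some_allocating_helper(void);	/* hypothetical */

static int do_something_while_suspended(void)
{
	unsigned int noio_flag;
	int r;

	noio_flag = memalloc_noio_save();
	/* Any allocation in here behaves as GFP_NOIO and therefore cannot
	 * recurse into block I/O on the device we just quiesced. */
	r = some_allocating_helper();
	memalloc_noio_restore(noio_flag);

	return r;
}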
2603
7a8c3d3b
MA
2604uint32_t dm_next_uevent_seq(struct mapped_device *md)
2605{
2606 return atomic_add_return(1, &md->uevent_seq);
2607}
2608
1da177e4
LT
2609uint32_t dm_get_event_nr(struct mapped_device *md)
2610{
2611 return atomic_read(&md->event_nr);
2612}
2613
2614int dm_wait_event(struct mapped_device *md, int event_nr)
2615{
2616 return wait_event_interruptible(md->eventq,
2617 (event_nr != atomic_read(&md->event_nr)));
2618}
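
dm_get_event_nr() and dm_wait_event() are meant to be used as a snapshot-then-wait pair, so an event that fires between the two calls is not missed. A sketch of a hypothetical consumer loop:

static void handle_device_event(struct mapped_device *md);	/* hypothetical */

static int watch_events(struct mapped_device *md)
{
	uint32_t seen = dm_get_event_nr(md);

	for (;;) {
		if (dm_wait_event(md, seen))
			return -ERESTARTSYS;	/* interrupted by a signal */
		seen = dm_get_event_nr(md);
		handle_device_event(md);
	}
}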
2619
7a8c3d3b
MA
2620void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2621{
2622 unsigned long flags;
2623
2624 spin_lock_irqsave(&md->uevent_lock, flags);
2625 list_add(elist, &md->uevent_list);
2626 spin_unlock_irqrestore(&md->uevent_lock, flags);
2627}
2628
1da177e4
LT
2629/*
2630 * The gendisk is only valid as long as you have a reference
2631 * count on 'md'.
2632 */
2633struct gendisk *dm_disk(struct mapped_device *md)
2634{
2635 return md->disk;
2636}
65ff5b7d 2637EXPORT_SYMBOL_GPL(dm_disk);
1da177e4 2638
784aae73
MB
2639struct kobject *dm_kobject(struct mapped_device *md)
2640{
2995fa78 2641 return &md->kobj_holder.kobj;
784aae73
MB
2642}
2643
784aae73
MB
2644struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2645{
2646 struct mapped_device *md;
2647
2995fa78 2648 md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
784aae73 2649
b9a41d21
HT
2650 spin_lock(&_minor_lock);
2651 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2652 md = NULL;
2653 goto out;
2654 }
784aae73 2655 dm_get(md);
b9a41d21
HT
2656out:
2657 spin_unlock(&_minor_lock);
2658
784aae73
MB
2659 return md;
2660}
2661
4f186f8b 2662int dm_suspended_md(struct mapped_device *md)
1da177e4
LT
2663{
2664 return test_bit(DMF_SUSPENDED, &md->flags);
2665}
2666
5df96f2b
MP
2667static int dm_post_suspending_md(struct mapped_device *md)
2668{
2669 return test_bit(DMF_POST_SUSPENDING, &md->flags);
2670}
2671
ffcc3936
MS
2672int dm_suspended_internally_md(struct mapped_device *md)
2673{
2674 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2675}
2676
2c140a24
MP
2677int dm_test_deferred_remove_flag(struct mapped_device *md)
2678{
2679 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2680}
2681
64dbce58
KU
2682int dm_suspended(struct dm_target *ti)
2683{
33bd6f06 2684 return dm_suspended_md(ti->table->md);
64dbce58
KU
2685}
2686EXPORT_SYMBOL_GPL(dm_suspended);
2687
5df96f2b
MP
2688int dm_post_suspending(struct dm_target *ti)
2689{
33bd6f06 2690 return dm_post_suspending_md(ti->table->md);
5df96f2b
MP
2691}
2692EXPORT_SYMBOL_GPL(dm_post_suspending);
2693
2e93ccc1
KU
2694int dm_noflush_suspending(struct dm_target *ti)
2695{
33bd6f06 2696 return __noflush_suspending(ti->table->md);
2e93ccc1
KU
2697}
2698EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2699
7e0d574f 2700struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
0776aa0e
MS
2701 unsigned integrity, unsigned per_io_data_size,
2702 unsigned min_pool_size)
e6ee8c0b 2703{
115485e8 2704 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
78d8e58a 2705 unsigned int pool_size = 0;
64f52b0e 2706 unsigned int front_pad, io_front_pad;
6f1c819c 2707 int ret;
e6ee8c0b
KU
2708
2709 if (!pools)
4e6e36c3 2710 return NULL;
e6ee8c0b 2711
78d8e58a
MS
2712 switch (type) {
2713 case DM_TYPE_BIO_BASED:
545ed20e 2714 case DM_TYPE_DAX_BIO_BASED:
0776aa0e 2715 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
62f26317
JX
2716 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
2717 io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
6f1c819c
KO
2718 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
2719 if (ret)
64f52b0e 2720 goto out;
6f1c819c 2721 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
eb8db831 2722 goto out;
78d8e58a
MS
2723 break;
2724 case DM_TYPE_REQUEST_BASED:
0776aa0e 2725 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
78d8e58a 2726 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
591ddcfc 2727 /* per_io_data_size is used for blk-mq pdu at queue allocation */
78d8e58a
MS
2728 break;
2729 default:
2730 BUG();
2731 }
2732
6f1c819c
KO
2733 ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
2734 if (ret)
5f015204 2735 goto out;
e6ee8c0b 2736
6f1c819c 2737 if (integrity && bioset_integrity_create(&pools->bs, pool_size))
5f015204 2738 goto out;
a91a2785 2739
e6ee8c0b 2740 return pools;
5f1b670d 2741
5f1b670d
CH
2742out:
2743 dm_free_md_mempools(pools);
78d8e58a 2744
4e6e36c3 2745 return NULL;
e6ee8c0b
KU
2746}
2747
2748void dm_free_md_mempools(struct dm_md_mempools *pools)
2749{
2750 if (!pools)
2751 return;
2752
6f1c819c
KO
2753 bioset_exit(&pools->bs);
2754 bioset_exit(&pools->io_bs);
e6ee8c0b
KU
2755
2756 kfree(pools);
2757}
2758
9c72bad1
CH
2759struct dm_pr {
2760 u64 old_key;
2761 u64 new_key;
2762 u32 flags;
2763 bool fail_early;
2764};
2765
2766static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
2767 void *data)
71cdb697
CH
2768{
2769 struct mapped_device *md = bdev->bd_disk->private_data;
9c72bad1
CH
2770 struct dm_table *table;
2771 struct dm_target *ti;
2772 int ret = -ENOTTY, srcu_idx;
71cdb697 2773
9c72bad1
CH
2774 table = dm_get_live_table(md, &srcu_idx);
2775 if (!table || !dm_table_get_size(table))
2776 goto out;
71cdb697 2777
9c72bad1
CH
2778 /* We only support devices that have a single target */
2779 if (dm_table_get_num_targets(table) != 1)
2780 goto out;
2781 ti = dm_table_get_target(table, 0);
71cdb697 2782
9c72bad1
CH
2783 ret = -EINVAL;
2784 if (!ti->type->iterate_devices)
2785 goto out;
2786
2787 ret = ti->type->iterate_devices(ti, fn, data);
2788out:
2789 dm_put_live_table(md, srcu_idx);
2790 return ret;
2791}
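
dm_call_pr() simply funnels a caller-supplied callout through the single target's .iterate_devices hook; __dm_pr_register() below is one such callout. For illustration, a hypothetical callout that merely counts the underlying paths, matching the iterate_devices_callout_fn signature:

/* Hypothetical callout: count how many block devices back the target. */
static int count_paths(struct dm_target *ti, struct dm_dev *dev,
		       sector_t start, sector_t len, void *data)
{
	unsigned int *count = data;

	(*count)++;
	return 0;	/* a non-zero return typically stops the iteration */
}

/* Usage sketch: unsigned int n = 0; dm_call_pr(bdev, count_paths, &n); */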
2792
2793/*
2794 * For register / unregister we need to manually call out to every path.
2795 */
2796static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
2797 sector_t start, sector_t len, void *data)
2798{
2799 struct dm_pr *pr = data;
2800 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
2801
2802 if (!ops || !ops->pr_register)
2803 return -EOPNOTSUPP;
2804 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
2805}
2806
2807static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
2808 u32 flags)
2809{
2810 struct dm_pr pr = {
2811 .old_key = old_key,
2812 .new_key = new_key,
2813 .flags = flags,
2814 .fail_early = true,
2815 };
2816 int ret;
2817
2818 ret = dm_call_pr(bdev, __dm_pr_register, &pr);
2819 if (ret && new_key) {
2820 /* unregister all paths if we failed to register any path */
2821 pr.old_key = new_key;
2822 pr.new_key = 0;
2823 pr.flags = 0;
2824 pr.fail_early = false;
2825 dm_call_pr(bdev, __dm_pr_register, &pr);
2826 }
2827
2828 return ret;
71cdb697
CH
2829}
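
These pr_ops are reached from user space through the generic persistent-reservation ioctls in <linux/pr.h>. A user-space sketch of the call that ends up in dm_pr_register() for a dm device; the device path and key below are made up:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/pr.h>

static int pr_register_key(const char *dev, __u64 key)
{
	struct pr_registration reg = { .old_key = 0, .new_key = key };
	int fd, r;

	fd = open(dev, O_RDWR);
	if (fd < 0)
		return -1;

	r = ioctl(fd, IOC_PR_REGISTER, &reg);	/* dispatched to dm_pr_register() */
	close(fd);
	return r;
}

/* e.g. pr_register_key("/dev/dm-0", 0x1234); */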
2830
2831static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
956a4025 2832 u32 flags)
71cdb697
CH
2833{
2834 struct mapped_device *md = bdev->bd_disk->private_data;
2835 const struct pr_ops *ops;
971888c4 2836 int r, srcu_idx;
71cdb697 2837
5bd5e8d8 2838 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
71cdb697 2839 if (r < 0)
971888c4 2840 goto out;
71cdb697
CH
2841
2842 ops = bdev->bd_disk->fops->pr_ops;
2843 if (ops && ops->pr_reserve)
2844 r = ops->pr_reserve(bdev, key, type, flags);
2845 else
2846 r = -EOPNOTSUPP;
971888c4
MS
2847out:
2848 dm_unprepare_ioctl(md, srcu_idx);
71cdb697
CH
2849 return r;
2850}
2851
2852static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2853{
2854 struct mapped_device *md = bdev->bd_disk->private_data;
2855 const struct pr_ops *ops;
971888c4 2856 int r, srcu_idx;
71cdb697 2857
5bd5e8d8 2858 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
71cdb697 2859 if (r < 0)
971888c4 2860 goto out;
71cdb697
CH
2861
2862 ops = bdev->bd_disk->fops->pr_ops;
2863 if (ops && ops->pr_release)
2864 r = ops->pr_release(bdev, key, type);
2865 else
2866 r = -EOPNOTSUPP;
971888c4
MS
2867out:
2868 dm_unprepare_ioctl(md, srcu_idx);
71cdb697
CH
2869 return r;
2870}
2871
2872static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
956a4025 2873 enum pr_type type, bool abort)
71cdb697
CH
2874{
2875 struct mapped_device *md = bdev->bd_disk->private_data;
2876 const struct pr_ops *ops;
971888c4 2877 int r, srcu_idx;
71cdb697 2878
5bd5e8d8 2879 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
71cdb697 2880 if (r < 0)
971888c4 2881 goto out;
71cdb697
CH
2882
2883 ops = bdev->bd_disk->fops->pr_ops;
2884 if (ops && ops->pr_preempt)
2885 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
2886 else
2887 r = -EOPNOTSUPP;
971888c4
MS
2888out:
2889 dm_unprepare_ioctl(md, srcu_idx);
71cdb697
CH
2890 return r;
2891}
2892
2893static int dm_pr_clear(struct block_device *bdev, u64 key)
2894{
2895 struct mapped_device *md = bdev->bd_disk->private_data;
2896 const struct pr_ops *ops;
971888c4 2897 int r, srcu_idx;
71cdb697 2898
5bd5e8d8 2899 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
71cdb697 2900 if (r < 0)
971888c4 2901 goto out;
71cdb697
CH
2902
2903 ops = bdev->bd_disk->fops->pr_ops;
2904 if (ops && ops->pr_clear)
2905 r = ops->pr_clear(bdev, key);
2906 else
2907 r = -EOPNOTSUPP;
971888c4
MS
2908out:
2909 dm_unprepare_ioctl(md, srcu_idx);
71cdb697
CH
2910 return r;
2911}
2912
2913static const struct pr_ops dm_pr_ops = {
2914 .pr_register = dm_pr_register,
2915 .pr_reserve = dm_pr_reserve,
2916 .pr_release = dm_pr_release,
2917 .pr_preempt = dm_pr_preempt,
2918 .pr_clear = dm_pr_clear,
2919};
2920
83d5cde4 2921static const struct block_device_operations dm_blk_dops = {
c62b37d9 2922 .submit_bio = dm_submit_bio,
1da177e4
LT
2923 .open = dm_blk_open,
2924 .release = dm_blk_close,
aa129a22 2925 .ioctl = dm_blk_ioctl,
3ac51e74 2926 .getgeo = dm_blk_getgeo,
e76239a3 2927 .report_zones = dm_blk_report_zones,
71cdb697 2928 .pr_ops = &dm_pr_ops,
1da177e4
LT
2929 .owner = THIS_MODULE
2930};
2931
681cc5e8
MS
2932static const struct block_device_operations dm_rq_blk_dops = {
2933 .open = dm_blk_open,
2934 .release = dm_blk_close,
2935 .ioctl = dm_blk_ioctl,
2936 .getgeo = dm_blk_getgeo,
2937 .pr_ops = &dm_pr_ops,
2938 .owner = THIS_MODULE
2939};
2940
f26c5719
DW
2941static const struct dax_operations dm_dax_ops = {
2942 .direct_access = dm_dax_direct_access,
cdf6cdcd 2943 .zero_page_range = dm_dax_zero_page_range,
f26c5719
DW
2944};
2945
1da177e4
LT
2946/*
2947 * module hooks
2948 */
2949module_init(dm_init);
2950module_exit(dm_exit);
2951
2952module_param(major, uint, 0);
2953MODULE_PARM_DESC(major, "The major number of the device mapper");
f4790826 2954
e8603136
MS
2955module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
2956MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
2957
115485e8
MS
2958module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
2959MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
2960
a666e5c0
MP
2961module_param(swap_bios, int, S_IRUGO | S_IWUSR);
2962MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
2963
1da177e4
LT
2964MODULE_DESCRIPTION(DM_NAME " driver");
2965MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2966MODULE_LICENSE("GPL");