dm: refactor dm_split_and_process_bio a bit
[linux-block.git] / drivers / md / dm.c
CommitLineData
1da177e4
LT
1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
784aae73 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
1da177e4
LT
4 *
5 * This file is released under the GPL.
6 */
7
4cc96131
MS
8#include "dm-core.h"
9#include "dm-rq.h"
51e5b2bd 10#include "dm-uevent.h"
91ccbbac 11#include "dm-ima.h"
1da177e4
LT
12
13#include <linux/init.h>
14#include <linux/module.h>
48c9c27b 15#include <linux/mutex.h>
6958c1c6 16#include <linux/sched/mm.h>
174cd4b1 17#include <linux/sched/signal.h>
1da177e4
LT
18#include <linux/blkpg.h>
19#include <linux/bio.h>
1da177e4 20#include <linux/mempool.h>
f26c5719 21#include <linux/dax.h>
1da177e4
LT
22#include <linux/slab.h>
23#include <linux/idr.h>
7e026c8c 24#include <linux/uio.h>
3ac51e74 25#include <linux/hdreg.h>
3f77316d 26#include <linux/delay.h>
ffcc3936 27#include <linux/wait.h>
71cdb697 28#include <linux/pr.h>
b0b4d7c6 29#include <linux/refcount.h>
c6a564ff 30#include <linux/part_stat.h>
a892c8d5 31#include <linux/blk-crypto.h>
1e8d44bd 32#include <linux/blk-crypto-profile.h>
55782138 33
72d94861
AK
34#define DM_MSG_PREFIX "core"
35
60935eb2
MB
36/*
37 * Cookies are numeric values sent with CHANGE and REMOVE
38 * uevents while resuming, removing or renaming the device.
39 */
40#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
41#define DM_COOKIE_LENGTH 24
42
1da177e4
LT
43static const char *_name = DM_NAME;
44
45static unsigned int major = 0;
46static unsigned int _major = 0;
47
d15b774c
AK
48static DEFINE_IDR(_minor_idr);
49
f32c10b0 50static DEFINE_SPINLOCK(_minor_lock);
2c140a24
MP
51
52static void do_deferred_remove(struct work_struct *w);
53
54static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
55
acfe0ad7
MP
56static struct workqueue_struct *deferred_remove_workqueue;
57
93e6442c
MP
58atomic_t dm_global_event_nr = ATOMIC_INIT(0);
59DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
60
62e08243
MP
61void dm_issue_global_event(void)
62{
63 atomic_inc(&dm_global_event_nr);
64 wake_up(&dm_global_eventq);
65}
66
1da177e4 67/*
64f52b0e 68 * One of these is allocated (on-stack) per original bio.
1da177e4 69 */
64f52b0e 70struct clone_info {
64f52b0e
MS
71 struct dm_table *map;
72 struct bio *bio;
73 struct dm_io *io;
74 sector_t sector;
75 unsigned sector_count;
76};
77
62f26317
JX
78#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
79#define DM_IO_BIO_OFFSET \
80 (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))
81
6c23f0bd
CH
82static inline struct dm_target_io *clone_to_tio(struct bio *clone)
83{
84 return container_of(clone, struct dm_target_io, clone);
85}
86
64f52b0e
MS
87void *dm_per_bio_data(struct bio *bio, size_t data_size)
88{
6c23f0bd 89 if (!clone_to_tio(bio)->inside_dm_io)
62f26317
JX
90 return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
91 return (char *)bio - DM_IO_BIO_OFFSET - data_size;
64f52b0e
MS
92}
93EXPORT_SYMBOL_GPL(dm_per_bio_data);
94
95struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
96{
97 struct dm_io *io = (struct dm_io *)((char *)data + data_size);
98 if (io->magic == DM_IO_MAGIC)
62f26317 99 return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
64f52b0e 100 BUG_ON(io->magic != DM_TIO_MAGIC);
62f26317 101 return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
64f52b0e
MS
102}
103EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
104
105unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
106{
107 return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
108}
109EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
110
ba61fdd1
JM
111#define MINOR_ALLOCED ((void *)-1)
112
115485e8 113#define DM_NUMA_NODE NUMA_NO_NODE
115485e8 114static int dm_numa_node = DM_NUMA_NODE;
faad87df 115
a666e5c0
MP
116#define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE)
117static int swap_bios = DEFAULT_SWAP_BIOS;
118static int get_swap_bios(void)
119{
120 int latch = READ_ONCE(swap_bios);
121 if (unlikely(latch <= 0))
122 latch = DEFAULT_SWAP_BIOS;
123 return latch;
124}
125
e6ee8c0b
KU
126/*
127 * For mempools pre-allocation at the table loading time.
128 */
129struct dm_md_mempools {
6f1c819c
KO
130 struct bio_set bs;
131 struct bio_set io_bs;
e6ee8c0b
KU
132};
133
86f1152b
BM
134struct table_device {
135 struct list_head list;
b0b4d7c6 136 refcount_t count;
86f1152b
BM
137 struct dm_dev dm_dev;
138};
139
e8603136
MS
140/*
141 * Bio-based DM's mempools' reserved IOs set by the user.
142 */
4cc96131 143#define RESERVED_BIO_BASED_IOS 16
e8603136
MS
144static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
145
115485e8
MS
/*
 * Read *module_param and clamp it to [min, max].
 *
 * When the value was out of range, try to store the clamped value back with
 * cmpxchg() (the result is deliberately ignored) and return the clamped value.
 */
static int __dm_get_module_param_int(int *module_param, int min, int max)
{
	int param = READ_ONCE(*module_param);
	int clamped;

	if (param < min)
		clamped = min;
	else if (param > max)
		clamped = max;
	else
		return param;

	(void)cmpxchg(module_param, param, clamped);

	return clamped;
}
166
4cc96131
MS
/*
 * Read *module_param, substituting 'def' when it is zero and 'max' when it
 * exceeds 'max'. A non-zero substitute is also written back with cmpxchg()
 * (result ignored) before being returned.
 */
unsigned __dm_get_module_param(unsigned *module_param,
			       unsigned def, unsigned max)
{
	unsigned param = READ_ONCE(*module_param);
	unsigned replacement = 0;

	if (!param)
		replacement = def;
	else if (param > max)
		replacement = max;

	/* Zero means "nothing to replace" (also the case when def is 0). */
	if (!replacement)
		return param;

	(void)cmpxchg(module_param, param, replacement);

	return replacement;
}
185
e8603136
MS
186unsigned dm_get_reserved_bio_based_ios(void)
187{
09c2d531 188 return __dm_get_module_param(&reserved_bio_based_ios,
4cc96131 189 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
e8603136
MS
190}
191EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
192
115485e8
MS
193static unsigned dm_get_numa_node(void)
194{
195 return __dm_get_module_param_int(&dm_numa_node,
196 DM_NUMA_NODE, num_online_nodes() - 1);
197}
198
1da177e4
LT
199static int __init local_init(void)
200{
e689fbab 201 int r;
1ae49ea2 202
51e5b2bd 203 r = dm_uevent_init();
51157b4a 204 if (r)
e689fbab 205 return r;
51e5b2bd 206
acfe0ad7
MP
207 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
208 if (!deferred_remove_workqueue) {
209 r = -ENOMEM;
210 goto out_uevent_exit;
211 }
212
1da177e4
LT
213 _major = major;
214 r = register_blkdev(_major, _name);
51157b4a 215 if (r < 0)
acfe0ad7 216 goto out_free_workqueue;
1da177e4
LT
217
218 if (!_major)
219 _major = r;
220
221 return 0;
51157b4a 222
acfe0ad7
MP
223out_free_workqueue:
224 destroy_workqueue(deferred_remove_workqueue);
51157b4a
KU
225out_uevent_exit:
226 dm_uevent_exit();
51157b4a
KU
227
228 return r;
1da177e4
LT
229}
230
231static void local_exit(void)
232{
2c140a24 233 flush_scheduled_work();
acfe0ad7 234 destroy_workqueue(deferred_remove_workqueue);
2c140a24 235
00d59405 236 unregister_blkdev(_major, _name);
51e5b2bd 237 dm_uevent_exit();
1da177e4
LT
238
239 _major = 0;
240
241 DMINFO("cleaned up");
242}
243
b9249e55 244static int (*_inits[])(void) __initdata = {
1da177e4
LT
245 local_init,
246 dm_target_init,
247 dm_linear_init,
248 dm_stripe_init,
952b3557 249 dm_io_init,
945fa4d2 250 dm_kcopyd_init,
1da177e4 251 dm_interface_init,
fd2ed4d2 252 dm_statistics_init,
1da177e4
LT
253};
254
b9249e55 255static void (*_exits[])(void) = {
1da177e4
LT
256 local_exit,
257 dm_target_exit,
258 dm_linear_exit,
259 dm_stripe_exit,
952b3557 260 dm_io_exit,
945fa4d2 261 dm_kcopyd_exit,
1da177e4 262 dm_interface_exit,
fd2ed4d2 263 dm_statistics_exit,
1da177e4
LT
264};
265
266static int __init dm_init(void)
267{
268 const int count = ARRAY_SIZE(_inits);
1da177e4
LT
269 int r, i;
270
f1cd6cb2
TS
271#if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE))
272 DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled."
273 " Duplicate IMA measurements will not be recorded in the IMA log.");
274#endif
275
1da177e4
LT
276 for (i = 0; i < count; i++) {
277 r = _inits[i]();
278 if (r)
279 goto bad;
280 }
281
282 return 0;
f1cd6cb2 283bad:
1da177e4
LT
284 while (i--)
285 _exits[i]();
286
287 return r;
288}
289
290static void __exit dm_exit(void)
291{
292 int i = ARRAY_SIZE(_exits);
293
294 while (i--)
295 _exits[i]();
d15b774c
AK
296
297 /*
298 * Should be empty by this point.
299 */
d15b774c 300 idr_destroy(&_minor_idr);
1da177e4
LT
301}
302
303/*
304 * Block device functions
305 */
432a212c
MA
306int dm_deleting_md(struct mapped_device *md)
307{
308 return test_bit(DMF_DELETING, &md->flags);
309}
310
fe5f9f2c 311static int dm_blk_open(struct block_device *bdev, fmode_t mode)
1da177e4
LT
312{
313 struct mapped_device *md;
314
fba9f90e
JM
315 spin_lock(&_minor_lock);
316
fe5f9f2c 317 md = bdev->bd_disk->private_data;
fba9f90e
JM
318 if (!md)
319 goto out;
320
5c6bd75d 321 if (test_bit(DMF_FREEING, &md->flags) ||
432a212c 322 dm_deleting_md(md)) {
fba9f90e
JM
323 md = NULL;
324 goto out;
325 }
326
1da177e4 327 dm_get(md);
5c6bd75d 328 atomic_inc(&md->open_count);
fba9f90e
JM
329out:
330 spin_unlock(&_minor_lock);
331
332 return md ? 0 : -ENXIO;
1da177e4
LT
333}
334
db2a144b 335static void dm_blk_close(struct gendisk *disk, fmode_t mode)
1da177e4 336{
63a4f065 337 struct mapped_device *md;
6e9624b8 338
4a1aeb98
MB
339 spin_lock(&_minor_lock);
340
63a4f065
MS
341 md = disk->private_data;
342 if (WARN_ON(!md))
343 goto out;
344
2c140a24
MP
345 if (atomic_dec_and_test(&md->open_count) &&
346 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
acfe0ad7 347 queue_work(deferred_remove_workqueue, &deferred_remove_work);
2c140a24 348
1da177e4 349 dm_put(md);
63a4f065 350out:
4a1aeb98 351 spin_unlock(&_minor_lock);
1da177e4
LT
352}
353
5c6bd75d
AK
354int dm_open_count(struct mapped_device *md)
355{
356 return atomic_read(&md->open_count);
357}
358
359/*
360 * Guarantees nothing is using the device before it's deleted.
361 */
2c140a24 362int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
5c6bd75d
AK
363{
364 int r = 0;
365
366 spin_lock(&_minor_lock);
367
2c140a24 368 if (dm_open_count(md)) {
5c6bd75d 369 r = -EBUSY;
2c140a24
MP
370 if (mark_deferred)
371 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
372 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
373 r = -EEXIST;
5c6bd75d
AK
374 else
375 set_bit(DMF_DELETING, &md->flags);
376
377 spin_unlock(&_minor_lock);
378
379 return r;
380}
381
2c140a24
MP
382int dm_cancel_deferred_remove(struct mapped_device *md)
383{
384 int r = 0;
385
386 spin_lock(&_minor_lock);
387
388 if (test_bit(DMF_DELETING, &md->flags))
389 r = -EBUSY;
390 else
391 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
392
393 spin_unlock(&_minor_lock);
394
395 return r;
396}
397
/* Work handler for deferred_remove_work: just calls dm_deferred_remove(). */
static void do_deferred_remove(struct work_struct *w)
{
	dm_deferred_remove();
}
402
3ac51e74
DW
403static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
404{
405 struct mapped_device *md = bdev->bd_disk->private_data;
406
407 return dm_get_geometry(md, geo);
408}
409
971888c4 410static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
5bd5e8d8 411 struct block_device **bdev)
aa129a22 412{
66482026 413 struct dm_target *tgt;
6c182cd8 414 struct dm_table *map;
971888c4 415 int r;
aa129a22 416
6c182cd8 417retry:
e56f81e0 418 r = -ENOTTY;
971888c4 419 map = dm_get_live_table(md, srcu_idx);
aa129a22 420 if (!map || !dm_table_get_size(map))
971888c4 421 return r;
aa129a22
MB
422
423 /* We only support devices that have a single target */
424 if (dm_table_get_num_targets(map) != 1)
971888c4 425 return r;
aa129a22 426
66482026
MS
427 tgt = dm_table_get_target(map, 0);
428 if (!tgt->type->prepare_ioctl)
971888c4 429 return r;
519049af 430
971888c4
MS
431 if (dm_suspended_md(md))
432 return -EAGAIN;
aa129a22 433
5bd5e8d8 434 r = tgt->type->prepare_ioctl(tgt, bdev);
5bbbfdf6 435 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
971888c4 436 dm_put_live_table(md, *srcu_idx);
6c182cd8
HR
437 msleep(10);
438 goto retry;
439 }
971888c4 440
e56f81e0
CH
441 return r;
442}
443
/* Release the live table that dm_prepare_ioctl() left held. */
static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
{
	dm_put_live_table(md, srcu_idx);
}
448
e56f81e0
CH
449static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
450 unsigned int cmd, unsigned long arg)
451{
452 struct mapped_device *md = bdev->bd_disk->private_data;
971888c4 453 int r, srcu_idx;
e56f81e0 454
5bd5e8d8 455 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
e56f81e0 456 if (r < 0)
971888c4 457 goto out;
6c182cd8 458
e56f81e0
CH
459 if (r > 0) {
460 /*
e980f623
CH
461 * Target determined this ioctl is being issued against a
462 * subset of the parent bdev; require extra privileges.
e56f81e0 463 */
e980f623 464 if (!capable(CAP_SYS_RAWIO)) {
0378c625 465 DMDEBUG_LIMIT(
e980f623
CH
466 "%s: sending ioctl %x to DM device without required privilege.",
467 current->comm, cmd);
468 r = -ENOIOCTLCMD;
e56f81e0 469 goto out;
e980f623 470 }
e56f81e0 471 }
6c182cd8 472
a7cb3d2f
CH
473 if (!bdev->bd_disk->fops->ioctl)
474 r = -ENOTTY;
475 else
476 r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
e56f81e0 477out:
971888c4 478 dm_unprepare_ioctl(md, srcu_idx);
aa129a22
MB
479 return r;
480}
481
7465d7ac
MS
482u64 dm_start_time_ns_from_clone(struct bio *bio)
483{
6c23f0bd 484 return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
7465d7ac
MS
485}
486EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
487
8d394bc4 488static bool bio_is_flush_with_data(struct bio *bio)
7465d7ac 489{
8d394bc4
MS
490 return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
491}
492
493static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio,
494 unsigned long start_time, struct dm_stats_aux *stats_aux)
495{
496 bool is_flush_with_data;
497 unsigned int bi_size;
498
499 /* If REQ_PREFLUSH set save any payload but do not account it */
500 is_flush_with_data = bio_is_flush_with_data(bio);
501 if (is_flush_with_data) {
502 bi_size = bio->bi_iter.bi_size;
503 bio->bi_iter.bi_size = 0;
504 }
505
506 if (!end)
507 bio_start_io_acct_time(bio, start_time);
508 else
509 bio_end_io_acct(bio, start_time);
7465d7ac 510
7465d7ac
MS
511 if (unlikely(dm_stats_used(&md->stats)))
512 dm_stats_account_io(&md->stats, bio_data_dir(bio),
513 bio->bi_iter.bi_sector, bio_sectors(bio),
8d394bc4
MS
514 end, start_time, stats_aux);
515
516 /* Restore bio's payload so it does get accounted upon requeue */
517 if (is_flush_with_data)
518 bio->bi_iter.bi_size = bi_size;
519}
520
0ab30b40 521static void dm_start_io_acct(struct dm_io *io)
8d394bc4
MS
522{
523 dm_io_acct(false, io->md, io->orig_bio, io->start_time, &io->stats_aux);
7465d7ac
MS
524}
525
0ab30b40 526static void dm_end_io_acct(struct dm_io *io)
7465d7ac 527{
0ab30b40 528 dm_io_acct(true, io->md, io->orig_bio, io->start_time, &io->stats_aux);
7465d7ac 529}
978e51ba
MS
530
531static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
1da177e4 532{
64f52b0e
MS
533 struct dm_io *io;
534 struct dm_target_io *tio;
535 struct bio *clone;
536
abfc426d 537 clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs);
64f52b0e 538
6c23f0bd 539 tio = clone_to_tio(clone);
64f52b0e
MS
540 tio->inside_dm_io = true;
541 tio->io = NULL;
542
543 io = container_of(tio, struct dm_io, tio);
544 io->magic = DM_IO_MAGIC;
978e51ba
MS
545 io->status = 0;
546 atomic_set(&io->io_count, 1);
9f6dc633 547 this_cpu_inc(*md->pending_io);
978e51ba
MS
548 io->orig_bio = bio;
549 io->md = md;
550 spin_lock_init(&io->endio_lock);
551
b879f915 552 io->start_time = jiffies;
64f52b0e 553
0cdb90f0
MS
554 dm_stats_record_start(&md->stats, &io->stats_aux);
555
64f52b0e 556 return io;
1da177e4
LT
557}
558
028867ac 559static void free_io(struct mapped_device *md, struct dm_io *io)
1da177e4 560{
64f52b0e
MS
561 bio_put(&io->tio.clone);
562}
563
1d1068ce 564static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
dc8e2021 565 unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
64f52b0e
MS
566{
567 struct dm_target_io *tio;
568
569 if (!ci->io->tio.io) {
570 /* the dm_target_io embedded in ci->io is available */
571 tio = &ci->io->tio;
572 } else {
abfc426d
CH
573 struct bio *clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio,
574 gfp_mask, &ci->io->md->bs);
64f52b0e
MS
575 if (!clone)
576 return NULL;
577
6c23f0bd 578 tio = clone_to_tio(clone);
64f52b0e
MS
579 tio->inside_dm_io = false;
580 }
56b4b5ab 581
64f52b0e
MS
582 tio->magic = DM_TIO_MAGIC;
583 tio->io = ci->io;
584 tio->ti = ti;
585 tio->target_bio_nr = target_bio_nr;
dc8e2021 586 tio->len_ptr = len;
64f52b0e 587
1d1068ce 588 return &tio->clone;
1da177e4
LT
589}
590
1d1068ce 591static void free_tio(struct bio *clone)
1da177e4 592{
1d1068ce 593 if (clone_to_tio(clone)->inside_dm_io)
64f52b0e 594 return;
1d1068ce 595 bio_put(clone);
1da177e4
LT
596}
597
598/*
599 * Add the bio to the list of deferred io.
600 */
92c63902 601static void queue_io(struct mapped_device *md, struct bio *bio)
1da177e4 602{
05447420 603 unsigned long flags;
1da177e4 604
05447420 605 spin_lock_irqsave(&md->deferred_lock, flags);
1da177e4 606 bio_list_add(&md->deferred, bio);
05447420 607 spin_unlock_irqrestore(&md->deferred_lock, flags);
6a8736d1 608 queue_work(md->wq, &md->work);
1da177e4
LT
609}
610
611/*
612 * Everyone (including functions in this file), should use this
613 * function to access the md->map field, and make sure they call
83d5e5b0 614 * dm_put_live_table() when finished.
1da177e4 615 */
83d5e5b0 616struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
1da177e4 617{
83d5e5b0
MP
618 *srcu_idx = srcu_read_lock(&md->io_barrier);
619
620 return srcu_dereference(md->map, &md->io_barrier);
621}
1da177e4 622
83d5e5b0
MP
623void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
624{
625 srcu_read_unlock(&md->io_barrier, srcu_idx);
626}
627
628void dm_sync_table(struct mapped_device *md)
629{
630 synchronize_srcu(&md->io_barrier);
631 synchronize_rcu_expedited();
632}
633
634/*
635 * A fast alternative to dm_get_live_table/dm_put_live_table.
636 * The caller must not block between these two functions.
637 */
638static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
639{
640 rcu_read_lock();
641 return rcu_dereference(md->map);
642}
1da177e4 643
83d5e5b0
MP
644static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
645{
646 rcu_read_unlock();
1da177e4
LT
647}
648
971888c4
MS
649static char *_dm_claim_ptr = "I belong to device-mapper";
650
86f1152b
BM
651/*
652 * Open a table device so we can use it as a map destination.
653 */
654static int open_table_device(struct table_device *td, dev_t dev,
655 struct mapped_device *md)
656{
86f1152b 657 struct block_device *bdev;
cd913c76 658 u64 part_off;
86f1152b
BM
659 int r;
660
661 BUG_ON(td->dm_dev.bdev);
662
519049af 663 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
86f1152b
BM
664 if (IS_ERR(bdev))
665 return PTR_ERR(bdev);
666
667 r = bd_link_disk_holder(bdev, dm_disk(md));
668 if (r) {
669 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
670 return r;
671 }
672
673 td->dm_dev.bdev = bdev;
cd913c76 674 td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
86f1152b
BM
675 return 0;
676}
677
678/*
679 * Close a table device that we've been using.
680 */
681static void close_table_device(struct table_device *td, struct mapped_device *md)
682{
683 if (!td->dm_dev.bdev)
684 return;
685
686 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
687 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
817bf402 688 put_dax(td->dm_dev.dax_dev);
86f1152b 689 td->dm_dev.bdev = NULL;
817bf402 690 td->dm_dev.dax_dev = NULL;
86f1152b
BM
691}
692
693static struct table_device *find_table_device(struct list_head *l, dev_t dev,
8454fca4
SS
694 fmode_t mode)
695{
86f1152b
BM
696 struct table_device *td;
697
698 list_for_each_entry(td, l, list)
699 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
700 return td;
701
702 return NULL;
703}
704
705int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
8454fca4
SS
706 struct dm_dev **result)
707{
86f1152b
BM
708 int r;
709 struct table_device *td;
710
711 mutex_lock(&md->table_devices_lock);
712 td = find_table_device(&md->table_devices, dev, mode);
713 if (!td) {
115485e8 714 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
86f1152b
BM
715 if (!td) {
716 mutex_unlock(&md->table_devices_lock);
717 return -ENOMEM;
718 }
719
720 td->dm_dev.mode = mode;
721 td->dm_dev.bdev = NULL;
722
723 if ((r = open_table_device(td, dev, md))) {
724 mutex_unlock(&md->table_devices_lock);
725 kfree(td);
726 return r;
727 }
728
729 format_dev_t(td->dm_dev.name, dev);
730
b0b4d7c6 731 refcount_set(&td->count, 1);
86f1152b 732 list_add(&td->list, &md->table_devices);
b0b4d7c6
ER
733 } else {
734 refcount_inc(&td->count);
86f1152b 735 }
86f1152b
BM
736 mutex_unlock(&md->table_devices_lock);
737
738 *result = &td->dm_dev;
739 return 0;
740}
86f1152b
BM
741
742void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
743{
744 struct table_device *td = container_of(d, struct table_device, dm_dev);
745
746 mutex_lock(&md->table_devices_lock);
b0b4d7c6 747 if (refcount_dec_and_test(&td->count)) {
86f1152b
BM
748 close_table_device(td, md);
749 list_del(&td->list);
750 kfree(td);
751 }
752 mutex_unlock(&md->table_devices_lock);
753}
86f1152b
BM
754
755static void free_table_devices(struct list_head *devices)
756{
757 struct list_head *tmp, *next;
758
759 list_for_each_safe(tmp, next, devices) {
760 struct table_device *td = list_entry(tmp, struct table_device, list);
761
762 DMWARN("dm_destroy: %s still exists with %d references",
b0b4d7c6 763 td->dm_dev.name, refcount_read(&td->count));
86f1152b
BM
764 kfree(td);
765 }
766}
767
3ac51e74
DW
768/*
769 * Get the geometry associated with a dm device
770 */
771int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
772{
773 *geo = md->geometry;
774
775 return 0;
776}
777
778/*
779 * Set the geometry of a device.
780 */
781int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
782{
783 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
784
785 if (geo->start > sz) {
786 DMWARN("Start sector is beyond the geometry limits.");
787 return -EINVAL;
788 }
789
790 md->geometry = *geo;
791
792 return 0;
793}
794
2e93ccc1
KU
795static int __noflush_suspending(struct mapped_device *md)
796{
797 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
798}
799
1da177e4
LT
800/*
801 * Decrements the number of outstanding ios that a bio has been
802 * cloned into, completing the original io if necc.
803 */
e2118b3c 804void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
1da177e4 805{
2e93ccc1 806 unsigned long flags;
4e4cbee9 807 blk_status_t io_error;
b35f8caa
MB
808 struct bio *bio;
809 struct mapped_device *md = io->md;
2e93ccc1
KU
810
811 /* Push-back supersedes any I/O errors */
f88fb981
KU
812 if (unlikely(error)) {
813 spin_lock_irqsave(&io->endio_lock, flags);
745dc570 814 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
4e4cbee9 815 io->status = error;
f88fb981
KU
816 spin_unlock_irqrestore(&io->endio_lock, flags);
817 }
1da177e4
LT
818
819 if (atomic_dec_and_test(&io->io_count)) {
bf14e2b2 820 bio = io->orig_bio;
4e4cbee9 821 if (io->status == BLK_STS_DM_REQUEUE) {
2e93ccc1
KU
822 /*
823 * Target requested pushing back the I/O.
2e93ccc1 824 */
022c2611 825 spin_lock_irqsave(&md->deferred_lock, flags);
bf14e2b2
DLM
826 if (__noflush_suspending(md) &&
827 !WARN_ON_ONCE(dm_is_zone_write(md, bio))) {
745dc570 828 /* NOTE early return due to BLK_STS_DM_REQUEUE below */
bf14e2b2
DLM
829 bio_list_add_head(&md->deferred, bio);
830 } else {
831 /*
832 * noflush suspend was interrupted or this is
833 * a write to a zoned target.
834 */
4e4cbee9 835 io->status = BLK_STS_IOERR;
bf14e2b2 836 }
022c2611 837 spin_unlock_irqrestore(&md->deferred_lock, flags);
2e93ccc1
KU
838 }
839
4e4cbee9 840 io_error = io->status;
0ab30b40 841 dm_end_io_acct(io);
6a8736d1 842 free_io(md, io);
9f6dc633
MS
843 smp_wmb();
844 this_cpu_dec(*md->pending_io);
845
846 /* nudge anyone waiting on suspend queue */
847 if (unlikely(wq_has_sleeper(&md->wait)))
848 wake_up(&md->wait);
6a8736d1 849
4e4cbee9 850 if (io_error == BLK_STS_DM_REQUEUE)
6a8736d1 851 return;
2e93ccc1 852
8d394bc4 853 if (bio_is_flush_with_data(bio)) {
af7e466a 854 /*
6a8736d1 855 * Preflush done for flush with data, reissue
28a8f0d3 856 * without REQ_PREFLUSH.
af7e466a 857 */
1eff9d32 858 bio->bi_opf &= ~REQ_PREFLUSH;
6a8736d1 859 queue_io(md, bio);
af7e466a 860 } else {
b372d360 861 /* done with normal IO or empty flush */
8dd601fa
N
862 if (io_error)
863 bio->bi_status = io_error;
4246a0b6 864 bio_endio(bio);
b35f8caa 865 }
1da177e4
LT
866 }
867}
868
bcb44433
MS
869void disable_discard(struct mapped_device *md)
870{
871 struct queue_limits *limits = dm_get_queue_limits(md);
872
873 /* device doesn't really support DISCARD, disable it */
874 limits->max_discard_sectors = 0;
875 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
876}
877
4cc96131 878void disable_write_same(struct mapped_device *md)
7eee4ae2
MS
879{
880 struct queue_limits *limits = dm_get_queue_limits(md);
881
882 /* device doesn't really support WRITE SAME, disable it */
883 limits->max_write_same_sectors = 0;
884}
885
ac62d620
CH
886void disable_write_zeroes(struct mapped_device *md)
887{
888 struct queue_limits *limits = dm_get_queue_limits(md);
889
890 /* device doesn't really support WRITE ZEROES, disable it */
891 limits->max_write_zeroes_sectors = 0;
892}
893
a666e5c0
MP
894static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
895{
896 return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
897}
898
4246a0b6 899static void clone_endio(struct bio *bio)
1da177e4 900{
4e4cbee9 901 blk_status_t error = bio->bi_status;
6c23f0bd 902 struct dm_target_io *tio = clone_to_tio(bio);
b35f8caa 903 struct dm_io *io = tio->io;
9faf400f 904 struct mapped_device *md = tio->io->md;
1da177e4 905 dm_endio_fn endio = tio->ti->type->end_io;
309dca30 906 struct request_queue *q = bio->bi_bdev->bd_disk->queue;
1da177e4 907
9c37de29 908 if (unlikely(error == BLK_STS_TARGET)) {
bcb44433 909 if (bio_op(bio) == REQ_OP_DISCARD &&
309dca30 910 !q->limits.max_discard_sectors)
bcb44433
MS
911 disable_discard(md);
912 else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
309dca30 913 !q->limits.max_write_same_sectors)
ac62d620 914 disable_write_same(md);
bcb44433 915 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
309dca30 916 !q->limits.max_write_zeroes_sectors)
ac62d620
CH
917 disable_write_zeroes(md);
918 }
7eee4ae2 919
bb37d772
DLM
920 if (blk_queue_is_zoned(q))
921 dm_zone_endio(io, bio);
415c79e1 922
1be56909 923 if (endio) {
4e4cbee9 924 int r = endio(tio->ti, bio, &error);
1be56909
CH
925 switch (r) {
926 case DM_ENDIO_REQUEUE:
bf14e2b2
DLM
927 /*
928 * Requeuing writes to a sequential zone of a zoned
929 * target will break the sequential write pattern:
930 * fail such IO.
931 */
932 if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
933 error = BLK_STS_IOERR;
934 else
935 error = BLK_STS_DM_REQUEUE;
df561f66 936 fallthrough;
1be56909
CH
937 case DM_ENDIO_DONE:
938 break;
939 case DM_ENDIO_INCOMPLETE:
940 /* The target will handle the io */
941 return;
942 default:
943 DMWARN("unimplemented target endio return value: %d", r);
944 BUG();
945 }
946 }
947
a666e5c0
MP
948 if (unlikely(swap_bios_limit(tio->ti, bio))) {
949 struct mapped_device *md = io->md;
950 up(&md->swap_bios_semaphore);
951 }
952
1d1068ce 953 free_tio(bio);
e2118b3c 954 dm_io_dec_pending(io, error);
1da177e4
LT
955}
956
56a67df7
MS
957/*
958 * Return maximum size of I/O possible at the supplied sector up to the current
959 * target boundary.
960 */
3720281d
MS
961static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
962 sector_t target_offset)
56a67df7 963{
56a67df7
MS
964 return ti->len - target_offset;
965}
966
3720281d 967static sector_t max_io_len(struct dm_target *ti, sector_t sector)
1da177e4 968{
3720281d
MS
969 sector_t target_offset = dm_target_offset(ti, sector);
970 sector_t len = max_io_len_target_boundary(ti, target_offset);
5091cdec 971 sector_t max_len;
1da177e4
LT
972
973 /*
3ee16db3
MS
974 * Does the target need to split IO even further?
975 * - varied (per target) IO splitting is a tenet of DM; this
976 * explains why stacked chunk_sectors based splitting via
977 * blk_max_size_offset() isn't possible here. So pass in
978 * ti->max_io_len to override stacked chunk_sectors.
1da177e4 979 */
3ee16db3
MS
980 if (ti->max_io_len) {
981 max_len = blk_max_size_offset(ti->table->md->queue,
982 target_offset, ti->max_io_len);
983 if (len > max_len)
984 len = max_len;
985 }
1da177e4
LT
986
987 return len;
988}
989
542f9038
MS
990int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
991{
992 if (len > UINT_MAX) {
993 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
994 (unsigned long long)len, UINT_MAX);
995 ti->error = "Maximum size of target IO is too large";
996 return -EINVAL;
997 }
998
75ae1936 999 ti->max_io_len = (uint32_t) len;
542f9038
MS
1000
1001 return 0;
1002}
1003EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1004
f26c5719 1005static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
3d97c829
MS
1006 sector_t sector, int *srcu_idx)
1007 __acquires(md->io_barrier)
545ed20e 1008{
545ed20e
TK
1009 struct dm_table *map;
1010 struct dm_target *ti;
545ed20e 1011
f26c5719 1012 map = dm_get_live_table(md, srcu_idx);
545ed20e 1013 if (!map)
f26c5719 1014 return NULL;
545ed20e
TK
1015
1016 ti = dm_table_find_target(map, sector);
123d87d5 1017 if (!ti)
f26c5719 1018 return NULL;
545ed20e 1019
f26c5719
DW
1020 return ti;
1021}
545ed20e 1022
f26c5719 1023static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
3d97c829 1024 long nr_pages, void **kaddr, pfn_t *pfn)
f26c5719
DW
1025{
1026 struct mapped_device *md = dax_get_private(dax_dev);
1027 sector_t sector = pgoff * PAGE_SECTORS;
1028 struct dm_target *ti;
1029 long len, ret = -EIO;
1030 int srcu_idx;
545ed20e 1031
f26c5719 1032 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
545ed20e 1033
f26c5719
DW
1034 if (!ti)
1035 goto out;
1036 if (!ti->type->direct_access)
1037 goto out;
3720281d 1038 len = max_io_len(ti, sector) / PAGE_SECTORS;
f26c5719
DW
1039 if (len < 1)
1040 goto out;
1041 nr_pages = min(len, nr_pages);
dbc62659 1042 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
817bf402 1043
f26c5719 1044 out:
545ed20e 1045 dm_put_live_table(md, srcu_idx);
f26c5719
DW
1046
1047 return ret;
545ed20e
TK
1048}
1049
cdf6cdcd
VG
1050static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1051 size_t nr_pages)
1052{
1053 struct mapped_device *md = dax_get_private(dax_dev);
1054 sector_t sector = pgoff * PAGE_SECTORS;
1055 struct dm_target *ti;
1056 int ret = -EIO;
1057 int srcu_idx;
1058
1059 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1060
1061 if (!ti)
1062 goto out;
1063 if (WARN_ON(!ti->type->dax_zero_page_range)) {
1064 /*
1065 * ->zero_page_range() is mandatory dax operation. If we are
1066 * here, something is wrong.
1067 */
cdf6cdcd
VG
1068 goto out;
1069 }
1070 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
cdf6cdcd
VG
1071 out:
1072 dm_put_live_table(md, srcu_idx);
1073
1074 return ret;
1075}
1076
1dd40c3e
MP
1077/*
1078 * A target may call dm_accept_partial_bio only from the map routine. It is
6842d264
DLM
1079 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
1080 * operations and REQ_OP_ZONE_APPEND (zone append writes).
1dd40c3e
MP
1081 *
1082 * dm_accept_partial_bio informs the dm that the target only wants to process
1083 * additional n_sectors sectors of the bio and the rest of the data should be
1084 * sent in a next bio.
1085 *
1086 * A diagram that explains the arithmetics:
1087 * +--------------------+---------------+-------+
1088 * | 1 | 2 | 3 |
1089 * +--------------------+---------------+-------+
1090 *
1091 * <-------------- *tio->len_ptr --------------->
1092 * <------- bi_size ------->
1093 * <-- n_sectors -->
1094 *
1095 * Region 1 was already iterated over with bio_advance or similar function.
1096 * (it may be empty if the target doesn't use bio_advance)
1097 * Region 2 is the remaining bio size that the target wants to process.
1098 * (it may be empty if region 1 is non-empty, although there is no reason
1099 * to make it empty)
1100 * The target requires that region 3 is to be sent in the next bio.
1101 *
1102 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1103 * the partially processed part (the sum of regions 1+2) must be the same for all
1104 * copies of the bio.
1105 */
1106void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1107{
6c23f0bd 1108 struct dm_target_io *tio = clone_to_tio(bio);
1dd40c3e 1109 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
6842d264 1110
1eff9d32 1111 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
6842d264
DLM
1112 BUG_ON(op_is_zone_mgmt(bio_op(bio)));
1113 BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
1dd40c3e
MP
1114 BUG_ON(bi_size > *tio->len_ptr);
1115 BUG_ON(n_sectors > bi_size);
6842d264 1116
1dd40c3e
MP
1117 *tio->len_ptr -= bi_size - n_sectors;
1118 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1119}
1120EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1121
a666e5c0
MP
1122static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
1123{
1124 mutex_lock(&md->swap_bios_lock);
1125 while (latch < md->swap_bios) {
1126 cond_resched();
1127 down(&md->swap_bios_semaphore);
1128 md->swap_bios--;
1129 }
1130 while (latch > md->swap_bios) {
1131 cond_resched();
1132 up(&md->swap_bios_semaphore);
1133 md->swap_bios++;
1134 }
1135 mutex_unlock(&md->swap_bios_lock);
1136}
1137
1561b396 1138static void __map_bio(struct bio *clone)
1da177e4 1139{
1561b396 1140 struct dm_target_io *tio = clone_to_tio(clone);
1da177e4 1141 int r;
2056a782 1142 sector_t sector;
64f52b0e 1143 struct dm_io *io = tio->io;
bd2a49b8 1144 struct dm_target *ti = tio->ti;
1da177e4 1145
1da177e4 1146 clone->bi_end_io = clone_endio;
1da177e4
LT
1147
1148 /*
1149 * Map the clone. If r == 0 we don't need to do
1150 * anything, the target has assumed ownership of
1151 * this io.
1152 */
e2118b3c 1153 dm_io_inc_pending(io);
4f024f37 1154 sector = clone->bi_iter.bi_sector;
d67a5f4b 1155
a666e5c0
MP
1156 if (unlikely(swap_bios_limit(ti, clone))) {
1157 struct mapped_device *md = io->md;
1158 int latch = get_swap_bios();
1159 if (unlikely(latch != md->swap_bios))
1160 __set_swap_bios_limit(md, latch);
1161 down(&md->swap_bios_semaphore);
1162 }
1163
bb37d772
DLM
1164 /*
1165 * Check if the IO needs a special mapping due to zone append emulation
1166 * on zoned target. In this case, dm_zone_map_bio() calls the target
1167 * map operation.
1168 */
1169 if (dm_emulate_zone_append(io->md))
1170 r = dm_zone_map_bio(tio);
1171 else
1172 r = ti->type->map(ti, clone);
1173
846785e6
CH
1174 switch (r) {
1175 case DM_MAPIO_SUBMITTED:
1176 break;
1177 case DM_MAPIO_REMAPPED:
1da177e4 1178 /* the bio has been remapped so dispatch it */
1c02fca6 1179 trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector);
3e08773c 1180 submit_bio_noacct(clone);
846785e6
CH
1181 break;
1182 case DM_MAPIO_KILL:
a666e5c0
MP
1183 if (unlikely(swap_bios_limit(ti, clone))) {
1184 struct mapped_device *md = io->md;
1185 up(&md->swap_bios_semaphore);
1186 }
1d1068ce 1187 free_tio(clone);
e2118b3c 1188 dm_io_dec_pending(io, BLK_STS_IOERR);
4e4cbee9 1189 break;
846785e6 1190 case DM_MAPIO_REQUEUE:
a666e5c0
MP
1191 if (unlikely(swap_bios_limit(ti, clone))) {
1192 struct mapped_device *md = io->md;
1193 up(&md->swap_bios_semaphore);
1194 }
1d1068ce 1195 free_tio(clone);
e2118b3c 1196 dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
846785e6
CH
1197 break;
1198 default:
45cbcd79
KU
1199 DMWARN("unimplemented target map return value: %d", r);
1200 BUG();
1da177e4
LT
1201 }
1202}
1da177e4 1203
e0d6609a 1204static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
bd2a49b8 1205{
4f024f37
KO
1206 bio->bi_iter.bi_sector = sector;
1207 bio->bi_iter.bi_size = to_bytes(len);
1da177e4
LT
1208}
1209
318716dd 1210static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
dc8e2021
CH
1211 struct dm_target *ti, unsigned num_bios,
1212 unsigned *len)
f9ab94ce 1213{
1d1068ce 1214 struct bio *bio;
318716dd 1215 int try;
dba14160 1216
318716dd
MS
1217 for (try = 0; try < 2; try++) {
1218 int bio_nr;
318716dd
MS
1219
1220 if (try)
bc02cdbe 1221 mutex_lock(&ci->io->md->table_devices_lock);
318716dd 1222 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1d1068ce 1223 bio = alloc_tio(ci, ti, bio_nr, len,
dc8e2021 1224 try ? GFP_NOIO : GFP_NOWAIT);
1d1068ce 1225 if (!bio)
318716dd
MS
1226 break;
1227
1d1068ce 1228 bio_list_add(blist, bio);
318716dd
MS
1229 }
1230 if (try)
bc02cdbe 1231 mutex_unlock(&ci->io->md->table_devices_lock);
318716dd
MS
1232 if (bio_nr == num_bios)
1233 return;
1234
6c23f0bd 1235 while ((bio = bio_list_pop(blist)))
1d1068ce 1236 free_tio(bio);
318716dd 1237 }
9015df24
AK
1238}
1239
14fe594d 1240static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1dd40c3e 1241 unsigned num_bios, unsigned *len)
06a426ce 1242{
318716dd 1243 struct bio_list blist = BIO_EMPTY_LIST;
8eabf5d0 1244 struct bio *clone;
318716dd 1245
891fced6
CH
1246 switch (num_bios) {
1247 case 0:
1248 break;
1249 case 1:
1250 clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
8eabf5d0
CH
1251 if (len)
1252 bio_setup_sector(clone, ci->sector, *len);
1561b396 1253 __map_bio(clone);
891fced6
CH
1254 break;
1255 default:
1256 alloc_multiple_bios(&blist, ci, ti, num_bios, len);
1257 while ((clone = bio_list_pop(&blist))) {
1258 if (len)
1259 bio_setup_sector(clone, ci->sector, *len);
1260 __map_bio(clone);
1261 }
1262 break;
8eabf5d0 1263 }
06a426ce
MS
1264}
1265
14fe594d 1266static int __send_empty_flush(struct clone_info *ci)
f9ab94ce 1267{
06a426ce 1268 unsigned target_nr = 0;
f9ab94ce 1269 struct dm_target *ti;
828678b8
MS
1270 struct bio flush_bio;
1271
1272 /*
1273 * Use an on-stack bio for this, it's safe since we don't
1274 * need to reference it after submit. It's just used as
1275 * the basis for the clone(s).
1276 */
49add496
CH
1277 bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
1278 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
47d95102 1279
828678b8
MS
1280 ci->bio = &flush_bio;
1281 ci->sector_count = 0;
f9ab94ce 1282
b372d360 1283 BUG_ON(bio_has_data(ci->bio));
f9ab94ce 1284 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1dd40c3e 1285 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
828678b8
MS
1286
1287 bio_uninit(ci->bio);
f9ab94ce
MP
1288 return 0;
1289}
1290
3d7f4562 1291static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
61697a6a 1292 unsigned num_bios)
ba1cbad9 1293{
51b86f9a 1294 unsigned len;
ba1cbad9 1295
3d7f4562
MS
1296 /*
1297 * Even though the device advertised support for this type of
1298 * request, that does not mean every target supports it, and
1299 * reconfiguration might also have changed that since the
1300 * check was performed.
1301 */
3d7f4562
MS
1302 if (!num_bios)
1303 return -EOPNOTSUPP;
ba1cbad9 1304
3720281d
MS
1305 len = min_t(sector_t, ci->sector_count,
1306 max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
51b86f9a 1307
3d7f4562 1308 __send_duplicate_bios(ci, ti, num_bios, &len);
e262f347 1309
3d7f4562
MS
1310 ci->sector += len;
1311 ci->sector_count -= len;
5ae89a87
MS
1312
1313 return 0;
ba1cbad9
MS
1314}
1315
568c73a3
MS
1316static bool is_abnormal_io(struct bio *bio)
1317{
1318 bool r = false;
1319
1320 switch (bio_op(bio)) {
1321 case REQ_OP_DISCARD:
1322 case REQ_OP_SECURE_ERASE:
1323 case REQ_OP_WRITE_SAME:
1324 case REQ_OP_WRITE_ZEROES:
1325 r = true;
1326 break;
1327 }
1328
1329 return r;
1330}
1331
0519c71e
MS
1332static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1333 int *result)
1334{
1335 struct bio *bio = ci->bio;
9679b5a7 1336 unsigned num_bios = 0;
0519c71e 1337
9679b5a7
MS
1338 switch (bio_op(bio)) {
1339 case REQ_OP_DISCARD:
1340 num_bios = ti->num_discard_bios;
1341 break;
1342 case REQ_OP_SECURE_ERASE:
1343 num_bios = ti->num_secure_erase_bios;
1344 break;
1345 case REQ_OP_WRITE_SAME:
1346 num_bios = ti->num_write_same_bios;
1347 break;
1348 case REQ_OP_WRITE_ZEROES:
1349 num_bios = ti->num_write_zeroes_bios;
1350 break;
1351 default:
0519c71e 1352 return false;
9679b5a7 1353 }
0519c71e 1354
9679b5a7 1355 *result = __send_changing_extent_only(ci, ti, num_bios);
0519c71e
MS
1356 return true;
1357}
1358
e4c93811
AK
1359/*
1360 * Select the correct strategy for processing a non-flush bio.
1361 */
96c9865c 1362static int __split_and_process_bio(struct clone_info *ci)
0ce65797 1363{
66bdaa43 1364 struct bio *clone;
512875bd 1365 struct dm_target *ti;
1c3b13e6 1366 unsigned len;
c80914e8 1367 int r;
0ce65797 1368
512875bd 1369 ti = dm_table_find_target(ci->map, ci->sector);
123d87d5 1370 if (!ti)
512875bd
JN
1371 return -EIO;
1372
568c73a3 1373 if (__process_abnormal_io(ci, ti, &r))
0519c71e 1374 return r;
3d7f4562 1375
3720281d 1376 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
0ce65797 1377
66bdaa43
MS
1378 clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
1379 bio_advance(clone, to_bytes(ci->sector - clone->bi_iter.bi_sector));
1380 clone->bi_iter.bi_size = to_bytes(len);
1381 if (bio_integrity(clone))
1382 bio_integrity_trim(clone);
1383
1384 __map_bio(clone);
0ce65797 1385
1c3b13e6
KO
1386 ci->sector += len;
1387 ci->sector_count -= len;
0ce65797 1388
1c3b13e6 1389 return 0;
0ce65797
MS
1390}
1391
978e51ba
MS
1392static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1393 struct dm_table *map, struct bio *bio)
1394{
1395 ci->map = map;
1396 ci->io = alloc_io(md, bio);
d41e077a 1397 ci->bio = bio;
978e51ba 1398 ci->sector = bio->bi_iter.bi_sector;
d41e077a
MS
1399 ci->sector_count = bio_sectors(bio);
1400
1401 /* Shouldn't happen but sector_count was being set to 0 so... */
1402 if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
1403 ci->sector_count = 0;
978e51ba
MS
1404}
1405
1da177e4 1406/*
14fe594d 1407 * Entry point to split a bio into clones and submit them to the targets.
1da177e4 1408 */
96c9865c
MS
1409static void dm_split_and_process_bio(struct mapped_device *md,
1410 struct dm_table *map, struct bio *bio)
0ce65797 1411{
1da177e4 1412 struct clone_info ci;
d41e077a 1413 struct bio *b;
512875bd 1414 int error = 0;
1da177e4 1415
978e51ba 1416 init_clone_info(&ci, md, map, bio);
0ce65797 1417
1eff9d32 1418 if (bio->bi_opf & REQ_PREFLUSH) {
14fe594d 1419 error = __send_empty_flush(&ci);
e2118b3c 1420 /* dm_io_dec_pending submits any data associated with flush */
d41e077a 1421 goto out;
d87f4c14 1422 }
0ce65797 1423
d41e077a
MS
1424 error = __split_and_process_bio(&ci);
1425 if (error || !ci.sector_count)
1426 goto out;
1427
1428 /*
1429 * Remainder must be passed to submit_bio_noacct() so it gets handled
1430 * *after* bios already submitted have been completely processed.
1431 * We take a clone of the original to store in ci.io->orig_bio to be
1432 * used by dm_end_io_acct() and for dm_io_dec_pending() to use for
1433 * completion handling.
1434 */
1435 b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1436 GFP_NOIO, &md->queue->bio_split);
1437 ci.io->orig_bio = b;
1438
1439 bio_chain(b, bio);
1440 trace_block_split(b, bio->bi_iter.bi_sector);
1441 submit_bio_noacct(bio);
1442out:
1443 dm_start_io_acct(ci.io);
1da177e4 1444 /* drop the extra reference count */
e2118b3c 1445 dm_io_dec_pending(ci.io, errno_to_blk_status(error));
0ce65797
MS
1446}
1447
3e08773c 1448static void dm_submit_bio(struct bio *bio)
cec47e3d 1449{
309dca30 1450 struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
83d5e5b0
MP
1451 int srcu_idx;
1452 struct dm_table *map;
cec47e3d 1453
83d5e5b0 1454 map = dm_get_live_table(md, &srcu_idx);
b2abdb1b
MS
1455 if (unlikely(!map)) {
1456 DMERR_LIMIT("%s: mapping table unavailable, erroring io",
1457 dm_device_name(md));
1458 bio_io_error(bio);
1459 goto out;
1460 }
29e4013d 1461
b2abdb1b 1462 /* If suspended, queue this IO for later */
6a8736d1 1463 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
6abc4946
KK
1464 if (bio->bi_opf & REQ_NOWAIT)
1465 bio_wouldblock_error(bio);
b2abdb1b 1466 else if (bio->bi_opf & REQ_RAHEAD)
54d9a1b4 1467 bio_io_error(bio);
b2abdb1b
MS
1468 else
1469 queue_io(md, bio);
1470 goto out;
cec47e3d 1471 }
1da177e4 1472
b2abdb1b
MS
1473 /*
1474 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
1475 * otherwise associated queue_limits won't be imposed.
1476 */
1477 if (is_abnormal_io(bio))
1478 blk_queue_split(&bio);
978e51ba 1479
96c9865c 1480 dm_split_and_process_bio(md, map, bio);
b2abdb1b 1481out:
83d5e5b0 1482 dm_put_live_table(md, srcu_idx);
978e51ba
MS
1483}
1484
1da177e4
LT
1485/*-----------------------------------------------------------------
1486 * An IDR is used to keep track of allocated minor numbers.
1487 *---------------------------------------------------------------*/
2b06cfff 1488static void free_minor(int minor)
1da177e4 1489{
f32c10b0 1490 spin_lock(&_minor_lock);
1da177e4 1491 idr_remove(&_minor_idr, minor);
f32c10b0 1492 spin_unlock(&_minor_lock);
1da177e4
LT
1493}
1494
1495/*
1496 * See if the device with a specific minor # is free.
1497 */
cf13ab8e 1498static int specific_minor(int minor)
1da177e4 1499{
c9d76be6 1500 int r;
1da177e4
LT
1501
1502 if (minor >= (1 << MINORBITS))
1503 return -EINVAL;
1504
c9d76be6 1505 idr_preload(GFP_KERNEL);
f32c10b0 1506 spin_lock(&_minor_lock);
1da177e4 1507
c9d76be6 1508 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1da177e4 1509
f32c10b0 1510 spin_unlock(&_minor_lock);
c9d76be6
TH
1511 idr_preload_end();
1512 if (r < 0)
1513 return r == -ENOSPC ? -EBUSY : r;
1514 return 0;
1da177e4
LT
1515}
1516
cf13ab8e 1517static int next_free_minor(int *minor)
1da177e4 1518{
c9d76be6 1519 int r;
62f75c2f 1520
c9d76be6 1521 idr_preload(GFP_KERNEL);
f32c10b0 1522 spin_lock(&_minor_lock);
1da177e4 1523
c9d76be6 1524 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1da177e4 1525
f32c10b0 1526 spin_unlock(&_minor_lock);
c9d76be6
TH
1527 idr_preload_end();
1528 if (r < 0)
1529 return r;
1530 *minor = r;
1531 return 0;
1da177e4
LT
1532}
1533
83d5cde4 1534static const struct block_device_operations dm_blk_dops;
681cc5e8 1535static const struct block_device_operations dm_rq_blk_dops;
f26c5719 1536static const struct dax_operations dm_dax_ops;
1da177e4 1537
53d5914f
MP
1538static void dm_wq_work(struct work_struct *work);
1539
/*
 * Destroy the crypto profile attached to @q.  Compiles to an empty inline
 * stub when CONFIG_BLK_INLINE_ENCRYPTION is not set.
 */
#ifndef CONFIG_BLK_INLINE_ENCRYPTION

static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
{
}

#else /* CONFIG_BLK_INLINE_ENCRYPTION */

static void dm_queue_destroy_crypto_profile(struct request_queue *q)
{
	dm_destroy_crypto_profile(q->crypto_profile);
}

#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
1552
0f20972f
MS
1553static void cleanup_mapped_device(struct mapped_device *md)
1554{
0f20972f
MS
1555 if (md->wq)
1556 destroy_workqueue(md->wq);
6f1c819c
KO
1557 bioset_exit(&md->bs);
1558 bioset_exit(&md->io_bs);
0f20972f 1559
f26c5719 1560 if (md->dax_dev) {
fb08a190 1561 dax_remove_host(md->disk);
f26c5719
DW
1562 kill_dax(md->dax_dev);
1563 put_dax(md->dax_dev);
1564 md->dax_dev = NULL;
1565 }
1566
0f20972f
MS
1567 if (md->disk) {
1568 spin_lock(&_minor_lock);
1569 md->disk->private_data = NULL;
1570 spin_unlock(&_minor_lock);
89f871af
CH
1571 if (dm_get_md_type(md) != DM_TYPE_NONE) {
1572 dm_sysfs_exit(md);
1573 del_gendisk(md->disk);
1574 }
cb77cb5a 1575 dm_queue_destroy_crypto_profile(md->queue);
74fe6ba9 1576 blk_cleanup_disk(md->disk);
74a2b6ec 1577 }
0f20972f 1578
9f6dc633
MS
1579 if (md->pending_io) {
1580 free_percpu(md->pending_io);
1581 md->pending_io = NULL;
1582 }
1583
d09960b0
TE
1584 cleanup_srcu_struct(&md->io_barrier);
1585
d5ffebdd
MS
1586 mutex_destroy(&md->suspend_lock);
1587 mutex_destroy(&md->type_lock);
1588 mutex_destroy(&md->table_devices_lock);
a666e5c0 1589 mutex_destroy(&md->swap_bios_lock);
d5ffebdd 1590
4cc96131 1591 dm_mq_cleanup_mapped_device(md);
bb37d772 1592 dm_cleanup_zoned_dev(md);
0f20972f
MS
1593}
1594
1da177e4
LT
1595/*
1596 * Allocate and initialise a blank device with a given minor.
1597 */
2b06cfff 1598static struct mapped_device *alloc_dev(int minor)
1da177e4 1599{
115485e8
MS
1600 int r, numa_node_id = dm_get_numa_node();
1601 struct mapped_device *md;
ba61fdd1 1602 void *old_md;
1da177e4 1603
856eb091 1604 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1da177e4
LT
1605 if (!md) {
1606 DMWARN("unable to allocate device, out of memory.");
1607 return NULL;
1608 }
1609
10da4f79 1610 if (!try_module_get(THIS_MODULE))
6ed7ade8 1611 goto bad_module_get;
10da4f79 1612
1da177e4 1613 /* get a minor number for the dev */
2b06cfff 1614 if (minor == DM_ANY_MINOR)
cf13ab8e 1615 r = next_free_minor(&minor);
2b06cfff 1616 else
cf13ab8e 1617 r = specific_minor(minor);
1da177e4 1618 if (r < 0)
6ed7ade8 1619 goto bad_minor;
1da177e4 1620
83d5e5b0
MP
1621 r = init_srcu_struct(&md->io_barrier);
1622 if (r < 0)
1623 goto bad_io_barrier;
1624
115485e8 1625 md->numa_node_id = numa_node_id;
591ddcfc 1626 md->init_tio_pdu = false;
a5664dad 1627 md->type = DM_TYPE_NONE;
e61290a4 1628 mutex_init(&md->suspend_lock);
a5664dad 1629 mutex_init(&md->type_lock);
86f1152b 1630 mutex_init(&md->table_devices_lock);
022c2611 1631 spin_lock_init(&md->deferred_lock);
1da177e4 1632 atomic_set(&md->holders, 1);
5c6bd75d 1633 atomic_set(&md->open_count, 0);
1da177e4 1634 atomic_set(&md->event_nr, 0);
7a8c3d3b
MA
1635 atomic_set(&md->uevent_seq, 0);
1636 INIT_LIST_HEAD(&md->uevent_list);
86f1152b 1637 INIT_LIST_HEAD(&md->table_devices);
7a8c3d3b 1638 spin_lock_init(&md->uevent_lock);
1da177e4 1639
47ace7e0 1640 /*
c62b37d9
CH
1641 * default to bio-based until DM table is loaded and md->type
1642 * established. If request-based table is loaded: blk-mq will
1643 * override accordingly.
47ace7e0 1644 */
74fe6ba9 1645 md->disk = blk_alloc_disk(md->numa_node_id);
1da177e4 1646 if (!md->disk)
0f20972f 1647 goto bad;
74fe6ba9 1648 md->queue = md->disk->queue;
1da177e4 1649
f0b04115 1650 init_waitqueue_head(&md->wait);
53d5914f 1651 INIT_WORK(&md->work, dm_wq_work);
f0b04115 1652 init_waitqueue_head(&md->eventq);
2995fa78 1653 init_completion(&md->kobj_holder.completion);
f0b04115 1654
a666e5c0
MP
1655 md->swap_bios = get_swap_bios();
1656 sema_init(&md->swap_bios_semaphore, md->swap_bios);
1657 mutex_init(&md->swap_bios_lock);
1658
1da177e4
LT
1659 md->disk->major = _major;
1660 md->disk->first_minor = minor;
74fe6ba9 1661 md->disk->minors = 1;
1ebe2e5f 1662 md->disk->flags |= GENHD_FL_NO_PART;
1da177e4
LT
1663 md->disk->fops = &dm_blk_dops;
1664 md->disk->queue = md->queue;
1665 md->disk->private_data = md;
1666 sprintf(md->disk->disk_name, "dm-%d", minor);
f26c5719 1667
5d2a228b 1668 if (IS_ENABLED(CONFIG_FS_DAX)) {
30c6828a 1669 md->dax_dev = alloc_dax(md, &dm_dax_ops);
d7519392
CH
1670 if (IS_ERR(md->dax_dev)) {
1671 md->dax_dev = NULL;
976431b0 1672 goto bad;
d7519392 1673 }
7ac5360c
CH
1674 set_dax_nocache(md->dax_dev);
1675 set_dax_nomc(md->dax_dev);
fb08a190 1676 if (dax_add_host(md->dax_dev, md->disk))
976431b0
DW
1677 goto bad;
1678 }
f26c5719 1679
7e51f257 1680 format_dev_t(md->name, MKDEV(_major, minor));
1da177e4 1681
c7c879ee 1682 md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name);
304f3f6a 1683 if (!md->wq)
0f20972f 1684 goto bad;
304f3f6a 1685
9f6dc633
MS
1686 md->pending_io = alloc_percpu(unsigned long);
1687 if (!md->pending_io)
1688 goto bad;
1689
fd2ed4d2
MP
1690 dm_stats_init(&md->stats);
1691
ba61fdd1 1692 /* Populate the mapping, nobody knows we exist yet */
f32c10b0 1693 spin_lock(&_minor_lock);
ba61fdd1 1694 old_md = idr_replace(&_minor_idr, md, minor);
f32c10b0 1695 spin_unlock(&_minor_lock);
ba61fdd1
JM
1696
1697 BUG_ON(old_md != MINOR_ALLOCED);
1698
1da177e4
LT
1699 return md;
1700
0f20972f
MS
1701bad:
1702 cleanup_mapped_device(md);
83d5e5b0 1703bad_io_barrier:
1da177e4 1704 free_minor(minor);
6ed7ade8 1705bad_minor:
10da4f79 1706 module_put(THIS_MODULE);
6ed7ade8 1707bad_module_get:
856eb091 1708 kvfree(md);
1da177e4
LT
1709 return NULL;
1710}
1711
ae9da83f
JN
1712static void unlock_fs(struct mapped_device *md);
1713
1da177e4
LT
1714static void free_dev(struct mapped_device *md)
1715{
f331c029 1716 int minor = MINOR(disk_devt(md->disk));
63d94e48 1717
32a926da 1718 unlock_fs(md);
2eb6e1e3 1719
0f20972f 1720 cleanup_mapped_device(md);
63a4f065 1721
86f1152b 1722 free_table_devices(&md->table_devices);
63a4f065 1723 dm_stats_cleanup(&md->stats);
63a4f065
MS
1724 free_minor(minor);
1725
10da4f79 1726 module_put(THIS_MODULE);
856eb091 1727 kvfree(md);
1da177e4
LT
1728}
1729
2a2a4c51 1730static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
e6ee8c0b 1731{
c0820cf5 1732 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2a2a4c51 1733 int ret = 0;
e6ee8c0b 1734
0776aa0e 1735 if (dm_table_bio_based(t)) {
64f52b0e
MS
1736 /*
1737 * The md may already have mempools that need changing.
1738 * If so, reload bioset because front_pad may have changed
1739 * because a different table was loaded.
1740 */
6f1c819c
KO
1741 bioset_exit(&md->bs);
1742 bioset_exit(&md->io_bs);
0776aa0e 1743
6f1c819c 1744 } else if (bioset_initialized(&md->bs)) {
4e6e36c3
MS
1745 /*
1746 * There's no need to reload with request-based dm
1747 * because the size of front_pad doesn't change.
1748 * Note for future: If you are to reload bioset,
1749 * prep-ed requests in the queue may refer
1750 * to bio from the old bioset, so you must walk
1751 * through the queue to unprep.
1752 */
1753 goto out;
c0820cf5 1754 }
e6ee8c0b 1755
6f1c819c
KO
1756 BUG_ON(!p ||
1757 bioset_initialized(&md->bs) ||
1758 bioset_initialized(&md->io_bs));
cbc4e3c1 1759
2a2a4c51
JA
1760 ret = bioset_init_from_src(&md->bs, &p->bs);
1761 if (ret)
1762 goto out;
1763 ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
1764 if (ret)
1765 bioset_exit(&md->bs);
e6ee8c0b 1766out:
02233342 1767 /* mempool bind completed, no longer need any mempools in the table */
e6ee8c0b 1768 dm_table_free_md_mempools(t);
2a2a4c51 1769 return ret;
e6ee8c0b
KU
1770}
1771
1da177e4
LT
1772/*
1773 * Bind a table to the device.
1774 */
1775static void event_callback(void *context)
1776{
7a8c3d3b
MA
1777 unsigned long flags;
1778 LIST_HEAD(uevents);
1da177e4
LT
1779 struct mapped_device *md = (struct mapped_device *) context;
1780
7a8c3d3b
MA
1781 spin_lock_irqsave(&md->uevent_lock, flags);
1782 list_splice_init(&md->uevent_list, &uevents);
1783 spin_unlock_irqrestore(&md->uevent_lock, flags);
1784
ed9e1982 1785 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
7a8c3d3b 1786
1da177e4
LT
1787 atomic_inc(&md->event_nr);
1788 wake_up(&md->eventq);
62e08243 1789 dm_issue_global_event();
1da177e4
LT
1790}
1791
042d2a9b
AK
1792/*
1793 * Returns old map, which caller must destroy.
1794 */
1795static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1796 struct queue_limits *limits)
1da177e4 1797{
042d2a9b 1798 struct dm_table *old_map;
165125e1 1799 struct request_queue *q = md->queue;
978e51ba 1800 bool request_based = dm_table_request_based(t);
1da177e4 1801 sector_t size;
2a2a4c51 1802 int ret;
1da177e4 1803
5a8f1f80
BVA
1804 lockdep_assert_held(&md->suspend_lock);
1805
1da177e4 1806 size = dm_table_get_size(t);
3ac51e74
DW
1807
1808 /*
1809 * Wipe any geometry if the size of the table changed.
1810 */
fd2ed4d2 1811 if (size != dm_get_size(md))
3ac51e74
DW
1812 memset(&md->geometry, 0, sizeof(md->geometry));
1813
5424a0b8
MP
1814 if (!get_capacity(md->disk))
1815 set_capacity(md->disk, size);
1816 else
1817 set_capacity_and_notify(md->disk, size);
d5816876 1818
2ca3310e
AK
1819 dm_table_event_callback(t, event_callback, md);
1820
9c37de29 1821 if (request_based) {
16f12266 1822 /*
9c37de29
MS
1823 * Leverage the fact that request-based DM targets are
1824 * immutable singletons - used to optimize dm_mq_queue_rq.
16f12266
MS
1825 */
1826 md->immutable_target = dm_table_get_immutable_target(t);
1827 }
e6ee8c0b 1828
2a2a4c51
JA
1829 ret = __bind_mempools(md, t);
1830 if (ret) {
1831 old_map = ERR_PTR(ret);
1832 goto out;
1833 }
e6ee8c0b 1834
bb37d772
DLM
1835 ret = dm_table_set_restrictions(t, q, limits);
1836 if (ret) {
1837 old_map = ERR_PTR(ret);
1838 goto out;
1839 }
1840
a12f5d48 1841 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
1d3aa6f6 1842 rcu_assign_pointer(md->map, (void *)t);
36a0456f
AK
1843 md->immutable_target_type = dm_table_get_immutable_target_type(t);
1844
41abc4e1
HR
1845 if (old_map)
1846 dm_sync_table(md);
1da177e4 1847
2a2a4c51 1848out:
042d2a9b 1849 return old_map;
1da177e4
LT
1850}
1851
a7940155
AK
1852/*
1853 * Returns unbound table for the caller to free.
1854 */
1855static struct dm_table *__unbind(struct mapped_device *md)
1da177e4 1856{
a12f5d48 1857 struct dm_table *map = rcu_dereference_protected(md->map, 1);
1da177e4
LT
1858
1859 if (!map)
a7940155 1860 return NULL;
1da177e4
LT
1861
1862 dm_table_event_callback(map, NULL, NULL);
9cdb8520 1863 RCU_INIT_POINTER(md->map, NULL);
83d5e5b0 1864 dm_sync_table(md);
a7940155
AK
1865
1866 return map;
1da177e4
LT
1867}
1868
1869/*
1870 * Constructor for a new device.
1871 */
2b06cfff 1872int dm_create(int minor, struct mapped_device **result)
1da177e4
LT
1873{
1874 struct mapped_device *md;
1875
2b06cfff 1876 md = alloc_dev(minor);
1da177e4
LT
1877 if (!md)
1878 return -ENXIO;
1879
91ccbbac
TS
1880 dm_ima_reset_data(md);
1881
1da177e4
LT
1882 *result = md;
1883 return 0;
1884}
1885
a5664dad
MS
1886/*
1887 * Functions to manage md->type.
1888 * All are required to hold md->type_lock.
1889 */
1890void dm_lock_md_type(struct mapped_device *md)
1891{
1892 mutex_lock(&md->type_lock);
1893}
1894
1895void dm_unlock_md_type(struct mapped_device *md)
1896{
1897 mutex_unlock(&md->type_lock);
1898}
1899
7e0d574f 1900void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
a5664dad 1901{
00c4fc3b 1902 BUG_ON(!mutex_is_locked(&md->type_lock));
a5664dad
MS
1903 md->type = type;
1904}
1905
7e0d574f 1906enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
a5664dad
MS
1907{
1908 return md->type;
1909}
1910
36a0456f
AK
1911struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
1912{
1913 return md->immutable_target_type;
1914}
1915
f84cb8a4
MS
1916/*
1917 * The queue_limits are only valid as long as you have a reference
1918 * count on 'md'.
1919 */
1920struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
1921{
1922 BUG_ON(!atomic_read(&md->holders));
1923 return &md->queue->limits;
1924}
1925EXPORT_SYMBOL_GPL(dm_get_queue_limits);
1926
4a0b4ddf
MS
1927/*
1928 * Setup the DM device's queue based on md's type
1929 */
591ddcfc 1930int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
4a0b4ddf 1931{
ba305859 1932 enum dm_queue_mode type = dm_table_get_type(t);
c100ec49 1933 struct queue_limits limits;
ba305859 1934 int r;
bfebd1cd 1935
545ed20e 1936 switch (type) {
bfebd1cd 1937 case DM_TYPE_REQUEST_BASED:
681cc5e8 1938 md->disk->fops = &dm_rq_blk_dops;
e83068a5 1939 r = dm_mq_init_request_queue(md, t);
bfebd1cd 1940 if (r) {
681cc5e8 1941 DMERR("Cannot initialize queue for request-based dm mapped device");
bfebd1cd
MS
1942 return r;
1943 }
1944 break;
1945 case DM_TYPE_BIO_BASED:
545ed20e 1946 case DM_TYPE_DAX_BIO_BASED:
bfebd1cd 1947 break;
7e0d574f
BVA
1948 case DM_TYPE_NONE:
1949 WARN_ON_ONCE(true);
1950 break;
4a0b4ddf
MS
1951 }
1952
c100ec49
MS
1953 r = dm_calculate_queue_limits(t, &limits);
1954 if (r) {
1955 DMERR("Cannot calculate initial queue limits");
1956 return r;
1957 }
bb37d772
DLM
1958 r = dm_table_set_restrictions(t, md->queue, &limits);
1959 if (r)
1960 return r;
1961
e7089f65
LC
1962 r = add_disk(md->disk);
1963 if (r)
1964 return r;
c100ec49 1965
89f871af
CH
1966 r = dm_sysfs_init(md);
1967 if (r) {
1968 del_gendisk(md->disk);
1969 return r;
1970 }
1971 md->type = type;
4a0b4ddf
MS
1972 return 0;
1973}
1974
2bec1f4a 1975struct mapped_device *dm_get_md(dev_t dev)
1da177e4
LT
1976{
1977 struct mapped_device *md;
1da177e4
LT
1978 unsigned minor = MINOR(dev);
1979
1980 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
1981 return NULL;
1982
f32c10b0 1983 spin_lock(&_minor_lock);
1da177e4
LT
1984
1985 md = idr_find(&_minor_idr, minor);
49de5769
MS
1986 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
1987 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
1988 md = NULL;
1989 goto out;
fba9f90e 1990 }
49de5769 1991 dm_get(md);
fba9f90e 1992out:
f32c10b0 1993 spin_unlock(&_minor_lock);
1da177e4 1994
637842cf
DT
1995 return md;
1996}
3cf2e4ba 1997EXPORT_SYMBOL_GPL(dm_get_md);
d229a958 1998
9ade92a9 1999void *dm_get_mdptr(struct mapped_device *md)
637842cf 2000{
9ade92a9 2001 return md->interface_ptr;
1da177e4
LT
2002}
2003
2004void dm_set_mdptr(struct mapped_device *md, void *ptr)
2005{
2006 md->interface_ptr = ptr;
2007}
2008
2009void dm_get(struct mapped_device *md)
2010{
2011 atomic_inc(&md->holders);
3f77316d 2012 BUG_ON(test_bit(DMF_FREEING, &md->flags));
1da177e4
LT
2013}
2014
09ee96b2
MP
2015int dm_hold(struct mapped_device *md)
2016{
2017 spin_lock(&_minor_lock);
2018 if (test_bit(DMF_FREEING, &md->flags)) {
2019 spin_unlock(&_minor_lock);
2020 return -EBUSY;
2021 }
2022 dm_get(md);
2023 spin_unlock(&_minor_lock);
2024 return 0;
2025}
2026EXPORT_SYMBOL_GPL(dm_hold);
2027
72d94861
AK
2028const char *dm_device_name(struct mapped_device *md)
2029{
2030 return md->name;
2031}
2032EXPORT_SYMBOL_GPL(dm_device_name);
2033
3f77316d 2034static void __dm_destroy(struct mapped_device *md, bool wait)
1da177e4 2035{
1134e5ae 2036 struct dm_table *map;
83d5e5b0 2037 int srcu_idx;
1da177e4 2038
3f77316d 2039 might_sleep();
fba9f90e 2040
63a4f065 2041 spin_lock(&_minor_lock);
3f77316d
KU
2042 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2043 set_bit(DMF_FREEING, &md->flags);
2044 spin_unlock(&_minor_lock);
3b785fbc 2045
c12c9a3c 2046 blk_set_queue_dying(md->queue);
3f77316d 2047
ab7c7bb6
MP
2048 /*
2049 * Take suspend_lock so that presuspend and postsuspend methods
2050 * do not race with internal suspend.
2051 */
2052 mutex_lock(&md->suspend_lock);
2a708cff 2053 map = dm_get_live_table(md, &srcu_idx);
3f77316d
KU
2054 if (!dm_suspended_md(md)) {
2055 dm_table_presuspend_targets(map);
adc0daad 2056 set_bit(DMF_SUSPENDED, &md->flags);
5df96f2b 2057 set_bit(DMF_POST_SUSPENDING, &md->flags);
3f77316d 2058 dm_table_postsuspend_targets(map);
1da177e4 2059 }
83d5e5b0
MP
2060 /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2061 dm_put_live_table(md, srcu_idx);
2a708cff 2062 mutex_unlock(&md->suspend_lock);
83d5e5b0 2063
3f77316d
KU
2064 /*
2065 * Rare, but there may be I/O requests still going to complete,
2066 * for example. Wait for all references to disappear.
2067 * No one should increment the reference count of the mapped_device,
2068 * after the mapped_device state becomes DMF_FREEING.
2069 */
2070 if (wait)
2071 while (atomic_read(&md->holders))
2072 msleep(1);
2073 else if (atomic_read(&md->holders))
2074 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2075 dm_device_name(md), atomic_read(&md->holders));
2076
3f77316d
KU
2077 dm_table_destroy(__unbind(md));
2078 free_dev(md);
2079}
2080
2081void dm_destroy(struct mapped_device *md)
2082{
2083 __dm_destroy(md, true);
2084}
2085
2086void dm_destroy_immediate(struct mapped_device *md)
2087{
2088 __dm_destroy(md, false);
2089}
2090
2091void dm_put(struct mapped_device *md)
2092{
2093 atomic_dec(&md->holders);
1da177e4 2094}
79eb885c 2095EXPORT_SYMBOL_GPL(dm_put);
1da177e4 2096
9f6dc633 2097static bool dm_in_flight_bios(struct mapped_device *md)
85067747
ML
2098{
2099 int cpu;
9f6dc633 2100 unsigned long sum = 0;
85067747 2101
9f6dc633
MS
2102 for_each_possible_cpu(cpu)
2103 sum += *per_cpu_ptr(md->pending_io, cpu);
85067747
ML
2104
2105 return sum != 0;
2106}
2107
2f064a59 2108static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
46125c1c
MB
2109{
2110 int r = 0;
9f4c3f87 2111 DEFINE_WAIT(wait);
46125c1c 2112
85067747 2113 while (true) {
9f4c3f87 2114 prepare_to_wait(&md->wait, &wait, task_state);
46125c1c 2115
9f6dc633 2116 if (!dm_in_flight_bios(md))
46125c1c
MB
2117 break;
2118
e3fabdfd 2119 if (signal_pending_state(task_state, current)) {
46125c1c
MB
2120 r = -EINTR;
2121 break;
2122 }
2123
2124 io_schedule();
2125 }
9f4c3f87 2126 finish_wait(&md->wait, &wait);
b44ebeb0 2127
9f6dc633
MS
2128 smp_rmb();
2129
46125c1c
MB
2130 return r;
2131}
2132
2f064a59 2133static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
85067747
ML
2134{
2135 int r = 0;
2136
2137 if (!queue_is_mq(md->queue))
2138 return dm_wait_for_bios_completion(md, task_state);
2139
2140 while (true) {
2141 if (!blk_mq_queue_inflight(md->queue))
2142 break;
2143
2144 if (signal_pending_state(task_state, current)) {
2145 r = -EINTR;
2146 break;
2147 }
2148
2149 msleep(5);
2150 }
2151
2152 return r;
2153}
2154
1da177e4
LT
2155/*
2156 * Process the deferred bios
2157 */
ef208587 2158static void dm_wq_work(struct work_struct *work)
1da177e4 2159{
0c2915b8
MS
2160 struct mapped_device *md = container_of(work, struct mapped_device, work);
2161 struct bio *bio;
ef208587 2162
3b00b203 2163 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
df12ee99 2164 spin_lock_irq(&md->deferred_lock);
0c2915b8 2165 bio = bio_list_pop(&md->deferred);
df12ee99
AK
2166 spin_unlock_irq(&md->deferred_lock);
2167
0c2915b8 2168 if (!bio)
df12ee99 2169 break;
022c2611 2170
0c2915b8 2171 submit_bio_noacct(bio);
022c2611 2172 }
1da177e4
LT
2173}
2174
9a1fb464 2175static void dm_queue_flush(struct mapped_device *md)
304f3f6a 2176{
3b00b203 2177 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
4e857c58 2178 smp_mb__after_atomic();
53d5914f 2179 queue_work(md->wq, &md->work);
304f3f6a
MB
2180}
2181
1da177e4 2182/*
042d2a9b 2183 * Swap in a new table, returning the old one for the caller to destroy.
1da177e4 2184 */
042d2a9b 2185struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
1da177e4 2186{
87eb5b21 2187 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
754c5fc7 2188 struct queue_limits limits;
042d2a9b 2189 int r;
1da177e4 2190
e61290a4 2191 mutex_lock(&md->suspend_lock);
1da177e4
LT
2192
2193 /* device must be suspended */
4f186f8b 2194 if (!dm_suspended_md(md))
93c534ae 2195 goto out;
1da177e4 2196
3ae70656
MS
2197 /*
2198 * If the new table has no data devices, retain the existing limits.
2199 * This helps multipath with queue_if_no_path if all paths disappear,
2200 * then new I/O is queued based on these limits, and then some paths
2201 * reappear.
2202 */
2203 if (dm_table_has_no_data_devices(table)) {
83d5e5b0 2204 live_map = dm_get_live_table_fast(md);
3ae70656
MS
2205 if (live_map)
2206 limits = md->queue->limits;
83d5e5b0 2207 dm_put_live_table_fast(md);
3ae70656
MS
2208 }
2209
87eb5b21
MC
2210 if (!live_map) {
2211 r = dm_calculate_queue_limits(table, &limits);
2212 if (r) {
2213 map = ERR_PTR(r);
2214 goto out;
2215 }
042d2a9b 2216 }
754c5fc7 2217
042d2a9b 2218 map = __bind(md, table, &limits);
62e08243 2219 dm_issue_global_event();
1da177e4 2220
93c534ae 2221out:
e61290a4 2222 mutex_unlock(&md->suspend_lock);
042d2a9b 2223 return map;
1da177e4
LT
2224}
2225
2226/*
2227 * Functions to lock and unlock any filesystem running on the
2228 * device.
2229 */
2ca3310e 2230static int lock_fs(struct mapped_device *md)
1da177e4 2231{
e39e2e95 2232 int r;
1da177e4 2233
040f04bd 2234 WARN_ON(test_bit(DMF_FROZEN, &md->flags));
aa8d7c2f 2235
977115c0 2236 r = freeze_bdev(md->disk->part0);
040f04bd
CH
2237 if (!r)
2238 set_bit(DMF_FROZEN, &md->flags);
2239 return r;
1da177e4
LT
2240}
2241
2ca3310e 2242static void unlock_fs(struct mapped_device *md)
1da177e4 2243{
aa8d7c2f
AK
2244 if (!test_bit(DMF_FROZEN, &md->flags))
2245 return;
977115c0 2246 thaw_bdev(md->disk->part0);
aa8d7c2f 2247 clear_bit(DMF_FROZEN, &md->flags);
1da177e4
LT
2248}
2249
2250/*
b48633f8
BVA
2251 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2252 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2253 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2254 *
ffcc3936
MS
2255 * If __dm_suspend returns 0, the device is completely quiescent
2256 * now. There is no request-processing activity. All new requests
2257 * are being added to md->deferred list.
cec47e3d 2258 */
ffcc3936 2259static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2f064a59 2260 unsigned suspend_flags, unsigned int task_state,
eaf9a736 2261 int dmf_suspended_flag)
1da177e4 2262{
ffcc3936
MS
2263 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2264 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2265 int r;
1da177e4 2266
5a8f1f80
BVA
2267 lockdep_assert_held(&md->suspend_lock);
2268
2e93ccc1
KU
2269 /*
2270 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2271 * This flag is cleared before dm_suspend returns.
2272 */
2273 if (noflush)
2274 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
86331f39 2275 else
ac75b09f 2276 DMDEBUG("%s: suspending with flush", dm_device_name(md));
2e93ccc1 2277
d67ee213
MS
2278 /*
2279 * This gets reverted if there's an error later and the targets
2280 * provide the .presuspend_undo hook.
2281 */
cf222b37
AK
2282 dm_table_presuspend_targets(map);
2283
32a926da 2284 /*
9f518b27
KU
2285 * Flush I/O to the device.
2286 * Any I/O submitted after lock_fs() may not be flushed.
2287 * noflush takes precedence over do_lockfs.
2288 * (lock_fs() flushes I/Os and waits for them to complete.)
32a926da
MP
2289 */
2290 if (!noflush && do_lockfs) {
2291 r = lock_fs(md);
d67ee213
MS
2292 if (r) {
2293 dm_table_presuspend_undo_targets(map);
ffcc3936 2294 return r;
d67ee213 2295 }
aa8d7c2f 2296 }
1da177e4
LT
2297
2298 /*
3b00b203
MP
2299 * Here we must make sure that no processes are submitting requests
2300 * to target drivers i.e. no one may be executing
96c9865c 2301 * dm_split_and_process_bio from dm_submit_bio.
3b00b203 2302 *
96c9865c 2303 * To get all processes out of dm_split_and_process_bio in dm_submit_bio,
3b00b203 2304 * we take the write lock. To prevent any process from reentering
96c9865c 2305 * dm_split_and_process_bio from dm_submit_bio and quiesce the thread
0cede372 2306 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
6a8736d1 2307 * flush_workqueue(md->wq).
1da177e4 2308 */
1eb787ec 2309 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
41abc4e1
HR
2310 if (map)
2311 synchronize_srcu(&md->io_barrier);
1da177e4 2312
d0bcb878 2313 /*
29e4013d
TH
2314 * Stop md->queue before flushing md->wq in case request-based
2315 * dm defers requests to md->wq from md->queue.
d0bcb878 2316 */
6a23e05c 2317 if (dm_request_based(md))
eca7ee6d 2318 dm_stop_queue(md->queue);
cec47e3d 2319
d0bcb878
KU
2320 flush_workqueue(md->wq);
2321
1da177e4 2322 /*
3b00b203
MP
2323 * At this point no more requests are entering target request routines.
2324 * We call dm_wait_for_completion to wait for all existing requests
2325 * to finish.
1da177e4 2326 */
b48633f8 2327 r = dm_wait_for_completion(md, task_state);
eaf9a736
MS
2328 if (!r)
2329 set_bit(dmf_suspended_flag, &md->flags);
1da177e4 2330
6d6f10df 2331 if (noflush)
022c2611 2332 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
41abc4e1
HR
2333 if (map)
2334 synchronize_srcu(&md->io_barrier);
2e93ccc1 2335
1da177e4 2336 /* were we interrupted ? */
46125c1c 2337 if (r < 0) {
9a1fb464 2338 dm_queue_flush(md);
73d410c0 2339
cec47e3d 2340 if (dm_request_based(md))
eca7ee6d 2341 dm_start_queue(md->queue);
cec47e3d 2342
2ca3310e 2343 unlock_fs(md);
d67ee213 2344 dm_table_presuspend_undo_targets(map);
ffcc3936 2345 /* pushback list is already flushed, so skip flush */
2ca3310e 2346 }
1da177e4 2347
ffcc3936
MS
2348 return r;
2349}
2350
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
2358/*
2359 * Suspend mechanism in request-based dm.
2360 *
2361 * 1. Flush all I/Os by lock_fs() if needed.
2362 * 2. Stop dispatching any I/O by stopping the request_queue.
2363 * 3. Wait for all in-flight I/Os to be completed or requeued.
2364 *
2365 * To abort suspend, start the request_queue.
2366 */
2367int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2368{
2369 struct dm_table *map = NULL;
2370 int r = 0;
2371
2372retry:
2373 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2374
2375 if (dm_suspended_md(md)) {
2376 r = -EINVAL;
2377 goto out_unlock;
2378 }
2379
2380 if (dm_suspended_internally_md(md)) {
2381 /* already internally suspended, wait for internal resume */
2382 mutex_unlock(&md->suspend_lock);
2383 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2384 if (r)
2385 return r;
2386 goto retry;
2387 }
2388
a12f5d48 2389 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
ffcc3936 2390
eaf9a736 2391 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
ffcc3936
MS
2392 if (r)
2393 goto out_unlock;
3b00b203 2394
5df96f2b 2395 set_bit(DMF_POST_SUSPENDING, &md->flags);
4d4471cb 2396 dm_table_postsuspend_targets(map);
5df96f2b 2397 clear_bit(DMF_POST_SUSPENDING, &md->flags);
4d4471cb 2398
d287483d 2399out_unlock:
e61290a4 2400 mutex_unlock(&md->suspend_lock);
cf222b37 2401 return r;
1da177e4
LT
2402}
2403
ffcc3936
MS
2404static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2405{
2406 if (map) {
2407 int r = dm_table_resume_targets(map);
2408 if (r)
2409 return r;
2410 }
2411
2412 dm_queue_flush(md);
2413
2414 /*
2415 * Flushing deferred I/Os must be done after targets are resumed
2416 * so that mapping of targets can work correctly.
2417 * Request-based dm is queueing the deferred I/Os in its request_queue.
2418 */
2419 if (dm_request_based(md))
eca7ee6d 2420 dm_start_queue(md->queue);
ffcc3936
MS
2421
2422 unlock_fs(md);
2423
2424 return 0;
2425}
2426
1da177e4
LT
2427int dm_resume(struct mapped_device *md)
2428{
8dc23658 2429 int r;
cf222b37 2430 struct dm_table *map = NULL;
1da177e4 2431
ffcc3936 2432retry:
8dc23658 2433 r = -EINVAL;
ffcc3936
MS
2434 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2435
4f186f8b 2436 if (!dm_suspended_md(md))
cf222b37 2437 goto out;
cf222b37 2438
ffcc3936
MS
2439 if (dm_suspended_internally_md(md)) {
2440 /* already internally suspended, wait for internal resume */
2441 mutex_unlock(&md->suspend_lock);
2442 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2443 if (r)
2444 return r;
2445 goto retry;
2446 }
2447
a12f5d48 2448 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2ca3310e 2449 if (!map || !dm_table_get_size(map))
cf222b37 2450 goto out;
1da177e4 2451
ffcc3936 2452 r = __dm_resume(md, map);
8757b776
MB
2453 if (r)
2454 goto out;
2ca3310e 2455
2ca3310e 2456 clear_bit(DMF_SUSPENDED, &md->flags);
cf222b37 2457out:
e61290a4 2458 mutex_unlock(&md->suspend_lock);
2ca3310e 2459
cf222b37 2460 return r;
1da177e4
LT
2461}
2462
fd2ed4d2
MP
2463/*
2464 * Internal suspend/resume works like userspace-driven suspend. It waits
2465 * until all bios finish and prevents issuing new bios to the target drivers.
2466 * It may be used only from the kernel.
fd2ed4d2
MP
2467 */
2468
ffcc3936 2469static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
fd2ed4d2 2470{
ffcc3936
MS
2471 struct dm_table *map = NULL;
2472
1ea0654e
BVA
2473 lockdep_assert_held(&md->suspend_lock);
2474
96b26c8c 2475 if (md->internal_suspend_count++)
ffcc3936
MS
2476 return; /* nested internal suspend */
2477
2478 if (dm_suspended_md(md)) {
2479 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2480 return; /* nest suspend */
2481 }
2482
a12f5d48 2483 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
ffcc3936
MS
2484
2485 /*
2486 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2487 * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend
2488 * would require changing .presuspend to return an error -- avoid this
2489 * until there is a need for more elaborate variants of internal suspend.
2490 */
eaf9a736
MS
2491 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2492 DMF_SUSPENDED_INTERNALLY);
ffcc3936 2493
5df96f2b 2494 set_bit(DMF_POST_SUSPENDING, &md->flags);
ffcc3936 2495 dm_table_postsuspend_targets(map);
5df96f2b 2496 clear_bit(DMF_POST_SUSPENDING, &md->flags);
ffcc3936
MS
2497}
2498
2499static void __dm_internal_resume(struct mapped_device *md)
2500{
96b26c8c
MP
2501 BUG_ON(!md->internal_suspend_count);
2502
2503 if (--md->internal_suspend_count)
ffcc3936
MS
2504 return; /* resume from nested internal suspend */
2505
fd2ed4d2 2506 if (dm_suspended_md(md))
ffcc3936
MS
2507 goto done; /* resume from nested suspend */
2508
2509 /*
2510 * NOTE: existing callers don't need to call dm_table_resume_targets
2511 * (which may fail -- so best to avoid it for now by passing NULL map)
2512 */
2513 (void) __dm_resume(md, NULL);
2514
2515done:
2516 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2517 smp_mb__after_atomic();
2518 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2519}
2520
2521void dm_internal_suspend_noflush(struct mapped_device *md)
2522{
2523 mutex_lock(&md->suspend_lock);
2524 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2525 mutex_unlock(&md->suspend_lock);
2526}
2527EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2528
2529void dm_internal_resume(struct mapped_device *md)
2530{
2531 mutex_lock(&md->suspend_lock);
2532 __dm_internal_resume(md);
2533 mutex_unlock(&md->suspend_lock);
2534}
2535EXPORT_SYMBOL_GPL(dm_internal_resume);
2536
2537/*
2538 * Fast variants of internal suspend/resume hold md->suspend_lock,
2539 * which prevents interaction with userspace-driven suspend.
2540 */
2541
2542void dm_internal_suspend_fast(struct mapped_device *md)
2543{
2544 mutex_lock(&md->suspend_lock);
2545 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
fd2ed4d2
MP
2546 return;
2547
2548 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2549 synchronize_srcu(&md->io_barrier);
2550 flush_workqueue(md->wq);
2551 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2552}
b735fede 2553EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
fd2ed4d2 2554
ffcc3936 2555void dm_internal_resume_fast(struct mapped_device *md)
fd2ed4d2 2556{
ffcc3936 2557 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
fd2ed4d2
MP
2558 goto done;
2559
2560 dm_queue_flush(md);
2561
2562done:
2563 mutex_unlock(&md->suspend_lock);
2564}
b735fede 2565EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
fd2ed4d2 2566
1da177e4
LT
2567/*-----------------------------------------------------------------
2568 * Event notification.
2569 *---------------------------------------------------------------*/
3abf85b5 2570int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
60935eb2 2571 unsigned cookie)
69267a30 2572{
6958c1c6
MP
2573 int r;
2574 unsigned noio_flag;
60935eb2
MB
2575 char udev_cookie[DM_COOKIE_LENGTH];
2576 char *envp[] = { udev_cookie, NULL };
2577
6958c1c6
MP
2578 noio_flag = memalloc_noio_save();
2579
60935eb2 2580 if (!cookie)
6958c1c6 2581 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
60935eb2
MB
2582 else {
2583 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2584 DM_COOKIE_ENV_VAR_NAME, cookie);
6958c1c6
MP
2585 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2586 action, envp);
60935eb2 2587 }
6958c1c6
MP
2588
2589 memalloc_noio_restore(noio_flag);
2590
2591 return r;
69267a30
AK
2592}
2593
7a8c3d3b
MA
2594uint32_t dm_next_uevent_seq(struct mapped_device *md)
2595{
2596 return atomic_add_return(1, &md->uevent_seq);
2597}
2598
1da177e4
LT
2599uint32_t dm_get_event_nr(struct mapped_device *md)
2600{
2601 return atomic_read(&md->event_nr);
2602}
2603
2604int dm_wait_event(struct mapped_device *md, int event_nr)
2605{
2606 return wait_event_interruptible(md->eventq,
2607 (event_nr != atomic_read(&md->event_nr)));
2608}
2609
7a8c3d3b
MA
2610void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2611{
2612 unsigned long flags;
2613
2614 spin_lock_irqsave(&md->uevent_lock, flags);
2615 list_add(elist, &md->uevent_list);
2616 spin_unlock_irqrestore(&md->uevent_lock, flags);
2617}
2618
1da177e4
LT
2619/*
2620 * The gendisk is only valid as long as you have a reference
2621 * count on 'md'.
2622 */
2623struct gendisk *dm_disk(struct mapped_device *md)
2624{
2625 return md->disk;
2626}
65ff5b7d 2627EXPORT_SYMBOL_GPL(dm_disk);
1da177e4 2628
784aae73
MB
2629struct kobject *dm_kobject(struct mapped_device *md)
2630{
2995fa78 2631 return &md->kobj_holder.kobj;
784aae73
MB
2632}
2633
784aae73
MB
2634struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2635{
2636 struct mapped_device *md;
2637
2995fa78 2638 md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
784aae73 2639
b9a41d21
HT
2640 spin_lock(&_minor_lock);
2641 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2642 md = NULL;
2643 goto out;
2644 }
784aae73 2645 dm_get(md);
b9a41d21
HT
2646out:
2647 spin_unlock(&_minor_lock);
2648
784aae73
MB
2649 return md;
2650}
2651
4f186f8b 2652int dm_suspended_md(struct mapped_device *md)
1da177e4
LT
2653{
2654 return test_bit(DMF_SUSPENDED, &md->flags);
2655}
2656
5df96f2b
MP
2657static int dm_post_suspending_md(struct mapped_device *md)
2658{
2659 return test_bit(DMF_POST_SUSPENDING, &md->flags);
2660}
2661
ffcc3936
MS
2662int dm_suspended_internally_md(struct mapped_device *md)
2663{
2664 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2665}
2666
2c140a24
MP
2667int dm_test_deferred_remove_flag(struct mapped_device *md)
2668{
2669 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2670}
2671
64dbce58
KU
2672int dm_suspended(struct dm_target *ti)
2673{
33bd6f06 2674 return dm_suspended_md(ti->table->md);
64dbce58
KU
2675}
2676EXPORT_SYMBOL_GPL(dm_suspended);
2677
5df96f2b
MP
2678int dm_post_suspending(struct dm_target *ti)
2679{
33bd6f06 2680 return dm_post_suspending_md(ti->table->md);
5df96f2b
MP
2681}
2682EXPORT_SYMBOL_GPL(dm_post_suspending);
2683
2e93ccc1
KU
2684int dm_noflush_suspending(struct dm_target *ti)
2685{
33bd6f06 2686 return __noflush_suspending(ti->table->md);
2e93ccc1
KU
2687}
2688EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2689
7e0d574f 2690struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
0776aa0e
MS
2691 unsigned integrity, unsigned per_io_data_size,
2692 unsigned min_pool_size)
e6ee8c0b 2693{
115485e8 2694 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
78d8e58a 2695 unsigned int pool_size = 0;
64f52b0e 2696 unsigned int front_pad, io_front_pad;
6f1c819c 2697 int ret;
e6ee8c0b
KU
2698
2699 if (!pools)
4e6e36c3 2700 return NULL;
e6ee8c0b 2701
78d8e58a
MS
2702 switch (type) {
2703 case DM_TYPE_BIO_BASED:
545ed20e 2704 case DM_TYPE_DAX_BIO_BASED:
0776aa0e 2705 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
62f26317
JX
2706 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
2707 io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
6f1c819c
KO
2708 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
2709 if (ret)
64f52b0e 2710 goto out;
6f1c819c 2711 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
eb8db831 2712 goto out;
78d8e58a
MS
2713 break;
2714 case DM_TYPE_REQUEST_BASED:
0776aa0e 2715 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
78d8e58a 2716 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
591ddcfc 2717 /* per_io_data_size is used for blk-mq pdu at queue allocation */
78d8e58a
MS
2718 break;
2719 default:
2720 BUG();
2721 }
2722
6f1c819c
KO
2723 ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
2724 if (ret)
5f015204 2725 goto out;
e6ee8c0b 2726
6f1c819c 2727 if (integrity && bioset_integrity_create(&pools->bs, pool_size))
5f015204 2728 goto out;
a91a2785 2729
e6ee8c0b 2730 return pools;
5f1b670d 2731
5f1b670d
CH
2732out:
2733 dm_free_md_mempools(pools);
78d8e58a 2734
4e6e36c3 2735 return NULL;
e6ee8c0b
KU
2736}
2737
2738void dm_free_md_mempools(struct dm_md_mempools *pools)
2739{
2740 if (!pools)
2741 return;
2742
6f1c819c
KO
2743 bioset_exit(&pools->bs);
2744 bioset_exit(&pools->io_bs);
e6ee8c0b
KU
2745
2746 kfree(pools);
2747}
2748
9c72bad1
CH
2749struct dm_pr {
2750 u64 old_key;
2751 u64 new_key;
2752 u32 flags;
2753 bool fail_early;
2754};
2755
2756static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
2757 void *data)
71cdb697
CH
2758{
2759 struct mapped_device *md = bdev->bd_disk->private_data;
9c72bad1
CH
2760 struct dm_table *table;
2761 struct dm_target *ti;
2762 int ret = -ENOTTY, srcu_idx;
71cdb697 2763
9c72bad1
CH
2764 table = dm_get_live_table(md, &srcu_idx);
2765 if (!table || !dm_table_get_size(table))
2766 goto out;
71cdb697 2767
9c72bad1
CH
2768 /* We only support devices that have a single target */
2769 if (dm_table_get_num_targets(table) != 1)
2770 goto out;
2771 ti = dm_table_get_target(table, 0);
71cdb697 2772
9c72bad1
CH
2773 ret = -EINVAL;
2774 if (!ti->type->iterate_devices)
2775 goto out;
2776
2777 ret = ti->type->iterate_devices(ti, fn, data);
2778out:
2779 dm_put_live_table(md, srcu_idx);
2780 return ret;
2781}
2782
2783/*
2784 * For register / unregister we need to manually call out to every path.
2785 */
2786static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
2787 sector_t start, sector_t len, void *data)
2788{
2789 struct dm_pr *pr = data;
2790 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
2791
2792 if (!ops || !ops->pr_register)
2793 return -EOPNOTSUPP;
2794 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
2795}
2796
2797static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
2798 u32 flags)
2799{
2800 struct dm_pr pr = {
2801 .old_key = old_key,
2802 .new_key = new_key,
2803 .flags = flags,
2804 .fail_early = true,
2805 };
2806 int ret;
2807
2808 ret = dm_call_pr(bdev, __dm_pr_register, &pr);
2809 if (ret && new_key) {
2810 /* unregister all paths if we failed to register any path */
2811 pr.old_key = new_key;
2812 pr.new_key = 0;
2813 pr.flags = 0;
2814 pr.fail_early = false;
2815 dm_call_pr(bdev, __dm_pr_register, &pr);
2816 }
2817
2818 return ret;
71cdb697
CH
2819}
2820
2821static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
956a4025 2822 u32 flags)
71cdb697
CH
2823{
2824 struct mapped_device *md = bdev->bd_disk->private_data;
2825 const struct pr_ops *ops;
971888c4 2826 int r, srcu_idx;
71cdb697 2827
5bd5e8d8 2828 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
71cdb697 2829 if (r < 0)
971888c4 2830 goto out;
71cdb697
CH
2831
2832 ops = bdev->bd_disk->fops->pr_ops;
2833 if (ops && ops->pr_reserve)
2834 r = ops->pr_reserve(bdev, key, type, flags);
2835 else
2836 r = -EOPNOTSUPP;
971888c4
MS
2837out:
2838 dm_unprepare_ioctl(md, srcu_idx);
71cdb697
CH
2839 return r;
2840}
2841
2842static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2843{
2844 struct mapped_device *md = bdev->bd_disk->private_data;
2845 const struct pr_ops *ops;
971888c4 2846 int r, srcu_idx;
71cdb697 2847
5bd5e8d8 2848 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
71cdb697 2849 if (r < 0)
971888c4 2850 goto out;
71cdb697
CH
2851
2852 ops = bdev->bd_disk->fops->pr_ops;
2853 if (ops && ops->pr_release)
2854 r = ops->pr_release(bdev, key, type);
2855 else
2856 r = -EOPNOTSUPP;
971888c4
MS
2857out:
2858 dm_unprepare_ioctl(md, srcu_idx);
71cdb697
CH
2859 return r;
2860}
2861
2862static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
956a4025 2863 enum pr_type type, bool abort)
71cdb697
CH
2864{
2865 struct mapped_device *md = bdev->bd_disk->private_data;
2866 const struct pr_ops *ops;
971888c4 2867 int r, srcu_idx;
71cdb697 2868
5bd5e8d8 2869 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
71cdb697 2870 if (r < 0)
971888c4 2871 goto out;
71cdb697
CH
2872
2873 ops = bdev->bd_disk->fops->pr_ops;
2874 if (ops && ops->pr_preempt)
2875 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
2876 else
2877 r = -EOPNOTSUPP;
971888c4
MS
2878out:
2879 dm_unprepare_ioctl(md, srcu_idx);
71cdb697
CH
2880 return r;
2881}
2882
2883static int dm_pr_clear(struct block_device *bdev, u64 key)
2884{
2885 struct mapped_device *md = bdev->bd_disk->private_data;
2886 const struct pr_ops *ops;
971888c4 2887 int r, srcu_idx;
71cdb697 2888
5bd5e8d8 2889 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
71cdb697 2890 if (r < 0)
971888c4 2891 goto out;
71cdb697
CH
2892
2893 ops = bdev->bd_disk->fops->pr_ops;
2894 if (ops && ops->pr_clear)
2895 r = ops->pr_clear(bdev, key);
2896 else
2897 r = -EOPNOTSUPP;
971888c4
MS
2898out:
2899 dm_unprepare_ioctl(md, srcu_idx);
71cdb697
CH
2900 return r;
2901}
2902
2903static const struct pr_ops dm_pr_ops = {
2904 .pr_register = dm_pr_register,
2905 .pr_reserve = dm_pr_reserve,
2906 .pr_release = dm_pr_release,
2907 .pr_preempt = dm_pr_preempt,
2908 .pr_clear = dm_pr_clear,
2909};
2910
83d5cde4 2911static const struct block_device_operations dm_blk_dops = {
c62b37d9 2912 .submit_bio = dm_submit_bio,
1da177e4
LT
2913 .open = dm_blk_open,
2914 .release = dm_blk_close,
aa129a22 2915 .ioctl = dm_blk_ioctl,
3ac51e74 2916 .getgeo = dm_blk_getgeo,
e76239a3 2917 .report_zones = dm_blk_report_zones,
71cdb697 2918 .pr_ops = &dm_pr_ops,
1da177e4
LT
2919 .owner = THIS_MODULE
2920};
2921
681cc5e8
MS
2922static const struct block_device_operations dm_rq_blk_dops = {
2923 .open = dm_blk_open,
2924 .release = dm_blk_close,
2925 .ioctl = dm_blk_ioctl,
2926 .getgeo = dm_blk_getgeo,
2927 .pr_ops = &dm_pr_ops,
2928 .owner = THIS_MODULE
2929};
2930
f26c5719
DW
2931static const struct dax_operations dm_dax_ops = {
2932 .direct_access = dm_dax_direct_access,
cdf6cdcd 2933 .zero_page_range = dm_dax_zero_page_range,
f26c5719
DW
2934};
2935
1da177e4
LT
2936/*
2937 * module hooks
2938 */
2939module_init(dm_init);
2940module_exit(dm_exit);
2941
2942module_param(major, uint, 0);
2943MODULE_PARM_DESC(major, "The major number of the device mapper");
f4790826 2944
e8603136
MS
2945module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
2946MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
2947
115485e8
MS
2948module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
2949MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
2950
a666e5c0
MP
2951module_param(swap_bios, int, S_IRUGO | S_IWUSR);
2952MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
2953
1da177e4
LT
2954MODULE_DESCRIPTION(DM_NAME " driver");
2955MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2956MODULE_LICENSE("GPL");