drivers/md/dm.c

   1 /*
   2  * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3  * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4  *
   5  * This file is released under the GPL.
   6  */
   7
   8 #include "dm-core.h"
   9 #include "dm-rq.h"
  10 #include "dm-uevent.h"
  11
  12 #include <linux/init.h>
  13 #include <linux/module.h>
  14 #include <linux/mutex.h>
  15 #include <linux/sched/signal.h>
  16 #include <linux/blkpg.h>
  17 #include <linux/bio.h>
  18 #include <linux/mempool.h>
  19 #include <linux/dax.h>
  20 #include <linux/slab.h>
  21 #include <linux/idr.h>
  22 #include <linux/uio.h>
  23 #include <linux/hdreg.h>
  24 #include <linux/delay.h>
  25 #include <linux/wait.h>
  26 #include <linux/pr.h>
  27 #include <linux/refcount.h>
  28 #include <linux/part_stat.h>
  29 #include <linux/blk-crypto.h>
  30
  31 #define DM_MSG_PREFIX "core"
  32
  33 /*
  34  * Cookies are numeric values sent with CHANGE and REMOVE
  35  * uevents while resuming, removing or renaming the device.
  36  */
  37 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  38 #define DM_COOKIE_LENGTH 24
  39
  40 static const char *_name = DM_NAME;
  41
  42 static unsigned int major = 0;
  43 static unsigned int _major = 0;
  44
  45 static DEFINE_IDR(_minor_idr);
  46
  47 static DEFINE_SPINLOCK(_minor_lock);
  48
  49 static void do_deferred_remove(struct work_struct *w);
  50
  51 static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
  52
  53 static struct workqueue_struct *deferred_remove_workqueue;
  54
  55 atomic_t dm_global_event_nr = ATOMIC_INIT(0);
  56 DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
  57
  58 void dm_issue_global_event(void)
  59 {
  60         atomic_inc(&dm_global_event_nr);
  61         wake_up(&dm_global_eventq);
  62 }
  63
  64 /*
  65  * One of these is allocated (on-stack) per original bio.
  66  */
  67 struct clone_info {
  68         struct dm_table *map;
  69         struct bio *bio;
  70         struct dm_io *io;
  71         sector_t sector;
  72         unsigned sector_count;
  73 };
  74
  75 /*
  76  * One of these is allocated per clone bio.
  77  */
  78 #define DM_TIO_MAGIC 7282014
  79 struct dm_target_io {
  80         unsigned magic;
  81         struct dm_io *io;
  82         struct dm_target *ti;
  83         unsigned target_bio_nr;
  84         unsigned *len_ptr;
  85         bool inside_dm_io;
  86         struct bio clone;
  87 };
  88
  89 /*
  90  * One of these is allocated per original bio.
  91  * It contains the first clone used for that original.
  92  */
  93 #define DM_IO_MAGIC 5191977
  94 struct dm_io {
  95         unsigned magic;
  96         struct mapped_device *md;
  97         blk_status_t status;
  98         atomic_t io_count;
  99         struct bio *orig_bio;
 100         unsigned long start_time;
 101         spinlock_t endio_lock;
 102         struct dm_stats_aux stats_aux;
 103         /* last member of dm_target_io is 'struct bio' */
 104         struct dm_target_io tio;
 105 };
 106
 107 void *dm_per_bio_data(struct bio *bio, size_t data_size)
 108 {
 109         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 110         if (!tio->inside_dm_io)
 111                 return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
 112         return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
 113 }
 114 EXPORT_SYMBOL_GPL(dm_per_bio_data);
 115
 116 struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
 117 {
 118         struct dm_io *io = (struct dm_io *)((char *)data + data_size);
 119         if (io->magic == DM_IO_MAGIC)
 120                 return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
 121         BUG_ON(io->magic != DM_TIO_MAGIC);
 122         return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
 123 }
 124 EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
 125
 126 unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
 127 {
 128         return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
 129 }
 130 EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
 131
 132 #define MINOR_ALLOCED ((void *)-1)
 133
 134 /*
 135  * Bits for the md->flags field.
 136  */
 137 #define DMF_BLOCK_IO_FOR_SUSPEND 0
 138 #define DMF_SUSPENDED 1
 139 #define DMF_FROZEN 2
 140 #define DMF_FREEING 3
 141 #define DMF_DELETING 4
 142 #define DMF_NOFLUSH_SUSPENDING 5
 143 #define DMF_DEFERRED_REMOVE 6
 144 #define DMF_SUSPENDED_INTERNALLY 7
 145
 146 #define DM_NUMA_NODE NUMA_NO_NODE
 147 static int dm_numa_node = DM_NUMA_NODE;
 148
 149 /*
 150  * For mempools pre-allocation at the table loading time.
 151  */
 152 struct dm_md_mempools {
 153         struct bio_set bs;
 154         struct bio_set io_bs;
 155 };
 156
 157 struct table_device {
 158         struct list_head list;
 159         refcount_t count;
 160         struct dm_dev dm_dev;
 161 };
 162
 163 /*
 164  * Bio-based DM's mempools' reserved IOs set by the user.
 165  */
 166 #define RESERVED_BIO_BASED_IOS          16
 167 static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 168
 169 static int __dm_get_module_param_int(int *module_param, int min, int max)
 170 {
 171         int param = READ_ONCE(*module_param);
 172         int modified_param = 0;
 173         bool modified = true;
 174
 175         if (param < min)
 176                 modified_param = min;
 177         else if (param > max)
 178                 modified_param = max;
 179         else
 180                 modified = false;
 181
 182         if (modified) {
 183                 (void)cmpxchg(module_param, param, modified_param);
 184                 param = modified_param;
 185         }
 186
 187         return param;
 188 }
 189
 190 unsigned __dm_get_module_param(unsigned *module_param,
 191                                unsigned def, unsigned max)
 192 {
 193         unsigned param = READ_ONCE(*module_param);
 194         unsigned modified_param = 0;
 195
 196         if (!param)
 197                 modified_param = def;
 198         else if (param > max)
 199                 modified_param = max;
 200
 201         if (modified_param) {
 202                 (void)cmpxchg(module_param, param, modified_param);
 203                 param = modified_param;
 204         }
 205
 206         return param;
 207 }
 208
 209 unsigned dm_get_reserved_bio_based_ios(void)
 210 {
 211         return __dm_get_module_param(&reserved_bio_based_ios,
 212                                      RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 213 }
 214 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 215
 216 static unsigned dm_get_numa_node(void)
 217 {
 218         return __dm_get_module_param_int(&dm_numa_node,
 219                                          DM_NUMA_NODE, num_online_nodes() - 1);
 220 }
 221
 222 static int __init local_init(void)
 223 {
 224         int r;
 225
 226         r = dm_uevent_init();
 227         if (r)
 228                 return r;
 229
 230         deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 231         if (!deferred_remove_workqueue) {
 232                 r = -ENOMEM;
 233                 goto out_uevent_exit;
 234         }
 235
 236         _major = major;
 237         r = register_blkdev(_major, _name);
 238         if (r < 0)
 239                 goto out_free_workqueue;
 240
 241         if (!_major)
 242                 _major = r;
 243
 244         return 0;
 245
 246 out_free_workqueue:
 247         destroy_workqueue(deferred_remove_workqueue);
 248 out_uevent_exit:
 249         dm_uevent_exit();
 250
 251         return r;
 252 }
 253
 254 static void local_exit(void)
 255 {
 256         flush_scheduled_work();
 257         destroy_workqueue(deferred_remove_workqueue);
 258
 259         unregister_blkdev(_major, _name);
 260         dm_uevent_exit();
 261
 262         _major = 0;
 263
 264         DMINFO("cleaned up");
 265 }
 266
 267 static int (*_inits[])(void) __initdata = {
 268         local_init,
 269         dm_target_init,
 270         dm_linear_init,
 271         dm_stripe_init,
 272         dm_io_init,
 273         dm_kcopyd_init,
 274         dm_interface_init,
 275         dm_statistics_init,
 276 };
 277
 278 static void (*_exits[])(void) = {
 279         local_exit,
 280         dm_target_exit,
 281         dm_linear_exit,
 282         dm_stripe_exit,
 283         dm_io_exit,
 284         dm_kcopyd_exit,
 285         dm_interface_exit,
 286         dm_statistics_exit,
 287 };
 288
 289 static int __init dm_init(void)
 290 {
 291         const int count = ARRAY_SIZE(_inits);
 292
 293         int r, i;
 294
 295         for (i = 0; i < count; i++) {
 296                 r = _inits[i]();
 297                 if (r)
 298                         goto bad;
 299         }
 300
 301         return 0;
 302
 303       bad:
 304         while (i--)
 305                 _exits[i]();
 306
 307         return r;
 308 }
 309
 310 static void __exit dm_exit(void)
 311 {
 312         int i = ARRAY_SIZE(_exits);
 313
 314         while (i--)
 315                 _exits[i]();
 316
 317         /*
 318          * Should be empty by this point.
 319          */
 320         idr_destroy(&_minor_idr);
 321 }
 322
 323 /*
 324  * Block device functions
 325  */
 326 int dm_deleting_md(struct mapped_device *md)
 327 {
 328         return test_bit(DMF_DELETING, &md->flags);
 329 }
 330
 331 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 332 {
 333         struct mapped_device *md;
 334
 335         spin_lock(&_minor_lock);
 336
 337         md = bdev->bd_disk->private_data;
 338         if (!md)
 339                 goto out;
 340
 341         if (test_bit(DMF_FREEING, &md->flags) ||
 342             dm_deleting_md(md)) {
 343                 md = NULL;
 344                 goto out;
 345         }
 346
 347         dm_get(md);
 348         atomic_inc(&md->open_count);
 349 out:
 350         spin_unlock(&_minor_lock);
 351
 352         return md ? 0 : -ENXIO;
 353 }
 354
 355 static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 356 {
 357         struct mapped_device *md;
 358
 359         spin_lock(&_minor_lock);
 360
 361         md = disk->private_data;
 362         if (WARN_ON(!md))
 363                 goto out;
 364
 365         if (atomic_dec_and_test(&md->open_count) &&
 366             (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 367                 queue_work(deferred_remove_workqueue, &deferred_remove_work);
 368
 369         dm_put(md);
 370 out:
 371         spin_unlock(&_minor_lock);
 372 }
 373
 374 int dm_open_count(struct mapped_device *md)
 375 {
 376         return atomic_read(&md->open_count);
 377 }
 378
 379 /*
 380  * Guarantees nothing is using the device before it's deleted.
 381  */
 382 int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 383 {
 384         int r = 0;
 385
 386         spin_lock(&_minor_lock);
 387
 388         if (dm_open_count(md)) {
 389                 r = -EBUSY;
 390                 if (mark_deferred)
 391                         set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 392         } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 393                 r = -EEXIST;
 394         else
 395                 set_bit(DMF_DELETING, &md->flags);
 396
 397         spin_unlock(&_minor_lock);
 398
 399         return r;
 400 }
 401
 402 int dm_cancel_deferred_remove(struct mapped_device *md)
 403 {
 404         int r = 0;
 405
 406         spin_lock(&_minor_lock);
 407
 408         if (test_bit(DMF_DELETING, &md->flags))
 409                 r = -EBUSY;
 410         else
 411                 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 412
 413         spin_unlock(&_minor_lock);
 414
 415         return r;
 416 }
 417
 418 static void do_deferred_remove(struct work_struct *w)
 419 {
 420         dm_deferred_remove();
 421 }
 422
 423 sector_t dm_get_size(struct mapped_device *md)
 424 {
 425         return get_capacity(md->disk);
 426 }
 427
 428 struct request_queue *dm_get_md_queue(struct mapped_device *md)
 429 {
 430         return md->queue;
 431 }
 432
 433 struct dm_stats *dm_get_stats(struct mapped_device *md)
 434 {
 435         return &md->stats;
 436 }
 437
 438 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 439 {
 440         struct mapped_device *md = bdev->bd_disk->private_data;
 441
 442         return dm_get_geometry(md, geo);
 443 }
 444
 445 #ifdef CONFIG_BLK_DEV_ZONED
 446 int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
 447 {
 448         struct dm_report_zones_args *args = data;
 449         sector_t sector_diff = args->tgt->begin - args->start;
 450
 451         /*
 452          * Ignore zones beyond the target range.
 453          */
 454         if (zone->start >= args->start + args->tgt->len)
 455                 return 0;
 456
 457         /*
 458          * Remap the start sector and write pointer position of the zone
 459          * to match its position in the target range.
 460          */
 461         zone->start += sector_diff;
 462         if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
 463                 if (zone->cond == BLK_ZONE_COND_FULL)
 464                         zone->wp = zone->start + zone->len;
 465                 else if (zone->cond == BLK_ZONE_COND_EMPTY)
 466                         zone->wp = zone->start;
 467                 else
 468                         zone->wp += sector_diff;
 469         }
 470
 471         args->next_sector = zone->start + zone->len;
 472         return args->orig_cb(zone, args->zone_idx++, args->orig_data);
 473 }
 474 EXPORT_SYMBOL_GPL(dm_report_zones_cb);
 475
 476 static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
 477                 unsigned int nr_zones, report_zones_cb cb, void *data)
 478 {
 479         struct mapped_device *md = disk->private_data;
 480         struct dm_table *map;
 481         int srcu_idx, ret;
 482         struct dm_report_zones_args args = {
 483                 .next_sector = sector,
 484                 .orig_data = data,
 485                 .orig_cb = cb,
 486         };
 487
 488         if (dm_suspended_md(md))
 489                 return -EAGAIN;
 490
 491         map = dm_get_live_table(md, &srcu_idx);
 492         if (!map)
 493                 return -EIO;
 494
 495         do {
 496                 struct dm_target *tgt;
 497
 498                 tgt = dm_table_find_target(map, args.next_sector);
 499                 if (WARN_ON_ONCE(!tgt->type->report_zones)) {
 500                         ret = -EIO;
 501                         goto out;
 502                 }
 503
 504                 args.tgt = tgt;
 505                 ret = tgt->type->report_zones(tgt, &args, nr_zones);
 506                 if (ret < 0)
 507                         goto out;
 508         } while (args.zone_idx < nr_zones &&
 509                  args.next_sector < get_capacity(disk));
 510
 511         ret = args.zone_idx;
 512 out:
 513         dm_put_live_table(md, srcu_idx);
 514         return ret;
 515 }
 516 #else
 517 #define dm_blk_report_zones             NULL
 518 #endif /* CONFIG_BLK_DEV_ZONED */
 519
 520 static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
 521                             struct block_device **bdev)
 522         __acquires(md->io_barrier)
 523 {
 524         struct dm_target *tgt;
 525         struct dm_table *map;
 526         int r;
 527
 528 retry:
 529         r = -ENOTTY;
 530         map = dm_get_live_table(md, srcu_idx);
 531         if (!map || !dm_table_get_size(map))
 532                 return r;
 533
 534         /* We only support devices that have a single target */
 535         if (dm_table_get_num_targets(map) != 1)
 536                 return r;
 537
 538         tgt = dm_table_get_target(map, 0);
 539         if (!tgt->type->prepare_ioctl)
 540                 return r;
 541
 542         if (dm_suspended_md(md))
 543                 return -EAGAIN;
 544
 545         r = tgt->type->prepare_ioctl(tgt, bdev);
 546         if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 547                 dm_put_live_table(md, *srcu_idx);
 548                 msleep(10);
 549                 goto retry;
 550         }
 551
 552         return r;
 553 }
 554
 555 static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
 556         __releases(md->io_barrier)
 557 {
 558         dm_put_live_table(md, srcu_idx);
 559 }
 560
 561 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 562                         unsigned int cmd, unsigned long arg)
 563 {
 564         struct mapped_device *md = bdev->bd_disk->private_data;
 565         int r, srcu_idx;
 566
 567         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
 568         if (r < 0)
 569                 goto out;
 570
 571         if (r > 0) {
 572                 /*
 573                  * Target determined this ioctl is being issued against a
 574                  * subset of the parent bdev; require extra privileges.
 575                  */
 576                 if (!capable(CAP_SYS_RAWIO)) {
 577                         DMWARN_LIMIT(
 578         "%s: sending ioctl %x to DM device without required privilege.",
 579                                 current->comm, cmd);
 580                         r = -ENOIOCTLCMD;
 581                         goto out;
 582                 }
 583         }
 584
 585         r =  __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 586 out:
 587         dm_unprepare_ioctl(md, srcu_idx);
 588         return r;
 589 }
 590
 591 static void start_io_acct(struct dm_io *io);
 592
 593 static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 594 {
 595         struct dm_io *io;
 596         struct dm_target_io *tio;
 597         struct bio *clone;
 598
 599         clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
 600         if (!clone)
 601                 return NULL;
 602
 603         tio = container_of(clone, struct dm_target_io, clone);
 604         tio->inside_dm_io = true;
 605         tio->io = NULL;
 606
 607         io = container_of(tio, struct dm_io, tio);
 608         io->magic = DM_IO_MAGIC;
 609         io->status = 0;
 610         atomic_set(&io->io_count, 1);
 611         io->orig_bio = bio;
 612         io->md = md;
 613         spin_lock_init(&io->endio_lock);
 614
 615         start_io_acct(io);
 616
 617         return io;
 618 }
 619
 620 static void free_io(struct mapped_device *md, struct dm_io *io)
 621 {
 622         bio_put(&io->tio.clone);
 623 }
 624
 625 static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
 626                                       unsigned target_bio_nr, gfp_t gfp_mask)
 627 {
 628         struct dm_target_io *tio;
 629
 630         if (!ci->io->tio.io) {
 631                 /* the dm_target_io embedded in ci->io is available */
 632                 tio = &ci->io->tio;
 633         } else {
 634                 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
 635                 if (!clone)
 636                         return NULL;
 637
 638                 tio = container_of(clone, struct dm_target_io, clone);
 639                 tio->inside_dm_io = false;
 640         }
 641
 642         tio->magic = DM_TIO_MAGIC;
 643         tio->io = ci->io;
 644         tio->ti = ti;
 645         tio->target_bio_nr = target_bio_nr;
 646
 647         return tio;
 648 }
 649
 650 static void free_tio(struct dm_target_io *tio)
 651 {
 652         if (tio->inside_dm_io)
 653                 return;
 654         bio_put(&tio->clone);
 655 }
 656
 657 static bool md_in_flight_bios(struct mapped_device *md)
 658 {
 659         int cpu;
 660         struct hd_struct *part = &dm_disk(md)->part0;
 661         long sum = 0;
 662
 663         for_each_possible_cpu(cpu) {
 664                 sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
 665                 sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
 666         }
 667
 668         return sum != 0;
 669 }
 670
 671 static bool md_in_flight(struct mapped_device *md)
 672 {
 673         if (queue_is_mq(md->queue))
 674                 return blk_mq_queue_inflight(md->queue);
 675         else
 676                 return md_in_flight_bios(md);
 677 }
 678
 679 u64 dm_start_time_ns_from_clone(struct bio *bio)
 680 {
 681         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 682         struct dm_io *io = tio->io;
 683
 684         return jiffies_to_nsecs(io->start_time);
 685 }
 686 EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
 687
 688 static void start_io_acct(struct dm_io *io)
 689 {
 690         struct mapped_device *md = io->md;
 691         struct bio *bio = io->orig_bio;
 692
 693         io->start_time = bio_start_io_acct(bio);
 694         if (unlikely(dm_stats_used(&md->stats)))
 695                 dm_stats_account_io(&md->stats, bio_data_dir(bio),
 696                                     bio->bi_iter.bi_sector, bio_sectors(bio),
 697                                     false, 0, &io->stats_aux);
 698 }
 699
 700 static void end_io_acct(struct dm_io *io)
 701 {
 702         struct mapped_device *md = io->md;
 703         struct bio *bio = io->orig_bio;
 704         unsigned long duration = jiffies - io->start_time;
 705
 706         bio_end_io_acct(bio, io->start_time);
 707
 708         if (unlikely(dm_stats_used(&md->stats)))
 709                 dm_stats_account_io(&md->stats, bio_data_dir(bio),
 710                                     bio->bi_iter.bi_sector, bio_sectors(bio),
 711                                     true, duration, &io->stats_aux);
 712
 713         /* nudge anyone waiting on suspend queue */
 714         if (unlikely(wq_has_sleeper(&md->wait)))
 715                 wake_up(&md->wait);
 716 }
 717
 718 /*
 719  * Add the bio to the list of deferred io.
 720  */
 721 static void queue_io(struct mapped_device *md, struct bio *bio)
 722 {
 723         unsigned long flags;
 724
 725         spin_lock_irqsave(&md->deferred_lock, flags);
 726         bio_list_add(&md->deferred, bio);
 727         spin_unlock_irqrestore(&md->deferred_lock, flags);
 728         queue_work(md->wq, &md->work);
 729 }
 730
 731 /*
 732  * Everyone (including functions in this file), should use this
 733  * function to access the md->map field, and make sure they call
 734  * dm_put_live_table() when finished.
 735  */
 736 struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 737 {
 738         *srcu_idx = srcu_read_lock(&md->io_barrier);
 739
 740         return srcu_dereference(md->map, &md->io_barrier);
 741 }
 742
 743 void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 744 {
 745         srcu_read_unlock(&md->io_barrier, srcu_idx);
 746 }
 747
 748 void dm_sync_table(struct mapped_device *md)
 749 {
 750         synchronize_srcu(&md->io_barrier);
 751         synchronize_rcu_expedited();
 752 }
 753
 754 /*
 755  * A fast alternative to dm_get_live_table/dm_put_live_table.
 756  * The caller must not block between these two functions.
 757  */
 758 static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 759 {
 760         rcu_read_lock();
 761         return rcu_dereference(md->map);
 762 }
 763
 764 static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 765 {
 766         rcu_read_unlock();
 767 }
 768
 769 static char *_dm_claim_ptr = "I belong to device-mapper";
 770
 771 /*
 772  * Open a table device so we can use it as a map destination.
 773  */
 774 static int open_table_device(struct table_device *td, dev_t dev,
 775                              struct mapped_device *md)
 776 {
 777         struct block_device *bdev;
 778
 779         int r;
 780
 781         BUG_ON(td->dm_dev.bdev);
 782
 783         bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
 784         if (IS_ERR(bdev))
 785                 return PTR_ERR(bdev);
 786
 787         r = bd_link_disk_holder(bdev, dm_disk(md));
 788         if (r) {
 789                 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 790                 return r;
 791         }
 792
 793         td->dm_dev.bdev = bdev;
 794         td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 795         return 0;
 796 }
 797
 798 /*
 799  * Close a table device that we've been using.
 800  */
 801 static void close_table_device(struct table_device *td, struct mapped_device *md)
 802 {
 803         if (!td->dm_dev.bdev)
 804                 return;
 805
 806         bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 807         blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 808         put_dax(td->dm_dev.dax_dev);
 809         td->dm_dev.bdev = NULL;
 810         td->dm_dev.dax_dev = NULL;
 811 }
 812
 813 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 814                                               fmode_t mode)
 815 {
 816         struct table_device *td;
 817
 818         list_for_each_entry(td, l, list)
 819                 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 820                         return td;
 821
 822         return NULL;
 823 }
 824
 825 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 826                         struct dm_dev **result)
 827 {
 828         int r;
 829         struct table_device *td;
 830
 831         mutex_lock(&md->table_devices_lock);
 832         td = find_table_device(&md->table_devices, dev, mode);
 833         if (!td) {
 834                 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
 835                 if (!td) {
 836                         mutex_unlock(&md->table_devices_lock);
 837                         return -ENOMEM;
 838                 }
 839
 840                 td->dm_dev.mode = mode;
 841                 td->dm_dev.bdev = NULL;
 842
 843                 if ((r = open_table_device(td, dev, md))) {
 844                         mutex_unlock(&md->table_devices_lock);
 845                         kfree(td);
 846                         return r;
 847                 }
 848
 849                 format_dev_t(td->dm_dev.name, dev);
 850
 851                 refcount_set(&td->count, 1);
 852                 list_add(&td->list, &md->table_devices);
 853         } else {
 854                 refcount_inc(&td->count);
 855         }
 856         mutex_unlock(&md->table_devices_lock);
 857
 858         *result = &td->dm_dev;
 859         return 0;
 860 }
 861 EXPORT_SYMBOL_GPL(dm_get_table_device);
 862
 863 void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 864 {
 865         struct table_device *td = container_of(d, struct table_device, dm_dev);
 866
 867         mutex_lock(&md->table_devices_lock);
 868         if (refcount_dec_and_test(&td->count)) {
 869                 close_table_device(td, md);
 870                 list_del(&td->list);
 871                 kfree(td);
 872         }
 873         mutex_unlock(&md->table_devices_lock);
 874 }
 875 EXPORT_SYMBOL(dm_put_table_device);
 876
 877 static void free_table_devices(struct list_head *devices)
 878 {
 879         struct list_head *tmp, *next;
 880
 881         list_for_each_safe(tmp, next, devices) {
 882                 struct table_device *td = list_entry(tmp, struct table_device, list);
 883
 884                 DMWARN("dm_destroy: %s still exists with %d references",
 885                        td->dm_dev.name, refcount_read(&td->count));
 886                 kfree(td);
 887         }
 888 }
 889
 890 /*
 891  * Get the geometry associated with a dm device
 892  */
 893 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 894 {
 895         *geo = md->geometry;
 896
 897         return 0;
 898 }
 899
 900 /*
 901  * Set the geometry of a device.
 902  */
 903 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 904 {
 905         sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 906
 907         if (geo->start > sz) {
 908                 DMWARN("Start sector is beyond the geometry limits.");
 909                 return -EINVAL;
 910         }
 911
 912         md->geometry = *geo;
 913
 914         return 0;
 915 }
 916
 917 static int __noflush_suspending(struct mapped_device *md)
 918 {
 919         return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 920 }
 921
 922 /*
 923  * Decrements the number of outstanding ios that a bio has been
 924  * cloned into, completing the original io if necc.
 925  */
 926 static void dec_pending(struct dm_io *io, blk_status_t error)
 927 {
 928         unsigned long flags;
 929         blk_status_t io_error;
 930         struct bio *bio;
 931         struct mapped_device *md = io->md;
 932
 933         /* Push-back supersedes any I/O errors */
 934         if (unlikely(error)) {
 935                 spin_lock_irqsave(&io->endio_lock, flags);
 936                 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
 937                         io->status = error;
 938                 spin_unlock_irqrestore(&io->endio_lock, flags);
 939         }
 940
 941         if (atomic_dec_and_test(&io->io_count)) {
 942                 if (io->status == BLK_STS_DM_REQUEUE) {
 943                         /*
 944                          * Target requested pushing back the I/O.
 945                          */
 946                         spin_lock_irqsave(&md->deferred_lock, flags);
 947                         if (__noflush_suspending(md))
 948                                 /* NOTE early return due to BLK_STS_DM_REQUEUE below */
 949                                 bio_list_add_head(&md->deferred, io->orig_bio);
 950                         else
 951                                 /* noflush suspend was interrupted. */
 952                                 io->status = BLK_STS_IOERR;
 953                         spin_unlock_irqrestore(&md->deferred_lock, flags);
 954                 }
 955
 956                 io_error = io->status;
 957                 bio = io->orig_bio;
 958                 end_io_acct(io);
 959                 free_io(md, io);
 960
 961                 if (io_error == BLK_STS_DM_REQUEUE)
 962                         return;
 963
 964                 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 965                         /*
 966                          * Preflush done for flush with data, reissue
 967                          * without REQ_PREFLUSH.
 968                          */
 969                         bio->bi_opf &= ~REQ_PREFLUSH;
 970                         queue_io(md, bio);
 971                 } else {
 972                         /* done with normal IO or empty flush */
 973                         if (io_error)
 974                                 bio->bi_status = io_error;
 975                         bio_endio(bio);
 976                 }
 977         }
 978 }
 979
 980 void disable_discard(struct mapped_device *md)
 981 {
 982         struct queue_limits *limits = dm_get_queue_limits(md);
 983
 984         /* device doesn't really support DISCARD, disable it */
 985         limits->max_discard_sectors = 0;
 986         blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
 987 }
 988
 989 void disable_write_same(struct mapped_device *md)
 990 {
 991         struct queue_limits *limits = dm_get_queue_limits(md);
 992
 993         /* device doesn't really support WRITE SAME, disable it */
 994         limits->max_write_same_sectors = 0;
 995 }
 996
 997 void disable_write_zeroes(struct mapped_device *md)
 998 {
 999         struct queue_limits *limits = dm_get_queue_limits(md);
1000
1001         /* device doesn't really support WRITE ZEROES, disable it */
1002         limits->max_write_zeroes_sectors = 0;
1003 }
1004
1005 static void clone_endio(struct bio *bio)
1006 {
1007         blk_status_t error = bio->bi_status;
1008         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1009         struct dm_io *io = tio->io;
1010         struct mapped_device *md = tio->io->md;
1011         dm_endio_fn endio = tio->ti->type->end_io;
1012
1013         if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
1014                 if (bio_op(bio) == REQ_OP_DISCARD &&
1015                     !bio->bi_disk->queue->limits.max_discard_sectors)
1016                         disable_discard(md);
1017                 else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
1018                          !bio->bi_disk->queue->limits.max_write_same_sectors)
1019                         disable_write_same(md);
1020                 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
1021                          !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
1022                         disable_write_zeroes(md);
1023         }
1024
1025         if (endio) {
1026                 int r = endio(tio->ti, bio, &error);
1027                 switch (r) {
1028                 case DM_ENDIO_REQUEUE:
1029                         error = BLK_STS_DM_REQUEUE;
1030                         /*FALLTHRU*/
1031                 case DM_ENDIO_DONE:
1032                         break;
1033                 case DM_ENDIO_INCOMPLETE:
1034                         /* The target will handle the io */
1035                         return;
1036                 default:
1037                         DMWARN("unimplemented target endio return value: %d", r);
1038                         BUG();
1039                 }
1040         }
1041
1042         free_tio(tio);
1043         dec_pending(io, error);
1044 }
1045
1046 /*
1047  * Return maximum size of I/O possible at the supplied sector up to the current
1048  * target boundary.
1049  */
1050 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1051 {
1052         sector_t target_offset = dm_target_offset(ti, sector);
1053
1054         return ti->len - target_offset;
1055 }
1056
1057 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1058 {
1059         sector_t len = max_io_len_target_boundary(sector, ti);
1060         sector_t offset, max_len;
1061
1062         /*
1063          * Does the target need to split even further?
1064          */
1065         if (ti->max_io_len) {
1066                 offset = dm_target_offset(ti, sector);
1067                 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1068                         max_len = sector_div(offset, ti->max_io_len);
1069                 else
1070                         max_len = offset & (ti->max_io_len - 1);
1071                 max_len = ti->max_io_len - max_len;
1072
1073                 if (len > max_len)
1074                         len = max_len;
1075         }
1076
1077         return len;
1078 }
1079
1080 int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1081 {
1082         if (len > UINT_MAX) {
1083                 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1084                       (unsigned long long)len, UINT_MAX);
1085                 ti->error = "Maximum size of target IO is too large";
1086                 return -EINVAL;
1087         }
1088
1089         ti->max_io_len = (uint32_t) len;
1090
1091         return 0;
1092 }
1093 EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1094
1095 static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1096                                                 sector_t sector, int *srcu_idx)
1097         __acquires(md->io_barrier)
1098 {
1099         struct dm_table *map;
1100         struct dm_target *ti;
1101
1102         map = dm_get_live_table(md, srcu_idx);
1103         if (!map)
1104                 return NULL;
1105
1106         ti = dm_table_find_target(map, sector);
1107         if (!ti)
1108                 return NULL;
1109
1110         return ti;
1111 }
1112
1113 static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1114                                  long nr_pages, void **kaddr, pfn_t *pfn)
1115 {
1116         struct mapped_device *md = dax_get_private(dax_dev);
1117         sector_t sector = pgoff * PAGE_SECTORS;
1118         struct dm_target *ti;
1119         long len, ret = -EIO;
1120         int srcu_idx;
1121
1122         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1123
1124         if (!ti)
1125                 goto out;
1126         if (!ti->type->direct_access)
1127                 goto out;
1128         len = max_io_len(sector, ti) / PAGE_SECTORS;
1129         if (len < 1)
1130                 goto out;
1131         nr_pages = min(len, nr_pages);
1132         ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1133
1134  out:
1135         dm_put_live_table(md, srcu_idx);
1136
1137         return ret;
1138 }
1139
1140 static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
1141                 int blocksize, sector_t start, sector_t len)
1142 {
1143         struct mapped_device *md = dax_get_private(dax_dev);
1144         struct dm_table *map;
1145         int srcu_idx;
1146         bool ret;
1147
1148         map = dm_get_live_table(md, &srcu_idx);
1149         if (!map)
1150                 return false;
1151
1152         ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
1153
1154         dm_put_live_table(md, srcu_idx);
1155
1156         return ret;
1157 }
1158
1159 static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1160                                     void *addr, size_t bytes, struct iov_iter *i)
1161 {
1162         struct mapped_device *md = dax_get_private(dax_dev);
1163         sector_t sector = pgoff * PAGE_SECTORS;
1164         struct dm_target *ti;
1165         long ret = 0;
1166         int srcu_idx;
1167
1168         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1169
1170         if (!ti)
1171                 goto out;
1172         if (!ti->type->dax_copy_from_iter) {
1173                 ret = copy_from_iter(addr, bytes, i);
1174                 goto out;
1175         }
1176         ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1177  out:
1178         dm_put_live_table(md, srcu_idx);
1179
1180         return ret;
1181 }
1182
1183 static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1184                 void *addr, size_t bytes, struct iov_iter *i)
1185 {
1186         struct mapped_device *md = dax_get_private(dax_dev);
1187         sector_t sector = pgoff * PAGE_SECTORS;
1188         struct dm_target *ti;
1189         long ret = 0;
1190         int srcu_idx;
1191
1192         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1193
1194         if (!ti)
1195                 goto out;
1196         if (!ti->type->dax_copy_to_iter) {
1197                 ret = copy_to_iter(addr, bytes, i);
1198                 goto out;
1199         }
1200         ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1201  out:
1202         dm_put_live_table(md, srcu_idx);
1203
1204         return ret;
1205 }
1206
1207 static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1208                                   size_t nr_pages)
1209 {
1210         struct mapped_device *md = dax_get_private(dax_dev);
1211         sector_t sector = pgoff * PAGE_SECTORS;
1212         struct dm_target *ti;
1213         int ret = -EIO;
1214         int srcu_idx;
1215
1216         ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1217
1218         if (!ti)
1219                 goto out;
1220         if (WARN_ON(!ti->type->dax_zero_page_range)) {
1221                 /*
1222                  * ->zero_page_range() is mandatory dax operation. If we are
1223                  *  here, something is wrong.
1224                  */
1225                 dm_put_live_table(md, srcu_idx);
1226                 goto out;
1227         }
1228         ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
1229
1230  out:
1231         dm_put_live_table(md, srcu_idx);
1232
1233         return ret;
1234 }
1235
1236 /*
1237  * A target may call dm_accept_partial_bio only from the map routine.  It is
1238  * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET,
1239  * REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH.
1240  *
1241  * dm_accept_partial_bio informs the dm that the target only wants to process
1242  * additional n_sectors sectors of the bio and the rest of the data should be
1243  * sent in a next bio.
1244  *
1245  * A diagram that explains the arithmetics:
1246  * +--------------------+---------------+-------+
1247  * |         1          |       2       |   3   |
1248  * +--------------------+---------------+-------+
1249  *
1250  * <-------------- *tio->len_ptr --------------->
1251  *                      <------- bi_size ------->
1252  *                      <-- n_sectors -->
1253  *
1254  * Region 1 was already iterated over with bio_advance or similar function.
1255  *      (it may be empty if the target doesn't use bio_advance)
1256  * Region 2 is the remaining bio size that the target wants to process.
1257  *      (it may be empty if region 1 is non-empty, although there is no reason
1258  *       to make it empty)
1259  * The target requires that region 3 is to be sent in the next bio.
1260  *
1261  * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1262  * the partially processed part (the sum of regions 1+2) must be the same for all
1263  * copies of the bio.
1264  */
1265 void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1266 {
1267         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1268         unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1269         BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1270         BUG_ON(bi_size > *tio->len_ptr);
1271         BUG_ON(n_sectors > bi_size);
1272         *tio->len_ptr -= bi_size - n_sectors;
1273         bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1274 }
1275 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1276
1277 static blk_qc_t __map_bio(struct dm_target_io *tio)
1278 {
1279         int r;
1280         sector_t sector;
1281         struct bio *clone = &tio->clone;
1282         struct dm_io *io = tio->io;
1283         struct mapped_device *md = io->md;
1284         struct dm_target *ti = tio->ti;
1285         blk_qc_t ret = BLK_QC_T_NONE;
1286
1287         clone->bi_end_io = clone_endio;
1288
1289         /*
1290          * Map the clone.  If r == 0 we don't need to do
1291          * anything, the target has assumed ownership of
1292          * this io.
1293          */
1294         atomic_inc(&io->io_count);
1295         sector = clone->bi_iter.bi_sector;
1296
1297         r = ti->type->map(ti, clone);
1298         switch (r) {
1299         case DM_MAPIO_SUBMITTED:
1300                 break;
1301         case DM_MAPIO_REMAPPED:
1302                 /* the bio has been remapped so dispatch it */
1303                 trace_block_bio_remap(clone->bi_disk->queue, clone,
1304                                       bio_dev(io->orig_bio), sector);
1305                 if (md->type == DM_TYPE_NVME_BIO_BASED)
1306                         ret = direct_make_request(clone);
1307                 else
1308                         ret = generic_make_request(clone);
1309                 break;
1310         case DM_MAPIO_KILL:
1311                 free_tio(tio);
1312                 dec_pending(io, BLK_STS_IOERR);
1313                 break;
1314         case DM_MAPIO_REQUEUE:
1315                 free_tio(tio);
1316                 dec_pending(io, BLK_STS_DM_REQUEUE);
1317                 break;
1318         default:
1319                 DMWARN("unimplemented target map return value: %d", r);
1320                 BUG();
1321         }
1322
1323         return ret;
1324 }
1325
1326 static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1327 {
1328         bio->bi_iter.bi_sector = sector;
1329         bio->bi_iter.bi_size = to_bytes(len);
1330 }
1331
1332 /*
1333  * Creates a bio that consists of range of complete bvecs.
1334  */
1335 static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1336                      sector_t sector, unsigned len)
1337 {
1338         struct bio *clone = &tio->clone;
1339
1340         __bio_clone_fast(clone, bio);
1341
1342         bio_crypt_clone(clone, bio, GFP_NOIO);
1343
1344         if (bio_integrity(bio)) {
1345                 int r;
1346
1347                 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1348                              !dm_target_passes_integrity(tio->ti->type))) {
1349                         DMWARN("%s: the target %s doesn't support integrity data.",
1350                                 dm_device_name(tio->io->md),
1351                                 tio->ti->type->name);
1352                         return -EIO;
1353                 }
1354
1355                 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1356                 if (r < 0)
1357                         return r;
1358         }
1359
1360         bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1361         clone->bi_iter.bi_size = to_bytes(len);
1362
1363         if (bio_integrity(bio))
1364                 bio_integrity_trim(clone);
1365
1366         return 0;
1367 }
1368
1369 static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1370                                 struct dm_target *ti, unsigned num_bios)
1371 {
1372         struct dm_target_io *tio;
1373         int try;
1374
1375         if (!num_bios)
1376                 return;
1377
1378         if (num_bios == 1) {
1379                 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1380                 bio_list_add(blist, &tio->clone);
1381                 return;
1382         }
1383
1384         for (try = 0; try < 2; try++) {
1385                 int bio_nr;
1386                 struct bio *bio;
1387
1388                 if (try)
1389                         mutex_lock(&ci->io->md->table_devices_lock);
1390                 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1391                         tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1392                         if (!tio)
1393                                 break;
1394
1395                         bio_list_add(blist, &tio->clone);
1396                 }
1397                 if (try)
1398                         mutex_unlock(&ci->io->md->table_devices_lock);
1399                 if (bio_nr == num_bios)
1400                         return;
1401
1402                 while ((bio = bio_list_pop(blist))) {
1403                         tio = container_of(bio, struct dm_target_io, clone);
1404                         free_tio(tio);
1405                 }
1406         }
1407 }
1408
1409 static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1410                                            struct dm_target_io *tio, unsigned *len)
1411 {
1412         struct bio *clone = &tio->clone;
1413
1414         tio->len_ptr = len;
1415
1416         __bio_clone_fast(clone, ci->bio);
1417         if (len)
1418                 bio_setup_sector(clone, ci->sector, *len);
1419
1420         return __map_bio(tio);
1421 }
1422
1423 static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1424                                   unsigned num_bios, unsigned *len)
1425 {
1426         struct bio_list blist = BIO_EMPTY_LIST;
1427         struct bio *bio;
1428         struct dm_target_io *tio;
1429
1430         alloc_multiple_bios(&blist, ci, ti, num_bios);
1431
1432         while ((bio = bio_list_pop(&blist))) {
1433                 tio = container_of(bio, struct dm_target_io, clone);
1434                 (void) __clone_and_map_simple_bio(ci, tio, len);
1435         }
1436 }
1437
1438 static int __send_empty_flush(struct clone_info *ci)
1439 {
1440         unsigned target_nr = 0;
1441         struct dm_target *ti;
1442
1443         /*
1444          * Empty flush uses a statically initialized bio, as the base for
1445          * cloning.  However, blkg association requires that a bdev is
1446          * associated with a gendisk, which doesn't happen until the bdev is
1447          * opened.  So, blkg association is done at issue time of the flush
1448          * rather than when the device is created in alloc_dev().
1449          */
1450         bio_set_dev(ci->bio, ci->io->md->bdev);
1451
1452         BUG_ON(bio_has_data(ci->bio));
1453         while ((ti = dm_table_get_target(ci->map, target_nr++)))
1454                 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1455         return 0;
1456 }
1457
1458 static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1459                                     sector_t sector, unsigned *len)
1460 {
1461         struct bio *bio = ci->bio;
1462         struct dm_target_io *tio;
1463         int r;
1464
1465         tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1466         tio->len_ptr = len;
1467         r = clone_bio(tio, bio, sector, *len);
1468         if (r < 0) {
1469                 free_tio(tio);
1470                 return r;
1471         }
1472         (void) __map_bio(tio);
1473
1474         return 0;
1475 }
1476
1477 typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1478
1479 static unsigned get_num_discard_bios(struct dm_target *ti)
1480 {
1481         return ti->num_discard_bios;
1482 }
1483
1484 static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1485 {
1486         return ti->num_secure_erase_bios;
1487 }
1488
1489 static unsigned get_num_write_same_bios(struct dm_target *ti)
1490 {
1491         return ti->num_write_same_bios;
1492 }
1493
1494 static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1495 {
1496         return ti->num_write_zeroes_bios;
1497 }
1498
1499 static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1500                                        unsigned num_bios)
1501 {
1502         unsigned len;
1503
1504         /*
1505          * Even though the device advertised support for this type of
1506          * request, that does not mean every target supports it, and
1507          * reconfiguration might also have changed that since the
1508          * check was performed.
1509          */
1510         if (!num_bios)
1511                 return -EOPNOTSUPP;
1512
1513         len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1514
1515         __send_duplicate_bios(ci, ti, num_bios, &len);
1516
1517         ci->sector += len;
1518         ci->sector_count -= len;
1519
1520         return 0;
1521 }
1522
/* Send the next piece of a discard to @ti, using its discard bio count. */
static int __send_discard(struct clone_info *ci, struct dm_target *ti)
{
        unsigned num_bios = get_num_discard_bios(ti);

        return __send_changing_extent_only(ci, ti, num_bios);
}
1527
/* Send the next piece of a secure erase to @ti, using its secure erase bio count. */
static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
{
        unsigned num_bios = get_num_secure_erase_bios(ti);

        return __send_changing_extent_only(ci, ti, num_bios);
}
1532
/* Send the next piece of a write same to @ti, using its write same bio count. */
static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
{
        unsigned num_bios = get_num_write_same_bios(ti);

        return __send_changing_extent_only(ci, ti, num_bios);
}
1537
/* Send the next piece of a write zeroes to @ti, using its write zeroes bio count. */
static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
{
        unsigned num_bios = get_num_write_zeroes_bios(ti);

        return __send_changing_extent_only(ci, ti, num_bios);
}
1542
1543 static bool is_abnormal_io(struct bio *bio)
1544 {
1545         bool r = false;
1546
1547         switch (bio_op(bio)) {
1548         case REQ_OP_DISCARD:
1549         case REQ_OP_SECURE_ERASE:
1550         case REQ_OP_WRITE_SAME:
1551         case REQ_OP_WRITE_ZEROES:
1552                 r = true;
1553                 break;
1554         }
1555
1556         return r;
1557 }
1558
1559 static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1560                                   int *result)
1561 {
1562         struct bio *bio = ci->bio;
1563
1564         if (bio_op(bio) == REQ_OP_DISCARD)
1565                 *result = __send_discard(ci, ti);
1566         else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1567                 *result = __send_secure_erase(ci, ti);
1568         else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1569                 *result = __send_write_same(ci, ti);
1570         else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1571                 *result = __send_write_zeroes(ci, ti);
1572         else
1573                 return false;
1574
1575         return true;
1576 }
1577
1578 /*
1579  * Select the correct strategy for processing a non-flush bio.
1580  */
1581 static int __split_and_process_non_flush(struct clone_info *ci)
1582 {
1583         struct dm_target *ti;
1584         unsigned len;
1585         int r;
1586
1587         ti = dm_table_find_target(ci->map, ci->sector);
1588         if (!ti)
1589                 return -EIO;
1590
1591         if (__process_abnormal_io(ci, ti, &r))
1592                 return r;
1593
1594         len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1595
1596         r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1597         if (r < 0)
1598                 return r;
1599
1600         ci->sector += len;
1601         ci->sector_count -= len;
1602
1603         return 0;
1604 }
1605
1606 static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1607                             struct dm_table *map, struct bio *bio)
1608 {
1609         ci->map = map;
1610         ci->io = alloc_io(md, bio);
1611         ci->sector = bio->bi_iter.bi_sector;
1612 }
1613
1614 #define __dm_part_stat_sub(part, field, subnd)  \
1615         (part_stat_get(part, field) -= (subnd))
1616
1617 /*
1618  * Entry point to split a bio into clones and submit them to the targets.
1619  */
1620 static blk_qc_t __split_and_process_bio(struct mapped_device *md,
1621                                         struct dm_table *map, struct bio *bio)
1622 {
1623         struct clone_info ci;
1624         blk_qc_t ret = BLK_QC_T_NONE;
1625         int error = 0;
1626
1627         init_clone_info(&ci, md, map, bio);
1628
1629         if (bio->bi_opf & REQ_PREFLUSH) {
1630                 struct bio flush_bio;
1631
1632                 /*
1633                  * Use an on-stack bio for this, it's safe since we don't
1634                  * need to reference it after submit. It's just used as
1635                  * the basis for the clone(s).
1636                  */
1637                 bio_init(&flush_bio, NULL, 0);
1638                 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1639                 ci.bio = &flush_bio;
1640                 ci.sector_count = 0;
1641                 error = __send_empty_flush(&ci);
1642                 bio_uninit(ci.bio);
1643                 /* dec_pending submits any data associated with flush */
1644         } else if (op_is_zone_mgmt(bio_op(bio))) {
1645                 ci.bio = bio;
1646                 ci.sector_count = 0;
1647                 error = __split_and_process_non_flush(&ci);
1648         } else {
1649                 ci.bio = bio;
1650                 ci.sector_count = bio_sectors(bio);
1651                 while (ci.sector_count && !error) {
1652                         error = __split_and_process_non_flush(&ci);
1653                         if (current->bio_list && ci.sector_count && !error) {
1654                                 /*
1655                                  * Remainder must be passed to generic_make_request()
1656                                  * so that it gets handled *after* bios already submitted
1657                                  * have been completely processed.
1658                                  * We take a clone of the original to store in
1659                                  * ci.io->orig_bio to be used by end_io_acct() and
1660                                  * for dec_pending to use for completion handling.
1661                                  */
1662                                 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1663                                                           GFP_NOIO, &md->queue->bio_split);
1664                                 ci.io->orig_bio = b;
1665
1666                                 /*
1667                                  * Adjust IO stats for each split, otherwise upon queue
1668                                  * reentry there will be redundant IO accounting.
1669                                  * NOTE: this is a stop-gap fix, a proper fix involves
1670                                  * significant refactoring of DM core's bio splitting
1671                                  * (by eliminating DM's splitting and just using bio_split)
1672                                  */
1673                                 part_stat_lock();
1674                                 __dm_part_stat_sub(&dm_disk(md)->part0,
1675                                                    sectors[op_stat_group(bio_op(bio))], ci.sector_count);
1676                                 part_stat_unlock();
1677
1678                                 bio_chain(b, bio);
1679                                 trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
1680                                 ret = generic_make_request(bio);
1681                                 break;
1682                         }
1683                 }
1684         }
1685
1686         /* drop the extra reference count */
1687         dec_pending(ci.io, errno_to_blk_status(error));
1688         return ret;
1689 }
1690
1691 /*
1692  * Optimized variant of __split_and_process_bio that leverages the
1693  * fact that targets that use it do _not_ have a need to split bios.
1694  */
1695 static blk_qc_t __process_bio(struct mapped_device *md, struct dm_table *map,
1696                               struct bio *bio, struct dm_target *ti)
1697 {
1698         struct clone_info ci;
1699         blk_qc_t ret = BLK_QC_T_NONE;
1700         int error = 0;
1701
1702         init_clone_info(&ci, md, map, bio);
1703
1704         if (bio->bi_opf & REQ_PREFLUSH) {
1705                 struct bio flush_bio;
1706
1707                 /*
1708                  * Use an on-stack bio for this, it's safe since we don't
1709                  * need to reference it after submit. It's just used as
1710                  * the basis for the clone(s).
1711                  */
1712                 bio_init(&flush_bio, NULL, 0);
1713                 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1714                 ci.bio = &flush_bio;
1715                 ci.sector_count = 0;
1716                 error = __send_empty_flush(&ci);
1717                 bio_uninit(ci.bio);
1718                 /* dec_pending submits any data associated with flush */
1719         } else {
1720                 struct dm_target_io *tio;
1721
1722                 ci.bio = bio;
1723                 ci.sector_count = bio_sectors(bio);
1724                 if (__process_abnormal_io(&ci, ti, &error))
1725                         goto out;
1726
1727                 tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
1728                 ret = __clone_and_map_simple_bio(&ci, tio, NULL);
1729         }
1730 out:
1731         /* drop the extra reference count */
1732         dec_pending(ci.io, errno_to_blk_status(error));
1733         return ret;
1734 }
1735
1736 static void dm_queue_split(struct mapped_device *md, struct dm_target *ti, struct bio **bio)
1737 {
1738         unsigned len, sector_count;
1739
1740         sector_count = bio_sectors(*bio);
1741         len = min_t(sector_t, max_io_len((*bio)->bi_iter.bi_sector, ti), sector_count);
1742
1743         if (sector_count > len) {
1744                 struct bio *split = bio_split(*bio, len, GFP_NOIO, &md->queue->bio_split);
1745
1746                 bio_chain(split, *bio);
1747                 trace_block_split(md->queue, split, (*bio)->bi_iter.bi_sector);
1748                 generic_make_request(*bio);
1749                 *bio = split;
1750         }
1751 }
1752
1753 static blk_qc_t dm_process_bio(struct mapped_device *md,
1754                                struct dm_table *map, struct bio *bio)
1755 {
1756         blk_qc_t ret = BLK_QC_T_NONE;
1757         struct dm_target *ti = md->immutable_target;
1758
1759         if (unlikely(!map)) {
1760                 bio_io_error(bio);
1761                 return ret;
1762         }
1763
1764         if (!ti) {
1765                 ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
1766                 if (unlikely(!ti)) {
1767                         bio_io_error(bio);
1768                         return ret;
1769                 }
1770         }
1771
1772         /*
1773          * If in ->make_request_fn we need to use blk_queue_split(), otherwise
1774          * queue_limits for abnormal requests (e.g. discard, writesame, etc)
1775          * won't be imposed.
1776          */
1777         if (current->bio_list) {
1778                 if (is_abnormal_io(bio))
1779                         blk_queue_split(&bio);
1780                 else
1781                         dm_queue_split(md, ti, &bio);
1782         }
1783
1784         if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
1785                 return __process_bio(md, map, bio, ti);
1786         else
1787                 return __split_and_process_bio(md, map, bio);
1788 }
1789
1790 static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1791 {
1792         struct mapped_device *md = bio->bi_disk->private_data;
1793         blk_qc_t ret = BLK_QC_T_NONE;
1794         int srcu_idx;
1795         struct dm_table *map;
1796
1797         if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
1798                 /*
1799                  * We are called with a live reference on q_usage_counter, but
1800                  * that one will be released as soon as we return.  Grab an
1801                  * extra one as blk_mq_make_request expects to be able to
1802                  * consume a reference (which lives until the request is freed
1803                  * in case a request is allocated).
1804                  */
1805                 percpu_ref_get(&q->q_usage_counter);
1806                 return blk_mq_make_request(q, bio);
1807         }
1808
1809         map = dm_get_live_table(md, &srcu_idx);
1810
1811         /* if we're suspended, we have to queue this io for later */
1812         if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1813                 dm_put_live_table(md, srcu_idx);
1814
1815                 if (!(bio->bi_opf & REQ_RAHEAD))
1816                         queue_io(md, bio);
1817                 else
1818                         bio_io_error(bio);
1819                 return ret;
1820         }
1821
1822         ret = dm_process_bio(md, map, bio);
1823
1824         dm_put_live_table(md, srcu_idx);
1825         return ret;
1826 }
1827
1828 static int dm_any_congested(void *congested_data, int bdi_bits)
1829 {
1830         int r = bdi_bits;
1831         struct mapped_device *md = congested_data;
1832         struct dm_table *map;
1833
1834         if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1835                 if (dm_request_based(md)) {
1836                         /*
1837                          * With request-based DM we only need to check the
1838                          * top-level queue for congestion.
1839                          */
1840                         struct backing_dev_info *bdi = md->queue->backing_dev_info;
1841                         r = bdi->wb.congested->state & bdi_bits;
1842                 } else {
1843                         map = dm_get_live_table_fast(md);
1844                         if (map)
1845                                 r = dm_table_any_congested(map, bdi_bits);
1846                         dm_put_live_table_fast(md);
1847                 }
1848         }
1849
1850         return r;
1851 }
1852
1853 /*-----------------------------------------------------------------
1854  * An IDR is used to keep track of allocated minor numbers.
1855  *---------------------------------------------------------------*/
/* Remove @minor from _minor_idr; the IDR is modified under _minor_lock. */
static void free_minor(int minor)
{
        spin_lock(&_minor_lock);
        idr_remove(&_minor_idr, minor);
        spin_unlock(&_minor_lock);
}
1862
1863 /*
1864  * See if the device with a specific minor # is free.
1865  */
1866 static int specific_minor(int minor)
1867 {
1868         int r;
1869
1870         if (minor >= (1 << MINORBITS))
1871                 return -EINVAL;
1872
1873         idr_preload(GFP_KERNEL);
1874         spin_lock(&_minor_lock);
1875
1876         r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1877
1878         spin_unlock(&_minor_lock);
1879         idr_preload_end();
1880         if (r < 0)
1881                 return r == -ENOSPC ? -EBUSY : r;
1882         return 0;
1883 }
1884
1885 static int next_free_minor(int *minor)
1886 {
1887         int r;
1888
1889         idr_preload(GFP_KERNEL);
1890         spin_lock(&_minor_lock);
1891
1892         r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1893
1894         spin_unlock(&_minor_lock);
1895         idr_preload_end();
1896         if (r < 0)
1897                 return r;
1898         *minor = r;
1899         return 0;
1900 }
1901
/*
 * Forward declarations needed by alloc_dev(), which installs these on the
 * gendisk, the dax device and md->work respectively.
 * NOTE(review): the definitions of dm_blk_dops and dm_dax_ops are expected
 * later in this file -- confirm.
 */
static const struct block_device_operations dm_blk_dops;
static const struct dax_operations dm_dax_ops;

static void dm_wq_work(struct work_struct *work);
1906
/*
 * Release the resources hanging off @md.
 *
 * Used both by free_dev() and by the error path of alloc_dev(), so most
 * members are checked before being released and this can run on a partially
 * initialised md.  It does not free @md itself nor its minor number.
 */
static void cleanup_mapped_device(struct mapped_device *md)
{
        if (md->wq)
                destroy_workqueue(md->wq);
        bioset_exit(&md->bs);
        bioset_exit(&md->io_bs);

        /* any non-NULL dax_dev is assumed to be a valid pointer here */
        if (md->dax_dev) {
                kill_dax(md->dax_dev);
                put_dax(md->dax_dev);
                md->dax_dev = NULL;
        }

        if (md->disk) {
                /*
                 * Detach the md from the gendisk under _minor_lock before
                 * the disk is deleted.
                 * NOTE(review): presumably so lookups through
                 * disk->private_data see NULL -- confirm against the
                 * block_device_operations open path.
                 */
                spin_lock(&_minor_lock);
                md->disk->private_data = NULL;
                spin_unlock(&_minor_lock);
                del_gendisk(md->disk);
                put_disk(md->disk);
        }

        if (md->queue)
                blk_cleanup_queue(md->queue);

        cleanup_srcu_struct(&md->io_barrier);

        if (md->bdev) {
                bdput(md->bdev);
                md->bdev = NULL;
        }

        mutex_destroy(&md->suspend_lock);
        mutex_destroy(&md->type_lock);
        mutex_destroy(&md->table_devices_lock);

        dm_mq_cleanup_mapped_device(md);
}
1944
1945 /*
1946  * Allocate and initialise a blank device with a given minor.
1947  */
1948 static struct mapped_device *alloc_dev(int minor)
1949 {
1950         int r, numa_node_id = dm_get_numa_node();
1951         struct mapped_device *md;
1952         void *old_md;
1953
1954         md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1955         if (!md) {
1956                 DMWARN("unable to allocate device, out of memory.");
1957                 return NULL;
1958         }
1959
1960         if (!try_module_get(THIS_MODULE))
1961                 goto bad_module_get;
1962
1963         /* get a minor number for the dev */
1964         if (minor == DM_ANY_MINOR)
1965                 r = next_free_minor(&minor);
1966         else
1967                 r = specific_minor(minor);
1968         if (r < 0)
1969                 goto bad_minor;
1970
1971         r = init_srcu_struct(&md->io_barrier);
1972         if (r < 0)
1973                 goto bad_io_barrier;
1974
1975         md->numa_node_id = numa_node_id;
1976         md->init_tio_pdu = false;
1977         md->type = DM_TYPE_NONE;
1978         mutex_init(&md->suspend_lock);
1979         mutex_init(&md->type_lock);
1980         mutex_init(&md->table_devices_lock);
1981         spin_lock_init(&md->deferred_lock);
1982         atomic_set(&md->holders, 1);
1983         atomic_set(&md->open_count, 0);
1984         atomic_set(&md->event_nr, 0);
1985         atomic_set(&md->uevent_seq, 0);
1986         INIT_LIST_HEAD(&md->uevent_list);
1987         INIT_LIST_HEAD(&md->table_devices);
1988         spin_lock_init(&md->uevent_lock);
1989
1990         /*
1991          * default to bio-based required ->make_request_fn until DM
1992          * table is loaded and md->type established. If request-based
1993          * table is loaded: blk-mq will override accordingly.
1994          */
1995         md->queue = blk_alloc_queue(dm_make_request, numa_node_id);
1996         if (!md->queue)
1997                 goto bad;
1998
1999         md->disk = alloc_disk_node(1, md->numa_node_id);
2000         if (!md->disk)
2001                 goto bad;
2002
2003         init_waitqueue_head(&md->wait);
2004         INIT_WORK(&md->work, dm_wq_work);
2005         init_waitqueue_head(&md->eventq);
2006         init_completion(&md->kobj_holder.completion);
2007
2008         md->disk->major = _major;
2009         md->disk->first_minor = minor;
2010         md->disk->fops = &dm_blk_dops;
2011         md->disk->queue = md->queue;
2012         md->disk->private_data = md;
2013         sprintf(md->disk->disk_name, "dm-%d", minor);
2014
2015         if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
2016                 md->dax_dev = alloc_dax(md, md->disk->disk_name,
2017                                         &dm_dax_ops, 0);
2018                 if (IS_ERR(md->dax_dev))
2019                         goto bad;
2020         }
2021
2022         add_disk_no_queue_reg(md->disk);
2023         format_dev_t(md->name, MKDEV(_major, minor));
2024
2025         md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
2026         if (!md->wq)
2027                 goto bad;
2028
2029         md->bdev = bdget_disk(md->disk, 0);
2030         if (!md->bdev)
2031                 goto bad;
2032
2033         dm_stats_init(&md->stats);
2034
2035         /* Populate the mapping, nobody knows we exist yet */
2036         spin_lock(&_minor_lock);
2037         old_md = idr_replace(&_minor_idr, md, minor);
2038         spin_unlock(&_minor_lock);
2039
2040         BUG_ON(old_md != MINOR_ALLOCED);
2041
2042         return md;
2043
2044 bad:
2045         cleanup_mapped_device(md);
2046 bad_io_barrier:
2047         free_minor(minor);
2048 bad_minor:
2049         module_put(THIS_MODULE);
2050 bad_module_get:
2051         kvfree(md);
2052         return NULL;
2053 }
2054
/* defined further down; free_dev() needs it */
static void unlock_fs(struct mapped_device *md);

/*
 * Final teardown of a mapped_device: thaw the filesystem if it is still
 * frozen, release everything cleanup_mapped_device() handles, then the table
 * devices, stats, the minor number, the module reference and @md itself.
 */
static void free_dev(struct mapped_device *md)
{
        /* read the minor now: md->disk is released by cleanup_mapped_device() */
        int minor = MINOR(disk_devt(md->disk));

        unlock_fs(md);

        cleanup_mapped_device(md);

        free_table_devices(&md->table_devices);
        dm_stats_cleanup(&md->stats);
        free_minor(minor);

        module_put(THIS_MODULE);
        kvfree(md);
}
2072
2073 static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
2074 {
2075         struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2076         int ret = 0;
2077
2078         if (dm_table_bio_based(t)) {
2079                 /*
2080                  * The md may already have mempools that need changing.
2081                  * If so, reload bioset because front_pad may have changed
2082                  * because a different table was loaded.
2083                  */
2084                 bioset_exit(&md->bs);
2085                 bioset_exit(&md->io_bs);
2086
2087         } else if (bioset_initialized(&md->bs)) {
2088                 /*
2089                  * There's no need to reload with request-based dm
2090                  * because the size of front_pad doesn't change.
2091                  * Note for future: If you are to reload bioset,
2092                  * prep-ed requests in the queue may refer
2093                  * to bio from the old bioset, so you must walk
2094                  * through the queue to unprep.
2095                  */
2096                 goto out;
2097         }
2098
2099         BUG_ON(!p ||
2100                bioset_initialized(&md->bs) ||
2101                bioset_initialized(&md->io_bs));
2102
2103         ret = bioset_init_from_src(&md->bs, &p->bs);
2104         if (ret)
2105                 goto out;
2106         ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
2107         if (ret)
2108                 bioset_exit(&md->bs);
2109 out:
2110         /* mempool bind completed, no longer need any mempools in the table */
2111         dm_table_free_md_mempools(t);
2112         return ret;
2113 }
2114
2115 /*
2116  * Bind a table to the device.
2117  */
2118 static void event_callback(void *context)
2119 {
2120         unsigned long flags;
2121         LIST_HEAD(uevents);
2122         struct mapped_device *md = (struct mapped_device *) context;
2123
2124         spin_lock_irqsave(&md->uevent_lock, flags);
2125         list_splice_init(&md->uevent_list, &uevents);
2126         spin_unlock_irqrestore(&md->uevent_lock, flags);
2127
2128         dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2129
2130         atomic_inc(&md->event_nr);
2131         wake_up(&md->eventq);
2132         dm_issue_global_event();
2133 }
2134
2135 /*
2136  * Protected by md->suspend_lock obtained by dm_swap_table().
2137  */
2138 static void __set_size(struct mapped_device *md, sector_t size)
2139 {
2140         lockdep_assert_held(&md->suspend_lock);
2141
2142         set_capacity(md->disk, size);
2143
2144         i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2145 }
2146
/*
 * Bind table @t to @md using the queue limits in @limits.
 *
 * Must be called with md->suspend_lock held.
 *
 * Returns old map, which caller must destroy (NULL if there was none), or
 * ERR_PTR() if binding the mempools failed.
 * NOTE(review): on that failure path the device size, the event callback
 * and (for request-based / NVMe bio-based) md->immutable_target have already
 * been updated for @t, while md->map still points at the old table.
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
                               struct queue_limits *limits)
{
        struct dm_table *old_map;
        struct request_queue *q = md->queue;
        bool request_based = dm_table_request_based(t);
        sector_t size;
        int ret;

        lockdep_assert_held(&md->suspend_lock);

        size = dm_table_get_size(t);

        /*
         * Wipe any geometry if the size of the table changed.
         */
        if (size != dm_get_size(md))
                memset(&md->geometry, 0, sizeof(md->geometry));

        __set_size(md, size);

        dm_table_event_callback(t, event_callback, md);

        /*
         * The queue hasn't been stopped yet, if the old table type wasn't
         * for request-based during suspension.  So stop it to prevent
         * I/O mapping before resume.
         * This must be done before setting the queue restrictions,
         * because request-based dm may be run just after the setting.
         */
        if (request_based)
                dm_stop_queue(q);

        if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
                /*
                 * Leverage the fact that request-based DM targets and
                 * NVMe bio based targets are immutable singletons
                 * - used to optimize both dm_request_fn and dm_mq_queue_rq;
                 *   and __process_bio.
                 */
                md->immutable_target = dm_table_get_immutable_target(t);
        }

        ret = __bind_mempools(md, t);
        if (ret) {
                old_map = ERR_PTR(ret);
                goto out;
        }

        /* publish the new table; the old one is handed back to the caller */
        old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
        rcu_assign_pointer(md->map, (void *)t);
        md->immutable_target_type = dm_table_get_immutable_target_type(t);

        dm_table_set_restrictions(t, q, limits);
        /* only sync when a previous table was actually replaced */
        if (old_map)
                dm_sync_table(md);

out:
        return old_map;
}
2210
2211 /*
2212  * Returns unbound table for the caller to free.
2213  */
2214 static struct dm_table *__unbind(struct mapped_device *md)
2215 {
2216         struct dm_table *map = rcu_dereference_protected(md->map, 1);
2217
2218         if (!map)
2219                 return NULL;
2220
2221         dm_table_event_callback(map, NULL, NULL);
2222         RCU_INIT_POINTER(md->map, NULL);
2223         dm_sync_table(md);
2224
2225         return map;
2226 }
2227
2228 /*
2229  * Constructor for a new device.
2230  */
2231 int dm_create(int minor, struct mapped_device **result)
2232 {
2233         int r;
2234         struct mapped_device *md;
2235
2236         md = alloc_dev(minor);
2237         if (!md)
2238                 return -ENXIO;
2239
2240         r = dm_sysfs_init(md);
2241         if (r) {
2242                 free_dev(md);
2243                 return r;
2244         }
2245
2246         *result = md;
2247         return 0;
2248 }
2249
2250 /*
2251  * Functions to manage md->type.
2252  * All are required to hold md->type_lock.
2253  */
/* Acquire md->type_lock; pair with dm_unlock_md_type(). */
void dm_lock_md_type(struct mapped_device *md)
{
        mutex_lock(&md->type_lock);
}
2258
/* Release md->type_lock taken by dm_lock_md_type(). */
void dm_unlock_md_type(struct mapped_device *md)
{
        mutex_unlock(&md->type_lock);
}
2263
/*
 * Record the queue mode of @md.  BUGs if md->type_lock is not locked
 * (mutex_is_locked() only checks that someone holds it).
 */
void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
{
        BUG_ON(!mutex_is_locked(&md->type_lock));
        md->type = type;
}
2269
/*
 * Return the queue mode recorded in md->type.  The field is read directly;
 * this helper neither takes nor asserts md->type_lock.
 */
enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
{
        return md->type;
}
2274
/* Return md->immutable_target_type, as last recorded by __bind(). */
struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
        return md->immutable_target_type;
}
2279
/*
 * Return a pointer to the limits embedded in the md's request queue.
 *
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.  BUGs if md->holders is zero.
 */
struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{
        BUG_ON(!atomic_read(&md->holders));
        return &md->queue->limits;
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2290
2291 static void dm_init_congested_fn(struct mapped_device *md)
2292 {
2293         md->queue->backing_dev_info->congested_data = md;
2294         md->queue->backing_dev_info->congested_fn = dm_any_congested;
2295 }
2296
2297 /*
2298  * Setup the DM device's queue based on md's type
2299  */
2300 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2301 {
2302         int r;
2303         struct queue_limits limits;
2304         enum dm_queue_mode type = dm_get_md_type(md);
2305
2306         switch (type) {
2307         case DM_TYPE_REQUEST_BASED:
2308                 r = dm_mq_init_request_queue(md, t);
2309                 if (r) {
2310                         DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2311                         return r;
2312                 }
2313                 dm_init_congested_fn(md);
2314                 break;
2315         case DM_TYPE_BIO_BASED:
2316         case DM_TYPE_DAX_BIO_BASED:
2317         case DM_TYPE_NVME_BIO_BASED:
2318                 dm_init_congested_fn(md);
2319                 break;
2320         case DM_TYPE_NONE:
2321                 WARN_ON_ONCE(true);
2322                 break;
2323         }
2324
2325         r = dm_calculate_queue_limits(t, &limits);
2326         if (r) {
2327                 DMERR("Cannot calculate initial queue limits");
2328                 return r;
2329         }
2330         dm_table_set_restrictions(t, md->queue, &limits);
2331         blk_register_queue(md->disk);
2332
2333         return 0;
2334 }
2335
2336 struct mapped_device *dm_get_md(dev_t dev)
2337 {
2338         struct mapped_device *md;
2339         unsigned minor = MINOR(dev);
2340
2341         if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2342                 return NULL;
2343
2344         spin_lock(&_minor_lock);
2345
2346         md = idr_find(&_minor_idr, minor);
2347         if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2348             test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2349                 md = NULL;
2350                 goto out;
2351         }
2352         dm_get(md);
2353 out:
2354         spin_unlock(&_minor_lock);
2355
2356         return md;
2357 }
2358 EXPORT_SYMBOL_GPL(dm_get_md);
2359
/* Return the opaque pointer stored by dm_set_mdptr(). */
void *dm_get_mdptr(struct mapped_device *md)
{
        return md->interface_ptr;
}
2364
/* Store an opaque pointer on @md; read it back with dm_get_mdptr(). */
void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
        md->interface_ptr = ptr;
}
2369
/*
 * Take a reference on @md (increments md->holders).
 * BUGs if the device is already marked DMF_FREEING; callers that cannot
 * rule that out can use dm_hold() instead.
 */
void dm_get(struct mapped_device *md)
{
        atomic_inc(&md->holders);
        BUG_ON(test_bit(DMF_FREEING, &md->flags));
}
2375
2376 int dm_hold(struct mapped_device *md)
2377 {
2378         spin_lock(&_minor_lock);
2379         if (test_bit(DMF_FREEING, &md->flags)) {
2380                 spin_unlock(&_minor_lock);
2381                 return -EBUSY;
2382         }
2383         dm_get(md);
2384         spin_unlock(&_minor_lock);
2385         return 0;
2386 }
2387 EXPORT_SYMBOL_GPL(dm_hold);
2388
/*
 * Return md->name, the string alloc_dev() filled in with format_dev_t()
 * from the device's major and minor.  The pointer refers to storage inside
 * @md.
 */
const char *dm_device_name(struct mapped_device *md)
{
        return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);
2394
/*
 * Destroy a mapped_device.
 *
 * @wait: if true, sleep until md->holders drops to zero before freeing;
 *        if false, only warn when references remain and free anyway.
 *
 * May sleep.
 */
static void __dm_destroy(struct mapped_device *md, bool wait)
{
        struct dm_table *map;
        int srcu_idx;

        might_sleep();

        /*
         * Put the placeholder back into the minor slot and mark the md as
         * going away, both under _minor_lock, so dm_get_md()/dm_hold() stop
         * handing out new references.
         */
        spin_lock(&_minor_lock);
        idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
        set_bit(DMF_FREEING, &md->flags);
        spin_unlock(&_minor_lock);

        blk_set_queue_dying(md->queue);

        /*
         * Take suspend_lock so that presuspend and postsuspend methods
         * do not race with internal suspend.
         */
        mutex_lock(&md->suspend_lock);
        map = dm_get_live_table(md, &srcu_idx);
        if (!dm_suspended_md(md)) {
                dm_table_presuspend_targets(map);
                set_bit(DMF_SUSPENDED, &md->flags);
                dm_table_postsuspend_targets(map);
        }
        /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
        dm_put_live_table(md, srcu_idx);
        mutex_unlock(&md->suspend_lock);

        /*
         * Rare, but there may be I/O requests still going to complete,
         * for example.  Wait for all references to disappear.
         * No one should increment the reference count of the mapped_device,
         * after the mapped_device state becomes DMF_FREEING.
         */
        if (wait)
                while (atomic_read(&md->holders))
                        msleep(1);
        else if (atomic_read(&md->holders))
                DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
                       dm_device_name(md), atomic_read(&md->holders));

        dm_sysfs_exit(md);
        /* __unbind() may return NULL when no table was bound */
        dm_table_destroy(__unbind(md));
        free_dev(md);
}
2441
/* Destroy @md, waiting for all remaining references to be dropped first. */
void dm_destroy(struct mapped_device *md)
{
        __dm_destroy(md, true);
}
2446
/*
 * Destroy @md without waiting for references; a warning is logged if any
 * are still held.
 */
void dm_destroy_immediate(struct mapped_device *md)
{
        __dm_destroy(md, false);
}
2451
/*
 * Drop a reference on @md (decrements md->holders).  Nothing is freed here;
 * __dm_destroy() is what looks at the holder count.
 */
void dm_put(struct mapped_device *md)
{
        atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);
2457
2458 static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2459 {
2460         int r = 0;
2461         DEFINE_WAIT(wait);
2462
2463         while (1) {
2464                 prepare_to_wait(&md->wait, &wait, task_state);
2465
2466                 if (!md_in_flight(md))
2467                         break;
2468
2469                 if (signal_pending_state(task_state, current)) {
2470                         r = -EINTR;
2471                         break;
2472                 }
2473
2474                 io_schedule();
2475         }
2476         finish_wait(&md->wait, &wait);
2477
2478         return r;
2479 }
2480
2481 /*
2482  * Process the deferred bios
2483  */
2484 static void dm_wq_work(struct work_struct *work)
2485 {
2486         struct mapped_device *md = container_of(work, struct mapped_device,
2487                                                 work);
2488         struct bio *c;
2489         int srcu_idx;
2490         struct dm_table *map;
2491
2492         map = dm_get_live_table(md, &srcu_idx);
2493
2494         while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2495                 spin_lock_irq(&md->deferred_lock);
2496                 c = bio_list_pop(&md->deferred);
2497                 spin_unlock_irq(&md->deferred_lock);
2498
2499                 if (!c)
2500                         break;
2501
2502                 if (dm_request_based(md))
2503                         (void) generic_make_request(c);
2504                 else
2505                         (void) dm_process_bio(md, map, c);
2506         }
2507
2508         dm_put_live_table(md, srcu_idx);
2509 }
2510
/*
 * Allow I/O again and schedule processing of the deferred bios: clears
 * DMF_BLOCK_IO_FOR_SUSPEND, then queues md->work (dm_wq_work) on md->wq.
 */
static void dm_queue_flush(struct mapped_device *md)
{
        clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
        /* barrier between the flag clear and queueing the work */
        smp_mb__after_atomic();
        queue_work(md->wq, &md->work);
}
2517
/*
 * Swap in a new table, returning the old one for the caller to destroy.
 *
 * Takes md->suspend_lock for the duration.  Returns ERR_PTR(-EINVAL) if the
 * device is not suspended, ERR_PTR() of the dm_calculate_queue_limits()
 * error, or whatever __bind() returns (the old table, NULL, or an ERR_PTR).
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
        struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
        struct queue_limits limits;
        int r;

        mutex_lock(&md->suspend_lock);

        /* device must be suspended */
        if (!dm_suspended_md(md))
                goto out;

        /*
         * If the new table has no data devices, retain the existing limits.
         * This helps multipath with queue_if_no_path if all paths disappear,
         * then new I/O is queued based on these limits, and then some paths
         * reappear.
         */
        if (dm_table_has_no_data_devices(table)) {
                live_map = dm_get_live_table_fast(md);
                if (live_map)
                        limits = md->queue->limits;
                dm_put_live_table_fast(md);
        }

        /*
         * live_map is only used as a "limits were copied" marker from here
         * on; it is not dereferenced after the put above.
         */
        if (!live_map) {
                r = dm_calculate_queue_limits(table, &limits);
                if (r) {
                        map = ERR_PTR(r);
                        goto out;
                }
        }

        map = __bind(md, table, &limits);
        /* raised whether or not __bind() succeeded */
        dm_issue_global_event();

out:
        mutex_unlock(&md->suspend_lock);
        return map;
}
2561
2562 /*
2563  * Functions to lock and unlock any filesystem running on the
2564  * device.
2565  */
2566 static int lock_fs(struct mapped_device *md)
2567 {
2568         int r;
2569
2570         WARN_ON(md->frozen_sb);
2571
2572         md->frozen_sb = freeze_bdev(md->bdev);
2573         if (IS_ERR(md->frozen_sb)) {
2574                 r = PTR_ERR(md->frozen_sb);
2575                 md->frozen_sb = NULL;
2576                 return r;
2577         }
2578
2579         set_bit(DMF_FROZEN, &md->flags);
2580
2581         return 0;
2582 }
2583
2584 static void unlock_fs(struct mapped_device *md)
2585 {
2586         if (!test_bit(DMF_FROZEN, &md->flags))
2587                 return;
2588
2589         thaw_bdev(md->bdev, md->frozen_sb);
2590         md->frozen_sb = NULL;
2591         clear_bit(DMF_FROZEN, &md->flags);
2592 }
2593
/*
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
 *
 * Must be called with md->suspend_lock held.
 *
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to md->deferred list.
 *
 * On error (lock_fs() failure or an interrupted wait) the steps taken so
 * far are undone and @dmf_suspended_flag is left clear.
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
                        unsigned suspend_flags, long task_state,
                        int dmf_suspended_flag)
{
        bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
        bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
        int r;

        lockdep_assert_held(&md->suspend_lock);

        /*
         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
         * This flag is cleared before dm_suspend returns.
         */
        if (noflush)
                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
        else
                DMDEBUG("%s: suspending with flush", dm_device_name(md));

        /*
         * This gets reverted if there's an error later and the targets
         * provide the .presuspend_undo hook.
         */
        dm_table_presuspend_targets(map);

        /*
         * Flush I/O to the device.
         * Any I/O submitted after lock_fs() may not be flushed.
         * noflush takes precedence over do_lockfs.
         * (lock_fs() flushes I/Os and waits for them to complete.)
         */
        if (!noflush && do_lockfs) {
                r = lock_fs(md);
                if (r) {
                        dm_table_presuspend_undo_targets(map);
                        return r;
                }
        }

        /*
         * Here we must make sure that no processes are submitting requests
         * to target drivers i.e. no one may be executing
         * __split_and_process_bio. This is called from dm_request and
         * dm_wq_work.
         *
         * To get all processes out of __split_and_process_bio in dm_request,
         * we take the write lock. To prevent any process from reentering
         * __split_and_process_bio from dm_request and quiesce the thread
         * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
         * flush_workqueue(md->wq).
         */
        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
        if (map)
                synchronize_srcu(&md->io_barrier);

        /*
         * Stop md->queue before flushing md->wq in case request-based
         * dm defers requests to md->wq from md->queue.
         */
        if (dm_request_based(md))
                dm_stop_queue(md->queue);

        flush_workqueue(md->wq);

        /*
         * At this point no more requests are entering target request routines.
         * We call dm_wait_for_completion to wait for all existing requests
         * to finish.
         */
        r = dm_wait_for_completion(md, task_state);
        if (!r)
                set_bit(dmf_suspended_flag, &md->flags);

        if (noflush)
                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
        if (map)
                synchronize_srcu(&md->io_barrier);

        /* were we interrupted ? */
        if (r < 0) {
                /* undo: re-enable I/O, restart the queue, thaw, undo presuspend */
                dm_queue_flush(md);

                if (dm_request_based(md))
                        dm_start_queue(md->queue);

                unlock_fs(md);
                dm_table_presuspend_undo_targets(map);
                /* pushback list is already flushed, so skip flush */
        }

        return r;
}
2695
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
/*
 * Returns 0 on success, -EINVAL if the device is already suspended, the
 * error from wait_on_bit() if waiting for an internal resume is interrupted,
 * or the error from __dm_suspend().
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
        struct dm_table *map = NULL;
        int r = 0;

retry:
        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

        if (dm_suspended_md(md)) {
                r = -EINVAL;
                goto out_unlock;
        }

        if (dm_suspended_internally_md(md)) {
                /* already internally suspended, wait for internal resume */
                mutex_unlock(&md->suspend_lock);
                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
                if (r)
                        return r;
                /* state may have changed while unlocked: start over */
                goto retry;
        }

        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

        r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
        if (r)
                goto out_unlock;

        /* postsuspend only runs once the suspend itself has succeeded */
        dm_table_postsuspend_targets(map);

out_unlock:
        mutex_unlock(&md->suspend_lock);
        return r;
}
2746
/*
 * Common resume path.
 *
 * @map: table whose targets should be resumed, or NULL to skip that step.
 *
 * Returns the dm_table_resume_targets() error (nothing else is done in that
 * case), otherwise 0 after I/O has been re-enabled, the request queue
 * restarted (request-based only) and the filesystem thawed.
 */
static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
        if (map) {
                int r = dm_table_resume_targets(map);
                if (r)
                        return r;
        }

        dm_queue_flush(md);

        /*
         * Flushing deferred I/Os must be done after targets are resumed
         * so that mapping of targets can work correctly.
         * Request-based dm is queueing the deferred I/Os in its request_queue.
         */
        if (dm_request_based(md))
                dm_start_queue(md->queue);

        unlock_fs(md);

        return 0;
}
2769
/*
 * Resume a suspended device.
 *
 * Returns 0 on success; -EINVAL if the device is not suspended, has no table
 * bound, or its table has size zero; the wait_on_bit() error if waiting for
 * an internal resume is interrupted; or the error from __dm_resume().
 * DMF_SUSPENDED is cleared only on success.
 */
int dm_resume(struct mapped_device *md)
{
        int r;
        struct dm_table *map = NULL;

retry:
        /* default result for the early exits below */
        r = -EINVAL;
        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

        if (!dm_suspended_md(md))
                goto out;

        if (dm_suspended_internally_md(md)) {
                /* already internally suspended, wait for internal resume */
                mutex_unlock(&md->suspend_lock);
                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
                if (r)
                        return r;
                /* state may have changed while unlocked: start over */
                goto retry;
        }

        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
        if (!map || !dm_table_get_size(map))
                goto out;

        r = __dm_resume(md, map);
        if (r)
                goto out;

        clear_bit(DMF_SUSPENDED, &md->flags);
out:
        mutex_unlock(&md->suspend_lock);

        return r;
}
2805
/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */

/*
 * Must be called with md->suspend_lock held.  Calls nest: only the first one
 * (internal_suspend_count going 0 -> 1) does any work.
 */
static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
        struct dm_table *map = NULL;

        lockdep_assert_held(&md->suspend_lock);

        if (md->internal_suspend_count++)
                return; /* nested internal suspend */

        if (dm_suspended_md(md)) {
                /* already suspended by dm_suspend(): just record our state */
                set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
                return; /* nest suspend */
        }

        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

        /*
         * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
         * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
         * would require changing .presuspend to return an error -- avoid this
         * until there is a need for more elaborate variants of internal suspend.
         */
        /* the result is deliberately ignored; postsuspend runs regardless */
        (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
                            DMF_SUSPENDED_INTERNALLY);

        dm_table_postsuspend_targets(map);
}
2839
2840 static void __dm_internal_resume(struct mapped_device *md)
2841 {
2842         BUG_ON(!md->internal_suspend_count);
2843
2844         if (--md->internal_suspend_count)
2845                 return; /* resume from nested internal suspend */
2846
2847         if (dm_suspended_md(md))
2848                 goto done; /* resume from nested suspend */
2849
2850         /*
2851          * NOTE: existing callers don't need to call dm_table_resume_targets
2852          * (which may fail -- so best to avoid it for now by passing NULL map)
2853          */
2854         (void) __dm_resume(md, NULL);
2855
2856 done:
2857         clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2858         smp_mb__after_atomic();
2859         wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2860 }
2861
2862 void dm_internal_suspend_noflush(struct mapped_device *md)
2863 {
2864         mutex_lock(&md->suspend_lock);
2865         __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2866         mutex_unlock(&md->suspend_lock);
2867 }
2868 EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2869
2870 void dm_internal_resume(struct mapped_device *md)
2871 {
2872         mutex_lock(&md->suspend_lock);
2873         __dm_internal_resume(md);
2874         mutex_unlock(&md->suspend_lock);
2875 }
2876 EXPORT_SYMBOL_GPL(dm_internal_resume);
2877
2878 /*
2879  * Fast variants of internal suspend/resume hold md->suspend_lock,
2880  * which prevents interaction with userspace-driven suspend.
2881  */
2882
2883 void dm_internal_suspend_fast(struct mapped_device *md)
2884 {
2885         mutex_lock(&md->suspend_lock);
2886         if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2887                 return;
2888
2889         set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2890         synchronize_srcu(&md->io_barrier);
2891         flush_workqueue(md->wq);
2892         dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2893 }
2894 EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2895
2896 void dm_internal_resume_fast(struct mapped_device *md)
2897 {
2898         if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2899                 goto done;
2900
2901         dm_queue_flush(md);
2902
2903 done:
2904         mutex_unlock(&md->suspend_lock);
2905 }
2906 EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
2907
2908 /*-----------------------------------------------------------------
2909  * Event notification.
2910  *---------------------------------------------------------------*/
2911 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2912                        unsigned cookie)
2913 {
2914         char udev_cookie[DM_COOKIE_LENGTH];
2915         char *envp[] = { udev_cookie, NULL };
2916
2917         if (!cookie)
2918                 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2919         else {
2920                 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2921                          DM_COOKIE_ENV_VAR_NAME, cookie);
2922                 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2923                                           action, envp);
2924         }
2925 }
2926
2927 uint32_t dm_next_uevent_seq(struct mapped_device *md)
2928 {
2929         return atomic_add_return(1, &md->uevent_seq);
2930 }
2931
2932 uint32_t dm_get_event_nr(struct mapped_device *md)
2933 {
2934         return atomic_read(&md->event_nr);
2935 }
2936
2937 int dm_wait_event(struct mapped_device *md, int event_nr)
2938 {
2939         return wait_event_interruptible(md->eventq,
2940                         (event_nr != atomic_read(&md->event_nr)));
2941 }
2942
2943 void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2944 {
2945         unsigned long flags;
2946
2947         spin_lock_irqsave(&md->uevent_lock, flags);
2948         list_add(elist, &md->uevent_list);
2949         spin_unlock_irqrestore(&md->uevent_lock, flags);
2950 }
2951
2952 /*
2953  * The gendisk is only valid as long as you have a reference
2954  * count on 'md'.
2955  */
2956 struct gendisk *dm_disk(struct mapped_device *md)
2957 {
2958         return md->disk;
2959 }
2960 EXPORT_SYMBOL_GPL(dm_disk);
2961
2962 struct kobject *dm_kobject(struct mapped_device *md)
2963 {
2964         return &md->kobj_holder.kobj;
2965 }
2966
2967 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2968 {
2969         struct mapped_device *md;
2970
2971         md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2972
2973         spin_lock(&_minor_lock);
2974         if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2975                 md = NULL;
2976                 goto out;
2977         }
2978         dm_get(md);
2979 out:
2980         spin_unlock(&_minor_lock);
2981
2982         return md;
2983 }
2984
2985 int dm_suspended_md(struct mapped_device *md)
2986 {
2987         return test_bit(DMF_SUSPENDED, &md->flags);
2988 }
2989
2990 int dm_suspended_internally_md(struct mapped_device *md)
2991 {
2992         return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2993 }
2994
2995 int dm_test_deferred_remove_flag(struct mapped_device *md)
2996 {
2997         return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2998 }
2999
3000 int dm_suspended(struct dm_target *ti)
3001 {
3002         return dm_suspended_md(dm_table_get_md(ti->table));
3003 }
3004 EXPORT_SYMBOL_GPL(dm_suspended);
3005
3006 int dm_noflush_suspending(struct dm_target *ti)
3007 {
3008         return __noflush_suspending(dm_table_get_md(ti->table));
3009 }
3010 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
3011
3012 struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
3013                                             unsigned integrity, unsigned per_io_data_size,
3014                                             unsigned min_pool_size)
3015 {
3016         struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
3017         unsigned int pool_size = 0;
3018         unsigned int front_pad, io_front_pad;
3019         int ret;
3020
3021         if (!pools)
3022                 return NULL;
3023
3024         switch (type) {
3025         case DM_TYPE_BIO_BASED:
3026         case DM_TYPE_DAX_BIO_BASED:
3027         case DM_TYPE_NVME_BIO_BASED:
3028                 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
3029                 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
3030                 io_front_pad = roundup(front_pad,  __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
3031                 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
3032                 if (ret)
3033                         goto out;
3034                 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
3035                         goto out;
3036                 break;
3037         case DM_TYPE_REQUEST_BASED:
3038                 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
3039                 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
3040                 /* per_io_data_size is used for blk-mq pdu at queue allocation */
3041                 break;
3042         default:
3043                 BUG();
3044         }
3045
3046         ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
3047         if (ret)
3048                 goto out;
3049
3050         if (integrity && bioset_integrity_create(&pools->bs, pool_size))
3051                 goto out;
3052
3053         return pools;
3054
3055 out:
3056         dm_free_md_mempools(pools);
3057
3058         return NULL;
3059 }
3060
3061 void dm_free_md_mempools(struct dm_md_mempools *pools)
3062 {
3063         if (!pools)
3064                 return;
3065
3066         bioset_exit(&pools->bs);
3067         bioset_exit(&pools->io_bs);
3068
3069         kfree(pools);
3070 }
3071
3072 struct dm_pr {
3073         u64     old_key;
3074         u64     new_key;
3075         u32     flags;
3076         bool    fail_early;
3077 };
3078
3079 static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
3080                       void *data)
3081 {
3082         struct mapped_device *md = bdev->bd_disk->private_data;
3083         struct dm_table *table;
3084         struct dm_target *ti;
3085         int ret = -ENOTTY, srcu_idx;
3086
3087         table = dm_get_live_table(md, &srcu_idx);
3088         if (!table || !dm_table_get_size(table))
3089                 goto out;
3090
3091         /* We only support devices that have a single target */
3092         if (dm_table_get_num_targets(table) != 1)
3093                 goto out;
3094         ti = dm_table_get_target(table, 0);
3095
3096         ret = -EINVAL;
3097         if (!ti->type->iterate_devices)
3098                 goto out;
3099
3100         ret = ti->type->iterate_devices(ti, fn, data);
3101 out:
3102         dm_put_live_table(md, srcu_idx);
3103         return ret;
3104 }
3105
3106 /*
3107  * For register / unregister we need to manually call out to every path.
3108  */
3109 static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
3110                             sector_t start, sector_t len, void *data)
3111 {
3112         struct dm_pr *pr = data;
3113         const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3114
3115         if (!ops || !ops->pr_register)
3116                 return -EOPNOTSUPP;
3117         return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
3118 }
3119
3120 static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3121                           u32 flags)
3122 {
3123         struct dm_pr pr = {
3124                 .old_key        = old_key,
3125                 .new_key        = new_key,
3126                 .flags          = flags,
3127                 .fail_early     = true,
3128         };
3129         int ret;
3130
3131         ret = dm_call_pr(bdev, __dm_pr_register, &pr);
3132         if (ret && new_key) {
3133                 /* unregister all paths if we failed to register any path */
3134                 pr.old_key = new_key;
3135                 pr.new_key = 0;
3136                 pr.flags = 0;
3137                 pr.fail_early = false;
3138                 dm_call_pr(bdev, __dm_pr_register, &pr);
3139         }
3140
3141         return ret;
3142 }
3143
3144 static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3145                          u32 flags)
3146 {
3147         struct mapped_device *md = bdev->bd_disk->private_data;
3148         const struct pr_ops *ops;
3149         int r, srcu_idx;
3150
3151         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3152         if (r < 0)
3153                 goto out;
3154
3155         ops = bdev->bd_disk->fops->pr_ops;
3156         if (ops && ops->pr_reserve)
3157                 r = ops->pr_reserve(bdev, key, type, flags);
3158         else
3159                 r = -EOPNOTSUPP;
3160 out:
3161         dm_unprepare_ioctl(md, srcu_idx);
3162         return r;
3163 }
3164
3165 static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3166 {
3167         struct mapped_device *md = bdev->bd_disk->private_data;
3168         const struct pr_ops *ops;
3169         int r, srcu_idx;
3170
3171         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3172         if (r < 0)
3173                 goto out;
3174
3175         ops = bdev->bd_disk->fops->pr_ops;
3176         if (ops && ops->pr_release)
3177                 r = ops->pr_release(bdev, key, type);
3178         else
3179                 r = -EOPNOTSUPP;
3180 out:
3181         dm_unprepare_ioctl(md, srcu_idx);
3182         return r;
3183 }
3184
3185 static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3186                          enum pr_type type, bool abort)
3187 {
3188         struct mapped_device *md = bdev->bd_disk->private_data;
3189         const struct pr_ops *ops;
3190         int r, srcu_idx;
3191
3192         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3193         if (r < 0)
3194                 goto out;
3195
3196         ops = bdev->bd_disk->fops->pr_ops;
3197         if (ops && ops->pr_preempt)
3198                 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3199         else
3200                 r = -EOPNOTSUPP;
3201 out:
3202         dm_unprepare_ioctl(md, srcu_idx);
3203         return r;
3204 }
3205
3206 static int dm_pr_clear(struct block_device *bdev, u64 key)
3207 {
3208         struct mapped_device *md = bdev->bd_disk->private_data;
3209         const struct pr_ops *ops;
3210         int r, srcu_idx;
3211
3212         r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3213         if (r < 0)
3214                 goto out;
3215
3216         ops = bdev->bd_disk->fops->pr_ops;
3217         if (ops && ops->pr_clear)
3218                 r = ops->pr_clear(bdev, key);
3219         else
3220                 r = -EOPNOTSUPP;
3221 out:
3222         dm_unprepare_ioctl(md, srcu_idx);
3223         return r;
3224 }
3225
3226 static const struct pr_ops dm_pr_ops = {
3227         .pr_register    = dm_pr_register,
3228         .pr_reserve     = dm_pr_reserve,
3229         .pr_release     = dm_pr_release,
3230         .pr_preempt     = dm_pr_preempt,
3231         .pr_clear       = dm_pr_clear,
3232 };
3233
3234 static const struct block_device_operations dm_blk_dops = {
3235         .open = dm_blk_open,
3236         .release = dm_blk_close,
3237         .ioctl = dm_blk_ioctl,
3238         .getgeo = dm_blk_getgeo,
3239         .report_zones = dm_blk_report_zones,
3240         .pr_ops = &dm_pr_ops,
3241         .owner = THIS_MODULE
3242 };
3243
3244 static const struct dax_operations dm_dax_ops = {
3245         .direct_access = dm_dax_direct_access,
3246         .dax_supported = dm_dax_supported,
3247         .copy_from_iter = dm_dax_copy_from_iter,
3248         .copy_to_iter = dm_dax_copy_to_iter,
3249         .zero_page_range = dm_dax_zero_page_range,
3250 };
3251
3252 /*
3253  * module hooks
3254  */
3255 module_init(dm_init);
3256 module_exit(dm_exit);
3257
3258 module_param(major, uint, 0);
3259 MODULE_PARM_DESC(major, "The major number of the device mapper");
3260
3261 module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3262 MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3263
3264 module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3265 MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3266
3267 MODULE_DESCRIPTION(DM_NAME " driver");
3268 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3269 MODULE_LICENSE("GPL");