Merge tag 'soc-drivers-6.9' of git://git.kernel.org/pub/scm/linux/kernel/git/soc/soc
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 286f8b16c7bde7fbc0bca0705d470748d9f5eeb5..be8ac24f50b6ad651fd107f9af9a448bb1f7780a 100644 (file)
@@ -46,9 +46,6 @@
 static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
 static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 
-#define raid1_log(md, fmt, args...)                            \
-       do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
-
 #define RAID_1_10_NAME "raid1"
 #include "raid1-10.c"
 
@@ -498,9 +495,6 @@ static void raid1_end_write_request(struct bio *bio)
                 * to user-side. So if something waits for IO, then it
                 * will wait for the 'master' bio.
                 */
-               sector_t first_bad;
-               int bad_sectors;
-
                r1_bio->bios[mirror] = NULL;
                to_put = bio;
                /*
@@ -516,8 +510,8 @@ static void raid1_end_write_request(struct bio *bio)
                        set_bit(R1BIO_Uptodate, &r1_bio->state);
 
                /* Maybe we can clear some bad blocks. */
-               if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
-                               &first_bad, &bad_sectors) && !discard_error) {
+               if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
+                   !discard_error) {
                        r1_bio->bios[mirror] = IO_MADE_GOOD;
                        set_bit(R1BIO_MadeGood, &r1_bio->state);
                }
@@ -582,211 +576,312 @@ static sector_t align_to_barrier_unit_end(sector_t start_sector,
        return len;
 }
 
-/*
- * This routine returns the disk from which the requested read should
- * be done. There is a per-array 'next expected sequential IO' sector
- * number - if this matches on the next IO then we use the last disk.
- * There is also a per-disk 'last know head position' sector that is
- * maintained from IRQ contexts, both the normal and the resync IO
- * completion handlers update this position correctly. If there is no
- * perfect sequential match then we pick the disk whose head is closest.
- *
- * If there are 2 mirrors in the same 2 devices, performance degrades
- * because position is mirror, not device based.
- *
- * The rdev for the device selected will have nr_pending incremented.
- */
-static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
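+/*
+ * Take a reference on the chosen disk's rdev and update its sequential-read
+ * tracking (seq_start/next_seq_sect) for the range being dispatched.
+ */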
+static void update_read_sectors(struct r1conf *conf, int disk,
+                               sector_t this_sector, int len)
 {
-       const sector_t this_sector = r1_bio->sector;
-       int sectors;
-       int best_good_sectors;
-       int best_disk, best_dist_disk, best_pending_disk;
-       int has_nonrot_disk;
+       struct raid1_info *info = &conf->mirrors[disk];
+
+       atomic_inc(&info->rdev->nr_pending);
+       if (info->next_seq_sect != this_sector)
+               info->seq_start = this_sector;
+       info->next_seq_sect = this_sector + len;
+}
+
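+/*
+ * Used while a resync covers the read range (see raid1_should_read_first()):
+ * pick the first working disk that can serve at least part of the request,
+ * even if it has bad blocks, and trim *max_sectors to what it can serve.
+ */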
+static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio,
+                            int *max_sectors)
+{
+       sector_t this_sector = r1_bio->sector;
+       int len = r1_bio->sectors;
        int disk;
-       sector_t best_dist;
-       unsigned int min_pending;
-       struct md_rdev *rdev;
-       int choose_first;
-       int choose_next_idle;
 
-       /*
-        * Check if we can balance. We can balance on the whole
-        * device if no resync is going on, or below the resync window.
-        * We take the first readable disk when above the resync window.
-        */
- retry:
-       sectors = r1_bio->sectors;
-       best_disk = -1;
-       best_dist_disk = -1;
-       best_dist = MaxSector;
-       best_pending_disk = -1;
-       min_pending = UINT_MAX;
-       best_good_sectors = 0;
-       has_nonrot_disk = 0;
-       choose_next_idle = 0;
-       clear_bit(R1BIO_FailFast, &r1_bio->state);
+       for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+               struct md_rdev *rdev;
+               int read_len;
 
-       if ((conf->mddev->recovery_cp < this_sector + sectors) ||
-           (mddev_is_clustered(conf->mddev) &&
-           md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
-                   this_sector + sectors)))
-               choose_first = 1;
-       else
-               choose_first = 0;
+               if (r1_bio->bios[disk] == IO_BLOCKED)
+                       continue;
+
+               rdev = conf->mirrors[disk].rdev;
+               if (!rdev || test_bit(Faulty, &rdev->flags))
+                       continue;
+
+               /* choose the first disk even if it has some bad blocks. */
+               read_len = raid1_check_read_range(rdev, this_sector, &len);
+               if (read_len > 0) {
+                       update_read_sectors(conf, disk, this_sector, read_len);
+                       *max_sectors = read_len;
+                       return disk;
+               }
+       }
+
+       return -1;
+}
+
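+/*
+ * Called when no disk can serve the whole read cleanly: among the
+ * non-write-mostly disks with bad blocks in the range, pick the one that
+ * can serve the most sectors from the start of the request.
+ */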
+static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio,
+                         int *max_sectors)
+{
+       sector_t this_sector = r1_bio->sector;
+       int best_disk = -1;
+       int best_len = 0;
+       int disk;
 
        for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
-               sector_t dist;
-               sector_t first_bad;
-               int bad_sectors;
-               unsigned int pending;
-               bool nonrot;
+               struct md_rdev *rdev;
+               int len;
+               int read_len;
+
+               if (r1_bio->bios[disk] == IO_BLOCKED)
+                       continue;
 
                rdev = conf->mirrors[disk].rdev;
-               if (r1_bio->bios[disk] == IO_BLOCKED
-                   || rdev == NULL
-                   || test_bit(Faulty, &rdev->flags))
+               if (!rdev || test_bit(Faulty, &rdev->flags) ||
+                   test_bit(WriteMostly, &rdev->flags))
                        continue;
-               if (!test_bit(In_sync, &rdev->flags) &&
-                   rdev->recovery_offset < this_sector + sectors)
+
+               /* keep track of the disk with the most readable sectors. */
+               len = r1_bio->sectors;
+               read_len = raid1_check_read_range(rdev, this_sector, &len);
+               if (read_len > best_len) {
+                       best_disk = disk;
+                       best_len = read_len;
+               }
+       }
+
+       if (best_disk != -1) {
+               *max_sectors = best_len;
+               update_read_sectors(conf, best_disk, this_sector, best_len);
+       }
+
+       return best_disk;
+}
+
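+/*
+ * Last resort: read from a write-mostly ("slow") disk. Prefer one that can
+ * serve the whole request; otherwise pick the one with the longest readable
+ * range and trim *max_sectors accordingly.
+ */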
+static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
+                           int *max_sectors)
+{
+       sector_t this_sector = r1_bio->sector;
+       int bb_disk = -1;
+       int bb_read_len = 0;
+       int disk;
+
+       for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+               struct md_rdev *rdev;
+               int len;
+               int read_len;
+
+               if (r1_bio->bios[disk] == IO_BLOCKED)
                        continue;
-               if (test_bit(WriteMostly, &rdev->flags)) {
-                       /* Don't balance among write-mostly, just
-                        * use the first as a last resort */
-                       if (best_dist_disk < 0) {
-                               if (is_badblock(rdev, this_sector, sectors,
-                                               &first_bad, &bad_sectors)) {
-                                       if (first_bad <= this_sector)
-                                               /* Cannot use this */
-                                               continue;
-                                       best_good_sectors = first_bad - this_sector;
-                               } else
-                                       best_good_sectors = sectors;
-                               best_dist_disk = disk;
-                               best_pending_disk = disk;
-                       }
+
+               rdev = conf->mirrors[disk].rdev;
+               if (!rdev || test_bit(Faulty, &rdev->flags) ||
+                   !test_bit(WriteMostly, &rdev->flags))
                        continue;
+
+               /* there are no bad blocks, we can use this disk */
+               len = r1_bio->sectors;
+               read_len = raid1_check_read_range(rdev, this_sector, &len);
+               if (read_len == r1_bio->sectors) {
+                       update_read_sectors(conf, disk, this_sector, read_len);
+                       return disk;
                }
-               /* This is a reasonable device to use.  It might
-                * even be best.
+
+               /*
+                * There are partial bad blocks, so choose the rdev with the
+                * largest read length.
                 */
-               if (is_badblock(rdev, this_sector, sectors,
-                               &first_bad, &bad_sectors)) {
-                       if (best_dist < MaxSector)
-                               /* already have a better device */
-                               continue;
-                       if (first_bad <= this_sector) {
-                               /* cannot read here. If this is the 'primary'
-                                * device, then we must not read beyond
-                                * bad_sectors from another device..
-                                */
-                               bad_sectors -= (this_sector - first_bad);
-                               if (choose_first && sectors > bad_sectors)
-                                       sectors = bad_sectors;
-                               if (best_good_sectors > sectors)
-                                       best_good_sectors = sectors;
-
-                       } else {
-                               sector_t good_sectors = first_bad - this_sector;
-                               if (good_sectors > best_good_sectors) {
-                                       best_good_sectors = good_sectors;
-                                       best_disk = disk;
-                               }
-                               if (choose_first)
-                                       break;
-                       }
-                       continue;
-               } else {
-                       if ((sectors > best_good_sectors) && (best_disk >= 0))
-                               best_disk = -1;
-                       best_good_sectors = sectors;
+               if (read_len > bb_read_len) {
+                       bb_disk = disk;
+                       bb_read_len = read_len;
                }
+       }
+
+       if (bb_disk != -1) {
+               *max_sectors = bb_read_len;
+               update_read_sectors(conf, bb_disk, this_sector, bb_read_len);
+       }
+
+       return bb_disk;
+}
+
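+/*
+ * A read is treated as sequential if it starts where the previous read on
+ * this disk left off, or at the disk's last known head position.
+ */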
+static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio)
+{
+       /* TODO: address issues with this check and concurrency. */
+       return conf->mirrors[disk].next_seq_sect == r1_bio->sector ||
+              conf->mirrors[disk].head_position == r1_bio->sector;
+}
+
+/*
+ * If buffered sequential IO size exceeds the optimal iosize, check if there
+ * is an idle disk. If yes, choose the idle disk.
+ */
+static bool should_choose_next(struct r1conf *conf, int disk)
+{
+       struct raid1_info *mirror = &conf->mirrors[disk];
+       int opt_iosize;
+
+       if (!test_bit(Nonrot, &mirror->rdev->flags))
+               return false;
+
+       opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9;
+       return opt_iosize > 0 && mirror->seq_start != MaxSector &&
+              mirror->next_seq_sect > opt_iosize &&
+              mirror->next_seq_sect - opt_iosize >= mirror->seq_start;
+}
+
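+/*
+ * Can this rdev serve the whole read without splitting? It must exist, not
+ * be faulty, be recovered past the range, not be write-mostly, and have no
+ * bad blocks in the range.
+ */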
+static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
+{
+       if (!rdev || test_bit(Faulty, &rdev->flags))
+               return false;
+
+       /* still in recovery */
+       if (!test_bit(In_sync, &rdev->flags) &&
+           rdev->recovery_offset < r1_bio->sector + r1_bio->sectors)
+               return false;
+
+       /* don't read from slow disk unless we have to */
+       if (test_bit(WriteMostly, &rdev->flags))
+               return false;
+
+       /* don't split IO for bad blocks unless we have to */
+       if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors))
+               return false;
+
+       return true;
+}
+
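+/* Bookkeeping used by choose_best_rdev() while scanning the mirrors. */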
+struct read_balance_ctl {
+       sector_t closest_dist;
+       int closest_dist_disk;
+       int min_pending;
+       int min_pending_disk;
+       int sequential_disk;
+       int readable_disks;
+};
+
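+/*
+ * Choose among disks that can serve the whole read: keep sequential IO on
+ * the same disk unless it is busy and another disk is idle; otherwise fall
+ * back to the least-pending disk (if any disk is non-rotational) or the one
+ * with the closest head position.
+ */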
+static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
+{
+       int disk;
+       struct read_balance_ctl ctl = {
+               .closest_dist_disk      = -1,
+               .closest_dist           = MaxSector,
+               .min_pending_disk       = -1,
+               .min_pending            = UINT_MAX,
+               .sequential_disk        = -1,
+       };
+
+       for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+               struct md_rdev *rdev;
+               sector_t dist;
+               unsigned int pending;
 
-               if (best_disk >= 0)
-                       /* At least two disks to choose from so failfast is OK */
+               if (r1_bio->bios[disk] == IO_BLOCKED)
+                       continue;
+
+               rdev = conf->mirrors[disk].rdev;
+               if (!rdev_readable(rdev, r1_bio))
+                       continue;
+
+               /* At least two disks to choose from so failfast is OK */
+               if (ctl.readable_disks++ == 1)
                        set_bit(R1BIO_FailFast, &r1_bio->state);
 
-               nonrot = bdev_nonrot(rdev->bdev);
-               has_nonrot_disk |= nonrot;
                pending = atomic_read(&rdev->nr_pending);
-               dist = abs(this_sector - conf->mirrors[disk].head_position);
-               if (choose_first) {
-                       best_disk = disk;
-                       break;
-               }
+               dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
+
                /* Don't change to another disk for sequential reads */
-               if (conf->mirrors[disk].next_seq_sect == this_sector
-                   || dist == 0) {
-                       int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
-                       struct raid1_info *mirror = &conf->mirrors[disk];
+               if (is_sequential(conf, disk, r1_bio)) {
+                       if (!should_choose_next(conf, disk))
+                               return disk;
 
-                       best_disk = disk;
                        /*
-                        * If buffered sequential IO size exceeds optimal
-                        * iosize, check if there is idle disk. If yes, choose
-                        * the idle disk. read_balance could already choose an
-                        * idle disk before noticing it's a sequential IO in
-                        * this disk. This doesn't matter because this disk
-                        * will idle, next time it will be utilized after the
-                        * first disk has IO size exceeds optimal iosize. In
-                        * this way, iosize of the first disk will be optimal
-                        * iosize at least. iosize of the second disk might be
-                        * small, but not a big deal since when the second disk
-                        * starts IO, the first disk is likely still busy.
+                        * Add 'pending' to avoid choosing this disk if
+                        * there is another idle disk.
                         */
-                       if (nonrot && opt_iosize > 0 &&
-                           mirror->seq_start != MaxSector &&
-                           mirror->next_seq_sect > opt_iosize &&
-                           mirror->next_seq_sect - opt_iosize >=
-                           mirror->seq_start) {
-                               choose_next_idle = 1;
-                               continue;
-                       }
-                       break;
+                       pending++;
+                       /*
+                        * If there is no other idle disk, this disk
+                        * will be chosen.
+                        */
+                       ctl.sequential_disk = disk;
                }
 
-               if (choose_next_idle)
-                       continue;
-
-               if (min_pending > pending) {
-                       min_pending = pending;
-                       best_pending_disk = disk;
+               if (ctl.min_pending > pending) {
+                       ctl.min_pending = pending;
+                       ctl.min_pending_disk = disk;
                }
 
-               if (dist < best_dist) {
-                       best_dist = dist;
-                       best_dist_disk = disk;
+               if (ctl.closest_dist > dist) {
+                       ctl.closest_dist = dist;
+                       ctl.closest_dist_disk = disk;
                }
        }
 
+       /*
+        * Sequential IO size exceeds the optimal iosize, but there is no other
+        * idle disk, so choose the sequential disk.
+        */
+       if (ctl.sequential_disk != -1 && ctl.min_pending != 0)
+               return ctl.sequential_disk;
+
        /*
        * If all disks are rotational, choose the closest disk. If any disk is
        * non-rotational, choose the disk with the fewest pending requests even
        * if that disk is rotational, which might or might not be optimal for
        * arrays with mixed rotational/non-rotational disks depending on the
        * workload.
         */
-       if (best_disk == -1) {
-               if (has_nonrot_disk || min_pending == 0)
-                       best_disk = best_pending_disk;
-               else
-                       best_disk = best_dist_disk;
-       }
+       if (ctl.min_pending_disk != -1 &&
+           (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
+               return ctl.min_pending_disk;
+       else
+               return ctl.closest_dist_disk;
+}
 
-       if (best_disk >= 0) {
-               rdev = conf->mirrors[best_disk].rdev;
-               if (!rdev)
-                       goto retry;
-               atomic_inc(&rdev->nr_pending);
-               sectors = best_good_sectors;
+/*
+ * This routine returns the disk from which the requested read should be done.
+ *
+ * 1) If resync is in progress, find the first usable disk and use it even if it
+ * has some bad blocks.
+ *
+ * 2) Now that there is no resync, loop through all disks, skipping slow disks
+ * and disks with bad blocks for now. Only pay attention to key disk choice.
+ *
+ * 3) If we've made it this far, now look for disks with bad blocks and choose
+ * the one with the largest number of readable sectors.
+ *
+ * 4) If we are all the way at the end, we have no choice but to use a disk even
+ * if it is write-mostly.
+ *
+ * The rdev for the device selected will have nr_pending incremented.
+ */
+static int read_balance(struct r1conf *conf, struct r1bio *r1_bio,
+                       int *max_sectors)
+{
+       int disk;
 
-               if (conf->mirrors[best_disk].next_seq_sect != this_sector)
-                       conf->mirrors[best_disk].seq_start = this_sector;
+       clear_bit(R1BIO_FailFast, &r1_bio->state);
+
+       if (raid1_should_read_first(conf->mddev, r1_bio->sector,
+                                   r1_bio->sectors))
+               return choose_first_rdev(conf, r1_bio, max_sectors);
 
-               conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
+       disk = choose_best_rdev(conf, r1_bio);
+       if (disk >= 0) {
+               *max_sectors = r1_bio->sectors;
+               update_read_sectors(conf, disk, r1_bio->sector,
+                                   r1_bio->sectors);
+               return disk;
        }
-       *max_sectors = sectors;
 
-       return best_disk;
+       /*
+        * If we are here it means we didn't find a perfectly good disk, so
+        * now spend a bit more time trying to find the one with the most
+        * readable sectors.
+        */
+       disk = choose_bb_rdev(conf, r1_bio, max_sectors);
+       if (disk >= 0)
+               return disk;
+
+       return choose_slow_rdev(conf, r1_bio, max_sectors);
 }
 
 static void wake_up_barrier(struct r1conf *conf)
@@ -1098,7 +1193,7 @@ static void freeze_array(struct r1conf *conf, int extra)
         */
        spin_lock_irq(&conf->resync_lock);
        conf->array_frozen = 1;
-       raid1_log(conf->mddev, "wait freeze");
+       mddev_add_trace_msg(conf->mddev, "raid1 wait freeze");
        wait_event_lock_irq_cmd(
                conf->wait_barrier,
                get_unqueued_pending(conf) == extra,
@@ -1287,7 +1382,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
                 * Reading from a write-mostly device must take care not to
                 * over-take any writes that are 'behind'
                 */
-               raid1_log(mddev, "wait behind writes");
+               mddev_add_trace_msg(mddev, "raid1 wait behind writes");
                wait_event(bitmap->behind_wait,
                           atomic_read(&bitmap->behind_writes) == 0);
        }
@@ -1320,11 +1415,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
            test_bit(R1BIO_FailFast, &r1_bio->state))
                read_bio->bi_opf |= MD_FAILFAST;
        read_bio->bi_private = r1_bio;
-
-       if (mddev->gendisk)
-               trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
-                                     r1_bio->sector);
-
+       mddev_trace_remap(mddev, read_bio, r1_bio->sector);
        submit_bio_noacct(read_bio);
 }
 
@@ -1474,7 +1565,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                        bio_wouldblock_error(bio);
                        return;
                }
-               raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
+               mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked",
+                               blocked_rdev->raid_disk);
                md_wait_for_blocked_rdev(blocked_rdev, mddev);
                wait_barrier(conf, bio->bi_iter.bi_sector, false);
                goto retry_write;
@@ -1557,10 +1649,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                mbio->bi_private = r1_bio;
 
                atomic_inc(&r1_bio->remaining);
-
-               if (mddev->gendisk)
-                       trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
-                                             r1_bio->sector);
+               mddev_trace_remap(mddev, mbio, r1_bio->sector);
                /* flush_pending_writes() needs access to the rdev so...*/
                mbio->bi_bdev = (void *)rdev;
                if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
@@ -1760,6 +1849,52 @@ static int raid1_spare_active(struct mddev *mddev)
        return count;
 }
 
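+/*
+ * Install @rdev in the mirrors array at slot @disk (or its replacement
+ * slot), keeping conf->nonrot_disks in sync. Returns false if the slot is
+ * already occupied.
+ */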
+static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
+                          bool replacement)
+{
+       struct raid1_info *info = conf->mirrors + disk;
+
+       if (replacement)
+               info += conf->raid_disks;
+
+       if (info->rdev)
+               return false;
+
+       if (bdev_nonrot(rdev->bdev)) {
+               set_bit(Nonrot, &rdev->flags);
+               WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
+       }
+
+       rdev->raid_disk = disk;
+       info->head_position = 0;
+       info->seq_start = MaxSector;
+       WRITE_ONCE(info->rdev, rdev);
+
+       return true;
+}
+
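+/*
+ * Detach the rdev in slot @disk. Fails if the device is still in_sync, has
+ * pending IO, or is non-faulty while recovery is still possible; on success
+ * conf->nonrot_disks is kept in sync.
+ */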
+static bool raid1_remove_conf(struct r1conf *conf, int disk)
+{
+       struct raid1_info *info = conf->mirrors + disk;
+       struct md_rdev *rdev = info->rdev;
+
+       if (!rdev || test_bit(In_sync, &rdev->flags) ||
+           atomic_read(&rdev->nr_pending))
+               return false;
+
+       /* Only remove non-faulty devices if recovery is not possible. */
+       if (!test_bit(Faulty, &rdev->flags) &&
+           rdev->mddev->recovery_disabled != conf->recovery_disabled &&
+           rdev->mddev->degraded < conf->raid_disks)
+               return false;
+
+       if (test_and_clear_bit(Nonrot, &rdev->flags))
+               WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);
+
+       WRITE_ONCE(info->rdev, NULL);
+       return true;
+}
+
 static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
        struct r1conf *conf = mddev->private;
@@ -1791,19 +1926,16 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
        for (mirror = first; mirror <= last; mirror++) {
                p = conf->mirrors + mirror;
                if (!p->rdev) {
-                       if (mddev->gendisk)
-                               disk_stack_limits(mddev->gendisk, rdev->bdev,
-                                                 rdev->data_offset << 9);
+                       err = mddev_stack_new_rdev(mddev, rdev);
+                       if (err)
+                               return err;
 
-                       p->head_position = 0;
-                       rdev->raid_disk = mirror;
-                       err = 0;
+                       raid1_add_conf(conf, rdev, mirror, false);
                        /* As all devices are equivalent, we don't need a full recovery
                         * if this disk was recently a member of the array.
                         */
                        if (rdev->saved_raid_disk < 0)
                                conf->fullsync = 1;
-                       WRITE_ONCE(p->rdev, rdev);
                        break;
                }
                if (test_bit(WantReplacement, &p->rdev->flags) &&
@@ -1813,13 +1945,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 
        if (err && repl_slot >= 0) {
                /* Add this device as a replacement */
-               p = conf->mirrors + repl_slot;
                clear_bit(In_sync, &rdev->flags);
                set_bit(Replacement, &rdev->flags);
-               rdev->raid_disk = repl_slot;
+               raid1_add_conf(conf, rdev, repl_slot, true);
                err = 0;
                conf->fullsync = 1;
-               WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
        }
 
        print_conf(conf);
@@ -1836,27 +1966,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
        if (unlikely(number >= conf->raid_disks))
                goto abort;
 
-       if (rdev != p->rdev)
-               p = conf->mirrors + conf->raid_disks + number;
+       if (rdev != p->rdev) {
+               number += conf->raid_disks;
+               p = conf->mirrors + number;
+       }
 
        print_conf(conf);
        if (rdev == p->rdev) {
-               if (test_bit(In_sync, &rdev->flags) ||
-                   atomic_read(&rdev->nr_pending)) {
+               if (!raid1_remove_conf(conf, number)) {
                        err = -EBUSY;
                        goto abort;
                }
-               /* Only remove non-faulty devices if recovery
-                * is not possible.
-                */
-               if (!test_bit(Faulty, &rdev->flags) &&
-                   mddev->recovery_disabled != conf->recovery_disabled &&
-                   mddev->degraded < conf->raid_disks) {
-                       err = -EBUSY;
-                       goto abort;
-               }
-               WRITE_ONCE(p->rdev, NULL);
-               if (conf->mirrors[conf->raid_disks + number].rdev) {
+
+               if (number < conf->raid_disks &&
+                   conf->mirrors[conf->raid_disks + number].rdev) {
                        /* We just removed a device that is being replaced.
                         * Move down the replacement.  We drain all IO before
                         * doing this to avoid confusion.
@@ -1944,8 +2067,6 @@ static void end_sync_write(struct bio *bio)
        struct r1bio *r1_bio = get_resync_r1bio(bio);
        struct mddev *mddev = r1_bio->mddev;
        struct r1conf *conf = mddev->private;
-       sector_t first_bad;
-       int bad_sectors;
        struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
 
        if (!uptodate) {
@@ -1955,14 +2076,11 @@ static void end_sync_write(struct bio *bio)
                        set_bit(MD_RECOVERY_NEEDED, &
                                mddev->recovery);
                set_bit(R1BIO_WriteError, &r1_bio->state);
-       } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
-                              &first_bad, &bad_sectors) &&
-                  !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
-                               r1_bio->sector,
-                               r1_bio->sectors,
-                               &first_bad, &bad_sectors)
-               )
+       } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
+                  !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev,
+                                     r1_bio->sector, r1_bio->sectors)) {
                set_bit(R1BIO_MadeGood, &r1_bio->state);
+       }
 
        put_sync_write_buf(r1_bio, uptodate);
 }
@@ -2279,16 +2397,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
                        s = PAGE_SIZE >> 9;
 
                do {
-                       sector_t first_bad;
-                       int bad_sectors;
-
                        rdev = conf->mirrors[d].rdev;
                        if (rdev &&
                            (test_bit(In_sync, &rdev->flags) ||
                             (!test_bit(Faulty, &rdev->flags) &&
                              rdev->recovery_offset >= sect + s)) &&
-                           is_badblock(rdev, sect, s,
-                                       &first_bad, &bad_sectors) == 0) {
+                           rdev_has_badblock(rdev, sect, s) == 0) {
                                atomic_inc(&rdev->nr_pending);
                                if (sync_page_io(rdev, sect, s<<9,
                                         conf->tmppage, REQ_OP_READ, false))
@@ -3006,23 +3120,17 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 
        err = -EINVAL;
        spin_lock_init(&conf->device_lock);
+       conf->raid_disks = mddev->raid_disks;
        rdev_for_each(rdev, mddev) {
                int disk_idx = rdev->raid_disk;
-               if (disk_idx >= mddev->raid_disks
-                   || disk_idx < 0)
+
+               if (disk_idx >= conf->raid_disks || disk_idx < 0)
                        continue;
-               if (test_bit(Replacement, &rdev->flags))
-                       disk = conf->mirrors + mddev->raid_disks + disk_idx;
-               else
-                       disk = conf->mirrors + disk_idx;
 
-               if (disk->rdev)
+               if (!raid1_add_conf(conf, rdev, disk_idx,
+                                   test_bit(Replacement, &rdev->flags)))
                        goto abort;
-               disk->rdev = rdev;
-               disk->head_position = 0;
-               disk->seq_start = MaxSector;
        }
-       conf->raid_disks = mddev->raid_disks;
        conf->mddev = mddev;
        INIT_LIST_HEAD(&conf->retry_list);
        INIT_LIST_HEAD(&conf->bio_end_io_list);
@@ -3086,12 +3194,21 @@ static struct r1conf *setup_conf(struct mddev *mddev)
        return ERR_PTR(err);
 }
 
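+/*
+ * Stack the queue limits of the member devices onto the array's queue;
+ * write zeroes is not supported on raid1.
+ */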
+static int raid1_set_limits(struct mddev *mddev)
+{
+       struct queue_limits lim;
+
+       blk_set_stacking_limits(&lim);
+       lim.max_write_zeroes_sectors = 0;
+       mddev_stack_rdev_limits(mddev, &lim);
+       return queue_limits_set(mddev->gendisk->queue, &lim);
+}
+
 static void raid1_free(struct mddev *mddev, void *priv);
 static int raid1_run(struct mddev *mddev)
 {
        struct r1conf *conf;
        int i;
-       struct md_rdev *rdev;
        int ret;
 
        if (mddev->level != 1) {
@@ -3118,14 +3235,10 @@ static int raid1_run(struct mddev *mddev)
        if (IS_ERR(conf))
                return PTR_ERR(conf);
 
-       if (mddev->queue)
-               blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
-
-       rdev_for_each(rdev, mddev) {
-               if (!mddev->gendisk)
-                       continue;
-               disk_stack_limits(mddev->gendisk, rdev->bdev,
-                                 rdev->data_offset << 9);
+       if (!mddev_is_dm(mddev)) {
+               ret = raid1_set_limits(mddev);
+               if (ret)
+                       goto abort;
        }
 
        mddev->degraded = 0;