md/raid1: record nonrot rdevs while adding/removing rdevs to conf
author Yu Kuai <yukuai3@huawei.com>
Thu, 29 Feb 2024 09:57:06 +0000 (17:57 +0800)
committer Song Liu <song@kernel.org>
Fri, 1 Mar 2024 06:49:45 +0000 (22:49 -0800)
For raid1, each read iterates over all the rdevs in conf and checks whether
any rdev is non-rotational; if so, it chooses the rdev with the fewest IOs
in flight, otherwise the rdev whose head position is closest to the read.

Disk nonrot info can be changed through sysfs entry:

/sys/block/[disk_name]/queue/rotational

However, this should only be used for testing, and users really shouldn't
change it in real life. Record the number of non-rotational disks in conf
when rdevs are added or removed, to avoid checking each rdev in the IO fast
path and to simplify read_balance() a little bit.

Co-developed-by: Paul Luse <paul.e.luse@linux.intel.com>
Signed-off-by: Paul Luse <paul.e.luse@linux.intel.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20240229095714.926789-4-yukuai1@huaweicloud.com
drivers/md/md.h
drivers/md/raid1.c
drivers/md/raid1.h

index a49ab04ab7074868499014a8ea0a652b113bbf04..b2076a165c1050d84084b7e1adbd7c6b6abab3e8 100644 (file)
@@ -207,6 +207,7 @@ enum flag_bits {
                                 * check if there is collision between raid1
                                 * serial bios.
                                 */
+       Nonrot,                 /* non-rotational device (SSD) */
 };
 
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
index 6ec9998f62573cab618ffab25eede66a68cc1bed..de6ea87d4d2491aa4f43c5dcdc05de6d7025925b 100644 (file)
@@ -599,7 +599,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
        int sectors;
        int best_good_sectors;
        int best_disk, best_dist_disk, best_pending_disk;
-       int has_nonrot_disk;
        int disk;
        sector_t best_dist;
        unsigned int min_pending;
@@ -620,7 +619,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
        best_pending_disk = -1;
        min_pending = UINT_MAX;
        best_good_sectors = 0;
-       has_nonrot_disk = 0;
        choose_next_idle = 0;
        clear_bit(R1BIO_FailFast, &r1_bio->state);
 
@@ -637,7 +635,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                sector_t first_bad;
                int bad_sectors;
                unsigned int pending;
-               bool nonrot;
 
                rdev = conf->mirrors[disk].rdev;
                if (r1_bio->bios[disk] == IO_BLOCKED
@@ -703,8 +700,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                        /* At least two disks to choose from so failfast is OK */
                        set_bit(R1BIO_FailFast, &r1_bio->state);
 
-               nonrot = bdev_nonrot(rdev->bdev);
-               has_nonrot_disk |= nonrot;
                pending = atomic_read(&rdev->nr_pending);
                dist = abs(this_sector - conf->mirrors[disk].head_position);
                if (choose_first) {
@@ -731,7 +726,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                         * small, but not a big deal since when the second disk
                         * starts IO, the first disk is likely still busy.
                         */
-                       if (nonrot && opt_iosize > 0 &&
+                       if (test_bit(Nonrot, &rdev->flags) && opt_iosize > 0 &&
                            mirror->seq_start != MaxSector &&
                            mirror->next_seq_sect > opt_iosize &&
                            mirror->next_seq_sect - opt_iosize >=
@@ -763,7 +758,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
         * mixed ratation/non-rotational disks depending on workload.
         */
        if (best_disk == -1) {
-               if (has_nonrot_disk || min_pending == 0)
+               if (READ_ONCE(conf->nonrot_disks) || min_pending == 0)
                        best_disk = best_pending_disk;
                else
                        best_disk = best_dist_disk;
@@ -1768,6 +1763,11 @@ static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
        if (info->rdev)
                return false;
 
+       if (bdev_nonrot(rdev->bdev)) {
+               set_bit(Nonrot, &rdev->flags);
+               WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
+       }
+
        rdev->raid_disk = disk;
        info->head_position = 0;
        info->seq_start = MaxSector;
@@ -1791,6 +1791,9 @@ static bool raid1_remove_conf(struct r1conf *conf, int disk)
            rdev->mddev->degraded < conf->raid_disks)
                return false;
 
+       if (test_and_clear_bit(Nonrot, &rdev->flags))
+               WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);
+
        WRITE_ONCE(info->rdev, NULL);
        return true;
 }
index 14d4211a123a8e4007689919d5f86092c7feb659..5300cbaa58a415a0c6f3f41078d59394418bcb3a 100644 (file)
@@ -71,6 +71,7 @@ struct r1conf {
                                                 * allow for replacements.
                                                 */
        int                     raid_disks;
+       int                     nonrot_disks;
 
        spinlock_t              device_lock;