md: don't retry recovery of raid1 that fails due to error on source drive.
author NeilBrown <neilb@suse.de>
Thu, 8 Jan 2009 21:31:11 +0000 (08:31 +1100)
committer NeilBrown <neilb@suse.de>
Thu, 8 Jan 2009 21:31:11 +0000 (08:31 +1100)
If a raid1 has only one working drive and it has a sector which
gives an error on read, then an attempt to recover onto a spare will
fail, but as the single remaining drive is not removed from the
array, the recovery will be immediately re-attempted, resulting
in an infinite recovery loop.

So detect this situation and don't retry recovery once an error
on the lone remaining drive is detected.

Allow recovery to be retried once every time a spare is added
in case the problem wasn't actually a media error.

Signed-off-by: NeilBrown <neilb@suse.de>
drivers/md/md.c
drivers/md/raid1.c
include/linux/raid/md_k.h

index f5cbb9d2371a14dd3d316a81b6653cde0fa39e76..41e2509bf896c9356f9f9bd55f93f811120d7639 100644 (file)
@@ -1500,6 +1500,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 
        list_add_rcu(&rdev->same_set, &mddev->disks);
        bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
+
+       /* May as well allow recovery to be retried once */
+       mddev->recovery_disabled = 0;
        return 0;
 
  fail:
@@ -6175,7 +6178,7 @@ static int remove_and_add_spares(mddev_t *mddev)
                        }
                }
 
-       if (mddev->degraded && ! mddev->ro) {
+       if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
                list_for_each_entry(rdev, &mddev->disks, same_set) {
                        if (rdev->raid_disk >= 0 &&
                            !test_bit(In_sync, &rdev->flags) &&
index c165b1eed8bb37745d3d1284dc855eeb3b771c7a..7b4f5f7155d8726705c1f42757f68dd1de869624 100644 (file)
@@ -1016,12 +1016,16 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
         * else mark the drive as failed
         */
        if (test_bit(In_sync, &rdev->flags)
-           && (conf->raid_disks - mddev->degraded) == 1)
+           && (conf->raid_disks - mddev->degraded) == 1) {
                /*
                 * Don't fail the drive, act as though we were just a
-                * normal single drive
+                * normal single drive.
+                * However don't try a recovery from this drive as
+                * it is very likely to fail.
                 */
+               mddev->recovery_disabled = 1;
                return;
+       }
        if (test_and_clear_bit(In_sync, &rdev->flags)) {
                unsigned long flags;
                spin_lock_irqsave(&conf->device_lock, flags);
index dac4217194b88fe868ea05f790b73e400842efa0..9743e4dbc9188031b0c180584df4c5e6ba898513 100644 (file)
@@ -218,6 +218,9 @@ struct mddev_s
 #define        MD_RECOVERY_FROZEN      9
 
        unsigned long                   recovery;
+       int                             recovery_disabled; /* if we detect that recovery
+                                                           * will always fail, set this
+                                                           * so we don't loop trying */
 
        int                             in_sync;        /* know to not need resync */
        struct mutex                    reconfig_mutex;