Fix read-balancing during node failure
author Goldwyn Rodrigues <rgoldwyn@suse.com>
Wed, 24 Jun 2015 14:30:32 +0000 (09:30 -0500)
committer NeilBrown <neilb@suse.com>
Fri, 24 Jul 2015 03:37:59 +0000 (13:37 +1000)
During a node failure, we need to suspend read balancing so that reads
are directed to the first device and stale data is not read.
Suspending writes is not required because they will be recorded and
synced eventually.

A new flag, MD_CLUSTER_SUSPEND_READ_BALANCING, is set in recover_prep().
area_resyncing() returns true for the entire device if this flag is set
and the request type is READ. The flag is cleared in recover_done().
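
For illustration only (not part of the patch), a minimal stand-alone C
sketch of the logic described above; the function names mirror the kernel
code, but the plain int flag and the main() harness are simplified
stand-ins for cinfo->state and the DLM recovery callbacks:

    /* Stand-alone sketch, not kernel code: recover_prep() sets a flag,
     * area_resyncing() then reports every READ range as resyncing so
     * read_balance() falls back to the first device, and recover_done()
     * clears the flag again. */
    #include <stdio.h>

    #define READ  0
    #define WRITE 1

    /* models the MD_CLUSTER_SUSPEND_READ_BALANCING bit in cinfo->state */
    static int suspend_read_balancing;

    static void recover_prep(void)  { suspend_read_balancing = 1; }
    static void recover_done(void)  { suspend_read_balancing = 0; }

    static int area_resyncing(int direction, long lo, long hi)
    {
            /* While another node is being recovered, treat any READ range
             * as resyncing, regardless of lo/hi. */
            if (direction == READ && suspend_read_balancing)
                    return 1;
            /* (the real code then walks cinfo->suspend_list under a
             * spinlock to check WRITEs against suspended ranges) */
            return 0;
    }

    int main(void)
    {
            recover_prep();
            printf("read  resyncing? %d\n", area_resyncing(READ, 0, 8));  /* 1 */
            printf("write resyncing? %d\n", area_resyncing(WRITE, 0, 8)); /* 0 */
            recover_done();
            printf("read  resyncing? %d\n", area_resyncing(READ, 0, 8));  /* 0 */
            return 0;
    }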

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Reported-By: David Teigland <teigland@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.com>
drivers/md/md-cluster.c
drivers/md/md-cluster.h
drivers/md/raid1.c

index fcfc4b9b26728029e59023ae3762d271b8f7a676..0072190515e0f6edca1e09718dae5102b3e7d274 100644 (file)
@@ -44,6 +44,7 @@ struct resync_info {
 
 /* md_cluster_info flags */
 #define                MD_CLUSTER_WAITING_FOR_NEWDISK          1
+#define                MD_CLUSTER_SUSPEND_READ_BALANCING       2
 
 
 struct md_cluster_info {
@@ -275,6 +276,9 @@ clear_bit:
 
 static void recover_prep(void *arg)
 {
+       struct mddev *mddev = arg;
+       struct md_cluster_info *cinfo = mddev->cluster_info;
+       set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 }
 
 static void recover_slot(void *arg, struct dlm_slot *slot)
@@ -307,6 +311,7 @@ static void recover_done(void *arg, struct dlm_slot *slots,
 
        cinfo->slot_number = our_slot;
        complete(&cinfo->completion);
+       clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 }
 
 static const struct dlm_lockspace_ops md_ls_ops = {
@@ -816,12 +821,17 @@ static void resync_finish(struct mddev *mddev)
        resync_send(mddev, RESYNCING, 0, 0);
 }
 
-static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi)
+static int area_resyncing(struct mddev *mddev, int direction,
+               sector_t lo, sector_t hi)
 {
        struct md_cluster_info *cinfo = mddev->cluster_info;
        int ret = 0;
        struct suspend_info *s;
 
+       if ((direction == READ) &&
+               test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
+               return 1;
+
        spin_lock_irq(&cinfo->suspend_lock);
        if (list_empty(&cinfo->suspend_list))
                goto out;
index 6817ee00e053d7d7e74b734185414c0cb3b18a38..00defe2badbc7952ec7d4cf358ad0408a2460b7f 100644 (file)
@@ -18,7 +18,7 @@ struct md_cluster_operations {
        int (*metadata_update_start)(struct mddev *mddev);
        int (*metadata_update_finish)(struct mddev *mddev);
        int (*metadata_update_cancel)(struct mddev *mddev);
-       int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
+       int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
        int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
        int (*add_new_disk_finish)(struct mddev *mddev);
        int (*new_disk_ack)(struct mddev *mddev, bool ack);
index 50cf0c893b16851d83c0e9a3cbe58268c9b80ffc..94f5b55069e09610f21ea640f5dff3efd7e580ca 100644 (file)
@@ -541,7 +541,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 
        if ((conf->mddev->recovery_cp < this_sector + sectors) ||
            (mddev_is_clustered(conf->mddev) &&
-           md_cluster_ops->area_resyncing(conf->mddev, this_sector,
+           md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
                    this_sector + sectors)))
                choose_first = 1;
        else
@@ -1111,7 +1111,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
            ((bio_end_sector(bio) > mddev->suspend_lo &&
            bio->bi_iter.bi_sector < mddev->suspend_hi) ||
            (mddev_is_clustered(mddev) &&
-            md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
+            md_cluster_ops->area_resyncing(mddev, WRITE,
+                    bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
                /* As the suspend_* range is controlled by
                 * userspace, we want an interruptible
                 * wait.
@@ -1124,7 +1125,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
                        if (bio_end_sector(bio) <= mddev->suspend_lo ||
                            bio->bi_iter.bi_sector >= mddev->suspend_hi ||
                            (mddev_is_clustered(mddev) &&
-                            !md_cluster_ops->area_resyncing(mddev,
+                            !md_cluster_ops->area_resyncing(mddev, WRITE,
                                     bio->bi_iter.bi_sector, bio_end_sector(bio))))
                                break;
                        schedule();