Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 26 Oct 2018 20:00:44 +0000 (13:00 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 26 Oct 2018 20:00:44 +0000 (13:00 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Oct 2018 20:00:44 +0000 (13:00 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Oct 2018 20:00:44 +0000 (13:00 -0700)
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c

index 2fc8c113977fbec7877220e034dd34561399741b..1cd4f991792cf138448584b1e5328d3a1a73ff1e 100644 (file)
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2288,9 +2288,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
                         goto out;
                 }
                 if (mddev->pers) {
-                       mddev->pers->quiesce(mddev, 1);
+                       mddev_suspend(mddev);
                         md_bitmap_destroy(mddev);
-                       mddev->pers->quiesce(mddev, 0);
+                       mddev_resume(mddev);
                 }
                 mddev->bitmap_info.offset = 0;
                 if (mddev->bitmap_info.file) {
@@ -2327,8 +2327,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
                         mddev->bitmap_info.offset = offset;
                         if (mddev->pers) {
                                 struct bitmap *bitmap;
-                               mddev->pers->quiesce(mddev, 1);
                                 bitmap = md_bitmap_create(mddev, -1);
+                               mddev_suspend(mddev);
                                 if (IS_ERR(bitmap))
                                         rv = PTR_ERR(bitmap);
                                 else {
@@ -2337,11 +2337,12 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
                                         if (rv)
                                                 mddev->bitmap_info.offset = 0;
                                 }
-                               mddev->pers->quiesce(mddev, 0);
                                 if (rv) {
                                         md_bitmap_destroy(mddev);
+                                       mddev_resume(mddev);
                                         goto out;
                                 }
+                               mddev_resume(mddev);
                         }
                 }
         }
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c

index 0b2af6e74fc375ed163824fa9cdaf84d1b9ffd95..8dff19d5502eff6ff0a3f586f4db255bb743b431 100644 (file)
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -33,13 +33,6 @@ struct dlm_lock_resource {
         int mode;
  };
  
-struct suspend_info {
-       int slot;
-       sector_t lo;
-       sector_t hi;
-       struct list_head list;
-};
-
  struct resync_info {
         __le64 lo;
         __le64 hi;
@@ -80,7 +73,13 @@ struct md_cluster_info {
         struct dlm_lock_resource **other_bitmap_lockres;
         struct dlm_lock_resource *resync_lockres;
         struct list_head suspend_list;
+
         spinlock_t suspend_lock;
+       /* record the region in which writes should be suspended */
+       sector_t suspend_lo;
+       sector_t suspend_hi;
+       int suspend_from; /* the slot which broadcast suspend_lo/hi */
+
         struct md_thread *recovery_thread;
         unsigned long recovery_map;
         /* communication loc resources */
@@ -105,6 +104,7 @@ enum msg_type {
         RE_ADD,
         BITMAP_NEEDS_SYNC,
         CHANGE_CAPACITY,
+       BITMAP_RESIZE,
  };
  
  struct cluster_msg {
@@ -270,25 +270,22 @@ static void add_resync_info(struct dlm_lock_resource *lockres,
         ri->hi = cpu_to_le64(hi);
  }
  
-static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
+static int read_resync_info(struct mddev *mddev,
+                           struct dlm_lock_resource *lockres)
  {
         struct resync_info ri;
-       struct suspend_info *s = NULL;
-       sector_t hi = 0;
+       struct md_cluster_info *cinfo = mddev->cluster_info;
+       int ret = 0;
  
         dlm_lock_sync(lockres, DLM_LOCK_CR);
         memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
-       hi = le64_to_cpu(ri.hi);
-       if (hi > 0) {
-               s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
-               if (!s)
-                       goto out;
-               s->hi = hi;
-               s->lo = le64_to_cpu(ri.lo);
+       if (le64_to_cpu(ri.hi) > 0) {
+               cinfo->suspend_hi = le64_to_cpu(ri.hi);
+               cinfo->suspend_lo = le64_to_cpu(ri.lo);
+               ret = 1;
         }
         dlm_unlock_sync(lockres);
-out:
-       return s;
+       return ret;
  }
  
  static void recover_bitmaps(struct md_thread *thread)
@@ -298,7 +295,6 @@ static void recover_bitmaps(struct md_thread *thread)
         struct dlm_lock_resource *bm_lockres;
         char str[64];
         int slot, ret;
-       struct suspend_info *s, *tmp;
         sector_t lo, hi;
  
         while (cinfo->recovery_map) {
@@ -325,13 +321,17 @@ static void recover_bitmaps(struct md_thread *thread)
  
                 /* Clear suspend_area associated with the bitmap */
                 spin_lock_irq(&cinfo->suspend_lock);
-               list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
-                       if (slot == s->slot) {
-                               list_del(&s->list);
-                               kfree(s);
-                       }
+               cinfo->suspend_hi = 0;
+               cinfo->suspend_lo = 0;
+               cinfo->suspend_from = -1;
                 spin_unlock_irq(&cinfo->suspend_lock);
  
+               /* Kick off a reshape if needed */
+               if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+                   test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+                   mddev->reshape_position != MaxSector)
+                       md_wakeup_thread(mddev->sync_thread);
+
                 if (hi > 0) {
                         if (lo < mddev->recovery_cp)
                                 mddev->recovery_cp = lo;
@@ -434,34 +434,23 @@ static void ack_bast(void *arg, int mode)
         }
  }
  
-static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
-{
-       struct suspend_info *s, *tmp;
-
-       list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
-               if (slot == s->slot) {
-                       list_del(&s->list);
-                       kfree(s);
-                       break;
-               }
-}
-
  static void remove_suspend_info(struct mddev *mddev, int slot)
  {
         struct md_cluster_info *cinfo = mddev->cluster_info;
         mddev->pers->quiesce(mddev, 1);
         spin_lock_irq(&cinfo->suspend_lock);
-       __remove_suspend_info(cinfo, slot);
+       cinfo->suspend_hi = 0;
+       cinfo->suspend_lo = 0;
         spin_unlock_irq(&cinfo->suspend_lock);
         mddev->pers->quiesce(mddev, 0);
  }
  
-
  static void process_suspend_info(struct mddev *mddev,
                 int slot, sector_t lo, sector_t hi)
  {
         struct md_cluster_info *cinfo = mddev->cluster_info;
-       struct suspend_info *s;
+       struct mdp_superblock_1 *sb = NULL;
+       struct md_rdev *rdev;
  
         if (!hi) {
                 /*
@@ -475,6 +464,12 @@ static void process_suspend_info(struct mddev *mddev,
                 return;
         }
  
+       rdev_for_each(rdev, mddev)
+               if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
+                       sb = page_address(rdev->sb_page);
+                       break;
+               }
+
         /*
          * The bitmaps are not same for different nodes
          * if RESYNCING is happening in one node, then
@@ -487,26 +482,26 @@ static void process_suspend_info(struct mddev *mddev,
          * sync_low/hi is used to record the region which
          * arrived in the previous RESYNCING message,
          *
-        * Call bitmap_sync_with_cluster to clear
-        * NEEDED_MASK and set RESYNC_MASK since
-        * resync thread is running in another node,
-        * so we don't need to do the resync again
-        * with the same section */
-       md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, cinfo->sync_hi, lo, hi);
+        * Call md_bitmap_sync_with_cluster to clear NEEDED_MASK
+        * and set RESYNC_MASK since  resync thread is running
+        * in another node, so we don't need to do the resync
+        * again with the same section.
+        *
+        * Skip md_bitmap_sync_with_cluster while a reshape is
+        * happening, because the reshaping region is small and
+        * we don't want to trigger lots of WARNs.
+        */
+       if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE))
+               md_bitmap_sync_with_cluster(mddev, cinfo->sync_low,
+                                           cinfo->sync_hi, lo, hi);
         cinfo->sync_low = lo;
         cinfo->sync_hi = hi;
  
-       s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
-       if (!s)
-               return;
-       s->slot = slot;
-       s->lo = lo;
-       s->hi = hi;
         mddev->pers->quiesce(mddev, 1);
         spin_lock_irq(&cinfo->suspend_lock);
-       /* Remove existing entry (if exists) before adding */
-       __remove_suspend_info(cinfo, slot);
-       list_add(&s->list, &cinfo->suspend_list);
+       cinfo->suspend_from = slot;
+       cinfo->suspend_lo = lo;
+       cinfo->suspend_hi = hi;
         spin_unlock_irq(&cinfo->suspend_lock);
         mddev->pers->quiesce(mddev, 0);
  }
@@ -612,6 +607,11 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
         case BITMAP_NEEDS_SYNC:
                 __recover_slot(mddev, le32_to_cpu(msg->slot));
                 break;
+       case BITMAP_RESIZE:
+               if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0))
+                       ret = md_bitmap_resize(mddev->bitmap,
+                                           le64_to_cpu(msg->high), 0, 0);
+               break;
         default:
                 ret = -1;
                 pr_warn("%s:%d Received unknown message from %d\n",
@@ -800,7 +800,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
         struct md_cluster_info *cinfo = mddev->cluster_info;
         int i, ret = 0;
         struct dlm_lock_resource *bm_lockres;
-       struct suspend_info *s;
         char str[64];
         sector_t lo, hi;
  
@@ -819,16 +818,13 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
                 bm_lockres->flags |= DLM_LKF_NOQUEUE;
                 ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
                 if (ret == -EAGAIN) {
-                       s = read_resync_info(mddev, bm_lockres);
-                       if (s) {
+                       if (read_resync_info(mddev, bm_lockres)) {
                                 pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
                                                 __func__, __LINE__,
-                                               (unsigned long long) s->lo,
-                                               (unsigned long long) s->hi, i);
-                               spin_lock_irq(&cinfo->suspend_lock);
-                               s->slot = i;
-                               list_add(&s->list, &cinfo->suspend_list);
-                               spin_unlock_irq(&cinfo->suspend_lock);
+                                       (unsigned long long) cinfo->suspend_lo,
+                                       (unsigned long long) cinfo->suspend_hi,
+                                       i);
+                               cinfo->suspend_from = i;
                         }
                         ret = 0;
                         lockres_free(bm_lockres);
@@ -1001,10 +997,17 @@ static int leave(struct mddev *mddev)
         if (!cinfo)
                 return 0;
  
-       /* BITMAP_NEEDS_SYNC message should be sent when node
+       /*
+        * BITMAP_NEEDS_SYNC message should be sent when node
          * is leaving the cluster with dirty bitmap, also we
-        * can only deliver it when dlm connection is available */
-       if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
+        * can only deliver it when dlm connection is available.
+        *
+        * Also, we should send BITMAP_NEEDS_SYNC message in
+        * case reshaping is interrupted.
+        */
+       if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) ||
+           (mddev->reshape_position != MaxSector &&
+            test_bit(MD_CLOSING, &mddev->flags)))
                 resync_bitmap(mddev);
  
         set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
@@ -1102,6 +1105,80 @@ static void metadata_update_cancel(struct mddev *mddev)
         unlock_comm(cinfo);
  }
  
+/* Send a BITMAP_RESIZE cluster message carrying @size; returns sendmsg()'s result. */
+static int update_bitmap_size(struct mddev *mddev, sector_t size)
+{
+       struct cluster_msg msg = {0};
+       int err;
+
+       msg.high = cpu_to_le64(size);
+       msg.type = cpu_to_le32(BITMAP_RESIZE);
+       err = sendmsg(mddev->cluster_info, &msg, 0);
+       if (err)
+               pr_err("%s:%d: failed to send BITMAP_RESIZE message (%d)\n",
+                       __func__, __LINE__, err);
+       return err;
+}
+
+/*
+ * Broadcast @newsize with BITMAP_RESIZE, then give every other slot whose
+ * bitmap lock we can take our page count. On failure re-broadcast @oldsize.
+ * Returns 0 on success, update_bitmap_size()'s error, or -1.
+ */
+static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize)
+{
+       char str[64];
+       struct dlm_lock_resource *bm_lockres;
+       struct bitmap *bitmap = NULL;
+       unsigned long my_pages = mddev->bitmap->counts.pages;
+       int i, rv;
+
+       /* all nodes must be able to grow their bitmap before reshaping */
+       rv = update_bitmap_size(mddev, newsize);
+       if (rv)
+               return rv;
+
+       for (i = 0; i < mddev->bitmap_info.nodes; i++) {
+               if (i == md_cluster_ops->slot_number(mddev))
+                       continue;
+
+               bitmap = get_bitmap_from_slot(mddev, i);
+               if (IS_ERR(bitmap)) {
+                       pr_err("can't get bitmap from slot %d\n", i);
+                       /* never hand an ERR_PTR to md_bitmap_free() */
+                       bitmap = NULL;
+                       goto out;
+               }
+               /*
+                * If we can hold the bitmap lock of one node then
+                * the slot is not occupied, update the pages.
+                */
+               snprintf(str, sizeof(str), "bitmap%04d", i);
+               bm_lockres = lockres_init(mddev, str, NULL, 1);
+               if (!bm_lockres) {
+                       pr_err("Cannot initialize %s lock\n", str);
+                       goto out;
+               }
+               bm_lockres->flags |= DLM_LKF_NOQUEUE;
+               rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+               if (!rv)
+                       bitmap->counts.pages = my_pages;
+               lockres_free(bm_lockres);
+
+               /* revert the size if one node can't resize its bitmap */
+               if (my_pages != bitmap->counts.pages)
+                       goto out;
+               /* done with this slot's bitmap; free it before the next one */
+               md_bitmap_free(bitmap);
+       }
+
+       return 0;
+out:
+       md_bitmap_free(bitmap);
+       update_bitmap_size(mddev, oldsize);
+       return -1;
+}
+
  /*
   * return 0 if all the bitmaps have the same sync_size
   */
@@ -1243,6 +1320,16 @@ static int resync_start(struct mddev *mddev)
         return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev);
  }
  
+static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi)
+{
+       struct md_cluster_info *info = mddev->cluster_info;
+
+       spin_lock_irq(&info->suspend_lock);     /* held while copying lo/hi */
+       *hi = info->suspend_hi;
+       *lo = info->suspend_lo;
+       spin_unlock_irq(&info->suspend_lock);
+}
+
  static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
  {
         struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1295,21 +1382,14 @@ static int area_resyncing(struct mddev *mddev, int direction,
  {
         struct md_cluster_info *cinfo = mddev->cluster_info;
         int ret = 0;
-       struct suspend_info *s;
  
         if ((direction == READ) &&
                 test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
                 return 1;
  
         spin_lock_irq(&cinfo->suspend_lock);
-       if (list_empty(&cinfo->suspend_list))
-               goto out;
-       list_for_each_entry(s, &cinfo->suspend_list, list)
-               if (hi > s->lo && lo < s->hi) {
-                       ret = 1;
-                       break;
-               }
-out:
+       if (hi > cinfo->suspend_lo && lo < cinfo->suspend_hi)
+               ret = 1;
         spin_unlock_irq(&cinfo->suspend_lock);
         return ret;
  }
@@ -1482,6 +1562,7 @@ static struct md_cluster_operations cluster_ops = {
         .resync_start = resync_start,
         .resync_finish = resync_finish,
         .resync_info_update = resync_info_update,
+       .resync_info_get = resync_info_get,
         .metadata_update_start = metadata_update_start,
         .metadata_update_finish = metadata_update_finish,
         .metadata_update_cancel = metadata_update_cancel,
@@ -1492,6 +1573,7 @@ static struct md_cluster_operations cluster_ops = {
         .remove_disk = remove_disk,
         .load_bitmaps = load_bitmaps,
         .gather_bitmaps = gather_bitmaps,
+       .resize_bitmaps = resize_bitmaps,
         .lock_all_bitmaps = lock_all_bitmaps,
         .unlock_all_bitmaps = unlock_all_bitmaps,
         .update_size = update_size,
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h

index c0240708f44332c3fbb8e37f32bbd3bd84eef926..a78e3021775d5a1cfba0246713db2a876b66899b 100644 (file)
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -14,6 +14,7 @@ struct md_cluster_operations {
         int (*leave)(struct mddev *mddev);
         int (*slot_number)(struct mddev *mddev);
         int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
+       void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi);
         int (*metadata_update_start)(struct mddev *mddev);
         int (*metadata_update_finish)(struct mddev *mddev);
         void (*metadata_update_cancel)(struct mddev *mddev);
@@ -26,6 +27,7 @@ struct md_cluster_operations {
         int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
         void (*load_bitmaps)(struct mddev *mddev, int total_slots);
         int (*gather_bitmaps)(struct md_rdev *rdev);
+       int (*resize_bitmaps)(struct mddev *mddev, sector_t newsize, sector_t oldsize);
         int (*lock_all_bitmaps)(struct mddev *mddev);
         void (*unlock_all_bitmaps)(struct mddev *mddev);
         void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
diff --git a/drivers/md/md.c b/drivers/md/md.c

index 63ceabb4e020f656313fabff5e25e3ba593a8786..fc488cb30a94780d8a4fee0674d718aa901b047f 100644 (file)
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -452,10 +452,11 @@ static void md_end_flush(struct bio *fbio)
         rdev_dec_pending(rdev, mddev);
  
         if (atomic_dec_and_test(&fi->flush_pending)) {
-               if (bio->bi_iter.bi_size == 0)
+               if (bio->bi_iter.bi_size == 0) {
                         /* an empty barrier - all done */
                         bio_endio(bio);
-               else {
+                       mempool_free(fi, mddev->flush_pool);
+               } else {
                         INIT_WORK(&fi->flush_work, submit_flushes);
                         queue_work(md_wq, &fi->flush_work);
                 }
@@ -509,10 +510,11 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
         rcu_read_unlock();
  
         if (atomic_dec_and_test(&fi->flush_pending)) {
-               if (bio->bi_iter.bi_size == 0)
+               if (bio->bi_iter.bi_size == 0) {
                         /* an empty barrier - all done */
                         bio_endio(bio);
-               else {
+                       mempool_free(fi, mddev->flush_pool);
+               } else {
                         INIT_WORK(&fi->flush_work, submit_flushes);
                         queue_work(md_wq, &fi->flush_work);
                 }
@@ -5904,14 +5906,6 @@ static void __md_stop(struct mddev *mddev)
                 mddev->to_remove = &md_redundancy_group;
         module_put(pers->owner);
         clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-}
-
-void md_stop(struct mddev *mddev)
-{
-       /* stop the array and free an attached data structures.
-        * This is called from dm-raid
-        */
-       __md_stop(mddev);
         if (mddev->flush_bio_pool) {
                 mempool_destroy(mddev->flush_bio_pool);
                 mddev->flush_bio_pool = NULL;
@@ -5920,6 +5914,14 @@ void md_stop(struct mddev *mddev)
                 mempool_destroy(mddev->flush_pool);
                 mddev->flush_pool = NULL;
         }
+}
+
+void md_stop(struct mddev *mddev)
+{
+       /* stop the array and free an attached data structures.
+        * This is called from dm-raid
+        */
+       __md_stop(mddev);
         bioset_exit(&mddev->bio_set);
         bioset_exit(&mddev->sync_set);
  }
@@ -8370,9 +8372,17 @@ void md_do_sync(struct md_thread *thread)
                 else if (!mddev->bitmap)
                         j = mddev->recovery_cp;
  
-       } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+       } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
                 max_sectors = mddev->resync_max_sectors;
-       else {
+               /*
+                * If the original node aborts reshaping then we continue the
+                * reshaping, so set j again to avoid restarting the reshape
+                * from the very beginning.
+                */
+               if (mddev_is_clustered(mddev) &&
+                   mddev->reshape_position != MaxSector)
+                       j = mddev->reshape_position;
+       } else {
                 /* recovery follows the physical size of devices */
                 max_sectors = mddev->dev_sectors;
                 j = MaxSector;
@@ -8623,8 +8633,10 @@ void md_do_sync(struct md_thread *thread)
                 mddev_lock_nointr(mddev);
                 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
                 mddev_unlock(mddev);
-               set_capacity(mddev->gendisk, mddev->array_sectors);
-               revalidate_disk(mddev->gendisk);
+               if (!mddev_is_clustered(mddev)) {
+                       set_capacity(mddev->gendisk, mddev->array_sectors);
+                       revalidate_disk(mddev->gendisk);
+               }
         }
  
         spin_lock(&mddev->lock);
@@ -8790,6 +8802,18 @@ static void md_start_sync(struct work_struct *ws)
   */
  void md_check_recovery(struct mddev *mddev)
  {
+       if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
+               /* Write superblock - thread that called mddev_suspend()
+                * holds reconfig_mutex for us.
+                */
+               set_bit(MD_UPDATING_SB, &mddev->flags);
+               smp_mb__after_atomic();
+               if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
+                       md_update_sb(mddev, 0);
+               clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
+               wake_up(&mddev->sb_wait);
+       }
+
         if (mddev->suspended)
                 return;
  
@@ -8949,16 +8973,6 @@ void md_check_recovery(struct mddev *mddev)
         unlock:
                 wake_up(&mddev->sb_wait);
                 mddev_unlock(mddev);
-       } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
-               /* Write superblock - thread that called mddev_suspend()
-                * holds reconfig_mutex for us.
-                */
-               set_bit(MD_UPDATING_SB, &mddev->flags);
-               smp_mb__after_atomic();
-               if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
-                       md_update_sb(mddev, 0);
-               clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
-               wake_up(&mddev->sb_wait);
         }
  }
  EXPORT_SYMBOL(md_check_recovery);
@@ -8966,6 +8980,8 @@ EXPORT_SYMBOL(md_check_recovery);
  void md_reap_sync_thread(struct mddev *mddev)
  {
         struct md_rdev *rdev;
+       sector_t old_dev_sectors = mddev->dev_sectors;
+       bool is_reshaped = false;
  
         /* resync has finished, collect result */
         md_unregister_thread(&mddev->sync_thread);
@@ -8980,8 +8996,11 @@ void md_reap_sync_thread(struct mddev *mddev)
                 }
         }
         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-           mddev->pers->finish_reshape)
+           mddev->pers->finish_reshape) {
                 mddev->pers->finish_reshape(mddev);
+               if (mddev_is_clustered(mddev))
+                       is_reshaped = true;
+       }
  
         /* If array is no-longer degraded, then any saved_raid_disk
          * information must be scrapped.
@@ -9002,6 +9021,14 @@ void md_reap_sync_thread(struct mddev *mddev)
         clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
         clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+       /*
+        * We call md_cluster_ops->update_size here because sync_size could
+        * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
+        * so it is time to update size across cluster.
+        */
+       if (mddev_is_clustered(mddev) && is_reshaped
+                                     && !test_bit(MD_CLOSING, &mddev->flags))
+               md_cluster_ops->update_size(mddev, old_dev_sectors);
         wake_up(&resync_wait);
         /* flag recovery needed just to double check */
         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -9201,8 +9228,12 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
                 }
  
                 if (role != rdev2->raid_disk) {
-                       /* got activated */
-                       if (rdev2->raid_disk == -1 && role != 0xffff) {
+                       /*
+                        * got activated, except while a reshape is happening.
+                        */
+                       if (rdev2->raid_disk == -1 && role != 0xffff &&
+                           !(le32_to_cpu(sb->feature_map) &
+                             MD_FEATURE_RESHAPE_ACTIVE)) {
                                 rdev2->saved_raid_disk = role;
                                 ret = remove_and_add_spares(mddev, rdev2);
                                 pr_info("Activated spare: %s\n",
@@ -9228,6 +9259,30 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
         if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
                 update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
  
+       /*
+        * mddev->delta_disks has already been updated in update_raid_disks,
+        * so it is time to check reshape.
+        */
+       if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+           (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+               /*
+                * reshape is happening in the remote node, we need to
+                * update reshape_position and call start_reshape.
+                */
+               mddev->reshape_position = sb->reshape_position;
+               if (mddev->pers->update_reshape_pos)
+                       mddev->pers->update_reshape_pos(mddev);
+               if (mddev->pers->start_reshape)
+                       mddev->pers->start_reshape(mddev);
+       } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+                  mddev->reshape_position != MaxSector &&
+                  !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+               /* reshape is just done in another node. */
+               mddev->reshape_position = MaxSector;
+               if (mddev->pers->update_reshape_pos)
+                       mddev->pers->update_reshape_pos(mddev);
+       }
+
         /* Finally set the event to be up to date */
         mddev->events = le64_to_cpu(sb->events);
  }
diff --git a/drivers/md/md.h b/drivers/md/md.h

index 8afd6bfdbfb9b5934097b3d0b3378dbd0d53f778..c52afb52c77608aa749a3aa8d79f87e268eba21e 100644 (file)
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -557,6 +557,7 @@ struct md_personality
         int (*check_reshape) (struct mddev *mddev);
         int (*start_reshape) (struct mddev *mddev);
         void (*finish_reshape) (struct mddev *mddev);
+       void (*update_reshape_pos) (struct mddev *mddev);
         /* quiesce suspends or resumes internal processing.
          * 1 - stop new actions and wait for action io to complete
          * 0 - return to normal behaviour
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c

index 4e990246225eada686e4d77f657c515a13283a12..1d54109071cc87a0f95be9aec0801072389fcedf 100644 (file)
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1734,6 +1734,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
          */
         if (rdev->saved_raid_disk >= 0 &&
             rdev->saved_raid_disk >= first &&
+           rdev->saved_raid_disk < conf->raid_disks &&
             conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
                 first = last = rdev->saved_raid_disk;
  
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c

index d6f7978b4449e92aba522035941ff51137e2ce38..b98e746e7fc4fd05fb8c0eaf2118f9f6a4f778d3 100644 (file)
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -25,6 +25,7 @@
  #include <linux/seq_file.h>
  #include <linux/ratelimit.h>
  #include <linux/kthread.h>
+#include <linux/raid/md_p.h>
  #include <trace/events/block.h>
  #include "md.h"
  #include "raid10.h"
@@ -1808,6 +1809,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
                 first = last = rdev->raid_disk;
  
         if (rdev->saved_raid_disk >= first &&
+           rdev->saved_raid_disk < conf->geo.raid_disks &&
             conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
                 mirror = rdev->saved_raid_disk;
         else
@@ -3079,6 +3081,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
                         sector_t sect;
                         int must_sync;
                         int any_working;
+                       int need_recover = 0;
+                       int need_replace = 0;
                         struct raid10_info *mirror = &conf->mirrors[i];
                         struct md_rdev *mrdev, *mreplace;
  
@@ -3086,11 +3090,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
                         mrdev = rcu_dereference(mirror->rdev);
                         mreplace = rcu_dereference(mirror->replacement);
  
-                       if ((mrdev == NULL ||
-                            test_bit(Faulty, &mrdev->flags) ||
-                            test_bit(In_sync, &mrdev->flags)) &&
-                           (mreplace == NULL ||
-                            test_bit(Faulty, &mreplace->flags))) {
+                       if (mrdev != NULL &&
+                           !test_bit(Faulty, &mrdev->flags) &&
+                           !test_bit(In_sync, &mrdev->flags))
+                               need_recover = 1;
+                       if (mreplace != NULL &&
+                           !test_bit(Faulty, &mreplace->flags))
+                               need_replace = 1;
+
+                       if (!need_recover && !need_replace) {
                                 rcu_read_unlock();
                                 continue;
                         }
@@ -3213,7 +3221,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
                                 r10_bio->devs[1].devnum = i;
                                 r10_bio->devs[1].addr = to_addr;
  
-                               if (!test_bit(In_sync, &mrdev->flags)) {
+                               if (need_recover) {
                                         bio = r10_bio->devs[1].bio;
                                         bio->bi_next = biolist;
                                         biolist = bio;
@@ -3230,16 +3238,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
                                 bio = r10_bio->devs[1].repl_bio;
                                 if (bio)
                                         bio->bi_end_io = NULL;
-                               /* Note: if mreplace != NULL, then bio
+                               /* Note: if need_replace, then bio
                                  * cannot be NULL as r10buf_pool_alloc will
                                  * have allocated it.
-                                * So the second test here is pointless.
-                                * But it keeps semantic-checkers happy, and
-                                * this comment keeps human reviewers
-                                * happy.
                                  */
-                               if (mreplace == NULL || bio == NULL ||
-                                   test_bit(Faulty, &mreplace->flags))
+                               if (!need_replace)
                                         break;
                                 bio->bi_next = biolist;
                                 biolist = bio;
@@ -4286,12 +4289,46 @@ static int raid10_start_reshape(struct mddev *mddev)
         spin_unlock_irq(&conf->device_lock);
  
         if (mddev->delta_disks && mddev->bitmap) {
-               ret = md_bitmap_resize(mddev->bitmap,
-                                      raid10_size(mddev, 0, conf->geo.raid_disks),
-                                      0, 0);
+               struct mdp_superblock_1 *sb = NULL;
+               sector_t oldsize, newsize;
+
+               oldsize = raid10_size(mddev, 0, 0);
+               newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
+
+               if (!mddev_is_clustered(mddev)) {
+                       ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
+                       if (ret)
+                               goto abort;
+                       else
+                               goto out;
+               }
+
+               rdev_for_each(rdev, mddev) {
+                       if (rdev->raid_disk > -1 &&
+                           !test_bit(Faulty, &rdev->flags))
+                               sb = page_address(rdev->sb_page);
+               }
+
+               /*
+                * Some node is already performing the reshape, so there is
+                * no need to call md_bitmap_resize again; it should be called
+                * when the BITMAP_RESIZE message is received.
+                */
+               if ((sb && (le32_to_cpu(sb->feature_map) &
+                           MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
+                       goto out;
+
+               ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
                 if (ret)
                         goto abort;
+
+               ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
+               if (ret) {
+                       md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
+                       goto abort;
+               }
         }
+out:
         if (mddev->delta_disks > 0) {
                 rdev_for_each(rdev, mddev)
                         if (rdev->raid_disk < 0 &&
@@ -4568,6 +4605,32 @@ read_more:
         r10_bio->master_bio = read_bio;
         r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
  
+       /*
+        * Broadcast a RESYNC message to the other nodes so that none of
+        * them writes to this region, which avoids conflicts.
+        */
+       if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
+               struct mdp_superblock_1 *sb = NULL;
+               int sb_reshape_pos = 0;
+
+               conf->cluster_sync_low = sector_nr;
+               conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
+               sb = page_address(rdev->sb_page);
+               if (sb) {
+                       sb_reshape_pos = le64_to_cpu(sb->reshape_position);
+                       /*
+                        * Set cluster_sync_low again if the next address for
+                        * array reshape is less than it, since we can't update
+                        * cluster_sync_low until the reshape has finished.
+                        */
+                       if (sb_reshape_pos < conf->cluster_sync_low)
+                               conf->cluster_sync_low = sb_reshape_pos;
+               }
+
+               md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
+                                                         conf->cluster_sync_high);
+       }
+
         /* Now find the locations in the new layout */
         __raid10_find_phys(&conf->geo, r10_bio);
  
@@ -4719,6 +4782,19 @@ static void end_reshape(struct r10conf *conf)
         conf->fullsync = 0;
  }
  
+static void raid10_update_reshape_pos(struct mddev *mddev)
+{
+       struct r10conf *conf = mddev->private;
+       sector_t lo, hi;        /* presumably filled in by resync_info_get() */
+       /* Adopt mddev->reshape_position if it is in [lo, hi] or MaxSector. */
+       md_cluster_ops->resync_info_get(mddev, &lo, &hi);
+       if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
+           || mddev->reshape_position == MaxSector)
+               conf->reshape_progress = mddev->reshape_position;
+       else
+               WARN_ON_ONCE(1);        /* any other position: warn once */
+}
+
  static int handle_reshape_read_error(struct mddev *mddev,
                                      struct r10bio *r10_bio)
  {
@@ -4887,6 +4963,7 @@ static struct md_personality raid10_personality =
         .check_reshape  = raid10_check_reshape,
         .start_reshape  = raid10_start_reshape,
         .finish_reshape = raid10_finish_reshape,
+       .update_reshape_pos = raid10_update_reshape_pos,
         .congested      = raid10_congested,
  };
  
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c

index e6e925add7005786861ef1cb7f22d83d2d89df65..ec3a5ef7fee0b8c099e729d49ad4a0665eddad82 100644 (file)
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -3151,8 +3151,6 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
         set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
         return 0;
  
-       rcu_assign_pointer(conf->log, NULL);
-       md_unregister_thread(&log->reclaim_thread);
  reclaim_thread:
         mempool_exit(&log->meta_pool);
  out_mempool:
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index e4e98f47865def0449979058c6c7e51228b9b42b..4990f0319f6cf729165a2c55a73aaf7e9bf3f567 100644 (file)
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2681,6 +2681,18 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
         pr_debug("raid456: error called\n");
  
         spin_lock_irqsave(&conf->device_lock, flags);
+
+       if (test_bit(In_sync, &rdev->flags) &&
+           mddev->degraded == conf->max_degraded) {
+               /*
+                * Don't let the array reach the failed state.
+                * Don't try to recover this device.
+                */
+               conf->recovery_disabled = mddev->recovery_disabled;
+               spin_unlock_irqrestore(&conf->device_lock, flags);
+               return;
+       }
+
         set_bit(Faulty, &rdev->flags);
         clear_bit(In_sync, &rdev->flags);
         mddev->degraded = raid5_calc_degraded(conf);
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 26 Oct 2018 20:00:44 +0000 (13:00 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 26 Oct 2018 20:00:44 +0000 (13:00 -0700)
drivers/md/md-bitmap.c		patch \| blob \| blame \| history
drivers/md/md-cluster.c		patch \| blob \| blame \| history
drivers/md/md-cluster.h		patch \| blob \| blame \| history
drivers/md/md.c		patch \| blob \| blame \| history
drivers/md/md.h		patch \| blob \| blame \| history
drivers/md/raid1.c		patch \| blob \| blame \| history
drivers/md/raid10.c		patch \| blob \| blame \| history
drivers/md/raid5-cache.c		patch \| blob \| blame \| history
drivers/md/raid5.c		patch \| blob \| blame \| history