md: set and test the ->persistent flag for md devices more consistently
[linux-2.6-block.git] / drivers / md / md.c
index c28a120b4161ea7324eae5160ed2a4c2fc7e0ab6..78fe3e97ff993293c1d018431c8f5bf1e2531f16 100644 (file)
@@ -275,6 +275,7 @@ static mddev_t * mddev_find(dev_t unit)
        spin_lock_init(&new->write_lock);
        init_waitqueue_head(&new->sb_wait);
        new->reshape_position = MaxSector;
+       new->resync_max = MaxSector;
 
        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
@@ -778,7 +779,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->major_version = 0;
                mddev->minor_version = sb->minor_version;
                mddev->patch_version = sb->patch_version;
-               mddev->persistent = ! sb->not_persistent;
+               mddev->external = 0;
                mddev->chunk_size = sb->chunk_size;
                mddev->ctime = sb->ctime;
                mddev->utime = sb->utime;
@@ -904,7 +905,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        sb->size  = mddev->size;
        sb->raid_disks = mddev->raid_disks;
        sb->md_minor = mddev->md_minor;
-       sb->not_persistent = !mddev->persistent;
+       sb->not_persistent = 0;
        sb->utime = mddev->utime;
        sb->state = 0;
        sb->events_hi = (mddev->events>>32);
@@ -1157,7 +1158,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
        if (mddev->raid_disks == 0) {
                mddev->major_version = 1;
                mddev->patch_version = 0;
-               mddev->persistent = 1;
+               mddev->external = 0;
                mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
                mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
                mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
@@ -1696,18 +1697,20 @@ repeat:
                MD_BUG();
                mddev->events --;
        }
-       sync_sbs(mddev, nospares);
 
        /*
         * do not write anything to disk if using
         * nonpersistent superblocks
         */
        if (!mddev->persistent) {
-               clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+               if (!mddev->external)
+                       clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+
                spin_unlock_irq(&mddev->write_lock);
                wake_up(&mddev->sb_wait);
                return;
        }
+       sync_sbs(mddev, nospares);
        spin_unlock_irq(&mddev->write_lock);
 
        dprintk(KERN_INFO 
@@ -1887,20 +1890,44 @@ static ssize_t
 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
        char *e;
+       int err;
+       char nm[20];
        int slot = simple_strtoul(buf, &e, 10);
        if (strncmp(buf, "none", 4)==0)
                slot = -1;
        else if (e==buf || (*e && *e!= '\n'))
                return -EINVAL;
-       if (rdev->mddev->pers)
-               /* Cannot set slot in active array (yet) */
-               return -EBUSY;
-       if (slot >= rdev->mddev->raid_disks)
-               return -ENOSPC;
-       rdev->raid_disk = slot;
-       /* assume it is working */
-       rdev->flags = 0;
-       set_bit(In_sync, &rdev->flags);
+       if (rdev->mddev->pers) {
+               /* Setting 'slot' on an active array requires also
+                * updating the 'rd%d' link, and communicating
+                * with the personality with ->hot_*_disk.
+                * For now we only support removing
+                * failed/spare devices.  This normally happens automatically,
+                * but not when the metadata is externally managed.
+                */
+               if (slot != -1)
+                       return -EBUSY;
+               if (rdev->raid_disk == -1)
+                       return -EEXIST;
+               /* personality does all needed checks */
+               if (rdev->mddev->pers->hot_add_disk == NULL)
+                       return -EINVAL;
+               err = rdev->mddev->pers->
+                       hot_remove_disk(rdev->mddev, rdev->raid_disk);
+               if (err)
+                       return err;
+               sprintf(nm, "rd%d", rdev->raid_disk);
+               sysfs_remove_link(&rdev->mddev->kobj, nm);
+               set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+               md_wakeup_thread(rdev->mddev->thread);
+       } else {
+               if (slot >= rdev->mddev->raid_disks)
+                       return -ENOSPC;
+               rdev->raid_disk = slot;
+               /* assume it is working */
+               rdev->flags = 0;
+               set_bit(In_sync, &rdev->flags);
+       }
        return len;
 }
 
@@ -2425,6 +2452,8 @@ array_state_show(mddev_t *mddev, char *page)
                case 0:
                        if (mddev->in_sync)
                                st = clean;
+                       else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
+                               st = write_pending;
                        else if (mddev->safemode)
                                st = active_idle;
                        else
@@ -2455,11 +2484,9 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
                break;
        case clear:
                /* stopping an active array */
-               if (mddev->pers) {
-                       if (atomic_read(&mddev->active) > 1)
-                               return -EBUSY;
-                       err = do_md_stop(mddev, 0);
-               }
+               if (atomic_read(&mddev->active) > 1)
+                       return -EBUSY;
+               err = do_md_stop(mddev, 0);
                break;
        case inactive:
                /* stopping an active array */
@@ -2467,7 +2494,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
                        if (atomic_read(&mddev->active) > 1)
                                return -EBUSY;
                        err = do_md_stop(mddev, 2);
-               }
+               } else
+                       err = 0; /* already inactive */
                break;
        case suspended:
                break; /* not supported yet */
@@ -2495,9 +2523,15 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
                        restart_array(mddev);
                        spin_lock_irq(&mddev->write_lock);
                        if (atomic_read(&mddev->writes_pending) == 0) {
-                               mddev->in_sync = 1;
-                               set_bit(MD_CHANGE_CLEAN, &mddev->flags);
-                       }
+                               if (mddev->in_sync == 0) {
+                                       mddev->in_sync = 1;
+                                       if (mddev->persistent)
+                                               set_bit(MD_CHANGE_CLEAN,
+                                                       &mddev->flags);
+                               }
+                               err = 0;
+                       } else
+                               err = -EBUSY;
                        spin_unlock_irq(&mddev->write_lock);
                } else {
                        mddev->ro = 0;
@@ -2508,7 +2542,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
        case active:
                if (mddev->pers) {
                        restart_array(mddev);
-                       clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
+                       if (mddev->external)
+                               clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
                        wake_up(&mddev->sb_wait);
                        err = 0;
                } else {
@@ -2659,7 +2694,9 @@ __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
 
 
 /* Metdata version.
- * This is either 'none' for arrays with externally managed metadata,
+ * This is one of
+ *   'none' for arrays with no metadata (good luck...)
+ *   'external' for arrays with externally managed metadata,
  * or N.M for internally known formats
  */
 static ssize_t
@@ -2668,6 +2705,8 @@ metadata_show(mddev_t *mddev, char *page)
        if (mddev->persistent)
                return sprintf(page, "%d.%d\n",
                               mddev->major_version, mddev->minor_version);
+       else if (mddev->external)
+               return sprintf(page, "external:%s\n", mddev->metadata_type);
        else
                return sprintf(page, "none\n");
 }
@@ -2682,6 +2721,21 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
 
        if (cmd_match(buf, "none")) {
                mddev->persistent = 0;
+               mddev->external = 0;
+               mddev->major_version = 0;
+               mddev->minor_version = 90;
+               return len;
+       }
+       if (strncmp(buf, "external:", 9) == 0) {
+               int namelen = len-9;
+               if (namelen >= sizeof(mddev->metadata_type))
+                       namelen = sizeof(mddev->metadata_type)-1;
+               strncpy(mddev->metadata_type, buf+9, namelen);
+               mddev->metadata_type[namelen] = 0;
+               if (namelen && mddev->metadata_type[namelen-1] == '\n')
+                       mddev->metadata_type[--namelen] = 0;
+               mddev->persistent = 0;
+               mddev->external = 1;
                mddev->major_version = 0;
                mddev->minor_version = 90;
                return len;
@@ -2698,6 +2752,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
        mddev->major_version = major;
        mddev->minor_version = minor;
        mddev->persistent = 1;
+       mddev->external = 0;
        return len;
 }
 
@@ -2864,6 +2919,43 @@ sync_completed_show(mddev_t *mddev, char *page)
 
 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
 
+static ssize_t
+max_sync_show(mddev_t *mddev, char *page)
+{
+       /* a resync_max of MaxSector is reported symbolically as "max" */
+       if (mddev->resync_max != MaxSector)
+               return sprintf(page, "%llu\n",
+                              (unsigned long long)mddev->resync_max);
+       return sprintf(page, "max\n");
+}
+static ssize_t
+max_sync_store(mddev_t *mddev, const char *buf, size_t len)
+{
+       if (strncmp(buf, "max", 3) != 0) {
+               char *end;
+               unsigned long long limit = simple_strtoull(buf, &end, 10);
+
+               if (end == buf || (*end != 0 && *end != '\n'))
+                       return -EINVAL;
+               /* cannot lower the limit while recovery is running */
+               if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+                   limit < mddev->resync_max)
+                       return -EBUSY;
+               /* Must be a multiple of chunk_size */
+               if (mddev->chunk_size &&
+                   (limit & (sector_t)((mddev->chunk_size>>9)-1)))
+                       return -EINVAL;
+               mddev->resync_max = limit;
+       } else
+               mddev->resync_max = MaxSector;
+       /* wake sleepers on recovery_wait so they see the new value */
+       wake_up(&mddev->recovery_wait);
+       return len;
+}
+
+static struct md_sysfs_entry md_max_sync =
+__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
+
 static ssize_t
 suspend_lo_show(mddev_t *mddev, char *page)
 {
@@ -2974,6 +3066,7 @@ static struct attribute *md_redundancy_attrs[] = {
        &md_sync_max.attr,
        &md_sync_speed.attr,
        &md_sync_completed.attr,
+       &md_max_sync.attr,
        &md_suspend_lo.attr,
        &md_suspend_hi.attr,
        &md_bitmap.attr,
@@ -3118,8 +3211,11 @@ static int do_md_run(mddev_t * mddev)
        /*
         * Analyze all RAID superblock(s)
         */
-       if (!mddev->raid_disks)
+       if (!mddev->raid_disks) {
+               if (!mddev->persistent)
+                       return -EINVAL;
                analyze_sbs(mddev);
+       }
 
        chunk_size = mddev->chunk_size;
 
@@ -3523,7 +3619,10 @@ static int do_md_stop(mddev_t * mddev, int mode)
                mddev->size = 0;
                mddev->raid_disks = 0;
                mddev->recovery_cp = 0;
+               mddev->resync_max = MaxSector;
                mddev->reshape_position = MaxSector;
+               mddev->external = 0;
+               mddev->persistent = 0;
 
        } else if (mddev->pers)
                printk(KERN_INFO "md: %s switched to read-only mode.\n",
@@ -3632,6 +3731,7 @@ static void autorun_devices(int part)
                        mddev_unlock(mddev);
                } else {
                        printk(KERN_INFO "md: created %s\n", mdname(mddev));
+                       mddev->persistent = 1;
                        ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
                                list_del_init(&rdev->same_set);
                                if (bind_rdev_to_array(rdev, mddev))
@@ -4165,13 +4265,15 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
        else
                mddev->recovery_cp = 0;
        mddev->persistent    = ! info->not_persistent;
+       mddev->external      = 0;
 
        mddev->layout        = info->layout;
        mddev->chunk_size    = info->chunk_size;
 
        mddev->max_disks     = MD_SB_DISKS;
 
-       mddev->flags         = 0;
+       if (mddev->persistent)
+               mddev->flags         = 0;
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
        mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
@@ -4982,7 +5084,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
                                           mddev->major_version,
                                           mddev->minor_version);
                        }
-               } else
+               } else if (mddev->external)
+                       seq_printf(seq, " super external:%s",
+                                  mddev->metadata_type);
+               else
                        seq_printf(seq, " super non-persistent");
 
                if (mddev->pers) {
@@ -5381,8 +5486,16 @@ void md_do_sync(mddev_t *mddev)
                sector_t sectors;
 
                skipped = 0;
+               if (j >= mddev->resync_max) {
+                       sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+                       wait_event(mddev->recovery_wait,
+                                  mddev->resync_max > j
+                                  || kthread_should_stop());
+               }
+               if (kthread_should_stop())
+                       goto interrupted;
                sectors = mddev->pers->sync_request(mddev, j, &skipped,
-                                           currspeed < speed_min(mddev));
+                                                 currspeed < speed_min(mddev));
                if (sectors == 0) {
                        set_bit(MD_RECOVERY_ERR, &mddev->recovery);
                        goto out;
@@ -5424,15 +5537,9 @@ void md_do_sync(mddev_t *mddev)
                }
 
 
-               if (kthread_should_stop()) {
-                       /*
-                        * got a signal, exit.
-                        */
-                       printk(KERN_INFO 
-                               "md: md_do_sync() got signal ... exiting\n");
-                       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-                       goto out;
-               }
+               if (kthread_should_stop())
+                       goto interrupted;
+
 
                /*
                 * this loop exits only if either when we are slower than
@@ -5496,9 +5603,22 @@ void md_do_sync(mddev_t *mddev)
 
  skip:
        mddev->curr_resync = 0;
+       mddev->resync_max = MaxSector;
+       sysfs_notify(&mddev->kobj, NULL, "sync_completed");
        wake_up(&resync_wait);
        set_bit(MD_RECOVERY_DONE, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
+       return;
+
+ interrupted:
+       /*
+        * got a signal, exit.
+        */
+       printk(KERN_INFO
+              "md: md_do_sync() got signal ... exiting\n");
+       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+       goto out;
+
 }
 EXPORT_SYMBOL_GPL(md_do_sync);
 
@@ -5511,6 +5631,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 
        ITERATE_RDEV(mddev,rdev,rtmp)
                if (rdev->raid_disk >= 0 &&
+                   !mddev->external &&
                    (test_bit(Faulty, &rdev->flags) ||
                     ! test_bit(In_sync, &rdev->flags)) &&
                    atomic_read(&rdev->nr_pending)==0) {
@@ -5589,7 +5710,7 @@ void md_check_recovery(mddev_t *mddev)
        }
 
        if ( ! (
-               mddev->flags ||
+               (mddev->flags && !mddev->external) ||
                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
                (mddev->safemode == 1) ||
@@ -5605,7 +5726,8 @@ void md_check_recovery(mddev_t *mddev)
                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
                    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
                        mddev->in_sync = 1;
-                       set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+                       if (mddev->persistent)
+                               set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                }
                if (mddev->safemode == 1)
                        mddev->safemode = 0;