/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
struct module *md_cluster_mod;
EXPORT_SYMBOL(md_cluster_mod);

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
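/* (e.g. under this rule a corrected-error count of 8 is treated as 2
 * when the next read error arrives two hours after the previous one.)
 */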
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

static struct ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ }
};

static const struct block_device_operations md_fops;

static int start_readonly;

/* bio_clone_mddev
 * like bio_clone, but with a local bio set
 */

struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
			    struct mddev *mddev)
{
	struct bio *b;

	if (!mddev || !mddev->bio_set)
		return bio_alloc(gfp_mask, nr_iovecs);

	b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
	if (!b)
		return NULL;
	return b;
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);

struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
			    struct mddev *mddev)
{
	if (!mddev || !mddev->bio_set)
		return bio_clone(bio, gfp_mask);

	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
}
EXPORT_SYMBOL_GPL(bio_clone_mddev);

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)
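/* Illustrative use (hypothetical caller; mirrors how md_exit() walks
 * the list):
 *
 *	struct mddev *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp)
 *		pr_info("md: visiting %s\n", mdname(mddev));
 *
 * all_mddevs_lock is taken and dropped around each list step, and a
 * reference is held on the current mddev while the loop body runs.
 */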

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static void md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = q->queuedata;
	unsigned int sectors;
	int cpu;

	blk_queue_split(q, &bio, q->bio_split);

	if (mddev == NULL || mddev->pers == NULL
	    || !mddev->ready) {
		bio_io_error(bio);
		return;
	}
	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_error = -EROFS;
		bio_endio(bio);
		return;
	}
	smp_rmb(); /* Ensure implications of 'active' are visible */
	rcu_read_lock();
	if (mddev->suspended) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!mddev->suspended)
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	mddev->pers->make_request(mddev, bio);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
	part_stat_unlock();

	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);
}

/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
 */
void mddev_suspend(struct mddev *mddev)
{
	BUG_ON(mddev->suspended);
	mddev->suspended = 1;
	synchronize_rcu();
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);

	del_timer_sync(&mddev->safemode_timer);
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	mddev->suspended = 0;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);

int mddev_congested(struct mddev *mddev, int bits)
{
	struct md_personality *pers = mddev->pers;
	int ret = 0;

	rcu_read_lock();
	if (mddev->suspended)
		ret = 1;
	else if (pers && pers->congested)
		ret = pers->congested(mddev, bits);
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(mddev_congested);
static int md_congested(void *data, int bits)
{
	struct mddev *mddev = data;
	return mddev_congested(mddev, bits);
}

/*
 * Generic flush handling for md
 */
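/*
 * Flow: md_flush_request() parks the incoming bio in mddev->flush_bio
 * (callers serialise on it under mddev->lock) and queues
 * submit_flushes(), which sends an empty FLUSH bio to every active,
 * non-faulty rdev.  When the last of those completes, the work item is
 * re-armed as md_submit_flush_data(), which either ends an empty
 * barrier bio or strips REQ_FLUSH and resubmits the data portion.
 */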

static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bi->bi_bdev = rdev->bdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(WRITE_FLUSH, bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	if (bio->bi_iter.bi_size == 0)
		/* an empty barrier - all done */
		bio_endio(bio);
	else {
		bio->bi_rw &= ~REQ_FLUSH;
		mddev->pers->make_request(mddev, bio);
	}

	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);
}

void md_flush_request(struct mddev *mddev, struct bio *bio)
{
	spin_lock_irq(&mddev->lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio,
			    mddev->lock);
	mddev->flush_bio = bio;
	spin_unlock_irq(&mddev->lock);

	INIT_WORK(&mddev->flush_work, submit_flushes);
	queue_work(md_wq, &mddev->flush_work);
}
EXPORT_SYMBOL(md_flush_request);

void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct mddev *mddev = cb->data;
	md_wakeup_thread(mddev->thread);
	kfree(cb);
}
EXPORT_SYMBOL(md_unplug);

static inline struct mddev *mddev_get(struct mddev *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void mddev_put(struct mddev *mddev)
{
	struct bio_set *bs = NULL;

	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del_init(&mddev->all_mddevs);
		bs = mddev->bio_set;
		mddev->bio_set = NULL;
		if (mddev->gendisk) {
			/* We did a probe so need to clean up.  Call
			 * queue_work inside the spinlock so that
			 * flush_workqueue() after mddev_find will
			 * succeed in waiting for the work to be done.
			 */
			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
			queue_work(md_misc_wq, &mddev->del_work);
		} else
			kfree(mddev);
	}
	spin_unlock(&all_mddevs_lock);
	if (bs)
		bioset_free(bs);
}

static void md_safemode_timeout(unsigned long data);

void mddev_init(struct mddev *mddev)
{
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	setup_timer(&mddev->safemode_timer, md_safemode_timeout,
		    (unsigned long) mddev);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
EXPORT_SYMBOL_GPL(mddev_init);

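/* Find the mddev for device number 'unit', or allocate a fresh one,
 * and return it with a reference held.  A unit of 0 requests a spare
 * minor (allocated from 512 upwards) held active until stopped.
 * Allocation happens with all_mddevs_lock dropped, hence the retry.
 */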
static struct mddev *mddev_find(dev_t unit)
{
	struct mddev *mddev, *new = NULL;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}

			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mddev_init(new);

	goto retry;
}

static struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So hold sysfs_active set while the removal is happening,
		 * and anything else which might set ->to_remove or
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);
	spin_unlock(&pers_lock);
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

static struct md_personality *find_pers(int level, char *clevel)
{
	struct md_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel)==0)
			return pers;
	}
	return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
	return MD_NEW_SIZE_SECTORS(num_sectors);
}

static int alloc_disk_sb(struct md_rdev *rdev)
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page) {
		printk(KERN_ALERT "md: out of memory.\n");
		return -ENOMEM;
	}

	return 0;
}

void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	kfree(rdev->badblocks.page);
	rdev->badblocks.page = NULL;
}
EXPORT_SYMBOL_GPL(md_rdev_clear);

static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_error) {
		printk("md: super_written gets error=%d\n", bio->bi_error);
		md_error(mddev, rdev);
	}

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	bio_put(bio);
}

void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);

	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	atomic_inc(&mddev->pending_writes);
	submit_bio(WRITE_FLUSH_FUA, bio);
}

void md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
}

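/* Synchronously read or write one page-sized chunk at 'sector' of the
 * rdev.  With metadata_op the sector is relative to the superblock
 * (sb_start); otherwise it is relative to the data offset, using the
 * new offset for regions an active reshape has already moved.
 * Returns 1 on success, 0 on failure.
 */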
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, int rw, bool metadata_op)
{
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
	int ret;

	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
		rdev->meta_bdev : rdev->bdev;
	if (metadata_op)
		bio->bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio->bi_iter.bi_sector = sector + rdev->data_offset;
	bio_add_page(bio, page, size, 0);
	submit_bio_wait(rw, bio);

	ret = !bio->bi_error;
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	char b[BDEVNAME_SIZE];

	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
		bdevname(rdev->bdev,b));
	return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}

static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}

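/* Fold a 32-bit sum to 16 bits with end-around carry so the result is
 * byte-order independent, e.g. md_csum_fold(0x12345678) gives
 * 0x1234 + 0x5678 == 0x68ac.
 */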
static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};

/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

/*
 * load_super for 0.90.0
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret) return ret;

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
		       b);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version,
			b);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
			b);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!uuid_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has different UUID to %s\n",
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
		if (!sb_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has same UUID"
			       " but different superblock to %s\n",
			       b, bdevname(refdev->bdev, b2));
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
		rdev->sectors = (2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

 abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
				sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
			rdev->saved_raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = mddev->ctime;
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = mddev->utime;
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_info.offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
	if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
		num_sectors = (2ULL << 32) - 2;
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors;
}

static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}

/*
 * version 1 superblock
 */

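/* The v1 superblock occupies 256 bytes of fixed fields plus a 2-byte
 * role entry per device slot (max_dev); that is the 'size' summed
 * below as little-endian 32-bit words, with sb_csum treated as zero.
 */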
static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32*)sb;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (; size >= 4; size -= 4)
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
			    int acknowledged);
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	int bmask;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;

	sb = page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		printk("md: invalid superblock checksum on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		printk("md: data_size too small on %s\n",
		       bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
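	/* On-disk bad-block log entries pack a start sector into the top
	 * 54 bits of a u64 and a length into the low 10 bits, both
	 * scaled by bblog_shift; an all-ones entry terminates the list.
	 */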
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		u64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, READ, true))
			return -EIO;
		bbp = (u64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (md_set_badblocks(&rdev->badblocks,
					     sector, count, 1) == 0)
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			printk(KERN_WARNING "md: %s has strangely different"
				" superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version) {
		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
		sectors -= rdev->data_offset;
	} else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}

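/* validate_super for 1.x: the first device seen populates the (still
 * empty) mddev from its superblock; later devices only have their
 * event counts sanity-checked - strictly while assembling, loosely for
 * hot-adds and for devices the write-intent bitmap can bring back in
 * sync.
 */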
fd01b88c 1529static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1da177e4 1530{
65a06f06 1531 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
07d84d10 1532 __u64 ev1 = le64_to_cpu(sb->events);
1da177e4 1533
41158c7e 1534 rdev->raid_disk = -1;
c5d79adb
N
1535 clear_bit(Faulty, &rdev->flags);
1536 clear_bit(In_sync, &rdev->flags);
8313b8e5 1537 clear_bit(Bitmap_sync, &rdev->flags);
c5d79adb 1538 clear_bit(WriteMostly, &rdev->flags);
c5d79adb 1539
1da177e4
LT
1540 if (mddev->raid_disks == 0) {
1541 mddev->major_version = 1;
1542 mddev->patch_version = 0;
e691063a 1543 mddev->external = 0;
9d8f0363 1544 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1da177e4
LT
1545 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1546 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1547 mddev->level = le32_to_cpu(sb->level);
d9d166c2 1548 mddev->clevel[0] = 0;
1da177e4
LT
1549 mddev->layout = le32_to_cpu(sb->layout);
1550 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
58c0fed4 1551 mddev->dev_sectors = le64_to_cpu(sb->size);
07d84d10 1552 mddev->events = ev1;
c3d9714e 1553 mddev->bitmap_info.offset = 0;
6409bb05
N
1554 mddev->bitmap_info.space = 0;
1555 /* Default location for bitmap is 1K after superblock
1556 * using 3K - total of 4K
1557 */
c3d9714e 1558 mddev->bitmap_info.default_offset = 1024 >> 9;
6409bb05 1559 mddev->bitmap_info.default_space = (4096-1024) >> 9;
2c810cdd
N
1560 mddev->reshape_backwards = 0;
1561
1da177e4
LT
1562 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1563 memcpy(mddev->uuid, sb->set_uuid, 16);
1564
1565 mddev->max_disks = (4096-256)/2;
a654b9d8 1566
71c0805c 1567 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
6409bb05 1568 mddev->bitmap_info.file == NULL) {
c3d9714e
N
1569 mddev->bitmap_info.offset =
1570 (__s32)le32_to_cpu(sb->bitmap_offset);
6409bb05
N
1571 /* Metadata doesn't record how much space is available.
1572 * For 1.0, we assume we can use up to the superblock
1573 * if before, else to 4K beyond superblock.
1574 * For others, assume no change is possible.
1575 */
1576 if (mddev->minor_version > 0)
1577 mddev->bitmap_info.space = 0;
1578 else if (mddev->bitmap_info.offset > 0)
1579 mddev->bitmap_info.space =
1580 8 - mddev->bitmap_info.offset;
1581 else
1582 mddev->bitmap_info.space =
1583 -mddev->bitmap_info.offset;
1584 }

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = MD_DISK_ROLE_SPARE;
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch (role) {
		case MD_DISK_ROLE_SPARE: /* spare */
			break;
		case MD_DISK_ROLE_FAULTY: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		case MD_DISK_ROLE_JOURNAL: /* journal device */
			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
				/* journal device without journal feature */
				printk(KERN_WARNING
				       "md: journal device provided without journal feature, ignoring the device\n");
				return -EINVAL;
			}
			set_bit(Journal, &rdev->flags);
			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
			if (mddev->recovery_cp == MaxSector)
				set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
			rdev->raid_disk = mddev->raid_disks;
			break;
		default:
			rdev->saved_raid_disk = role;
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET)) {
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
				if (!(le32_to_cpu(sb->feature_map) &
				      MD_FEATURE_RECOVERY_BITMAP))
					rdev->saved_raid_disk = -1;
			} else
				set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);
	} else /* MULTIPATH devices are always in-sync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}

static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb;
	struct md_rdev *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = page_address(rdev->sb_page);

	sb->feature_map = 0;
	sb->pad0 = 0;
	sb->recovery_offset = cpu_to_le64(0);
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
		sb->resync_offset = cpu_to_le64(MaxSector);
	else
		sb->resync_offset = cpu_to_le64(0);

	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
	sb->size = cpu_to_le64(mddev->dev_sectors);
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);

	if (test_bit(WriteMostly, &rdev->flags))
		sb->devflags |= WriteMostly1;
	else
		sb->devflags &= ~WriteMostly1;
	sb->data_offset = cpu_to_le64(rdev->data_offset);
	sb->data_size = cpu_to_le64(rdev->sectors);

	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
	}

	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
	}
	/* Note: recovery_offset and journal_tail share space */
	if (test_bit(Journal, &rdev->flags))
		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
	if (test_bit(Replacement, &rdev->flags))
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_REPLACEMENT);

	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
		sb->new_layout = cpu_to_le32(mddev->new_layout);
		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
		sb->new_level = cpu_to_le32(mddev->new_level);
		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
		if (mddev->delta_disks == 0 &&
		    mddev->reshape_backwards)
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
		if (rdev->new_data_offset != rdev->data_offset) {
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
							     - rdev->data_offset));
		}
	}

	if (mddev_is_clustered(mddev))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);

	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks*/ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
		md_error(mddev, rdev);
	else {
		struct badblocks *bb = &rdev->badblocks;
		u64 *bbp = (u64 *)page_address(rdev->bb_page);
		u64 *p = bb->page;
		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
		if (bb->changed) {
			unsigned seq;

retry:
			seq = read_seqbegin(&bb->lock);

			memset(bbp, 0xff, PAGE_SIZE);

			for (i = 0; i < bb->count; i++) {
				u64 internal_bb = p[i];
				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
						| BB_LEN(internal_bb));
				bbp[i] = cpu_to_le64(store_bb);
			}
			bb->changed = 0;
			if (read_seqretry(&bb->lock, seq))
				goto retry;

			bb->sector = (rdev->sb_start +
				      (int)le32_to_cpu(sb->bblog_offset));
			bb->size = le16_to_cpu(sb->bblog_size);
		}
	}
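	/*
	 * Format note for the loop above: each on-disk bad-block record is
	 * a little-endian u64 packing the sector offset into the high bits
	 * and the length (at most 512 sectors) into the low 10 bits, i.e.
	 * (BB_OFFSET(bb) << 10) | BB_LEN(bb).  The seqlock retry re-encodes
	 * the whole list if it changed while being serialised.
	 */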

	max_dev = 0;
	rdev_for_each(rdev2, mddev)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;

	if (max_dev > le32_to_cpu(sb->max_dev)) {
		int bmask;
		sb->max_dev = cpu_to_le32(max_dev);
		rdev->sb_size = max_dev * 2 + 256;
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
		if (rdev->sb_size & bmask)
			rdev->sb_size = (rdev->sb_size | bmask) + 1;
	} else
		max_dev = le32_to_cpu(sb->max_dev);
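	/*
	 * Sketch of the rounding above: the superblock is 256 bytes plus
	 * two bytes per device role and must be written in whole logical
	 * blocks.  With a power-of-two block size, bmask = size - 1, and
	 * (sb_size | bmask) + 1 is the next multiple of the block size,
	 * e.g. a 260-byte superblock becomes 512 bytes on 512-byte-sector
	 * devices.
	 */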

	for (i = 0; i < max_dev; i++)
		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);

	rdev_for_each(rdev2, mddev) {
		i = rdev2->desc_nr;
		if (test_bit(Faulty, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
		else if (test_bit(In_sync, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else if (test_bit(Journal, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
		else if (rdev2->raid_disk >= 0)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
	}

	sb->sb_csum = calc_sb_1_csum(sb);
}

static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	struct mdp_superblock_1 *sb;
	sector_t max_sectors;
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->data_offset != rdev->new_data_offset)
		return 0; /* too confusing */
	if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2; superblock before data */
		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
	} else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
		sector_t sb_start;
		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
		sb_start &= ~(sector_t)(4*2 - 1);
		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
		rdev->sb_start = sb_start;
	}
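	/*
	 * Placement sketch for the branch above: with a 1.0 layout the new
	 * superblock start is the device size in 512-byte sectors minus 16,
	 * rounded down to an 8-sector (4K) boundary -- which is what the
	 * "- 8*2" and "& ~(4*2 - 1)" arithmetic computes.
	 */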
	sb = page_address(rdev->sb_page);
	sb->data_size = cpu_to_le64(num_sectors);
	sb->super_offset = rdev->sb_start;
	sb->sb_csum = calc_sb_1_csum(sb);
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors;
}

static int
super_1_allow_new_offset(struct md_rdev *rdev,
			 unsigned long long new_offset)
{
	/* All necessary checks on new >= old have been done */
	struct bitmap *bitmap;
	if (new_offset >= rdev->data_offset)
		return 1;

	/* with 1.0 metadata, there is no metadata to tread on
	 * so we can always move back */
	if (rdev->mddev->minor_version == 0)
		return 1;

	/* otherwise we must be sure not to step on
	 * any metadata, so stay:
	 * 36K beyond start of superblock
	 * beyond end of badblocks
	 * beyond write-intent bitmap
	 */
	if (rdev->sb_start + (32+4)*2 > new_offset)
		return 0;
	bitmap = rdev->mddev->bitmap;
	if (bitmap && !rdev->mddev->bitmap_info.file &&
	    rdev->sb_start + rdev->mddev->bitmap_info.offset +
	    bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
		return 0;
	if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
		return 0;

	return 1;
}
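/*
 * Unit check for the first test above: sb_start and new_offset are in
 * 512-byte sectors, so (32+4)*2 = 72 sectors = 36K past the superblock
 * start, matching the "36K beyond start of superblock" rule stated in
 * the comment.
 */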

static struct super_type super_types[] = {
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
		.load_super	    = super_90_load,
		.validate_super	    = super_90_validate,
		.sync_super	    = super_90_sync,
		.rdev_size_change   = super_90_rdev_size_change,
		.allow_new_offset   = super_90_allow_new_offset,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
		.load_super	    = super_1_load,
		.validate_super	    = super_1_validate,
		.sync_super	    = super_1_sync,
		.rdev_size_change   = super_1_rdev_size_change,
		.allow_new_offset   = super_1_allow_new_offset,
	},
};

static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
{
	if (mddev->sync_super) {
		mddev->sync_super(mddev, rdev);
		return;
	}

	BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));

	super_types[mddev->major_version].sync_super(mddev, rdev);
}

static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
{
	struct md_rdev *rdev, *rdev2;

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev1) {
		if (test_bit(Faulty, &rdev->flags) ||
		    test_bit(Journal, &rdev->flags) ||
		    rdev->raid_disk == -1)
			continue;
		rdev_for_each_rcu(rdev2, mddev2) {
			if (test_bit(Faulty, &rdev2->flags) ||
			    test_bit(Journal, &rdev2->flags) ||
			    rdev2->raid_disk == -1)
				continue;
			if (rdev->bdev->bd_contains ==
			    rdev2->bdev->bd_contains) {
				rcu_read_unlock();
				return 1;
			}
		}
	}
	rcu_read_unlock();
	return 0;
}

static LIST_HEAD(pending_raid_disks);

/*
 * Try to register data integrity profile for an mddev
 *
 * This is called when an array is started and after a disk has been kicked
 * from the array. It only succeeds if all working and active component devices
 * are integrity capable with matching profiles.
 */
int md_integrity_register(struct mddev *mddev)
{
	struct md_rdev *rdev, *reference = NULL;

	if (list_empty(&mddev->disks))
		return 0; /* nothing to do */
	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
		return 0; /* shouldn't register, or already is */
	rdev_for_each(rdev, mddev) {
		/* skip spares and non-functional disks */
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (rdev->raid_disk < 0)
			continue;
		if (!reference) {
			/* Use the first rdev as the reference */
			reference = rdev;
			continue;
		}
		/* does this rdev's profile match the reference profile? */
		if (blk_integrity_compare(reference->bdev->bd_disk,
				rdev->bdev->bd_disk) < 0)
			return -EINVAL;
	}
	if (!reference || !bdev_get_integrity(reference->bdev))
		return 0;
	/*
	 * All component devices are integrity capable and have matching
	 * profiles, register the common profile for the md device.
	 */
	blk_integrity_register(mddev->gendisk,
			       bdev_get_integrity(reference->bdev));

	printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
	if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
		printk(KERN_ERR "md: failed to create integrity pool for %s\n",
		       mdname(mddev));
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(md_integrity_register);

/* Disable data integrity if non-capable/non-matching disk is being added */
void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	struct blk_integrity *bi_rdev;
	struct blk_integrity *bi_mddev;

	if (!mddev->gendisk)
		return;

	bi_rdev = bdev_get_integrity(rdev->bdev);
	bi_mddev = blk_get_integrity(mddev->gendisk);

	if (!bi_mddev) /* nothing to do */
		return;
	if (rdev->raid_disk < 0) /* skip spares */
		return;
	if (bi_rdev && blk_integrity_compare(mddev->gendisk,
					     rdev->bdev->bd_disk) >= 0)
		return;
	WARN_ON_ONCE(!mddev->suspended);
	printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
	blk_integrity_unregister(mddev->gendisk);
}
EXPORT_SYMBOL(md_integrity_add_rdev);

static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
{
	char b[BDEVNAME_SIZE];
	struct kobject *ko;
	int err;

	/* prevent duplicates */
	if (find_rdev(mddev, rdev->bdev->bd_dev))
		return -EEXIST;

	/* make sure rdev->sectors exceeds mddev->dev_sectors */
	if (rdev->sectors && (mddev->dev_sectors == 0 ||
			      rdev->sectors < mddev->dev_sectors)) {
		if (mddev->pers) {
			/* Cannot change size, so fail
			 * If mddev->level <= 0, then we don't care
			 * about aligning sizes (e.g. linear)
			 */
			if (mddev->level > 0)
				return -ENOSPC;
		} else
			mddev->dev_sectors = rdev->sectors;
	}

	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number, else
	 * check number is not in use
	 */
	rcu_read_lock();
	if (rdev->desc_nr < 0) {
		int choice = 0;
		if (mddev->pers)
			choice = mddev->raid_disks;
		while (md_find_rdev_nr_rcu(mddev, choice))
			choice++;
		rdev->desc_nr = choice;
	} else {
		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
			rcu_read_unlock();
			return -EBUSY;
		}
	}
	rcu_read_unlock();
	if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
		printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
		       mdname(mddev), mddev->max_disks);
		return -EBUSY;
	}
	bdevname(rdev->bdev, b);
	strreplace(b, '/', '!');

	rdev->mddev = mddev;
	printk(KERN_INFO "md: bind<%s>\n", b);

	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
		goto fail;

	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
	if (sysfs_create_link(&rdev->kobj, ko, "block"))
		/* failure here is OK */;
	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");

	list_add_rcu(&rdev->same_set, &mddev->disks);
	bd_link_disk_holder(rdev->bdev, mddev->gendisk);

	/* May as well allow recovery to be retried once */
	mddev->recovery_disabled++;

	return 0;

 fail:
	printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
	       b, mdname(mddev));
	return err;
}

static void md_delayed_delete(struct work_struct *ws)
{
	struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
	kobject_del(&rdev->kobj);
	kobject_put(&rdev->kobj);
}

static void unbind_rdev_from_array(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];

	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
	list_del_rcu(&rdev->same_set);
	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev, b));
	rdev->mddev = NULL;
	sysfs_remove_link(&rdev->kobj, "block");
	sysfs_put(rdev->sysfs_state);
	rdev->sysfs_state = NULL;
	rdev->badblocks.count = 0;
	/* We need to delay this, otherwise we can deadlock when
	 * writing 'remove' to "dev/state".  We also need
	 * to delay it due to rcu usage.
	 */
	synchronize_rcu();
	INIT_WORK(&rdev->del_work, md_delayed_delete);
	kobject_get(&rdev->kobj);
	queue_work(md_misc_wq, &rdev->del_work);
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
{
	int err = 0;
	struct block_device *bdev;
	char b[BDEVNAME_SIZE];

	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
				 shared ? (struct md_rdev *)lock_rdev : rdev);
	if (IS_ERR(bdev)) {
		printk(KERN_ERR "md: could not open %s.\n",
		       __bdevname(dev, b));
		return PTR_ERR(bdev);
	}
	rdev->bdev = bdev;
	return err;
}

static void unlock_rdev(struct md_rdev *rdev)
{
	struct block_device *bdev = rdev->bdev;
	rdev->bdev = NULL;
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}

void md_autodetect_dev(dev_t dev);

static void export_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];

	printk(KERN_INFO "md: export_rdev(%s)\n",
	       bdevname(rdev->bdev, b));
	md_rdev_clear(rdev);
#ifndef MODULE
	if (test_bit(AutoDetected, &rdev->flags))
		md_autodetect_dev(rdev->bdev->bd_dev);
#endif
	unlock_rdev(rdev);
	kobject_put(&rdev->kobj);
}

void md_kick_rdev_from_array(struct md_rdev *rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}
EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);

static void export_array(struct mddev *mddev)
{
	struct md_rdev *rdev;

	while (!list_empty(&mddev->disks)) {
		rdev = list_first_entry(&mddev->disks, struct md_rdev,
					same_set);
		md_kick_rdev_from_array(rdev);
	}
	mddev->raid_disks = 0;
	mddev->major_version = 0;
}

static void sync_sbs(struct mddev *mddev, int nospares)
{
	/* Update each superblock (in-memory image), but
	 * if we are allowed to, skip spares which already
	 * have the right event counter, or have one earlier
	 * (which would mean they aren't being marked as dirty
	 * with the rest of the array)
	 */
	struct md_rdev *rdev;
	rdev_for_each(rdev, mddev) {
		if (rdev->sb_events == mddev->events ||
		    (nospares &&
		     rdev->raid_disk < 0 &&
		     rdev->sb_events+1 == mddev->events)) {
			/* Don't update this superblock */
			rdev->sb_loaded = 2;
		} else {
			sync_super(mddev, rdev);
			rdev->sb_loaded = 1;
		}
	}
}

static bool does_sb_need_changing(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct mdp_superblock_1 *sb;
	int role;

	/* Find a good rdev */
	rdev_for_each(rdev, mddev)
		if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
			break;

	/* No good device found. */
	if (!rdev)
		return false;

	sb = page_address(rdev->sb_page);
	/* Check if a device has become faulty or a spare has become active */
	rdev_for_each(rdev, mddev) {
		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		/* Device activated? */
		if (role == 0xffff && rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags))
			return true;
		/* Device turned faulty? */
		if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
			return true;
	}

	/* Check if any mddev parameters have changed */
	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
	    (mddev->layout != le64_to_cpu(sb->layout)) ||
	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
		return true;

	return false;
}

void md_update_sb(struct mddev *mddev, int force_change)
{
	struct md_rdev *rdev;
	int sync_req;
	int nospares = 0;
	int any_badblocks_changed = 0;
	int ret = -1;

	if (mddev->ro) {
		if (force_change)
			set_bit(MD_CHANGE_DEVS, &mddev->flags);
		return;
	}

	if (mddev_is_clustered(mddev)) {
		if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
			force_change = 1;
		ret = md_cluster_ops->metadata_update_start(mddev);
		/* Has someone else updated the sb? */
		if (!does_sb_need_changing(mddev)) {
			if (ret == 0)
				md_cluster_ops->metadata_update_cancel(mddev);
			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
			return;
		}
	}
repeat:
	/* First make sure individual recovery_offsets are correct */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk >= 0 &&
		    mddev->delta_disks >= 0 &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(In_sync, &rdev->flags) &&
		    mddev->curr_resync_completed > rdev->recovery_offset)
			rdev->recovery_offset = mddev->curr_resync_completed;

	}
	if (!mddev->persistent) {
		clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
		clear_bit(MD_CHANGE_DEVS, &mddev->flags);
		if (!mddev->external) {
			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
			rdev_for_each(rdev, mddev) {
				if (rdev->badblocks.changed) {
					rdev->badblocks.changed = 0;
					md_ack_all_badblocks(&rdev->badblocks);
					md_error(mddev, rdev);
				}
				clear_bit(Blocked, &rdev->flags);
				clear_bit(BlockedBadBlocks, &rdev->flags);
				wake_up(&rdev->blocked_wait);
			}
		}
		wake_up(&mddev->sb_wait);
		return;
	}

	spin_lock(&mddev->lock);

	mddev->utime = get_seconds();

	if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
		force_change = 1;
	if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
		/* just a clean<->dirty transition, possibly leave spares alone,
		 * though if events isn't the right even/odd, we will have to do
		 * spares after all
		 */
		nospares = 1;
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have an event_count that still looks up-to-date,
		 * so it can be re-added without a resync.
		 * Pointless because if there are any spares to skip,
		 * then a recovery will happen and soon that array won't
		 * be degraded any more and the spare can go back to sleep then.
		 */
		nospares = 0;

	sync_req = mddev->in_sync;

	/* If this is just a dirty<->clean transition, and the array is clean
	 * and 'events' is odd, we can roll back to the previous clean state */
	if (nospares
	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
	    && mddev->can_decrease_events
	    && mddev->events != 1) {
		mddev->events--;
		mddev->can_decrease_events = 0;
	} else {
		/* otherwise we have to go forward and ... */
		mddev->events++;
		mddev->can_decrease_events = nospares;
	}
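	/*
	 * Why rolling the counter back works (a reading of the code, not
	 * authoritative): with nospares set, sync_sbs() treats a spare
	 * whose sb_events is exactly one behind mddev->events as already
	 * up to date, so decrementing for a pure clean<->dirty flip lets
	 * those spares skip a superblock write entirely.
	 */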

	/*
	 * This 64-bit counter should never wrap.
	 * Either we are in around ~1 trillion A.C., assuming
	 * 1 reboot per second, or we have a bug...
	 */
	WARN_ON(mddev->events == 0);

	rdev_for_each(rdev, mddev) {
		if (rdev->badblocks.changed)
			any_badblocks_changed++;
		if (test_bit(Faulty, &rdev->flags))
			set_bit(FaultRecorded, &rdev->flags);
	}

	sync_sbs(mddev, nospares);
	spin_unlock(&mddev->lock);

	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
		 mdname(mddev), mddev->in_sync);

	bitmap_update_sb(mddev->bitmap);
	rdev_for_each(rdev, mddev) {
		char b[BDEVNAME_SIZE];

		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */

		if (!test_bit(Faulty, &rdev->flags)) {
			md_super_write(mddev, rdev,
				       rdev->sb_start, rdev->sb_size,
				       rdev->sb_page);
			pr_debug("md: (write) %s's sb offset: %llu\n",
				 bdevname(rdev->bdev, b),
				 (unsigned long long)rdev->sb_start);
			rdev->sb_events = mddev->events;
			if (rdev->badblocks.size) {
				md_super_write(mddev, rdev,
					       rdev->badblocks.sector,
					       rdev->badblocks.size << 9,
					       rdev->bb_page);
				rdev->badblocks.size = 0;
			}

		} else
			pr_debug("md: %s (skipping faulty)\n",
				 bdevname(rdev->bdev, b));

		if (mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
	md_super_wait(mddev);
	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */

	spin_lock(&mddev->lock);
	if (mddev->in_sync != sync_req ||
	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
		/* have to write it out again */
		spin_unlock(&mddev->lock);
		goto repeat;
	}
	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
	spin_unlock(&mddev->lock);
	wake_up(&mddev->sb_wait);
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");

	rdev_for_each(rdev, mddev) {
		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
			clear_bit(Blocked, &rdev->flags);

		if (any_badblocks_changed)
			md_ack_all_badblocks(&rdev->badblocks);
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
	}

	if (mddev_is_clustered(mddev) && ret == 0)
		md_cluster_ops->metadata_update_finish(mddev);
}
EXPORT_SYMBOL(md_update_sb);

static int add_bound_rdev(struct md_rdev *rdev)
{
	struct mddev *mddev = rdev->mddev;
	int err = 0;

	if (!mddev->pers->hot_remove_disk) {
		/* If there is hot_add_disk but no hot_remove_disk
		 * then added disks are for geometry changes,
		 * and should be added immediately.
		 */
		super_types[mddev->major_version].
			validate_super(mddev, rdev);
		err = mddev->pers->hot_add_disk(mddev, rdev);
		if (err) {
			unbind_rdev_from_array(rdev);
			export_rdev(rdev);
			return err;
		}
	}
	sysfs_notify_dirent_safe(rdev->sysfs_state);

	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	if (mddev->degraded)
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_new_event(mddev);
	md_wakeup_thread(mddev->thread);
	return 0;
}

/* Words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case. For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* See if cmd, written into a sysfs file, matches
	 * str.  They must either be the same, or cmd can
	 * have a trailing newline
	 */
	while (*cmd && *str && *cmd == *str) {
		cmd++;
		str++;
	}
	if (*cmd == '\n')
		cmd++;
	if (*str || *cmd)
		return 0;
	return 1;
}

struct rdev_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(struct md_rdev *, char *);
	ssize_t (*store)(struct md_rdev *, const char *, size_t);
};

static ssize_t
state_show(struct md_rdev *rdev, char *page)
{
	char *sep = "";
	size_t len = 0;
	unsigned long flags = ACCESS_ONCE(rdev->flags);

	if (test_bit(Faulty, &flags) ||
	    rdev->badblocks.unacked_exist) {
		len += sprintf(page+len, "%sfaulty", sep);
		sep = ",";
	}
	if (test_bit(In_sync, &flags)) {
		len += sprintf(page+len, "%sin_sync", sep);
		sep = ",";
	}
	if (test_bit(Journal, &flags)) {
		len += sprintf(page+len, "%sjournal", sep);
		sep = ",";
	}
	if (test_bit(WriteMostly, &flags)) {
		len += sprintf(page+len, "%swrite_mostly", sep);
		sep = ",";
	}
	if (test_bit(Blocked, &flags) ||
	    (rdev->badblocks.unacked_exist
	     && !test_bit(Faulty, &flags))) {
		len += sprintf(page+len, "%sblocked", sep);
		sep = ",";
	}
	if (!test_bit(Faulty, &flags) &&
	    !test_bit(Journal, &flags) &&
	    !test_bit(In_sync, &flags)) {
		len += sprintf(page+len, "%sspare", sep);
		sep = ",";
	}
	if (test_bit(WriteErrorSeen, &flags)) {
		len += sprintf(page+len, "%swrite_error", sep);
		sep = ",";
	}
	if (test_bit(WantReplacement, &flags)) {
		len += sprintf(page+len, "%swant_replacement", sep);
		sep = ",";
	}
	if (test_bit(Replacement, &flags)) {
		len += sprintf(page+len, "%sreplacement", sep);
		sep = ",";
	}

	return len+sprintf(page+len, "\n");
}

static ssize_t
state_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	/* can write
	 * faulty  - simulates an error
	 * remove  - disconnects the device
	 * writemostly - sets write_mostly
	 * -writemostly - clears write_mostly
	 * blocked - sets the Blocked flag
	 * -blocked - clears the Blocked flag and possibly simulates an error
	 * insync - sets In_sync provided the device isn't active
	 * -insync - clears In_sync for a device with a slot assigned,
	 *           so that it gets rebuilt based on bitmap
	 * write_error - sets WriteErrorSeen
	 * -write_error - clears WriteErrorSeen
	 */
	int err = -EINVAL;
	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);
		if (test_bit(Faulty, &rdev->flags))
			err = 0;
		else
			err = -EBUSY;
	} else if (cmd_match(buf, "remove")) {
		if (rdev->raid_disk >= 0)
			err = -EBUSY;
		else {
			struct mddev *mddev = rdev->mddev;
			err = 0;
			if (mddev_is_clustered(mddev))
				err = md_cluster_ops->remove_disk(mddev, rdev);

			if (err == 0) {
				md_kick_rdev_from_array(rdev);
				if (mddev->pers)
					md_update_sb(mddev, 1);
				md_new_event(mddev);
			}
		}
	} else if (cmd_match(buf, "writemostly")) {
		set_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-writemostly")) {
		clear_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "blocked")) {
		set_bit(Blocked, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-blocked")) {
		if (!test_bit(Faulty, &rdev->flags) &&
		    rdev->badblocks.unacked_exist) {
			/* metadata handler doesn't understand badblocks,
			 * so we need to fail the device
			 */
			md_error(rdev->mddev, rdev);
		}
		clear_bit(Blocked, &rdev->flags);
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);

		err = 0;
	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
		set_bit(In_sync, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
		   !test_bit(Journal, &rdev->flags)) {
		if (rdev->mddev->pers == NULL) {
			clear_bit(In_sync, &rdev->flags);
			rdev->saved_raid_disk = rdev->raid_disk;
			rdev->raid_disk = -1;
			err = 0;
		}
	} else if (cmd_match(buf, "write_error")) {
		set_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-write_error")) {
		clear_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "want_replacement")) {
		/* Any non-spare device that is not a replacement can
		 * become want_replacement at any time, but we then need to
		 * check if recovery is needed.
		 */
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(Replacement, &rdev->flags))
			set_bit(WantReplacement, &rdev->flags);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
		err = 0;
	} else if (cmd_match(buf, "-want_replacement")) {
		/* Clearing 'want_replacement' is always allowed.
		 * Once replacement starts it is too late though.
		 */
		err = 0;
		clear_bit(WantReplacement, &rdev->flags);
	} else if (cmd_match(buf, "replacement")) {
		/* Can only set a device as a replacement when array has not
		 * yet been started.  Once running, replacement is automatic
		 * from spares, or by assigning 'slot'.
		 */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			set_bit(Replacement, &rdev->flags);
			err = 0;
		}
	} else if (cmd_match(buf, "-replacement")) {
		/* Similarly, can only clear Replacement before start */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			clear_bit(Replacement, &rdev->flags);
			err = 0;
		}
	} else if (cmd_match(buf, "re-add")) {
		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
			/* clear_bit is performed _after_ all the devices
			 * have their local Faulty bit cleared. If any writes
			 * happen in the meantime in the local node, they
			 * will land in the local bitmap, which will be synced
			 * by this node eventually
			 */
			if (!mddev_is_clustered(rdev->mddev) ||
			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
				clear_bit(Faulty, &rdev->flags);
				err = add_bound_rdev(rdev);
			}
		} else
			err = -EBUSY;
	}
	if (!err)
		sysfs_notify_dirent_safe(rdev->sysfs_state);
	return err ? err : len;
}
static struct rdev_sysfs_entry rdev_state =
__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);

static ssize_t
errors_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
}

static ssize_t
errors_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	unsigned int n;
	int rv;

	rv = kstrtouint(buf, 10, &n);
	if (rv < 0)
		return rv;
	atomic_set(&rdev->corrected_errors, n);
	return len;
}
static struct rdev_sysfs_entry rdev_errors =
__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);

static ssize_t
slot_show(struct md_rdev *rdev, char *page)
{
	if (test_bit(Journal, &rdev->flags))
		return sprintf(page, "journal\n");
	else if (rdev->raid_disk < 0)
		return sprintf(page, "none\n");
	else
		return sprintf(page, "%d\n", rdev->raid_disk);
}

static ssize_t
slot_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	int slot;
	int err;

	if (test_bit(Journal, &rdev->flags))
		return -EBUSY;
	if (strncmp(buf, "none", 4) == 0)
		slot = -1;
	else {
		err = kstrtouint(buf, 10, (unsigned int *)&slot);
		if (err < 0)
			return err;
	}
	if (rdev->mddev->pers && slot == -1) {
		/* Setting 'slot' on an active array requires also
		 * updating the 'rd%d' link, and communicating
		 * with the personality with ->hot_*_disk.
		 * For now we only support removing
		 * failed/spare devices.  This normally happens automatically,
		 * but not when the metadata is externally managed.
		 */
		if (rdev->raid_disk == -1)
			return -EEXIST;
		/* personality does all needed checks */
		if (rdev->mddev->pers->hot_remove_disk == NULL)
			return -EINVAL;
		clear_bit(Blocked, &rdev->flags);
		remove_and_add_spares(rdev->mddev, rdev);
		if (rdev->raid_disk >= 0)
			return -EBUSY;
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
	} else if (rdev->mddev->pers) {
		/* Activating a spare .. or possibly reactivating
		 * if we ever get bitmaps working here.
		 */

		if (rdev->raid_disk != -1)
			return -EBUSY;

		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
			return -EBUSY;

		if (rdev->mddev->pers->hot_add_disk == NULL)
			return -EINVAL;

		if (slot >= rdev->mddev->raid_disks &&
		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
			return -ENOSPC;

		rdev->raid_disk = slot;
		if (test_bit(In_sync, &rdev->flags))
			rdev->saved_raid_disk = slot;
		else
			rdev->saved_raid_disk = -1;
		clear_bit(In_sync, &rdev->flags);
		clear_bit(Bitmap_sync, &rdev->flags);
		remove_and_add_spares(rdev->mddev, rdev);
		if (rdev->raid_disk == -1)
			return -EBUSY;
		/* don't wakeup anyone, leave that to userspace. */
	} else {
		if (slot >= rdev->mddev->raid_disks &&
		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
			return -ENOSPC;
		rdev->raid_disk = slot;
		/* assume it is working */
		clear_bit(Faulty, &rdev->flags);
		clear_bit(WriteMostly, &rdev->flags);
		set_bit(In_sync, &rdev->flags);
		sysfs_notify_dirent_safe(rdev->sysfs_state);
	}
	return len;
}

static struct rdev_sysfs_entry rdev_slot =
__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);

static ssize_t
offset_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
}

static ssize_t
offset_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	unsigned long long offset;
	if (kstrtoull(buf, 10, &offset) < 0)
		return -EINVAL;
	if (rdev->mddev->pers && rdev->raid_disk >= 0)
		return -EBUSY;
	if (rdev->sectors && rdev->mddev->external)
		/* Must set offset before size, so overlap checks
		 * can be sane */
		return -EBUSY;
	rdev->data_offset = offset;
	rdev->new_data_offset = offset;
	return len;
}

static struct rdev_sysfs_entry rdev_offset =
__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);

static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)rdev->new_data_offset);
}

static ssize_t new_offset_store(struct md_rdev *rdev,
				const char *buf, size_t len)
{
	unsigned long long new_offset;
	struct mddev *mddev = rdev->mddev;

	if (kstrtoull(buf, 10, &new_offset) < 0)
		return -EINVAL;

	if (mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;
	if (new_offset == rdev->data_offset)
		/* reset is always permitted */
		;
	else if (new_offset > rdev->data_offset) {
		/* must not push array size beyond rdev_sectors */
		if (new_offset - rdev->data_offset
		    + mddev->dev_sectors > rdev->sectors)
			return -E2BIG;
	}
	/* Metadata worries about other space details. */

	/* decreasing the offset is inconsistent with a backwards
	 * reshape.
	 */
	if (new_offset < rdev->data_offset &&
	    mddev->reshape_backwards)
		return -EINVAL;
	/* Increasing offset is inconsistent with forwards
	 * reshape.  reshape_direction should be set to
	 * 'backwards' first.
	 */
	if (new_offset > rdev->data_offset &&
	    !mddev->reshape_backwards)
		return -EINVAL;

	if (mddev->pers && mddev->persistent &&
	    !super_types[mddev->major_version]
	    .allow_new_offset(rdev, new_offset))
		return -E2BIG;
	rdev->new_data_offset = new_offset;
	if (new_offset > rdev->data_offset)
		mddev->reshape_backwards = 1;
	else if (new_offset < rdev->data_offset)
		mddev->reshape_backwards = 0;

	return len;
}
static struct rdev_sysfs_entry rdev_new_offset =
__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);

static ssize_t
rdev_size_show(struct md_rdev *rdev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
}

static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
{
	/* check if two start/length pairs overlap */
	if (s1+l1 <= s2)
		return 0;
	if (s2+l2 <= s1)
		return 0;
	return 1;
}
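/*
 * For example, overlaps(0, 100, 50, 100) returns 1 because [0,100)
 * and [50,150) intersect, while overlaps(0, 100, 100, 50) returns 0:
 * the ranges are treated as half-open, so merely touching at sector
 * 100 does not count as overlap.
 */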

static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
{
	unsigned long long blocks;
	sector_t new;

	if (kstrtoull(buf, 10, &blocks) < 0)
		return -EINVAL;

	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
		return -EINVAL; /* sector conversion overflow */

	new = blocks * 2;
	if (new != blocks * 2)
		return -EINVAL; /* unsigned long long to sector_t overflow */

	*sectors = new;
	return 0;
}
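/*
 * The two overflow checks above catch different failures: a set top
 * bit means blocks * 2 would wrap the unsigned long long itself,
 * while the "new != blocks * 2" comparison catches truncation when
 * sector_t is narrower than unsigned long long (e.g. a 32-bit
 * sector_t on kernels without large block device support).
 */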

static ssize_t
rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	struct mddev *my_mddev = rdev->mddev;
	sector_t oldsectors = rdev->sectors;
	sector_t sectors;

	if (test_bit(Journal, &rdev->flags))
		return -EBUSY;
	if (strict_blocks_to_sectors(buf, &sectors) < 0)
		return -EINVAL;
	if (rdev->data_offset != rdev->new_data_offset)
		return -EINVAL; /* too confusing */
	if (my_mddev->pers && rdev->raid_disk >= 0) {
		if (my_mddev->persistent) {
			sectors = super_types[my_mddev->major_version].
				rdev_size_change(rdev, sectors);
			if (!sectors)
				return -EBUSY;
		} else if (!sectors)
			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
				rdev->data_offset;
		if (!my_mddev->pers->resize)
			/* Cannot change size for RAID0 or Linear etc */
			return -EINVAL;
	}
	if (sectors < my_mddev->dev_sectors)
		return -EINVAL; /* component must fit device */

	rdev->sectors = sectors;
	if (sectors > oldsectors && my_mddev->external) {
		/* Need to check that all other rdevs with the same
		 * ->bdev do not overlap.  'rcu' is sufficient to walk
		 * the rdev lists safely.
		 * This check does not provide a hard guarantee, it
		 * just helps avoid dangerous mistakes.
		 */
		struct mddev *mddev;
		int overlap = 0;
		struct list_head *tmp;

		rcu_read_lock();
		for_each_mddev(mddev, tmp) {
			struct md_rdev *rdev2;

			rdev_for_each(rdev2, mddev)
				if (rdev->bdev == rdev2->bdev &&
				    rdev != rdev2 &&
				    overlaps(rdev->data_offset, rdev->sectors,
					     rdev2->data_offset,
					     rdev2->sectors)) {
					overlap = 1;
					break;
				}
			if (overlap) {
				mddev_put(mddev);
				break;
			}
		}
		rcu_read_unlock();
		if (overlap) {
			/* Someone else could have slipped in a size
			 * change here, but doing so is just silly.
			 * We put oldsectors back because we *know* it is
			 * safe, and trust userspace not to race with
			 * itself
			 */
			rdev->sectors = oldsectors;
			return -EBUSY;
		}
	}
	return len;
}

static struct rdev_sysfs_entry rdev_size =
__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);

static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
{
	unsigned long long recovery_start = rdev->recovery_offset;

	if (test_bit(In_sync, &rdev->flags) ||
	    recovery_start == MaxSector)
		return sprintf(page, "none\n");

	return sprintf(page, "%llu\n", recovery_start);
}

static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	unsigned long long recovery_start;

	if (cmd_match(buf, "none"))
		recovery_start = MaxSector;
	else if (kstrtoull(buf, 10, &recovery_start))
		return -EINVAL;

	if (rdev->mddev->pers &&
	    rdev->raid_disk >= 0)
		return -EBUSY;

	rdev->recovery_offset = recovery_start;
	if (recovery_start == MaxSector)
		set_bit(In_sync, &rdev->flags);
	else
		clear_bit(In_sync, &rdev->flags);
	return len;
}

static struct rdev_sysfs_entry rdev_recovery_start =
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);

static ssize_t
badblocks_show(struct badblocks *bb, char *page, int unack);
static ssize_t
badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);

static ssize_t bb_show(struct md_rdev *rdev, char *page)
{
	return badblocks_show(&rdev->badblocks, page, 0);
}
static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
{
	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
	/* Maybe that ack was all we needed */
	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
		wake_up(&rdev->blocked_wait);
	return rv;
}
static struct rdev_sysfs_entry rdev_bad_blocks =
__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);

static ssize_t ubb_show(struct md_rdev *rdev, char *page)
{
	return badblocks_show(&rdev->badblocks, page, 1);
}
static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
{
	return badblocks_store(&rdev->badblocks, page, len, 1);
}
static struct rdev_sysfs_entry rdev_unack_bad_blocks =
__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);

static struct attribute *rdev_default_attrs[] = {
	&rdev_state.attr,
	&rdev_errors.attr,
	&rdev_slot.attr,
	&rdev_offset.attr,
	&rdev_new_offset.attr,
	&rdev_size.attr,
	&rdev_recovery_start.attr,
	&rdev_bad_blocks.attr,
	&rdev_unack_bad_blocks.attr,
	NULL,
};
static ssize_t
rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);

	if (!entry->show)
		return -EIO;
	if (!rdev->mddev)
		return -EBUSY;
	return entry->show(rdev, page);
}

static ssize_t
rdev_attr_store(struct kobject *kobj, struct attribute *attr,
		const char *page, size_t length)
{
	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
	ssize_t rv;
	struct mddev *mddev = rdev->mddev;

	if (!entry->store)
		return -EIO;
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	rv = mddev ? mddev_lock(mddev) : -EBUSY;
	if (!rv) {
		if (rdev->mddev == NULL)
			rv = -EBUSY;
		else
			rv = entry->store(rdev, page, length);
		mddev_unlock(mddev);
	}
	return rv;
}

static void rdev_free(struct kobject *ko)
{
	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
	kfree(rdev);
}
static const struct sysfs_ops rdev_sysfs_ops = {
	.show		= rdev_attr_show,
	.store		= rdev_attr_store,
};
static struct kobj_type rdev_ktype = {
	.release	= rdev_free,
	.sysfs_ops	= &rdev_sysfs_ops,
	.default_attrs	= rdev_default_attrs,
};

int md_rdev_init(struct md_rdev *rdev)
{
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;
	rdev->raid_disk = -1;
	rdev->flags = 0;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_events = 0;
	rdev->last_read_error.tv_sec = 0;
	rdev->last_read_error.tv_nsec = 0;
	rdev->sb_loaded = 0;
	rdev->bb_page = NULL;
	atomic_set(&rdev->nr_pending, 0);
	atomic_set(&rdev->read_errors, 0);
	atomic_set(&rdev->corrected_errors, 0);

	INIT_LIST_HEAD(&rdev->same_set);
	init_waitqueue_head(&rdev->blocked_wait);

	/* Add space to store bad block list.
	 * This reserves the space even on arrays where it cannot
	 * be used - I wonder if that matters
	 */
	rdev->badblocks.count = 0;
	rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
	rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
	seqlock_init(&rdev->badblocks.lock);
	if (rdev->badblocks.page == NULL)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL_GPL(md_rdev_init);
/*
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
{
	char b[BDEVNAME_SIZE];
	int err;
	struct md_rdev *rdev;
	sector_t size;

	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev) {
		printk(KERN_ERR "md: could not alloc mem for new device!\n");
		return ERR_PTR(-ENOMEM);
	}

	err = md_rdev_init(rdev);
	if (err)
		goto abort_free;
	err = alloc_disk_sb(rdev);
	if (err)
		goto abort_free;

	err = lock_rdev(rdev, newdev, super_format == -2);
	if (err)
		goto abort_free;

	kobject_init(&rdev->kobj, &rdev_ktype);

	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
	if (!size) {
		printk(KERN_WARNING
		       "md: %s has zero or unknown size, marking faulty!\n",
		       bdevname(rdev->bdev, b));
		err = -EINVAL;
		goto abort_free;
	}

	if (super_format >= 0) {
		err = super_types[super_format].
			load_super(rdev, NULL, super_minor);
		if (err == -EINVAL) {
			printk(KERN_WARNING
			       "md: %s does not have a valid v%d.%d "
			       "superblock, not importing!\n",
			       bdevname(rdev->bdev, b),
			       super_format, super_minor);
			goto abort_free;
		}
		if (err < 0) {
			printk(KERN_WARNING
			       "md: could not read %s's sb, not importing!\n",
			       bdevname(rdev->bdev, b));
			goto abort_free;
		}
	}

	return rdev;

abort_free:
	if (rdev->bdev)
		unlock_rdev(rdev);
	md_rdev_clear(rdev);
	kfree(rdev);
	return ERR_PTR(err);
}

/*
 * Check a full RAID array for plausibility
 */

static void analyze_sbs(struct mddev *mddev)
{
	int i;
	struct md_rdev *rdev, *freshest, *tmp;
	char b[BDEVNAME_SIZE];

	freshest = NULL;
	rdev_for_each_safe(rdev, tmp, mddev)
		switch (super_types[mddev->major_version].
			load_super(rdev, freshest, mddev->minor_version)) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
			printk(KERN_ERR
			       "md: fatal superblock inconsistency in %s"
			       " -- removing from array\n",
			       bdevname(rdev->bdev, b));
			md_kick_rdev_from_array(rdev);
		}

	super_types[mddev->major_version].
		validate_super(mddev, freshest);

	i = 0;
	rdev_for_each_safe(rdev, tmp, mddev) {
		if (mddev->max_disks &&
		    (rdev->desc_nr >= mddev->max_disks ||
		     i > mddev->max_disks)) {
			printk(KERN_WARNING
			       "md: %s: %s: only %d devices permitted\n",
			       mdname(mddev), bdevname(rdev->bdev, b),
			       mddev->max_disks);
			md_kick_rdev_from_array(rdev);
			continue;
		}
		if (rdev != freshest) {
			if (super_types[mddev->major_version].
			    validate_super(mddev, rdev)) {
				printk(KERN_WARNING "md: kicking non-fresh %s"
				       " from array!\n",
				       bdevname(rdev->bdev, b));
				md_kick_rdev_from_array(rdev);
				continue;
			}
		}
		if (mddev->level == LEVEL_MULTIPATH) {
			rdev->desc_nr = i++;
			rdev->raid_disk = rdev->desc_nr;
			set_bit(In_sync, &rdev->flags);
		} else if (rdev->raid_disk >=
			   (mddev->raid_disks - min(0, mddev->delta_disks)) &&
			   !test_bit(Journal, &rdev->flags)) {
			rdev->raid_disk = -1;
			clear_bit(In_sync, &rdev->flags);
		}
	}
}

/* Read a fixed-point number.
 * Numbers in sysfs attributes should be in "standard" units where
 * possible, so time should be in seconds.
 * However we internally use a much smaller unit such as
 * milliseconds or jiffies.
 * This function takes a decimal number with a possible fractional
 * component, and produces an integer which is the result of
 * multiplying that number by 10^'scale',
 * all without any floating-point arithmetic.
 */
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long result = 0;
	long decimals = -1;
	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
		if (*cp == '.')
			decimals = 0;
		else if (decimals < scale) {
			unsigned int value;
			value = *cp - '0';
			result = result * 10 + value;
			if (decimals >= 0)
				decimals++;
		}
		cp++;
	}
	if (*cp == '\n')
		cp++;
	if (*cp)
		return -EINVAL;
	if (decimals < 0)
		decimals = 0;
	while (decimals < scale) {
		result *= 10;
		decimals++;
	}
	*res = result;
	return 0;
}
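
/* Worked example (editorial note, not in the original source): parsing
 * "12.34\n" with scale == 3 accumulates the digits 1,2,3,4 into 1234
 * with decimals == 2, then multiplies by 10 once more to pad to three
 * decimal places, storing 12340 - i.e. 12.34 seconds expressed in
 * milliseconds, computed without any floating-point arithmetic.
 */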

static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
{
	int msec = (mddev->safemode_delay*1000)/HZ;
	return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
}
static ssize_t
safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
{
	unsigned long msec;

	if (mddev_is_clustered(mddev)) {
		pr_info("md: Safemode is disabled for clustered mode\n");
		return -EINVAL;
	}

	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
		return -EINVAL;
	if (msec == 0)
		mddev->safemode_delay = 0;
	else {
		unsigned long old_delay = mddev->safemode_delay;
		unsigned long new_delay = (msec*HZ)/1000;

		if (new_delay == 0)
			new_delay = 1;
		mddev->safemode_delay = new_delay;
		if (new_delay < old_delay || old_delay == 0)
			mod_timer(&mddev->safemode_timer, jiffies+1);
	}
	return len;
}
static struct md_sysfs_entry md_safe_delay =
__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR, safe_delay_show, safe_delay_store);
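
/* Usage sketch (illustrative; the path assumes a typical sysfs layout):
 *   echo 0.200 > /sys/block/md0/md/safe_mode_delay
 * requests a ~200ms safemode delay. The value is parsed with
 * strict_strtoul_scaled(..., 3) into milliseconds and then converted
 * to jiffies, so the effective delay is rounded to the HZ tick.
 */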

static ssize_t
level_show(struct mddev *mddev, char *page)
{
	struct md_personality *p;
	int ret;
	spin_lock(&mddev->lock);
	p = mddev->pers;
	if (p)
		ret = sprintf(page, "%s\n", p->name);
	else if (mddev->clevel[0])
		ret = sprintf(page, "%s\n", mddev->clevel);
	else if (mddev->level != LEVEL_NONE)
		ret = sprintf(page, "%d\n", mddev->level);
	else
		ret = 0;
	spin_unlock(&mddev->lock);
	return ret;
}

static ssize_t
level_store(struct mddev *mddev, const char *buf, size_t len)
{
	char clevel[16];
	ssize_t rv;
	size_t slen = len;
	struct md_personality *pers, *oldpers;
	long level;
	void *priv, *oldpriv;
	struct md_rdev *rdev;

	if (slen == 0 || slen >= sizeof(clevel))
		return -EINVAL;

	rv = mddev_lock(mddev);
	if (rv)
		return rv;

	if (mddev->pers == NULL) {
		strncpy(mddev->clevel, buf, slen);
		if (mddev->clevel[slen-1] == '\n')
			slen--;
		mddev->clevel[slen] = 0;
		mddev->level = LEVEL_NONE;
		rv = len;
		goto out_unlock;
	}
	rv = -EROFS;
	if (mddev->ro)
		goto out_unlock;

	/* request to change the personality. Need to ensure:
	 *  - array is not engaged in resync/recovery/reshape
	 *  - old personality can be suspended
	 *  - new personality can take over the array
	 */

	rv = -EBUSY;
	if (mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->reshape_position != MaxSector ||
	    mddev->sysfs_active)
		goto out_unlock;

	rv = -EINVAL;
	if (!mddev->pers->quiesce) {
		printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
		       mdname(mddev), mddev->pers->name);
		goto out_unlock;
	}

	/* Now find the new personality */
	strncpy(clevel, buf, slen);
	if (clevel[slen-1] == '\n')
		slen--;
	clevel[slen] = 0;
	if (kstrtol(clevel, 10, &level))
		level = LEVEL_NONE;

	if (request_module("md-%s", clevel) != 0)
		request_module("md-level-%s", clevel);
	spin_lock(&pers_lock);
	pers = find_pers(level, clevel);
	if (!pers || !try_module_get(pers->owner)) {
		spin_unlock(&pers_lock);
		printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
		rv = -EINVAL;
		goto out_unlock;
	}
	spin_unlock(&pers_lock);

	if (pers == mddev->pers) {
		/* Nothing to do! */
		module_put(pers->owner);
		rv = len;
		goto out_unlock;
	}
	if (!pers->takeover) {
		module_put(pers->owner);
		printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
		       mdname(mddev), clevel);
		rv = -EINVAL;
		goto out_unlock;
	}

	rdev_for_each(rdev, mddev)
		rdev->new_raid_disk = rdev->raid_disk;

	/* ->takeover must set new_* and/or delta_disks
	 * if it succeeds, and may set them when it fails.
	 */
	priv = pers->takeover(mddev);
	if (IS_ERR(priv)) {
		mddev->new_level = mddev->level;
		mddev->new_layout = mddev->layout;
		mddev->new_chunk_sectors = mddev->chunk_sectors;
		mddev->raid_disks -= mddev->delta_disks;
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
		module_put(pers->owner);
		printk(KERN_WARNING "md: %s: %s would not accept array\n",
		       mdname(mddev), clevel);
		rv = PTR_ERR(priv);
		goto out_unlock;
	}

	/* Looks like we have a winner */
	mddev_suspend(mddev);
	mddev_detach(mddev);

	spin_lock(&mddev->lock);
	oldpers = mddev->pers;
	oldpriv = mddev->private;
	mddev->pers = pers;
	mddev->private = priv;
	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
	mddev->level = mddev->new_level;
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = mddev->new_chunk_sectors;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->degraded = 0;
	spin_unlock(&mddev->lock);

	if (oldpers->sync_request == NULL &&
	    mddev->external) {
		/* We are converting from a no-redundancy array
		 * to a redundancy array and metadata is managed
		 * externally so we need to be sure that writes
		 * won't block due to a need to transition
		 *      clean->dirty
		 * until external management is started.
		 */
		mddev->in_sync = 0;
		mddev->safemode_delay = 0;
		mddev->safemode = 0;
	}

	oldpers->free(mddev, oldpriv);

	if (oldpers->sync_request == NULL &&
	    pers->sync_request != NULL) {
		/* need to add the md_redundancy_group */
		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
			printk(KERN_WARNING
			       "md: cannot register extra attributes for %s\n",
			       mdname(mddev));
		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
	}
	if (oldpers->sync_request != NULL &&
	    pers->sync_request == NULL) {
		/* need to remove the md_redundancy_group */
		if (mddev->to_remove == NULL)
			mddev->to_remove = &md_redundancy_group;
	}

	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0)
			continue;
		if (rdev->new_raid_disk >= mddev->raid_disks)
			rdev->new_raid_disk = -1;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
		sysfs_unlink_rdev(mddev, rdev);
	}
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0)
			continue;
		if (rdev->new_raid_disk == rdev->raid_disk)
			continue;
		rdev->raid_disk = rdev->new_raid_disk;
		if (rdev->raid_disk < 0)
			clear_bit(In_sync, &rdev->flags);
		else {
			if (sysfs_link_rdev(mddev, rdev))
				printk(KERN_WARNING "md: cannot register rd%d"
				       " for %s after level change\n",
				       rdev->raid_disk, mdname(mddev));
		}
	}

	if (pers->sync_request == NULL) {
		/* this is now an array without redundancy, so
		 * it must always be in_sync
		 */
		mddev->in_sync = 1;
		del_timer_sync(&mddev->safemode_timer);
	}
	blk_set_stacking_limits(&mddev->queue->limits);
	pers->run(mddev);
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	mddev_resume(mddev);
	if (!mddev->thread)
		md_update_sb(mddev, 1);
	sysfs_notify(&mddev->kobj, NULL, "level");
	md_new_event(mddev);
	rv = len;
out_unlock:
	mddev_unlock(mddev);
	return rv;
}

static struct md_sysfs_entry md_level =
__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
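
/* Usage sketch (illustrative): an online RAID5 -> RAID6 conversion can
 * be requested with
 *   echo raid6 > /sys/block/md0/md/level
 * which loads the needed personality module if necessary, calls the
 * new personality's ->takeover(), and swaps mddev->pers under
 * mddev->lock while the array is suspended.
 */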

static ssize_t
layout_show(struct mddev *mddev, char *page)
{
	/* just a number, not meaningful for all levels */
	if (mddev->reshape_position != MaxSector &&
	    mddev->layout != mddev->new_layout)
		return sprintf(page, "%d (%d)\n",
			       mddev->new_layout, mddev->layout);
	return sprintf(page, "%d\n", mddev->layout);
}

static ssize_t
layout_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned int n;
	int err;

	err = kstrtouint(buf, 10, &n);
	if (err < 0)
		return err;
	err = mddev_lock(mddev);
	if (err)
		return err;

	if (mddev->pers) {
		if (mddev->pers->check_reshape == NULL)
			err = -EBUSY;
		else if (mddev->ro)
			err = -EROFS;
		else {
			mddev->new_layout = n;
			err = mddev->pers->check_reshape(mddev);
			if (err)
				mddev->new_layout = mddev->layout;
		}
	} else {
		mddev->new_layout = n;
		if (mddev->reshape_position == MaxSector)
			mddev->layout = n;
	}
	mddev_unlock(mddev);
	return err ?: len;
}
static struct md_sysfs_entry md_layout =
__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);

static ssize_t
raid_disks_show(struct mddev *mddev, char *page)
{
	if (mddev->raid_disks == 0)
		return 0;
	if (mddev->reshape_position != MaxSector &&
	    mddev->delta_disks != 0)
		return sprintf(page, "%d (%d)\n", mddev->raid_disks,
			       mddev->raid_disks - mddev->delta_disks);
	return sprintf(page, "%d\n", mddev->raid_disks);
}

static int update_raid_disks(struct mddev *mddev, int raid_disks);

static ssize_t
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned int n;
	int err;

	err = kstrtouint(buf, 10, &n);
	if (err < 0)
		return err;

	err = mddev_lock(mddev);
	if (err)
		return err;
	if (mddev->pers)
		err = update_raid_disks(mddev, n);
	else if (mddev->reshape_position != MaxSector) {
		struct md_rdev *rdev;
		int olddisks = mddev->raid_disks - mddev->delta_disks;

		err = -EINVAL;
		rdev_for_each(rdev, mddev) {
			if (olddisks < n &&
			    rdev->data_offset < rdev->new_data_offset)
				goto out_unlock;
			if (olddisks > n &&
			    rdev->data_offset > rdev->new_data_offset)
				goto out_unlock;
		}
		err = 0;
		mddev->delta_disks = n - olddisks;
		mddev->raid_disks = n;
		mddev->reshape_backwards = (mddev->delta_disks < 0);
	} else
		mddev->raid_disks = n;
out_unlock:
	mddev_unlock(mddev);
	return err ? err : len;
}
static struct md_sysfs_entry md_raid_disks =
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);

static ssize_t
chunk_size_show(struct mddev *mddev, char *page)
{
	if (mddev->reshape_position != MaxSector &&
	    mddev->chunk_sectors != mddev->new_chunk_sectors)
		return sprintf(page, "%d (%d)\n",
			       mddev->new_chunk_sectors << 9,
			       mddev->chunk_sectors << 9);
	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
}

static ssize_t
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long n;
	int err;

	err = kstrtoul(buf, 10, &n);
	if (err < 0)
		return err;

	err = mddev_lock(mddev);
	if (err)
		return err;
	if (mddev->pers) {
		if (mddev->pers->check_reshape == NULL)
			err = -EBUSY;
		else if (mddev->ro)
			err = -EROFS;
		else {
			mddev->new_chunk_sectors = n >> 9;
			err = mddev->pers->check_reshape(mddev);
			if (err)
				mddev->new_chunk_sectors = mddev->chunk_sectors;
		}
	} else {
		mddev->new_chunk_sectors = n >> 9;
		if (mddev->reshape_position == MaxSector)
			mddev->chunk_sectors = n >> 9;
	}
	mddev_unlock(mddev);
	return err ?: len;
}
static struct md_sysfs_entry md_chunk_size =
__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);

static ssize_t
resync_start_show(struct mddev *mddev, char *page)
{
	if (mddev->recovery_cp == MaxSector)
		return sprintf(page, "none\n");
	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
}

static ssize_t
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long long n;
	int err;

	if (cmd_match(buf, "none"))
		n = MaxSector;
	else {
		err = kstrtoull(buf, 10, &n);
		if (err < 0)
			return err;
		if (n != (sector_t)n)
			return -EINVAL;
	}

	err = mddev_lock(mddev);
	if (err)
		return err;
	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
		err = -EBUSY;

	if (!err) {
		mddev->recovery_cp = n;
		if (mddev->pers)
			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
	}
	mddev_unlock(mddev);
	return err ?: len;
}
static struct md_sysfs_entry md_resync_start =
__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
		resync_start_show, resync_start_store);

/*
 * The array state can be:
 *
 * clear
 *     No devices, no size, no level
 *     Equivalent to STOP_ARRAY ioctl
 * inactive
 *     May have some settings, but array is not active
 *        all IO results in error
 *     When written, doesn't tear down array, but just stops it
 * suspended (not supported yet)
 *     All IO requests will block. The array can be reconfigured.
 *     Writing this, if accepted, will block until array is quiescent
 * readonly
 *     no resync can happen. no superblocks get written.
 *     write requests fail
 * read-auto
 *     like readonly, but behaves like 'clean' on a write request.
 *
 * clean - no pending writes, but otherwise active.
 *     When written to inactive array, starts without resync
 *     If a write request arrives then
 *       if metadata is known, mark 'dirty' and switch to 'active'.
 *       if not known, block and switch to write-pending
 *     If written to an active array that has pending writes, then fails.
 * active
 *     fully active: IO and resync can be happening.
 *     When written to inactive array, starts with resync
 *
 * write-pending
 *     clean, but writes are blocked waiting for 'active' to be written.
 *
 * active-idle
 *     like active, but no writes have been seen for a while (100msec).
 *
 */
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
		   write_pending, active_idle, bad_word};
static char *array_states[] = {
	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
	"write-pending", "active-idle", NULL };

static int match_word(const char *word, char **list)
{
	int n;
	for (n=0; list[n]; n++)
		if (cmd_match(word, list[n]))
			break;
	return n;
}

static ssize_t
array_state_show(struct mddev *mddev, char *page)
{
	enum array_state st = inactive;

	if (mddev->pers)
		switch(mddev->ro) {
		case 1:
			st = readonly;
			break;
		case 2:
			st = read_auto;
			break;
		case 0:
			if (mddev->in_sync)
				st = clean;
			else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
				st = write_pending;
			else if (mddev->safemode)
				st = active_idle;
			else
				st = active;
		}
	else {
		if (list_empty(&mddev->disks) &&
		    mddev->raid_disks == 0 &&
		    mddev->dev_sectors == 0)
			st = clear;
		else
			st = inactive;
	}
	return sprintf(page, "%s\n", array_states[st]);
}

static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
static int do_md_run(struct mddev *mddev);
static int restart_array(struct mddev *mddev);

static ssize_t
array_state_store(struct mddev *mddev, const char *buf, size_t len)
{
	int err;
	enum array_state st = match_word(buf, array_states);

	if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
		/* don't take reconfig_mutex when toggling between
		 * clean and active
		 */
		spin_lock(&mddev->lock);
		if (st == active) {
			restart_array(mddev);
			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
			wake_up(&mddev->sb_wait);
			err = 0;
		} else /* st == clean */ {
			restart_array(mddev);
			if (atomic_read(&mddev->writes_pending) == 0) {
				if (mddev->in_sync == 0) {
					mddev->in_sync = 1;
					if (mddev->safemode == 1)
						mddev->safemode = 0;
					set_bit(MD_CHANGE_CLEAN, &mddev->flags);
				}
				err = 0;
			} else
				err = -EBUSY;
		}
		spin_unlock(&mddev->lock);
		return err ?: len;
	}
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EINVAL;
	switch(st) {
	case bad_word:
		break;
	case clear:
		/* stopping an active array */
		err = do_md_stop(mddev, 0, NULL);
		break;
	case inactive:
		/* stopping an active array */
		if (mddev->pers)
			err = do_md_stop(mddev, 2, NULL);
		else
			err = 0; /* already inactive */
		break;
	case suspended:
		break; /* not supported yet */
	case readonly:
		if (mddev->pers)
			err = md_set_readonly(mddev, NULL);
		else {
			mddev->ro = 1;
			set_disk_ro(mddev->gendisk, 1);
			err = do_md_run(mddev);
		}
		break;
	case read_auto:
		if (mddev->pers) {
			if (mddev->ro == 0)
				err = md_set_readonly(mddev, NULL);
			else if (mddev->ro == 1)
				err = restart_array(mddev);
			if (err == 0) {
				mddev->ro = 2;
				set_disk_ro(mddev->gendisk, 0);
			}
		} else {
			mddev->ro = 2;
			err = do_md_run(mddev);
		}
		break;
	case clean:
		if (mddev->pers) {
			err = restart_array(mddev);
			if (err)
				break;
			spin_lock(&mddev->lock);
			if (atomic_read(&mddev->writes_pending) == 0) {
				if (mddev->in_sync == 0) {
					mddev->in_sync = 1;
					if (mddev->safemode == 1)
						mddev->safemode = 0;
					set_bit(MD_CHANGE_CLEAN, &mddev->flags);
				}
				err = 0;
			} else
				err = -EBUSY;
			spin_unlock(&mddev->lock);
		} else
			err = -EINVAL;
		break;
	case active:
		if (mddev->pers) {
			err = restart_array(mddev);
			if (err)
				break;
			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
			wake_up(&mddev->sb_wait);
			err = 0;
		} else {
			mddev->ro = 0;
			set_disk_ro(mddev->gendisk, 0);
			err = do_md_run(mddev);
		}
		break;
	case write_pending:
	case active_idle:
		/* these cannot be set */
		break;
	}

	if (!err) {
		if (mddev->hold_active == UNTIL_IOCTL)
			mddev->hold_active = 0;
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	}
	mddev_unlock(mddev);
	return err ?: len;
}
static struct md_sysfs_entry md_array_state =
__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);

static ssize_t
max_corrected_read_errors_show(struct mddev *mddev, char *page) {
	return sprintf(page, "%d\n",
		       atomic_read(&mddev->max_corr_read_errors));
}

static ssize_t
max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned int n;
	int rv;

	rv = kstrtouint(buf, 10, &n);
	if (rv < 0)
		return rv;
	atomic_set(&mddev->max_corr_read_errors, n);
	return len;
}

static struct md_sysfs_entry max_corr_read_errors =
__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
	max_corrected_read_errors_store);

static ssize_t
null_show(struct mddev *mddev, char *page)
{
	return -EINVAL;
}

static ssize_t
new_dev_store(struct mddev *mddev, const char *buf, size_t len)
{
	/* buf must be %d:%d\n? giving major and minor numbers */
	/* The new device is added to the array.
	 * If the array has a persistent superblock, we read the
	 * superblock to initialise info and check validity.
	 * Otherwise, only checking done is that in bind_rdev_to_array,
	 * which mainly checks size.
	 */
	char *e;
	int major = simple_strtoul(buf, &e, 10);
	int minor;
	dev_t dev;
	struct md_rdev *rdev;
	int err;

	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
		return -EINVAL;
	minor = simple_strtoul(e+1, &e, 10);
	if (*e && *e != '\n')
		return -EINVAL;
	dev = MKDEV(major, minor);
	if (major != MAJOR(dev) ||
	    minor != MINOR(dev))
		return -EOVERFLOW;

	flush_workqueue(md_misc_wq);

	err = mddev_lock(mddev);
	if (err)
		return err;
	if (mddev->persistent) {
		rdev = md_import_device(dev, mddev->major_version,
					mddev->minor_version);
		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0)
				goto out;
		}
	} else if (mddev->external)
		rdev = md_import_device(dev, -2, -1);
	else
		rdev = md_import_device(dev, -1, -1);

	if (IS_ERR(rdev)) {
		mddev_unlock(mddev);
		return PTR_ERR(rdev);
	}
	err = bind_rdev_to_array(rdev, mddev);
 out:
	if (err)
		export_rdev(rdev);
	mddev_unlock(mddev);
	return err ? err : len;
}

static struct md_sysfs_entry md_new_device =
__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
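
/* Usage sketch (illustrative): a device is named by major:minor, so
 *   echo "8:32" > /sys/block/md0/md/new_dev
 * would ask md to import /dev/sdc (major 8, minor 32 on typical
 * systems) and bind it to the array, reading its superblock first
 * when the array uses persistent metadata.
 */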

static ssize_t
bitmap_store(struct mddev *mddev, const char *buf, size_t len)
{
	char *end;
	unsigned long chunk, end_chunk;
	int err;

	err = mddev_lock(mddev);
	if (err)
		return err;
	if (!mddev->bitmap)
		goto out;
	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
	while (*buf) {
		chunk = end_chunk = simple_strtoul(buf, &end, 0);
		if (buf == end) break;
		if (*end == '-') { /* range */
			buf = end + 1;
			end_chunk = simple_strtoul(buf, &end, 0);
			if (buf == end) break;
		}
		if (*end && !isspace(*end)) break;
		bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
		buf = skip_spaces(end);
	}
	bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
out:
	mddev_unlock(mddev);
	return len;
}

static struct md_sysfs_entry md_bitmap =
__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
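
/* Usage sketch (illustrative): mark bitmap chunks 100-200 plus chunk
 * 250 dirty so those regions get resynced:
 *   echo "100-200 250" > /sys/block/md0/md/bitmap_set_bits
 * Numbers are parsed with simple_strtoul(..., 0), so hex (0x...) and
 * octal forms are accepted as well.
 */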

static ssize_t
size_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)mddev->dev_sectors / 2);
}

static int update_size(struct mddev *mddev, sector_t num_sectors);

static ssize_t
size_store(struct mddev *mddev, const char *buf, size_t len)
{
	/* If array is inactive, we can reduce the component size, but
	 * not increase it (except from 0).
	 * If array is active, we can try an on-line resize
	 */
	sector_t sectors;
	int err = strict_blocks_to_sectors(buf, &sectors);

	if (err < 0)
		return err;
	err = mddev_lock(mddev);
	if (err)
		return err;
	if (mddev->pers) {
		err = update_size(mddev, sectors);
		md_update_sb(mddev, 1);
	} else {
		if (mddev->dev_sectors == 0 ||
		    mddev->dev_sectors > sectors)
			mddev->dev_sectors = sectors;
		else
			err = -ENOSPC;
	}
	mddev_unlock(mddev);
	return err ? err : len;
}

static struct md_sysfs_entry md_size =
__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);

/* Metadata version.
 * This is one of
 *   'none' for arrays with no metadata (good luck...)
 *   'external' for arrays with externally managed metadata,
 * or N.M for internally known formats
 */
static ssize_t
metadata_show(struct mddev *mddev, char *page)
{
	if (mddev->persistent)
		return sprintf(page, "%d.%d\n",
			       mddev->major_version, mddev->minor_version);
	else if (mddev->external)
		return sprintf(page, "external:%s\n", mddev->metadata_type);
	else
		return sprintf(page, "none\n");
}

static ssize_t
metadata_store(struct mddev *mddev, const char *buf, size_t len)
{
	int major, minor;
	char *e;
	int err;
	/* Changing the details of 'external' metadata is
	 * always permitted. Otherwise there must be
	 * no devices attached to the array.
	 */

	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EBUSY;
	if (mddev->external && strncmp(buf, "external:", 9) == 0)
		;
	else if (!list_empty(&mddev->disks))
		goto out_unlock;

	err = 0;
	if (cmd_match(buf, "none")) {
		mddev->persistent = 0;
		mddev->external = 0;
		mddev->major_version = 0;
		mddev->minor_version = 90;
		goto out_unlock;
	}
	if (strncmp(buf, "external:", 9) == 0) {
		size_t namelen = len-9;
		if (namelen >= sizeof(mddev->metadata_type))
			namelen = sizeof(mddev->metadata_type)-1;
		strncpy(mddev->metadata_type, buf+9, namelen);
		mddev->metadata_type[namelen] = 0;
		if (namelen && mddev->metadata_type[namelen-1] == '\n')
			mddev->metadata_type[--namelen] = 0;
		mddev->persistent = 0;
		mddev->external = 1;
		mddev->major_version = 0;
		mddev->minor_version = 90;
		goto out_unlock;
	}
	major = simple_strtoul(buf, &e, 10);
	err = -EINVAL;
	if (e==buf || *e != '.')
		goto out_unlock;
	buf = e+1;
	minor = simple_strtoul(buf, &e, 10);
	if (e==buf || (*e && *e != '\n') )
		goto out_unlock;
	err = -ENOENT;
	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
		goto out_unlock;
	mddev->major_version = major;
	mddev->minor_version = minor;
	mddev->persistent = 1;
	mddev->external = 0;
	err = 0;
out_unlock:
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_metadata =
__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
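
/* Usage sketch (illustrative): before assembling an array whose
 * metadata is managed from userspace, a manager might write e.g.
 *   echo "external:imsm" > /sys/block/md0/md/metadata_version
 * whereas "1.2" would select an internally known superblock format
 * and "none" disables persistent metadata entirely.
 */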

static ssize_t
action_show(struct mddev *mddev, char *page)
{
	char *type = "idle";
	unsigned long recovery = mddev->recovery;
	if (test_bit(MD_RECOVERY_FROZEN, &recovery))
		type = "frozen";
	else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
	    (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
			type = "reshape";
		else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
			if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
				type = "resync";
			else if (test_bit(MD_RECOVERY_CHECK, &recovery))
				type = "check";
			else
				type = "repair";
		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
			type = "recover";
		else if (mddev->reshape_position != MaxSector)
			type = "reshape";
	}
	return sprintf(page, "%s\n", type);
}

static ssize_t
action_store(struct mddev *mddev, const char *page, size_t len)
{
	if (!mddev->pers || !mddev->pers->sync_request)
		return -EINVAL;

	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
		if (cmd_match(page, "frozen"))
			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		else
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    mddev_lock(mddev) == 0) {
			flush_workqueue(md_misc_wq);
			if (mddev->sync_thread) {
				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
				md_reap_sync_thread(mddev);
			}
			mddev_unlock(mddev);
		}
	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return -EBUSY;
	else if (cmd_match(page, "resync"))
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	else if (cmd_match(page, "recover")) {
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	} else if (cmd_match(page, "reshape")) {
		int err;
		if (mddev->pers->start_reshape == NULL)
			return -EINVAL;
		err = mddev_lock(mddev);
		if (!err) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
			err = mddev->pers->start_reshape(mddev);
			mddev_unlock(mddev);
		}
		if (err)
			return err;
		sysfs_notify(&mddev->kobj, NULL, "degraded");
	} else {
		if (cmd_match(page, "check"))
			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		else if (!cmd_match(page, "repair"))
			return -EINVAL;
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	}
	if (mddev->ro == 2) {
		/* A write to sync_action is enough to justify
		 * canceling read-auto mode
		 */
		mddev->ro = 0;
		md_wakeup_thread(mddev->sync_thread);
	}
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	return len;
}

static struct md_sysfs_entry md_scan_mode =
__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
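
/* Usage sketch (illustrative): a periodic scrub typically runs
 *   echo check > /sys/block/md0/md/sync_action
 * and reads mismatch_cnt afterwards; "repair" additionally rewrites
 * inconsistent stripes, while "idle" interrupts a running scan.
 */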

static ssize_t
last_sync_action_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%s\n", mddev->last_sync_action);
}

static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);

static ssize_t
mismatch_cnt_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)
		       atomic64_read(&mddev->resync_mismatches));
}

static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);

static ssize_t
sync_min_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d (%s)\n", speed_min(mddev),
		       mddev->sync_speed_min ? "local": "system");
}

static ssize_t
sync_min_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned int min;
	int rv;

	if (strncmp(buf, "system", 6)==0) {
		min = 0;
	} else {
		rv = kstrtouint(buf, 10, &min);
		if (rv < 0)
			return rv;
		if (min == 0)
			return -EINVAL;
	}
	mddev->sync_speed_min = min;
	return len;
}

static struct md_sysfs_entry md_sync_min =
__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);

static ssize_t
sync_max_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d (%s)\n", speed_max(mddev),
		       mddev->sync_speed_max ? "local": "system");
}

static ssize_t
sync_max_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned int max;
	int rv;

	if (strncmp(buf, "system", 6)==0) {
		max = 0;
	} else {
		rv = kstrtouint(buf, 10, &max);
		if (rv < 0)
			return rv;
		if (max == 0)
			return -EINVAL;
	}
	mddev->sync_speed_max = max;
	return len;
}

static struct md_sysfs_entry md_sync_max =
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
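
/* Usage sketch (illustrative): cap resync throughput for one array at
 * about 50 MB/s (values are in KB/s), then fall back to the
 * system-wide default:
 *   echo 50000 > /sys/block/md0/md/sync_speed_max
 *   echo system > /sys/block/md0/md/sync_speed_max
 */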

static ssize_t
degraded_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d\n", mddev->degraded);
}
static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);

static ssize_t
sync_force_parallel_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%d\n", mddev->parallel_resync);
}

static ssize_t
sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
{
	long n;

	if (kstrtol(buf, 10, &n))
		return -EINVAL;

	if (n != 0 && n != 1)
		return -EINVAL;

	mddev->parallel_resync = n;

	if (mddev->sync_thread)
		wake_up(&resync_wait);

	return len;
}

/* force parallel resync, even with shared block devices */
static struct md_sysfs_entry md_sync_force_parallel =
__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
       sync_force_parallel_show, sync_force_parallel_store);

static ssize_t
sync_speed_show(struct mddev *mddev, char *page)
{
	unsigned long resync, dt, db;
	if (mddev->curr_resync == 0)
		return sprintf(page, "none\n");
	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
	dt = (jiffies - mddev->resync_mark) / HZ;
	if (!dt) dt++;
	db = resync - mddev->resync_mark_cnt;
	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
}

static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
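
/* Worked example (editorial note, not in the original source): if
 * 409600 sectors have completed since a mark taken 10 seconds ago,
 * db/dt/2 is 409600 / 10 / 2 = 20480, reported as 20480 KB/s; the
 * final /2 converts 512-byte sectors per second into kilobytes per
 * second.
 */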

static ssize_t
sync_completed_show(struct mddev *mddev, char *page)
{
	unsigned long long max_sectors, resync;

	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return sprintf(page, "none\n");

	if (mddev->curr_resync == 1 ||
	    mddev->curr_resync == 2)
		return sprintf(page, "delayed\n");

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		max_sectors = mddev->resync_max_sectors;
	else
		max_sectors = mddev->dev_sectors;

	resync = mddev->curr_resync_completed;
	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
}

static struct md_sysfs_entry md_sync_completed =
	__ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);

static ssize_t
min_sync_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n",
		       (unsigned long long)mddev->resync_min);
}
static ssize_t
min_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long long min;
	int err;

	if (kstrtoull(buf, 10, &min))
		return -EINVAL;

	spin_lock(&mddev->lock);
	err = -EINVAL;
	if (min > mddev->resync_max)
		goto out_unlock;

	err = -EBUSY;
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		goto out_unlock;

	/* Round down to multiple of 4K for safety */
	mddev->resync_min = round_down(min, 8);
	err = 0;

out_unlock:
	spin_unlock(&mddev->lock);
	return err ?: len;
}

static struct md_sysfs_entry md_min_sync =
__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);

static ssize_t
max_sync_show(struct mddev *mddev, char *page)
{
	if (mddev->resync_max == MaxSector)
		return sprintf(page, "max\n");
	else
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->resync_max);
}
static ssize_t
max_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
	int err;
	spin_lock(&mddev->lock);
	if (strncmp(buf, "max", 3) == 0)
		mddev->resync_max = MaxSector;
	else {
		unsigned long long max;
		int chunk;

		err = -EINVAL;
		if (kstrtoull(buf, 10, &max))
			goto out_unlock;
		if (max < mddev->resync_min)
			goto out_unlock;

		err = -EBUSY;
		if (max < mddev->resync_max &&
		    mddev->ro == 0 &&
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
			goto out_unlock;

		/* Must be a multiple of chunk_size */
		chunk = mddev->chunk_sectors;
		if (chunk) {
			sector_t temp = max;

			err = -EINVAL;
			if (sector_div(temp, chunk))
				goto out_unlock;
		}
		mddev->resync_max = max;
	}
	wake_up(&mddev->recovery_wait);
	err = 0;
out_unlock:
	spin_unlock(&mddev->lock);
	return err ?: len;
}

static struct md_sysfs_entry md_max_sync =
__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);

static ssize_t
suspend_lo_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
}

static ssize_t
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long long old, new;
	int err;

	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
		return -EINVAL;

	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EINVAL;
	if (mddev->pers == NULL ||
	    mddev->pers->quiesce == NULL)
		goto unlock;
	old = mddev->suspend_lo;
	mddev->suspend_lo = new;
	if (new >= old)
		/* Shrinking suspended region */
		mddev->pers->quiesce(mddev, 2);
	else {
		/* Expanding suspended region - need to wait */
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);

static ssize_t
suspend_hi_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
}

static ssize_t
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long long old, new;
	int err;

	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
		return -EINVAL;

	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EINVAL;
	if (mddev->pers == NULL ||
	    mddev->pers->quiesce == NULL)
		goto unlock;
	old = mddev->suspend_hi;
	mddev->suspend_hi = new;
	if (new <= old)
		/* Shrinking suspended region */
		mddev->pers->quiesce(mddev, 2);
	else {
		/* Expanding suspended region - need to wait */
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
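
/* Usage sketch (illustrative): external metadata managers suspend I/O
 * to a sector window while relocating data, e.g.
 *   echo 0 > /sys/block/md0/md/suspend_lo
 *   echo 1048576 > /sys/block/md0/md/suspend_hi
 * Expanding the window calls ->quiesce(mddev, 1) then
 * ->quiesce(mddev, 0) and so waits for in-flight requests to drain.
 */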

static ssize_t
reshape_position_show(struct mddev *mddev, char *page)
{
	if (mddev->reshape_position != MaxSector)
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->reshape_position);
	strcpy(page, "none\n");
	return 5;
}

static ssize_t
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct md_rdev *rdev;
	unsigned long long new;
	int err;

	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
		return -EINVAL;
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EBUSY;
	if (mddev->pers)
		goto unlock;
	mddev->reshape_position = new;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->new_level = mddev->level;
	mddev->new_layout = mddev->layout;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	rdev_for_each(rdev, mddev)
		rdev->new_data_offset = rdev->data_offset;
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_reshape_position =
__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
       reshape_position_store);

static ssize_t
reshape_direction_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%s\n",
		       mddev->reshape_backwards ? "backwards" : "forwards");
}

static ssize_t
reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
{
	int backwards = 0;
	int err;

	if (cmd_match(buf, "forwards"))
		backwards = 0;
	else if (cmd_match(buf, "backwards"))
		backwards = 1;
	else
		return -EINVAL;
	if (mddev->reshape_backwards == backwards)
		return len;

	err = mddev_lock(mddev);
	if (err)
		return err;
	/* check if we are allowed to change */
	if (mddev->delta_disks)
		err = -EBUSY;
	else if (mddev->persistent &&
		 mddev->major_version == 0)
		err = -EINVAL;
	else
		mddev->reshape_backwards = backwards;
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_reshape_direction =
__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
       reshape_direction_store);

static ssize_t
array_size_show(struct mddev *mddev, char *page)
{
	if (mddev->external_size)
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->array_sectors/2);
	else
		return sprintf(page, "default\n");
}

static ssize_t
array_size_store(struct mddev *mddev, const char *buf, size_t len)
{
	sector_t sectors;
	int err;

	err = mddev_lock(mddev);
	if (err)
		return err;

	if (strncmp(buf, "default", 7) == 0) {
		if (mddev->pers)
			sectors = mddev->pers->size(mddev, 0, 0);
		else
			sectors = mddev->array_sectors;

		mddev->external_size = 0;
	} else {
		if (strict_blocks_to_sectors(buf, &sectors) < 0)
			err = -EINVAL;
		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
			err = -E2BIG;
		else
			mddev->external_size = 1;
	}

	if (!err) {
		mddev->array_sectors = sectors;
		if (mddev->pers) {
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		}
	}
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_array_size =
__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
       array_size_store);

static struct attribute *md_default_attrs[] = {
	&md_level.attr,
	&md_layout.attr,
	&md_raid_disks.attr,
	&md_chunk_size.attr,
	&md_size.attr,
	&md_resync_start.attr,
	&md_metadata.attr,
	&md_new_device.attr,
	&md_safe_delay.attr,
	&md_array_state.attr,
	&md_reshape_position.attr,
	&md_reshape_direction.attr,
	&md_array_size.attr,
	&max_corr_read_errors.attr,
	NULL,
};

static struct attribute *md_redundancy_attrs[] = {
	&md_scan_mode.attr,
	&md_last_scan_mode.attr,
	&md_mismatches.attr,
	&md_sync_min.attr,
	&md_sync_max.attr,
	&md_sync_speed.attr,
	&md_sync_force_parallel.attr,
	&md_sync_completed.attr,
	&md_min_sync.attr,
	&md_max_sync.attr,
	&md_suspend_lo.attr,
	&md_suspend_hi.attr,
	&md_bitmap.attr,
	&md_degraded.attr,
	NULL,
};
static struct attribute_group md_redundancy_group = {
	.name = NULL,
	.attrs = md_redundancy_attrs,
};

static ssize_t
md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
	ssize_t rv;

	if (!entry->show)
		return -EIO;
	spin_lock(&all_mddevs_lock);
	if (list_empty(&mddev->all_mddevs)) {
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	mddev_get(mddev);
	spin_unlock(&all_mddevs_lock);

	rv = entry->show(mddev, page);
	mddev_put(mddev);
	return rv;
}

static ssize_t
md_attr_store(struct kobject *kobj, struct attribute *attr,
	      const char *page, size_t length)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
	ssize_t rv;

	if (!entry->store)
		return -EIO;
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	spin_lock(&all_mddevs_lock);
	if (list_empty(&mddev->all_mddevs)) {
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	mddev_get(mddev);
	spin_unlock(&all_mddevs_lock);
	rv = entry->store(mddev, page, length);
	mddev_put(mddev);
	return rv;
}

static void md_free(struct kobject *ko)
{
	struct mddev *mddev = container_of(ko, struct mddev, kobj);

	if (mddev->sysfs_state)
		sysfs_put(mddev->sysfs_state);

	if (mddev->queue)
		blk_cleanup_queue(mddev->queue);
	if (mddev->gendisk) {
		del_gendisk(mddev->gendisk);
		put_disk(mddev->gendisk);
	}

	kfree(mddev);
}

static const struct sysfs_ops md_sysfs_ops = {
	.show	= md_attr_show,
	.store	= md_attr_store,
};
static struct kobj_type md_ktype = {
	.release	= md_free,
	.sysfs_ops	= &md_sysfs_ops,
	.default_attrs	= md_default_attrs,
};
4943
1da177e4
LT
4944int mdp_major = 0;
4945
5fd3a17e
DW
4946static void mddev_delayed_delete(struct work_struct *ws)
4947{
fd01b88c 4948 struct mddev *mddev = container_of(ws, struct mddev, del_work);
5fd3a17e 4949
43a70507 4950 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
5fd3a17e
DW
4951 kobject_del(&mddev->kobj);
4952 kobject_put(&mddev->kobj);
4953}
4954
static int md_alloc(dev_t dev, char *name)
{
	static DEFINE_MUTEX(disks_mutex);
	struct mddev *mddev = mddev_find(dev);
	struct gendisk *disk;
	int partitioned;
	int shift;
	int unit;
	int error;

	if (!mddev)
		return -ENODEV;

	partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
	shift = partitioned ? MdpMinorShift : 0;
	unit = MINOR(mddev->unit) >> shift;

	/* wait for any previous instance of this device to be
	 * completely removed (mddev_delayed_delete).
	 */
	flush_workqueue(md_misc_wq);

	mutex_lock(&disks_mutex);
	error = -EEXIST;
	if (mddev->gendisk)
		goto abort;

	if (name) {
		/* Need to ensure that 'name' is not a duplicate.
		 */
		struct mddev *mddev2;
		spin_lock(&all_mddevs_lock);

		list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
			if (mddev2->gendisk &&
			    strcmp(mddev2->gendisk->disk_name, name) == 0) {
				spin_unlock(&all_mddevs_lock);
				goto abort;
			}
		spin_unlock(&all_mddevs_lock);
	}

	error = -ENOMEM;
	mddev->queue = blk_alloc_queue(GFP_KERNEL);
	if (!mddev->queue)
		goto abort;
	mddev->queue->queuedata = mddev;

	blk_queue_make_request(mddev->queue, md_make_request);
	blk_set_stacking_limits(&mddev->queue->limits);

	disk = alloc_disk(1 << shift);
	if (!disk) {
		blk_cleanup_queue(mddev->queue);
		mddev->queue = NULL;
		goto abort;
	}
	disk->major = MAJOR(mddev->unit);
	disk->first_minor = unit << shift;
	if (name)
		strcpy(disk->disk_name, name);
	else if (partitioned)
		sprintf(disk->disk_name, "md_d%d", unit);
	else
		sprintf(disk->disk_name, "md%d", unit);
	disk->fops = &md_fops;
	disk->private_data = mddev;
	disk->queue = mddev->queue;
	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
	/* Allow extended partitions.  This makes the
	 * 'mdp' device redundant, but we can't really
	 * remove it now.
	 */
	disk->flags |= GENHD_FL_EXT_DEVT;
	mddev->gendisk = disk;
	/* As soon as we call add_disk(), another thread could get
	 * through to md_open, so make sure it doesn't get too far
	 */
	mutex_lock(&mddev->open_mutex);
	add_disk(disk);

	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
				     &disk_to_dev(disk)->kobj, "%s", "md");
	if (error) {
		/* This isn't possible, but as kobject_init_and_add is marked
		 * __must_check, we must do something with the result
		 */
		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
		       disk->disk_name);
		error = 0;
	}
	if (mddev->kobj.sd &&
	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
		printk(KERN_DEBUG "pointless warning\n");
	mutex_unlock(&mddev->open_mutex);
 abort:
	mutex_unlock(&disks_mutex);
	if (!error && mddev->kobj.sd) {
		kobject_uevent(&mddev->kobj, KOBJ_ADD);
		mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
	}
	mddev_put(mddev);
	return error;
}

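/*
 * Block-layer probe hook: create the array for @dev on first access.
 * (Registration of this hook happens outside this excerpt.)  No kobject
 * needs to be returned, hence NULL.
 */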
static struct kobject *md_probe(dev_t dev, int *part, void *data)
{
	md_alloc(dev, NULL);
	return NULL;
}

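/*
 * Create an array with a text name such as "md_home".  In the full file
 * this handler is exposed as a writable module parameter (the wiring is
 * outside this excerpt), so for illustration something like
 * "echo md_home > /sys/module/md_mod/parameters/new_array" lands here.
 */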
static int add_named_array(const char *val, struct kernel_param *kp)
{
	/* val must be "md_*" where * is not all digits.
	 * We allocate an array with a large free minor number, and
	 * set the name to val.  val must not already be an active name.
	 */
	int len = strlen(val);
	char buf[DISK_NAME_LEN];

	while (len && val[len-1] == '\n')
		len--;
	if (len >= DISK_NAME_LEN)
		return -E2BIG;
	strlcpy(buf, val, len+1);
	if (strncmp(buf, "md_", 3) != 0)
		return -EINVAL;
	return md_alloc(0, buf);
}

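/*
 * Safemode timer: fires after a period with no writes.  If nothing is
 * pending we flag safemode so the array can be marked clean, and poke
 * the md thread to do the actual superblock update.
 */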
static void md_safemode_timeout(unsigned long data)
{
	struct mddev *mddev = (struct mddev *) data;

	if (!atomic_read(&mddev->writes_pending)) {
		mddev->safemode = 1;
		if (mddev->external)
			sysfs_notify_dirent_safe(mddev->sysfs_state);
	}
	md_wakeup_thread(mddev->thread);
}

static int start_dirty_degraded;

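/*
 * md_run() takes an assembled but inactive array and starts it: analyze
 * superblocks if needed, load and bind the personality, run consistency
 * checks on the members, create the bitmap, and expose state via sysfs.
 */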
int md_run(struct mddev *mddev)
{
	int err;
	struct md_rdev *rdev;
	struct md_personality *pers;

	if (list_empty(&mddev->disks))
		/* cannot run an array with no devices.. */
		return -EINVAL;

	if (mddev->pers)
		return -EBUSY;
	/* Cannot run until previous stop completes properly */
	if (mddev->sysfs_active)
		return -EBUSY;

	/*
	 * Analyze all RAID superblock(s)
	 */
	if (!mddev->raid_disks) {
		if (!mddev->persistent)
			return -EINVAL;
		analyze_sbs(mddev);
	}

	if (mddev->level != LEVEL_NONE)
		request_module("md-level-%d", mddev->level);
	else if (mddev->clevel[0])
		request_module("md-%s", mddev->clevel);

	/*
	 * Drop all container device buffers, from now on
	 * the only valid external interface is through the md
	 * device.
	 */
	rdev_for_each(rdev, mddev) {
		if (test_bit(Faulty, &rdev->flags))
			continue;
		sync_blockdev(rdev->bdev);
		invalidate_bdev(rdev->bdev);

		/* perform some consistency tests on the device.
		 * We don't want the data to overlap the metadata;
		 * internal bitmap issues have been handled elsewhere.
		 */
		if (rdev->meta_bdev) {
			/* Nothing to check */;
		} else if (rdev->data_offset < rdev->sb_start) {
			if (mddev->dev_sectors &&
			    rdev->data_offset + mddev->dev_sectors
			    > rdev->sb_start) {
				printk("md: %s: data overlaps metadata\n",
				       mdname(mddev));
				return -EINVAL;
			}
		} else {
			if (rdev->sb_start + rdev->sb_size/512
			    > rdev->data_offset) {
				printk("md: %s: metadata overlaps data\n",
				       mdname(mddev));
				return -EINVAL;
			}
		}
		sysfs_notify_dirent_safe(rdev->sysfs_state);
	}

	if (mddev->bio_set == NULL)
		mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);

	spin_lock(&pers_lock);
	pers = find_pers(mddev->level, mddev->clevel);
	if (!pers || !try_module_get(pers->owner)) {
		spin_unlock(&pers_lock);
		if (mddev->level != LEVEL_NONE)
			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
			       mddev->level);
		else
			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
			       mddev->clevel);
		return -EINVAL;
	}
	spin_unlock(&pers_lock);
	if (mddev->level != pers->level) {
		mddev->level = pers->level;
		mddev->new_level = pers->level;
	}
	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));

	if (mddev->reshape_position != MaxSector &&
	    pers->start_reshape == NULL) {
		/* This personality cannot handle reshaping... */
		module_put(pers->owner);
		return -EINVAL;
	}

	if (pers->sync_request) {
		/* Warn if this is a potentially silly
		 * configuration.
		 */
		char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
		struct md_rdev *rdev2;
		int warned = 0;

		rdev_for_each(rdev, mddev)
			rdev_for_each(rdev2, mddev) {
				if (rdev < rdev2 &&
				    rdev->bdev->bd_contains ==
				    rdev2->bdev->bd_contains) {
					printk(KERN_WARNING
					       "%s: WARNING: %s appears to be"
					       " on the same physical disk as"
					       " %s.\n",
					       mdname(mddev),
					       bdevname(rdev->bdev,b),
					       bdevname(rdev2->bdev,b2));
					warned = 1;
				}
			}

		if (warned)
			printk(KERN_WARNING
			       "True protection against single-disk"
			       " failure might be compromised.\n");
	}

	mddev->recovery = 0;
	/* may be over-ridden by personality */
	mddev->resync_max_sectors = mddev->dev_sectors;

	mddev->ok_start_degraded = start_dirty_degraded;

	if (start_readonly && mddev->ro == 0)
		mddev->ro = 2; /* read-only, but switch on first write */

	err = pers->run(mddev);
	if (err)
		printk(KERN_ERR "md: pers->run() failed ...\n");
	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
		WARN_ONCE(!mddev->external_size, "%s: default size too small,"
			  " but 'external_size' not in effect?\n", __func__);
		printk(KERN_ERR
		       "md: invalid array_size %llu > default size %llu\n",
		       (unsigned long long)mddev->array_sectors / 2,
		       (unsigned long long)pers->size(mddev, 0, 0) / 2);
		err = -EINVAL;
	}
	if (err == 0 && pers->sync_request &&
	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
		struct bitmap *bitmap;

		bitmap = bitmap_create(mddev, -1);
		if (IS_ERR(bitmap)) {
			err = PTR_ERR(bitmap);
			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
			       mdname(mddev), err);
		} else
			mddev->bitmap = bitmap;

	}
	if (err) {
		mddev_detach(mddev);
		if (mddev->private)
			pers->free(mddev, mddev->private);
		mddev->private = NULL;
		module_put(pers->owner);
		bitmap_destroy(mddev);
		return err;
	}
	if (mddev->queue) {
		mddev->queue->backing_dev_info.congested_data = mddev;
		mddev->queue->backing_dev_info.congested_fn = md_congested;
	}
	if (pers->sync_request) {
		if (mddev->kobj.sd &&
		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
			printk(KERN_WARNING
			       "md: cannot register extra attributes for %s\n",
			       mdname(mddev));
		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
		mddev->ro = 0;

	atomic_set(&mddev->writes_pending, 0);
	atomic_set(&mddev->max_corr_read_errors,
		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
	mddev->safemode = 0;
	if (mddev_is_clustered(mddev))
		mddev->safemode_delay = 0;
	else
		mddev->safemode_delay = (200 * HZ)/1000 + 1; /* 200 msec delay */
	mddev->in_sync = 1;
	smp_wmb();
	spin_lock(&mddev->lock);
	mddev->pers = pers;
	mddev->ready = 1;
	spin_unlock(&mddev->lock);
	rdev_for_each(rdev, mddev)
		if (rdev->raid_disk >= 0)
			if (sysfs_link_rdev(mddev, rdev))
				/* failure here is OK */;

	if (mddev->degraded && !mddev->ro)
		/* This ensures that recovering status is reported immediately
		 * via sysfs - until a lack of spares is confirmed.
		 */
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);

	if (mddev->flags & MD_UPDATE_SB_FLAGS)
		md_update_sb(mddev, 0);

	md_new_event(mddev);
	sysfs_notify_dirent_safe(mddev->sysfs_state);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	sysfs_notify(&mddev->kobj, NULL, "degraded");
	return 0;
}
EXPORT_SYMBOL_GPL(md_run);

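/*
 * do_md_run() is the ioctl-level wrapper around md_run(): it also loads
 * the bitmap, kicks the md and resync threads, and publishes the new
 * capacity to the block layer and userspace.
 */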
static int do_md_run(struct mddev *mddev)
{
	int err;

	err = md_run(mddev);
	if (err)
		goto out;
	err = bitmap_load(mddev);
	if (err) {
		bitmap_destroy(mddev);
		goto out;
	}

	if (mddev_is_clustered(mddev))
		md_allow_write(mddev);

	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

	set_capacity(mddev->gendisk, mddev->array_sectors);
	revalidate_disk(mddev->gendisk);
	mddev->changed = 1;
	kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
out:
	return err;
}

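/*
 * Switch a stopped-but-assembled (read-only) array back to read-write.
 * For journalled arrays we refuse unless a working journal device is
 * present, since restarting rw with the journal missing or faulty is
 * unsafe.
 */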
static int restart_array(struct mddev *mddev)
{
	struct gendisk *disk = mddev->gendisk;

	/* Complain if it has no devices */
	if (list_empty(&mddev->disks))
		return -ENXIO;
	if (!mddev->pers)
		return -EINVAL;
	if (!mddev->ro)
		return -EBUSY;
	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		struct md_rdev *rdev;
		bool has_journal = false;

		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev) {
			if (test_bit(Journal, &rdev->flags) &&
			    !test_bit(Faulty, &rdev->flags)) {
				has_journal = true;
				break;
			}
		}
		rcu_read_unlock();

		/* Don't restart rw with journal missing/faulty */
		if (!has_journal)
			return -EINVAL;
	}

	mddev->safemode = 0;
	mddev->ro = 0;
	set_disk_ro(disk, 0);
	printk(KERN_INFO "md: %s switched to read-write mode.\n",
	       mdname(mddev));
	/* Kick recovery or resync if necessary */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_state);
	return 0;
}

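/*
 * Reset every field of an mddev back to its pristine state so the
 * object can be reused after a full stop; anything not cleared here
 * would otherwise leak configuration into the next array assembled on
 * this device.
 */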
static void md_clean(struct mddev *mddev)
{
	mddev->array_sectors = 0;
	mddev->external_size = 0;
	mddev->dev_sectors = 0;
	mddev->raid_disks = 0;
	mddev->recovery_cp = 0;
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->reshape_position = MaxSector;
	mddev->external = 0;
	mddev->persistent = 0;
	mddev->level = LEVEL_NONE;
	mddev->clevel[0] = 0;
	mddev->flags = 0;
	mddev->ro = 0;
	mddev->metadata_type[0] = 0;
	mddev->chunk_sectors = 0;
	mddev->ctime = mddev->utime = 0;
	mddev->layout = 0;
	mddev->max_disks = 0;
	mddev->events = 0;
	mddev->can_decrease_events = 0;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->new_level = LEVEL_NONE;
	mddev->new_layout = 0;
	mddev->new_chunk_sectors = 0;
	mddev->curr_resync = 0;
	atomic64_set(&mddev->resync_mismatches, 0);
	mddev->suspend_lo = mddev->suspend_hi = 0;
	mddev->sync_speed_min = mddev->sync_speed_max = 0;
	mddev->recovery = 0;
	mddev->in_sync = 0;
	mddev->changed = 0;
	mddev->degraded = 0;
	mddev->safemode = 0;
	mddev->private = NULL;
	mddev->bitmap_info.offset = 0;
	mddev->bitmap_info.default_offset = 0;
	mddev->bitmap_info.default_space = 0;
	mddev->bitmap_info.chunksize = 0;
	mddev->bitmap_info.daemon_sleep = 0;
	mddev->bitmap_info.max_write_behind = 0;
}

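/*
 * Quiesce all write-out: freeze and reap any resync/recovery thread,
 * stop the safemode timer, flush the bitmap, and mark the array clean
 * in the superblock when it is safe to do so.
 */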
static void __md_stop_writes(struct mddev *mddev)
{
	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	flush_workqueue(md_misc_wq);
	if (mddev->sync_thread) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		md_reap_sync_thread(mddev);
	}

	del_timer_sync(&mddev->safemode_timer);

	bitmap_flush(mddev);
	md_super_wait(mddev);

	if (mddev->ro == 0 &&
	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
	     (mddev->flags & MD_UPDATE_SB_FLAGS))) {
		/* mark array as shutdown cleanly */
		if (!mddev_is_clustered(mddev))
			mddev->in_sync = 1;
		md_update_sb(mddev, 1);
	}
}

void md_stop_writes(struct mddev *mddev)
{
	mddev_lock_nointr(mddev);
	__md_stop_writes(mddev);
	mddev_unlock(mddev);
}
EXPORT_SYMBOL_GPL(md_stop_writes);

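/*
 * Detach the array from its personality: wait out any write-behind
 * I/O, flush the personality via quiesce, and stop the per-array
 * thread.
 */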
static void mddev_detach(struct mddev *mddev)
{
	struct bitmap *bitmap = mddev->bitmap;
	/* wait for behind writes to complete */
	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
		printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n",
		       mdname(mddev));
		/* need to kick something here to make sure I/O goes? */
		wait_event(bitmap->behind_wait,
			   atomic_read(&bitmap->behind_writes) == 0);
	}
	if (mddev->pers && mddev->pers->quiesce) {
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
	md_unregister_thread(&mddev->thread);
	if (mddev->queue)
		blk_sync_queue(mddev->queue); /* the unplug fn references 'conf' */
}

static void __md_stop(struct mddev *mddev)
{
	struct md_personality *pers = mddev->pers;
	mddev_detach(mddev);
	/* Ensure ->event_work is done */
	flush_workqueue(md_misc_wq);
	spin_lock(&mddev->lock);
	mddev->ready = 0;
	mddev->pers = NULL;
	spin_unlock(&mddev->lock);
	pers->free(mddev, mddev->private);
	mddev->private = NULL;
	if (pers->sync_request && mddev->to_remove == NULL)
		mddev->to_remove = &md_redundancy_group;
	module_put(pers->owner);
	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}

void md_stop(struct mddev *mddev)
{
	/* stop the array and free any attached data structures.
	 * This is called from dm-raid
	 */
	__md_stop(mddev);
	bitmap_destroy(mddev);
	if (mddev->bio_set)
		bioset_free(mddev->bio_set);
}

EXPORT_SYMBOL_GPL(md_stop);

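/*
 * Transition a running array to read-only (STOP_ARRAY_RO).  We freeze
 * recovery, wait for in-flight resync and metadata updates to settle,
 * and fail with -EBUSY if anyone else still holds the device open.
 */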
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
{
	int err = 0;
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	if (mddev->sync_thread)
		/* Thread might be blocked waiting for metadata update
		 * which will now never happen */
		wake_up_process(mddev->sync_thread->tsk);

	if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags))
		return -EBUSY;
	mddev_unlock(mddev);
	wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
					  &mddev->recovery));
	wait_event(mddev->sb_wait,
		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
	mddev_lock_nointr(mddev);

	mutex_lock(&mddev->open_mutex);
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
	    mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
		printk("md: %s still in use.\n", mdname(mddev));
		if (did_freeze) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			md_wakeup_thread(mddev->thread);
		}
		err = -EBUSY;
		goto out;
	}
	if (mddev->pers) {
		__md_stop_writes(mddev);

		err = -ENXIO;
		if (mddev->ro == 1)
			goto out;
		mddev->ro = 1;
		set_disk_ro(mddev->gendisk, 1);
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		sysfs_notify_dirent_safe(mddev->sysfs_state);
		err = 0;
	}
out:
	mutex_unlock(&mddev->open_mutex);
	return err;
}

/* mode:
 *   0 - completely stop and dis-assemble array
 *   2 - stop but do not disassemble array
 */
static int do_md_stop(struct mddev *mddev, int mode,
		      struct block_device *bdev)
{
	struct gendisk *disk = mddev->gendisk;
	struct md_rdev *rdev;
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	if (mddev->sync_thread)
		/* Thread might be blocked waiting for metadata update
		 * which will now never happen */
		wake_up_process(mddev->sync_thread->tsk);

	mddev_unlock(mddev);
	wait_event(resync_wait, (mddev->sync_thread == NULL &&
				 !test_bit(MD_RECOVERY_RUNNING,
					   &mddev->recovery)));
	mddev_lock_nointr(mddev);

	mutex_lock(&mddev->open_mutex);
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
	    mddev->sysfs_active ||
	    mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
		printk("md: %s still in use.\n", mdname(mddev));
		mutex_unlock(&mddev->open_mutex);
		if (did_freeze) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			md_wakeup_thread(mddev->thread);
		}
		return -EBUSY;
	}
	if (mddev->pers) {
		if (mddev->ro)
			set_disk_ro(disk, 0);

		__md_stop_writes(mddev);
		__md_stop(mddev);
		mddev->queue->backing_dev_info.congested_fn = NULL;

		/* tell userspace to handle 'inactive' */
		sysfs_notify_dirent_safe(mddev->sysfs_state);

		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk >= 0)
				sysfs_unlink_rdev(mddev, rdev);

		set_capacity(disk, 0);
		mutex_unlock(&mddev->open_mutex);
		mddev->changed = 1;
		revalidate_disk(disk);

		if (mddev->ro)
			mddev->ro = 0;
	} else
		mutex_unlock(&mddev->open_mutex);
	/*
	 * Free resources if final stop
	 */
	if (mode == 0) {
		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));

		bitmap_destroy(mddev);
		if (mddev->bitmap_info.file) {
			struct file *f = mddev->bitmap_info.file;
			spin_lock(&mddev->lock);
			mddev->bitmap_info.file = NULL;
			spin_unlock(&mddev->lock);
			fput(f);
		}
		mddev->bitmap_info.offset = 0;

		export_array(mddev);

		md_clean(mddev);
		kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
		if (mddev->hold_active == UNTIL_STOP)
			mddev->hold_active = 0;
	}
	md_new_event(mddev);
	sysfs_notify_dirent_safe(mddev->sysfs_state);
	return 0;
}

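/*
 * Boot-time autorun (built-in only): scan the disks collected so far,
 * group them by array, and try to start each one.
 */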
#ifndef MODULE
static void autorun_array(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int err;

	if (list_empty(&mddev->disks))
		return;

	printk(KERN_INFO "md: running: ");

	rdev_for_each(rdev, mddev) {
		char b[BDEVNAME_SIZE];
		printk("<%s>", bdevname(rdev->bdev,b));
	}
	printk("\n");

	err = do_md_run(mddev);
	if (err) {
		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
		do_md_stop(mddev, 0, NULL);
	}
}

/*
 * let's try to run arrays based on all disks that have arrived
 * until now. (those are in pending_raid_disks)
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 *
 * If "unit" is allocated, then bump its reference count
 */
static void autorun_devices(int part)
{
	struct md_rdev *rdev0, *rdev, *tmp;
	struct mddev *mddev;
	char b[BDEVNAME_SIZE];

	printk(KERN_INFO "md: autorun ...\n");
	while (!list_empty(&pending_raid_disks)) {
		int unit;
		dev_t dev;
		LIST_HEAD(candidates);
		rdev0 = list_entry(pending_raid_disks.next,
				   struct md_rdev, same_set);

		printk(KERN_INFO "md: considering %s ...\n",
		       bdevname(rdev0->bdev,b));
		INIT_LIST_HEAD(&candidates);
		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
			if (super_90_load(rdev, rdev0, 0) >= 0) {
				printk(KERN_INFO "md: adding %s ...\n",
				       bdevname(rdev->bdev,b));
				list_move(&rdev->same_set, &candidates);
			}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
		if (part) {
			dev = MKDEV(mdp_major,
				    rdev0->preferred_minor << MdpMinorShift);
			unit = MINOR(dev) >> MdpMinorShift;
		} else {
			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
			unit = MINOR(dev);
		}
		if (rdev0->preferred_minor != unit) {
			printk(KERN_INFO "md: unit number in %s is bad: %d\n",
			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
			break;
		}

		md_probe(dev, NULL, NULL);
		mddev = mddev_find(dev);
		if (!mddev || !mddev->gendisk) {
			if (mddev)
				mddev_put(mddev);
			printk(KERN_ERR
			       "md: cannot allocate memory for md drive.\n");
			break;
		}
		if (mddev_lock(mddev))
			printk(KERN_WARNING "md: %s locked, cannot run\n",
			       mdname(mddev));
		else if (mddev->raid_disks || mddev->major_version
			 || !list_empty(&mddev->disks)) {
			printk(KERN_WARNING
			       "md: %s already running, cannot run %s\n",
			       mdname(mddev), bdevname(rdev0->bdev,b));
			mddev_unlock(mddev);
		} else {
			printk(KERN_INFO "md: created %s\n", mdname(mddev));
			mddev->persistent = 1;
			rdev_for_each_list(rdev, tmp, &candidates) {
				list_del_init(&rdev->same_set);
				if (bind_rdev_to_array(rdev, mddev))
					export_rdev(rdev);
			}
			autorun_array(mddev);
			mddev_unlock(mddev);
		}
		/* on success, candidates will be empty, on error
		 * it won't...
		 */
		rdev_for_each_list(rdev, tmp, &candidates) {
			list_del_init(&rdev->same_set);
			export_rdev(rdev);
		}
		mddev_put(mddev);
	}
	printk(KERN_INFO "md: ... autorun DONE.\n");
}
#endif /* !MODULE */

static int get_version(void __user *arg)
{
	mdu_version_t ver;

	ver.major = MD_MAJOR_VERSION;
	ver.minor = MD_MINOR_VERSION;
	ver.patchlevel = MD_PATCHLEVEL_VERSION;

	if (copy_to_user(arg, &ver, sizeof(ver)))
		return -EFAULT;

	return 0;
}

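/*
 * GET_ARRAY_INFO: fill an mdu_array_info_t with the array geometry and
 * per-state disk counts.  The rdev walk runs under rcu_read_lock()
 * because this command is served without taking the mddev lock.
 */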
static int get_array_info(struct mddev *mddev, void __user *arg)
{
	mdu_array_info_t info;
	int nr, working, insync, failed, spare;
	struct md_rdev *rdev;

	nr = working = insync = failed = spare = 0;
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		nr++;
		if (test_bit(Faulty, &rdev->flags))
			failed++;
		else {
			working++;
			if (test_bit(In_sync, &rdev->flags))
				insync++;
			else
				spare++;
		}
	}
	rcu_read_unlock();

	info.major_version = mddev->major_version;
	info.minor_version = mddev->minor_version;
	info.patch_version = MD_PATCHLEVEL_VERSION;
	info.ctime = mddev->ctime;
	info.level = mddev->level;
	info.size = mddev->dev_sectors / 2;
	if (info.size != mddev->dev_sectors / 2) /* overflow */
		info.size = -1;
	info.nr_disks = nr;
	info.raid_disks = mddev->raid_disks;
	info.md_minor = mddev->md_minor;
	info.not_persistent = !mddev->persistent;

	info.utime = mddev->utime;
	info.state = 0;
	if (mddev->in_sync)
		info.state = (1<<MD_SB_CLEAN);
	if (mddev->bitmap && mddev->bitmap_info.offset)
		info.state |= (1<<MD_SB_BITMAP_PRESENT);
	if (mddev_is_clustered(mddev))
		info.state |= (1<<MD_SB_CLUSTERED);
	info.active_disks = insync;
	info.working_disks = working;
	info.failed_disks = failed;
	info.spare_disks = spare;

	info.layout = mddev->layout;
	info.chunk_size = mddev->chunk_sectors << 9;

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

static int get_bitmap_file(struct mddev *mddev, void __user * arg)
{
	mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
	char *ptr;
	int err;

	file = kzalloc(sizeof(*file), GFP_NOIO);
	if (!file)
		return -ENOMEM;

	err = 0;
	spin_lock(&mddev->lock);
	/* bitmap enabled */
	if (mddev->bitmap_info.file) {
		ptr = file_path(mddev->bitmap_info.file, file->pathname,
				sizeof(file->pathname));
		if (IS_ERR(ptr))
			err = PTR_ERR(ptr);
		else
			memmove(file->pathname, ptr,
				sizeof(file->pathname)-(ptr-file->pathname));
	}
	spin_unlock(&mddev->lock);

	if (err == 0 &&
	    copy_to_user(arg, file, sizeof(*file)))
		err = -EFAULT;

	kfree(file);
	return err;
}

static int get_disk_info(struct mddev *mddev, void __user * arg)
{
	mdu_disk_info_t info;
	struct md_rdev *rdev;

	if (copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

	rcu_read_lock();
	rdev = md_find_rdev_nr_rcu(mddev, info.number);
	if (rdev) {
		info.major = MAJOR(rdev->bdev->bd_dev);
		info.minor = MINOR(rdev->bdev->bd_dev);
		info.raid_disk = rdev->raid_disk;
		info.state = 0;
		if (test_bit(Faulty, &rdev->flags))
			info.state |= (1<<MD_DISK_FAULTY);
		else if (test_bit(In_sync, &rdev->flags)) {
			info.state |= (1<<MD_DISK_ACTIVE);
			info.state |= (1<<MD_DISK_SYNC);
		}
		if (test_bit(Journal, &rdev->flags))
			info.state |= (1<<MD_DISK_JOURNAL);
		if (test_bit(WriteMostly, &rdev->flags))
			info.state |= (1<<MD_DISK_WRITEMOSTLY);
	} else {
		info.major = info.minor = 0;
		info.raid_disk = -1;
		info.state = (1<<MD_DISK_REMOVED);
	}
	rcu_read_unlock();

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}

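/*
 * ADD_NEW_DISK serves three cases: adding a member while assembling a
 * not-yet-started array, hot-adding a device (with superblock already
 * written) to a running array, and adding a fresh device to an array
 * with legacy 0.90 metadata.  Cluster arrays additionally negotiate
 * the add with the other nodes.
 */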
static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	struct md_rdev *rdev;
	dev_t dev = MKDEV(info->major, info->minor);

	if (mddev_is_clustered(mddev) &&
	    !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
		pr_err("%s: Cannot add to clustered mddev.\n",
		       mdname(mddev));
		return -EINVAL;
	}

	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
			       "md: md_import_device returned %ld\n",
			       PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
				printk(KERN_WARNING
				       "md: %s has different UUID to %s\n",
				       bdevname(rdev->bdev,b),
				       bdevname(rdev0->bdev,b2));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		return err;
	}

	/*
	 * add_new_disk can be used once the array is assembled
	 * to add "hot spares".  They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
			printk(KERN_WARNING
			       "%s: personality does not support diskops!\n",
			       mdname(mddev));
			return -EINVAL;
		}
		if (mddev->persistent)
			rdev = md_import_device(dev, mddev->major_version,
						mddev->minor_version);
		else
			rdev = md_import_device(dev, -1, -1);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
			       "md: md_import_device returned %ld\n",
			       PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		/* set saved_raid_disk if appropriate */
		if (!mddev->persistent) {
			if (info->state & (1<<MD_DISK_SYNC) &&
			    info->raid_disk < mddev->raid_disks) {
				rdev->raid_disk = info->raid_disk;
				set_bit(In_sync, &rdev->flags);
				clear_bit(Bitmap_sync, &rdev->flags);
			} else
				rdev->raid_disk = -1;
			rdev->saved_raid_disk = rdev->raid_disk;
		} else
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
		if ((info->state & (1<<MD_DISK_SYNC)) &&
		    rdev->raid_disk != info->raid_disk) {
			/* This was a hot-add request, but events doesn't
			 * match, so reject it.
			 */
			export_rdev(rdev);
			return -EINVAL;
		}

		clear_bit(In_sync, &rdev->flags); /* just to be sure */
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		else
			clear_bit(WriteMostly, &rdev->flags);

		if (info->state & (1<<MD_DISK_JOURNAL))
			set_bit(Journal, &rdev->flags);
		/*
		 * check whether the device shows up in other nodes
		 */
		if (mddev_is_clustered(mddev)) {
			if (info->state & (1 << MD_DISK_CANDIDATE))
				set_bit(Candidate, &rdev->flags);
			else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
				/* --add initiated by this node */
				err = md_cluster_ops->add_new_disk(mddev, rdev);
				if (err) {
					export_rdev(rdev);
					return err;
				}
			}
		}

		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);

		if (err)
			export_rdev(rdev);

		if (mddev_is_clustered(mddev)) {
			if (info->state & (1 << MD_DISK_CANDIDATE))
				md_cluster_ops->new_disk_ack(mddev, (err == 0));
			else {
				if (err)
					md_cluster_ops->add_new_disk_cancel(mddev);
				else
					err = add_bound_rdev(rdev);
			}

		} else if (!err)
			err = add_bound_rdev(rdev);

		return err;
	}

	/* otherwise, add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
		       mdname(mddev));
		return -EINVAL;
	}

	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
		rdev = md_import_device(dev, -1, 0);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
			       "md: error, md_import_device() returned %ld\n",
			       PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		if (rdev->raid_disk < mddev->raid_disks)
			if (info->state & (1<<MD_DISK_SYNC))
				set_bit(In_sync, &rdev->flags);

		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);

		if (!mddev->persistent) {
			printk(KERN_INFO "md: nonpersistent superblock ...\n");
			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
		} else
			rdev->sb_start = calc_dev_sboffset(rdev);
		rdev->sectors = rdev->sb_start;

		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev);
			return err;
		}
	}

	return 0;
}

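/*
 * HOT_REMOVE_DISK: detach a failed or spare device.  A device that is
 * still an active member of the array cannot be removed and yields
 * -EBUSY; cluster arrays also broadcast the removal to other nodes.
 */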
static int hot_remove_disk(struct mddev *mddev, dev_t dev)
{
	char b[BDEVNAME_SIZE];
	struct md_rdev *rdev;
	int ret = -1;

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

	if (mddev_is_clustered(mddev))
		ret = md_cluster_ops->metadata_update_start(mddev);

	if (rdev->raid_disk < 0)
		goto kick_rdev;

	clear_bit(Blocked, &rdev->flags);
	remove_and_add_spares(mddev, rdev);

	if (rdev->raid_disk >= 0)
		goto busy;

kick_rdev:
	if (mddev_is_clustered(mddev) && ret == 0)
		md_cluster_ops->remove_disk(mddev, rdev);

	md_kick_rdev_from_array(rdev);
	md_update_sb(mddev, 1);
	md_new_event(mddev);

	return 0;
busy:
	if (mddev_is_clustered(mddev) && ret == 0)
		md_cluster_ops->metadata_update_cancel(mddev);

	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
	       bdevname(rdev->bdev,b), mdname(mddev));
	return -EBUSY;
}

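/*
 * HOT_ADD_DISK: legacy hot-add for 0.90-metadata arrays only; newer
 * metadata goes through ADD_NEW_DISK instead.  The device is bound as
 * a spare and the recovery thread decides whether to activate it.
 */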
static int hot_add_disk(struct mddev *mddev, dev_t dev)
{
	char b[BDEVNAME_SIZE];
	int err;
	struct md_rdev *rdev;

	if (!mddev->pers)
		return -ENODEV;

	if (mddev->major_version != 0) {
		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
		       " version-0 superblocks.\n",
		       mdname(mddev));
		return -EINVAL;
	}
	if (!mddev->pers->hot_add_disk) {
		printk(KERN_WARNING
		       "%s: personality does not support diskops!\n",
		       mdname(mddev));
		return -EINVAL;
	}

	rdev = md_import_device(dev, -1, 0);
	if (IS_ERR(rdev)) {
		printk(KERN_WARNING
		       "md: error, md_import_device() returned %ld\n",
		       PTR_ERR(rdev));
		return -EINVAL;
	}

	if (mddev->persistent)
		rdev->sb_start = calc_dev_sboffset(rdev);
	else
		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;

	rdev->sectors = rdev->sb_start;

	if (test_bit(Faulty, &rdev->flags)) {
		printk(KERN_WARNING
		       "md: can not hot-add faulty %s disk to %s!\n",
		       bdevname(rdev->bdev,b), mdname(mddev));
		err = -EINVAL;
		goto abort_export;
	}

	clear_bit(In_sync, &rdev->flags);
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;
	err = bind_rdev_to_array(rdev, mddev);
	if (err)
		goto abort_export;

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */

	rdev->raid_disk = -1;

	md_update_sb(mddev, 1);
	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_new_event(mddev);
	return 0;

abort_export:
	export_rdev(rdev);
	return err;
}

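/*
 * SET_BITMAP_FILE: attach (fd >= 0) or detach (fd < 0) a file-backed
 * write-intent bitmap.  The file must be a regular file, opened for
 * write, and not in use by anyone else; the array is quiesced around
 * the bitmap create/destroy.
 */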
static int set_bitmap_file(struct mddev *mddev, int fd)
{
	int err = 0;

	if (mddev->pers) {
		if (!mddev->pers->quiesce || !mddev->thread)
			return -EBUSY;
		if (mddev->recovery || mddev->sync_thread)
			return -EBUSY;
		/* we should be able to change the bitmap.. */
	}

	if (fd >= 0) {
		struct inode *inode;
		struct file *f;

		if (mddev->bitmap || mddev->bitmap_info.file)
			return -EEXIST; /* cannot add when bitmap is present */
		f = fget(fd);

		if (f == NULL) {
			printk(KERN_ERR "%s: error: failed to get bitmap file\n",
			       mdname(mddev));
			return -EBADF;
		}

		inode = f->f_mapping->host;
		if (!S_ISREG(inode->i_mode)) {
			printk(KERN_ERR "%s: error: bitmap file must be a regular file\n",
			       mdname(mddev));
			err = -EBADF;
		} else if (!(f->f_mode & FMODE_WRITE)) {
			printk(KERN_ERR "%s: error: bitmap file must open for write\n",
			       mdname(mddev));
			err = -EBADF;
		} else if (atomic_read(&inode->i_writecount) != 1) {
			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
			       mdname(mddev));
			err = -EBUSY;
		}
		if (err) {
			fput(f);
			return err;
		}
		mddev->bitmap_info.file = f;
		mddev->bitmap_info.offset = 0; /* file overrides offset */
	} else if (mddev->bitmap == NULL)
		return -ENOENT; /* cannot remove what isn't there */
	err = 0;
	if (mddev->pers) {
		mddev->pers->quiesce(mddev, 1);
		if (fd >= 0) {
			struct bitmap *bitmap;

			bitmap = bitmap_create(mddev, -1);
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
				err = bitmap_load(mddev);
			} else
				err = PTR_ERR(bitmap);
		}
		if (fd < 0 || err) {
			bitmap_destroy(mddev);
			fd = -1; /* make sure to put the file */
		}
		mddev->pers->quiesce(mddev, 0);
	}
	if (fd < 0) {
		struct file *f = mddev->bitmap_info.file;
		if (f) {
			spin_lock(&mddev->lock);
			mddev->bitmap_info.file = NULL;
			spin_unlock(&mddev->lock);
			fput(f);
		}
	}

	return err;
}

/*
 * set_array_info is used in two different ways.
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it together with
 *  level, size, not_persistent, layout, chunksize determine the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  used to determine which style super-blocks are to be found on the devices.
 *  The minor and patch _version numbers are also kept in case the
 *  super_block handler wishes to interpret them.
 */
static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
{

	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= ARRAY_SIZE(super_types) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			printk(KERN_INFO
			       "md: superblock version %d not known\n",
			       info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		mddev->persistent = !info->not_persistent;
		/* ensure mddev_put doesn't delete this now that there
		 * is some minimal configuration.
		 */
		mddev->ctime = get_seconds();
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime = get_seconds();

	mddev->level = info->level;
	mddev->clevel[0] = 0;
	mddev->dev_sectors = 2 * (sector_t)info->size;
	mddev->raid_disks = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent = !info->not_persistent;
	mddev->external = 0;

	mddev->layout = info->layout;
	mddev->chunk_sectors = info->chunk_size >> 9;

	mddev->max_disks = MD_SB_DISKS;

	if (mddev->persistent)
		mddev->flags = 0;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);

	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
	mddev->bitmap_info.offset = 0;

	mddev->reshape_position = MaxSector;

	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

	mddev->new_level = mddev->level;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->new_layout = mddev->layout;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;

	return 0;
}

void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
{
	WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);

	if (mddev->external_size)
		return;

	mddev->array_sectors = array_sectors;
}
EXPORT_SYMBOL(md_set_array_sectors);

static int update_size(struct mddev *mddev, sector_t num_sectors)
{
	struct md_rdev *rdev;
	int rv;
	int fit = (num_sectors == 0);

	if (mddev->pers->resize == NULL)
		return -EINVAL;
	/* The "num_sectors" is the number of sectors of each device that
	 * is used.  This can only make sense for arrays with redundancy.
	 * linear and raid0 always use whatever space is available. We can only
	 * consider changing this number if no resync or reconstruction is
	 * happening, and if the new size is acceptable. It must fit before the
	 * sb_start or, if that is <data_offset, it must fit before the size
	 * of each device.  If num_sectors is zero, we find the largest size
	 * that fits.
	 */
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->sync_thread)
		return -EBUSY;
	if (mddev->ro)
		return -EROFS;

	rdev_for_each(rdev, mddev) {
		sector_t avail = rdev->sectors;

		if (fit && (num_sectors == 0 || num_sectors > avail))
			num_sectors = avail;
		if (avail < num_sectors)
			return -ENOSPC;
	}
	rv = mddev->pers->resize(mddev, num_sectors);
	if (!rv)
		revalidate_disk(mddev->gendisk);
	return rv;
}

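/*
 * Change the number of member devices.  This only stages the request
 * (delta_disks and reshape direction); the personality's
 * check_reshape() decides whether the reshape can actually proceed.
 */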
static int update_raid_disks(struct mddev *mddev, int raid_disks)
{
	int rv;
	struct md_rdev *rdev;
	/* change the number of raid disks */
	if (mddev->pers->check_reshape == NULL)
		return -EINVAL;
	if (mddev->ro)
		return -EROFS;
	if (raid_disks <= 0 ||
	    (mddev->max_disks && raid_disks >= mddev->max_disks))
		return -EINVAL;
	if (mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	    mddev->reshape_position != MaxSector)
		return -EBUSY;

	rdev_for_each(rdev, mddev) {
		if (mddev->raid_disks < raid_disks &&
		    rdev->data_offset < rdev->new_data_offset)
			return -EINVAL;
		if (mddev->raid_disks > raid_disks &&
		    rdev->data_offset > rdev->new_data_offset)
			return -EINVAL;
	}

	mddev->delta_disks = raid_disks - mddev->raid_disks;
	if (mddev->delta_disks < 0)
		mddev->reshape_backwards = 1;
	else if (mddev->delta_disks > 0)
		mddev->reshape_backwards = 0;

	rv = mddev->pers->check_reshape(mddev);
	if (rv < 0) {
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
	return rv;
}

/*
 * update_array_info is used to change the configuration of an
 * on-line array.
 * The version, ctime, level, size, raid_disks, not_persistent, layout
 * and chunk_size fields in the info are checked against the array.
 * Any differences that cannot be handled will cause an error.
 * Normally, only one change can be managed at a time.
 */
static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
{
	int rv = 0;
	int cnt = 0;
	int state = 0;

	/* calculate expected state, ignoring low bits */
	if (mddev->bitmap && mddev->bitmap_info.offset)
		state |= (1 << MD_SB_BITMAP_PRESENT);

	if (mddev->major_version != info->major_version ||
	    mddev->minor_version != info->minor_version ||
/*	    mddev->patch_version != info->patch_version || */
	    mddev->ctime != info->ctime ||
	    mddev->level != info->level ||
/*	    mddev->layout != info->layout || */
	    mddev->persistent != !info->not_persistent ||
	    mddev->chunk_sectors != info->chunk_size >> 9 ||
	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
	    ((state^info->state) & 0xfffffe00)
		)
		return -EINVAL;
	/* Check there is only one change */
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		cnt++;
	if (mddev->raid_disks != info->raid_disks)
		cnt++;
	if (mddev->layout != info->layout)
		cnt++;
	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
		cnt++;
	if (cnt == 0)
		return 0;
	if (cnt > 1)
		return -EINVAL;

	if (mddev->layout != info->layout) {
		/* Change layout
		 * we don't need to do anything at the md level, the
		 * personality will take care of it all.
		 */
		if (mddev->pers->check_reshape == NULL)
			return -EINVAL;
		else {
			mddev->new_layout = info->layout;
			rv = mddev->pers->check_reshape(mddev);
			if (rv)
				mddev->new_layout = mddev->layout;
			return rv;
		}
	}
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		rv = update_size(mddev, (sector_t)info->size * 2);

	if (mddev->raid_disks != info->raid_disks)
		rv = update_raid_disks(mddev, info->raid_disks);

	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
			rv = -EINVAL;
			goto err;
		}
		if (mddev->recovery || mddev->sync_thread) {
			rv = -EBUSY;
			goto err;
		}
		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
			struct bitmap *bitmap;
			/* add the bitmap */
			if (mddev->bitmap) {
				rv = -EEXIST;
				goto err;
			}
			if (mddev->bitmap_info.default_offset == 0) {
				rv = -EINVAL;
				goto err;
			}
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
			mddev->pers->quiesce(mddev, 1);
			bitmap = bitmap_create(mddev, -1);
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
				rv = bitmap_load(mddev);
			} else
				rv = PTR_ERR(bitmap);
			if (rv)
				bitmap_destroy(mddev);
			mddev->pers->quiesce(mddev, 0);
		} else {
			/* remove the bitmap */
			if (!mddev->bitmap) {
				rv = -ENOENT;
				goto err;
			}
			if (mddev->bitmap->storage.file) {
				rv = -EINVAL;
				goto err;
			}
			mddev->pers->quiesce(mddev, 1);
			bitmap_destroy(mddev);
			mddev->pers->quiesce(mddev, 0);
			mddev->bitmap_info.offset = 0;
		}
	}
	md_update_sb(mddev, 1);
	return rv;
err:
	return rv;
}

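/*
 * SET_DISK_FAULTY: manually fail a member device, as if a media error
 * had been reported.  Returns -EBUSY if the device survives, i.e. the
 * personality refused to mark it Faulty.
 */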
static int set_disk_faulty(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;
	int err = 0;

	if (mddev->pers == NULL)
		return -ENODEV;

	rcu_read_lock();
	rdev = find_rdev_rcu(mddev, dev);
	if (!rdev)
		err = -ENODEV;
	else {
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags))
			err = -EBUSY;
	}
	rcu_read_unlock();
	return err;
}

/*
 * We have a problem here : there is no easy way to give a CHS
 * virtual geometry. We currently pretend that we have a 2 heads
 * 4 sectors (with a BIG number of cylinders...). This drives
 * dosfs just mad... ;-)
 */
static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mddev *mddev = bdev->bd_disk->private_data;

	geo->heads = 2;
	geo->sectors = 4;
	geo->cylinders = mddev->array_sectors / 8;
	return 0;
}

static inline bool md_ioctl_valid(unsigned int cmd)
{
	switch (cmd) {
	case ADD_NEW_DISK:
	case BLKROSET:
	case GET_ARRAY_INFO:
	case GET_BITMAP_FILE:
	case GET_DISK_INFO:
	case HOT_ADD_DISK:
	case HOT_REMOVE_DISK:
	case RAID_AUTORUN:
	case RAID_VERSION:
	case RESTART_ARRAY_RW:
	case RUN_ARRAY:
	case SET_ARRAY_INFO:
	case SET_BITMAP_FILE:
	case SET_DISK_FAULTY:
	case STOP_ARRAY:
	case STOP_ARRAY_RO:
	case CLUSTERED_DISK_NACK:
		return true;
	default:
		return false;
	}
}

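/*
 * Main ioctl entry point for /dev/mdX.  For illustration, a userspace
 * caller (mdadm does the equivalent internally) might query an array
 * roughly like this (a sketch, not part of this file):
 *
 *	int fd = open("/dev/md0", O_RDONLY);
 *	mdu_array_info_t info;
 *	if (fd >= 0 && ioctl(fd, GET_ARRAY_INFO, &info) == 0)
 *		printf("level %d, %d raid disks\n",
 *		       info.level, info.raid_disks);
 *
 * Commands are first validated, then checked for CAP_SYS_ADMIN where
 * needed, and only then dispatched, taking the mddev lock for those
 * operations that require it.
 */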
a39907fa 6671static int md_ioctl(struct block_device *bdev, fmode_t mode,
1da177e4
LT
6672 unsigned int cmd, unsigned long arg)
6673{
6674 int err = 0;
6675 void __user *argp = (void __user *)arg;
fd01b88c 6676 struct mddev *mddev = NULL;
e2218350 6677 int ro;
1da177e4 6678
cb335f88
NS
6679 if (!md_ioctl_valid(cmd))
6680 return -ENOTTY;
6681
506c9e44
N
6682 switch (cmd) {
6683 case RAID_VERSION:
6684 case GET_ARRAY_INFO:
6685 case GET_DISK_INFO:
6686 break;
6687 default:
6688 if (!capable(CAP_SYS_ADMIN))
6689 return -EACCES;
6690 }
1da177e4
LT
6691
6692 /*
6693 * Commands dealing with the RAID driver but not any
6694 * particular array:
6695 */
c02c0aeb
N
6696 switch (cmd) {
6697 case RAID_VERSION:
6698 err = get_version(argp);
3adc28d8 6699 goto out;
1da177e4 6700
1da177e4 6701#ifndef MODULE
c02c0aeb
N
6702 case RAID_AUTORUN:
6703 err = 0;
6704 autostart_arrays(arg);
3adc28d8 6705 goto out;
1da177e4 6706#endif
c02c0aeb 6707 default:;
1da177e4
LT
6708 }
6709
6710 /*
6711 * Commands creating/starting a new array:
6712 */
6713
a39907fa 6714 mddev = bdev->bd_disk->private_data;
1da177e4
LT
6715
6716 if (!mddev) {
6717 BUG();
3adc28d8 6718 goto out;
1da177e4
LT
6719 }
6720
1ca69c4b
N
6721 /* Some actions do not requires the mutex */
6722 switch (cmd) {
6723 case GET_ARRAY_INFO:
6724 if (!mddev->raid_disks && !mddev->external)
6725 err = -ENODEV;
6726 else
6727 err = get_array_info(mddev, argp);
3adc28d8 6728 goto out;
1ca69c4b
N
6729
6730 case GET_DISK_INFO:
6731 if (!mddev->raid_disks && !mddev->external)
6732 err = -ENODEV;
6733 else
6734 err = get_disk_info(mddev, argp);
3adc28d8 6735 goto out;
1ca69c4b
N
6736
6737 case SET_DISK_FAULTY:
6738 err = set_disk_faulty(mddev, new_decode_dev(arg));
3adc28d8 6739 goto out;
4af1a041
N
6740
6741 case GET_BITMAP_FILE:
6742 err = get_bitmap_file(mddev, argp);
6743 goto out;
6744
1ca69c4b
N
6745 }
6746
a7a3f08d
N
6747 if (cmd == ADD_NEW_DISK)
6748 /* need to ensure md_delayed_delete() has completed */
6749 flush_workqueue(md_misc_wq);
6750
90f5f7ad
HR
6751 if (cmd == HOT_REMOVE_DISK)
6752 /* need to ensure recovery thread has run */
6753 wait_event_interruptible_timeout(mddev->sb_wait,
6754 !test_bit(MD_RECOVERY_NEEDED,
6755 &mddev->flags),
6756 msecs_to_jiffies(5000));
260fa034
N
6757 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
6758 /* Need to flush page cache, and ensure no-one else opens
6759 * and writes
6760 */
6761 mutex_lock(&mddev->open_mutex);
9ba3b7f5 6762 if (mddev->pers && atomic_read(&mddev->openers) > 1) {
260fa034
N
6763 mutex_unlock(&mddev->open_mutex);
6764 err = -EBUSY;
3adc28d8 6765 goto out;
260fa034
N
6766 }
6767 set_bit(MD_STILL_CLOSED, &mddev->flags);
6768 mutex_unlock(&mddev->open_mutex);
6769 sync_blockdev(bdev);
6770 }
1da177e4
LT
6771 err = mddev_lock(mddev);
6772 if (err) {
f72ffdd6 6773 printk(KERN_INFO
1da177e4
LT
6774 "md: ioctl lock interrupted, reason %d, cmd %d\n",
6775 err, cmd);
3adc28d8 6776 goto out;
1da177e4
LT
6777 }
6778
c02c0aeb
N
6779 if (cmd == SET_ARRAY_INFO) {
6780 mdu_array_info_t info;
6781 if (!arg)
6782 memset(&info, 0, sizeof(info));
6783 else if (copy_from_user(&info, argp, sizeof(info))) {
6784 err = -EFAULT;
3adc28d8 6785 goto unlock;
c02c0aeb
N
6786 }
6787 if (mddev->pers) {
6788 err = update_array_info(mddev, &info);
6789 if (err) {
6790 printk(KERN_WARNING "md: couldn't update"
6791 " array info. %d\n", err);
3adc28d8 6792 goto unlock;
1da177e4 6793 }
3adc28d8 6794 goto unlock;
c02c0aeb
N
6795 }
6796 if (!list_empty(&mddev->disks)) {
6797 printk(KERN_WARNING
6798 "md: array %s already has disks!\n",
6799 mdname(mddev));
6800 err = -EBUSY;
3adc28d8 6801 goto unlock;
c02c0aeb
N
6802 }
6803 if (mddev->raid_disks) {
6804 printk(KERN_WARNING
6805 "md: array %s already initialised!\n",
6806 mdname(mddev));
6807 err = -EBUSY;
3adc28d8 6808 goto unlock;
c02c0aeb
N
6809 }
6810 err = set_array_info(mddev, &info);
6811 if (err) {
6812 printk(KERN_WARNING "md: couldn't set"
6813 " array info. %d\n", err);
3adc28d8 6814 goto unlock;
c02c0aeb 6815 }
3adc28d8 6816 goto unlock;
1da177e4
LT
6817 }
6818
6819 /*
6820 * Commands querying/configuring an existing array:
6821 */
32a7627c 6822 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
3f9d7b0d 6823 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
a17184a9
N
6824 if ((!mddev->raid_disks && !mddev->external)
6825 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
6826 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
6827 && cmd != GET_BITMAP_FILE) {
1da177e4 6828 err = -ENODEV;
3adc28d8 6829 goto unlock;
1da177e4
LT
6830 }
6831
6832 /*
6833 * Commands even a read-only array can execute:
6834 */
c02c0aeb 6835 switch (cmd) {
c02c0aeb
N
6836 case RESTART_ARRAY_RW:
6837 err = restart_array(mddev);
3adc28d8 6838 goto unlock;
1da177e4 6839
c02c0aeb
N
6840 case STOP_ARRAY:
6841 err = do_md_stop(mddev, 0, bdev);
3adc28d8 6842 goto unlock;
1da177e4 6843
c02c0aeb
N
6844 case STOP_ARRAY_RO:
6845 err = md_set_readonly(mddev, bdev);
3adc28d8 6846 goto unlock;
1da177e4 6847
3ea8929d
N
6848 case HOT_REMOVE_DISK:
6849 err = hot_remove_disk(mddev, new_decode_dev(arg));
3adc28d8 6850 goto unlock;
3ea8929d 6851
7ceb17e8
N
6852 case ADD_NEW_DISK:
6853 /* We can support ADD_NEW_DISK on read-only arrays
6854 * only if we are re-adding a preexisting device.
6855 * So require mddev->pers and MD_DISK_SYNC.
6856 */
6857 if (mddev->pers) {
6858 mdu_disk_info_t info;
6859 if (copy_from_user(&info, argp, sizeof(info)))
6860 err = -EFAULT;
6861 else if (!(info.state & (1<<MD_DISK_SYNC)))
6862 /* Need to clear read-only for this */
6863 break;
6864 else
6865 err = add_new_disk(mddev, &info);
3adc28d8 6866 goto unlock;
7ceb17e8
N
6867 }
6868 break;
6869
c02c0aeb
N
6870 case BLKROSET:
6871 if (get_user(ro, (int __user *)(arg))) {
6872 err = -EFAULT;
3adc28d8 6873 goto unlock;
c02c0aeb
N
6874 }
6875 err = -EINVAL;
e2218350 6876
c02c0aeb
N
6877 /* if the bdev is going readonly the value of mddev->ro
6878 * does not matter, no writes are coming
6879 */
6880 if (ro)
3adc28d8 6881 goto unlock;
e2218350 6882
c02c0aeb
N
6883 /* are we already prepared for writes? */
6884 if (mddev->ro != 1)
3adc28d8 6885 goto unlock;
e2218350 6886
c02c0aeb
N
6887 /* transitioning to read-auto need only happen for
6888 * arrays that call md_write_start
6889 */
6890 if (mddev->pers) {
6891 err = restart_array(mddev);
6892 if (err == 0) {
6893 mddev->ro = 2;
6894 set_disk_ro(mddev->gendisk, 0);
e2218350 6895 }
c02c0aeb 6896 }
3adc28d8 6897 goto unlock;
1da177e4
LT
6898 }
6899
6900 /*
6901 * The remaining ioctls are changing the state of the
f91de92e 6902 * superblock, so we do not allow them on read-only arrays.
1da177e4 6903 */
326eb17d 6904 if (mddev->ro && mddev->pers) {
f91de92e
N
6905 if (mddev->ro == 2) {
6906 mddev->ro = 0;
00bcb4ac 6907 sysfs_notify_dirent_safe(mddev->sysfs_state);
0fd62b86 6908 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
f3378b48
N
6909 /* mddev_unlock will wake thread */
6910 /* If a device failed while we were read-only, we
6911 * need to make sure the metadata is updated now.
6912 */
6913 if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
6914 mddev_unlock(mddev);
6915 wait_event(mddev->sb_wait,
6916 !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
6917 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
29f097c4 6918 mddev_lock_nointr(mddev);
f3378b48 6919 }
f91de92e
N
6920 } else {
6921 err = -EROFS;
3adc28d8 6922 goto unlock;
f91de92e 6923 }
1da177e4
LT
6924 }
6925
c02c0aeb
N
6926 switch (cmd) {
6927 case ADD_NEW_DISK:
1da177e4 6928 {
c02c0aeb
N
6929 mdu_disk_info_t info;
6930 if (copy_from_user(&info, argp, sizeof(info)))
6931 err = -EFAULT;
6932 else
6933 err = add_new_disk(mddev, &info);
3adc28d8 6934 goto unlock;
c02c0aeb 6935 }
1da177e4 6936
1aee41f6
GR
6937 case CLUSTERED_DISK_NACK:
6938 if (mddev_is_clustered(mddev))
6939 md_cluster_ops->new_disk_ack(mddev, false);
6940 else
6941 err = -EINVAL;
6942 goto unlock;
6943
c02c0aeb
N
6944 case HOT_ADD_DISK:
6945 err = hot_add_disk(mddev, new_decode_dev(arg));
3adc28d8 6946 goto unlock;
1da177e4 6947
c02c0aeb
N
6948 case RUN_ARRAY:
6949 err = do_md_run(mddev);
3adc28d8 6950 goto unlock;
1da177e4 6951
c02c0aeb
N
6952 case SET_BITMAP_FILE:
6953 err = set_bitmap_file(mddev, (int)arg);
3adc28d8 6954 goto unlock;
32a7627c 6955
c02c0aeb
N
6956 default:
6957 err = -EINVAL;
3adc28d8 6958 goto unlock;
1da177e4
LT
6959 }
6960
3adc28d8 6961unlock:
d3374825
N
6962 if (mddev->hold_active == UNTIL_IOCTL &&
6963 err != -EINVAL)
6964 mddev->hold_active = 0;
1da177e4 6965 mddev_unlock(mddev);
3adc28d8 6966out:
1da177e4
LT
6967 return err;
6968}
aa98aa31
AB
6969#ifdef CONFIG_COMPAT
6970static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
6971 unsigned int cmd, unsigned long arg)
6972{
6973 switch (cmd) {
6974 case HOT_REMOVE_DISK:
6975 case HOT_ADD_DISK:
6976 case SET_DISK_FAULTY:
6977 case SET_BITMAP_FILE:
6978 /* These take in integer arg, do not convert */
6979 break;
6980 default:
6981 arg = (unsigned long)compat_ptr(arg);
6982 break;
6983 }
6984
6985 return md_ioctl(bdev, mode, cmd, arg);
6986}
6987#endif /* CONFIG_COMPAT */
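/*
 * Why the conversion above matters, with an illustrative case: a 32-bit
 * process passes a 32-bit user pointer in 'arg'; compat_ptr() widens it
 * into a valid 64-bit __user pointer.  The four commands excluded above
 * carry plain integers (device numbers, or a file descriptor for
 * SET_BITMAP_FILE) that must not be treated as pointers.
 */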
1da177e4 6988
a39907fa 6989static int md_open(struct block_device *bdev, fmode_t mode)
1da177e4
LT
6990{
6991 /*
6992 * Succeed if we can lock the mddev, which confirms that
6993 * it isn't being stopped right now.
6994 */
fd01b88c 6995 struct mddev *mddev = mddev_find(bdev->bd_dev);
1da177e4
LT
6996 int err;
6997
0c098220
YL
6998 if (!mddev)
6999 return -ENODEV;
7000
d3374825
N
7001 if (mddev->gendisk != bdev->bd_disk) {
7002 /* we are racing with mddev_put which is discarding this
7003 * bd_disk.
7004 */
7005 mddev_put(mddev);
7006 /* Wait until bdev->bd_disk is definitely gone */
e804ac78 7007 flush_workqueue(md_misc_wq);
d3374825
N
7008 /* Then retry the open from the top */
7009 return -ERESTARTSYS;
7010 }
7011 BUG_ON(mddev != bdev->bd_disk->private_data);
7012
c8c00a69 7013 if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
1da177e4
LT
7014 goto out;
7015
7016 err = 0;
f2ea68cf 7017 atomic_inc(&mddev->openers);
260fa034 7018 clear_bit(MD_STILL_CLOSED, &mddev->flags);
c8c00a69 7019 mutex_unlock(&mddev->open_mutex);
1da177e4 7020
f0b4f7e2 7021 check_disk_change(bdev);
1da177e4
LT
7022 out:
7023 return err;
7024}
7025
db2a144b 7026static void md_release(struct gendisk *disk, fmode_t mode)
1da177e4 7027{
f72ffdd6 7028 struct mddev *mddev = disk->private_data;
1da177e4 7029
52e5f9d1 7030 BUG_ON(!mddev);
f2ea68cf 7031 atomic_dec(&mddev->openers);
1da177e4 7032 mddev_put(mddev);
1da177e4 7033}
f0b4f7e2
N
7034
7035static int md_media_changed(struct gendisk *disk)
7036{
fd01b88c 7037 struct mddev *mddev = disk->private_data;
f0b4f7e2
N
7038
7039 return mddev->changed;
7040}
7041
7042static int md_revalidate(struct gendisk *disk)
7043{
fd01b88c 7044 struct mddev *mddev = disk->private_data;
f0b4f7e2
N
7045
7046 mddev->changed = 0;
7047 return 0;
7048}
83d5cde4 7049static const struct block_device_operations md_fops =
1da177e4
LT
7050{
7051 .owner = THIS_MODULE,
a39907fa
AV
7052 .open = md_open,
7053 .release = md_release,
b492b852 7054 .ioctl = md_ioctl,
aa98aa31
AB
7055#ifdef CONFIG_COMPAT
7056 .compat_ioctl = md_compat_ioctl,
7057#endif
a885c8c4 7058 .getgeo = md_getgeo,
f0b4f7e2
N
7059 .media_changed = md_media_changed,
7060 .revalidate_disk= md_revalidate,
1da177e4
LT
7061};
7062
f72ffdd6 7063static int md_thread(void *arg)
1da177e4 7064{
2b8bf345 7065 struct md_thread *thread = arg;
1da177e4 7066
1da177e4
LT
7067 /*
7068 * md_thread is a 'system-thread'; its priority should be very
7069 * high. We avoid resource deadlocks individually in each
7070 * raid personality. (RAID5 does preallocation) We also use RR and
7071 * the very same RT priority as kswapd, thus we will never get
7072 * into a priority inversion deadlock.
7073 *
7074 * we definitely have to have equal or higher priority than
7075 * bdflush, otherwise bdflush will deadlock if there are too
7076 * many dirty RAID5 blocks.
7077 */
1da177e4 7078
6985c43f 7079 allow_signal(SIGKILL);
a6fb0934 7080 while (!kthread_should_stop()) {
1da177e4 7081
93588e22
N
7082 /* We need to wait INTERRUPTIBLE so that
7083 * we don't add to the load-average.
7084 * That means we need to be sure no signals are
7085 * pending
7086 */
7087 if (signal_pending(current))
7088 flush_signals(current);
7089
7090 wait_event_interruptible_timeout
7091 (thread->wqueue,
7092 test_bit(THREAD_WAKEUP, &thread->flags)
7093 || kthread_should_stop(),
7094 thread->timeout);
1da177e4 7095
6c987910
N
7096 clear_bit(THREAD_WAKEUP, &thread->flags);
7097 if (!kthread_should_stop())
4ed8731d 7098 thread->run(thread);
1da177e4 7099 }
a6fb0934 7100
1da177e4
LT
7101 return 0;
7102}
7103
2b8bf345 7104void md_wakeup_thread(struct md_thread *thread)
1da177e4
LT
7105{
7106 if (thread) {
36a4e1fe 7107 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
1da177e4
LT
7108 set_bit(THREAD_WAKEUP, &thread->flags);
7109 wake_up(&thread->wqueue);
7110 }
7111}
6c144d31 7112EXPORT_SYMBOL(md_wakeup_thread);
1da177e4 7113
4ed8731d
SL
7114struct md_thread *md_register_thread(void (*run) (struct md_thread *),
7115 struct mddev *mddev, const char *name)
1da177e4 7116{
2b8bf345 7117 struct md_thread *thread;
1da177e4 7118
2b8bf345 7119 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
1da177e4
LT
7120 if (!thread)
7121 return NULL;
7122
1da177e4
LT
7123 init_waitqueue_head(&thread->wqueue);
7124
1da177e4
LT
7125 thread->run = run;
7126 thread->mddev = mddev;
32a7627c 7127 thread->timeout = MAX_SCHEDULE_TIMEOUT;
0da3c619
N
7128 thread->tsk = kthread_run(md_thread, thread,
7129 "%s_%s",
7130 mdname(thread->mddev),
0232605d 7131 name);
a6fb0934 7132 if (IS_ERR(thread->tsk)) {
1da177e4
LT
7133 kfree(thread);
7134 return NULL;
7135 }
1da177e4
LT
7136 return thread;
7137}
6c144d31 7138EXPORT_SYMBOL(md_register_thread);
1da177e4 7139
2b8bf345 7140void md_unregister_thread(struct md_thread **threadp)
1da177e4 7141{
2b8bf345 7142 struct md_thread *thread = *threadp;
e0cf8f04
N
7143 if (!thread)
7144 return;
36a4e1fe 7145 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
01f96c0a
N
7146 /* Locking ensures that mddev_unlock does not wake_up a
7147 * non-existent thread
7148 */
7149 spin_lock(&pers_lock);
7150 *threadp = NULL;
7151 spin_unlock(&pers_lock);
a6fb0934
N
7152
7153 kthread_stop(thread->tsk);
1da177e4
LT
7154 kfree(thread);
7155}
6c144d31 7156EXPORT_SYMBOL(md_unregister_thread);
1da177e4 7157
fd01b88c 7158void md_error(struct mddev *mddev, struct md_rdev *rdev)
1da177e4 7159{
b2d444d7 7160 if (!rdev || test_bit(Faulty, &rdev->flags))
1da177e4 7161 return;
6bfe0b49 7162
de393cde 7163 if (!mddev->pers || !mddev->pers->error_handler)
1da177e4
LT
7164 return;
7165 mddev->pers->error_handler(mddev,rdev);
72a23c21
NB
7166 if (mddev->degraded)
7167 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
00bcb4ac 7168 sysfs_notify_dirent_safe(rdev->sysfs_state);
1da177e4
LT
7169 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7170 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7171 md_wakeup_thread(mddev->thread);
768a418d 7172 if (mddev->event_work.func)
e804ac78 7173 queue_work(md_misc_wq, &mddev->event_work);
c331eb04 7174 md_new_event_inintr(mddev);
1da177e4 7175}
6c144d31 7176EXPORT_SYMBOL(md_error);
1da177e4
LT
7177
7178/* seq_file implementation /proc/mdstat */
7179
7180static void status_unused(struct seq_file *seq)
7181{
7182 int i = 0;
3cb03002 7183 struct md_rdev *rdev;
1da177e4
LT
7184
7185 seq_printf(seq, "unused devices: ");
7186
159ec1fc 7187 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
1da177e4
LT
7188 char b[BDEVNAME_SIZE];
7189 i++;
7190 seq_printf(seq, "%s ",
7191 bdevname(rdev->bdev,b));
7192 }
7193 if (!i)
7194 seq_printf(seq, "<none>");
7195
7196 seq_printf(seq, "\n");
7197}
7198
f7851be7 7199static int status_resync(struct seq_file *seq, struct mddev *mddev)
1da177e4 7200{
dd71cf6b
N
7201 sector_t max_sectors, resync, res;
7202 unsigned long dt, db;
7203 sector_t rt;
4588b42e
N
7204 int scale;
7205 unsigned int per_milli;
1da177e4 7206
c804cdec
N
7207 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
7208 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
dd71cf6b 7209 max_sectors = mddev->resync_max_sectors;
1da177e4 7210 else
dd71cf6b 7211 max_sectors = mddev->dev_sectors;
1da177e4 7212
f7851be7
N
7213 resync = mddev->curr_resync;
7214 if (resync <= 3) {
7215 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7216 /* Still cleaning up */
7217 resync = max_sectors;
7218 } else
7219 resync -= atomic_read(&mddev->recovery_active);
7220
7221 if (resync == 0) {
7222 if (mddev->recovery_cp < MaxSector) {
7223 seq_printf(seq, "\tresync=PENDING");
7224 return 1;
7225 }
7226 return 0;
7227 }
7228 if (resync < 3) {
7229 seq_printf(seq, "\tresync=DELAYED");
7230 return 1;
7231 }
7232
403df478 7233 WARN_ON(max_sectors == 0);
4588b42e 7234 /* Pick 'scale' such that (resync>>scale)*1000 will fit
dd71cf6b 7235 * in a sector_t, and (max_sectors>>scale) will fit in a
4588b42e
N
7236 * u32, as those are the requirements for sector_div.
7237 * Thus 'scale' must be at least 10
7238 */
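/*
 * Worked example with illustrative numbers: a 16TiB array has
 * max_sectors = 2^35, so scale stays at 10 (max_sectors/2 = 2^34 does
 * not exceed 1ULL << 42).  Half-way through, resync = 2^34 and
 * res = (2^34 >> 10) * 1000 / ((2^35 >> 10) + 1) ~= 500,
 * i.e. per_milli = 500 and the progress line shows 50.0%.
 */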
7239 scale = 10;
7240 if (sizeof(sector_t) > sizeof(unsigned long)) {
dd71cf6b 7241 while ( max_sectors/2 > (1ULL<<(scale+32)))
4588b42e
N
7242 scale++;
7243 }
7244 res = (resync>>scale)*1000;
dd71cf6b 7245 sector_div(res, (u32)((max_sectors>>scale)+1));
4588b42e
N
7246
7247 per_milli = res;
1da177e4 7248 {
4588b42e 7249 int i, x = per_milli/50, y = 20-x;
1da177e4
LT
7250 seq_printf(seq, "[");
7251 for (i = 0; i < x; i++)
7252 seq_printf(seq, "=");
7253 seq_printf(seq, ">");
7254 for (i = 0; i < y; i++)
7255 seq_printf(seq, ".");
7256 seq_printf(seq, "] ");
7257 }
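/* Example rendering at per_milli = 500: "[==========>..........]" */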
4588b42e 7258 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
ccfcc3c1
N
7259 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
7260 "reshape" :
61df9d91
N
7261 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
7262 "check" :
7263 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
7264 "resync" : "recovery"))),
7265 per_milli/10, per_milli % 10,
dd71cf6b
N
7266 (unsigned long long) resync/2,
7267 (unsigned long long) max_sectors/2);
1da177e4
LT
7268
7269 /*
1da177e4
LT
7270 * dt: time from mark until now
7271 * db: blocks written from mark until now
7272 * rt: remaining time
dd71cf6b
N
7273 *
7274 * rt is a sector_t, so could be 32bit or 64bit.
7275 * So we divide before multiply in case it is 32bit and close
7276 * to the limit.
25985edc 7277 * We scale the divisor (db) by 32 to avoid losing precision
dd71cf6b
N
7278 * near the end of resync when the number of remaining sectors
7279 * is close to 'db'.
7280 * We then divide rt by 32 after multiplying by db to compensate.
7281 * The '+1' avoids division by zero if db is very small.
1da177e4
LT
7282 */
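/*
 * Worked example (illustrative numbers): 2^21 sectors remain and
 * db = 2^16 sectors completed in dt = 32 seconds.  Then
 * rt = 2^21 / (2^16/32 + 1) ~= 1023, rt *= 32, rt >>= 5, leaving
 * rt ~= 1023 seconds, printed as "finish=17.0min".
 */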
7283 dt = ((jiffies - mddev->resync_mark) / HZ);
7284 if (!dt) dt++;
ff4e8d9a
N
7285 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
7286 - mddev->resync_mark_cnt;
1da177e4 7287
dd71cf6b
N
7288 rt = max_sectors - resync; /* number of remaining sectors */
7289 sector_div(rt, db/32+1);
7290 rt *= dt;
7291 rt >>= 5;
7292
7293 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
7294 ((unsigned long)rt % 60)/6);
1da177e4 7295
ff4e8d9a 7296 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
f7851be7 7297 return 1;
1da177e4
LT
7298}
7299
7300static void *md_seq_start(struct seq_file *seq, loff_t *pos)
7301{
7302 struct list_head *tmp;
7303 loff_t l = *pos;
fd01b88c 7304 struct mddev *mddev;
1da177e4
LT
7305
7306 if (l >= 0x10000)
7307 return NULL;
7308 if (!l--)
7309 /* header */
7310 return (void*)1;
7311
7312 spin_lock(&all_mddevs_lock);
7313 list_for_each(tmp,&all_mddevs)
7314 if (!l--) {
fd01b88c 7315 mddev = list_entry(tmp, struct mddev, all_mddevs);
1da177e4
LT
7316 mddev_get(mddev);
7317 spin_unlock(&all_mddevs_lock);
7318 return mddev;
7319 }
7320 spin_unlock(&all_mddevs_lock);
7321 if (!l--)
7322 return (void*)2;/* tail */
7323 return NULL;
7324}
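/*
 * The opaque cursors above form a small protocol: (void *)1 stands
 * for the "Personalities :" header, (void *)2 for the trailing
 * "unused devices:" line, and any other non-NULL value is a live
 * mddev whose reference is dropped in md_seq_next()/md_seq_stop().
 */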
7325
7326static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
7327{
7328 struct list_head *tmp;
fd01b88c 7329 struct mddev *next_mddev, *mddev = v;
f72ffdd6 7330
1da177e4
LT
7331 ++*pos;
7332 if (v == (void*)2)
7333 return NULL;
7334
7335 spin_lock(&all_mddevs_lock);
7336 if (v == (void*)1)
7337 tmp = all_mddevs.next;
7338 else
7339 tmp = mddev->all_mddevs.next;
7340 if (tmp != &all_mddevs)
fd01b88c 7341 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
1da177e4
LT
7342 else {
7343 next_mddev = (void*)2;
7344 *pos = 0x10000;
f72ffdd6 7345 }
1da177e4
LT
7346 spin_unlock(&all_mddevs_lock);
7347
7348 if (v != (void*)1)
7349 mddev_put(mddev);
7350 return next_mddev;
7351
7352}
7353
7354static void md_seq_stop(struct seq_file *seq, void *v)
7355{
fd01b88c 7356 struct mddev *mddev = v;
1da177e4
LT
7357
7358 if (mddev && v != (void*)1 && v != (void*)2)
7359 mddev_put(mddev);
7360}
7361
7362static int md_seq_show(struct seq_file *seq, void *v)
7363{
fd01b88c 7364 struct mddev *mddev = v;
dd8ac336 7365 sector_t sectors;
3cb03002 7366 struct md_rdev *rdev;
1da177e4
LT
7367
7368 if (v == (void*)1) {
84fc4b56 7369 struct md_personality *pers;
1da177e4
LT
7370 seq_printf(seq, "Personalities : ");
7371 spin_lock(&pers_lock);
2604b703
N
7372 list_for_each_entry(pers, &pers_list, list)
7373 seq_printf(seq, "[%s] ", pers->name);
1da177e4
LT
7374
7375 spin_unlock(&pers_lock);
7376 seq_printf(seq, "\n");
f1514638 7377 seq->poll_event = atomic_read(&md_event_count);
1da177e4
LT
7378 return 0;
7379 }
7380 if (v == (void*)2) {
7381 status_unused(seq);
7382 return 0;
7383 }
7384
36d091f4 7385 spin_lock(&mddev->lock);
1da177e4
LT
7386 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
7387 seq_printf(seq, "%s : %sactive", mdname(mddev),
7388 mddev->pers ? "" : "in");
7389 if (mddev->pers) {
f91de92e 7390 if (mddev->ro==1)
1da177e4 7391 seq_printf(seq, " (read-only)");
f91de92e 7392 if (mddev->ro==2)
52720ae7 7393 seq_printf(seq, " (auto-read-only)");
1da177e4
LT
7394 seq_printf(seq, " %s", mddev->pers->name);
7395 }
7396
dd8ac336 7397 sectors = 0;
f97fcad3
N
7398 rcu_read_lock();
7399 rdev_for_each_rcu(rdev, mddev) {
1da177e4
LT
7400 char b[BDEVNAME_SIZE];
7401 seq_printf(seq, " %s[%d]",
7402 bdevname(rdev->bdev,b), rdev->desc_nr);
8ddf9efe
N
7403 if (test_bit(WriteMostly, &rdev->flags))
7404 seq_printf(seq, "(W)");
9efdca16
SL
7405 if (test_bit(Journal, &rdev->flags))
7406 seq_printf(seq, "(J)");
b2d444d7 7407 if (test_bit(Faulty, &rdev->flags)) {
1da177e4
LT
7408 seq_printf(seq, "(F)");
7409 continue;
2d78f8c4
N
7410 }
7411 if (rdev->raid_disk < 0)
b325a32e 7412 seq_printf(seq, "(S)"); /* spare */
2d78f8c4
N
7413 if (test_bit(Replacement, &rdev->flags))
7414 seq_printf(seq, "(R)");
dd8ac336 7415 sectors += rdev->sectors;
1da177e4 7416 }
f97fcad3 7417 rcu_read_unlock();
1da177e4
LT
7418
7419 if (!list_empty(&mddev->disks)) {
7420 if (mddev->pers)
7421 seq_printf(seq, "\n %llu blocks",
f233ea5c
AN
7422 (unsigned long long)
7423 mddev->array_sectors / 2);
1da177e4
LT
7424 else
7425 seq_printf(seq, "\n %llu blocks",
dd8ac336 7426 (unsigned long long)sectors / 2);
1da177e4 7427 }
1cd6bf19
N
7428 if (mddev->persistent) {
7429 if (mddev->major_version != 0 ||
7430 mddev->minor_version != 90) {
7431 seq_printf(seq," super %d.%d",
7432 mddev->major_version,
7433 mddev->minor_version);
7434 }
e691063a
N
7435 } else if (mddev->external)
7436 seq_printf(seq, " super external:%s",
7437 mddev->metadata_type);
7438 else
1cd6bf19 7439 seq_printf(seq, " super non-persistent");
1da177e4
LT
7440
7441 if (mddev->pers) {
d710e138 7442 mddev->pers->status(seq, mddev);
f72ffdd6 7443 seq_printf(seq, "\n ");
8e1b39d6 7444 if (mddev->pers->sync_request) {
f7851be7 7445 if (status_resync(seq, mddev))
8e1b39d6 7446 seq_printf(seq, "\n ");
8e1b39d6 7447 }
32a7627c
N
7448 } else
7449 seq_printf(seq, "\n ");
7450
57148964 7451 bitmap_status(seq, mddev->bitmap);
1da177e4
LT
7452
7453 seq_printf(seq, "\n");
7454 }
36d091f4 7455 spin_unlock(&mddev->lock);
f72ffdd6 7456
1da177e4
LT
7457 return 0;
7458}
7459
110518bc 7460static const struct seq_operations md_seq_ops = {
1da177e4
LT
7461 .start = md_seq_start,
7462 .next = md_seq_next,
7463 .stop = md_seq_stop,
7464 .show = md_seq_show,
7465};
7466
7467static int md_seq_open(struct inode *inode, struct file *file)
7468{
f1514638 7469 struct seq_file *seq;
1da177e4
LT
7470 int error;
7471
7472 error = seq_open(file, &md_seq_ops);
d7603b7e 7473 if (error)
f1514638
KS
7474 return error;
7475
7476 seq = file->private_data;
7477 seq->poll_event = atomic_read(&md_event_count);
1da177e4
LT
7478 return error;
7479}
7480
e2f23b60 7481static int md_unloading;
d7603b7e
N
7482static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
7483{
f1514638 7484 struct seq_file *seq = filp->private_data;
d7603b7e
N
7485 int mask;
7486
e2f23b60 7487 if (md_unloading)
7d7e64f2 7488 return POLLIN|POLLRDNORM|POLLERR|POLLPRI;
d7603b7e
N
7489 poll_wait(filp, &md_event_waiters, wait);
7490
7491 /* always allow read */
7492 mask = POLLIN | POLLRDNORM;
7493
f1514638 7494 if (seq->poll_event != atomic_read(&md_event_count))
d7603b7e
N
7495 mask |= POLLERR | POLLPRI;
7496 return mask;
7497}
7498
fa027c2a 7499static const struct file_operations md_seq_fops = {
e24650c2 7500 .owner = THIS_MODULE,
1da177e4
LT
7501 .open = md_seq_open,
7502 .read = seq_read,
7503 .llseek = seq_lseek,
c3f94b40 7504 .release = seq_release_private,
d7603b7e 7505 .poll = mdstat_poll,
1da177e4
LT
7506};
7507
84fc4b56 7508int register_md_personality(struct md_personality *p)
1da177e4 7509{
50bd3774
CY
7510 printk(KERN_INFO "md: %s personality registered for level %d\n",
7511 p->name, p->level);
1da177e4 7512 spin_lock(&pers_lock);
2604b703 7513 list_add_tail(&p->list, &pers_list);
1da177e4
LT
7514 spin_unlock(&pers_lock);
7515 return 0;
7516}
6c144d31 7517EXPORT_SYMBOL(register_md_personality);
1da177e4 7518
84fc4b56 7519int unregister_md_personality(struct md_personality *p)
1da177e4 7520{
2604b703 7521 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
1da177e4 7522 spin_lock(&pers_lock);
2604b703 7523 list_del_init(&p->list);
1da177e4
LT
7524 spin_unlock(&pers_lock);
7525 return 0;
7526}
6c144d31 7527EXPORT_SYMBOL(unregister_md_personality);
1da177e4 7528
6022e75b
N
7529int register_md_cluster_operations(struct md_cluster_operations *ops,
7530 struct module *module)
edb39c9d 7531{
6022e75b 7532 int ret = 0;
edb39c9d 7533 spin_lock(&pers_lock);
6022e75b
N
7534 if (md_cluster_ops != NULL)
7535 ret = -EALREADY;
7536 else {
7537 md_cluster_ops = ops;
7538 md_cluster_mod = module;
7539 }
edb39c9d 7540 spin_unlock(&pers_lock);
6022e75b 7541 return ret;
edb39c9d
GR
7542}
7543EXPORT_SYMBOL(register_md_cluster_operations);
7544
7545int unregister_md_cluster_operations(void)
7546{
7547 spin_lock(&pers_lock);
7548 md_cluster_ops = NULL;
7549 spin_unlock(&pers_lock);
7550 return 0;
7551}
7552EXPORT_SYMBOL(unregister_md_cluster_operations);
7553
7554int md_setup_cluster(struct mddev *mddev, int nodes)
7555{
7556 int err;
7557
7558 err = request_module("md-cluster");
7559 if (err) {
7560 pr_err("md-cluster module not found.\n");
b0c26a79 7561 return -ENOENT;
edb39c9d
GR
7562 }
7563
7564 spin_lock(&pers_lock);
7565 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
7566 spin_unlock(&pers_lock);
7567 return -ENOENT;
7568 }
7569 spin_unlock(&pers_lock);
7570
cf921cc1 7571 return md_cluster_ops->join(mddev, nodes);
edb39c9d
GR
7572}
7573
7574void md_cluster_stop(struct mddev *mddev)
7575{
c4ce867f
GR
7576 if (!md_cluster_ops)
7577 return;
edb39c9d
GR
7578 md_cluster_ops->leave(mddev);
7579 module_put(md_cluster_mod);
7580}
7581
fd01b88c 7582static int is_mddev_idle(struct mddev *mddev, int init)
1da177e4 7583{
f72ffdd6 7584 struct md_rdev *rdev;
1da177e4 7585 int idle;
eea1bf38 7586 int curr_events;
1da177e4
LT
7587
7588 idle = 1;
4b80991c
N
7589 rcu_read_lock();
7590 rdev_for_each_rcu(rdev, mddev) {
1da177e4 7591 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
eea1bf38
N
7592 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
7593 (int)part_stat_read(&disk->part0, sectors[1]) -
7594 atomic_read(&disk->sync_io);
713f6ab1
N
7595 /* sync IO will cause sync_io to increase before the disk_stats
7596 * as sync_io is counted when a request starts, and
7597 * disk_stats is counted when it completes.
7598 * So resync activity will cause curr_events to be smaller than
7599 * when there was no such activity.
7600 * non-sync IO will cause disk_stat to increase without
7601 * increasing sync_io so curr_events will (eventually)
7602 * be larger than it was before. Once it becomes
7603 * substantially larger, the test below will cause
7604 * the array to appear non-idle, and resync will slow
7605 * down.
7606 * If there is a lot of outstanding resync activity when
7607 * we set last_event to curr_events, then all that activity
7608 * completing might cause the array to appear non-idle
7609 * and resync will be slowed down even though there might
7610 * not have been non-resync activity. This will only
7611 * happen once though. 'last_events' will soon reflect
7612 * the state where there is little or no outstanding
7613 * resync requests, and further resync activity will
7614 * always make curr_events less than last_events.
c0e48521 7615 *
1da177e4 7616 */
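/*
 * Illustrative numbers: a resync that issued 1000 sectors since the
 * last check advances disk_stats and sync_io by roughly the same
 * amount, so curr_events stays within 64 of rdev->last_events and
 * the array still counts as idle.  A filesystem writing 1000 sectors
 * advances only disk_stats, curr_events jumps past the 64-sector
 * slack below, and the resync is throttled.
 */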
eea1bf38 7617 if (init || curr_events - rdev->last_events > 64) {
1da177e4
LT
7618 rdev->last_events = curr_events;
7619 idle = 0;
7620 }
7621 }
4b80991c 7622 rcu_read_unlock();
1da177e4
LT
7623 return idle;
7624}
7625
fd01b88c 7626void md_done_sync(struct mddev *mddev, int blocks, int ok)
1da177e4
LT
7627{
7628 /* another "blocks" 512-byte sectors have been synced */
7629 atomic_sub(blocks, &mddev->recovery_active);
7630 wake_up(&mddev->recovery_wait);
7631 if (!ok) {
dfc70645 7632 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
0a19caab 7633 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
1da177e4
LT
7634 md_wakeup_thread(mddev->thread);
7635 // stop recovery, signal do_sync ....
7636 }
7637}
6c144d31 7638EXPORT_SYMBOL(md_done_sync);
1da177e4 7639
06d91a5f
N
7640/* md_write_start(mddev, bi)
7641 * If we need to update some array metadata (e.g. 'active' flag
3d310eb7
N
7642 * in superblock) before writing, schedule a superblock update
7643 * and wait for it to complete.
06d91a5f 7644 */
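/*
 * A sketch of the intended pairing in a personality's write path
 * (illustrative, not a quote from any particular personality):
 *
 *	md_write_start(mddev, bio);	// may block on a sb update
 *	... submit the write ...
 *	md_write_end(mddev);		// at completion; arms safemode
 */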
fd01b88c 7645void md_write_start(struct mddev *mddev, struct bio *bi)
1da177e4 7646{
0fd62b86 7647 int did_change = 0;
06d91a5f 7648 if (bio_data_dir(bi) != WRITE)
3d310eb7 7649 return;
06d91a5f 7650
f91de92e
N
7651 BUG_ON(mddev->ro == 1);
7652 if (mddev->ro == 2) {
7653 /* need to switch to read/write */
7654 mddev->ro = 0;
7655 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7656 md_wakeup_thread(mddev->thread);
25156198 7657 md_wakeup_thread(mddev->sync_thread);
0fd62b86 7658 did_change = 1;
f91de92e 7659 }
06d91a5f 7660 atomic_inc(&mddev->writes_pending);
31a59e34
N
7661 if (mddev->safemode == 1)
7662 mddev->safemode = 0;
06d91a5f 7663 if (mddev->in_sync) {
85572d7c 7664 spin_lock(&mddev->lock);
3d310eb7
N
7665 if (mddev->in_sync) {
7666 mddev->in_sync = 0;
850b2b42 7667 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
070dc6dd 7668 set_bit(MD_CHANGE_PENDING, &mddev->flags);
3d310eb7 7669 md_wakeup_thread(mddev->thread);
0fd62b86 7670 did_change = 1;
3d310eb7 7671 }
85572d7c 7672 spin_unlock(&mddev->lock);
06d91a5f 7673 }
0fd62b86 7674 if (did_change)
00bcb4ac 7675 sysfs_notify_dirent_safe(mddev->sysfs_state);
09a44cc1 7676 wait_event(mddev->sb_wait,
09a44cc1 7677 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1da177e4 7678}
6c144d31 7679EXPORT_SYMBOL(md_write_start);
1da177e4 7680
fd01b88c 7681void md_write_end(struct mddev *mddev)
1da177e4
LT
7682{
7683 if (atomic_dec_and_test(&mddev->writes_pending)) {
7684 if (mddev->safemode == 2)
7685 md_wakeup_thread(mddev->thread);
16f17b39 7686 else if (mddev->safemode_delay)
1da177e4
LT
7687 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
7688 }
7689}
6c144d31 7690EXPORT_SYMBOL(md_write_end);
1da177e4 7691
2a2275d6
N
7692/* md_allow_write(mddev)
7693 * Calling this ensures that the array is marked 'active' so that writes
7694 * may proceed without blocking. It is important to call this before
7695 * attempting a GFP_KERNEL allocation while holding the mddev lock.
7696 * Must be called with mddev_lock held.
b5470dc5
DW
7697 *
7698 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
7699 * is dropped, so return -EAGAIN after notifying userspace.
2a2275d6 7700 */
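/*
 * Sketch of the intended call pattern (illustrative):
 *
 *	err = md_allow_write(mddev);		// mddev_lock held
 *	if (err)				// -EAGAIN in ->external case
 *		return err;			// or proceed, per caller policy
 *	p = kmalloc(size, GFP_KERNEL);		// now safe to block
 */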
fd01b88c 7701int md_allow_write(struct mddev *mddev)
2a2275d6
N
7702{
7703 if (!mddev->pers)
b5470dc5 7704 return 0;
2a2275d6 7705 if (mddev->ro)
b5470dc5 7706 return 0;
1a0fd497 7707 if (!mddev->pers->sync_request)
b5470dc5 7708 return 0;
2a2275d6 7709
85572d7c 7710 spin_lock(&mddev->lock);
2a2275d6
N
7711 if (mddev->in_sync) {
7712 mddev->in_sync = 0;
7713 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
070dc6dd 7714 set_bit(MD_CHANGE_PENDING, &mddev->flags);
2a2275d6
N
7715 if (mddev->safemode_delay &&
7716 mddev->safemode == 0)
7717 mddev->safemode = 1;
85572d7c 7718 spin_unlock(&mddev->lock);
2a2275d6 7719 md_update_sb(mddev, 0);
00bcb4ac 7720 sysfs_notify_dirent_safe(mddev->sysfs_state);
2a2275d6 7721 } else
85572d7c 7722 spin_unlock(&mddev->lock);
b5470dc5 7723
070dc6dd 7724 if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
b5470dc5
DW
7725 return -EAGAIN;
7726 else
7727 return 0;
2a2275d6
N
7728}
7729EXPORT_SYMBOL_GPL(md_allow_write);
7730
1da177e4
LT
7731#define SYNC_MARKS 10
7732#define SYNC_MARK_STEP (3*HZ)
54f89341 7733#define UPDATE_FREQUENCY (5*60*HZ)
4ed8731d 7734void md_do_sync(struct md_thread *thread)
1da177e4 7735{
4ed8731d 7736 struct mddev *mddev = thread->mddev;
fd01b88c 7737 struct mddev *mddev2;
1da177e4
LT
7738 unsigned int currspeed = 0,
7739 window;
ac7e50a3 7740 sector_t max_sectors,j, io_sectors, recovery_done;
1da177e4 7741 unsigned long mark[SYNC_MARKS];
54f89341 7742 unsigned long update_time;
1da177e4
LT
7743 sector_t mark_cnt[SYNC_MARKS];
7744 int last_mark,m;
7745 struct list_head *tmp;
7746 sector_t last_check;
57afd89f 7747 int skipped = 0;
3cb03002 7748 struct md_rdev *rdev;
c4a39551 7749 char *desc, *action = NULL;
7c2c57c9 7750 struct blk_plug plug;
c186b128 7751 bool cluster_resync_finished = false;
1da177e4
LT
7752
7753 /* just in case the thread restarts... */
7754 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7755 return;
3991b31e
N
7756 if (mddev->ro) {/* never try to sync a read-only array */
7757 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5fd6c1dc 7758 return;
3991b31e 7759 }
1da177e4 7760
61df9d91 7761 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
c4a39551 7762 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
61df9d91 7763 desc = "data-check";
c4a39551
JB
7764 action = "check";
7765 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
61df9d91 7766 desc = "requested-resync";
c4a39551
JB
7767 action = "repair";
7768 } else
61df9d91
N
7769 desc = "resync";
7770 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7771 desc = "reshape";
7772 else
7773 desc = "recovery";
7774
c4a39551
JB
7775 mddev->last_sync_action = action ?: desc;
7776
1da177e4
LT
7777 /* we overload curr_resync somewhat here.
7778 * 0 == not engaged in resync at all
7779 * 2 == checking that there is no conflict with another sync
7780 * 1 == like 2, but have yielded to allow conflicting resync to
7781 * commence
7782 * other == active in resync - this many blocks
7783 *
7784 * Before starting a resync we must have set curr_resync to
7785 * 2, and then checked that every "conflicting" array has curr_resync
7786 * less than ours. When we find one that is the same or higher
7787 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
7788 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
7789 * This will mean we have to start checking from the beginning again.
7790 *
7791 */
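/*
 * Concrete illustration: arrays A and B share a disk and both start,
 * with A < B by address.  Both set curr_resync = 2; A notices the
 * conflict, yields by dropping to 1, and retries; B sees A's lower
 * value, keeps going, and A then sleeps on resync_wait until B's
 * resync no longer conflicts.
 */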
7792
7793 do {
7794 mddev->curr_resync = 2;
7795
7796 try_again:
404e4b43 7797 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
1da177e4 7798 goto skip;
29ac4aa3 7799 for_each_mddev(mddev2, tmp) {
1da177e4
LT
7800 if (mddev2 == mddev)
7801 continue;
90b08710
BS
7802 if (!mddev->parallel_resync
7803 && mddev2->curr_resync
7804 && match_mddev_units(mddev, mddev2)) {
1da177e4
LT
7805 DEFINE_WAIT(wq);
7806 if (mddev < mddev2 && mddev->curr_resync == 2) {
7807 /* arbitrarily yield */
7808 mddev->curr_resync = 1;
7809 wake_up(&resync_wait);
7810 }
7811 if (mddev > mddev2 && mddev->curr_resync == 1)
7812 /* no need to wait here, we can wait the next
7813 * time 'round when curr_resync == 2
7814 */
7815 continue;
9744197c
N
7816 /* We need to wait 'interruptible' so as not to
7817 * contribute to the load average, and not to
7818 * be caught by 'softlockup'
7819 */
7820 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
c91abf5a 7821 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8712e553 7822 mddev2->curr_resync >= mddev->curr_resync) {
61df9d91
N
7823 printk(KERN_INFO "md: delaying %s of %s"
7824 " until %s has finished (they"
1da177e4 7825 " share one or more physical units)\n",
61df9d91 7826 desc, mdname(mddev), mdname(mddev2));
1da177e4 7827 mddev_put(mddev2);
9744197c
N
7828 if (signal_pending(current))
7829 flush_signals(current);
1da177e4
LT
7830 schedule();
7831 finish_wait(&resync_wait, &wq);
7832 goto try_again;
7833 }
7834 finish_wait(&resync_wait, &wq);
7835 }
7836 }
7837 } while (mddev->curr_resync < 2);
7838
5fd6c1dc 7839 j = 0;
9d88883e 7840 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1da177e4 7841 /* resync follows the size requested by the personality,
57afd89f 7842 * which defaults to physical size, but can be virtual size
1da177e4
LT
7843 */
7844 max_sectors = mddev->resync_max_sectors;
7f7583d4 7845 atomic64_set(&mddev->resync_mismatches, 0);
5fd6c1dc 7846 /* we don't use the checkpoint if there's a bitmap */
5e96ee65
NB
7847 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7848 j = mddev->resync_min;
7849 else if (!mddev->bitmap)
5fd6c1dc 7850 j = mddev->recovery_cp;
5e96ee65 7851
ccfcc3c1 7852 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
c804cdec 7853 max_sectors = mddev->resync_max_sectors;
5fd6c1dc 7854 else {
1da177e4 7855 /* recovery follows the physical size of devices */
58c0fed4 7856 max_sectors = mddev->dev_sectors;
5fd6c1dc 7857 j = MaxSector;
4e59ca7d 7858 rcu_read_lock();
dafb20fa 7859 rdev_for_each_rcu(rdev, mddev)
5fd6c1dc 7860 if (rdev->raid_disk >= 0 &&
f2076e7d 7861 !test_bit(Journal, &rdev->flags) &&
5fd6c1dc
N
7862 !test_bit(Faulty, &rdev->flags) &&
7863 !test_bit(In_sync, &rdev->flags) &&
7864 rdev->recovery_offset < j)
7865 j = rdev->recovery_offset;
4e59ca7d 7866 rcu_read_unlock();
133d4527
N
7867
7868 /* If there is a bitmap, we need to make sure all
7869 * writes that started before we added a spare
7870 * complete before we start doing a recovery.
7871 * Otherwise the write might complete and (via
7872 * bitmap_endwrite) set a bit in the bitmap after the
7873 * recovery has checked that bit and skipped that
7874 * region.
7875 */
7876 if (mddev->bitmap) {
7877 mddev->pers->quiesce(mddev, 1);
7878 mddev->pers->quiesce(mddev, 0);
7879 }
5fd6c1dc 7880 }
1da177e4 7881
61df9d91
N
7882 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
7883 printk(KERN_INFO "md: minimum _guaranteed_ speed:"
7884 " %d KB/sec/disk.\n", speed_min(mddev));
338cec32 7885 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
61df9d91
N
7886 "(but not more than %d KB/sec) for %s.\n",
7887 speed_max(mddev), desc);
1da177e4 7888
eea1bf38 7889 is_mddev_idle(mddev, 1); /* this initializes IO event counters */
5fd6c1dc 7890
57afd89f 7891 io_sectors = 0;
1da177e4
LT
7892 for (m = 0; m < SYNC_MARKS; m++) {
7893 mark[m] = jiffies;
57afd89f 7894 mark_cnt[m] = io_sectors;
1da177e4
LT
7895 }
7896 last_mark = 0;
7897 mddev->resync_mark = mark[last_mark];
7898 mddev->resync_mark_cnt = mark_cnt[last_mark];
7899
7900 /*
7901 * Tune reconstruction:
7902 */
7903 window = 32*(PAGE_SIZE/512);
ac42450c
JB
7904 printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n",
7905 window/2, (unsigned long long)max_sectors/2);
1da177e4
LT
7906
7907 atomic_set(&mddev->recovery_active, 0);
1da177e4
LT
7908 last_check = 0;
7909
7910 if (j>2) {
c91abf5a 7911 printk(KERN_INFO
61df9d91
N
7912 "md: resuming %s of %s from checkpoint.\n",
7913 desc, mdname(mddev));
1da177e4 7914 mddev->curr_resync = j;
72f36d59
N
7915 } else
7916 mddev->curr_resync = 3; /* no longer delayed */
75d3da43 7917 mddev->curr_resync_completed = j;
72f36d59
N
7918 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7919 md_new_event(mddev);
54f89341 7920 update_time = jiffies;
1da177e4 7921
7c2c57c9 7922 blk_start_plug(&plug);
1da177e4 7923 while (j < max_sectors) {
57afd89f 7924 sector_t sectors;
1da177e4 7925
57afd89f 7926 skipped = 0;
97e4f42d 7927
7a91ee1f
N
7928 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7929 ((mddev->curr_resync > mddev->curr_resync_completed &&
7930 (mddev->curr_resync - mddev->curr_resync_completed)
7931 > (max_sectors >> 4)) ||
54f89341 7932 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
7a91ee1f 7933 (j - mddev->curr_resync_completed)*2
c5e19d90
N
7934 >= mddev->resync_max - mddev->curr_resync_completed ||
7935 mddev->curr_resync_completed > mddev->resync_max
7a91ee1f 7936 )) {
97e4f42d 7937 /* time to update curr_resync_completed */
97e4f42d
N
7938 wait_event(mddev->recovery_wait,
7939 atomic_read(&mddev->recovery_active) == 0);
75d3da43 7940 mddev->curr_resync_completed = j;
35d78c66 7941 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
7942 j > mddev->recovery_cp)
7943 mddev->recovery_cp = j;
54f89341 7944 update_time = jiffies;
070dc6dd 7945 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
acb180b0 7946 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
97e4f42d 7947 }
acb180b0 7948
c91abf5a
N
7949 while (j >= mddev->resync_max &&
7950 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
e62e58a5
N
7951 /* As this condition is controlled by user-space,
7952 * we can block indefinitely, so use '_interruptible'
7953 * to avoid triggering warnings.
7954 */
7955 flush_signals(current); /* just in case */
7956 wait_event_interruptible(mddev->recovery_wait,
7957 mddev->resync_max > j
c91abf5a
N
7958 || test_bit(MD_RECOVERY_INTR,
7959 &mddev->recovery));
e62e58a5 7960 }
acb180b0 7961
c91abf5a
N
7962 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7963 break;
acb180b0 7964
09314799 7965 sectors = mddev->pers->sync_request(mddev, j, &skipped);
57afd89f 7966 if (sectors == 0) {
dfc70645 7967 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
c91abf5a 7968 break;
1da177e4 7969 }
57afd89f
N
7970
7971 if (!skipped) { /* actual IO requested */
7972 io_sectors += sectors;
7973 atomic_add(sectors, &mddev->recovery_active);
7974 }
7975
e875ecea
N
7976 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7977 break;
7978
1da177e4 7979 j += sectors;
5ed1df2e
N
7980 if (j > max_sectors)
7981 /* when skipping, extra large numbers can be returned. */
7982 j = max_sectors;
72f36d59
N
7983 if (j > 2)
7984 mddev->curr_resync = j;
ff4e8d9a 7985 mddev->curr_mark_cnt = io_sectors;
d7603b7e 7986 if (last_check == 0)
e875ecea 7987 /* this is the earliest that rebuild will be
d7603b7e
N
7988 * visible in /proc/mdstat
7989 */
7990 md_new_event(mddev);
57afd89f
N
7991
7992 if (last_check + window > io_sectors || j == max_sectors)
1da177e4
LT
7993 continue;
7994
57afd89f 7995 last_check = io_sectors;
1da177e4
LT
7996 repeat:
7997 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
7998 /* step marks */
7999 int next = (last_mark+1) % SYNC_MARKS;
8000
8001 mddev->resync_mark = mark[next];
8002 mddev->resync_mark_cnt = mark_cnt[next];
8003 mark[next] = jiffies;
57afd89f 8004 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
1da177e4
LT
8005 last_mark = next;
8006 }
8007
c91abf5a
N
8008 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8009 break;
1da177e4
LT
8010
8011 /*
8012 * this loop exits only when either we are slower than
8013 * the 'hard' speed limit, or the system was IO-idle for
8014 * a jiffy.
8015 * the system might be non-idle CPU-wise, but we only care
8016 * about not overloading the IO subsystem. (things like an
8017 * e2fsck being done on the RAID array should execute fast)
8018 */
1da177e4
LT
8019 cond_resched();
8020
ac7e50a3
XN
8021 recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
8022 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
57afd89f 8023 /((jiffies-mddev->resync_mark)/HZ +1) +1;
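/*
 * Units, with illustrative numbers: the counters are in 512-byte
 * sectors, so /2 yields KiB.  204800 sectors completed with
 * (jiffies delta)/HZ = 9 gives currspeed = 102400/10 + 1 = 10241
 * KB/sec, comparable to speed_min()/speed_max() below.
 */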
1da177e4 8024
88202a0c 8025 if (currspeed > speed_min(mddev)) {
ac8fa419 8026 if (currspeed > speed_max(mddev)) {
c0e48521 8027 msleep(500);
1da177e4
LT
8028 goto repeat;
8029 }
ac8fa419
N
8030 if (!is_mddev_idle(mddev, 0)) {
8031 /*
8032 * Give other IO more of a chance.
8033 * The faster the devices, the less we wait.
8034 */
8035 wait_event(mddev->recovery_wait,
8036 !atomic_read(&mddev->recovery_active));
8037 }
1da177e4
LT
8038 }
8039 }
c91abf5a
N
8040 printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
8041 test_bit(MD_RECOVERY_INTR, &mddev->recovery)
8042 ? "interrupted" : "done");
1da177e4
LT
8043 /*
8044 * this also signals 'finished resyncing' to md_stop
8045 */
7c2c57c9 8046 blk_finish_plug(&plug);
1da177e4
LT
8047 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
8048
5ed1df2e
N
8049 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8050 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8051 mddev->curr_resync > 2) {
8052 mddev->curr_resync_completed = mddev->curr_resync;
8053 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8054 }
c186b128
GR
8055 /* tell personality and other nodes that we are finished */
8056 if (mddev_is_clustered(mddev)) {
8057 md_cluster_ops->resync_finish(mddev);
8058 cluster_resync_finished = true;
8059 }
09314799 8060 mddev->pers->sync_request(mddev, max_sectors, &skipped);
1da177e4 8061
dfc70645 8062 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
5fd6c1dc
N
8063 mddev->curr_resync > 2) {
8064 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8065 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8066 if (mddev->curr_resync >= mddev->recovery_cp) {
8067 printk(KERN_INFO
61df9d91
N
8068 "md: checkpointing %s of %s.\n",
8069 desc, mdname(mddev));
0a19caab 8070 if (test_bit(MD_RECOVERY_ERROR,
8071 &mddev->recovery))
8072 mddev->recovery_cp =
8073 mddev->curr_resync_completed;
8074 else
8075 mddev->recovery_cp =
8076 mddev->curr_resync;
5fd6c1dc
N
8077 }
8078 } else
8079 mddev->recovery_cp = MaxSector;
8080 } else {
8081 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8082 mddev->curr_resync = MaxSector;
4e59ca7d 8083 rcu_read_lock();
dafb20fa 8084 rdev_for_each_rcu(rdev, mddev)
5fd6c1dc 8085 if (rdev->raid_disk >= 0 &&
70fffd0b 8086 mddev->delta_disks >= 0 &&
f2076e7d 8087 !test_bit(Journal, &rdev->flags) &&
5fd6c1dc
N
8088 !test_bit(Faulty, &rdev->flags) &&
8089 !test_bit(In_sync, &rdev->flags) &&
8090 rdev->recovery_offset < mddev->curr_resync)
8091 rdev->recovery_offset = mddev->curr_resync;
4e59ca7d 8092 rcu_read_unlock();
5fd6c1dc 8093 }
1da177e4 8094 }
db91ff55 8095 skip:
17571284 8096 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1da177e4 8097
c186b128
GR
8098 if (mddev_is_clustered(mddev) &&
8099 test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8100 !cluster_resync_finished)
8101 md_cluster_ops->resync_finish(mddev);
8102
23da422b 8103 spin_lock(&mddev->lock);
c07b70ad
N
8104 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8105 /* We completed so min/max setting can be forgotten if used. */
8106 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8107 mddev->resync_min = 0;
8108 mddev->resync_max = MaxSector;
8109 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8110 mddev->resync_min = mddev->curr_resync_completed;
f7851be7 8111 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
1da177e4 8112 mddev->curr_resync = 0;
23da422b
N
8113 spin_unlock(&mddev->lock);
8114
1da177e4 8115 wake_up(&resync_wait);
1da177e4 8116 md_wakeup_thread(mddev->thread);
c6207277 8117 return;
1da177e4 8118}
29269553 8119EXPORT_SYMBOL_GPL(md_do_sync);
1da177e4 8120
746d3207
N
8121static int remove_and_add_spares(struct mddev *mddev,
8122 struct md_rdev *this)
b4c4c7b8 8123{
3cb03002 8124 struct md_rdev *rdev;
b4c4c7b8 8125 int spares = 0;
f2a371c5 8126 int removed = 0;
b4c4c7b8 8127
dafb20fa 8128 rdev_for_each(rdev, mddev)
746d3207
N
8129 if ((this == NULL || rdev == this) &&
8130 rdev->raid_disk >= 0 &&
6bfe0b49 8131 !test_bit(Blocked, &rdev->flags) &&
b4c4c7b8 8132 (test_bit(Faulty, &rdev->flags) ||
f2076e7d
SL
8133 (!test_bit(In_sync, &rdev->flags) &&
8134 !test_bit(Journal, &rdev->flags))) &&
b4c4c7b8
N
8135 atomic_read(&rdev->nr_pending)==0) {
8136 if (mddev->pers->hot_remove_disk(
b8321b68 8137 mddev, rdev) == 0) {
36fad858 8138 sysfs_unlink_rdev(mddev, rdev);
b4c4c7b8 8139 rdev->raid_disk = -1;
f2a371c5 8140 removed++;
b4c4c7b8
N
8141 }
8142 }
90584fc9
JB
8143 if (removed && mddev->kobj.sd)
8144 sysfs_notify(&mddev->kobj, NULL, "degraded");
b4c4c7b8 8145
2910ff17 8146 if (this && removed)
746d3207
N
8147 goto no_add;
8148
dafb20fa 8149 rdev_for_each(rdev, mddev) {
2910ff17
GR
8150 if (this && this != rdev)
8151 continue;
dbb64f86
GR
8152 if (test_bit(Candidate, &rdev->flags))
8153 continue;
7bfec5f3
N
8154 if (rdev->raid_disk >= 0 &&
8155 !test_bit(In_sync, &rdev->flags) &&
f2076e7d 8156 !test_bit(Journal, &rdev->flags) &&
7bfec5f3
N
8157 !test_bit(Faulty, &rdev->flags))
8158 spares++;
7ceb17e8
N
8159 if (rdev->raid_disk >= 0)
8160 continue;
8161 if (test_bit(Faulty, &rdev->flags))
8162 continue;
3069aa8d
SL
8163 if (test_bit(Journal, &rdev->flags))
8164 continue;
7ceb17e8 8165 if (mddev->ro &&
8313b8e5
N
8166 ! (rdev->saved_raid_disk >= 0 &&
8167 !test_bit(Bitmap_sync, &rdev->flags)))
7ceb17e8
N
8168 continue;
8169
d01552a7 8170 rdev->recovery_offset = 0;
7ceb17e8
N
8171 if (mddev->pers->
8172 hot_add_disk(mddev, rdev) == 0) {
8173 if (sysfs_link_rdev(mddev, rdev))
8174 /* failure here is OK */;
8175 spares++;
8176 md_new_event(mddev);
8177 set_bit(MD_CHANGE_DEVS, &mddev->flags);
dfc70645 8178 }
b4c4c7b8 8179 }
746d3207 8180no_add:
6dafab6b
N
8181 if (removed)
8182 set_bit(MD_CHANGE_DEVS, &mddev->flags);
b4c4c7b8
N
8183 return spares;
8184}
7ebc0be7 8185
ac05f256
N
8186static void md_start_sync(struct work_struct *ws)
8187{
8188 struct mddev *mddev = container_of(ws, struct mddev, del_work);
c186b128
GR
8189 int ret = 0;
8190
8191 if (mddev_is_clustered(mddev)) {
8192 ret = md_cluster_ops->resync_start(mddev);
8193 if (ret) {
8194 mddev->sync_thread = NULL;
8195 goto out;
8196 }
8197 }
ac05f256
N
8198
8199 mddev->sync_thread = md_register_thread(md_do_sync,
8200 mddev,
8201 "resync");
c186b128 8202out:
ac05f256 8203 if (!mddev->sync_thread) {
c186b128
GR
8204 if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
8205 printk(KERN_ERR "%s: could not start resync"
8206 " thread...\n",
8207 mdname(mddev));
ac05f256
N
8208 /* leave the spares where they are, it shouldn't hurt */
8209 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8210 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8211 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8212 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8213 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
f851b60d 8214 wake_up(&resync_wait);
ac05f256
N
8215 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8216 &mddev->recovery))
8217 if (mddev->sysfs_action)
8218 sysfs_notify_dirent_safe(mddev->sysfs_action);
8219 } else
8220 md_wakeup_thread(mddev->sync_thread);
8221 sysfs_notify_dirent_safe(mddev->sysfs_action);
8222 md_new_event(mddev);
8223}
8224
1da177e4
LT
8225/*
8226 * This routine is regularly called by all per-raid-array threads to
8227 * deal with generic issues like resync and super-block update.
8228 * Raid personalities that don't have a thread (linear/raid0) do not
8229 * need this as they never do any recovery or update the superblock.
8230 *
8231 * It does not do any resync itself, but rather "forks" off other threads
8232 * to do that as needed.
8233 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
8234 * "->recovery" and create a thread at ->sync_thread.
dfc70645 8235 * When the thread finishes it sets MD_RECOVERY_DONE
1da177e4
LT
8236 * and wakes up this thread which will reap the thread and finish up.
8237 * This thread also removes any faulty devices (with nr_pending == 0).
8238 *
8239 * The overall approach is:
8240 * 1/ if the superblock needs updating, update it.
8241 * 2/ If a recovery thread is running, don't do anything else.
8242 * 3/ If recovery has finished, clean up, possibly marking spares active.
8243 * 4/ If there are any faulty devices, remove them.
8244 * 5/ If the array is degraded, try to add spare devices
8245 * 6/ If array has spares or is not in-sync, start a resync thread.
8246 */
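/*
 * Condensed sketch of the control flow below (no behaviour beyond
 * the numbered steps above):
 *
 *	if superblock dirty          -> md_update_sb()
 *	if sync thread still running -> clear NEEDED, return
 *	if sync thread finished      -> md_reap_sync_thread()
 *	remove_and_add_spares()      (steps 4 and 5)
 *	spares or out-of-sync        -> queue md_start_sync (step 6)
 */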
fd01b88c 8247void md_check_recovery(struct mddev *mddev)
1da177e4 8248{
68866e42
JB
8249 if (mddev->suspended)
8250 return;
8251
5f40402d 8252 if (mddev->bitmap)
aa5cbd10 8253 bitmap_daemon_work(mddev);
1da177e4 8254
fca4d848 8255 if (signal_pending(current)) {
31a59e34 8256 if (mddev->pers->sync_request && !mddev->external) {
fca4d848
N
8257 printk(KERN_INFO "md: %s in immediate safe mode\n",
8258 mdname(mddev));
8259 mddev->safemode = 2;
8260 }
8261 flush_signals(current);
8262 }
8263
c89a8eee
N
8264 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
8265 return;
1da177e4 8266 if ( ! (
142d44c3 8267 (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
1da177e4 8268 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
fca4d848 8269 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
31a59e34 8270 (mddev->external == 0 && mddev->safemode == 1) ||
fca4d848
N
8271 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
8272 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
1da177e4
LT
8273 ))
8274 return;
fca4d848 8275
df5b89b3 8276 if (mddev_trylock(mddev)) {
b4c4c7b8 8277 int spares = 0;
fca4d848 8278
c89a8eee 8279 if (mddev->ro) {
ab16bfc7
NB
8280 struct md_rdev *rdev;
8281 if (!mddev->external && mddev->in_sync)
8282 /* 'Blocked' flag not needed as failed devices
8283 * will be recorded if array switched to read/write.
8284 * Leaving it set will prevent the device
8285 * from being removed.
8286 */
8287 rdev_for_each(rdev, mddev)
8288 clear_bit(Blocked, &rdev->flags);
7ceb17e8
N
8289 /* On a read-only array we can:
8290 * - remove failed devices
8291 * - add already-in_sync devices if the array itself
8292 * is in-sync.
8293 * As we only add devices that are already in-sync,
8294 * we can activate the spares immediately.
c89a8eee 8295 */
7ceb17e8 8296 remove_and_add_spares(mddev, NULL);
8313b8e5
N
8297 /* There is no thread, but we need to call
8298 * ->spare_active and clear saved_raid_disk
8299 */
2ac295a5 8300 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8313b8e5 8301 md_reap_sync_thread(mddev);
a4a3d26d 8302 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8313b8e5 8303 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
d4929add 8304 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
c89a8eee
N
8305 goto unlock;
8306 }
8307
31a59e34 8308 if (!mddev->external) {
0fd62b86 8309 int did_change = 0;
85572d7c 8310 spin_lock(&mddev->lock);
31a59e34
N
8311 if (mddev->safemode &&
8312 !atomic_read(&mddev->writes_pending) &&
8313 !mddev->in_sync &&
8314 mddev->recovery_cp == MaxSector) {
8315 mddev->in_sync = 1;
0fd62b86 8316 did_change = 1;
070dc6dd 8317 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
31a59e34
N
8318 }
8319 if (mddev->safemode == 1)
8320 mddev->safemode = 0;
85572d7c 8321 spin_unlock(&mddev->lock);
0fd62b86 8322 if (did_change)
00bcb4ac 8323 sysfs_notify_dirent_safe(mddev->sysfs_state);
fca4d848 8324 }
fca4d848 8325
2aa82191 8326 if (mddev->flags & MD_UPDATE_SB_FLAGS)
850b2b42 8327 md_update_sb(mddev, 0);
06d91a5f 8328
1da177e4
LT
8329 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
8330 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
8331 /* resync/recovery still happening */
8332 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8333 goto unlock;
8334 }
8335 if (mddev->sync_thread) {
a91d5ac0 8336 md_reap_sync_thread(mddev);
1da177e4
LT
8337 goto unlock;
8338 }
72a23c21
NB
8339 /* Set RUNNING before clearing NEEDED to avoid
8340 * any transients in the value of "sync_action".
8341 */
72f36d59 8342 mddev->curr_resync_completed = 0;
23da422b 8343 spin_lock(&mddev->lock);
72a23c21 8344 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
23da422b 8345 spin_unlock(&mddev->lock);
24dd469d
N
8346 /* Clear some bits that don't mean anything, but
8347 * might be left set
8348 */
24dd469d
N
8349 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
8350 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
1da177e4 8351
ed209584
N
8352 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8353 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
ac05f256 8354 goto not_running;
1da177e4
LT
8355 /* no recovery is running.
8356 * remove any failed drives, then
8357 * add spares if possible.
72f36d59 8358 * Spares are also removed and re-added, to allow
1da177e4
LT
8359 * the personality to fail the re-add.
8360 */
1da177e4 8361
b4c4c7b8 8362 if (mddev->reshape_position != MaxSector) {
50ac168a
N
8363 if (mddev->pers->check_reshape == NULL ||
8364 mddev->pers->check_reshape(mddev) != 0)
b4c4c7b8 8365 /* Cannot proceed */
ac05f256 8366 goto not_running;
b4c4c7b8 8367 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
72a23c21 8368 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
746d3207 8369 } else if ((spares = remove_and_add_spares(mddev, NULL))) {
24dd469d
N
8370 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8371 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
56ac36d7 8372 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
72a23c21 8373 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
24dd469d
N
8374 } else if (mddev->recovery_cp < MaxSector) {
8375 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
72a23c21 8376 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
24dd469d
N
8377 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
8378 /* nothing to be done ... */
ac05f256 8379 goto not_running;
24dd469d 8380
1da177e4 8381 if (mddev->pers->sync_request) {
ef99bf48 8382 if (spares) {
a654b9d8
N
8383 /* We are adding a device or devices to an array
8384 * which has the bitmap stored on all devices.
8385 * So make sure all bitmap pages get written
8386 */
8387 bitmap_write_all(mddev->bitmap);
8388 }
ac05f256
N
8389 INIT_WORK(&mddev->del_work, md_start_sync);
8390 queue_work(md_misc_wq, &mddev->del_work);
8391 goto unlock;
1da177e4 8392 }
ac05f256 8393 not_running:
72a23c21
NB
8394 if (!mddev->sync_thread) {
8395 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
f851b60d 8396 wake_up(&resync_wait);
72a23c21
NB
8397 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8398 &mddev->recovery))
0c3573f1 8399 if (mddev->sysfs_action)
00bcb4ac 8400 sysfs_notify_dirent_safe(mddev->sysfs_action);
72a23c21 8401 }
ac05f256
N
8402 unlock:
8403 wake_up(&mddev->sb_wait);
1da177e4
LT
8404 mddev_unlock(mddev);
8405 }
8406}
6c144d31 8407EXPORT_SYMBOL(md_check_recovery);
1da177e4 8408
a91d5ac0
JB
8409void md_reap_sync_thread(struct mddev *mddev)
8410{
8411 struct md_rdev *rdev;
8412
8413 /* resync has finished, collect result */
8414 md_unregister_thread(&mddev->sync_thread);
8415 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8416 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8417 /* success...*/
8418 /* activate any spares */
8419 if (mddev->pers->spare_active(mddev)) {
8420 sysfs_notify(&mddev->kobj, NULL,
8421 "degraded");
8422 set_bit(MD_CHANGE_DEVS, &mddev->flags);
8423 }
8424 }
8425 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8426 mddev->pers->finish_reshape)
8427 mddev->pers->finish_reshape(mddev);
8428
8429 /* If array is no longer degraded, then any saved_raid_disk
f466722c 8430 * information must be scrapped.
a91d5ac0 8431 */
f466722c
N
8432 if (!mddev->degraded)
8433 rdev_for_each(rdev, mddev)
a91d5ac0
JB
8434 rdev->saved_raid_disk = -1;
8435
8436 md_update_sb(mddev, 1);
8437 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
ea358cd0 8438 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
a91d5ac0
JB
8439 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8440 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8441 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8442 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
f851b60d 8443 wake_up(&resync_wait);
a91d5ac0
JB
8444 /* flag recovery needed just to double check */
8445 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8446 sysfs_notify_dirent_safe(mddev->sysfs_action);
8447 md_new_event(mddev);
8448 if (mddev->event_work.func)
8449 queue_work(md_misc_wq, &mddev->event_work);
8450}
6c144d31 8451EXPORT_SYMBOL(md_reap_sync_thread);
a91d5ac0 8452
fd01b88c 8453void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
6bfe0b49 8454{
00bcb4ac 8455 sysfs_notify_dirent_safe(rdev->sysfs_state);
6bfe0b49 8456 wait_event_timeout(rdev->blocked_wait,
de393cde
N
8457 !test_bit(Blocked, &rdev->flags) &&
8458 !test_bit(BlockedBadBlocks, &rdev->flags),
6bfe0b49
DW
8459 msecs_to_jiffies(5000));
8460 rdev_dec_pending(rdev, mddev);
8461}
8462EXPORT_SYMBOL(md_wait_for_blocked_rdev);
8463
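/* A usage sketch (the pattern used by the RAID personalities; variable
 * names here are the caller's own): take a reference first, since this
 * function drops it via rdev_dec_pending() once the Blocked state
 * clears or the 5 second timeout expires.
 *
 *	atomic_inc(&rdev->nr_pending);
 *	md_wait_for_blocked_rdev(rdev, mddev);
 */
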
c6563a8c
N
8464void md_finish_reshape(struct mddev *mddev)
8465{
8466 /* called by the personality module when a reshape completes. */
8467 struct md_rdev *rdev;
8468
8469 rdev_for_each(rdev, mddev) {
8470 if (rdev->data_offset > rdev->new_data_offset)
8471 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
8472 else
8473 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
8474 rdev->data_offset = rdev->new_data_offset;
8475 }
8476}
8477EXPORT_SYMBOL(md_finish_reshape);
2230dfe4
N
8478
8479/* Bad block management.
8480 * We can record which blocks on each device are 'bad' and so just
8481 * fail those blocks, or that stripe, rather than the whole device.
8482 * Entries in the bad-block table are 64 bits wide. This comprises:
8483 * Length of bad-range, in sectors: 0-511 for lengths 1-512
8484 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
8485 * A 'shift' can be set so that larger blocks are tracked and
8486 * consequently larger devices can be covered.
8487 * 'Acknowledged' flag - 1 bit. - the most significant bit.
8488 *
8489 * Locking of the bad-block table uses a seqlock so md_is_badblock
8490 * might need to retry if it is very unlucky.
8491 * We will sometimes want to check for bad blocks in a bi_end_io function,
8492 * so we use the write_seqlock_irq variant.
8493 *
8494 * When looking for a bad block we specify a range and want to
8495 * know if any block in the range is bad. So we binary-search
8496 * to the last range that starts at-or-before the given endpoint,
8497 * (or "before the sector after the target range")
8498 * then see if it ends after the given start.
8499 * We return
8500 * 0 if there are no known bad blocks in the range
8501 * 1 if there are known bad blocks which are all acknowledged
8502 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
8503 * plus the start/length of the first bad section we overlap.
8504 */
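/* For illustration: the BB_* helpers used below come from md.h and pack
 * an entry roughly as follows (a sketch of those definitions, not the
 * authoritative copy):
 *
 *	entry = ((u64)offset << 9) | (length - 1) | ((u64)!!ack << 63);
 *
 * giving a 54-bit offset field, lengths of 1..BB_MAX_LEN (512) sectors,
 * and the acknowledged flag in the top bit, as described above.
 */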
8505int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
8506 sector_t *first_bad, int *bad_sectors)
8507{
8508 int hi;
ab05613a 8509 int lo;
2230dfe4 8510 u64 *p = bb->page;
ab05613a 8511 int rv;
2230dfe4
N
8512 sector_t target = s + sectors;
8513 unsigned seq;
8514
8515 if (bb->shift > 0) {
8516 /* round the start down, and the end up */
8517 s >>= bb->shift;
8518 target += (1<<bb->shift) - 1;
8519 target >>= bb->shift;
8520 sectors = target - s;
8521 }
8522 /* 'target' is now the first block after the bad range */
8523
8524retry:
8525 seq = read_seqbegin(&bb->lock);
ab05613a 8526 lo = 0;
8527 rv = 0;
2230dfe4
N
8528 hi = bb->count;
8529
8530 /* Binary search between lo and hi for 'target'
8531 * i.e. for the last range that starts before 'target'
8532 */
8533 /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
8534 * are known not to be the last range before target.
8535 * VARIANT: hi-lo is the number of possible
8536 * ranges, and decreases until it reaches 1
8537 */
8538 while (hi - lo > 1) {
8539 int mid = (lo + hi) / 2;
8540 sector_t a = BB_OFFSET(p[mid]);
8541 if (a < target)
8542 /* This could still be the one, earlier ranges
8543 * could not. */
8544 lo = mid;
8545 else
8546 /* This and later ranges are definitely out. */
8547 hi = mid;
8548 }
8549 /* 'lo' might be the last that started before target, but 'hi' isn't */
8550 if (hi > lo) {
8551 /* need to check all ranges that end after 's' to see if
8552 * any are unacknowledged.
8553 */
8554 while (lo >= 0 &&
8555 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
8556 if (BB_OFFSET(p[lo]) < target) {
8557 /* starts before the end, and finishes after
8558 * the start, so they must overlap
8559 */
8560 if (rv != -1 && BB_ACK(p[lo]))
8561 rv = 1;
8562 else
8563 rv = -1;
8564 *first_bad = BB_OFFSET(p[lo]);
8565 *bad_sectors = BB_LEN(p[lo]);
8566 }
8567 lo--;
8568 }
8569 }
8570
8571 if (read_seqretry(&bb->lock, seq))
8572 goto retry;
8573
8574 return rv;
8575}
8576EXPORT_SYMBOL_GPL(md_is_badblock);
8577
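/* Minimal caller sketch: per-device users normally go through the
 * is_badblock() inline in md.h, which applies rdev->data_offset around
 * the lookup (shown from memory, so treat the details as an assumption):
 *
 *	sector_t first_bad;
 *	int bad_sectors;
 *
 *	if (is_badblock(rdev, this_sector, sectors,
 *			&first_bad, &bad_sectors))
 *		...avoid the range, or fail the request...
 */
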
8578/*
8579 * Add a range of bad blocks to the table.
8580 * This might extend the table, or might contract it
8581 * if two adjacent ranges can be merged.
8582 * We binary-search to find the 'insertion' point, then
8583 * decide how best to handle it.
8584 */
8585static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
8586 int acknowledged)
8587{
8588 u64 *p;
8589 int lo, hi;
8590 int rv = 1;
905b0297 8591 unsigned long flags;
2230dfe4
N
8592
8593 if (bb->shift < 0)
8594 /* badblocks are disabled */
8595 return 0;
8596
8597 if (bb->shift) {
8598 /* round the start down, and the end up */
8599 sector_t next = s + sectors;
8600 s >>= bb->shift;
8601 next += (1<<bb->shift) - 1;
8602 next >>= bb->shift;
8603 sectors = next - s;
8604 }
8605
905b0297 8606 write_seqlock_irqsave(&bb->lock, flags);
2230dfe4
N
8607
8608 p = bb->page;
8609 lo = 0;
8610 hi = bb->count;
8611 /* Find the last range that starts at-or-before 's' */
8612 while (hi - lo > 1) {
8613 int mid = (lo + hi) / 2;
8614 sector_t a = BB_OFFSET(p[mid]);
8615 if (a <= s)
8616 lo = mid;
8617 else
8618 hi = mid;
8619 }
8620 if (hi > lo && BB_OFFSET(p[lo]) > s)
8621 hi = lo;
8622
8623 if (hi > lo) {
8624 /* we found a range that might merge with the start
8625 * of our new range
8626 */
8627 sector_t a = BB_OFFSET(p[lo]);
8628 sector_t e = a + BB_LEN(p[lo]);
8629 int ack = BB_ACK(p[lo]);
8630 if (e >= s) {
8631 /* Yes, we can merge with a previous range */
8632 if (s == a && s + sectors >= e)
8633 /* new range covers old */
8634 ack = acknowledged;
8635 else
8636 ack = ack && acknowledged;
8637
8638 if (e < s + sectors)
8639 e = s + sectors;
8640 if (e - a <= BB_MAX_LEN) {
8641 p[lo] = BB_MAKE(a, e-a, ack);
8642 s = e;
8643 } else {
8644 /* does not all fit in one range,
8645 * make p[lo] maximal
8646 */
8647 if (BB_LEN(p[lo]) != BB_MAX_LEN)
8648 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
8649 s = a + BB_MAX_LEN;
8650 }
8651 sectors = e - s;
8652 }
8653 }
8654 if (sectors && hi < bb->count) {
8655 /* 'hi' points to the first range that starts after 's'.
8656 * Maybe we can merge with the start of that range */
8657 sector_t a = BB_OFFSET(p[hi]);
8658 sector_t e = a + BB_LEN(p[hi]);
8659 int ack = BB_ACK(p[hi]);
8660 if (a <= s + sectors) {
8661 /* merging is possible */
8662 if (e <= s + sectors) {
8663 /* full overlap */
8664 e = s + sectors;
8665 ack = acknowledged;
8666 } else
8667 ack = ack && acknowledged;
8668
8669 a = s;
8670 if (e - a <= BB_MAX_LEN) {
8671 p[hi] = BB_MAKE(a, e-a, ack);
8672 s = e;
8673 } else {
8674 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
8675 s = a + BB_MAX_LEN;
8676 }
8677 sectors = e - s;
8678 lo = hi;
8679 hi++;
8680 }
8681 }
8682 if (sectors == 0 && hi < bb->count) {
8683 /* we might be able to combine lo and hi */
8684 /* Note: 's' is at the end of 'lo' */
8685 sector_t a = BB_OFFSET(p[hi]);
8686 int lolen = BB_LEN(p[lo]);
8687 int hilen = BB_LEN(p[hi]);
8688 int newlen = lolen + hilen - (s - a);
8689 if (s >= a && newlen < BB_MAX_LEN) {
8690 /* yes, we can combine them */
8691 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
8692 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
8693 memmove(p + hi, p + hi + 1,
8694 (bb->count - hi - 1) * 8);
8695 bb->count--;
8696 }
8697 }
8698 while (sectors) {
8699 /* didn't merge it all (or at all).
8700 * Need to add a range just before 'hi'. */
8701 if (bb->count >= MD_MAX_BADBLOCKS) {
8702 /* No room for more */
8703 rv = 0;
8704 break;
8705 } else {
8706 int this_sectors = sectors;
8707 memmove(p + hi + 1, p + hi,
8708 (bb->count - hi) * 8);
8709 bb->count++;
8710
8711 if (this_sectors > BB_MAX_LEN)
8712 this_sectors = BB_MAX_LEN;
8713 p[hi] = BB_MAKE(s, this_sectors, acknowledged);
8714 sectors -= this_sectors;
8715 s += this_sectors;
8716 }
8717 }
8718
8719 bb->changed = 1;
de393cde
N
8720 if (!acknowledged)
8721 bb->unacked_exist = 1;
905b0297 8722 write_sequnlock_irqrestore(&bb->lock, flags);
2230dfe4
N
8723
8724 return rv;
8725}
8726
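/* Worked example of the merge logic above (hypothetical numbers): if the
 * table holds an acknowledged range covering sectors 100..109 and we add
 * s=105, sectors=10, acknowledged=0, the existing entry is extended in
 * place to 100..114 with its ack bit cleared (no second, overlapping
 * entry is created), and unacked_exist is set.
 */
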
3cb03002 8727int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
c6563a8c 8728 int is_new)
2230dfe4 8729{
c6563a8c
N
8730 int rv;
8731 if (is_new)
8732 s += rdev->new_data_offset;
8733 else
8734 s += rdev->data_offset;
8735 rv = md_set_badblocks(&rdev->badblocks,
8736 s, sectors, 0);
2230dfe4
N
8737 if (rv) {
8738 /* Make sure they get written out promptly */
8bd2f0a0 8739 sysfs_notify_dirent_safe(rdev->sysfs_state);
2230dfe4 8740 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
55ce74d4 8741 set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
2230dfe4
N
8742 md_wakeup_thread(rdev->mddev->thread);
8743 }
8744 return rv;
8745}
8746EXPORT_SYMBOL_GPL(rdev_set_badblocks);
8747
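/* Typical caller sketch (simplified from the write-error handling in the
 * RAID personalities): if the range cannot be recorded, the whole device
 * has to be failed instead.
 *
 *	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
 *		md_error(rdev->mddev, rdev);
 */
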
8748/*
8749 * Remove a range of bad blocks from the table.
8750 * This may involve extending the table if we split a region,
8751 * but it must not fail. So if the table becomes full, we just
8752 * drop the remove request.
8753 */
8754static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
8755{
8756 u64 *p;
8757 int lo, hi;
8758 sector_t target = s + sectors;
8759 int rv = 0;
8760
8761 if (bb->shift > 0) {
8762 /* When clearing we round the start up and the end down.
8763 * This should not matter as the shift should align with
8764 * the block size and no rounding should ever be needed.
8765 * However it is better to think a block is bad when it
8766 * isn't than to think a block is not bad when it is.
8767 */
8768 s += (1<<bb->shift) - 1;
8769 s >>= bb->shift;
8770 target >>= bb->shift;
8771 sectors = target - s;
8772 }
8773
8774 write_seqlock_irq(&bb->lock);
8775
8776 p = bb->page;
8777 lo = 0;
8778 hi = bb->count;
8779 /* Find the last range that starts before 'target' */
8780 while (hi - lo > 1) {
8781 int mid = (lo + hi) / 2;
8782 sector_t a = BB_OFFSET(p[mid]);
8783 if (a < target)
8784 lo = mid;
8785 else
8786 hi = mid;
8787 }
8788 if (hi > lo) {
8789 /* p[lo] is the last range that could overlap the
8790 * current range. Earlier ranges could also overlap,
8791 * but only this one can overlap the end of the range.
8792 */
8793 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
8794 /* Partial overlap, leave the tail of this range */
8795 int ack = BB_ACK(p[lo]);
8796 sector_t a = BB_OFFSET(p[lo]);
8797 sector_t end = a + BB_LEN(p[lo]);
8798
8799 if (a < s) {
8800 /* we need to split this range */
8801 if (bb->count >= MD_MAX_BADBLOCKS) {
8b32bf5e 8802 rv = -ENOSPC;
2230dfe4
N
8803 goto out;
8804 }
8805 memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
8806 bb->count++;
8807 p[lo] = BB_MAKE(a, s-a, ack);
8808 lo++;
8809 }
8810 p[lo] = BB_MAKE(target, end - target, ack);
8811 /* there is no longer an overlap */
8812 hi = lo;
8813 lo--;
8814 }
8815 while (lo >= 0 &&
8816 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
8817 /* This range does overlap */
8818 if (BB_OFFSET(p[lo]) < s) {
8819 /* Keep the early parts of this range. */
8820 int ack = BB_ACK(p[lo]);
8821 sector_t start = BB_OFFSET(p[lo]);
8822 p[lo] = BB_MAKE(start, s - start, ack);
8823 /* now p[lo] no longer overlaps, so we can stop */
8824 break;
8825 }
8826 lo--;
8827 }
8828 /* 'lo' is strictly before, 'hi' is strictly after,
8829 * anything between needs to be discarded
8830 */
8831 if (hi - lo > 1) {
8832 memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
8833 bb->count -= (hi - lo - 1);
8834 }
8835 }
8836
8837 bb->changed = 1;
8838out:
8839 write_sequnlock_irq(&bb->lock);
8840 return rv;
8841}
8842
c6563a8c
N
8843int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8844 int is_new)
2230dfe4 8845{
c6563a8c
N
8846 if (is_new)
8847 s += rdev->new_data_offset;
8848 else
8849 s += rdev->data_offset;
2230dfe4 8850 return md_clear_badblocks(&rdev->badblocks,
c6563a8c 8851 s, sectors);
2230dfe4
N
8852}
8853EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8854
8855/*
8856 * Acknowledge all bad blocks in a list.
8857 * This only succeeds if ->changed is clear. It is used by
8858 * in-kernel metadata updates
8859 */
8860void md_ack_all_badblocks(struct badblocks *bb)
8861{
8862 if (bb->page == NULL || bb->changed)
8863 /* no point even trying */
8864 return;
8865 write_seqlock_irq(&bb->lock);
8866
ecb178bb 8867 if (bb->changed == 0 && bb->unacked_exist) {
2230dfe4
N
8868 u64 *p = bb->page;
8869 int i;
8870 for (i = 0; i < bb->count ; i++) {
8871 if (!BB_ACK(p[i])) {
8872 sector_t start = BB_OFFSET(p[i]);
8873 int len = BB_LEN(p[i]);
8874 p[i] = BB_MAKE(start, len, 1);
8875 }
8876 }
de393cde 8877 bb->unacked_exist = 0;
2230dfe4
N
8878 }
8879 write_sequnlock_irq(&bb->lock);
8880}
8881EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
8882
16c791a5
N
8883/* sysfs access to bad-blocks list.
8884 * We present two files.
8885 * 'bad-blocks' lists sector numbers and lengths of ranges that
8886 * are recorded as bad. The list is truncated to fit within
8887 * the one-page limit of sysfs.
8888 * Writing "sector length" to this file adds an acknowledged
8889 * bad block range.
8890 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
8891 * been acknowledged. Writing to this file adds bad blocks
8892 * without acknowledging them. This is largely for testing.
8893 */
8894
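/* Example interaction, assuming an array md0 with member sda1 (the rdev
 * attributes registered elsewhere in this file are named "bad_blocks"
 * and "unacknowledged_bad_blocks"):
 *
 *	# cd /sys/block/md0/md/dev-sda1
 *	# echo "104857600 8" > bad_blocks	(record 8 bad sectors)
 *	# cat bad_blocks
 *	104857600 8
 */
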
8895static ssize_t
8896badblocks_show(struct badblocks *bb, char *page, int unack)
8897{
8898 size_t len;
8899 int i;
8900 u64 *p = bb->page;
8901 unsigned seq;
8902
8903 if (bb->shift < 0)
8904 return 0;
8905
8906retry:
8907 seq = read_seqbegin(&bb->lock);
8908
8909 len = 0;
8910 i = 0;
8911
8912 while (len < PAGE_SIZE && i < bb->count) {
8913 sector_t s = BB_OFFSET(p[i]);
8914 unsigned int length = BB_LEN(p[i]);
8915 int ack = BB_ACK(p[i]);
8916 i++;
8917
8918 if (unack && ack)
8919 continue;
8920
8921 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
8922 (unsigned long long)s << bb->shift,
8923 length << bb->shift);
8924 }
de393cde
N
8925 if (unack && len == 0)
8926 bb->unacked_exist = 0;
16c791a5
N
8927
8928 if (read_seqretry(&bb->lock, seq))
8929 goto retry;
8930
8931 return len;
8932}
8933
8934#define DO_DEBUG 1
8935
8936static ssize_t
8937badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
8938{
8939 unsigned long long sector;
8940 int length;
8941 char newline;
8942#ifdef DO_DEBUG
8943 /* Allow clearing via sysfs *only* for testing/debugging.
8944 * Normally only a successful write may clear a badblock
8945 */
8946 int clear = 0;
8947 if (page[0] == '-') {
8948 clear = 1;
8949 page++;
8950 }
8951#endif /* DO_DEBUG */
8952
8953 switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
8954 case 3:
8955 if (newline != '\n')
8956 return -EINVAL;
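/* fall through - a three-field parse still needs the
 * length validated below */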
8957 case 2:
8958 if (length <= 0)
8959 return -EINVAL;
8960 break;
8961 default:
8962 return -EINVAL;
8963 }
8964
8965#ifdef DO_DEBUG
8966 if (clear) {
8967 md_clear_badblocks(bb, sector, length);
8968 return len;
8969 }
8970#endif /* DO_DEBUG */
8971 if (md_set_badblocks(bb, sector, length, !unack))
8972 return len;
8973 else
8974 return -ENOSPC;
8975}
8976
75c96f85
AB
8977static int md_notify_reboot(struct notifier_block *this,
8978 unsigned long code, void *x)
1da177e4
LT
8979{
8980 struct list_head *tmp;
fd01b88c 8981 struct mddev *mddev;
2dba6a91 8982 int need_delay = 0;
1da177e4 8983
c744a65c
N
8984 for_each_mddev(mddev, tmp) {
8985 if (mddev_trylock(mddev)) {
30b8aa91
N
8986 if (mddev->pers)
8987 __md_stop_writes(mddev);
0f62fb22
N
8988 if (mddev->persistent)
8989 mddev->safemode = 2;
c744a65c 8990 mddev_unlock(mddev);
2dba6a91 8991 }
c744a65c 8992 need_delay = 1;
1da177e4 8993 }
c744a65c
N
8994 /*
8995 * Certain more exotic SCSI devices are known to be
8996 * volatile with respect to overly early system reboots.
8997 * While the right place to handle this issue is the
8998 * individual driver, we do want the RAID driver to be safe ...
8999 */
9000 if (need_delay)
9001 mdelay(1000*1);
9002
1da177e4
LT
9003 return NOTIFY_DONE;
9004}
9005
75c96f85 9006static struct notifier_block md_notifier = {
1da177e4
LT
9007 .notifier_call = md_notify_reboot,
9008 .next = NULL,
9009 .priority = INT_MAX, /* before any real devices */
9010};
9011
9012static void md_geninit(void)
9013{
36a4e1fe 9014 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
1da177e4 9015
c7705f34 9016 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
1da177e4
LT
9017}
9018
75c96f85 9019static int __init md_init(void)
1da177e4 9020{
e804ac78
TH
9021 int ret = -ENOMEM;
9022
ada609ee 9023 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
e804ac78
TH
9024 if (!md_wq)
9025 goto err_wq;
9026
9027 md_misc_wq = alloc_workqueue("md_misc", 0, 0);
9028 if (!md_misc_wq)
9029 goto err_misc_wq;
9030
9031 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
9032 goto err_md;
9033
9034 if ((ret = register_blkdev(0, "mdp")) < 0)
9035 goto err_mdp;
9036 mdp_major = ret;
9037
af5628f0 9038 blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
e8703fe1
N
9039 md_probe, NULL, NULL);
9040 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
1da177e4
LT
9041 md_probe, NULL, NULL);
9042
1da177e4 9043 register_reboot_notifier(&md_notifier);
0b4d4147 9044 raid_table_header = register_sysctl_table(raid_root_table);
1da177e4
LT
9045
9046 md_geninit();
d710e138 9047 return 0;
1da177e4 9048
e804ac78
TH
9049err_mdp:
9050 unregister_blkdev(MD_MAJOR, "md");
9051err_md:
9052 destroy_workqueue(md_misc_wq);
9053err_misc_wq:
9054 destroy_workqueue(md_wq);
9055err_wq:
9056 return ret;
9057}
1da177e4 9058
70bcecdb 9059static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
1d7e3e96 9060{
70bcecdb
GR
9061 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
9062 struct md_rdev *rdev2;
9063 int role, ret;
9064 char b[BDEVNAME_SIZE];
1d7e3e96 9065
70bcecdb
GR
9066 /* Check for change of roles in the active devices */
9067 rdev_for_each(rdev2, mddev) {
9068 if (test_bit(Faulty, &rdev2->flags))
9069 continue;
9070
9071 /* Check if the roles changed */
9072 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
dbb64f86
GR
9073
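/* An aside on the magic role values (per md_p.h, stated here as an
 * assumption for the reader): 0xffff marks a spare, 0xfffe a faulty
 * device, and 0xfffd a journal device.
 */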
9074 if (test_bit(Candidate, &rdev2->flags)) {
9075 if (role == 0xfffe) {
9076 pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
9077 md_kick_rdev_from_array(rdev2);
9078 continue;
9079 }
9080 else
9081 clear_bit(Candidate, &rdev2->flags);
9082 }
9083
70bcecdb
GR
9084 if (role != rdev2->raid_disk) {
9085 /* got activated */
9086 if (rdev2->raid_disk == -1 && role != 0xffff) {
9087 rdev2->saved_raid_disk = role;
9088 ret = remove_and_add_spares(mddev, rdev2);
9089 pr_info("Activated spare: %s\n",
9090 bdevname(rdev2->bdev,b));
9091 continue;
9092 }
9093 /* device faulty:
9094 * we just want to do the minimum to mark the disk
9095 * as faulty. The recovery is performed by the
9096 * node that initiated the error.
9097 */
9098 if ((role == 0xfffe) || (role == 0xfffd)) {
9099 md_error(mddev, rdev2);
9100 clear_bit(Blocked, &rdev2->flags);
9101 }
9102 }
1d7e3e96 9103 }
70bcecdb 9104
28c1b9fd
GR
9105 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
9106 update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
70bcecdb
GR
9107
9108 /* Finally set the event to be up to date */
9109 mddev->events = le64_to_cpu(sb->events);
9110}
9111
9112static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
9113{
9114 int err;
9115 struct page *swapout = rdev->sb_page;
9116 struct mdp_superblock_1 *sb;
9117
9118 /* Store the rdev's sb page in the 'swapout' temporary
9119 * variable so it can be restored if we hit an error below.
9120 */
9121 rdev->sb_page = NULL;
9122 alloc_disk_sb(rdev);
9123 ClearPageUptodate(rdev->sb_page);
9124 rdev->sb_loaded = 0;
9125 err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version);
9126
9127 if (err < 0) {
9128 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
9129 __func__, __LINE__, rdev->desc_nr, err);
9130 put_page(rdev->sb_page);
9131 rdev->sb_page = swapout;
9132 rdev->sb_loaded = 1;
9133 return err;
1d7e3e96
GR
9134 }
9135
70bcecdb
GR
9136 sb = page_address(rdev->sb_page);
9137 /* Pick up the recovery offset from the superblock when the
9138 * MD_FEATURE_RECOVERY_OFFSET feature flag says it is valid.
9139 */
9140
9141 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
9142 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
9143
9144 /* The other node finished recovery; call spare_active to mark
9145 * the device In_sync and to update mddev->degraded.
9146 */
9147 if (rdev->recovery_offset == MaxSector &&
9148 !test_bit(In_sync, &rdev->flags) &&
9149 mddev->pers->spare_active(mddev))
9150 sysfs_notify(&mddev->kobj, NULL, "degraded");
9151
9152 put_page(swapout);
9153 return 0;
9154}
9155
9156void md_reload_sb(struct mddev *mddev, int nr)
9157{
9158 struct md_rdev *rdev;
9159 int err;
9160
9161 /* Find the rdev */
9162 rdev_for_each_rcu(rdev, mddev) {
9163 if (rdev->desc_nr == nr)
9164 break;
9165 }
9166
9167 if (!rdev || rdev->desc_nr != nr) {
9168 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
9169 return;
9170 }
9171
9172 err = read_rdev(mddev, rdev);
9173 if (err < 0)
9174 return;
9175
9176 check_sb_changes(mddev, rdev);
9177
9178 /* Read all rdev's to update recovery_offset */
9179 rdev_for_each_rcu(rdev, mddev)
9180 read_rdev(mddev, rdev);
1d7e3e96
GR
9181}
9182EXPORT_SYMBOL(md_reload_sb);
9183
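/* Context note: the expected caller is the md-cluster message handler,
 * which (as a sketch, from the METADATA_UPDATED path) does roughly:
 *
 *	md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
 */
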
1da177e4
LT
9184#ifndef MODULE
9185
9186/*
9187 * Searches all registered partitions for autorun RAID arrays
9188 * at boot time.
9189 */
4d936ec1
ME
9190
9191static LIST_HEAD(all_detected_devices);
9192struct detected_devices_node {
9193 struct list_head list;
9194 dev_t dev;
9195};
1da177e4
LT
9196
9197void md_autodetect_dev(dev_t dev)
9198{
4d936ec1
ME
9199 struct detected_devices_node *node_detected_dev;
9200
9201 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
9202 if (node_detected_dev) {
9203 node_detected_dev->dev = dev;
9204 list_add_tail(&node_detected_dev->list, &all_detected_devices);
9205 } else {
9206 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
9207 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
9208 }
1da177e4
LT
9209}
9210
1da177e4
LT
9211static void autostart_arrays(int part)
9212{
3cb03002 9213 struct md_rdev *rdev;
4d936ec1
ME
9214 struct detected_devices_node *node_detected_dev;
9215 dev_t dev;
9216 int i_scanned, i_passed;
1da177e4 9217
4d936ec1
ME
9218 i_scanned = 0;
9219 i_passed = 0;
1da177e4 9220
4d936ec1 9221 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
1da177e4 9222
4d936ec1
ME
9223 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
9224 i_scanned++;
9225 node_detected_dev = list_entry(all_detected_devices.next,
9226 struct detected_devices_node, list);
9227 list_del(&node_detected_dev->list);
9228 dev = node_detected_dev->dev;
9229 kfree(node_detected_dev);
df968c4e 9230 rdev = md_import_device(dev,0, 90);
1da177e4
LT
9231 if (IS_ERR(rdev))
9232 continue;
9233
403df478 9234 if (test_bit(Faulty, &rdev->flags))
1da177e4 9235 continue;
403df478 9236
d0fae18f 9237 set_bit(AutoDetected, &rdev->flags);
1da177e4 9238 list_add(&rdev->same_set, &pending_raid_disks);
4d936ec1 9239 i_passed++;
1da177e4 9240 }
4d936ec1
ME
9241
9242 printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
9243 i_scanned, i_passed);
1da177e4
LT
9244
9245 autorun_devices(part);
9246}
9247
fdee8ae4 9248#endif /* !MODULE */
1da177e4
LT
9249
9250static __exit void md_exit(void)
9251{
fd01b88c 9252 struct mddev *mddev;
1da177e4 9253 struct list_head *tmp;
e2f23b60 9254 int delay = 1;
8ab5e4c1 9255
af5628f0 9256 blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
e8703fe1 9257 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
1da177e4 9258
3dbd8c2e 9259 unregister_blkdev(MD_MAJOR,"md");
1da177e4
LT
9260 unregister_blkdev(mdp_major, "mdp");
9261 unregister_reboot_notifier(&md_notifier);
9262 unregister_sysctl_table(raid_table_header);
e2f23b60
N
9263
9264 /* We cannot unload the module while some process is
9265 * waiting for us in select() or poll() - wake them up.
9266 */
9267 md_unloading = 1;
9268 while (waitqueue_active(&md_event_waiters)) {
9269 /* not safe to leave yet */
9270 wake_up(&md_event_waiters);
9271 msleep(delay);
9272 delay += delay;
9273 }
1da177e4 9274 remove_proc_entry("mdstat", NULL);
e2f23b60 9275
29ac4aa3 9276 for_each_mddev(mddev, tmp) {
1da177e4 9277 export_array(mddev);
d3374825 9278 mddev->hold_active = 0;
1da177e4 9279 }
e804ac78
TH
9280 destroy_workqueue(md_misc_wq);
9281 destroy_workqueue(md_wq);
1da177e4
LT
9282}
9283
685784aa 9284subsys_initcall(md_init);
1da177e4
LT
9285module_exit(md_exit)
9286
f91de92e
N
9287static int get_ro(char *buffer, struct kernel_param *kp)
9288{
9289 return sprintf(buffer, "%d", start_readonly);
9290}
9291static int set_ro(const char *val, struct kernel_param *kp)
9292{
4c9309c0 9293 return kstrtouint(val, 10, (unsigned int *)&start_readonly);
f91de92e
N
9294}
9295
80ca3a44
N
9296module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
9297module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
efeb53c0 9298module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
f91de92e 9299
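/* Usage note (an illustration, not part of this file): as parameters of
 * the md_mod module these can be set on the kernel command line, e.g.
 *
 *	md_mod.start_ro=1 md_mod.start_dirty_degraded=1
 *
 * or at runtime through /sys/module/md_mod/parameters/.
 */
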
1da177e4 9300MODULE_LICENSE("GPL");
0efb9e61 9301MODULE_DESCRIPTION("MD RAID framework");
aa1595e9 9302MODULE_ALIAS("md");
72008652 9303MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);