1 /*
2  * raid10.c : Multiple Devices driver for Linux
3  *
4  * Copyright (C) 2000-2004 Neil Brown
5  *
6  * RAID-10 support for md.
7  *
8  * Based on code in raid1.c.  See raid1.c for further copyright information.
9  *
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2, or (at your option)
14  * any later version.
15  *
16  * You should have received a copy of the GNU General Public License
17  * (for example /usr/src/linux/COPYING); if not, write to the Free
18  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19  */
20
21 #include <linux/slab.h>
22 #include <linux/delay.h>
23 #include <linux/blkdev.h>
24 #include <linux/module.h>
25 #include <linux/seq_file.h>
26 #include <linux/ratelimit.h>
27 #include <linux/kthread.h>
28 #include <trace/events/block.h>
29 #include "md.h"
30 #include "raid10.h"
31 #include "raid0.h"
32 #include "md-bitmap.h"
33
34 /*
35  * RAID10 provides a combination of RAID0 and RAID1 functionality.
36  * The layout of data is defined by
37  *    chunk_size
38  *    raid_disks
39  *    near_copies (stored in low byte of layout)
40  *    far_copies (stored in second byte of layout)
41  *    far_offset (stored in bit 16 of layout )
42  *    use_far_sets (stored in bit 17 of layout )
43  *    use_far_sets_bugfixed (stored in bit 18 of layout )
44  *
45  * The data to be stored is divided into chunks using chunksize.  Each device
46  * is divided into far_copies sections.   In each section, chunks are laid out
47  * in a style similar to raid0, but near_copies copies of each chunk are stored
48  * (each on a different drive).  The starting device for each section is offset
49  * near_copies from the starting device of the previous section.  Thus there
50  * are (near_copies * far_copies) copies of each chunk, and each is on a different
51  * drive.  near_copies and far_copies must be at least one, and their product
52  * is at most raid_disks.
53  *
54  * If far_offset is true, then the far_copies are handled a bit differently.
55  * The copies are still in different stripes, but instead of being very far
56  * apart on disk, they are in adjacent stripes.
57  *
58  * The far and offset algorithms are handled slightly differently if
59  * 'use_far_sets' is true.  In this case, the array's devices are grouped into
60  * sets that are (near_copies * far_copies) in size.  The far copied stripes
61  * are still shifted by 'near_copies' devices, but this shifting stays confined
62  * to the set rather than the entire array.  This is done to improve the number
63  * of device combinations that can fail without causing the array to fail.
64  * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
65  * on a device):
66  *    A B C D    A B C D E
67  *      ...         ...
68  *    D A B C    E A B C D
69  * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
70  *    [A B] [C D]    [A B] [C D E]
71  *    |...| |...|    |...| | ... |
72  *    [B A] [D C]    [B A] [E C D]
73  */
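/*
 * Worked example (illustrative numbers, not from the original comment): with
 * raid_disks=4, near_copies=2, far_copies=1 the layout behaves like a stripe
 * of mirrors -- chunk 0 lands on devices 0 and 1, chunk 1 on devices 2 and 3,
 * chunk 2 back on devices 0 and 1, and so on.
 */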
74
75 /*
76  * Number of guaranteed r10bios in case of extreme VM load:
77  */
78 #define NR_RAID10_BIOS 256
79
80 /* when we get a read error on a read-only array, we redirect to another
81  * device without failing the first device, or trying to over-write to
82  * correct the read error.  To keep track of bad blocks on a per-bio
83  * level, we store IO_BLOCKED in the appropriate 'bios' pointer
84  */
85 #define IO_BLOCKED ((struct bio *)1)
86 /* When we successfully write to a known bad-block, we need to remove the
87  * bad-block marking which must be done from process context.  So we record
88  * the success by setting devs[n].bio to IO_MADE_GOOD
89  */
90 #define IO_MADE_GOOD ((struct bio *)2)
91
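/*
 * BIO_SPECIAL() matches the two sentinel values above as well as NULL, so
 * cleanup code can tell a marker apart from a real bio that needs bio_put().
 */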
92 #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
93
94 /* When there are this many requests queued to be written by
95  * the raid10 thread, we become 'congested' to provide back-pressure
96  * for writeback.
97  */
98 static int max_queued_requests = 1024;
99
100 static void allow_barrier(struct r10conf *conf);
101 static void lower_barrier(struct r10conf *conf);
102 static int _enough(struct r10conf *conf, int previous, int ignore);
103 static int enough(struct r10conf *conf, int ignore);
104 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
105                                 int *skipped);
106 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
107 static void end_reshape_write(struct bio *bio);
108 static void end_reshape(struct r10conf *conf);
109
110 #define raid10_log(md, fmt, args...)                            \
111         do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
112
113 #include "raid1-10.c"
114
115 /*
116  * For a resync bio, the r10bio pointer can be retrieved from the per-bio
117  * 'struct resync_pages'.
118  */
119 static inline struct r10bio *get_resync_r10bio(struct bio *bio)
120 {
121         return get_resync_pages(bio)->raid_bio;
122 }
123
124 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
125 {
126         struct r10conf *conf = data;
127         int size = offsetof(struct r10bio, devs[conf->copies]);
128
129         /* allocate an r10bio with room for conf->copies entries in the
130          * devs array */
131         return kzalloc(size, gfp_flags);
132 }
133
134 static void r10bio_pool_free(void *r10_bio, void *data)
135 {
136         kfree(r10_bio);
137 }
138
139 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
140 /* amount of memory to reserve for resync requests */
141 #define RESYNC_WINDOW (1024*1024)
142 /* maximum number of concurrent requests, memory permitting */
143 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
144 #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
145 #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
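/*
 * For reference (assuming the usual 64KiB RESYNC_BLOCK_SIZE from raid1-10.c):
 * RESYNC_SECTORS is 128, RESYNC_DEPTH allows 512 blocks in flight, and the
 * cluster resync window spans 16MiB, i.e. 32768 sectors.
 */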
146
147 /*
148  * When performing a resync, we need to read and compare, so
149  * we need as many pages as there are copies.
150  * When performing a recovery, we need 2 bios, one for read,
151  * one for write (we recover only one drive per r10buf)
152  *
153  */
154 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
155 {
156         struct r10conf *conf = data;
157         struct r10bio *r10_bio;
158         struct bio *bio;
159         int j;
160         int nalloc, nalloc_rp;
161         struct resync_pages *rps;
162
163         r10_bio = r10bio_pool_alloc(gfp_flags, conf);
164         if (!r10_bio)
165                 return NULL;
166
167         if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
168             test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
169                 nalloc = conf->copies; /* resync */
170         else
171                 nalloc = 2; /* recovery */
172
173         /* allocate once for all bios */
174         if (!conf->have_replacement)
175                 nalloc_rp = nalloc;
176         else
177                 nalloc_rp = nalloc * 2;
178         rps = kmalloc(sizeof(struct resync_pages) * nalloc_rp, gfp_flags);
179         if (!rps)
180                 goto out_free_r10bio;
181
182         /*
183          * Allocate bios.
184          */
185         for (j = nalloc ; j-- ; ) {
186                 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
187                 if (!bio)
188                         goto out_free_bio;
189                 r10_bio->devs[j].bio = bio;
190                 if (!conf->have_replacement)
191                         continue;
192                 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
193                 if (!bio)
194                         goto out_free_bio;
195                 r10_bio->devs[j].repl_bio = bio;
196         }
197         /*
198          * Allocate RESYNC_PAGES data pages and attach them
199          * where needed.
200          */
201         for (j = 0; j < nalloc; j++) {
202                 struct bio *rbio = r10_bio->devs[j].repl_bio;
203                 struct resync_pages *rp, *rp_repl;
204
205                 rp = &rps[j];
206                 if (rbio)
207                         rp_repl = &rps[nalloc + j];
208
209                 bio = r10_bio->devs[j].bio;
210
211                 if (!j || test_bit(MD_RECOVERY_SYNC,
212                                    &conf->mddev->recovery)) {
213                         if (resync_alloc_pages(rp, gfp_flags))
214                                 goto out_free_pages;
215                 } else {
216                         memcpy(rp, &rps[0], sizeof(*rp));
217                         resync_get_all_pages(rp);
218                 }
219
220                 rp->raid_bio = r10_bio;
221                 bio->bi_private = rp;
222                 if (rbio) {
223                         memcpy(rp_repl, rp, sizeof(*rp));
224                         rbio->bi_private = rp_repl;
225                 }
226         }
227
228         return r10_bio;
229
230 out_free_pages:
231         while (--j >= 0)
232                 resync_free_pages(&rps[j * 2]);
233
234         j = 0;
235 out_free_bio:
236         for ( ; j < nalloc; j++) {
237                 if (r10_bio->devs[j].bio)
238                         bio_put(r10_bio->devs[j].bio);
239                 if (r10_bio->devs[j].repl_bio)
240                         bio_put(r10_bio->devs[j].repl_bio);
241         }
242         kfree(rps);
243 out_free_r10bio:
244         r10bio_pool_free(r10_bio, conf);
245         return NULL;
246 }
247
248 static void r10buf_pool_free(void *__r10_bio, void *data)
249 {
250         struct r10conf *conf = data;
251         struct r10bio *r10bio = __r10_bio;
252         int j;
253         struct resync_pages *rp = NULL;
254
255         for (j = conf->copies; j--; ) {
256                 struct bio *bio = r10bio->devs[j].bio;
257
258                 rp = get_resync_pages(bio);
259                 resync_free_pages(rp);
260                 bio_put(bio);
261
262                 bio = r10bio->devs[j].repl_bio;
263                 if (bio)
264                         bio_put(bio);
265         }
266
267         /* resync pages array stored in the 1st bio's .bi_private */
268         kfree(rp);
269
270         r10bio_pool_free(r10bio, conf);
271 }
272
273 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
274 {
275         int i;
276
277         for (i = 0; i < conf->copies; i++) {
278                 struct bio **bio = & r10_bio->devs[i].bio;
279                 if (!BIO_SPECIAL(*bio))
280                         bio_put(*bio);
281                 *bio = NULL;
282                 bio = &r10_bio->devs[i].repl_bio;
283                 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
284                         bio_put(*bio);
285                 *bio = NULL;
286         }
287 }
288
289 static void free_r10bio(struct r10bio *r10_bio)
290 {
291         struct r10conf *conf = r10_bio->mddev->private;
292
293         put_all_bios(conf, r10_bio);
294         mempool_free(r10_bio, conf->r10bio_pool);
295 }
296
297 static void put_buf(struct r10bio *r10_bio)
298 {
299         struct r10conf *conf = r10_bio->mddev->private;
300
301         mempool_free(r10_bio, conf->r10buf_pool);
302
303         lower_barrier(conf);
304 }
305
306 static void reschedule_retry(struct r10bio *r10_bio)
307 {
308         unsigned long flags;
309         struct mddev *mddev = r10_bio->mddev;
310         struct r10conf *conf = mddev->private;
311
312         spin_lock_irqsave(&conf->device_lock, flags);
313         list_add(&r10_bio->retry_list, &conf->retry_list);
314         conf->nr_queued ++;
315         spin_unlock_irqrestore(&conf->device_lock, flags);
316
317         /* wake up frozen array... */
318         wake_up(&conf->wait_barrier);
319
320         md_wakeup_thread(mddev->thread);
321 }
322
323 /*
324  * raid_end_bio_io() is called when we have finished servicing a mirrored
325  * operation and are ready to return a success/failure code to the buffer
326  * cache layer.
327  */
328 static void raid_end_bio_io(struct r10bio *r10_bio)
329 {
330         struct bio *bio = r10_bio->master_bio;
331         struct r10conf *conf = r10_bio->mddev->private;
332
333         if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
334                 bio->bi_status = BLK_STS_IOERR;
335
336         bio_endio(bio);
337         /*
338          * Wake up any possible resync thread that waits for the device
339          * to go idle.
340          */
341         allow_barrier(conf);
342
343         free_r10bio(r10_bio);
344 }
345
346 /*
347  * Update disk head position estimator based on IRQ completion info.
348  */
349 static inline void update_head_pos(int slot, struct r10bio *r10_bio)
350 {
351         struct r10conf *conf = r10_bio->mddev->private;
352
353         conf->mirrors[r10_bio->devs[slot].devnum].head_position =
354                 r10_bio->devs[slot].addr + (r10_bio->sectors);
355 }
356
357 /*
358  * Find the disk number which triggered the given bio
359  */
360 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
361                          struct bio *bio, int *slotp, int *replp)
362 {
363         int slot;
364         int repl = 0;
365
366         for (slot = 0; slot < conf->copies; slot++) {
367                 if (r10_bio->devs[slot].bio == bio)
368                         break;
369                 if (r10_bio->devs[slot].repl_bio == bio) {
370                         repl = 1;
371                         break;
372                 }
373         }
374
375         BUG_ON(slot == conf->copies);
376         update_head_pos(slot, r10_bio);
377
378         if (slotp)
379                 *slotp = slot;
380         if (replp)
381                 *replp = repl;
382         return r10_bio->devs[slot].devnum;
383 }
384
385 static void raid10_end_read_request(struct bio *bio)
386 {
387         int uptodate = !bio->bi_status;
388         struct r10bio *r10_bio = bio->bi_private;
389         int slot;
390         struct md_rdev *rdev;
391         struct r10conf *conf = r10_bio->mddev->private;
392
393         slot = r10_bio->read_slot;
394         rdev = r10_bio->devs[slot].rdev;
395         /*
396          * this branch is our 'one mirror IO has finished' event handler:
397          */
398         update_head_pos(slot, r10_bio);
399
400         if (uptodate) {
401                 /*
402                  * Set R10BIO_Uptodate in our master bio, so that
403                  * we will return a good error code to the higher
404                  * levels even if IO on some other mirrored buffer fails.
405                  *
406                  * The 'master' represents the composite IO operation to
407                  * user-side. So if something waits for IO, then it will
408                  * wait for the 'master' bio.
409                  */
410                 set_bit(R10BIO_Uptodate, &r10_bio->state);
411         } else {
412                 /* If all other devices that store this block have
413                  * failed, we want to return the error upwards rather
414                  * than fail the last device.  Here we redefine
415                  * "uptodate" to mean "Don't want to retry"
416                  */
417                 if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
418                              rdev->raid_disk))
419                         uptodate = 1;
420         }
421         if (uptodate) {
422                 raid_end_bio_io(r10_bio);
423                 rdev_dec_pending(rdev, conf->mddev);
424         } else {
425                 /*
426                  * oops, read error - keep the refcount on the rdev
427                  */
428                 char b[BDEVNAME_SIZE];
429                 pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
430                                    mdname(conf->mddev),
431                                    bdevname(rdev->bdev, b),
432                                    (unsigned long long)r10_bio->sector);
433                 set_bit(R10BIO_ReadError, &r10_bio->state);
434                 reschedule_retry(r10_bio);
435         }
436 }
437
438 static void close_write(struct r10bio *r10_bio)
439 {
440         /* clear the bitmap if all writes complete successfully */
441         bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
442                         r10_bio->sectors,
443                         !test_bit(R10BIO_Degraded, &r10_bio->state),
444                         0);
445         md_write_end(r10_bio->mddev);
446 }
447
448 static void one_write_done(struct r10bio *r10_bio)
449 {
450         if (atomic_dec_and_test(&r10_bio->remaining)) {
451                 if (test_bit(R10BIO_WriteError, &r10_bio->state))
452                         reschedule_retry(r10_bio);
453                 else {
454                         close_write(r10_bio);
455                         if (test_bit(R10BIO_MadeGood, &r10_bio->state))
456                                 reschedule_retry(r10_bio);
457                         else
458                                 raid_end_bio_io(r10_bio);
459                 }
460         }
461 }
462
463 static void raid10_end_write_request(struct bio *bio)
464 {
465         struct r10bio *r10_bio = bio->bi_private;
466         int dev;
467         int dec_rdev = 1;
468         struct r10conf *conf = r10_bio->mddev->private;
469         int slot, repl;
470         struct md_rdev *rdev = NULL;
471         struct bio *to_put = NULL;
472         bool discard_error;
473
474         discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;
475
476         dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
477
478         if (repl)
479                 rdev = conf->mirrors[dev].replacement;
480         if (!rdev) {
481                 smp_rmb();
482                 repl = 0;
483                 rdev = conf->mirrors[dev].rdev;
484         }
485         /*
486          * this branch is our 'one mirror IO has finished' event handler:
487          */
488         if (bio->bi_status && !discard_error) {
489                 if (repl)
490                         /* Never record new bad blocks to replacement,
491                          * just fail it.
492                          */
493                         md_error(rdev->mddev, rdev);
494                 else {
495                         set_bit(WriteErrorSeen, &rdev->flags);
496                         if (!test_and_set_bit(WantReplacement, &rdev->flags))
497                                 set_bit(MD_RECOVERY_NEEDED,
498                                         &rdev->mddev->recovery);
499
500                         dec_rdev = 0;
501                         if (test_bit(FailFast, &rdev->flags) &&
502                             (bio->bi_opf & MD_FAILFAST)) {
503                                 md_error(rdev->mddev, rdev);
504                                 if (!test_bit(Faulty, &rdev->flags))
505                                         /* This is the only remaining device;
506                                          * we need to retry the write without
507                                          * FailFast
508                                          */
509                                         set_bit(R10BIO_WriteError, &r10_bio->state);
510                                 else {
511                                         r10_bio->devs[slot].bio = NULL;
512                                         to_put = bio;
513                                         dec_rdev = 1;
514                                 }
515                         } else
516                                 set_bit(R10BIO_WriteError, &r10_bio->state);
517                 }
518         } else {
519                 /*
520                  * Set R10BIO_Uptodate in our master bio, so that
521                  * we will return a good error code to the higher
522                  * levels even if IO on some other mirrored buffer fails.
523                  *
524                  * The 'master' represents the composite IO operation to
525                  * user-side. So if something waits for IO, then it will
526                  * wait for the 'master' bio.
527                  */
528                 sector_t first_bad;
529                 int bad_sectors;
530
531                 /*
532                  * Do not set R10BIO_Uptodate if the current device is
533                  * rebuilding or Faulty. This is because we cannot use
534                  * such a device for properly reading the data back (we could
535                  * potentially use it, if the current write had landed
536                  * before rdev->recovery_offset, but for simplicity we don't
537                  * check this here).
538                  */
539                 if (test_bit(In_sync, &rdev->flags) &&
540                     !test_bit(Faulty, &rdev->flags))
541                         set_bit(R10BIO_Uptodate, &r10_bio->state);
542
543                 /* Maybe we can clear some bad blocks. */
544                 if (is_badblock(rdev,
545                                 r10_bio->devs[slot].addr,
546                                 r10_bio->sectors,
547                                 &first_bad, &bad_sectors) && !discard_error) {
548                         bio_put(bio);
549                         if (repl)
550                                 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
551                         else
552                                 r10_bio->devs[slot].bio = IO_MADE_GOOD;
553                         dec_rdev = 0;
554                         set_bit(R10BIO_MadeGood, &r10_bio->state);
555                 }
556         }
557
558         /*
559          *
560          * Let's see if all mirrored write operations have finished
561          * already.
562          */
563         one_write_done(r10_bio);
564         if (dec_rdev)
565                 rdev_dec_pending(rdev, conf->mddev);
566         if (to_put)
567                 bio_put(to_put);
568 }
569
570 /*
571  * RAID10 layout manager
572  * As well as the chunksize and raid_disks count, there are two
573  * parameters: near_copies and far_copies.
574  * near_copies * far_copies must be <= raid_disks.
575  * Normally one of these will be 1.
576  * If both are 1, we get raid0.
577  * If near_copies == raid_disks, we get raid1.
578  *
579  * Chunks are laid out in raid0 style with near_copies copies of the
580  * first chunk, followed by near_copies copies of the next chunk and
581  * so on.
582  * If far_copies > 1, then after 1/far_copies of the array has been assigned
583  * as described above, we start again with a device offset of near_copies.
584  * So we effectively have another copy of the whole array further down all
585  * the drives, but with blocks on different drives.
586  * With this layout, a block is never stored twice on the same device.
587  *
588  * raid10_find_phys finds the sector offset of a given virtual sector
589  * on each device that it is on.
590  *
591  * raid10_find_virt does the reverse mapping, from a device and a
592  * sector offset to a virtual address
593  */
594
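/*
 * Worked example for __raid10_find_phys() (illustrative numbers): with
 * raid_disks=4, near_copies=2, far_copies=1 and 64-sector chunks
 * (chunk_shift=6), logical sector 200 gives chunk=3, offset=8; after
 * chunk *= near_copies we get 6, so dev = 6 % 4 = 2 and stripe = 6 / 4 = 1,
 * i.e. both copies live at device sector (1 << 6) + 8 = 72, on devices
 * 2 and 3.
 */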
595 static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
596 {
597         int n,f;
598         sector_t sector;
599         sector_t chunk;
600         sector_t stripe;
601         int dev;
602         int slot = 0;
603         int last_far_set_start, last_far_set_size;
604
605         last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
606         last_far_set_start *= geo->far_set_size;
607
608         last_far_set_size = geo->far_set_size;
609         last_far_set_size += (geo->raid_disks % geo->far_set_size);
610
611         /* now calculate first sector/dev */
612         chunk = r10bio->sector >> geo->chunk_shift;
613         sector = r10bio->sector & geo->chunk_mask;
614
615         chunk *= geo->near_copies;
616         stripe = chunk;
617         dev = sector_div(stripe, geo->raid_disks);
618         if (geo->far_offset)
619                 stripe *= geo->far_copies;
620
621         sector += stripe << geo->chunk_shift;
622
623         /* and calculate all the others */
624         for (n = 0; n < geo->near_copies; n++) {
625                 int d = dev;
626                 int set;
627                 sector_t s = sector;
628                 r10bio->devs[slot].devnum = d;
629                 r10bio->devs[slot].addr = s;
630                 slot++;
631
632                 for (f = 1; f < geo->far_copies; f++) {
633                         set = d / geo->far_set_size;
634                         d += geo->near_copies;
635
636                         if ((geo->raid_disks % geo->far_set_size) &&
637                             (d > last_far_set_start)) {
638                                 d -= last_far_set_start;
639                                 d %= last_far_set_size;
640                                 d += last_far_set_start;
641                         } else {
642                                 d %= geo->far_set_size;
643                                 d += geo->far_set_size * set;
644                         }
645                         s += geo->stride;
646                         r10bio->devs[slot].devnum = d;
647                         r10bio->devs[slot].addr = s;
648                         slot++;
649                 }
650                 dev++;
651                 if (dev >= geo->raid_disks) {
652                         dev = 0;
653                         sector += (geo->chunk_mask + 1);
654                 }
655         }
656 }
657
658 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
659 {
660         struct geom *geo = &conf->geo;
661
662         if (conf->reshape_progress != MaxSector &&
663             ((r10bio->sector >= conf->reshape_progress) !=
664              conf->mddev->reshape_backwards)) {
665                 set_bit(R10BIO_Previous, &r10bio->state);
666                 geo = &conf->prev;
667         } else
668                 clear_bit(R10BIO_Previous, &r10bio->state);
669
670         __raid10_find_phys(geo, r10bio);
671 }
672
673 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
674 {
675         sector_t offset, chunk, vchunk;
676         /* Never use conf->prev as this is only called during resync
677          * or recovery, so reshape isn't happening
678          */
679         struct geom *geo = &conf->geo;
680         int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
681         int far_set_size = geo->far_set_size;
682         int last_far_set_start;
683
684         if (geo->raid_disks % geo->far_set_size) {
685                 last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
686                 last_far_set_start *= geo->far_set_size;
687
688                 if (dev >= last_far_set_start) {
689                         far_set_size = geo->far_set_size;
690                         far_set_size += (geo->raid_disks % geo->far_set_size);
691                         far_set_start = last_far_set_start;
692                 }
693         }
694
695         offset = sector & geo->chunk_mask;
696         if (geo->far_offset) {
697                 int fc;
698                 chunk = sector >> geo->chunk_shift;
699                 fc = sector_div(chunk, geo->far_copies);
700                 dev -= fc * geo->near_copies;
701                 if (dev < far_set_start)
702                         dev += far_set_size;
703         } else {
704                 while (sector >= geo->stride) {
705                         sector -= geo->stride;
706                         if (dev < (geo->near_copies + far_set_start))
707                                 dev += far_set_size - geo->near_copies;
708                         else
709                                 dev -= geo->near_copies;
710                 }
711                 chunk = sector >> geo->chunk_shift;
712         }
713         vchunk = chunk * geo->raid_disks + dev;
714         sector_div(vchunk, geo->near_copies);
715         return (vchunk << geo->chunk_shift) + offset;
716 }
717
718 /*
719  * This routine returns the disk from which the requested read should
720  * be done. There is a per-array 'next expected sequential IO' sector
721  * number - if this matches on the next IO then we use the last disk.
722  * There is also a per-disk 'last known head position' sector that is
723  * maintained from IRQ context; both the normal and the resync IO
724  * completion handlers update this position correctly. If there is no
725  * perfect sequential match then we pick the disk whose head is closest.
726  *
727  * If there are 2 mirrors in the same 2 devices, performance degrades
728  * because the position is tracked per mirror, not per device.
729  *
730  * The rdev for the device selected will have nr_pending incremented.
731  */
732
733 /*
734  * FIXME: possibly should rethink readbalancing and do it differently
735  * depending on near_copies / far_copies geometry.
736  */
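/*
 * In short: an idle disk (nr_pending == 0) wins outright for 'near' layouts,
 * 'far' layouts always prefer the copy at the lowest device address, and
 * otherwise the copy whose last known head position is closest to the
 * requested sector is picked.
 */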
737 static struct md_rdev *read_balance(struct r10conf *conf,
738                                     struct r10bio *r10_bio,
739                                     int *max_sectors)
740 {
741         const sector_t this_sector = r10_bio->sector;
742         int disk, slot;
743         int sectors = r10_bio->sectors;
744         int best_good_sectors;
745         sector_t new_distance, best_dist;
746         struct md_rdev *best_rdev, *rdev = NULL;
747         int do_balance;
748         int best_slot;
749         struct geom *geo = &conf->geo;
750
751         raid10_find_phys(conf, r10_bio);
752         rcu_read_lock();
753         best_slot = -1;
754         best_rdev = NULL;
755         best_dist = MaxSector;
756         best_good_sectors = 0;
757         do_balance = 1;
758         clear_bit(R10BIO_FailFast, &r10_bio->state);
759         /*
760          * Check if we can balance. We can balance on the whole
761          * device if no resync is going on (recovery is ok), or below
762          * the resync window. We take the first readable disk when
763          * above the resync window.
764          */
765         if ((conf->mddev->recovery_cp < MaxSector
766              && (this_sector + sectors >= conf->next_resync)) ||
767             (mddev_is_clustered(conf->mddev) &&
768              md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
769                                             this_sector + sectors)))
770                 do_balance = 0;
771
772         for (slot = 0; slot < conf->copies ; slot++) {
773                 sector_t first_bad;
774                 int bad_sectors;
775                 sector_t dev_sector;
776
777                 if (r10_bio->devs[slot].bio == IO_BLOCKED)
778                         continue;
779                 disk = r10_bio->devs[slot].devnum;
780                 rdev = rcu_dereference(conf->mirrors[disk].replacement);
781                 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
782                     r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
783                         rdev = rcu_dereference(conf->mirrors[disk].rdev);
784                 if (rdev == NULL ||
785                     test_bit(Faulty, &rdev->flags))
786                         continue;
787                 if (!test_bit(In_sync, &rdev->flags) &&
788                     r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
789                         continue;
790
791                 dev_sector = r10_bio->devs[slot].addr;
792                 if (is_badblock(rdev, dev_sector, sectors,
793                                 &first_bad, &bad_sectors)) {
794                         if (best_dist < MaxSector)
795                                 /* Already have a better slot */
796                                 continue;
797                         if (first_bad <= dev_sector) {
798                                 /* Cannot read here.  If this is the
799                                  * 'primary' device, then we must not read
800                                  * beyond 'bad_sectors' from another device.
801                                  */
802                                 bad_sectors -= (dev_sector - first_bad);
803                                 if (!do_balance && sectors > bad_sectors)
804                                         sectors = bad_sectors;
805                                 if (best_good_sectors > sectors)
806                                         best_good_sectors = sectors;
807                         } else {
808                                 sector_t good_sectors =
809                                         first_bad - dev_sector;
810                                 if (good_sectors > best_good_sectors) {
811                                         best_good_sectors = good_sectors;
812                                         best_slot = slot;
813                                         best_rdev = rdev;
814                                 }
815                                 if (!do_balance)
816                                         /* Must read from here */
817                                         break;
818                         }
819                         continue;
820                 } else
821                         best_good_sectors = sectors;
822
823                 if (!do_balance)
824                         break;
825
826                 if (best_slot >= 0)
827                         /* At least 2 disks to choose from so failfast is OK */
828                         set_bit(R10BIO_FailFast, &r10_bio->state);
829                 /* This optimisation is debatable, and completely destroys
830                  * sequential read speed for 'far copies' arrays.  So only
831                  * keep it for 'near' arrays, and review those later.
832                  */
833                 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
834                         new_distance = 0;
835
836                 /* for far > 1 always use the lowest address */
837                 else if (geo->far_copies > 1)
838                         new_distance = r10_bio->devs[slot].addr;
839                 else
840                         new_distance = abs(r10_bio->devs[slot].addr -
841                                            conf->mirrors[disk].head_position);
842                 if (new_distance < best_dist) {
843                         best_dist = new_distance;
844                         best_slot = slot;
845                         best_rdev = rdev;
846                 }
847         }
848         if (slot >= conf->copies) {
849                 slot = best_slot;
850                 rdev = best_rdev;
851         }
852
853         if (slot >= 0) {
854                 atomic_inc(&rdev->nr_pending);
855                 r10_bio->read_slot = slot;
856         } else
857                 rdev = NULL;
858         rcu_read_unlock();
859         *max_sectors = best_good_sectors;
860
861         return rdev;
862 }
863
864 static int raid10_congested(struct mddev *mddev, int bits)
865 {
866         struct r10conf *conf = mddev->private;
867         int i, ret = 0;
868
869         if ((bits & (1 << WB_async_congested)) &&
870             conf->pending_count >= max_queued_requests)
871                 return 1;
872
873         rcu_read_lock();
874         for (i = 0;
875              (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
876                      && ret == 0;
877              i++) {
878                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
879                 if (rdev && !test_bit(Faulty, &rdev->flags)) {
880                         struct request_queue *q = bdev_get_queue(rdev->bdev);
881
882                         ret |= bdi_congested(q->backing_dev_info, bits);
883                 }
884         }
885         rcu_read_unlock();
886         return ret;
887 }
888
889 static void flush_pending_writes(struct r10conf *conf)
890 {
891         /* Any writes that have been queued but are awaiting
892          * bitmap updates get flushed here.
893          */
894         spin_lock_irq(&conf->device_lock);
895
896         if (conf->pending_bio_list.head) {
897                 struct bio *bio;
898                 bio = bio_list_get(&conf->pending_bio_list);
899                 conf->pending_count = 0;
900                 spin_unlock_irq(&conf->device_lock);
901                 /* flush any pending bitmap writes to disk
902                  * before proceeding w/ I/O */
903                 bitmap_unplug(conf->mddev->bitmap);
904                 wake_up(&conf->wait_barrier);
905
906                 while (bio) { /* submit pending writes */
907                         struct bio *next = bio->bi_next;
908                         struct md_rdev *rdev = (void*)bio->bi_disk;
909                         bio->bi_next = NULL;
910                         bio_set_dev(bio, rdev->bdev);
911                         if (test_bit(Faulty, &rdev->flags)) {
912                                 bio_io_error(bio);
913                         } else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
914                                             !blk_queue_discard(bio->bi_disk->queue)))
915                                 /* Just ignore it */
916                                 bio_endio(bio);
917                         else
918                                 generic_make_request(bio);
919                         bio = next;
920                 }
921         } else
922                 spin_unlock_irq(&conf->device_lock);
923 }
924
925 /* Barriers....
926  * Sometimes we need to suspend IO while we do something else,
927  * either some resync/recovery, or reconfigure the array.
928  * To do this we raise a 'barrier'.
929  * The 'barrier' is a counter that can be raised multiple times
930  * to count how many activities are happening which preclude
931  * normal IO.
932  * We can only raise the barrier if there is no pending IO.
933  * i.e. if nr_pending == 0.
934  * We choose only to raise the barrier if no-one is waiting for the
935  * barrier to go down.  This means that as soon as an IO request
936  * is ready, no other operations which require a barrier will start
937  * until the IO request has had a chance.
938  *
939  * So: regular IO calls 'wait_barrier'.  When that returns there
940  *    is no background IO happening.  It must arrange to call
941  *    allow_barrier when it has finished its IO.
942  * background IO calls must call raise_barrier.  Once that returns
943  *    there is no normal IO happening.  It must arrange to call
944  *    lower_barrier when the particular background IO completes.
945  */
946
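/*
 * Typical pairing, as a sketch: resync/recovery/reshape code brackets its
 * background I/O with raise_barrier()/lower_barrier(), while the normal
 * read/write paths bracket each request with wait_barrier()/allow_barrier():
 *
 *      raise_barrier(conf, 0);         wait_barrier(conf);
 *      ...background I/O...            ...submit normal I/O...
 *      lower_barrier(conf);            allow_barrier(conf);
 */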
947 static void raise_barrier(struct r10conf *conf, int force)
948 {
949         BUG_ON(force && !conf->barrier);
950         spin_lock_irq(&conf->resync_lock);
951
952         /* Wait until no block IO is waiting (unless 'force') */
953         wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
954                             conf->resync_lock);
955
956         /* block any new IO from starting */
957         conf->barrier++;
958
959         /* Now wait for all pending IO to complete */
960         wait_event_lock_irq(conf->wait_barrier,
961                             !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
962                             conf->resync_lock);
963
964         spin_unlock_irq(&conf->resync_lock);
965 }
966
967 static void lower_barrier(struct r10conf *conf)
968 {
969         unsigned long flags;
970         spin_lock_irqsave(&conf->resync_lock, flags);
971         conf->barrier--;
972         spin_unlock_irqrestore(&conf->resync_lock, flags);
973         wake_up(&conf->wait_barrier);
974 }
975
976 static void wait_barrier(struct r10conf *conf)
977 {
978         spin_lock_irq(&conf->resync_lock);
979         if (conf->barrier) {
980                 conf->nr_waiting++;
981                 /* Wait for the barrier to drop.
982                  * However if there are already pending
983                  * requests (preventing the barrier from
984                  * rising completely), and the
985                  * pre-process bio queue isn't empty,
986                  * then don't wait, as we need to empty
987                  * that queue to get the nr_pending
988                  * count down.
989                  */
990                 raid10_log(conf->mddev, "wait barrier");
991                 wait_event_lock_irq(conf->wait_barrier,
992                                     !conf->barrier ||
993                                     (atomic_read(&conf->nr_pending) &&
994                                      current->bio_list &&
995                                      (!bio_list_empty(&current->bio_list[0]) ||
996                                       !bio_list_empty(&current->bio_list[1]))),
997                                     conf->resync_lock);
998                 conf->nr_waiting--;
999                 if (!conf->nr_waiting)
1000                         wake_up(&conf->wait_barrier);
1001         }
1002         atomic_inc(&conf->nr_pending);
1003         spin_unlock_irq(&conf->resync_lock);
1004 }
1005
1006 static void allow_barrier(struct r10conf *conf)
1007 {
1008         if ((atomic_dec_and_test(&conf->nr_pending)) ||
1009                         (conf->array_freeze_pending))
1010                 wake_up(&conf->wait_barrier);
1011 }
1012
1013 static void freeze_array(struct r10conf *conf, int extra)
1014 {
1015         /* stop sync IO and normal IO and wait for everything to
1016          * go quiet.
1017          * We increment barrier and nr_waiting, and then
1018          * wait until nr_pending matches nr_queued+extra
1019          * This is called in the context of one normal IO request
1020          * that has failed. Thus any sync request that might be pending
1021          * will be blocked by nr_pending, and we need to wait for
1022          * pending IO requests to complete or be queued for re-try.
1023          * Thus the number queued (nr_queued) plus this request (extra)
1024          * must match the number of pending IOs (nr_pending) before
1025          * we continue.
1026          */
1027         spin_lock_irq(&conf->resync_lock);
1028         conf->array_freeze_pending++;
1029         conf->barrier++;
1030         conf->nr_waiting++;
1031         wait_event_lock_irq_cmd(conf->wait_barrier,
1032                                 atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
1033                                 conf->resync_lock,
1034                                 flush_pending_writes(conf));
1035
1036         conf->array_freeze_pending--;
1037         spin_unlock_irq(&conf->resync_lock);
1038 }
1039
1040 static void unfreeze_array(struct r10conf *conf)
1041 {
1042         /* reverse the effect of the freeze */
1043         spin_lock_irq(&conf->resync_lock);
1044         conf->barrier--;
1045         conf->nr_waiting--;
1046         wake_up(&conf->wait_barrier);
1047         spin_unlock_irq(&conf->resync_lock);
1048 }
1049
1050 static sector_t choose_data_offset(struct r10bio *r10_bio,
1051                                    struct md_rdev *rdev)
1052 {
1053         if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1054             test_bit(R10BIO_Previous, &r10_bio->state))
1055                 return rdev->data_offset;
1056         else
1057                 return rdev->new_data_offset;
1058 }
1059
1060 struct raid10_plug_cb {
1061         struct blk_plug_cb      cb;
1062         struct bio_list         pending;
1063         int                     pending_cnt;
1064 };
1065
1066 static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1067 {
1068         struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1069                                                    cb);
1070         struct mddev *mddev = plug->cb.data;
1071         struct r10conf *conf = mddev->private;
1072         struct bio *bio;
1073
1074         if (from_schedule || current->bio_list) {
1075                 spin_lock_irq(&conf->device_lock);
1076                 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1077                 conf->pending_count += plug->pending_cnt;
1078                 spin_unlock_irq(&conf->device_lock);
1079                 wake_up(&conf->wait_barrier);
1080                 md_wakeup_thread(mddev->thread);
1081                 kfree(plug);
1082                 return;
1083         }
1084
1085         /* we aren't scheduling, so we can do the write-out directly. */
1086         bio = bio_list_get(&plug->pending);
1087         bitmap_unplug(mddev->bitmap);
1088         wake_up(&conf->wait_barrier);
1089
1090         while (bio) { /* submit pending writes */
1091                 struct bio *next = bio->bi_next;
1092                 struct md_rdev *rdev = (void*)bio->bi_disk;
1093                 bio->bi_next = NULL;
1094                 bio_set_dev(bio, rdev->bdev);
1095                 if (test_bit(Faulty, &rdev->flags)) {
1096                         bio_io_error(bio);
1097                 } else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
1098                                     !blk_queue_discard(bio->bi_disk->queue)))
1099                         /* Just ignore it */
1100                         bio_endio(bio);
1101                 else
1102                         generic_make_request(bio);
1103                 bio = next;
1104         }
1105         kfree(plug);
1106 }
1107
1108 static void raid10_read_request(struct mddev *mddev, struct bio *bio,
1109                                 struct r10bio *r10_bio)
1110 {
1111         struct r10conf *conf = mddev->private;
1112         struct bio *read_bio;
1113         const int op = bio_op(bio);
1114         const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1115         int max_sectors;
1116         sector_t sectors;
1117         struct md_rdev *rdev;
1118         char b[BDEVNAME_SIZE];
1119         int slot = r10_bio->read_slot;
1120         struct md_rdev *err_rdev = NULL;
1121         gfp_t gfp = GFP_NOIO;
1122
1123         if (r10_bio->devs[slot].rdev) {
1124                 /*
1125                  * This is an error retry, but we cannot
1126                  * safely dereference the rdev in the r10_bio;
1127                  * we must use the one in conf.
1128                  * If it has already been disconnected (unlikely)
1129                  * we lose the device name in error messages.
1130                  */
1131                 int disk;
1132                 /*
1133                  * As we are blocking raid10, it is a little safer to
1134                  * use __GFP_HIGH.
1135                  */
1136                 gfp = GFP_NOIO | __GFP_HIGH;
1137
1138                 rcu_read_lock();
1139                 disk = r10_bio->devs[slot].devnum;
1140                 err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
1141                 if (err_rdev)
1142                         bdevname(err_rdev->bdev, b);
1143                 else {
1144                         strcpy(b, "???");
1145                         /* This never gets dereferenced */
1146                         err_rdev = r10_bio->devs[slot].rdev;
1147                 }
1148                 rcu_read_unlock();
1149         }
1150         /*
1151          * Register the new request and wait if the reconstruction
1152          * thread has put up a barrier for new requests.
1153          * Continue immediately if no resync is active currently.
1154          */
1155         wait_barrier(conf);
1156
1157         sectors = r10_bio->sectors;
1158         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1159             bio->bi_iter.bi_sector < conf->reshape_progress &&
1160             bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1161                 /*
1162                  * IO spans the reshape position.  Need to wait for reshape to
1163                  * pass
1164                  */
1165                 raid10_log(conf->mddev, "wait reshape");
1166                 allow_barrier(conf);
1167                 wait_event(conf->wait_barrier,
1168                            conf->reshape_progress <= bio->bi_iter.bi_sector ||
1169                            conf->reshape_progress >= bio->bi_iter.bi_sector +
1170                            sectors);
1171                 wait_barrier(conf);
1172         }
1173
1174         rdev = read_balance(conf, r10_bio, &max_sectors);
1175         if (!rdev) {
1176                 if (err_rdev) {
1177                         pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
1178                                             mdname(mddev), b,
1179                                             (unsigned long long)r10_bio->sector);
1180                 }
1181                 raid_end_bio_io(r10_bio);
1182                 return;
1183         }
1184         if (err_rdev)
1185                 pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
1186                                    mdname(mddev),
1187                                    bdevname(rdev->bdev, b),
1188                                    (unsigned long long)r10_bio->sector);
1189         if (max_sectors < bio_sectors(bio)) {
1190                 struct bio *split = bio_split(bio, max_sectors,
1191                                               gfp, conf->bio_split);
1192                 bio_chain(split, bio);
1193                 generic_make_request(bio);
1194                 bio = split;
1195                 r10_bio->master_bio = bio;
1196                 r10_bio->sectors = max_sectors;
1197         }
1198         slot = r10_bio->read_slot;
1199
1200         read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
1201
1202         r10_bio->devs[slot].bio = read_bio;
1203         r10_bio->devs[slot].rdev = rdev;
1204
1205         read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
1206                 choose_data_offset(r10_bio, rdev);
1207         bio_set_dev(read_bio, rdev->bdev);
1208         read_bio->bi_end_io = raid10_end_read_request;
1209         bio_set_op_attrs(read_bio, op, do_sync);
1210         if (test_bit(FailFast, &rdev->flags) &&
1211             test_bit(R10BIO_FailFast, &r10_bio->state))
1212                 read_bio->bi_opf |= MD_FAILFAST;
1213         read_bio->bi_private = r10_bio;
1214
1215         if (mddev->gendisk)
1216                 trace_block_bio_remap(read_bio->bi_disk->queue,
1217                                       read_bio, disk_devt(mddev->gendisk),
1218                                       r10_bio->sector);
1219         generic_make_request(read_bio);
1220         return;
1221 }
1222
1223 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
1224                                   struct bio *bio, bool replacement,
1225                                   int n_copy)
1226 {
1227         const int op = bio_op(bio);
1228         const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1229         const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
1230         unsigned long flags;
1231         struct blk_plug_cb *cb;
1232         struct raid10_plug_cb *plug = NULL;
1233         struct r10conf *conf = mddev->private;
1234         struct md_rdev *rdev;
1235         int devnum = r10_bio->devs[n_copy].devnum;
1236         struct bio *mbio;
1237
1238         if (replacement) {
1239                 rdev = conf->mirrors[devnum].replacement;
1240                 if (rdev == NULL) {
1241                         /* Replacement just got moved to main 'rdev' */
1242                         smp_mb();
1243                         rdev = conf->mirrors[devnum].rdev;
1244                 }
1245         } else
1246                 rdev = conf->mirrors[devnum].rdev;
1247
1248         mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
1249         if (replacement)
1250                 r10_bio->devs[n_copy].repl_bio = mbio;
1251         else
1252                 r10_bio->devs[n_copy].bio = mbio;
1253
1254         mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
1255                                    choose_data_offset(r10_bio, rdev));
1256         bio_set_dev(mbio, rdev->bdev);
1257         mbio->bi_end_io = raid10_end_write_request;
1258         bio_set_op_attrs(mbio, op, do_sync | do_fua);
1259         if (!replacement && test_bit(FailFast,
1260                                      &conf->mirrors[devnum].rdev->flags)
1261                          && enough(conf, devnum))
1262                 mbio->bi_opf |= MD_FAILFAST;
1263         mbio->bi_private = r10_bio;
1264
1265         if (conf->mddev->gendisk)
1266                 trace_block_bio_remap(mbio->bi_disk->queue,
1267                                       mbio, disk_devt(conf->mddev->gendisk),
1268                                       r10_bio->sector);
1269         /* flush_pending_writes() needs access to the rdev so...*/
1270         mbio->bi_disk = (void *)rdev;
1271
1272         atomic_inc(&r10_bio->remaining);
1273
1274         cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
1275         if (cb)
1276                 plug = container_of(cb, struct raid10_plug_cb, cb);
1277         else
1278                 plug = NULL;
1279         if (plug) {
1280                 bio_list_add(&plug->pending, mbio);
1281                 plug->pending_cnt++;
1282         } else {
1283                 spin_lock_irqsave(&conf->device_lock, flags);
1284                 bio_list_add(&conf->pending_bio_list, mbio);
1285                 conf->pending_count++;
1286                 spin_unlock_irqrestore(&conf->device_lock, flags);
1287                 md_wakeup_thread(mddev->thread);
1288         }
1289 }
1290
1291 static void raid10_write_request(struct mddev *mddev, struct bio *bio,
1292                                  struct r10bio *r10_bio)
1293 {
1294         struct r10conf *conf = mddev->private;
1295         int i;
1296         struct md_rdev *blocked_rdev;
1297         sector_t sectors;
1298         int max_sectors;
1299
1300         if ((mddev_is_clustered(mddev) &&
1301              md_cluster_ops->area_resyncing(mddev, WRITE,
1302                                             bio->bi_iter.bi_sector,
1303                                             bio_end_sector(bio)))) {
1304                 DEFINE_WAIT(w);
1305                 for (;;) {
1306                         prepare_to_wait(&conf->wait_barrier,
1307                                         &w, TASK_IDLE);
1308                         if (!md_cluster_ops->area_resyncing(mddev, WRITE,
1309                                  bio->bi_iter.bi_sector, bio_end_sector(bio)))
1310                                 break;
1311                         schedule();
1312                 }
1313                 finish_wait(&conf->wait_barrier, &w);
1314         }
1315
1316         /*
1317          * Register the new request and wait if the reconstruction
1318          * thread has put up a barrier for new requests.
1319          * Continue immediately if no resync is active currently.
1320          */
1321         wait_barrier(conf);
1322
1323         sectors = r10_bio->sectors;
1324         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1325             bio->bi_iter.bi_sector < conf->reshape_progress &&
1326             bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
1327                 /*
1328                  * IO spans the reshape position.  Need to wait for reshape to
1329                  * pass
1330                  */
1331                 raid10_log(conf->mddev, "wait reshape");
1332                 allow_barrier(conf);
1333                 wait_event(conf->wait_barrier,
1334                            conf->reshape_progress <= bio->bi_iter.bi_sector ||
1335                            conf->reshape_progress >= bio->bi_iter.bi_sector +
1336                            sectors);
1337                 wait_barrier(conf);
1338         }
1339
1340         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1341             (mddev->reshape_backwards
1342              ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
1343                 bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
1344              : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe &&
1345                 bio->bi_iter.bi_sector < conf->reshape_progress))) {
1346                 /* Need to update reshape_position in metadata */
1347                 mddev->reshape_position = conf->reshape_progress;
1348                 set_mask_bits(&mddev->sb_flags, 0,
1349                               BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1350                 md_wakeup_thread(mddev->thread);
1351                 raid10_log(conf->mddev, "wait reshape metadata");
1352                 wait_event(mddev->sb_wait,
1353                            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
1354
1355                 conf->reshape_safe = mddev->reshape_position;
1356         }
1357
1358         if (conf->pending_count >= max_queued_requests) {
1359                 md_wakeup_thread(mddev->thread);
1360                 raid10_log(mddev, "wait queued");
1361                 wait_event(conf->wait_barrier,
1362                            conf->pending_count < max_queued_requests);
1363         }
1364         /* First select target devices under rcu_read_lock() and
1365          * increment the refcount on their rdev.  Record them by setting
1366          * bios[x] to bio.
1367          * If there are known/acknowledged bad blocks on any device
1368          * on which we have seen a write error, we want to avoid
1369          * writing to those blocks.  This potentially requires several
1370          * writes to write around the bad blocks.  Each set of writes
1371          * gets its own r10_bio with a set of bios attached.
1372          */
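        /*
         * Illustrative example (numbers are hypothetical): for a 64-sector
         * write where one mirror has an acknowledged bad range starting 16
         * sectors into the request, max_sectors is clipped to 16 below; the
         * bio is then split and the trailing 48 sectors re-enter the
         * make_request path with their own r10_bio.
         */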
1373
1374         r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
1375         raid10_find_phys(conf, r10_bio);
1376 retry_write:
1377         blocked_rdev = NULL;
1378         rcu_read_lock();
1379         max_sectors = r10_bio->sectors;
1380
1381         for (i = 0;  i < conf->copies; i++) {
1382                 int d = r10_bio->devs[i].devnum;
1383                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1384                 struct md_rdev *rrdev = rcu_dereference(
1385                         conf->mirrors[d].replacement);
1386                 if (rdev == rrdev)
1387                         rrdev = NULL;
1388                 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1389                         atomic_inc(&rdev->nr_pending);
1390                         blocked_rdev = rdev;
1391                         break;
1392                 }
1393                 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1394                         atomic_inc(&rrdev->nr_pending);
1395                         blocked_rdev = rrdev;
1396                         break;
1397                 }
1398                 if (rdev && (test_bit(Faulty, &rdev->flags)))
1399                         rdev = NULL;
1400                 if (rrdev && (test_bit(Faulty, &rrdev->flags)))
1401                         rrdev = NULL;
1402
1403                 r10_bio->devs[i].bio = NULL;
1404                 r10_bio->devs[i].repl_bio = NULL;
1405
1406                 if (!rdev && !rrdev) {
1407                         set_bit(R10BIO_Degraded, &r10_bio->state);
1408                         continue;
1409                 }
1410                 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1411                         sector_t first_bad;
1412                         sector_t dev_sector = r10_bio->devs[i].addr;
1413                         int bad_sectors;
1414                         int is_bad;
1415
1416                         is_bad = is_badblock(rdev, dev_sector, max_sectors,
1417                                              &first_bad, &bad_sectors);
1418                         if (is_bad < 0) {
1419                                 /* Mustn't write here until the bad block
1420                                  * is acknowledged
1421                                  */
1422                                 atomic_inc(&rdev->nr_pending);
1423                                 set_bit(BlockedBadBlocks, &rdev->flags);
1424                                 blocked_rdev = rdev;
1425                                 break;
1426                         }
1427                         if (is_bad && first_bad <= dev_sector) {
1428                                 /* Cannot write here at all */
1429                                 bad_sectors -= (dev_sector - first_bad);
1430                                 if (bad_sectors < max_sectors)
1431                                         /* Mustn't write more than bad_sectors
1432                                          * to other devices yet
1433                                          */
1434                                         max_sectors = bad_sectors;
1435                                 /* We don't set R10BIO_Degraded as that
1436                                  * only applies if the disk is missing, in
1437                                  * which case it might be re-added and we
1438                                  * would want to know to recover this chunk.
1439                                  * In this case the device is here, and the
1440                                  * fact that this chunk is not in-sync is
1441                                  * recorded in the bad block log.
1442                                  */
1443                                 continue;
1444                         }
1445                         if (is_bad) {
1446                                 int good_sectors = first_bad - dev_sector;
1447                                 if (good_sectors < max_sectors)
1448                                         max_sectors = good_sectors;
1449                         }
1450                 }
1451                 if (rdev) {
1452                         r10_bio->devs[i].bio = bio;
1453                         atomic_inc(&rdev->nr_pending);
1454                 }
1455                 if (rrdev) {
1456                         r10_bio->devs[i].repl_bio = bio;
1457                         atomic_inc(&rrdev->nr_pending);
1458                 }
1459         }
1460         rcu_read_unlock();
1461
1462         if (unlikely(blocked_rdev)) {
1463                 /* Have to wait for this device to get unblocked, then retry */
1464                 int j;
1465                 int d;
1466
1467                 for (j = 0; j < i; j++) {
1468                         if (r10_bio->devs[j].bio) {
1469                                 d = r10_bio->devs[j].devnum;
1470                                 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1471                         }
1472                         if (r10_bio->devs[j].repl_bio) {
1473                                 struct md_rdev *rdev;
1474                                 d = r10_bio->devs[j].devnum;
1475                                 rdev = conf->mirrors[d].replacement;
1476                                 if (!rdev) {
1477                                         /* Race with remove_disk */
1478                                         smp_mb();
1479                                         rdev = conf->mirrors[d].rdev;
1480                                 }
1481                                 rdev_dec_pending(rdev, mddev);
1482                         }
1483                 }
1484                 allow_barrier(conf);
1485                 raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
1486                 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1487                 wait_barrier(conf);
1488                 goto retry_write;
1489         }
1490
1491         if (max_sectors < r10_bio->sectors)
1492                 r10_bio->sectors = max_sectors;
1493
1494         if (r10_bio->sectors < bio_sectors(bio)) {
1495                 struct bio *split = bio_split(bio, r10_bio->sectors,
1496                                               GFP_NOIO, conf->bio_split);
1497                 bio_chain(split, bio);
1498                 generic_make_request(bio);
1499                 bio = split;
1500                 r10_bio->master_bio = bio;
1501         }
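        /*
         * Note: after the bio_split() above, 'bio' (the split) covers only
         * the first r10_bio->sectors of the request; the chained remainder
         * was resubmitted via generic_make_request() and will get its own
         * r10_bio when it re-enters this driver.
         */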
1502
1503         atomic_set(&r10_bio->remaining, 1);
1504         bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1505
1506         for (i = 0; i < conf->copies; i++) {
1507                 if (r10_bio->devs[i].bio)
1508                         raid10_write_one_disk(mddev, r10_bio, bio, false, i);
1509                 if (r10_bio->devs[i].repl_bio)
1510                         raid10_write_one_disk(mddev, r10_bio, bio, true, i);
1511         }
1512         one_write_done(r10_bio);
1513 }
1514
1515 static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
1516 {
1517         struct r10conf *conf = mddev->private;
1518         struct r10bio *r10_bio;
1519
1520         r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1521
1522         r10_bio->master_bio = bio;
1523         r10_bio->sectors = sectors;
1524
1525         r10_bio->mddev = mddev;
1526         r10_bio->sector = bio->bi_iter.bi_sector;
1527         r10_bio->state = 0;
1528         memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
1529
1530         if (bio_data_dir(bio) == READ)
1531                 raid10_read_request(mddev, bio, r10_bio);
1532         else
1533                 raid10_write_request(mddev, bio, r10_bio);
1534 }
1535
1536 static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
1537 {
1538         struct r10conf *conf = mddev->private;
1539         sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1540         int chunk_sects = chunk_mask + 1;
1541         int sectors = bio_sectors(bio);
1542
1543         if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1544                 md_flush_request(mddev, bio);
1545                 return true;
1546         }
1547
1548         if (!md_write_start(mddev, bio))
1549                 return false;
1550
1551         /*
1552          * If this request crosses a chunk boundary, we need to split
1553          * it.
1554          */
1555         if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
1556                      sectors > chunk_sects
1557                      && (conf->geo.near_copies < conf->geo.raid_disks
1558                          || conf->prev.near_copies <
1559                          conf->prev.raid_disks)))
1560                 sectors = chunk_sects -
1561                         (bio->bi_iter.bi_sector &
1562                          (chunk_sects - 1));
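        /*
         * Example (hypothetical numbers, striping layout): with 64KiB
         * chunks, chunk_sects is 128.  A request starting at sector 1000
         * sits 104 sectors into its chunk, so 'sectors' is clamped to the
         * 24 sectors up to the boundary; the remainder is split off and
         * resubmitted by the read/write paths below.
         */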
1563         __make_request(mddev, bio, sectors);
1564
1565         /* In case raid10d snuck in to freeze_array */
1566         wake_up(&conf->wait_barrier);
1567         return true;
1568 }
1569
1570 static void raid10_status(struct seq_file *seq, struct mddev *mddev)
1571 {
1572         struct r10conf *conf = mddev->private;
1573         int i;
1574
1575         if (conf->geo.near_copies < conf->geo.raid_disks)
1576                 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1577         if (conf->geo.near_copies > 1)
1578                 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1579         if (conf->geo.far_copies > 1) {
1580                 if (conf->geo.far_offset)
1581                         seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1582                 else
1583                         seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1584                 if (conf->geo.far_set_size != conf->geo.raid_disks)
1585                         seq_printf(seq, " %d devices per set", conf->geo.far_set_size);
1586         }
1587         seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1588                                         conf->geo.raid_disks - mddev->degraded);
1589         rcu_read_lock();
1590         for (i = 0; i < conf->geo.raid_disks; i++) {
1591                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1592                 seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1593         }
1594         rcu_read_unlock();
1595         seq_printf(seq, "]");
1596 }
1597
1598 /* check if there are enough drives for
1599  * every block to appear on at least one.
1600  * Don't consider the device numbered 'ignore'
1601  * as we might be about to remove it.
1602  */
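/*
 * For example (illustrative), a simple near-2 layout (far_copies == 1) on
 * 4 disks has copy sets {0,1} and {2,3}; _enough() succeeds as long as each
 * set still contains at least one In_sync device other than 'ignore'.
 */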
1603 static int _enough(struct r10conf *conf, int previous, int ignore)
1604 {
1605         int first = 0;
1606         int has_enough = 0;
1607         int disks, ncopies;
1608         if (previous) {
1609                 disks = conf->prev.raid_disks;
1610                 ncopies = conf->prev.near_copies;
1611         } else {
1612                 disks = conf->geo.raid_disks;
1613                 ncopies = conf->geo.near_copies;
1614         }
1615
1616         rcu_read_lock();
1617         do {
1618                 int n = conf->copies;
1619                 int cnt = 0;
1620                 int this = first;
1621                 while (n--) {
1622                         struct md_rdev *rdev;
1623                         if (this != ignore &&
1624                             (rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
1625                             test_bit(In_sync, &rdev->flags))
1626                                 cnt++;
1627                         this = (this+1) % disks;
1628                 }
1629                 if (cnt == 0)
1630                         goto out;
1631                 first = (first + ncopies) % disks;
1632         } while (first != 0);
1633         has_enough = 1;
1634 out:
1635         rcu_read_unlock();
1636         return has_enough;
1637 }
1638
1639 static int enough(struct r10conf *conf, int ignore)
1640 {
1641         /* when calling 'enough', both 'prev' and 'geo' must
1642          * be stable.
1643          * This is ensured if ->reconfig_mutex or ->device_lock
1644          * is held.
1645          */
1646         return _enough(conf, 0, ignore) &&
1647                 _enough(conf, 1, ignore);
1648 }
1649
1650 static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
1651 {
1652         char b[BDEVNAME_SIZE];
1653         struct r10conf *conf = mddev->private;
1654         unsigned long flags;
1655
1656         /*
1657          * If it is not operational, then we have already marked it as dead;
1658          * else if it is the last working disk, ignore the error and let the
1659          * next level up know;
1660          * else mark the drive as failed.
1661          */
1662         spin_lock_irqsave(&conf->device_lock, flags);
1663         if (test_bit(In_sync, &rdev->flags)
1664             && !enough(conf, rdev->raid_disk)) {
1665                 /*
1666                  * Don't fail the drive, just return an IO error.
1667                  */
1668                 spin_unlock_irqrestore(&conf->device_lock, flags);
1669                 return;
1670         }
1671         if (test_and_clear_bit(In_sync, &rdev->flags))
1672                 mddev->degraded++;
1673         /*
1674          * If recovery is running, make sure it aborts.
1675          */
1676         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1677         set_bit(Blocked, &rdev->flags);
1678         set_bit(Faulty, &rdev->flags);
1679         set_mask_bits(&mddev->sb_flags, 0,
1680                       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1681         spin_unlock_irqrestore(&conf->device_lock, flags);
1682         pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
1683                 "md/raid10:%s: Operation continuing on %d devices.\n",
1684                 mdname(mddev), bdevname(rdev->bdev, b),
1685                 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1686 }
1687
1688 static void print_conf(struct r10conf *conf)
1689 {
1690         int i;
1691         struct md_rdev *rdev;
1692
1693         pr_debug("RAID10 conf printout:\n");
1694         if (!conf) {
1695                 pr_debug("(!conf)\n");
1696                 return;
1697         }
1698         pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1699                  conf->geo.raid_disks);
1700
1701         /* This is only called with ->reconfig_mutex held, so
1702          * rcu protection of rdev is not needed */
1703         for (i = 0; i < conf->geo.raid_disks; i++) {
1704                 char b[BDEVNAME_SIZE];
1705                 rdev = conf->mirrors[i].rdev;
1706                 if (rdev)
1707                         pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1708                                  i, !test_bit(In_sync, &rdev->flags),
1709                                  !test_bit(Faulty, &rdev->flags),
1710                                  bdevname(rdev->bdev, b));
1711         }
1712 }
1713
1714 static void close_sync(struct r10conf *conf)
1715 {
1716         wait_barrier(conf);
1717         allow_barrier(conf);
1718
1719         mempool_destroy(conf->r10buf_pool);
1720         conf->r10buf_pool = NULL;
1721 }
1722
1723 static int raid10_spare_active(struct mddev *mddev)
1724 {
1725         int i;
1726         struct r10conf *conf = mddev->private;
1727         struct raid10_info *tmp;
1728         int count = 0;
1729         unsigned long flags;
1730
1731         /*
1732          * Find all non-in_sync disks within the RAID10 configuration
1733          * and mark them in_sync
1734          */
1735         for (i = 0; i < conf->geo.raid_disks; i++) {
1736                 tmp = conf->mirrors + i;
1737                 if (tmp->replacement
1738                     && tmp->replacement->recovery_offset == MaxSector
1739                     && !test_bit(Faulty, &tmp->replacement->flags)
1740                     && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1741                         /* Replacement has just become active */
1742                         if (!tmp->rdev
1743                             || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1744                                 count++;
1745                         if (tmp->rdev) {
1746                                 /* Replaced device not technically faulty,
1747                                  * but we need to be sure it gets removed
1748                                  * and never re-added.
1749                                  */
1750                                 set_bit(Faulty, &tmp->rdev->flags);
1751                                 sysfs_notify_dirent_safe(
1752                                         tmp->rdev->sysfs_state);
1753                         }
1754                         sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1755                 } else if (tmp->rdev
1756                            && tmp->rdev->recovery_offset == MaxSector
1757                            && !test_bit(Faulty, &tmp->rdev->flags)
1758                            && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1759                         count++;
1760                         sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1761                 }
1762         }
1763         spin_lock_irqsave(&conf->device_lock, flags);
1764         mddev->degraded -= count;
1765         spin_unlock_irqrestore(&conf->device_lock, flags);
1766
1767         print_conf(conf);
1768         return count;
1769 }
1770
1771 static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1772 {
1773         struct r10conf *conf = mddev->private;
1774         int err = -EEXIST;
1775         int mirror;
1776         int first = 0;
1777         int last = conf->geo.raid_disks - 1;
1778
1779         if (mddev->recovery_cp < MaxSector)
1780                 /* only hot-add to in-sync arrays, as recovery is
1781                  * very different from resync
1782                  */
1783                 return -EBUSY;
1784         if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1))
1785                 return -EINVAL;
1786
1787         if (md_integrity_add_rdev(rdev, mddev))
1788                 return -ENXIO;
1789
1790         if (rdev->raid_disk >= 0)
1791                 first = last = rdev->raid_disk;
1792
1793         if (rdev->saved_raid_disk >= first &&
1794             conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1795                 mirror = rdev->saved_raid_disk;
1796         else
1797                 mirror = first;
1798         for ( ; mirror <= last ; mirror++) {
1799                 struct raid10_info *p = &conf->mirrors[mirror];
1800                 if (p->recovery_disabled == mddev->recovery_disabled)
1801                         continue;
1802                 if (p->rdev) {
1803                         if (!test_bit(WantReplacement, &p->rdev->flags) ||
1804                             p->replacement != NULL)
1805                                 continue;
1806                         clear_bit(In_sync, &rdev->flags);
1807                         set_bit(Replacement, &rdev->flags);
1808                         rdev->raid_disk = mirror;
1809                         err = 0;
1810                         if (mddev->gendisk)
1811                                 disk_stack_limits(mddev->gendisk, rdev->bdev,
1812                                                   rdev->data_offset << 9);
1813                         conf->fullsync = 1;
1814                         rcu_assign_pointer(p->replacement, rdev);
1815                         break;
1816                 }
1817
1818                 if (mddev->gendisk)
1819                         disk_stack_limits(mddev->gendisk, rdev->bdev,
1820                                           rdev->data_offset << 9);
1821
1822                 p->head_position = 0;
1823                 p->recovery_disabled = mddev->recovery_disabled - 1;
1824                 rdev->raid_disk = mirror;
1825                 err = 0;
1826                 if (rdev->saved_raid_disk != mirror)
1827                         conf->fullsync = 1;
1828                 rcu_assign_pointer(p->rdev, rdev);
1829                 break;
1830         }
1831         if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1832                 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1833
1834         print_conf(conf);
1835         return err;
1836 }
1837
1838 static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1839 {
1840         struct r10conf *conf = mddev->private;
1841         int err = 0;
1842         int number = rdev->raid_disk;
1843         struct md_rdev **rdevp;
1844         struct raid10_info *p = conf->mirrors + number;
1845
1846         print_conf(conf);
1847         if (rdev == p->rdev)
1848                 rdevp = &p->rdev;
1849         else if (rdev == p->replacement)
1850                 rdevp = &p->replacement;
1851         else
1852                 return 0;
1853
1854         if (test_bit(In_sync, &rdev->flags) ||
1855             atomic_read(&rdev->nr_pending)) {
1856                 err = -EBUSY;
1857                 goto abort;
1858         }
1859         /* Only remove non-faulty devices if recovery
1860          * is not possible.
1861          */
1862         if (!test_bit(Faulty, &rdev->flags) &&
1863             mddev->recovery_disabled != p->recovery_disabled &&
1864             (!p->replacement || p->replacement == rdev) &&
1865             number < conf->geo.raid_disks &&
1866             enough(conf, -1)) {
1867                 err = -EBUSY;
1868                 goto abort;
1869         }
1870         *rdevp = NULL;
1871         if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1872                 synchronize_rcu();
1873                 if (atomic_read(&rdev->nr_pending)) {
1874                         /* lost the race, try later */
1875                         err = -EBUSY;
1876                         *rdevp = rdev;
1877                         goto abort;
1878                 }
1879         }
1880         if (p->replacement) {
1881                 /* We must have just cleared 'rdev' */
1882                 p->rdev = p->replacement;
1883                 clear_bit(Replacement, &p->replacement->flags);
1884                 smp_mb(); /* Make sure other CPUs may see both as identical
1885                            * but will never see neither -- if they are careful.
1886                            */
1887                 p->replacement = NULL;
1888         }
1889
1890         clear_bit(WantReplacement, &rdev->flags);
1891         err = md_integrity_register(mddev);
1892
1893 abort:
1894
1895         print_conf(conf);
1896         return err;
1897 }
1898
1899 static void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d)
1900 {
1901         struct r10conf *conf = r10_bio->mddev->private;
1902
1903         if (!bio->bi_status)
1904                 set_bit(R10BIO_Uptodate, &r10_bio->state);
1905         else
1906                 /* The write handler will notice the lack of
1907                  * R10BIO_Uptodate and record any errors etc
1908                  */
1909                 atomic_add(r10_bio->sectors,
1910                            &conf->mirrors[d].rdev->corrected_errors);
1911
1912         /* for reconstruct, we always reschedule after a read.
1913          * for resync, only after all reads
1914          */
1915         rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1916         if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1917             atomic_dec_and_test(&r10_bio->remaining)) {
1918                 /* we have read all the blocks,
1919                  * do the comparison in process context in raid10d
1920                  */
1921                 reschedule_retry(r10_bio);
1922         }
1923 }
1924
1925 static void end_sync_read(struct bio *bio)
1926 {
1927         struct r10bio *r10_bio = get_resync_r10bio(bio);
1928         struct r10conf *conf = r10_bio->mddev->private;
1929         int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1930
1931         __end_sync_read(r10_bio, bio, d);
1932 }
1933
1934 static void end_reshape_read(struct bio *bio)
1935 {
1936         /* reshape read bio isn't allocated from r10buf_pool */
1937         struct r10bio *r10_bio = bio->bi_private;
1938
1939         __end_sync_read(r10_bio, bio, r10_bio->read_slot);
1940 }
1941
1942 static void end_sync_request(struct r10bio *r10_bio)
1943 {
1944         struct mddev *mddev = r10_bio->mddev;
1945
1946         while (atomic_dec_and_test(&r10_bio->remaining)) {
1947                 if (r10_bio->master_bio == NULL) {
1948                         /* the primary of several recovery bios */
1949                         sector_t s = r10_bio->sectors;
1950                         if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1951                             test_bit(R10BIO_WriteError, &r10_bio->state))
1952                                 reschedule_retry(r10_bio);
1953                         else
1954                                 put_buf(r10_bio);
1955                         md_done_sync(mddev, s, 1);
1956                         break;
1957                 } else {
1958                         struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1959                         if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1960                             test_bit(R10BIO_WriteError, &r10_bio->state))
1961                                 reschedule_retry(r10_bio);
1962                         else
1963                                 put_buf(r10_bio);
1964                         r10_bio = r10_bio2;
1965                 }
1966         }
1967 }
1968
1969 static void end_sync_write(struct bio *bio)
1970 {
1971         struct r10bio *r10_bio = get_resync_r10bio(bio);
1972         struct mddev *mddev = r10_bio->mddev;
1973         struct r10conf *conf = mddev->private;
1974         int d;
1975         sector_t first_bad;
1976         int bad_sectors;
1977         int slot;
1978         int repl;
1979         struct md_rdev *rdev = NULL;
1980
1981         d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1982         if (repl)
1983                 rdev = conf->mirrors[d].replacement;
1984         else
1985                 rdev = conf->mirrors[d].rdev;
1986
1987         if (bio->bi_status) {
1988                 if (repl)
1989                         md_error(mddev, rdev);
1990                 else {
1991                         set_bit(WriteErrorSeen, &rdev->flags);
1992                         if (!test_and_set_bit(WantReplacement, &rdev->flags))
1993                                 set_bit(MD_RECOVERY_NEEDED,
1994                                         &rdev->mddev->recovery);
1995                         set_bit(R10BIO_WriteError, &r10_bio->state);
1996                 }
1997         } else if (is_badblock(rdev,
1998                              r10_bio->devs[slot].addr,
1999                              r10_bio->sectors,
2000                              &first_bad, &bad_sectors))
2001                 set_bit(R10BIO_MadeGood, &r10_bio->state);
2002
2003         rdev_dec_pending(rdev, mddev);
2004
2005         end_sync_request(r10_bio);
2006 }
2007
2008 /*
2009  * Note: sync and recovery are handled very differently for raid10.
2010  * This code is for resync.
2011  * For resync, we read through virtual addresses and read all blocks.
2012  * If there is any error, we schedule a write.  The lowest numbered
2013  * drive is authoritative.
2014  * However, requests come in by physical address, so we need to map.
2015  * For every physical address there are raid_disks/copies virtual addresses,
2016  * which is always at least one, but is not necessarily an integer.
2017  * This means that a physical address can span multiple chunks, so we may
2018  * have to submit multiple io requests for a single sync request.
2019  */
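/*
 * E.g. (illustrative) with raid_disks == 3 and 2 copies, each physical
 * address corresponds to 3/2 = 1.5 virtual addresses, so the virtual range
 * covered by one sync request can span chunk boundaries and need more than
 * one read per copy.
 */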
2020 /*
2021  * We check if all blocks are in-sync and only write to blocks that
2022  * aren't in sync
2023  */
2024 static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2025 {
2026         struct r10conf *conf = mddev->private;
2027         int i, first;
2028         struct bio *tbio, *fbio;
2029         int vcnt;
2030         struct page **tpages, **fpages;
2031
2032         atomic_set(&r10_bio->remaining, 1);
2033
2034         /* find the first device with a block */
2035         for (i = 0; i < conf->copies; i++)
2036                 if (!r10_bio->devs[i].bio->bi_status)
2037                         break;
2038
2039         if (i == conf->copies)
2040                 goto done;
2041
2042         first = i;
2043         fbio = r10_bio->devs[i].bio;
2044         fbio->bi_iter.bi_size = r10_bio->sectors << 9;
2045         fbio->bi_iter.bi_idx = 0;
2046         fpages = get_resync_pages(fbio)->pages;
2047
2048         vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
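        /*
         * vcnt is the number of PAGE_SIZE pages covering the resync range,
         * rounded up: e.g. with 4KiB pages (8 sectors each), 17 sectors
         * give vcnt == 3.
         */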
2049         /* now find blocks with errors */
2050         for (i = 0; i < conf->copies; i++) {
2051                 int  j, d;
2052                 struct md_rdev *rdev;
2053                 struct resync_pages *rp;
2054
2055                 tbio = r10_bio->devs[i].bio;
2056
2057                 if (tbio->bi_end_io != end_sync_read)
2058                         continue;
2059                 if (i == first)
2060                         continue;
2061
2062                 tpages = get_resync_pages(tbio)->pages;
2063                 d = r10_bio->devs[i].devnum;
2064                 rdev = conf->mirrors[d].rdev;
2065                 if (!r10_bio->devs[i].bio->bi_status) {
2066                         /* We know that the bi_io_vec layout is the same for
2067                          * both 'first' and 'i', so we just compare them.
2068                          * All vec entries are PAGE_SIZE;
2069                          */
2070                         int sectors = r10_bio->sectors;
2071                         for (j = 0; j < vcnt; j++) {
2072                                 int len = PAGE_SIZE;
2073                                 if (sectors < (len / 512))
2074                                         len = sectors * 512;
2075                                 if (memcmp(page_address(fpages[j]),
2076                                            page_address(tpages[j]),
2077                                            len))
2078                                         break;
2079                                 sectors -= len/512;
2080                         }
2081                         if (j == vcnt)
2082                                 continue;
2083                         atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2084                         if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2085                                 /* Don't fix anything. */
2086                                 continue;
2087                 } else if (test_bit(FailFast, &rdev->flags)) {
2088                         /* Just give up on this device */
2089                         md_error(rdev->mddev, rdev);
2090                         continue;
2091                 }
2092                 /* Ok, we need to write this bio, either to correct an
2093                  * inconsistency or to correct an unreadable block.
2094                  * First we need to fixup bv_offset, bv_len and
2095                  * bi_vecs, as the read request might have corrupted these
2096                  */
2097                 rp = get_resync_pages(tbio);
2098                 bio_reset(tbio);
2099
2100                 md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size);
2101
2102                 rp->raid_bio = r10_bio;
2103                 tbio->bi_private = rp;
2104                 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
2105                 tbio->bi_end_io = end_sync_write;
2106                 bio_set_op_attrs(tbio, REQ_OP_WRITE, 0);
2107
2108                 bio_copy_data(tbio, fbio);
2109
2110                 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2111                 atomic_inc(&r10_bio->remaining);
2112                 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
2113
2114                 if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
2115                         tbio->bi_opf |= MD_FAILFAST;
2116                 tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
2117                 bio_set_dev(tbio, conf->mirrors[d].rdev->bdev);
2118                 generic_make_request(tbio);
2119         }
2120
2121         /* Now write out to any replacement devices
2122          * that are active
2123          */
2124         for (i = 0; i < conf->copies; i++) {
2125                 int d;
2126
2127                 tbio = r10_bio->devs[i].repl_bio;
2128                 if (!tbio || !tbio->bi_end_io)
2129                         continue;
2130                 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2131                     && r10_bio->devs[i].bio != fbio)
2132                         bio_copy_data(tbio, fbio);
2133                 d = r10_bio->devs[i].devnum;
2134                 atomic_inc(&r10_bio->remaining);
2135                 md_sync_acct(conf->mirrors[d].replacement->bdev,
2136                              bio_sectors(tbio));
2137                 generic_make_request(tbio);
2138         }
2139
2140 done:
2141         if (atomic_dec_and_test(&r10_bio->remaining)) {
2142                 md_done_sync(mddev, r10_bio->sectors, 1);
2143                 put_buf(r10_bio);
2144         }
2145 }
2146
2147 /*
2148  * Now for the recovery code.
2149  * Recovery happens across physical sectors.
2150  * We recover all non-in_sync drives by finding the virtual address of
2151  * each, and then choose a working drive that also has that virt address.
2152  * There is a separate r10_bio for each non-in_sync drive.
2153  * Only the first two slots are in use. The first for reading,
2154  * the second for writing.
2155  *
2156  */
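/*
 * Concretely, in fix_recovery_read_error() below, devs[0] is the in-sync
 * source drive for the virtual address being recovered and devs[1] is the
 * drive (or its replacement) being rebuilt.
 */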
2157 static void fix_recovery_read_error(struct r10bio *r10_bio)
2158 {
2159         /* We got a read error during recovery.
2160          * We repeat the read in smaller page-sized sections.
2161          * If a read succeeds, write it to the new device or record
2162          * a bad block if we cannot.
2163          * If a read fails, record a bad block on both old and
2164          * new devices.
2165          */
2166         struct mddev *mddev = r10_bio->mddev;
2167         struct r10conf *conf = mddev->private;
2168         struct bio *bio = r10_bio->devs[0].bio;
2169         sector_t sect = 0;
2170         int sectors = r10_bio->sectors;
2171         int idx = 0;
2172         int dr = r10_bio->devs[0].devnum;
2173         int dw = r10_bio->devs[1].devnum;
2174         struct page **pages = get_resync_pages(bio)->pages;
2175
2176         while (sectors) {
2177                 int s = sectors;
2178                 struct md_rdev *rdev;
2179                 sector_t addr;
2180                 int ok;
2181
2182                 if (s > (PAGE_SIZE>>9))
2183                         s = PAGE_SIZE >> 9;
2184
2185                 rdev = conf->mirrors[dr].rdev;
2186                 addr = r10_bio->devs[0].addr + sect;
2187                 ok = sync_page_io(rdev,
2188                                   addr,
2189                                   s << 9,
2190                                   pages[idx],
2191                                   REQ_OP_READ, 0, false);
2192                 if (ok) {
2193                         rdev = conf->mirrors[dw].rdev;
2194                         addr = r10_bio->devs[1].addr + sect;
2195                         ok = sync_page_io(rdev,
2196                                           addr,
2197                                           s << 9,
2198                                           pages[idx],
2199                                           REQ_OP_WRITE, 0, false);
2200                         if (!ok) {
2201                                 set_bit(WriteErrorSeen, &rdev->flags);
2202                                 if (!test_and_set_bit(WantReplacement,
2203                                                       &rdev->flags))
2204                                         set_bit(MD_RECOVERY_NEEDED,
2205                                                 &rdev->mddev->recovery);
2206                         }
2207                 }
2208                 if (!ok) {
2209                         /* We don't worry if we cannot set a bad block -
2210                          * it really is bad so there is no loss in not
2211                          * recording it yet
2212                          */
2213                         rdev_set_badblocks(rdev, addr, s, 0);
2214
2215                         if (rdev != conf->mirrors[dw].rdev) {
2216                                 /* need bad block on destination too */
2217                                 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2218                                 addr = r10_bio->devs[1].addr + sect;
2219                                 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2220                                 if (!ok) {
2221                                         /* just abort the recovery */
2222                                         pr_notice("md/raid10:%s: recovery aborted due to read error\n",
2223                                                   mdname(mddev));
2224
2225                                         conf->mirrors[dw].recovery_disabled
2226                                                 = mddev->recovery_disabled;
2227                                         set_bit(MD_RECOVERY_INTR,
2228                                                 &mddev->recovery);
2229                                         break;
2230                                 }
2231                         }
2232                 }
2233
2234                 sectors -= s;
2235                 sect += s;
2236                 idx++;
2237         }
2238 }
2239
2240 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2241 {
2242         struct r10conf *conf = mddev->private;
2243         int d;
2244         struct bio *wbio, *wbio2;
2245
2246         if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2247                 fix_recovery_read_error(r10_bio);
2248                 end_sync_request(r10_bio);
2249                 return;
2250         }
2251
2252         /*
2253          * share the pages with the first bio
2254          * and submit the write request
2255          */
2256         d = r10_bio->devs[1].devnum;
2257         wbio = r10_bio->devs[1].bio;
2258         wbio2 = r10_bio->devs[1].repl_bio;
2259         /* Need to test wbio2->bi_end_io before we call
2260          * generic_make_request as if the former is NULL,
2261          * the latter is free to free wbio2.
2262          */
2263         if (wbio2 && !wbio2->bi_end_io)
2264                 wbio2 = NULL;
2265         if (wbio->bi_end_io) {
2266                 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2267                 md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
2268                 generic_make_request(wbio);
2269         }
2270         if (wbio2) {
2271                 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2272                 md_sync_acct(conf->mirrors[d].replacement->bdev,
2273                              bio_sectors(wbio2));
2274                 generic_make_request(wbio2);
2275         }
2276 }
2277
2278 /*
2279  * Used by fix_read_error() to decay the per rdev read_errors.
2280  * We halve the read error count for every hour that has elapsed
2281  * since the last recorded read error.
2282  *
2283  */
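/*
 * Worked example (illustrative numbers): with 40 accumulated read errors
 * and 3 hours since the last one, the count decays to 40 >> 3 = 5.
 */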
2284 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2285 {
2286         long cur_time_mon;
2287         unsigned long hours_since_last;
2288         unsigned int read_errors = atomic_read(&rdev->read_errors);
2289
2290         cur_time_mon = ktime_get_seconds();
2291
2292         if (rdev->last_read_error == 0) {
2293                 /* first time we've seen a read error */
2294                 rdev->last_read_error = cur_time_mon;
2295                 return;
2296         }
2297
2298         hours_since_last = (long)(cur_time_mon -
2299                             rdev->last_read_error) / 3600;
2300
2301         rdev->last_read_error = cur_time_mon;
2302
2303         /*
2304          * if hours_since_last is > the number of bits in read_errors
2305          * just set read errors to 0. We do this to avoid
2306          * overflowing the shift of read_errors by hours_since_last.
2307          */
2308         if (hours_since_last >= 8 * sizeof(read_errors))
2309                 atomic_set(&rdev->read_errors, 0);
2310         else
2311                 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2312 }
2313
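/*
 * Returns 1 if the I/O succeeded, 0 if it failed (and the failure was
 * recorded as a bad block or escalated via md_error()), and -1 if the range
 * overlaps a known bad block (for reads, or for writes while WriteErrorSeen
 * is set) so the I/O is not attempted.
 */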
2314 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2315                             int sectors, struct page *page, int rw)
2316 {
2317         sector_t first_bad;
2318         int bad_sectors;
2319
2320         if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2321             && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2322                 return -1;
2323         if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
2324                 /* success */
2325                 return 1;
2326         if (rw == WRITE) {
2327                 set_bit(WriteErrorSeen, &rdev->flags);
2328                 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2329                         set_bit(MD_RECOVERY_NEEDED,
2330                                 &rdev->mddev->recovery);
2331         }
2332         /* need to record an error - either for the block or the device */
2333         if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2334                 md_error(rdev->mddev, rdev);
2335         return 0;
2336 }
2337
2338 /*
2339  * This is a kernel thread which:
2340  *
2341  *      1.      Retries failed read operations on working mirrors.
2342  *      2.      Updates the raid superblock when problems are encountered.
2343  *      3.      Performs writes following reads for array synchronising.
2344  */
2345
2346 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2347 {
2348         int sect = 0; /* Offset from r10_bio->sector */
2349         int sectors = r10_bio->sectors;
2350         struct md_rdev *rdev;
2351         int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2352         int d = r10_bio->devs[r10_bio->read_slot].devnum;
2353
2354         /* still own a reference to this rdev, so it cannot
2355          * have been cleared recently.
2356          */
2357         rdev = conf->mirrors[d].rdev;
2358
2359         if (test_bit(Faulty, &rdev->flags))
2360                 /* drive has already been failed, just ignore any
2361                  * more fix_read_error() attempts */
2362                 return;
2363
2364         check_decay_read_errors(mddev, rdev);
2365         atomic_inc(&rdev->read_errors);
2366         if (atomic_read(&rdev->read_errors) > max_read_errors) {
2367                 char b[BDEVNAME_SIZE];
2368                 bdevname(rdev->bdev, b);
2369
2370                 pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
2371                           mdname(mddev), b,
2372                           atomic_read(&rdev->read_errors), max_read_errors);
2373                 pr_notice("md/raid10:%s: %s: Failing raid device\n",
2374                           mdname(mddev), b);
2375                 md_error(mddev, rdev);
2376                 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2377                 return;
2378         }
2379
2380         while (sectors) {
2381                 int s = sectors;
2382                 int sl = r10_bio->read_slot;
2383                 int success = 0;
2384                 int start;
2385
2386                 if (s > (PAGE_SIZE>>9))
2387                         s = PAGE_SIZE >> 9;
2388
2389                 rcu_read_lock();
2390                 do {
2391                         sector_t first_bad;
2392                         int bad_sectors;
2393
2394                         d = r10_bio->devs[sl].devnum;
2395                         rdev = rcu_dereference(conf->mirrors[d].rdev);
2396                         if (rdev &&
2397                             test_bit(In_sync, &rdev->flags) &&
2398                             !test_bit(Faulty, &rdev->flags) &&
2399                             is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2400                                         &first_bad, &bad_sectors) == 0) {
2401                                 atomic_inc(&rdev->nr_pending);
2402                                 rcu_read_unlock();
2403                                 success = sync_page_io(rdev,
2404                                                        r10_bio->devs[sl].addr +
2405                                                        sect,
2406                                                        s<<9,
2407                                                        conf->tmppage,
2408                                                        REQ_OP_READ, 0, false);
2409                                 rdev_dec_pending(rdev, mddev);
2410                                 rcu_read_lock();
2411                                 if (success)
2412                                         break;
2413                         }
2414                         sl++;
2415                         if (sl == conf->copies)
2416                                 sl = 0;
2417                 } while (!success && sl != r10_bio->read_slot);
2418                 rcu_read_unlock();
2419
2420                 if (!success) {
2421                         /* Cannot read from anywhere, just mark the block
2422                          * as bad on the first device to discourage future
2423                          * reads.
2424                          */
2425                         int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2426                         rdev = conf->mirrors[dn].rdev;
2427
2428                         if (!rdev_set_badblocks(
2429                                     rdev,
2430                                     r10_bio->devs[r10_bio->read_slot].addr
2431                                     + sect,
2432                                     s, 0)) {
2433                                 md_error(mddev, rdev);
2434                                 r10_bio->devs[r10_bio->read_slot].bio
2435                                         = IO_BLOCKED;
2436                         }
2437                         break;
2438                 }
2439
2440                 start = sl;
2441                 /* write it back and re-read */
2442                 rcu_read_lock();
2443                 while (sl != r10_bio->read_slot) {
2444                         char b[BDEVNAME_SIZE];
2445
2446                         if (sl == 0)
2447                                 sl = conf->copies;
2448                         sl--;
2449                         d = r10_bio->devs[sl].devnum;
2450                         rdev = rcu_dereference(conf->mirrors[d].rdev);
2451                         if (!rdev ||
2452                             test_bit(Faulty, &rdev->flags) ||
2453                             !test_bit(In_sync, &rdev->flags))
2454                                 continue;
2455
2456                         atomic_inc(&rdev->nr_pending);
2457                         rcu_read_unlock();
2458                         if (r10_sync_page_io(rdev,
2459                                              r10_bio->devs[sl].addr +
2460                                              sect,
2461                                              s, conf->tmppage, WRITE)
2462                             == 0) {
2463                                 /* Well, this device is dead */
2464                                 pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
2465                                           mdname(mddev), s,
2466                                           (unsigned long long)(
2467                                                   sect +
2468                                                   choose_data_offset(r10_bio,
2469                                                                      rdev)),
2470                                           bdevname(rdev->bdev, b));
2471                                 pr_notice("md/raid10:%s: %s: failing drive\n",
2472                                           mdname(mddev),
2473                                           bdevname(rdev->bdev, b));
2474                         }
2475                         rdev_dec_pending(rdev, mddev);
2476                         rcu_read_lock();
2477                 }
2478                 sl = start;
2479                 while (sl != r10_bio->read_slot) {
2480                         char b[BDEVNAME_SIZE];
2481
2482                         if (sl == 0)
2483                                 sl = conf->copies;
2484                         sl--;
2485                         d = r10_bio->devs[sl].devnum;
2486                         rdev = rcu_dereference(conf->mirrors[d].rdev);
2487                         if (!rdev ||
2488                             test_bit(Faulty, &rdev->flags) ||
2489                             !test_bit(In_sync, &rdev->flags))
2490                                 continue;
2491
2492                         atomic_inc(&rdev->nr_pending);
2493                         rcu_read_unlock();
2494                         switch (r10_sync_page_io(rdev,
2495                                              r10_bio->devs[sl].addr +
2496                                              sect,
2497                                              s, conf->tmppage,
2498                                                  READ)) {
2499                         case 0:
2500                                 /* Well, this device is dead */
2501                                 pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
2502                                        mdname(mddev), s,
2503                                        (unsigned long long)(
2504                                                sect +
2505                                                choose_data_offset(r10_bio, rdev)),
2506                                        bdevname(rdev->bdev, b));
2507                                 pr_notice("md/raid10:%s: %s: failing drive\n",
2508                                        mdname(mddev),
2509                                        bdevname(rdev->bdev, b));
2510                                 break;
2511                         case 1:
2512                                 pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
2513                                        mdname(mddev), s,
2514                                        (unsigned long long)(
2515                                                sect +
2516                                                choose_data_offset(r10_bio, rdev)),
2517                                        bdevname(rdev->bdev, b));
2518                                 atomic_add(s, &rdev->corrected_errors);
2519                         }
2520
2521                         rdev_dec_pending(rdev, mddev);
2522                         rcu_read_lock();
2523                 }
2524                 rcu_read_unlock();
2525
2526                 sectors -= s;
2527                 sect += s;
2528         }
2529 }
2530
2531 static int narrow_write_error(struct r10bio *r10_bio, int i)
2532 {
2533         struct bio *bio = r10_bio->master_bio;
2534         struct mddev *mddev = r10_bio->mddev;
2535         struct r10conf *conf = mddev->private;
2536         struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2537         /* bio has the data to be written to slot 'i' where
2538          * we just recently had a write error.
2539          * We repeatedly clone the bio and trim down to one block,
2540          * then try the write.  Where the write fails we record
2541          * a bad block.
2542          * It is conceivable that the bio doesn't exactly align with
2543          * blocks.  We must handle this.
2544          *
2545          * We currently own a reference to the rdev.
2546          */
2547
2548         int block_sectors;
2549         sector_t sector;
2550         int sectors;
2551         int sect_to_write = r10_bio->sectors;
2552         int ok = 1;
2553
2554         if (rdev->badblocks.shift < 0)
2555                 return 0;
2556
2557         block_sectors = roundup(1 << rdev->badblocks.shift,
2558                                 bdev_logical_block_size(rdev->bdev) >> 9);
2559         sector = r10_bio->sector;
2560         sectors = ((r10_bio->sector + block_sectors)
2561                    & ~(sector_t)(block_sectors - 1))
2562                 - sector;
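        /*
         * Example (hypothetical numbers, 512-byte logical blocks): with
         * badblocks.shift == 3 the bad-block unit is 8 sectors.  For
         * r10_bio->sector == 1030 the first pass writes 2 sectors up to the
         * 1032 boundary; later passes write whole 8-sector blocks.
         */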
2563
2564         while (sect_to_write) {
2565                 struct bio *wbio;
2566                 sector_t wsector;
2567                 if (sectors > sect_to_write)
2568                         sectors = sect_to_write;
2569                 /* Write at 'sector' for 'sectors' */
2570                 wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
2571                 bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors);
2572                 wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
2573                 wbio->bi_iter.bi_sector = wsector +
2574                                    choose_data_offset(r10_bio, rdev);
2575                 bio_set_dev(wbio, rdev->bdev);
2576                 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
2577
2578                 if (submit_bio_wait(wbio) < 0)
2579                         /* Failure! */
2580                         ok = rdev_set_badblocks(rdev, wsector,
2581                                                 sectors, 0)
2582                                 && ok;
2583
2584                 bio_put(wbio);
2585                 sect_to_write -= sectors;
2586                 sector += sectors;
2587                 sectors = block_sectors;
2588         }
2589         return ok;
2590 }
2591
2592 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2593 {
2594         int slot = r10_bio->read_slot;
2595         struct bio *bio;
2596         struct r10conf *conf = mddev->private;
2597         struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2598
2599         /* we got a read error. Maybe the drive is bad.  Maybe just
2600          * the block and we can fix it.
2601          * We freeze all other IO, and try reading the block from
2602          * other devices.  When we find one, we re-write
2603          * and re-check to verify that this fixes the read error.
2604          * This is all done synchronously while the array is
2605          * frozen.
2606          */
2607         bio = r10_bio->devs[slot].bio;
2608         bio_put(bio);
2609         r10_bio->devs[slot].bio = NULL;
2610
2611         if (mddev->ro)
2612                 r10_bio->devs[slot].bio = IO_BLOCKED;
2613         else if (!test_bit(FailFast, &rdev->flags)) {
2614                 freeze_array(conf, 1);
2615                 fix_read_error(conf, mddev, r10_bio);
2616                 unfreeze_array(conf);
2617         } else
2618                 md_error(mddev, rdev);
2619
2620         rdev_dec_pending(rdev, mddev);
2621         allow_barrier(conf);
2622         r10_bio->state = 0;
2623         raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
2624 }
2625
2626 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2627 {
2628         /* Some sort of write request has finished and it
2629          * succeeded in writing where we thought there was a
2630          * bad block.  So forget the bad block.
2631          * Or possibly it failed, and we need to record
2632          * a bad block.
2633          */
2634         int m;
2635         struct md_rdev *rdev;
2636
2637         if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2638             test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2639                 for (m = 0; m < conf->copies; m++) {
2640                         int dev = r10_bio->devs[m].devnum;
2641                         rdev = conf->mirrors[dev].rdev;
2642                         if (r10_bio->devs[m].bio == NULL)
2643                                 continue;
2644                         if (!r10_bio->devs[m].bio->bi_status) {
2645                                 rdev_clear_badblocks(
2646                                         rdev,
2647                                         r10_bio->devs[m].addr,
2648                                         r10_bio->sectors, 0);
2649                         } else {
2650                                 if (!rdev_set_badblocks(
2651                                             rdev,
2652                                             r10_bio->devs[m].addr,
2653                                             r10_bio->sectors, 0))
2654                                         md_error(conf->mddev, rdev);
2655                         }
2656                         rdev = conf->mirrors[dev].replacement;
2657                         if (r10_bio->devs[m].repl_bio == NULL)
2658                                 continue;
2659
2660                         if (!r10_bio->devs[m].repl_bio->bi_status) {
2661                                 rdev_clear_badblocks(
2662                                         rdev,
2663                                         r10_bio->devs[m].addr,
2664                                         r10_bio->sectors, 0);
2665                         } else {
2666                                 if (!rdev_set_badblocks(
2667                                             rdev,
2668                                             r10_bio->devs[m].addr,
2669                                             r10_bio->sectors, 0))
2670                                         md_error(conf->mddev, rdev);
2671                         }
2672                 }
2673                 put_buf(r10_bio);
2674         } else {
2675                 bool fail = false;
2676                 for (m = 0; m < conf->copies; m++) {
2677                         int dev = r10_bio->devs[m].devnum;
2678                         struct bio *bio = r10_bio->devs[m].bio;
2679                         rdev = conf->mirrors[dev].rdev;
2680                         if (bio == IO_MADE_GOOD) {
2681                                 rdev_clear_badblocks(
2682                                         rdev,
2683                                         r10_bio->devs[m].addr,
2684                                         r10_bio->sectors, 0);
2685                                 rdev_dec_pending(rdev, conf->mddev);
2686                         } else if (bio != NULL && bio->bi_status) {
2687                                 fail = true;
2688                                 if (!narrow_write_error(r10_bio, m)) {
2689                                         md_error(conf->mddev, rdev);
2690                                         set_bit(R10BIO_Degraded,
2691                                                 &r10_bio->state);
2692                                 }
2693                                 rdev_dec_pending(rdev, conf->mddev);
2694                         }
2695                         bio = r10_bio->devs[m].repl_bio;
2696                         rdev = conf->mirrors[dev].replacement;
2697                         if (rdev && bio == IO_MADE_GOOD) {
2698                                 rdev_clear_badblocks(
2699                                         rdev,
2700                                         r10_bio->devs[m].addr,
2701                                         r10_bio->sectors, 0);
2702                                 rdev_dec_pending(rdev, conf->mddev);
2703                         }
2704                 }
2705                 if (fail) {
2706                         spin_lock_irq(&conf->device_lock);
2707                         list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
2708                         conf->nr_queued++;
2709                         spin_unlock_irq(&conf->device_lock);
2710                         /*
2711                          * In case freeze_array() is waiting for condition
2712                          * nr_pending == nr_queued + extra to be true.
2713                          */
2714                         wake_up(&conf->wait_barrier);
2715                         md_wakeup_thread(conf->mddev->thread);
2716                 } else {
2717                         if (test_bit(R10BIO_WriteError,
2718                                      &r10_bio->state))
2719                                 close_write(r10_bio);
2720                         raid_end_bio_io(r10_bio);
2721                 }
2722         }
2723 }
2724
2725 static void raid10d(struct md_thread *thread)
2726 {
2727         struct mddev *mddev = thread->mddev;
2728         struct r10bio *r10_bio;
2729         unsigned long flags;
2730         struct r10conf *conf = mddev->private;
2731         struct list_head *head = &conf->retry_list;
2732         struct blk_plug plug;
2733
2734         md_check_recovery(mddev);
2735
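             /*
              * First complete bios that handle_write_completed() parked
              * on bio_end_io_list; they are only ended once no superblock
              * update is pending (MD_SB_CHANGE_PENDING clear), presumably
              * so that new bad-block records reach the metadata before
              * the writes are reported as finished.
              */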
2736         if (!list_empty_careful(&conf->bio_end_io_list) &&
2737             !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2738                 LIST_HEAD(tmp);
2739                 spin_lock_irqsave(&conf->device_lock, flags);
2740                 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
2741                         while (!list_empty(&conf->bio_end_io_list)) {
2742                                 list_move(conf->bio_end_io_list.prev, &tmp);
2743                                 conf->nr_queued--;
2744                         }
2745                 }
2746                 spin_unlock_irqrestore(&conf->device_lock, flags);
2747                 while (!list_empty(&tmp)) {
2748                         r10_bio = list_first_entry(&tmp, struct r10bio,
2749                                                    retry_list);
2750                         list_del(&r10_bio->retry_list);
2751                         if (mddev->degraded)
2752                                 set_bit(R10BIO_Degraded, &r10_bio->state);
2753
2754                         if (test_bit(R10BIO_WriteError,
2755                                      &r10_bio->state))
2756                                 close_write(r10_bio);
2757                         raid_end_bio_io(r10_bio);
2758                 }
2759         }
2760
2761         blk_start_plug(&plug);
2762         for (;;) {
2763
2764                 flush_pending_writes(conf);
2765
2766                 spin_lock_irqsave(&conf->device_lock, flags);
2767                 if (list_empty(head)) {
2768                         spin_unlock_irqrestore(&conf->device_lock, flags);
2769                         break;
2770                 }
2771                 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2772                 list_del(head->prev);
2773                 conf->nr_queued--;
2774                 spin_unlock_irqrestore(&conf->device_lock, flags);
2775
2776                 mddev = r10_bio->mddev;
2777                 conf = mddev->private;
2778                 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2779                     test_bit(R10BIO_WriteError, &r10_bio->state))
2780                         handle_write_completed(conf, r10_bio);
2781                 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2782                         reshape_request_write(mddev, r10_bio);
2783                 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2784                         sync_request_write(mddev, r10_bio);
2785                 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2786                         recovery_request_write(mddev, r10_bio);
2787                 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2788                         handle_read_error(mddev, r10_bio);
2789                 else
2790                         WARN_ON_ONCE(1);
2791
2792                 cond_resched();
2793                 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
2794                         md_check_recovery(mddev);
2795         }
2796         blk_finish_plug(&plug);
2797 }
2798
2799 static int init_resync(struct r10conf *conf)
2800 {
2801         int buffs;
2802         int i;
2803
2804         buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2805         BUG_ON(conf->r10buf_pool);
2806         conf->have_replacement = 0;
2807         for (i = 0; i < conf->geo.raid_disks; i++)
2808                 if (conf->mirrors[i].replacement)
2809                         conf->have_replacement = 1;
2810         conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2811         if (!conf->r10buf_pool)
2812                 return -ENOMEM;
2813         conf->next_resync = 0;
2814         return 0;
2815 }
2816
2817 static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
2818 {
2819         struct r10bio *r10bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
2820         struct rsync_pages *rp;
2821         struct bio *bio;
2822         int nalloc;
2823         int i;
2824
2825         if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
2826             test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
2827                 nalloc = conf->copies; /* resync */
2828         else
2829                 nalloc = 2; /* recovery */
2830
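             /*
              * bio_reset() wipes bi_private along with the rest of the
              * bio, so preserve the resync_pages pointer across each
              * reset.
              */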
2831         for (i = 0; i < nalloc; i++) {
2832                 bio = r10bio->devs[i].bio;
2833                 rp = bio->bi_private;
2834                 bio_reset(bio);
2835                 bio->bi_private = rp;
2836                 bio = r10bio->devs[i].repl_bio;
2837                 if (bio) {
2838                         rp = bio->bi_private;
2839                         bio_reset(bio);
2840                         bio->bi_private = rp;
2841                 }
2842         }
2843         return r10bio;
2844 }
2845
2846 /*
2847  * Set cluster_sync_high since we need other nodes to add the
2848  * range [cluster_sync_low, cluster_sync_high] to suspend list.
2849  */
2850 static void raid10_set_cluster_sync_high(struct r10conf *conf)
2851 {
2852         sector_t window_size;
2853         int extra_chunk, chunks;
2854
2855         /*
2856          * First, we define a "stripe" as a unit that spans all
2857          * member devices exactly once, so the number of chunks in
2858          * a stripe is raid_disks / near_copies.  If we used
2859          * raid_disks instead, then when near_copies is close to
2860          * raid_disks the resync window would grow linearly with
2861          * raid_disks, and we would suspend a needlessly large IO
2862          * window.  If raid_disks is not divisible by near_copies,
2863          * an extra chunk is needed to ensure the whole "stripe"
2864          * is covered.
2865          */
2866
2867         chunks = conf->geo.raid_disks / conf->geo.near_copies;
2868         if (conf->geo.raid_disks % conf->geo.near_copies == 0)
2869                 extra_chunk = 0;
2870         else
2871                 extra_chunk = 1;
2872         window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
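             /*
              * Worked example (assumed geometry, for illustration only):
              * raid_disks = 5, near_copies = 2 and 512KiB chunks
              * (chunk_sectors = 1024) give chunks = 2, extra_chunk = 1,
              * so window_size = 3 * 1024 = 3072 sectors; the clamp below
              * then raises it to the 32M CLUSTER_RESYNC_WINDOW_SECTORS
              * minimum.
              */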
2873
2874         /*
2875          * At least use a 32M window to align with raid1's resync window
2876          */
2877         window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
2878                         CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
2879
2880         conf->cluster_sync_high = conf->cluster_sync_low + window_size;
2881 }
2882
2883 /*
2884  * perform a "sync" on one "block"
2885  *
2886  * We need to make sure that no normal I/O request - particularly write
2887  * requests - conflict with active sync requests.
2888  *
2889  * This is achieved by tracking pending requests and a 'barrier' concept
2890  * that can be installed to exclude normal IO requests.
2891  *
2892  * Resync and recovery are handled very differently.
2893  * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
2894  *
2895  * For resync, we iterate over virtual addresses, read all copies,
2896  * and update if there are differences.  If only one copy is live,
2897  * skip it.
2898  * For recovery, we iterate over physical addresses, read a good
2899  * value for each non-in_sync drive, and over-write.
2900  *
2901  * So, for recovery we may have several outstanding complex requests for a
2902  * given address, one for each out-of-sync device.  We model this by allocating
2903  * a number of r10_bio structures, one for each out-of-sync device.
2904  * As we set up these structures, we collect all the bios together into a list
2905  * which we then process collectively to add pages, and then process again
2906  * to pass to generic_make_request.
2907  *
2908  * The r10_bio structures are linked using a borrowed master_bio pointer.
2909  * This link is counted in ->remaining.  When the r10_bio that points to NULL
2910  * has its remaining count decremented to 0, the whole complex operation
2911  * is complete.
2912  *
2913  */
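     /*
      * For example, when recovering two out-of-sync devices at the same
      * virtual address, the second r10_bio's master_bio points at the
      * first (whose master_bio is NULL) and the first one's ->remaining
      * is incremented for that link, so the operation only completes
      * once the whole chain has been unwound.
      */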
2914
2915 static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
2916                              int *skipped)
2917 {
2918         struct r10conf *conf = mddev->private;
2919         struct r10bio *r10_bio;
2920         struct bio *biolist = NULL, *bio;
2921         sector_t max_sector, nr_sectors;
2922         int i;
2923         int max_sync;
2924         sector_t sync_blocks;
2925         sector_t sectors_skipped = 0;
2926         int chunks_skipped = 0;
2927         sector_t chunk_mask = conf->geo.chunk_mask;
2928         int page_idx = 0;
2929
2930         if (!conf->r10buf_pool)
2931                 if (init_resync(conf))
2932                         return 0;
2933
2934         /*
2935          * Allow skipping a full rebuild for incremental assembly
2936          * of a clean array, like RAID1 does.
2937          */
2938         if (mddev->bitmap == NULL &&
2939             mddev->recovery_cp == MaxSector &&
2940             mddev->reshape_position == MaxSector &&
2941             !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2942             !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
2943             !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2944             conf->fullsync == 0) {
2945                 *skipped = 1;
2946                 return mddev->dev_sectors - sector_nr;
2947         }
2948
2949  skipped:
2950         max_sector = mddev->dev_sectors;
2951         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2952             test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2953                 max_sector = mddev->resync_max_sectors;
2954         if (sector_nr >= max_sector) {
2955                 conf->cluster_sync_low = 0;
2956                 conf->cluster_sync_high = 0;
2957
2958                 /* If we aborted, we need to abort the
2959                  * sync on the 'current' bitmap chunks (there can
2960                  * be several when recovering multiple devices),
2961                  * as we may have started syncing them but not finished.
2962                  * We can find the current address in
2963                  * mddev->curr_resync, but for recovery,
2964                  * we need to convert that to several
2965                  * virtual addresses.
2966                  */
2967                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2968                         end_reshape(conf);
2969                         close_sync(conf);
2970                         return 0;
2971                 }
2972
2973                 if (mddev->curr_resync < max_sector) { /* aborted */
2974                         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2975                                 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2976                                                 &sync_blocks, 1);
2977                         else for (i = 0; i < conf->geo.raid_disks; i++) {
2978                                 sector_t sect =
2979                                         raid10_find_virt(conf, mddev->curr_resync, i);
2980                                 bitmap_end_sync(mddev->bitmap, sect,
2981                                                 &sync_blocks, 1);
2982                         }
2983                 } else {
2984                         /* completed sync */
2985                         if ((!mddev->bitmap || conf->fullsync)
2986                             && conf->have_replacement
2987                             && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2988                                 /* Completed a full sync so the replacements
2989                                  * are now fully recovered.
2990                                  */
2991                                 rcu_read_lock();
2992                                 for (i = 0; i < conf->geo.raid_disks; i++) {
2993                                         struct md_rdev *rdev =
2994                                                 rcu_dereference(conf->mirrors[i].replacement);
2995                                         if (rdev)
2996                                                 rdev->recovery_offset = MaxSector;
2997                                 }
2998                                 rcu_read_unlock();
2999                         }
3000                         conf->fullsync = 0;
3001                 }
3002                 bitmap_close_sync(mddev->bitmap);
3003                 close_sync(conf);
3004                 *skipped = 1;
3005                 return sectors_skipped;
3006         }
3007
3008         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3009                 return reshape_request(mddev, sector_nr, skipped);
3010
3011         if (chunks_skipped >= conf->geo.raid_disks) {
3012                 /* if there has been nothing to do on any drive,
3013                  * then there is nothing to do at all..
3014                  */
3015                 *skipped = 1;
3016                 return (max_sector - sector_nr) + sectors_skipped;
3017         }
3018
3019         if (max_sector > mddev->resync_max)
3020                 max_sector = mddev->resync_max; /* Don't do IO beyond here */
3021
3022         /* make sure whole request will fit in a chunk - if chunks
3023          * are meaningful
3024          */
3025         if (conf->geo.near_copies < conf->geo.raid_disks &&
3026             max_sector > (sector_nr | chunk_mask))
3027                 max_sector = (sector_nr | chunk_mask) + 1;
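             /*
              * E.g. (illustrative): with 64KiB chunks chunk_mask is 127,
              * so for sector_nr = 300 this gives (300 | 127) + 1 = 384,
              * i.e. the request stops at the next chunk boundary.
              */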
3028
3029         /*
3030          * If there is non-resync activity waiting for a turn, then let it
3031          * through before starting on this new sync request.
3032          */
3033         if (conf->nr_waiting)
3034                 schedule_timeout_uninterruptible(1);
3035
3036         /* Again, very different code for resync and recovery.
3037          * Both must result in an r10bio with a list of bios that
3038          * have bi_end_io, bi_sector, bi_disk set,
3039          * and bi_private set to the r10bio.
3040          * For recovery, we may actually create several r10bios
3041          * with 2 bios in each, that correspond to the bios in the main one.
3042          * In this case, the subordinate r10bios link back through a
3043          * borrowed master_bio pointer, and the counter in the master
3044          * includes a ref from each subordinate.
3045          */
3046         /* First, we decide what to do and set ->bi_end_io
3047          * To end_sync_read if we want to read, and
3048          * end_sync_write if we will want to write.
3049          */
3050
3051         max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
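             /*
              * E.g. assuming 4KiB pages and a 64KiB resync block
              * (RESYNC_PAGES == 16), max_sync starts at 16 << 3 = 128
              * sectors; it may be reduced further below by the bitmap
              * and by bad blocks.
              */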
3052         if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3053                 /* recovery... the complicated one */
3054                 int j;
3055                 r10_bio = NULL;
3056
3057                 for (i = 0 ; i < conf->geo.raid_disks; i++) {
3058                         int still_degraded;
3059                         struct r10bio *rb2;
3060                         sector_t sect;
3061                         int must_sync;
3062                         int any_working;
3063                         struct raid10_info *mirror = &conf->mirrors[i];
3064                         struct md_rdev *mrdev, *mreplace;
3065
3066                         rcu_read_lock();
3067                         mrdev = rcu_dereference(mirror->rdev);
3068                         mreplace = rcu_dereference(mirror->replacement);
3069
3070                         if ((mrdev == NULL ||
3071                              test_bit(Faulty, &mrdev->flags) ||
3072                              test_bit(In_sync, &mrdev->flags)) &&
3073                             (mreplace == NULL ||
3074                              test_bit(Faulty, &mreplace->flags))) {
3075                                 rcu_read_unlock();
3076                                 continue;
3077                         }
3078
3079                         still_degraded = 0;
3080                         /* want to reconstruct this device */
3081                         rb2 = r10_bio;
3082                         sect = raid10_find_virt(conf, sector_nr, i);
3083                         if (sect >= mddev->resync_max_sectors) {
3084                                 /* last stripe is not complete - don't
3085                                  * try to recover this sector.
3086                                  */
3087                                 rcu_read_unlock();
3088                                 continue;
3089                         }
3090                         if (mreplace && test_bit(Faulty, &mreplace->flags))
3091                                 mreplace = NULL;
3092                         /* Unless we are doing a full sync, or a replacement
3093                          * we only need to recover the block if it is set in
3094                          * the bitmap
3095                          */
3096                         must_sync = bitmap_start_sync(mddev->bitmap, sect,
3097                                                       &sync_blocks, 1);
3098                         if (sync_blocks < max_sync)
3099                                 max_sync = sync_blocks;
3100                         if (!must_sync &&
3101                             mreplace == NULL &&
3102                             !conf->fullsync) {
3103                                 /* yep, skip the sync_blocks here, but don't assume
3104                                  * that there will never be anything to do here
3105                                  */
3106                                 chunks_skipped = -1;
3107                                 rcu_read_unlock();
3108                                 continue;
3109                         }
3110                         atomic_inc(&mrdev->nr_pending);
3111                         if (mreplace)
3112                                 atomic_inc(&mreplace->nr_pending);
3113                         rcu_read_unlock();
3114
3115                         r10_bio = raid10_alloc_init_r10buf(conf);
3116                         r10_bio->state = 0;
3117                         raise_barrier(conf, rb2 != NULL);
3118                         atomic_set(&r10_bio->remaining, 0);
3119
3120                         r10_bio->master_bio = (struct bio*)rb2;
3121                         if (rb2)
3122                                 atomic_inc(&rb2->remaining);
3123                         r10_bio->mddev = mddev;
3124                         set_bit(R10BIO_IsRecover, &r10_bio->state);
3125                         r10_bio->sector = sect;
3126
3127                         raid10_find_phys(conf, r10_bio);
3128
3129                         /* Need to check if the array will still be
3130                          * degraded
3131                          */
3132                         rcu_read_lock();
3133                         for (j = 0; j < conf->geo.raid_disks; j++) {
3134                                 struct md_rdev *rdev = rcu_dereference(
3135                                         conf->mirrors[j].rdev);
3136                                 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3137                                         still_degraded = 1;
3138                                         break;
3139                                 }
3140                         }
3141
3142                         must_sync = bitmap_start_sync(mddev->bitmap, sect,
3143                                                       &sync_blocks, still_degraded);
3144
3145                         any_working = 0;
3146                         for (j = 0; j < conf->copies; j++) {
3147                                 int k;
3148                                 int d = r10_bio->devs[j].devnum;
3149                                 sector_t from_addr, to_addr;
3150                                 struct md_rdev *rdev =
3151                                         rcu_dereference(conf->mirrors[d].rdev);
3152                                 sector_t sector, first_bad;
3153                                 int bad_sectors;
3154                                 if (!rdev ||
3155                                     !test_bit(In_sync, &rdev->flags))
3156                                         continue;
3157                                 /* This is where we read from */
3158                                 any_working = 1;
3159                                 sector = r10_bio->devs[j].addr;
3160
3161                                 if (is_badblock(rdev, sector, max_sync,
3162                                                 &first_bad, &bad_sectors)) {
3163                                         if (first_bad > sector)
3164                                                 max_sync = first_bad - sector;
3165                                         else {
3166                                                 bad_sectors -= (sector
3167                                                                 - first_bad);
3168                                                 if (max_sync > bad_sectors)
3169                                                         max_sync = bad_sectors;
3170                                                 continue;
3171                                         }
3172                                 }
3173                                 bio = r10_bio->devs[0].bio;
3174                                 bio->bi_next = biolist;
3175                                 biolist = bio;
3176                                 bio->bi_end_io = end_sync_read;
3177                                 bio_set_op_attrs(bio, REQ_OP_READ, 0);
3178                                 if (test_bit(FailFast, &rdev->flags))
3179                                         bio->bi_opf |= MD_FAILFAST;
3180                                 from_addr = r10_bio->devs[j].addr;
3181                                 bio->bi_iter.bi_sector = from_addr +
3182                                         rdev->data_offset;
3183                                 bio_set_dev(bio, rdev->bdev);
3184                                 atomic_inc(&rdev->nr_pending);
3185                                 /* and we write to 'i' (if not in_sync) */
3186
3187                                 for (k = 0; k < conf->copies; k++)
3188                                         if (r10_bio->devs[k].devnum == i)
3189                                                 break;
3190                                 BUG_ON(k == conf->copies);
3191                                 to_addr = r10_bio->devs[k].addr;
3192                                 r10_bio->devs[0].devnum = d;
3193                                 r10_bio->devs[0].addr = from_addr;
3194                                 r10_bio->devs[1].devnum = i;
3195                                 r10_bio->devs[1].addr = to_addr;
3196
3197                                 if (!test_bit(In_sync, &mrdev->flags)) {
3198                                         bio = r10_bio->devs[1].bio;
3199                                         bio->bi_next = biolist;
3200                                         biolist = bio;
3201                                         bio->bi_end_io = end_sync_write;
3202                                         bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3203                                         bio->bi_iter.bi_sector = to_addr
3204                                                 + mrdev->data_offset;
3205                                         bio_set_dev(bio, mrdev->bdev);
3206                                         atomic_inc(&r10_bio->remaining);
3207                                 } else
3208                                         r10_bio->devs[1].bio->bi_end_io = NULL;
3209
3210                                 /* and maybe write to replacement */
3211                                 bio = r10_bio->devs[1].repl_bio;
3212                                 if (bio)
3213                                         bio->bi_end_io = NULL;
3214                                 /* Note: if mreplace != NULL, then bio
3215                                  * cannot be NULL as r10buf_pool_alloc will
3216                                  * have allocated it.
3217                                  * So the second test here is pointless.
3218                                  * But it keeps semantic-checkers happy, and
3219                                  * this comment keeps human reviewers
3220                                  * happy.
3221                                  */
3222                                 if (mreplace == NULL || bio == NULL ||
3223                                     test_bit(Faulty, &mreplace->flags))
3224                                         break;
3225                                 bio->bi_next = biolist;
3226                                 biolist = bio;
3227                                 bio->bi_end_io = end_sync_write;
3228                                 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3229                                 bio->bi_iter.bi_sector = to_addr +
3230                                         mreplace->data_offset;
3231                                 bio_set_dev(bio, mreplace->bdev);
3232                                 atomic_inc(&r10_bio->remaining);
3233                                 break;
3234                         }
3235                         rcu_read_unlock();
3236                         if (j == conf->copies) {
3237                                 /* Cannot recover, so abort the recovery or
3238                                  * record a bad block */
3239                                 if (any_working) {
3240                                         /* problem is that there are bad blocks
3241                                          * on other device(s)
3242                                          */
3243                                         int k;
3244                                         for (k = 0; k < conf->copies; k++)
3245                                                 if (r10_bio->devs[k].devnum == i)
3246                                                         break;
3247                                         if (!test_bit(In_sync,
3248                                                       &mrdev->flags)
3249                                             && !rdev_set_badblocks(
3250                                                     mrdev,
3251                                                     r10_bio->devs[k].addr,
3252                                                     max_sync, 0))
3253                                                 any_working = 0;
3254                                         if (mreplace &&
3255                                             !rdev_set_badblocks(
3256                                                     mreplace,
3257                                                     r10_bio->devs[k].addr,
3258                                                     max_sync, 0))
3259                                                 any_working = 0;
3260                                 }
3261                                 if (!any_working)  {
3262                                         if (!test_and_set_bit(MD_RECOVERY_INTR,
3263                                                               &mddev->recovery))
3264                                                 pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
3265                                                        mdname(mddev));
3266                                         mirror->recovery_disabled
3267                                                 = mddev->recovery_disabled;
3268                                 }
3269                                 put_buf(r10_bio);
3270                                 if (rb2)
3271                                         atomic_dec(&rb2->remaining);
3272                                 r10_bio = rb2;
3273                                 rdev_dec_pending(mrdev, mddev);
3274                                 if (mreplace)
3275                                         rdev_dec_pending(mreplace, mddev);
3276                                 break;
3277                         }
3278                         rdev_dec_pending(mrdev, mddev);
3279                         if (mreplace)
3280                                 rdev_dec_pending(mreplace, mddev);
3281                         if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
3282                                 /* Only want this if there is elsewhere to
3283                                  * read from. 'j' is currently the first
3284                                  * readable copy.
3285                                  */
3286                                 int targets = 1;
3287                                 for (; j < conf->copies; j++) {
3288                                         int d = r10_bio->devs[j].devnum;
3289                                         if (conf->mirrors[d].rdev &&
3290                                             test_bit(In_sync,
3291                                                       &conf->mirrors[d].rdev->flags))
3292                                                 targets++;
3293                                 }
3294                                 if (targets == 1)
3295                                         r10_bio->devs[0].bio->bi_opf
3296                                                 &= ~MD_FAILFAST;
3297                         }
3298                 }
3299                 if (biolist == NULL) {
3300                         while (r10_bio) {
3301                                 struct r10bio *rb2 = r10_bio;
3302                                 r10_bio = (struct r10bio*) rb2->master_bio;
3303                                 rb2->master_bio = NULL;
3304                                 put_buf(rb2);
3305                         }
3306                         goto giveup;
3307                 }
3308         } else {
3309                 /* resync. Schedule a read for every block at this virt offset */
3310                 int count = 0;
3311
3312                 /*
3313                  * Since curr_resync_completed might not be updated in
3314                  * time, and we set cluster_sync_low based on it, check
3315                  * against "sector_nr + 2 * RESYNC_SECTORS" for safety;
3316                  * this ensures curr_resync_completed gets updated in
3317                  * bitmap_cond_end_sync.
3318                  */
3319                 bitmap_cond_end_sync(mddev->bitmap, sector_nr,
3320                                      mddev_is_clustered(mddev) &&
3321                                      (sector_nr + 2 * RESYNC_SECTORS >
3322                                       conf->cluster_sync_high));
3323
3324                 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3325                                        &sync_blocks, mddev->degraded) &&
3326                     !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3327                                                  &mddev->recovery)) {
3328                         /* We can skip this block */
3329                         *skipped = 1;
3330                         return sync_blocks + sectors_skipped;
3331                 }
3332                 if (sync_blocks < max_sync)
3333                         max_sync = sync_blocks;
3334                 r10_bio = raid10_alloc_init_r10buf(conf);
3335                 r10_bio->state = 0;
3336
3337                 r10_bio->mddev = mddev;
3338                 atomic_set(&r10_bio->remaining, 0);
3339                 raise_barrier(conf, 0);
3340                 conf->next_resync = sector_nr;
3341
3342                 r10_bio->master_bio = NULL;
3343                 r10_bio->sector = sector_nr;
3344                 set_bit(R10BIO_IsSync, &r10_bio->state);
3345                 raid10_find_phys(conf, r10_bio);
3346                 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3347
3348                 for (i = 0; i < conf->copies; i++) {
3349                         int d = r10_bio->devs[i].devnum;
3350                         sector_t first_bad, sector;
3351                         int bad_sectors;
3352                         struct md_rdev *rdev;
3353
3354                         if (r10_bio->devs[i].repl_bio)
3355                                 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3356
3357                         bio = r10_bio->devs[i].bio;
3358                         bio->bi_status = BLK_STS_IOERR;
3359                         rcu_read_lock();
3360                         rdev = rcu_dereference(conf->mirrors[d].rdev);
3361                         if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3362                                 rcu_read_unlock();
3363                                 continue;
3364                         }
3365                         sector = r10_bio->devs[i].addr;
3366                         if (is_badblock(rdev, sector, max_sync,
3367                                         &first_bad, &bad_sectors)) {
3368                                 if (first_bad > sector)
3369                                         max_sync = first_bad - sector;
3370                                 else {
3371                                         bad_sectors -= (sector - first_bad);
3372                                         if (max_sync > bad_sectors)
3373                                                 max_sync = bad_sectors;
3374                                         rcu_read_unlock();
3375                                         continue;
3376                                 }
3377                         }
3378                         atomic_inc(&rdev->nr_pending);
3379                         atomic_inc(&r10_bio->remaining);
3380                         bio->bi_next = biolist;
3381                         biolist = bio;
3382                         bio->bi_end_io = end_sync_read;
3383                         bio_set_op_attrs(bio, REQ_OP_READ, 0);
3384                         if (test_bit(FailFast, &rdev->flags))
3385                                 bio->bi_opf |= MD_FAILFAST;
3386                         bio->bi_iter.bi_sector = sector + rdev->data_offset;
3387                         bio_set_dev(bio, rdev->bdev);
3388                         count++;
3389
3390                         rdev = rcu_dereference(conf->mirrors[d].replacement);
3391                         if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
3392                                 rcu_read_unlock();
3393                                 continue;
3394                         }
3395                         atomic_inc(&rdev->nr_pending);
3396
3397                         /* Need to set up for writing to the replacement */
3398                         bio = r10_bio->devs[i].repl_bio;
3399                         bio->bi_status = BLK_STS_IOERR;
3400
3401                         sector = r10_bio->devs[i].addr;
3402                         bio->bi_next = biolist;
3403                         biolist = bio;
3404                         bio->bi_end_io = end_sync_write;
3405                         bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
3406                         if (test_bit(FailFast, &rdev->flags))
3407                                 bio->bi_opf |= MD_FAILFAST;
3408                         bio->bi_iter.bi_sector = sector + rdev->data_offset;
3409                         bio_set_dev(bio, rdev->bdev);
3410                         count++;
3411                         rcu_read_unlock();
3412                 }
3413
3414                 if (count < 2) {
3415                         for (i = 0; i < conf->copies; i++) {
3416                                 int d = r10_bio->devs[i].devnum;
3417                                 if (r10_bio->devs[i].bio->bi_end_io)
3418                                         rdev_dec_pending(conf->mirrors[d].rdev,
3419                                                          mddev);
3420                                 if (r10_bio->devs[i].repl_bio &&
3421                                     r10_bio->devs[i].repl_bio->bi_end_io)
3422                                         rdev_dec_pending(
3423                                                 conf->mirrors[d].replacement,
3424                                                 mddev);
3425                         }
3426                         put_buf(r10_bio);
3427                         biolist = NULL;
3428                         goto giveup;
3429                 }
3430         }
3431
3432         nr_sectors = 0;
3433         if (sector_nr + max_sync < max_sector)
3434                 max_sector = sector_nr + max_sync;
3435         do {
3436                 struct page *page;
3437                 int len = PAGE_SIZE;
3438                 if (sector_nr + (len>>9) > max_sector)
3439                         len = (max_sector - sector_nr) << 9;
3440                 if (len == 0)
3441                         break;
3442                 for (bio= biolist ; bio ; bio=bio->bi_next) {
3443                         struct resync_pages *rp = get_resync_pages(bio);
3444                         page = resync_fetch_page(rp, page_idx);
3445                         /*
3446                          * won't fail because the vec table is big enough
3447                          * to hold all these pages
3448                          */
3449                         bio_add_page(bio, page, len, 0);
3450                 }
3451                 nr_sectors += len>>9;
3452                 sector_nr += len>>9;
3453         } while (++page_idx < RESYNC_PAGES);
3454         r10_bio->sectors = nr_sectors;
3455
3456         if (mddev_is_clustered(mddev) &&
3457             test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3458                 /* It is resync not recovery */
3459                 if (conf->cluster_sync_high < sector_nr + nr_sectors) {
3460                         conf->cluster_sync_low = mddev->curr_resync_completed;
3461                         raid10_set_cluster_sync_high(conf);
3462                         /* Send resync message */
3463                         md_cluster_ops->resync_info_update(mddev,
3464                                                 conf->cluster_sync_low,
3465                                                 conf->cluster_sync_high);
3466                 }
3467         } else if (mddev_is_clustered(mddev)) {
3468                 /* This is recovery not resync */
3469                 sector_t sect_va1, sect_va2;
3470                 bool broadcast_msg = false;
3471
3472                 for (i = 0; i < conf->geo.raid_disks; i++) {
3473                         /*
3474                          * sector_nr is a device address for recovery, so we
3475                          * need translate it to array address before compare
3476                          * with cluster_sync_high.
3477                          */
3478                         sect_va1 = raid10_find_virt(conf, sector_nr, i);
3479
3480                         if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
3481                                 broadcast_msg = true;
3482                                 /*
3483                                  * curr_resync_completed is similar as
3484                                  * sector_nr, so make the translation too.
3485                                  */
3486                                 sect_va2 = raid10_find_virt(conf,
3487                                         mddev->curr_resync_completed, i);
3488
3489                                 if (conf->cluster_sync_low == 0 ||
3490                                     conf->cluster_sync_low > sect_va2)
3491                                         conf->cluster_sync_low = sect_va2;
3492                         }
3493                 }
3494                 if (broadcast_msg) {
3495                         raid10_set_cluster_sync_high(conf);
3496                         md_cluster_ops->resync_info_update(mddev,
3497                                                 conf->cluster_sync_low,
3498                                                 conf->cluster_sync_high);
3499                 }
3500         }
3501
3502         while (biolist) {
3503                 bio = biolist;
3504                 biolist = biolist->bi_next;
3505
3506                 bio->bi_next = NULL;
3507                 r10_bio = get_resync_r10bio(bio);
3508                 r10_bio->sectors = nr_sectors;
3509
3510                 if (bio->bi_end_io == end_sync_read) {
3511                         md_sync_acct_bio(bio, nr_sectors);
3512                         bio->bi_status = 0;
3513                         generic_make_request(bio);
3514                 }
3515         }
3516
3517         if (sectors_skipped)
3518                 /* pretend they weren't skipped, it makes
3519                  * no important difference in this case
3520                  */
3521                 md_done_sync(mddev, sectors_skipped, 1);
3522
3523         return sectors_skipped + nr_sectors;
3524  giveup:
3525         /* There is nowhere to write, so all non-sync
3526          * drives must be failed or in resync, or all drives
3527          * have a bad block, so try the next chunk...
3528          */
3529         if (sector_nr + max_sync < max_sector)
3530                 max_sector = sector_nr + max_sync;
3531
3532         sectors_skipped += (max_sector - sector_nr);
3533         chunks_skipped++;
3534         sector_nr = max_sector;
3535         goto skipped;
3536 }
3537
3538 static sector_t
3539 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3540 {
3541         sector_t size;
3542         struct r10conf *conf = mddev->private;
3543
3544         if (!raid_disks)
3545                 raid_disks = min(conf->geo.raid_disks,
3546                                  conf->prev.raid_disks);
3547         if (!sectors)
3548                 sectors = conf->dev_sectors;
3549
3550         size = sectors >> conf->geo.chunk_shift;
3551         sector_div(size, conf->geo.far_copies);
3552         size = size * raid_disks;
3553         sector_div(size, conf->geo.near_copies);
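             /*
              * Worked example (assumed geometry): raid_disks = 4,
              * near_copies = 2, far_copies = 2, 64KiB chunks
              * (chunk_shift = 7) and sectors = 131072 per device:
              * 131072 >> 7 = 1024 chunks, / 2 (far) = 512, * 4 disks
              * = 2048, / 2 (near) = 1024 chunks, << 7 = 131072 usable
              * sectors, i.e. the raw capacity divided by the 4 copies.
              */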
3554
3555         return size << conf->geo.chunk_shift;
3556 }
3557
3558 static void calc_sectors(struct r10conf *conf, sector_t size)
3559 {
3560         /* Calculate the number of sectors-per-device that will
3561          * actually be used, and set conf->dev_sectors and
3562          * conf->stride
3563          */
3564
3565         size = size >> conf->geo.chunk_shift;
3566         sector_div(size, conf->geo.far_copies);
3567         size = size * conf->geo.raid_disks;
3568         sector_div(size, conf->geo.near_copies);
3569         /* 'size' is now the number of chunks in the array */
3570         /* calculate "used chunks per device" */
3571         size = size * conf->copies;
3572
3573         /* We need to round up when dividing by raid_disks to
3574          * get the stride size.
3575          */
3576         size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3577
3578         conf->dev_sectors = size << conf->geo.chunk_shift;
3579
3580         if (conf->geo.far_offset)
3581                 conf->geo.stride = 1 << conf->geo.chunk_shift;
3582         else {
3583                 sector_div(size, conf->geo.far_copies);
3584                 conf->geo.stride = size << conf->geo.chunk_shift;
3585         }
3586 }
3587
3588 enum geo_type {geo_new, geo_old, geo_start};
3589 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3590 {
3591         int nc, fc, fo;
3592         int layout, chunk, disks;
3593         switch (new) {
3594         case geo_old:
3595                 layout = mddev->layout;
3596                 chunk = mddev->chunk_sectors;
3597                 disks = mddev->raid_disks - mddev->delta_disks;
3598                 break;
3599         case geo_new:
3600                 layout = mddev->new_layout;
3601                 chunk = mddev->new_chunk_sectors;
3602                 disks = mddev->raid_disks;
3603                 break;
3604         default: /* avoid 'may be unused' warnings */
3605         case geo_start: /* new when starting reshape - raid_disks not
3606                          * updated yet. */
3607                 layout = mddev->new_layout;
3608                 chunk = mddev->new_chunk_sectors;
3609                 disks = mddev->raid_disks + mddev->delta_disks;
3610                 break;
3611         }
3612         if (layout >> 19)
3613                 return -1;
3614         if (chunk < (PAGE_SIZE >> 9) ||
3615             !is_power_of_2(chunk))
3616                 return -2;
3617         nc = layout & 255;
3618         fc = (layout >> 8) & 255;
3619         fo = layout & (1<<16);
3620         geo->raid_disks = disks;
3621         geo->near_copies = nc;
3622         geo->far_copies = fc;
3623         geo->far_offset = fo;
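             /*
              * Illustrative decode: layout 0x102 gives nc = 2, fc = 1,
              * fo = 0 (the common "near=2" layout) and layout >> 17 == 0,
              * which selects the original far_set_size below; an
              * old-style "far=2" array would instead have layout 0x201
              * (nc = 1, fc = 2).
              */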
3624         switch (layout >> 17) {
3625         case 0: /* original layout.  simple but not always optimal */
3626                 geo->far_set_size = disks;
3627                 break;
3628         case 1: /* "improved" layout which was buggy.  Hopefully no-one is
3629                  * actually using this, but leave code here just in case.*/
3630                 geo->far_set_size = disks/fc;
3631                 WARN(geo->far_set_size < fc,
3632                      "This RAID10 layout does not provide data safety - please backup and create new array\n");
3633                 break;
3634         case 2: /* "improved" layout fixed to match documentation */
3635                 geo->far_set_size = fc * nc;
3636                 break;
3637         default: /* Not a valid layout */
3638                 return -1;
3639         }
3640         geo->chunk_mask = chunk - 1;
3641         geo->chunk_shift = ffz(~chunk);
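             /*
              * E.g. (illustrative): a 64KiB chunk is 128 sectors, giving
              * chunk_mask = 127 and chunk_shift = ffz(~128) = 7.
              */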
3642         return nc*fc;
3643 }
3644
3645 static struct r10conf *setup_conf(struct mddev *mddev)
3646 {
3647         struct r10conf *conf = NULL;
3648         int err = -EINVAL;
3649         struct geom geo;
3650         int copies;
3651
3652         copies = setup_geo(&geo, mddev, geo_new);
3653
3654         if (copies == -2) {
3655                 pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
3656                         mdname(mddev), PAGE_SIZE);
3657                 goto out;
3658         }
3659
3660         if (copies < 2 || copies > mddev->raid_disks) {
3661                 pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3662                         mdname(mddev), mddev->new_layout);
3663                 goto out;
3664         }
3665
3666         err = -ENOMEM;
3667         conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3668         if (!conf)
3669                 goto out;
3670
3671         /* FIXME calc properly */
3672         conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3673                                                             max(0,-mddev->delta_disks)),
3674                                 GFP_KERNEL);
3675         if (!conf->mirrors)
3676                 goto out;
3677
3678         conf->tmppage = alloc_page(GFP_KERNEL);
3679         if (!conf->tmppage)
3680                 goto out;
3681
3682         conf->geo = geo;
3683         conf->copies = copies;
3684         conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3685                                            r10bio_pool_free, conf);
3686         if (!conf->r10bio_pool)
3687                 goto out;
3688
3689         conf->bio_split = bioset_create(BIO_POOL_SIZE, 0, 0);
3690         if (!conf->bio_split)
3691                 goto out;
3692
3693         calc_sectors(conf, mddev->dev_sectors);
3694         if (mddev->reshape_position == MaxSector) {
3695                 conf->prev = conf->geo;
3696                 conf->reshape_progress = MaxSector;
3697         } else {
3698                 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3699                         err = -EINVAL;
3700                         goto out;
3701                 }
3702                 conf->reshape_progress = mddev->reshape_position;
3703                 if (conf->prev.far_offset)
3704                         conf->prev.stride = 1 << conf->prev.chunk_shift;
3705                 else
3706                         /* far_copies must be 1 */
3707                         conf->prev.stride = conf->dev_sectors;
3708         }
3709         conf->reshape_safe = conf->reshape_progress;
3710         spin_lock_init(&conf->device_lock);
3711         INIT_LIST_HEAD(&conf->retry_list);
3712         INIT_LIST_HEAD(&conf->bio_end_io_list);
3713
3714         spin_lock_init(&conf->resync_lock);
3715         init_waitqueue_head(&conf->wait_barrier);
3716         atomic_set(&conf->nr_pending, 0);
3717
3718         conf->thread = md_register_thread(raid10d, mddev, "raid10");
3719         if (!conf->thread)
3720                 goto out;
3721
3722         conf->mddev = mddev;
3723         return conf;
3724
3725  out:
3726         if (conf) {
3727                 mempool_destroy(conf->r10bio_pool);
3728                 kfree(conf->mirrors);
3729                 safe_put_page(conf->tmppage);
3730                 if (conf->bio_split)
3731                         bioset_free(conf->bio_split);
3732                 kfree(conf);
3733         }
3734         return ERR_PTR(err);
3735 }
3736
3737 static int raid10_run(struct mddev *mddev)
3738 {
3739         struct r10conf *conf;
3740         int i, disk_idx, chunk_size;
3741         struct raid10_info *disk;
3742         struct md_rdev *rdev;
3743         sector_t size;
3744         sector_t min_offset_diff = 0;
3745         int first = 1;
3746         bool discard_supported = false;
3747
3748         if (mddev_init_writes_pending(mddev) < 0)
3749                 return -ENOMEM;
3750
3751         if (mddev->private == NULL) {
3752                 conf = setup_conf(mddev);
3753                 if (IS_ERR(conf))
3754                         return PTR_ERR(conf);
3755                 mddev->private = conf;
3756         }
3757         conf = mddev->private;
3758         if (!conf)
3759                 goto out;
3760
3761         if (mddev_is_clustered(conf->mddev)) {
3762                 int fc, fo;
3763
3764                 fc = (mddev->layout >> 8) & 255;
3765                 fo = mddev->layout & (1<<16);
3766                 if (fc > 1 || fo > 0) {
3767                         pr_err("only near layout is supported by clustered"
3768                                 " raid10\n");
3769                         goto out;
3770                 }
3771         }
3772
3773         mddev->thread = conf->thread;
3774         conf->thread = NULL;
3775
3776         chunk_size = mddev->chunk_sectors << 9;
3777         if (mddev->queue) {
3778                 blk_queue_max_discard_sectors(mddev->queue,
3779                                               mddev->chunk_sectors);
3780                 blk_queue_max_write_same_sectors(mddev->queue, 0);
3781                 blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
3782                 blk_queue_io_min(mddev->queue, chunk_size);
3783                 if (conf->geo.raid_disks % conf->geo.near_copies)
3784                         blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3785                 else
3786                         blk_queue_io_opt(mddev->queue, chunk_size *
3787                                          (conf->geo.raid_disks / conf->geo.near_copies));
3788         }
3789
3790         rdev_for_each(rdev, mddev) {
3791                 long long diff;
3792
3793                 disk_idx = rdev->raid_disk;
3794                 if (disk_idx < 0)
3795                         continue;
3796                 if (disk_idx >= conf->geo.raid_disks &&
3797                     disk_idx >= conf->prev.raid_disks)
3798                         continue;
3799                 disk = conf->mirrors + disk_idx;
3800
3801                 if (test_bit(Replacement, &rdev->flags)) {
3802                         if (disk->replacement)
3803                                 goto out_free_conf;
3804                         disk->replacement = rdev;
3805                 } else {
3806                         if (disk->rdev)
3807                                 goto out_free_conf;
3808                         disk->rdev = rdev;
3809                 }
3810                 diff = (rdev->new_data_offset - rdev->data_offset);
3811                 if (!mddev->reshape_backwards)
3812                         diff = -diff;
3813                 if (diff < 0)
3814                         diff = 0;
3815                 if (first || diff < min_offset_diff)
3816                         min_offset_diff = diff;
3817
3818                 if (mddev->gendisk)
3819                         disk_stack_limits(mddev->gendisk, rdev->bdev,
3820                                           rdev->data_offset << 9);
3821
3822                 disk->head_position = 0;
3823
3824                 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3825                         discard_supported = true;
3826                 first = 0;
3827         }
3828
3829         if (mddev->queue) {
3830                 if (discard_supported)
3831                         queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3832                                                 mddev->queue);
3833                 else
3834                         queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3835                                                   mddev->queue);
3836         }
3837         /* need to check that every block has at least one working mirror */
3838         if (!enough(conf, -1)) {
3839                 pr_err("md/raid10:%s: not enough operational mirrors.\n",
3840                        mdname(mddev));
3841                 goto out_free_conf;
3842         }
3843
3844         if (conf->reshape_progress != MaxSector) {
3845                 /* must ensure that shape change is supported */
3846                 if (conf->geo.far_copies != 1 &&
3847                     conf->geo.far_offset == 0)
3848                         goto out_free_conf;
3849                 if (conf->prev.far_copies != 1 &&
3850                     conf->prev.far_offset == 0)
3851                         goto out_free_conf;
3852         }
3853
3854         mddev->degraded = 0;
3855         for (i = 0;
3856              i < conf->geo.raid_disks
3857                      || i < conf->prev.raid_disks;
3858              i++) {
3859
3860                 disk = conf->mirrors + i;
3861
3862                 if (!disk->rdev && disk->replacement) {
3863                         /* The replacement is all we have - use it */
3864                         disk->rdev = disk->replacement;
3865                         disk->replacement = NULL;
3866                         clear_bit(Replacement, &disk->rdev->flags);
3867                 }
3868
3869                 if (!disk->rdev ||
3870                     !test_bit(In_sync, &disk->rdev->flags)) {
3871                         disk->head_position = 0;
3872                         mddev->degraded++;
3873                         if (disk->rdev &&
3874                             disk->rdev->saved_raid_disk < 0)
3875                                 conf->fullsync = 1;
3876                 }
3877                 disk->recovery_disabled = mddev->recovery_disabled - 1;
3878         }
3879
3880         if (mddev->recovery_cp != MaxSector)
3881                 pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
3882                           mdname(mddev));
3883         pr_info("md/raid10:%s: active with %d out of %d devices\n",
3884                 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3885                 conf->geo.raid_disks);
3886         /*
3887          * Ok, everything is just fine now
3888          */
3889         mddev->dev_sectors = conf->dev_sectors;
3890         size = raid10_size(mddev, 0, 0);
3891         md_set_array_sectors(mddev, size);
3892         mddev->resync_max_sectors = size;
3893         set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
3894
3895         if (mddev->queue) {
3896                 int stripe = conf->geo.raid_disks *
3897                         ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3898
3899                 /* Calculate max read-ahead size.
3900                  * We want read-ahead to cover at least two whole
3901                  * stripes.
3902                  */
3903                 stripe /= conf->geo.near_copies;
3904                 if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
3905                         mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
3906         }
3907
3908         if (md_integrity_register(mddev))
3909                 goto out_free_conf;
3910
3911         if (conf->reshape_progress != MaxSector) {
3912                 unsigned long before_length, after_length;
3913
3914                 before_length = ((1 << conf->prev.chunk_shift) *
3915                                  conf->prev.far_copies);
3916                 after_length = ((1 << conf->geo.chunk_shift) *
3917                                 conf->geo.far_copies);
3918
3919                 if (max(before_length, after_length) > min_offset_diff) {
3920                         /* This cannot work */
3921                         pr_warn("md/raid10: offset difference not enough to continue reshape\n");
3922                         goto out_free_conf;
3923                 }
3924                 conf->offset_diff = min_offset_diff;
3925
3926                 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3927                 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3928                 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3929                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3930                 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3931                                                         "reshape");
3932         }
3933
3934         return 0;
3935
3936 out_free_conf:
3937         md_unregister_thread(&mddev->thread);
3938         mempool_destroy(conf->r10bio_pool);
3939         safe_put_page(conf->tmppage);
3940         kfree(conf->mirrors);
3941         kfree(conf);
3942         mddev->private = NULL;
3943 out:
3944         return -EIO;
3945 }
3946
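     /* Release the r10conf and everything it owns when the array is stopped. */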
3947 static void raid10_free(struct mddev *mddev, void *priv)
3948 {
3949         struct r10conf *conf = priv;
3950
3951         mempool_destroy(conf->r10bio_pool);
3952         safe_put_page(conf->tmppage);
3953         kfree(conf->mirrors);
3954         kfree(conf->mirrors_old);
3955         kfree(conf->mirrors_new);
3956         if (conf->bio_split)
3957                 bioset_free(conf->bio_split);
3958         kfree(conf);
3959 }
3960
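     /*
      * Suspend (quiesce != 0) or resume regular I/O by raising or lowering
      * the resync barrier.
      */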
3961 static void raid10_quiesce(struct mddev *mddev, int quiesce)
3962 {
3963         struct r10conf *conf = mddev->private;
3964
3965         if (quiesce)
3966                 raise_barrier(conf, 0);
3967         else
3968                 lower_barrier(conf);
3969 }
3970
3971 static int raid10_resize(struct mddev *mddev, sector_t sectors)
3972 {
3973         /* Resize of 'far' arrays is not supported.
3974          * For 'near' and 'offset' arrays we can set the
3975          * number of sectors used to be an appropriate multiple
3976          * of the chunk size.
3977          * For 'offset', this is far_copies*chunksize.
3978          * For 'near' the multiplier is the LCM of
3979          * near_copies and raid_disks.
3980          * So if far_copies > 1 && !far_offset, fail.
3981          * Else find LCM(raid_disks, near_copies)*far_copies and
3982          * multiply by chunk_size.  Then round to this number.
3983          * This is mostly done by raid10_size()
3984          */
3985         struct r10conf *conf = mddev->private;
3986         sector_t oldsize, size;
3987
3988         if (mddev->reshape_position != MaxSector)
3989                 return -EBUSY;
3990
3991         if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3992                 return -EINVAL;
3993
3994         oldsize = raid10_size(mddev, 0, 0);
3995         size = raid10_size(mddev, sectors, 0);
3996         if (mddev->external_size &&
3997             mddev->array_sectors > size)
3998                 return -EINVAL;
3999         if (mddev->bitmap) {
4000                 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
4001                 if (ret)
4002                         return ret;
4003         }
4004         md_set_array_sectors(mddev, size);
4005         if (sectors > mddev->dev_sectors &&
4006             mddev->recovery_cp > oldsize) {
4007                 mddev->recovery_cp = oldsize;
4008                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4009         }
4010         calc_sectors(conf, sectors);
4011         mddev->dev_sectors = conf->dev_sectors;
4012         mddev->resync_max_sectors = size;
4013         return 0;
4014 }
4015
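     /*
      * Convert a single-zone, non-degraded raid0 into a near-2 raid10:
      * each existing member keeps its data and moves to an even slot,
      * leaving the odd slots empty for the mirror halves to be rebuilt
      * onto later, so raid_disks doubles while the usable size is kept.
      */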
4016 static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
4017 {
4018         struct md_rdev *rdev;
4019         struct r10conf *conf;
4020
4021         if (mddev->degraded > 0) {
4022                 pr_warn("md/raid10:%s: Error: degraded raid0!\n",
4023                         mdname(mddev));
4024                 return ERR_PTR(-EINVAL);
4025         }
4026         sector_div(size, devs);
4027
4028         /* Set new parameters */
4029         mddev->new_level = 10;
4030         /* new layout: far_copies = 1, near_copies = 2 */
4031         mddev->new_layout = (1<<8) + 2;
4032         mddev->new_chunk_sectors = mddev->chunk_sectors;
4033         mddev->delta_disks = mddev->raid_disks;
4034         mddev->raid_disks *= 2;
4035         /* make sure it will not be marked as dirty */
4036         mddev->recovery_cp = MaxSector;
4037         mddev->dev_sectors = size;
4038
4039         conf = setup_conf(mddev);
4040         if (!IS_ERR(conf)) {
4041                 rdev_for_each(rdev, mddev)
4042                         if (rdev->raid_disk >= 0) {
4043                                 rdev->new_raid_disk = rdev->raid_disk * 2;
4044                                 rdev->sectors = size;
4045                         }
4046                 conf->barrier = 1;
4047         }
4048
4049         return conf;
4050 }
4051
4052 static void *raid10_takeover(struct mddev *mddev)
4053 {
4054         struct r0conf *raid0_conf;
4055
4056         /* raid10 can take over:
4057          *  raid0 - providing it has only one zone
4058          */
4059         if (mddev->level == 0) {
4060                 /* for raid0 takeover only one zone is supported */
4061                 raid0_conf = mddev->private;
4062                 if (raid0_conf->nr_strip_zones > 1) {
4063                         pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
4064                                 mdname(mddev));
4065                         return ERR_PTR(-EINVAL);
4066                 }
4067                 return raid10_takeover_raid0(mddev,
4068                         raid0_conf->strip_zone->zone_end,
4069                         raid0_conf->strip_zone->nb_dev);
4070         }
4071         return ERR_PTR(-EINVAL);
4072 }
4073
4074 static int raid10_check_reshape(struct mddev *mddev)
4075 {
4076         /* Called when there is a request to change
4077          * - layout (to ->new_layout)
4078          * - chunk size (to ->new_chunk_sectors)
4079          * - raid_disks (by delta_disks)
4080          * or when trying to restart a reshape that was ongoing.
4081          *
4082          * We need to validate the request and possibly allocate
4083          * space if that might be an issue later.
4084          *
4085          * Currently we reject any reshape of a 'far' mode array,
4086          * allow chunk size to change if new is generally acceptable,
4087          * allow raid_disks to increase, and allow
4088          * a switch between 'near' mode and 'offset' mode.
4089          */
4090         struct r10conf *conf = mddev->private;
4091         struct geom geo;
4092
4093         if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
4094                 return -EINVAL;
4095
4096         if (setup_geo(&geo, mddev, geo_start) != conf->copies)
4097                 /* mustn't change number of copies */
4098                 return -EINVAL;
4099         if (geo.far_copies > 1 && !geo.far_offset)
4100                 /* Cannot switch to 'far' mode */
4101                 return -EINVAL;
4102
4103         if (mddev->array_sectors & geo.chunk_mask)
4104                 /* chunk size is not a factor of array size */
4105                 return -EINVAL;
4106
4107         if (!enough(conf, -1))
4108                 return -EINVAL;
4109
4110         kfree(conf->mirrors_new);
4111         conf->mirrors_new = NULL;
4112         if (mddev->delta_disks > 0) {
4113                 /* allocate new 'mirrors' list */
4114                 conf->mirrors_new =
4115                         kcalloc(mddev->raid_disks + mddev->delta_disks,
4116                                 sizeof(struct raid10_info),
4117                                 GFP_KERNEL);
4119                 if (!conf->mirrors_new)
4120                         return -ENOMEM;
4121         }
4122         return 0;
4123 }
4124
4125 /*
4126  * Need to check if array has failed when deciding whether to:
4127  *  - start an array
4128  *  - remove non-faulty devices
4129  *  - add a spare
4130  *  - allow a reshape
4131  * This determination is simple when no reshape is happening.
4132  * However if there is a reshape, we need to carefully check
4133  * both the before and after sections.
4134  * This is because some failed devices may only affect one
4135  * of the two sections, and some non-in_sync devices may
4136  * be insync in the section most affected by failed devices.
4137  */
4138 static int calc_degraded(struct r10conf *conf)
4139 {
4140         int degraded, degraded2;
4141         int i;
4142
4143         rcu_read_lock();
4144         degraded = 0;
4145         /* 'prev' section first */
4146         for (i = 0; i < conf->prev.raid_disks; i++) {
4147                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4148                 if (!rdev || test_bit(Faulty, &rdev->flags))
4149                         degraded++;
4150                 else if (!test_bit(In_sync, &rdev->flags))
4151                         /* When reducing the number of devices in an
4152                          * array becomes possible, this might not need
4153                          * to count as 'degraded'.  For now it does.
4154                          */
4155                         degraded++;
4156         }
4157         rcu_read_unlock();
4158         if (conf->geo.raid_disks == conf->prev.raid_disks)
4159                 return degraded;
4160         rcu_read_lock();
4161         degraded2 = 0;
4162         for (i = 0; i < conf->geo.raid_disks; i++) {
4163                 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
4164                 if (!rdev || test_bit(Faulty, &rdev->flags))
4165                         degraded2++;
4166                 else if (!test_bit(In_sync, &rdev->flags)) {
4167                         /* If reshape is increasing the number of devices,
4168                          * this section has already been recovered, so
4169                          * it doesn't contribute to degraded.
4170                          * else it does.
4171                          */
4172                         if (conf->geo.raid_disks <= conf->prev.raid_disks)
4173                                 degraded2++;
4174                 }
4175         }
4176         rcu_read_unlock();
4177         if (degraded2 > degraded)
4178                 return degraded2;
4179         return degraded;
4180 }
4181
4182 static int raid10_start_reshape(struct mddev *mddev)
4183 {
4184         /* A 'reshape' has been requested. This commits
4185          * the various 'new' fields and sets MD_RECOVERY_RESHAPE.
4186          * This also checks if there are enough spares and adds them
4187          * to the array.
4188          * We currently require enough spares to make the final
4189          * array non-degraded.  We also require that the difference
4190          * between old and new data_offset - on each device - is
4191          * enough that we never risk over-writing.
4192          */
4193
4194         unsigned long before_length, after_length;
4195         sector_t min_offset_diff = 0;
4196         int first = 1;
4197         struct geom new;
4198         struct r10conf *conf = mddev->private;
4199         struct md_rdev *rdev;
4200         int spares = 0;
4201         int ret;
4202
4203         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4204                 return -EBUSY;
4205
4206         if (setup_geo(&new, mddev, geo_start) != conf->copies)
4207                 return -EINVAL;
4208
4209         before_length = ((1 << conf->prev.chunk_shift) *
4210                          conf->prev.far_copies);
4211         after_length = ((1 << conf->geo.chunk_shift) *
4212                         conf->geo.far_copies);
4213
4214         rdev_for_each(rdev, mddev) {
4215                 if (!test_bit(In_sync, &rdev->flags)
4216                     && !test_bit(Faulty, &rdev->flags))
4217                         spares++;
4218                 if (rdev->raid_disk >= 0) {
4219                         long long diff = (rdev->new_data_offset
4220                                           - rdev->data_offset);
4221                         if (!mddev->reshape_backwards)
4222                                 diff = -diff;
4223                         if (diff < 0)
4224                                 diff = 0;
4225                         if (first || diff < min_offset_diff)
4226                                 min_offset_diff = diff;
4227                         first = 0;
4228                 }
4229         }
4230
4231         if (max(before_length, after_length) > min_offset_diff)
4232                 return -EINVAL;
4233
4234         if (spares < mddev->delta_disks)
4235                 return -EINVAL;
4236
4237         conf->offset_diff = min_offset_diff;
4238         spin_lock_irq(&conf->device_lock);
4239         if (conf->mirrors_new) {
4240                 memcpy(conf->mirrors_new, conf->mirrors,
4241                        sizeof(struct raid10_info)*conf->prev.raid_disks);
4242                 smp_mb();
4243                 kfree(conf->mirrors_old);
4244                 conf->mirrors_old = conf->mirrors;
4245                 conf->mirrors = conf->mirrors_new;
4246                 conf->mirrors_new = NULL;
4247         }
4248         setup_geo(&conf->geo, mddev, geo_start);
4249         smp_mb();
4250         if (mddev->reshape_backwards) {
4251                 sector_t size = raid10_size(mddev, 0, 0);
4252                 if (size < mddev->array_sectors) {
4253                         spin_unlock_irq(&conf->device_lock);
4254                         pr_warn("md/raid10:%s: array size must be reduced before the number of disks\n",
4255                                 mdname(mddev));
4256                         return -EINVAL;
4257                 }
4258                 mddev->resync_max_sectors = size;
4259                 conf->reshape_progress = size;
4260         } else
4261                 conf->reshape_progress = 0;
4262         conf->reshape_safe = conf->reshape_progress;
4263         spin_unlock_irq(&conf->device_lock);
4264
4265         if (mddev->delta_disks && mddev->bitmap) {
4266                 ret = bitmap_resize(mddev->bitmap,
4267                                     raid10_size(mddev, 0,
4268                                                 conf->geo.raid_disks),
4269                                     0, 0);
4270                 if (ret)
4271                         goto abort;
4272         }
4273         if (mddev->delta_disks > 0) {
4274                 rdev_for_each(rdev, mddev)
4275                         if (rdev->raid_disk < 0 &&
4276                             !test_bit(Faulty, &rdev->flags)) {
4277                                 if (raid10_add_disk(mddev, rdev) == 0) {
4278                                         if (rdev->raid_disk >=
4279                                             conf->prev.raid_disks)
4280                                                 set_bit(In_sync, &rdev->flags);
4281                                         else
4282                                                 rdev->recovery_offset = 0;
4283
4284                                         if (sysfs_link_rdev(mddev, rdev))
4285                                                 /* Failure here is OK */;
4286                                 }
4287                         } else if (rdev->raid_disk >= conf->prev.raid_disks
4288                                    && !test_bit(Faulty, &rdev->flags)) {
4289                                 /* This is a spare that was manually added */
4290                                 set_bit(In_sync, &rdev->flags);
4291                         }
4292         }
4293         /* When a reshape changes the number of devices,
4294          * ->degraded is measured against the larger of the
4295          * pre and post numbers.
4296          */
4297         spin_lock_irq(&conf->device_lock);
4298         mddev->degraded = calc_degraded(conf);
4299         spin_unlock_irq(&conf->device_lock);
4300         mddev->raid_disks = conf->geo.raid_disks;
4301         mddev->reshape_position = conf->reshape_progress;
4302         set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4303
4304         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4305         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4306         clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
4307         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4308         set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4309
4310         mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4311                                                 "reshape");
4312         if (!mddev->sync_thread) {
4313                 ret = -EAGAIN;
4314                 goto abort;
4315         }
4316         conf->reshape_checkpoint = jiffies;
4317         md_wakeup_thread(mddev->sync_thread);
4318         md_new_event(mddev);
4319         return 0;
4320
4321 abort:
4322         mddev->recovery = 0;
4323         spin_lock_irq(&conf->device_lock);
4324         conf->geo = conf->prev;
4325         mddev->raid_disks = conf->geo.raid_disks;
4326         rdev_for_each(rdev, mddev)
4327                 rdev->new_data_offset = rdev->data_offset;
4328         smp_wmb();
4329         conf->reshape_progress = MaxSector;
4330         conf->reshape_safe = MaxSector;
4331         mddev->reshape_position = MaxSector;
4332         spin_unlock_irq(&conf->device_lock);
4333         return ret;
4334 }
4335
4336 /* Calculate the last device-address that could contain
4337  * any block from the chunk that includes the array-address 's'
4338  * and report the next address.
4339  * i.e. the address returned will be chunk-aligned and after
4340  * any data that is in the chunk containing 's'.
4341  */
4342 static sector_t last_dev_address(sector_t s, struct geom *geo)
4343 {
4344         s = (s | geo->chunk_mask) + 1;
4345         s >>= geo->chunk_shift;
4346         s *= geo->near_copies;
4347         s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4348         s *= geo->far_copies;
4349         s <<= geo->chunk_shift;
4350         return s;
4351 }
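     /*
      * Illustrative example (not from the original source): with
      * raid_disks = 4, near_copies = 2, far_copies = 1 and 64K chunks
      * (chunk_shift = 7, chunk_mask = 127), last_dev_address(1000) rounds
      * 1000 up to array chunk 8; 8 chunks * 2 copies spread over 4 devices
      * is 4 chunks per device, so it returns 4 << 7 = 512.
      */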
4352
4353 /* Calculate the first device-address that could contain
4354  * any block from the chunk that includes the array-address 's'.
4355  * This too will be the start of a chunk
4356  */
4357 static sector_t first_dev_address(sector_t s, struct geom *geo)
4358 {
4359         s >>= geo->chunk_shift;
4360         s *= geo->near_copies;
4361         sector_div(s, geo->raid_disks);
4362         s *= geo->far_copies;
4363         s <<= geo->chunk_shift;
4364         return s;
4365 }
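     /*
      * Illustrative example (not from the original source): with the same
      * geometry as above, first_dev_address(1000) truncates 1000 to array
      * chunk 7; copy number 7 * 2 = 14 over 4 devices falls in device
      * chunk 3, so it returns 3 << 7 = 384.
      */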
4366
4367 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4368                                 int *skipped)
4369 {
4370         /* We simply copy at most one chunk (smallest of old and new)
4371          * at a time, possibly less if that exceeds RESYNC_PAGES,
4372          * or we hit a bad block or something.
4373          * This might mean we pause for normal IO in the middle of
4374          * a chunk, but that is not a problem as mddev->reshape_position
4375          * can record any location.
4376          *
4377          * If we will want to write to a location that isn't
4378          * yet recorded as 'safe' (i.e. in metadata on disk) then
4379          * we need to flush all reshape requests and update the metadata.
4380          *
4381          * When reshaping forwards (e.g. to more devices), we interpret
4382          * 'safe' as the earliest block which might not have been copied
4383          * down yet.  We divide this by previous stripe size and multiply
4384          * by previous stripe length to get lowest device offset that we
4385          * cannot write to yet.
4386          * We interpret 'sector_nr' as an address that we want to write to.
4387          * From this we use last_dev_address() to find where we might
4388          * write to, and first_dev_address() on the 'safe' position.
4389          * If this 'next' write position is after the 'safe' position,
4390          * we must update the metadata to increase the 'safe' position.
4391          *
4392          * When reshaping backwards, we round in the opposite direction
4393          * and perform the reverse test:  next write position must not be
4394          * less than current safe position.
4395          *
4396          * In all this the minimum difference in data offsets
4397          * (conf->offset_diff - always positive) allows a bit of slack,
4398          * so next can be after 'safe', but not by more than offset_diff
4399          *
4400          * We need to prepare all the bios here before we start any IO
4401          * to ensure the size we choose is acceptable to all devices.
4402          * That means one for each copy for write-out and an extra one for
4403          * read-in.
4404          * We store the read-in bio in ->master_bio and the others in
4405          * ->devs[x].bio and ->devs[x].repl_bio.
4406          */
4407         struct r10conf *conf = mddev->private;
4408         struct r10bio *r10_bio;
4409         sector_t next, safe, last;
4410         int max_sectors;
4411         int nr_sectors;
4412         int s;
4413         struct md_rdev *rdev;
4414         int need_flush = 0;
4415         struct bio *blist;
4416         struct bio *bio, *read_bio;
4417         int sectors_done = 0;
4418         struct page **pages;
4419
4420         if (sector_nr == 0) {
4421                 /* If restarting in the middle, skip the initial sectors */
4422                 if (mddev->reshape_backwards &&
4423                     conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4424                         sector_nr = (raid10_size(mddev, 0, 0)
4425                                      - conf->reshape_progress);
4426                 } else if (!mddev->reshape_backwards &&
4427                            conf->reshape_progress > 0)
4428                         sector_nr = conf->reshape_progress;
4429                 if (sector_nr) {
4430                         mddev->curr_resync_completed = sector_nr;
4431                         sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4432                         *skipped = 1;
4433                         return sector_nr;
4434                 }
4435         }
4436
4437         /* We don't use sector_nr to track where we are up to
4438          * as that doesn't work well for ->reshape_backwards.
4439          * So just use ->reshape_progress.
4440          */
4441         if (mddev->reshape_backwards) {
4442                 /* 'next' is the earliest device address that we might
4443                  * write to for this chunk in the new layout
4444                  */
4445                 next = first_dev_address(conf->reshape_progress - 1,
4446                                          &conf->geo);
4447
4448                 /* 'safe' is the last device address that we might read from
4449                  * in the old layout after a restart
4450                  */
4451                 safe = last_dev_address(conf->reshape_safe - 1,
4452                                         &conf->prev);
4453
4454                 if (next + conf->offset_diff < safe)
4455                         need_flush = 1;
4456
4457                 last = conf->reshape_progress - 1;
4458                 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4459                                                & conf->prev.chunk_mask);
4460                 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4461                         sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4462         } else {
4463                 /* 'next' is after the last device address that we
4464                  * might write to for this chunk in the new layout
4465                  */
4466                 next = last_dev_address(conf->reshape_progress, &conf->geo);
4467
4468                 /* 'safe' is the earliest device address that we might
4469                  * read from in the old layout after a restart
4470                  */
4471                 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4472
4473                 /* Need to update metadata if 'next' might be beyond 'safe'
4474                  * as that would possibly corrupt data
4475                  */
4476                 if (next > safe + conf->offset_diff)
4477                         need_flush = 1;
4478
4479                 sector_nr = conf->reshape_progress;
4480                 last  = sector_nr | (conf->geo.chunk_mask
4481                                      & conf->prev.chunk_mask);
4482
4483                 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4484                         last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4485         }
4486
4487         if (need_flush ||
4488             time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4489                 /* Need to update reshape_position in metadata */
4490                 wait_barrier(conf);
4491                 mddev->reshape_position = conf->reshape_progress;
4492                 if (mddev->reshape_backwards)
4493                         mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4494                                 - conf->reshape_progress;
4495                 else
4496                         mddev->curr_resync_completed = conf->reshape_progress;
4497                 conf->reshape_checkpoint = jiffies;
4498                 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
4499                 md_wakeup_thread(mddev->thread);
4500                 wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
4501                            test_bit(MD_RECOVERY_INTR, &mddev->recovery));
4502                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
4503                         allow_barrier(conf);
4504                         return sectors_done;
4505                 }
4506                 conf->reshape_safe = mddev->reshape_position;
4507                 allow_barrier(conf);
4508         }
4509
4510 read_more:
4511         /* Now schedule reads for blocks from sector_nr to last */
4512         r10_bio = raid10_alloc_init_r10buf(conf);
4513         r10_bio->state = 0;
4514         raise_barrier(conf, sectors_done != 0);
4515         atomic_set(&r10_bio->remaining, 0);
4516         r10_bio->mddev = mddev;
4517         r10_bio->sector = sector_nr;
4518         set_bit(R10BIO_IsReshape, &r10_bio->state);
4519         r10_bio->sectors = last - sector_nr + 1;
4520         rdev = read_balance(conf, r10_bio, &max_sectors);
4521         BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4522
4523         if (!rdev) {
4524                 /* Cannot read from here, so need to record bad blocks
4525                  * on all the target devices.
4526                  */
4527                 // FIXME
4528                 mempool_free(r10_bio, conf->r10buf_pool);
4529                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4530                 return sectors_done;
4531         }
4532
4533         read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4534
4535         bio_set_dev(read_bio, rdev->bdev);
4536         read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4537                                + rdev->data_offset);
4538         read_bio->bi_private = r10_bio;
4539         read_bio->bi_end_io = end_reshape_read;
4540         bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
4541         read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
4542         read_bio->bi_status = 0;
4543         read_bio->bi_vcnt = 0;
4544         read_bio->bi_iter.bi_size = 0;
4545         r10_bio->master_bio = read_bio;
4546         r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4547
4548         /* Now find the locations in the new layout */
4549         __raid10_find_phys(&conf->geo, r10_bio);
4550
4551         blist = read_bio;
4552         read_bio->bi_next = NULL;
4553
4554         rcu_read_lock();
4555         for (s = 0; s < conf->copies*2; s++) {
4556                 struct bio *b;
4557                 int d = r10_bio->devs[s/2].devnum;
4558                 struct md_rdev *rdev2;
4559                 if (s&1) {
4560                         rdev2 = rcu_dereference(conf->mirrors[d].replacement);
4561                         b = r10_bio->devs[s/2].repl_bio;
4562                 } else {
4563                         rdev2 = rcu_dereference(conf->mirrors[d].rdev);
4564                         b = r10_bio->devs[s/2].bio;
4565                 }
4566                 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4567                         continue;
4568
4569                 bio_set_dev(b, rdev2->bdev);
4570                 b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
4571                         rdev2->new_data_offset;
4572                 b->bi_end_io = end_reshape_write;
4573                 bio_set_op_attrs(b, REQ_OP_WRITE, 0);
4574                 b->bi_next = blist;
4575                 blist = b;
4576         }
4577
4578         /* Now add as many pages as possible to all of these bios. */
4579
4580         nr_sectors = 0;
4581         pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4582         for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4583                 struct page *page = pages[s / (PAGE_SIZE >> 9)];
4584                 int len = (max_sectors - s) << 9;
4585                 if (len > PAGE_SIZE)
4586                         len = PAGE_SIZE;
4587                 for (bio = blist; bio ; bio = bio->bi_next) {
4588                         /*
4589                          * won't fail because the vec table is big enough
4590                          * to hold all these pages
4591                          */
4592                         bio_add_page(bio, page, len, 0);
4593                 }
4594                 sector_nr += len >> 9;
4595                 nr_sectors += len >> 9;
4596         }
4597         rcu_read_unlock();
4598         r10_bio->sectors = nr_sectors;
4599
4600         /* Now submit the read */
4601         md_sync_acct_bio(read_bio, r10_bio->sectors);
4602         atomic_inc(&r10_bio->remaining);
4603         read_bio->bi_next = NULL;
4604         generic_make_request(read_bio);
4605         sector_nr += nr_sectors;
4606         sectors_done += nr_sectors;
4607         if (sector_nr <= last)
4608                 goto read_more;
4609
4610         /* Now that we have done the whole section we can
4611          * update reshape_progress
4612          */
4613         if (mddev->reshape_backwards)
4614                 conf->reshape_progress -= sectors_done;
4615         else
4616                 conf->reshape_progress += sectors_done;
4617
4618         return sectors_done;
4619 }
4620
4621 static void end_reshape_request(struct r10bio *r10_bio);
4622 static int handle_reshape_read_error(struct mddev *mddev,
4623                                      struct r10bio *r10_bio);
4624 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4625 {
4626         /* Reshape read completed.  Hopefully we have a block
4627          * to write out.
4628          * If we got a read error then we do sync 1-page reads from
4629          * elsewhere until we find the data - or give up.
4630          */
4631         struct r10conf *conf = mddev->private;
4632         int s;
4633
4634         if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4635                 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4636                         /* Reshape has been aborted */
4637                         md_done_sync(mddev, r10_bio->sectors, 0);
4638                         return;
4639                 }
4640
4641         /* We definitely have the data in the pages, schedule the
4642          * writes.
4643          */
4644         atomic_set(&r10_bio->remaining, 1);
4645         for (s = 0; s < conf->copies*2; s++) {
4646                 struct bio *b;
4647                 int d = r10_bio->devs[s/2].devnum;
4648                 struct md_rdev *rdev;
4649                 rcu_read_lock();
4650                 if (s&1) {
4651                         rdev = rcu_dereference(conf->mirrors[d].replacement);
4652                         b = r10_bio->devs[s/2].repl_bio;
4653                 } else {
4654                         rdev = rcu_dereference(conf->mirrors[d].rdev);
4655                         b = r10_bio->devs[s/2].bio;
4656                 }
4657                 if (!rdev || test_bit(Faulty, &rdev->flags)) {
4658                         rcu_read_unlock();
4659                         continue;
4660                 }
4661                 atomic_inc(&rdev->nr_pending);
4662                 rcu_read_unlock();
4663                 md_sync_acct_bio(b, r10_bio->sectors);
4664                 atomic_inc(&r10_bio->remaining);
4665                 b->bi_next = NULL;
4666                 generic_make_request(b);
4667         }
4668         end_reshape_request(r10_bio);
4669 }
4670
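     /*
      * Called when every block has been moved to the new layout: make the
      * new geometry the only geometry, clear the reshape markers and retune
      * read-ahead for the new stripe width.
      */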
4671 static void end_reshape(struct r10conf *conf)
4672 {
4673         if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4674                 return;
4675
4676         spin_lock_irq(&conf->device_lock);
4677         conf->prev = conf->geo;
4678         md_finish_reshape(conf->mddev);
4679         smp_wmb();
4680         conf->reshape_progress = MaxSector;
4681         conf->reshape_safe = MaxSector;
4682         spin_unlock_irq(&conf->device_lock);
4683
4684         /* read-ahead size must cover two whole stripes, which is
4685          * 2 * (number of data disks, i.e. raid_disks / near_copies) * chunksize
4686          */
4687         if (conf->mddev->queue) {
4688                 int stripe = conf->geo.raid_disks *
4689                         ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4690                 stripe /= conf->geo.near_copies;
4691                 if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
4692                         conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
4693         }
4694         conf->fullsync = 0;
4695 }
4696
4697 static int handle_reshape_read_error(struct mddev *mddev,
4698                                      struct r10bio *r10_bio)
4699 {
4700         /* Use sync reads to get the blocks from somewhere else */
4701         int sectors = r10_bio->sectors;
4702         struct r10conf *conf = mddev->private;
4703         struct r10bio *r10b;
4704         int slot = 0;
4705         int idx = 0;
4706         struct page **pages;
4707
4708         r10b = kmalloc(sizeof(*r10b) +
4709                sizeof(struct r10dev) * conf->copies, GFP_NOIO);
4710         if (!r10b) {
4711                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4712                 return -ENOMEM;
4713         }
4714
4715         /* reshape IOs share pages from .devs[0].bio */
4716         pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
4717
4718         r10b->sector = r10_bio->sector;
4719         __raid10_find_phys(&conf->prev, r10b);
4720
4721         while (sectors) {
4722                 int s = sectors;
4723                 int success = 0;
4724                 int first_slot = slot;
4725
4726                 if (s > (PAGE_SIZE >> 9))
4727                         s = PAGE_SIZE >> 9;
4728
4729                 rcu_read_lock();
4730                 while (!success) {
4731                         int d = r10b->devs[slot].devnum;
4732                         struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4733                         sector_t addr;
4734                         if (rdev == NULL ||
4735                             test_bit(Faulty, &rdev->flags) ||
4736                             !test_bit(In_sync, &rdev->flags))
4737                                 goto failed;
4738
4739                         addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4740                         atomic_inc(&rdev->nr_pending);
4741                         rcu_read_unlock();
4742                         success = sync_page_io(rdev,
4743                                                addr,
4744                                                s << 9,
4745                                                pages[idx],
4746                                                REQ_OP_READ, 0, false);
4747                         rdev_dec_pending(rdev, mddev);
4748                         rcu_read_lock();
4749                         if (success)
4750                                 break;
4751                 failed:
4752                         slot++;
4753                         if (slot >= conf->copies)
4754                                 slot = 0;
4755                         if (slot == first_slot)
4756                                 break;
4757                 }
4758                 rcu_read_unlock();
4759                 if (!success) {
4760                         /* couldn't read this block, must give up */
4761                         set_bit(MD_RECOVERY_INTR,
4762                                 &mddev->recovery);
4763                         kfree(r10b);
4764                         return -EIO;
4765                 }
4766                 sectors -= s;
4767                 idx++;
4768         }
4769         kfree(r10b);
4770         return 0;
4771 }
4772
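     /*
      * Completion handler for one of the per-device writes issued by
      * reshape_request_write(); on error the device is failed via
      * md_error(), then the pending reference is dropped.
      */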
4773 static void end_reshape_write(struct bio *bio)
4774 {
4775         struct r10bio *r10_bio = get_resync_r10bio(bio);
4776         struct mddev *mddev = r10_bio->mddev;
4777         struct r10conf *conf = mddev->private;
4778         int d;
4779         int slot;
4780         int repl;
4781         struct md_rdev *rdev = NULL;
4782
4783         d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4784         if (repl)
4785                 rdev = conf->mirrors[d].replacement;
4786         if (!rdev) {
4787                 smp_mb();
4788                 rdev = conf->mirrors[d].rdev;
4789         }
4790
4791         if (bio->bi_status) {
4792                 /* FIXME should record badblock */
4793                 md_error(mddev, rdev);
4794         }
4795
4796         rdev_dec_pending(rdev, mddev);
4797         end_reshape_request(r10_bio);
4798 }
4799
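     /*
      * Drop one reference on the r10bio; the last completion accounts the
      * sectors to the sync thread and frees the read bio and the r10bio.
      */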
4800 static void end_reshape_request(struct r10bio *r10_bio)
4801 {
4802         if (!atomic_dec_and_test(&r10_bio->remaining))
4803                 return;
4804         md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4805         bio_put(r10_bio->master_bio);
4806         put_buf(r10_bio);
4807 }
4808
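     /*
      * Commit the result of a completed reshape: when disks were added,
      * publish the larger array size; when disks were removed, mark the
      * now-surplus devices as no longer in_sync.  Finally adopt the new
      * layout and chunk size and clear the reshape state.
      */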
4809 static void raid10_finish_reshape(struct mddev *mddev)
4810 {
4811         struct r10conf *conf = mddev->private;
4812
4813         if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4814                 return;
4815
4816         if (mddev->delta_disks > 0) {
4817                 sector_t size = raid10_size(mddev, 0, 0);
4818                 md_set_array_sectors(mddev, size);
4819                 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4820                         mddev->recovery_cp = mddev->resync_max_sectors;
4821                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4822                 }
4823                 mddev->resync_max_sectors = size;
4824                 if (mddev->queue) {
4825                         set_capacity(mddev->gendisk, mddev->array_sectors);
4826                         revalidate_disk(mddev->gendisk);
4827                 }
4828         } else {
4829                 int d;
4830                 rcu_read_lock();
4831                 for (d = conf->geo.raid_disks ;
4832                      d < conf->geo.raid_disks - mddev->delta_disks;
4833                      d++) {
4834                         struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
4835                         if (rdev)
4836                                 clear_bit(In_sync, &rdev->flags);
4837                         rdev = rcu_dereference(conf->mirrors[d].replacement);
4838                         if (rdev)
4839                                 clear_bit(In_sync, &rdev->flags);
4840                 }
4841                 rcu_read_unlock();
4842         }
4843         mddev->layout = mddev->new_layout;
4844         mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4845         mddev->reshape_position = MaxSector;
4846         mddev->delta_disks = 0;
4847         mddev->reshape_backwards = 0;
4848 }
4849
4850 static struct md_personality raid10_personality =
4851 {
4852         .name           = "raid10",
4853         .level          = 10,
4854         .owner          = THIS_MODULE,
4855         .make_request   = raid10_make_request,
4856         .run            = raid10_run,
4857         .free           = raid10_free,
4858         .status         = raid10_status,
4859         .error_handler  = raid10_error,
4860         .hot_add_disk   = raid10_add_disk,
4861         .hot_remove_disk= raid10_remove_disk,
4862         .spare_active   = raid10_spare_active,
4863         .sync_request   = raid10_sync_request,
4864         .quiesce        = raid10_quiesce,
4865         .size           = raid10_size,
4866         .resize         = raid10_resize,
4867         .takeover       = raid10_takeover,
4868         .check_reshape  = raid10_check_reshape,
4869         .start_reshape  = raid10_start_reshape,
4870         .finish_reshape = raid10_finish_reshape,
4871         .congested      = raid10_congested,
4872 };
4873
4874 static int __init raid_init(void)
4875 {
4876         return register_md_personality(&raid10_personality);
4877 }
4878
4879 static void raid_exit(void)
4880 {
4881         unregister_md_personality(&raid10_personality);
4882 }
4883
4884 module_init(raid_init);
4885 module_exit(raid_exit);
4886 MODULE_LICENSE("GPL");
4887 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4888 MODULE_ALIAS("md-personality-9"); /* RAID10 */
4889 MODULE_ALIAS("md-raid10");
4890 MODULE_ALIAS("md-level-10");
4891
4892 module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);