block: Add submit_bio_wait(), remove from md

[linux-2.6-block.git] / drivers / md / raid10.c
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c

index 64d48249c03bf09f73580f4465b59192faa005ab..434586d431150f47f0a6158f8852eec997dc919b 100644 (file)
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
   *    near_copies (stored in low byte of layout)
   *    far_copies (stored in second byte of layout)
   *    far_offset (stored in bit 16 of layout )
+ *    use_far_sets (stored in bit 17 of layout )
   *
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize.  Each device
+ * is divided into far_copies sections.   In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk are stored
+ * (each on a different drive).  The starting device for each section is offset
+ * near_copies from the starting device of the previous section.  Thus there
+ * are (near_copies * far_copies) copies of each chunk, each on a different
+ * drive.  near_copies and far_copies must be at least one, and their product
+ * is at most raid_disks.
   *
   * If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, they are in adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true.  In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size.  The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array.  This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ *    A B C D    A B C D E
+ *      ...         ...
+ *    D A B C    E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ *    [A B] [C D]    [A B] [C D E]
+ *    |...| |...|    |...| | ... |
+ *    [B A] [D C]    [B A] [E C D]
   */
  
  /*
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
         sector_t stripe;
         int dev;
         int slot = 0;
+       int last_far_set_start, last_far_set_size;
+
+       last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+       last_far_set_start *= geo->far_set_size;
+
+       last_far_set_size = geo->far_set_size;
+       last_far_set_size += (geo->raid_disks % geo->far_set_size);
  
         /* now calculate first sector/dev */
         chunk = r10bio->sector >> geo->chunk_shift;
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
         /* and calculate all the others */
         for (n = 0; n < geo->near_copies; n++) {
                 int d = dev;
+               int set;
                 sector_t s = sector;
-               r10bio->devs[slot].addr = sector;
                 r10bio->devs[slot].devnum = d;
+               r10bio->devs[slot].addr = s;
                 slot++;
  
                 for (f = 1; f < geo->far_copies; f++) {
+                       set = d / geo->far_set_size;
                         d += geo->near_copies;
-                       if (d >= geo->raid_disks)
-                               d -= geo->raid_disks;
+
+                       if ((geo->raid_disks % geo->far_set_size) &&
+                           (d > last_far_set_start)) {
+                               d -= last_far_set_start;
+                               d %= last_far_set_size;
+                               d += last_far_set_start;
+                       } else {
+                               d %= geo->far_set_size;
+                               d += geo->far_set_size * set;
+                       }
                         s += geo->stride;
                         r10bio->devs[slot].devnum = d;
                         r10bio->devs[slot].addr = s;
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
          * or recovery, so reshape isn't happening
          */
         struct geom *geo = &conf->geo;
+       int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+       int far_set_size = geo->far_set_size;
+       int last_far_set_start;
+
+       if (geo->raid_disks % geo->far_set_size) {
+               last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+               last_far_set_start *= geo->far_set_size;
+
+               if (dev >= last_far_set_start) {
+                       far_set_size = geo->far_set_size;
+                       far_set_size += (geo->raid_disks % geo->far_set_size);
+                       far_set_start = last_far_set_start;
+               }
+       }
  
         offset = sector & geo->chunk_mask;
         if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
                 chunk = sector >> geo->chunk_shift;
                 fc = sector_div(chunk, geo->far_copies);
                 dev -= fc * geo->near_copies;
-               if (dev < 0)
-                       dev += geo->raid_disks;
+               if (dev < far_set_start)
+                       dev += far_set_size;
         } else {
                 while (sector >= geo->stride) {
                         sector -= geo->stride;
-                       if (dev < geo->near_copies)
-                               dev += geo->raid_disks - geo->near_copies;
+                       if (dev < (geo->near_copies + far_set_start))
+                               dev += far_set_size - geo->near_copies;
                         else
                                 dev -= geo->near_copies;
                 }
@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
                 bio_list_merge(&conf->pending_bio_list, &plug->pending);
                 conf->pending_count += plug->pending_cnt;
                 spin_unlock_irq(&conf->device_lock);
+               wake_up(&conf->wait_barrier);
                 md_wakeup_thread(mddev->thread);
                 kfree(plug);
                 return;
@@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
         const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
         const unsigned long do_discard = (bio->bi_rw
                                           & (REQ_DISCARD | REQ_SECURE));
+       const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
         unsigned long flags;
         struct md_rdev *blocked_rdev;
         struct blk_plug_cb *cb;
@@ -1121,14 +1169,13 @@ static void make_request(struct mddev *mddev, struct bio * bio)
         /* If this request crosses a chunk boundary, we need to
          * split it.  This will only happen for 1 PAGE (or less) requests.
          */
-       if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
+       if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
                      > chunk_sects
                      && (conf->geo.near_copies < conf->geo.raid_disks
                          || conf->prev.near_copies < conf->prev.raid_disks))) {
                 struct bio_pair *bp;
                 /* Sanity check -- queue functions should prevent this happening */
-               if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
-                   bio->bi_idx != 0)
+               if (bio_segments(bio) > 1)
                         goto bad_map;
                 /* This is a one page bio that upper layers
                  * refuse to split for us, so we need to split it.
@@ -1161,7 +1208,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
         bad_map:
                 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
                        " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
-                      (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+                      (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
  
                 bio_io_error(bio);
                 return;
@@ -1176,7 +1223,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
          */
         wait_barrier(conf);
  
-       sectors = bio->bi_size >> 9;
+       sectors = bio_sectors(bio);
         while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
             bio->bi_sector < conf->reshape_progress &&
             bio->bi_sector + sectors > conf->reshape_progress) {
@@ -1278,8 +1325,7 @@ read_again:
                         r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
  
                         r10_bio->master_bio = bio;
-                       r10_bio->sectors = ((bio->bi_size >> 9)
-                                           - sectors_handled);
+                       r10_bio->sectors = bio_sectors(bio) - sectors_handled;
                         r10_bio->state = 0;
                         r10_bio->mddev = mddev;
                         r10_bio->sector = bio->bi_sector + sectors_handled;
@@ -1460,7 +1506,8 @@ retry_write:
                                                               rdev));
                         mbio->bi_bdev = rdev->bdev;
                         mbio->bi_end_io = raid10_end_write_request;
-                       mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+                       mbio->bi_rw =
+                               WRITE | do_sync | do_fua | do_discard | do_same;
                         mbio->bi_private = r10_bio;
  
                         atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1549,8 @@ retry_write:
                                                    r10_bio, rdev));
                         mbio->bi_bdev = rdev->bdev;
                         mbio->bi_end_io = raid10_end_write_request;
-                       mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+                       mbio->bi_rw =
+                               WRITE | do_sync | do_fua | do_discard | do_same;
                         mbio->bi_private = r10_bio;
  
                         atomic_inc(&r10_bio->remaining);
@@ -1519,7 +1567,7 @@ retry_write:
          * after checking if we need to go around again.
          */
  
-       if (sectors_handled < (bio->bi_size >> 9)) {
+       if (sectors_handled < bio_sectors(bio)) {
                 one_write_done(r10_bio);
                 /* We need another r10_bio.  It has already been counted
                  * in bio->bi_phys_segments.
@@ -1527,7 +1575,7 @@ retry_write:
                 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
  
                 r10_bio->master_bio = bio;
-               r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+               r10_bio->sectors = bio_sectors(bio) - sectors_handled;
  
                 r10_bio->mddev = mddev;
                 r10_bio->sector = bio->bi_sector + sectors_handled;
@@ -2053,7 +2101,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
                 d = r10_bio->devs[i].devnum;
                 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
                 atomic_inc(&r10_bio->remaining);
-               md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
  
                 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
                 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
@@ -2078,7 +2126,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
                 d = r10_bio->devs[i].devnum;
                 atomic_inc(&r10_bio->remaining);
                 md_sync_acct(conf->mirrors[d].replacement->bdev,
-                            tbio->bi_size >> 9);
+                            bio_sectors(tbio));
                 generic_make_request(tbio);
         }
  
@@ -2204,13 +2252,13 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
         wbio2 = r10_bio->devs[1].repl_bio;
         if (wbio->bi_end_io) {
                 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-               md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
                 generic_make_request(wbio);
         }
         if (wbio2 && wbio2->bi_end_io) {
                 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
                 md_sync_acct(conf->mirrors[d].replacement->bdev,
-                            wbio2->bi_size >> 9);
+                            bio_sectors(wbio2));
                 generic_make_request(wbio2);
         }
  }
@@ -2481,25 +2529,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
         }
  }
  
-static void bi_complete(struct bio *bio, int error)
-{
-       complete((struct completion *)bio->bi_private);
-}
-
-static int submit_bio_wait(int rw, struct bio *bio)
-{
-       struct completion event;
-       rw |= REQ_SYNC;
-
-       init_completion(&event);
-       bio->bi_private = &event;
-       bio->bi_end_io = bi_complete;
-       submit_bio(rw, bio);
-       wait_for_completion(&event);
-
-       return test_bit(BIO_UPTODATE, &bio->bi_flags);
-}
-
  static int narrow_write_error(struct r10bio *r10_bio, int i)
  {
         struct bio *bio = r10_bio->master_bio;
@@ -2640,8 +2669,7 @@ read_more:
                 r10_bio = mempool_alloc(conf->r10bio_pool,
                                         GFP_NOIO);
                 r10_bio->master_bio = mbio;
-               r10_bio->sectors = (mbio->bi_size >> 9)
-                       - sectors_handled;
+               r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
                 r10_bio->state = 0;
                 set_bit(R10BIO_ReadError,
                         &r10_bio->state);
@@ -3436,7 +3464,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
                 disks = mddev->raid_disks + mddev->delta_disks;
                 break;
         }
-       if (layout >> 17)
+       if (layout >> 18)
                 return -1;
         if (chunk < (PAGE_SIZE >> 9) ||
             !is_power_of_2(chunk))
@@ -3448,6 +3476,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
         geo->near_copies = nc;
         geo->far_copies = fc;
         geo->far_offset = fo;
+       geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
         geo->chunk_mask = chunk - 1;
         geo->chunk_shift = ffz(~chunk);
         return nc*fc;
@@ -3569,6 +3598,8 @@ static int run(struct mddev *mddev)
         if (mddev->queue) {
                 blk_queue_max_discard_sectors(mddev->queue,
                                               mddev->chunk_sectors);
+               blk_queue_max_write_same_sectors(mddev->queue,
+                                                mddev->chunk_sectors);
                 blk_queue_io_min(mddev->queue, chunk_size);
                 if (conf->geo.raid_disks % conf->geo.near_copies)
                         blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
@@ -4336,7 +4367,6 @@ read_more:
         read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
         read_bio->bi_flags |= 1 << BIO_UPTODATE;
         read_bio->bi_vcnt = 0;
-       read_bio->bi_idx = 0;
         read_bio->bi_size = 0;
         r10_bio->master_bio = read_bio;
         r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;