Merge branch 'for-3.10/core' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 8 May 2013 17:13:35 +0000 (10:13 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 8 May 2013 17:13:35 +0000 (10:13 -0700)
Pull block core updates from Jens Axboe:

 - The major bit is Kent's prep work for immutable bio vecs (a short
   sketch of the new bvec iterator follows the shortlog below).

 - Stable-candidate fix for a scheduling-while-atomic bug in the queue
   bypass operation.

 - Fix for the hang that occurs when merging discard bios pushes
   rq->datalen past its 32-bit unsigned limit.

 - Tejun's changes to convert the writeback thread pool to the generic
   workqueue mechanism.

 - Runtime PM framework; the SCSI patches exist on top of these in
   James' tree.

 - A few random fixes.

* 'for-3.10/core' of git://git.kernel.dk/linux-block: (40 commits)
  relay: move remove_buf_file inside relay_close_buf
  partitions/efi.c: replace useless kzalloc's by kmalloc's
  fs/block_dev.c: fix iov_shorten() criteria in blkdev_aio_read()
  block: fix max discard sectors limit
  blkcg: fix "scheduling while atomic" in blk_queue_bypass_start
  Documentation: cfq-iosched: update documentation help for cfq tunables
  writeback: expose the bdi_wq workqueue
  writeback: replace custom worker pool implementation with unbound workqueue
  writeback: remove unused bdi_pending_list
  aoe: Fix unitialized var usage
  bio-integrity: Add explicit field for owner of bip_buf
  block: Add an explicit bio flag for bios that own their bvec
  block: Add bio_alloc_pages()
  block: Convert some code to bio_for_each_segment_all()
  block: Add bio_for_each_segment_all()
  bounce: Refactor __blk_queue_bounce to not use bi_io_vec
  raid1: use bio_copy_data()
  pktcdvd: Use bio_reset() in disabled code to kill bi_idx usage
  pktcdvd: use bio_copy_data()
  block: Add bio_copy_data()
  ...

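A minimal sketch (not part of this merge) of the new per-segment iterator
added by "block: Add bio_for_each_segment_all()" above. Unlike
bio_for_each_segment(), which starts at bi_idx, bio_for_each_segment_all()
walks every bvec from index 0 and is therefore only safe on bios whose bvec
array the caller owns; the helper name zero_owned_bio is invented for
illustration.

#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/string.h>

/* Zero the payload of a bio whose bvecs we own. */
static void zero_owned_bio(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		void *kaddr = kmap_atomic(bvec->bv_page);

		memset(kaddr + bvec->bv_offset, 0, bvec->bv_len);
		kunmap_atomic(kaddr);
	}
}
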
25 files changed:
block/blk-core.c
drivers/block/aoe/aoecmd.c
drivers/block/floppy.c
drivers/block/pktcdvd.c
drivers/block/rbd.c
drivers/md/md.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/message/fusion/mptsas.c
drivers/s390/block/dcssblk.c
drivers/scsi/libsas/sas_expander.c
fs/bio.c
fs/block_dev.c
fs/buffer.c
fs/direct-io.c
fs/fs-writeback.c
fs/gfs2/lops.c
fs/jfs/jfs_logmgr.c
include/linux/blk_types.h
include/linux/blkdev.h
include/trace/events/block.h
kernel/relay.c
mm/bounce.c
mm/page_io.c

diff --combined block/blk-core.c
index 7c288358a745ad2312e93080201e341cf2b69207,f224d1793ee5e5975f427cc9259e587970f19884..33c33bc99ddd5546e6ba30ce267cb436d0328c51
@@@ -30,6 -30,7 +30,7 @@@
  #include <linux/list_sort.h>
  #include <linux/delay.h>
  #include <linux/ratelimit.h>
+ #include <linux/pm_runtime.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/block.h>
@@@ -39,7 -40,6 +40,7 @@@
  
  EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
  EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
  EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
  
  DEFINE_IDA(blk_queue_ida);
@@@ -159,20 -159,10 +160,10 @@@ static void req_bio_endio(struct reques
        else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                error = -EIO;
  
-       if (unlikely(nbytes > bio->bi_size)) {
-               printk(KERN_ERR "%s: want %u bytes done, %u left\n",
-                      __func__, nbytes, bio->bi_size);
-               nbytes = bio->bi_size;
-       }
        if (unlikely(rq->cmd_flags & REQ_QUIET))
                set_bit(BIO_QUIET, &bio->bi_flags);
  
-       bio->bi_size -= nbytes;
-       bio->bi_sector += (nbytes >> 9);
-       if (bio_integrity(bio))
-               bio_integrity_advance(bio, nbytes);
+       bio_advance(bio, nbytes);
  
        /* don't actually finish bio if it's part of flush sequence */
        if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
@@@ -1264,6 -1254,16 +1255,16 @@@ void part_round_stats(int cpu, struct h
  }
  EXPORT_SYMBOL_GPL(part_round_stats);
  
+ #ifdef CONFIG_PM_RUNTIME
+ static void blk_pm_put_request(struct request *rq)
+ {
+       if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending)
+               pm_runtime_mark_last_busy(rq->q->dev);
+ }
+ #else
+ static inline void blk_pm_put_request(struct request *rq) {}
+ #endif
  /*
   * queue lock must be held
   */
@@@ -1274,6 -1274,8 +1275,8 @@@ void __blk_put_request(struct request_q
        if (unlikely(--req->ref_count))
                return;
  
+       blk_pm_put_request(req);
        elv_completed_request(q, req);
  
        /* this is a bio leak */
@@@ -1597,7 -1599,7 +1600,7 @@@ static void handle_bad_sector(struct bi
        printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
                        bdevname(bio->bi_bdev, b),
                        bio->bi_rw,
-                       (unsigned long long)bio->bi_sector + bio_sectors(bio),
+                       (unsigned long long)bio_end_sector(bio),
                        (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
  
        set_bit(BIO_EOF, &bio->bi_flags);
@@@ -2053,6 -2055,28 +2056,28 @@@ static void blk_account_io_done(struct 
        }
  }
  
+ #ifdef CONFIG_PM_RUNTIME
+ /*
+  * Don't process normal requests when queue is suspended
+  * or in the process of suspending/resuming
+  */
+ static struct request *blk_pm_peek_request(struct request_queue *q,
+                                          struct request *rq)
+ {
+       if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
+           (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM))))
+               return NULL;
+       else
+               return rq;
+ }
+ #else
+ static inline struct request *blk_pm_peek_request(struct request_queue *q,
+                                                 struct request *rq)
+ {
+       return rq;
+ }
+ #endif
  /**
   * blk_peek_request - peek at the top of a request queue
   * @q: request queue to peek at
@@@ -2075,6 -2099,11 +2100,11 @@@ struct request *blk_peek_request(struc
        int ret;
  
        while ((rq = __elv_next_request(q)) != NULL) {
+               rq = blk_pm_peek_request(q, rq);
+               if (!rq)
+                       break;
                if (!(rq->cmd_flags & REQ_STARTED)) {
                        /*
                         * This is the first time the device driver
@@@ -2253,8 -2282,7 +2283,7 @@@ EXPORT_SYMBOL(blk_fetch_request)
   **/
  bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
  {
-       int total_bytes, bio_nbytes, next_idx = 0;
-       struct bio *bio;
+       int total_bytes;
  
        if (!req->bio)
                return false;
  
        blk_account_io_completion(req, nr_bytes);
  
-       total_bytes = bio_nbytes = 0;
-       while ((bio = req->bio) != NULL) {
-               int nbytes;
+       total_bytes = 0;
+       while (req->bio) {
+               struct bio *bio = req->bio;
+               unsigned bio_bytes = min(bio->bi_size, nr_bytes);
  
-               if (nr_bytes >= bio->bi_size) {
+               if (bio_bytes == bio->bi_size)
                        req->bio = bio->bi_next;
-                       nbytes = bio->bi_size;
-                       req_bio_endio(req, bio, nbytes, error);
-                       next_idx = 0;
-                       bio_nbytes = 0;
-               } else {
-                       int idx = bio->bi_idx + next_idx;
  
-                       if (unlikely(idx >= bio->bi_vcnt)) {
-                               blk_dump_rq_flags(req, "__end_that");
-                               printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n",
-                                      __func__, idx, bio->bi_vcnt);
-                               break;
-                       }
+               req_bio_endio(req, bio, bio_bytes, error);
  
-                       nbytes = bio_iovec_idx(bio, idx)->bv_len;
-                       BIO_BUG_ON(nbytes > bio->bi_size);
+               total_bytes += bio_bytes;
+               nr_bytes -= bio_bytes;
  
-                       /*
-                        * not a complete bvec done
-                        */
-                       if (unlikely(nbytes > nr_bytes)) {
-                               bio_nbytes += nr_bytes;
-                               total_bytes += nr_bytes;
-                               break;
-                       }
-                       /*
-                        * advance to the next vector
-                        */
-                       next_idx++;
-                       bio_nbytes += nbytes;
-               }
-               total_bytes += nbytes;
-               nr_bytes -= nbytes;
-               bio = req->bio;
-               if (bio) {
-                       /*
-                        * end more in this run, or just return 'not-done'
-                        */
-                       if (unlikely(nr_bytes <= 0))
-                               break;
-               }
+               if (!nr_bytes)
+                       break;
        }
  
        /*
                return false;
        }
  
-       /*
-        * if the request wasn't completed, update state
-        */
-       if (bio_nbytes) {
-               req_bio_endio(req, bio, bio_nbytes, error);
-               bio->bi_idx += next_idx;
-               bio_iovec(bio)->bv_offset += nr_bytes;
-               bio_iovec(bio)->bv_len -= nr_bytes;
-       }
        req->__data_len -= total_bytes;
        req->buffer = bio_data(req->bio);
  
@@@ -3046,6 -3029,149 +3030,149 @@@ void blk_finish_plug(struct blk_plug *p
  }
  EXPORT_SYMBOL(blk_finish_plug);
  
+ #ifdef CONFIG_PM_RUNTIME
+ /**
+  * blk_pm_runtime_init - Block layer runtime PM initialization routine
+  * @q: the queue of the device
+  * @dev: the device the queue belongs to
+  *
+  * Description:
+  *    Initialize runtime-PM-related fields for @q and start auto suspend for
+  *    @dev. Drivers that want to take advantage of request-based runtime PM
+  *    should call this function after @dev has been initialized, and its
+  *    request queue @q has been allocated, and runtime PM for it can not happen
+  *    yet(either due to disabled/forbidden or its usage_count > 0). In most
+  *    cases, driver should call this function before any I/O has taken place.
+  *
+  *    This function takes care of setting up using auto suspend for the device,
+  *    the autosuspend delay is set to -1 to make runtime suspend impossible
+  *    until an updated value is either set by user or by driver. Drivers do
+  *    not need to touch other autosuspend settings.
+  *
+  *    The block layer runtime PM is request based, so only works for drivers
+  *    that use request as their IO unit instead of those directly use bio's.
+  */
+ void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
+ {
+       q->dev = dev;
+       q->rpm_status = RPM_ACTIVE;
+       pm_runtime_set_autosuspend_delay(q->dev, -1);
+       pm_runtime_use_autosuspend(q->dev);
+ }
+ EXPORT_SYMBOL(blk_pm_runtime_init);
+ /**
+  * blk_pre_runtime_suspend - Pre runtime suspend check
+  * @q: the queue of the device
+  *
+  * Description:
+  *    This function will check if runtime suspend is allowed for the device
+  *    by examining if there are any requests pending in the queue. If there
+  *    are requests pending, the device can not be runtime suspended; otherwise,
+  *    the queue's status will be updated to SUSPENDING and the driver can
+  *    proceed to suspend the device.
+  *
+  *    For the not allowed case, we mark last busy for the device so that
+  *    runtime PM core will try to autosuspend it some time later.
+  *
+  *    This function should be called near the start of the device's
+  *    runtime_suspend callback.
+  *
+  * Return:
+  *    0               - OK to runtime suspend the device
+  *    -EBUSY  - Device should not be runtime suspended
+  */
+ int blk_pre_runtime_suspend(struct request_queue *q)
+ {
+       int ret = 0;
+       spin_lock_irq(q->queue_lock);
+       if (q->nr_pending) {
+               ret = -EBUSY;
+               pm_runtime_mark_last_busy(q->dev);
+       } else {
+               q->rpm_status = RPM_SUSPENDING;
+       }
+       spin_unlock_irq(q->queue_lock);
+       return ret;
+ }
+ EXPORT_SYMBOL(blk_pre_runtime_suspend);
+ /**
+  * blk_post_runtime_suspend - Post runtime suspend processing
+  * @q: the queue of the device
+  * @err: return value of the device's runtime_suspend function
+  *
+  * Description:
+  *    Update the queue's runtime status according to the return value of the
+  *    device's runtime suspend function and mark last busy for the device so
+  *    that PM core will try to auto suspend the device at a later time.
+  *
+  *    This function should be called near the end of the device's
+  *    runtime_suspend callback.
+  */
+ void blk_post_runtime_suspend(struct request_queue *q, int err)
+ {
+       spin_lock_irq(q->queue_lock);
+       if (!err) {
+               q->rpm_status = RPM_SUSPENDED;
+       } else {
+               q->rpm_status = RPM_ACTIVE;
+               pm_runtime_mark_last_busy(q->dev);
+       }
+       spin_unlock_irq(q->queue_lock);
+ }
+ EXPORT_SYMBOL(blk_post_runtime_suspend);
+ /**
+  * blk_pre_runtime_resume - Pre runtime resume processing
+  * @q: the queue of the device
+  *
+  * Description:
+  *    Update the queue's runtime status to RESUMING in preparation for the
+  *    runtime resume of the device.
+  *
+  *    This function should be called near the start of the device's
+  *    runtime_resume callback.
+  */
+ void blk_pre_runtime_resume(struct request_queue *q)
+ {
+       spin_lock_irq(q->queue_lock);
+       q->rpm_status = RPM_RESUMING;
+       spin_unlock_irq(q->queue_lock);
+ }
+ EXPORT_SYMBOL(blk_pre_runtime_resume);
+ /**
+  * blk_post_runtime_resume - Post runtime resume processing
+  * @q: the queue of the device
+  * @err: return value of the device's runtime_resume function
+  *
+  * Description:
+  *    Update the queue's runtime status according to the return value of the
+  *    device's runtime_resume function. If it is successfully resumed, process
+  *    the requests that are queued into the device's queue when it is resuming
+  *    and then mark last busy and initiate autosuspend for it.
+  *
+  *    This function should be called near the end of the device's
+  *    runtime_resume callback.
+  */
+ void blk_post_runtime_resume(struct request_queue *q, int err)
+ {
+       spin_lock_irq(q->queue_lock);
+       if (!err) {
+               q->rpm_status = RPM_ACTIVE;
+               __blk_run_queue(q);
+               pm_runtime_mark_last_busy(q->dev);
+               pm_runtime_autosuspend(q->dev);
+       } else {
+               q->rpm_status = RPM_SUSPENDED;
+       }
+       spin_unlock_irq(q->queue_lock);
+ }
+ EXPORT_SYMBOL(blk_post_runtime_resume);
+ #endif
  int __init blk_dev_init(void)
  {
        BUILD_BUG_ON(__REQ_NR_BITS > 8 *
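
The kernel-doc added above spells out when a driver is expected to call the
new blk_pm_* hooks. A minimal sketch of that wiring, assuming a hypothetical
driver: foo_dev, foo_device_suspend() and foo_device_resume() are invented
stand-ins for real driver state and hardware routines, and the 5000 ms
autosuspend delay is just an example value.

#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/pm_runtime.h>

/* Hypothetical driver state; only the request queue matters here. */
struct foo_dev {
	struct request_queue *queue;
};

/* Stand-ins for the driver's real hardware power routines. */
static int foo_device_suspend(struct foo_dev *foo) { return 0; }
static int foo_device_resume(struct foo_dev *foo) { return 0; }

static int foo_runtime_suspend(struct device *dev)
{
	struct foo_dev *foo = dev_get_drvdata(dev);
	int err;

	/* Returns -EBUSY (and marks last busy) if requests are pending. */
	err = blk_pre_runtime_suspend(foo->queue);
	if (err)
		return err;

	err = foo_device_suspend(foo);
	blk_post_runtime_suspend(foo->queue, err);
	return err;
}

static int foo_runtime_resume(struct device *dev)
{
	struct foo_dev *foo = dev_get_drvdata(dev);
	int err;

	blk_pre_runtime_resume(foo->queue);
	err = foo_device_resume(foo);
	blk_post_runtime_resume(foo->queue, err);
	return err;
}

/*
 * At probe time, once the queue has been allocated and before any I/O:
 *
 *	blk_pm_runtime_init(foo->queue, dev);
 *	pm_runtime_set_autosuspend_delay(dev, 5000);
 */
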
diff --combined drivers/block/aoe/aoecmd.c
index 92b6d7c51e39590b3780f17c88737009b7becbbf,af96ca171238e1f9ca6499ca9d3f3f5f5194d648..5efed089a702d501f3b0de2e85997362555d0a09
@@@ -51,9 -51,8 +51,9 @@@ new_skb(ulong len
  {
        struct sk_buff *skb;
  
 -      skb = alloc_skb(len, GFP_ATOMIC);
 +      skb = alloc_skb(len + MAX_HEADER, GFP_ATOMIC);
        if (skb) {
 +              skb_reserve(skb, MAX_HEADER);
                skb_reset_mac_header(skb);
                skb_reset_network_header(skb);
                skb->protocol = __constant_htons(ETH_P_AOE);
@@@ -928,7 -927,7 +928,7 @@@ bufinit(struct buf *buf, struct reques
        buf->resid = bio->bi_size;
        buf->sector = bio->bi_sector;
        bio_pageinc(bio);
-       buf->bv = bv = &bio->bi_io_vec[bio->bi_idx];
+       buf->bv = bv = bio_iovec(bio);
        buf->bv_resid = bv->bv_len;
        WARN_ON(buf->bv_resid == 0);
  }
diff --combined drivers/block/floppy.c
index c49e85608101e6fb97b90b7a1b014fb0cbbd2ac4,83232639034eda69da39d8f84eb60e8f63339538..04ceb7e2fadd6ca075d20ecd844c39bf1da07ff3
@@@ -3601,7 -3601,7 +3601,7 @@@ static void __init config_types(void
                pr_cont("\n");
  }
  
 -static int floppy_release(struct gendisk *disk, fmode_t mode)
 +static void floppy_release(struct gendisk *disk, fmode_t mode)
  {
        int drive = (long)disk->private_data;
  
                opened_bdev[drive] = NULL;
        mutex_unlock(&open_lock);
        mutex_unlock(&floppy_mutex);
 -
 -      return 0;
  }
  
  /*
@@@ -3775,7 -3777,6 +3775,6 @@@ static int __floppy_read_block_0(struc
        bio_vec.bv_len = size;
        bio_vec.bv_offset = 0;
        bio.bi_vcnt = 1;
-       bio.bi_idx = 0;
        bio.bi_size = size;
        bio.bi_bdev = bdev;
        bio.bi_sector = 0;
diff --combined drivers/block/pktcdvd.c
index 9f2d348f7115424e3bda72d1f8ce61418f4f03d1,11190424536a95a8cf1ca3eb84a46fc396930e69..3c08983e600a0a15e1380e9de1e6f50714fe3976
@@@ -901,7 -901,7 +901,7 @@@ static void pkt_iosched_process_queue(s
                        pd->iosched.successive_reads += bio->bi_size >> 10;
                else {
                        pd->iosched.successive_reads = 0;
-                       pd->iosched.last_write = bio->bi_sector + bio_sectors(bio);
+                       pd->iosched.last_write = bio_end_sector(bio);
                }
                if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
                        if (pd->read_speed == pd->write_speed) {
@@@ -947,31 -947,6 +947,6 @@@ static int pkt_set_segment_merging(stru
        }
  }
  
- /*
-  * Copy CD_FRAMESIZE bytes from src_bio into a destination page
-  */
- static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct page *dst_page, int dst_offs)
- {
-       unsigned int copy_size = CD_FRAMESIZE;
-       while (copy_size > 0) {
-               struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg);
-               void *vfrom = kmap_atomic(src_bvl->bv_page) +
-                       src_bvl->bv_offset + offs;
-               void *vto = page_address(dst_page) + dst_offs;
-               int len = min_t(int, copy_size, src_bvl->bv_len - offs);
-               BUG_ON(len < 0);
-               memcpy(vto, vfrom, len);
-               kunmap_atomic(vfrom);
-               seg++;
-               offs = 0;
-               dst_offs += len;
-               copy_size -= len;
-       }
- }
  /*
   * Copy all data for this packet to pkt->pages[], so that
   * a) The number of required segments for the write bio is minimized, which
@@@ -1181,16 -1156,15 +1156,15 @@@ static int pkt_start_recovery(struct pa
        new_sector = new_block * (CD_FRAMESIZE >> 9);
        pkt->sector = new_sector;
  
+       bio_reset(pkt->bio);
+       pkt->bio->bi_bdev = pd->bdev;
+       pkt->bio->bi_rw = REQ_WRITE;
        pkt->bio->bi_sector = new_sector;
-       pkt->bio->bi_next = NULL;
-       pkt->bio->bi_flags = 1 << BIO_UPTODATE;
-       pkt->bio->bi_idx = 0;
+       pkt->bio->bi_size = pkt->frames * CD_FRAMESIZE;
+       pkt->bio->bi_vcnt = pkt->frames;
  
-       BUG_ON(pkt->bio->bi_rw != REQ_WRITE);
-       BUG_ON(pkt->bio->bi_vcnt != pkt->frames);
-       BUG_ON(pkt->bio->bi_size != pkt->frames * CD_FRAMESIZE);
-       BUG_ON(pkt->bio->bi_end_io != pkt_end_io_packet_write);
-       BUG_ON(pkt->bio->bi_private != pkt);
+       pkt->bio->bi_end_io = pkt_end_io_packet_write;
+       pkt->bio->bi_private = pkt;
  
        drop_super(sb);
        return 1;
@@@ -1325,55 -1299,35 +1299,35 @@@ try_next_bio
   */
  static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
  {
-       struct bio *bio;
        int f;
-       int frames_write;
        struct bio_vec *bvec = pkt->w_bio->bi_io_vec;
  
+       bio_reset(pkt->w_bio);
+       pkt->w_bio->bi_sector = pkt->sector;
+       pkt->w_bio->bi_bdev = pd->bdev;
+       pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
+       pkt->w_bio->bi_private = pkt;
+       /* XXX: locking? */
        for (f = 0; f < pkt->frames; f++) {
                bvec[f].bv_page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
                bvec[f].bv_offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
+               if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
+                       BUG();
        }
+       VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);
  
        /*
         * Fill-in bvec with data from orig_bios.
         */
-       frames_write = 0;
        spin_lock(&pkt->lock);
-       bio_list_for_each(bio, &pkt->orig_bios) {
-               int segment = bio->bi_idx;
-               int src_offs = 0;
-               int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9);
-               int num_frames = bio->bi_size / CD_FRAMESIZE;
-               BUG_ON(first_frame < 0);
-               BUG_ON(first_frame + num_frames > pkt->frames);
-               for (f = first_frame; f < first_frame + num_frames; f++) {
-                       struct bio_vec *src_bvl = bio_iovec_idx(bio, segment);
-                       while (src_offs >= src_bvl->bv_len) {
-                               src_offs -= src_bvl->bv_len;
-                               segment++;
-                               BUG_ON(segment >= bio->bi_vcnt);
-                               src_bvl = bio_iovec_idx(bio, segment);
-                       }
+       bio_copy_data(pkt->w_bio, pkt->orig_bios.head);
  
-                       if (src_bvl->bv_len - src_offs >= CD_FRAMESIZE) {
-                               bvec[f].bv_page = src_bvl->bv_page;
-                               bvec[f].bv_offset = src_bvl->bv_offset + src_offs;
-                       } else {
-                               pkt_copy_bio_data(bio, segment, src_offs,
-                                                 bvec[f].bv_page, bvec[f].bv_offset);
-                       }
-                       src_offs += CD_FRAMESIZE;
-                       frames_write++;
-               }
-       }
        pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
        spin_unlock(&pkt->lock);
  
        VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n",
-               frames_write, (unsigned long long)pkt->sector);
-       BUG_ON(frames_write != pkt->write_size);
+               pkt->write_size, (unsigned long long)pkt->sector);
  
        if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) {
                pkt_make_local_copy(pkt, bvec);
        }
  
        /* Start the write request */
-       bio_reset(pkt->w_bio);
-       pkt->w_bio->bi_sector = pkt->sector;
-       pkt->w_bio->bi_bdev = pd->bdev;
-       pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
-       pkt->w_bio->bi_private = pkt;
-       for (f = 0; f < pkt->frames; f++)
-               if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
-                       BUG();
-       VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);
        atomic_set(&pkt->io_wait, 1);
        pkt->w_bio->bi_rw = WRITE;
        pkt_queue_bio(pd, pkt->w_bio);
@@@ -2376,9 -2320,10 +2320,9 @@@ out
        return ret;
  }
  
 -static int pkt_close(struct gendisk *disk, fmode_t mode)
 +static void pkt_close(struct gendisk *disk, fmode_t mode)
  {
        struct pktcdvd_device *pd = disk->private_data;
 -      int ret = 0;
  
        mutex_lock(&pktcdvd_mutex);
        mutex_lock(&ctl_mutex);
        }
        mutex_unlock(&ctl_mutex);
        mutex_unlock(&pktcdvd_mutex);
 -      return ret;
  }
  
  
@@@ -2431,7 -2377,7 +2375,7 @@@ static void pkt_make_request(struct req
                cloned_bio->bi_bdev = pd->bdev;
                cloned_bio->bi_private = psd;
                cloned_bio->bi_end_io = pkt_end_io_read_cloned;
-               pd->stats.secs_r += bio->bi_size >> 9;
+               pd->stats.secs_r += bio_sectors(bio);
                pkt_queue_bio(pd, cloned_bio);
                return;
        }
        zone = ZONE(bio->bi_sector, pd);
        VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n",
                (unsigned long long)bio->bi_sector,
-               (unsigned long long)(bio->bi_sector + bio_sectors(bio)));
+               (unsigned long long)bio_end_sector(bio));
  
        /* Check if we have to split the bio */
        {
                sector_t last_zone;
                int first_sectors;
  
-               last_zone = ZONE(bio->bi_sector + bio_sectors(bio) - 1, pd);
+               last_zone = ZONE(bio_end_sector(bio) - 1, pd);
                if (last_zone != zone) {
                        BUG_ON(last_zone != zone + pd->settings.size);
                        first_sectors = last_zone - bio->bi_sector;
@@@ -2646,7 -2592,7 +2590,7 @@@ static int pkt_seq_show(struct seq_fil
  
  static int pkt_seq_open(struct inode *inode, struct file *file)
  {
 -      return single_open(file, pkt_seq_show, PDE(inode)->data);
 +      return single_open(file, pkt_seq_show, PDE_DATA(inode));
  }
  
  static const struct file_operations pkt_proc_fops = {
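
A minimal sketch (not part of this merge) mirroring the pktcdvd conversion
above: build a write bio over driver-owned pages with bio_reset() and
bio_add_page(), then fill it from the original bio with bio_copy_data(dst,
src). The function name fill_write_bio and the pages/nframes parameters are
invented for illustration.

#include <linux/bio.h>
#include <linux/blkdev.h>

static void fill_write_bio(struct bio *w_bio, struct bio *orig,
			   struct block_device *bdev,
			   struct page **pages, int nframes)
{
	int f;

	bio_reset(w_bio);
	w_bio->bi_sector = orig->bi_sector;
	w_bio->bi_bdev = bdev;		/* bio_add_page() needs the queue */
	for (f = 0; f < nframes; f++)
		if (!bio_add_page(w_bio, pages[f], PAGE_SIZE, 0))
			break;

	/* Copy the payload of the original bio into the new one. */
	bio_copy_data(w_bio, orig);
}
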
diff --combined drivers/block/rbd.c
index 22ffd5dcb1681da2b48b110e54e22f69457aa508,6b2b039c191fc5da53418371f290babdc5cd02c1..ca63104136e0db46d0248aa290c355483989ec2e
@@@ -1,4 -1,3 +1,4 @@@
 +
  /*
     rbd.c -- Export ceph rados objects as a Linux block device
  
  #include <linux/ceph/mon_client.h>
  #include <linux/ceph/decode.h>
  #include <linux/parser.h>
 +#include <linux/bsearch.h>
  
  #include <linux/kernel.h>
  #include <linux/device.h>
  #include <linux/module.h>
  #include <linux/fs.h>
  #include <linux/blkdev.h>
 +#include <linux/slab.h>
  
  #include "rbd_types.h"
  
  #define       SECTOR_SHIFT    9
  #define       SECTOR_SIZE     (1ULL << SECTOR_SHIFT)
  
 -/* It might be useful to have these defined elsewhere */
 -
 -#define       U8_MAX  ((u8)   (~0U))
 -#define       U16_MAX ((u16)  (~0U))
 -#define       U32_MAX ((u32)  (~0U))
 -#define       U64_MAX ((u64)  (~0ULL))
 -
  #define RBD_DRV_NAME "rbd"
  #define RBD_DRV_NAME_LONG "rbd (rados block device)"
  
@@@ -68,8 -72,6 +68,8 @@@
  
  #define RBD_SNAP_HEAD_NAME    "-"
  
 +#define       BAD_SNAP_INDEX  U32_MAX         /* invalid index into snap array */
 +
  /* This allows a single page to hold an image name sent by OSD */
  #define RBD_IMAGE_NAME_LEN_MAX        (PAGE_SIZE - sizeof (__le32) - 1)
  #define RBD_IMAGE_ID_LEN_MAX  64
  
  /* Feature bits */
  
 -#define RBD_FEATURE_LAYERING      1
 +#define RBD_FEATURE_LAYERING  (1<<0)
 +#define RBD_FEATURE_STRIPINGV2        (1<<1)
 +#define RBD_FEATURES_ALL \
 +          (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
  
  /* Features supported by this (client software) implementation. */
  
 -#define RBD_FEATURES_ALL          (0)
 +#define RBD_FEATURES_SUPPORTED        (RBD_FEATURES_ALL)
  
  /*
   * An RBD device name will be "rbd#", where the "rbd" comes from
@@@ -113,8 -112,7 +113,8 @@@ struct rbd_image_header 
        char *snap_names;
        u64 *snap_sizes;
  
 -      u64 obj_version;
 +      u64 stripe_unit;
 +      u64 stripe_count;
  };
  
  /*
   */
  struct rbd_spec {
        u64             pool_id;
 -      char            *pool_name;
 +      const char      *pool_name;
  
 -      char            *image_id;
 -      char            *image_name;
 +      const char      *image_id;
 +      const char      *image_name;
  
        u64             snap_id;
 -      char            *snap_name;
 +      const char      *snap_name;
  
        struct kref     kref;
  };
@@@ -176,44 -174,13 +176,44 @@@ enum obj_request_type 
        OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
  };
  
 +enum obj_req_flags {
 +      OBJ_REQ_DONE,           /* completion flag: not done = 0, done = 1 */
 +      OBJ_REQ_IMG_DATA,       /* object usage: standalone = 0, image = 1 */
 +      OBJ_REQ_KNOWN,          /* EXISTS flag valid: no = 0, yes = 1 */
 +      OBJ_REQ_EXISTS,         /* target exists: no = 0, yes = 1 */
 +};
 +
  struct rbd_obj_request {
        const char              *object_name;
        u64                     offset;         /* object start byte */
        u64                     length;         /* bytes from offset */
 +      unsigned long           flags;
  
 -      struct rbd_img_request  *img_request;
 -      struct list_head        links;          /* img_request->obj_requests */
 +      /*
 +       * An object request associated with an image will have its
 +       * img_data flag set; a standalone object request will not.
 +       *
 +       * A standalone object request will have which == BAD_WHICH
 +       * and a null obj_request pointer.
 +       *
 +       * An object request initiated in support of a layered image
 +       * object (to check for its existence before a write) will
 +       * have which == BAD_WHICH and a non-null obj_request pointer.
 +       *
 +       * Finally, an object request for rbd image data will have
 +       * which != BAD_WHICH, and will have a non-null img_request
 +       * pointer.  The value of which will be in the range
 +       * 0..(img_request->obj_request_count-1).
 +       */
 +      union {
 +              struct rbd_obj_request  *obj_request;   /* STAT op */
 +              struct {
 +                      struct rbd_img_request  *img_request;
 +                      u64                     img_offset;
 +                      /* links for img_request->obj_requests list */
 +                      struct list_head        links;
 +              };
 +      };
        u32                     which;          /* posn image request list */
  
        enum obj_request_type   type;
                        u32             page_count;
                };
        };
 +      struct page             **copyup_pages;
  
        struct ceph_osd_request *osd_req;
  
        u64                     xferred;        /* bytes transferred */
 -      u64                     version;
        int                     result;
 -      atomic_t                done;
  
        rbd_obj_callback_t      callback;
        struct completion       completion;
        struct kref             kref;
  };
  
 +enum img_req_flags {
 +      IMG_REQ_WRITE,          /* I/O direction: read = 0, write = 1 */
 +      IMG_REQ_CHILD,          /* initiator: block = 0, child image = 1 */
 +      IMG_REQ_LAYERED,        /* ENOENT handling: normal = 0, layered = 1 */
 +};
 +
  struct rbd_img_request {
 -      struct request          *rq;
        struct rbd_device       *rbd_dev;
        u64                     offset; /* starting image byte offset */
        u64                     length; /* byte count from offset */
 -      bool                    write_request;  /* false for read */
 +      unsigned long           flags;
        union {
 +              u64                     snap_id;        /* for reads */
                struct ceph_snap_context *snapc;        /* for writes */
 -              u64             snap_id;                /* for reads */
        };
 +      union {
 +              struct request          *rq;            /* block request */
 +              struct rbd_obj_request  *obj_request;   /* obj req initiator */
 +      };
 +      struct page             **copyup_pages;
        spinlock_t              completion_lock;/* protects next_completion */
        u32                     next_completion;
        rbd_img_callback_t      callback;
 +      u64                     xferred;/* aggregate bytes transferred */
 +      int                     result; /* first nonzero obj_request result */
  
        u32                     obj_request_count;
        struct list_head        obj_requests;   /* rbd_obj_request structs */
  #define for_each_obj_request_safe(ireq, oreq, n) \
        list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
  
 -struct rbd_snap {
 -      struct  device          dev;
 -      const char              *name;
 -      u64                     size;
 -      struct list_head        node;
 -      u64                     id;
 -      u64                     features;
 -};
 -
  struct rbd_mapping {
        u64                     size;
        u64                     features;
@@@ -311,7 -276,6 +311,7 @@@ struct rbd_device 
  
        struct rbd_spec         *parent_spec;
        u64                     parent_overlap;
 +      struct rbd_device       *parent;
  
        /* protects updating the header */
        struct rw_semaphore     header_rwsem;
  
        struct list_head        node;
  
 -      /* list of snapshots */
 -      struct list_head        snaps;
 -
        /* sysfs related */
        struct device           dev;
        unsigned long           open_count;     /* protected by lock */
@@@ -345,21 -312,16 +345,21 @@@ static DEFINE_SPINLOCK(rbd_dev_list_loc
  static LIST_HEAD(rbd_client_list);            /* clients */
  static DEFINE_SPINLOCK(rbd_client_list_lock);
  
 -static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
 -static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
 +/* Slab caches for frequently-allocated structures */
  
 -static void rbd_dev_release(struct device *dev);
 -static void rbd_remove_snap_dev(struct rbd_snap *snap);
 +static struct kmem_cache      *rbd_img_request_cache;
 +static struct kmem_cache      *rbd_obj_request_cache;
 +static struct kmem_cache      *rbd_segment_name_cache;
 +
 +static int rbd_img_request_submit(struct rbd_img_request *img_request);
 +
 +static void rbd_dev_device_release(struct device *dev);
  
  static ssize_t rbd_add(struct bus_type *bus, const char *buf,
                       size_t count);
  static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
                          size_t count);
 +static int rbd_dev_image_probe(struct rbd_device *rbd_dev);
  
  static struct bus_attribute rbd_bus_attrs[] = {
        __ATTR(add, S_IWUSR, NULL, rbd_add),
@@@ -421,19 -383,8 +421,19 @@@ void rbd_warn(struct rbd_device *rbd_de
  #  define rbd_assert(expr)    ((void) 0)
  #endif /* !RBD_DEBUG */
  
 -static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
 -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
 +static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
 +static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
 +static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
 +
 +static int rbd_dev_refresh(struct rbd_device *rbd_dev);
 +static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev);
 +static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 +                                      u64 snap_id);
 +static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 +                              u8 *order, u64 *snap_size);
 +static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 +              u64 *snap_features);
 +static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
  
  static int rbd_open(struct block_device *bdev, fmode_t mode)
  {
        return 0;
  }
  
 -static int rbd_release(struct gendisk *disk, fmode_t mode)
 +static void rbd_release(struct gendisk *disk, fmode_t mode)
  {
        struct rbd_device *rbd_dev = disk->private_data;
        unsigned long open_count_before;
        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        put_device(&rbd_dev->dev);
        mutex_unlock(&ctl_mutex);
 -
 -      return 0;
  }
  
  static const struct block_device_operations rbd_bd_ops = {
@@@ -531,13 -484,6 +531,13 @@@ out_opt
        return ERR_PTR(ret);
  }
  
 +static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
 +{
 +      kref_get(&rbdc->kref);
 +
 +      return rbdc;
 +}
 +
  /*
   * Find a ceph client with specific addr and configuration.  If
   * found, bump its reference count.
@@@ -553,8 -499,7 +553,8 @@@ static struct rbd_client *rbd_client_fi
        spin_lock(&rbd_client_list_lock);
        list_for_each_entry(client_node, &rbd_client_list, node) {
                if (!ceph_compare_options(ceph_opts, client_node->client)) {
 -                      kref_get(&client_node->kref);
 +                      __rbd_get_client(client_node);
 +
                        found = true;
                        break;
                }
@@@ -777,6 -722,7 +777,6 @@@ static int rbd_header_from_disk(struct 
                        header->snap_sizes[i] =
                                le64_to_cpu(ondisk->snaps[i].image_size);
        } else {
 -              WARN_ON(ondisk->snap_names_len);
                header->snap_names = NULL;
                header->snap_sizes = NULL;
        }
        /* Allocate and fill in the snapshot context */
  
        header->image_size = le64_to_cpu(ondisk->image_size);
 -      size = sizeof (struct ceph_snap_context);
 -      size += snap_count * sizeof (header->snapc->snaps[0]);
 -      header->snapc = kzalloc(size, GFP_KERNEL);
 +
 +      header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
        if (!header->snapc)
                goto out_err;
 -
 -      atomic_set(&header->snapc->nref, 1);
        header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
 -      header->snapc->num_snaps = snap_count;
        for (i = 0; i < snap_count; i++)
 -              header->snapc->snaps[i] =
 -                      le64_to_cpu(ondisk->snaps[i].id);
 +              header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id);
  
        return 0;
  
@@@ -810,174 -761,70 +810,174 @@@ out_err
        return -ENOMEM;
  }
  
 -static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 +static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
 +{
 +      const char *snap_name;
 +
 +      rbd_assert(which < rbd_dev->header.snapc->num_snaps);
 +
 +      /* Skip over names until we find the one we are looking for */
 +
 +      snap_name = rbd_dev->header.snap_names;
 +      while (which--)
 +              snap_name += strlen(snap_name) + 1;
 +
 +      return kstrdup(snap_name, GFP_KERNEL);
 +}
 +
 +/*
 + * Snapshot id comparison function for use with qsort()/bsearch().
 + * Note that result is for snapshots in *descending* order.
 + */
 +static int snapid_compare_reverse(const void *s1, const void *s2)
 +{
 +      u64 snap_id1 = *(u64 *)s1;
 +      u64 snap_id2 = *(u64 *)s2;
 +
 +      if (snap_id1 < snap_id2)
 +              return 1;
 +      return snap_id1 == snap_id2 ? 0 : -1;
 +}
 +
 +/*
 + * Search a snapshot context to see if the given snapshot id is
 + * present.
 + *
 + * Returns the position of the snapshot id in the array if it's found,
 + * or BAD_SNAP_INDEX otherwise.
 + *
 + * Note: The snapshot array is in kept sorted (by the osd) in
 + * reverse order, highest snapshot id first.
 + */
 +static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
 +{
 +      struct ceph_snap_context *snapc = rbd_dev->header.snapc;
 +      u64 *found;
 +
 +      found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
 +                              sizeof (snap_id), snapid_compare_reverse);
 +
 +      return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
 +}
 +
 +static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
 +                                      u64 snap_id)
  {
 -      struct rbd_snap *snap;
 +      u32 which;
  
 +      which = rbd_dev_snap_index(rbd_dev, snap_id);
 +      if (which == BAD_SNAP_INDEX)
 +              return NULL;
 +
 +      return _rbd_dev_v1_snap_name(rbd_dev, which);
 +}
 +
 +static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 +{
        if (snap_id == CEPH_NOSNAP)
                return RBD_SNAP_HEAD_NAME;
  
 -      list_for_each_entry(snap, &rbd_dev->snaps, node)
 -              if (snap_id == snap->id)
 -                      return snap->name;
 +      rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 +      if (rbd_dev->image_format == 1)
 +              return rbd_dev_v1_snap_name(rbd_dev, snap_id);
  
 -      return NULL;
 +      return rbd_dev_v2_snap_name(rbd_dev, snap_id);
  }
  
 -static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 +static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 +                              u64 *snap_size)
  {
 +      rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 +      if (snap_id == CEPH_NOSNAP) {
 +              *snap_size = rbd_dev->header.image_size;
 +      } else if (rbd_dev->image_format == 1) {
 +              u32 which;
  
 -      struct rbd_snap *snap;
 +              which = rbd_dev_snap_index(rbd_dev, snap_id);
 +              if (which == BAD_SNAP_INDEX)
 +                      return -ENOENT;
 +
 +              *snap_size = rbd_dev->header.snap_sizes[which];
 +      } else {
 +              u64 size = 0;
 +              int ret;
  
 -      list_for_each_entry(snap, &rbd_dev->snaps, node) {
 -              if (!strcmp(snap_name, snap->name)) {
 -                      rbd_dev->spec->snap_id = snap->id;
 -                      rbd_dev->mapping.size = snap->size;
 -                      rbd_dev->mapping.features = snap->features;
 +              ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
 +              if (ret)
 +                      return ret;
  
 -                      return 0;
 -              }
 +              *snap_size = size;
        }
 +      return 0;
 +}
 +
 +static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 +                      u64 *snap_features)
 +{
 +      rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 +      if (snap_id == CEPH_NOSNAP) {
 +              *snap_features = rbd_dev->header.features;
 +      } else if (rbd_dev->image_format == 1) {
 +              *snap_features = 0;     /* No features for format 1 */
 +      } else {
 +              u64 features = 0;
 +              int ret;
 +
 +              ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
 +              if (ret)
 +                      return ret;
  
 -      return -ENOENT;
 +              *snap_features = features;
 +      }
 +      return 0;
  }
  
 -static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 +static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
  {
 +      const char *snap_name = rbd_dev->spec->snap_name;
 +      u64 snap_id;
 +      u64 size = 0;
 +      u64 features = 0;
        int ret;
  
 -      if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
 -                  sizeof (RBD_SNAP_HEAD_NAME))) {
 -              rbd_dev->spec->snap_id = CEPH_NOSNAP;
 -              rbd_dev->mapping.size = rbd_dev->header.image_size;
 -              rbd_dev->mapping.features = rbd_dev->header.features;
 -              ret = 0;
 +      if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) {
 +              snap_id = rbd_snap_id_by_name(rbd_dev, snap_name);
 +              if (snap_id == CEPH_NOSNAP)
 +                      return -ENOENT;
        } else {
 -              ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
 -              if (ret < 0)
 -                      goto done;
 -              rbd_dev->mapping.read_only = true;
 +              snap_id = CEPH_NOSNAP;
        }
 -      set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
  
 -done:
 -      return ret;
 +      ret = rbd_snap_size(rbd_dev, snap_id, &size);
 +      if (ret)
 +              return ret;
 +      ret = rbd_snap_features(rbd_dev, snap_id, &features);
 +      if (ret)
 +              return ret;
 +
 +      rbd_dev->mapping.size = size;
 +      rbd_dev->mapping.features = features;
 +
 +      /* If we are mapping a snapshot it must be marked read-only */
 +
 +      if (snap_id != CEPH_NOSNAP)
 +              rbd_dev->mapping.read_only = true;
 +
 +      return 0;
  }
  
 -static void rbd_header_free(struct rbd_image_header *header)
 +static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
  {
 -      kfree(header->object_prefix);
 -      header->object_prefix = NULL;
 -      kfree(header->snap_sizes);
 -      header->snap_sizes = NULL;
 -      kfree(header->snap_names);
 -      header->snap_names = NULL;
 -      ceph_put_snap_context(header->snapc);
 -      header->snapc = NULL;
 +      rbd_dev->mapping.size = 0;
 +      rbd_dev->mapping.features = 0;
 +      rbd_dev->mapping.read_only = true;
 +}
 +
 +static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev)
 +{
 +      rbd_dev->mapping.size = 0;
 +      rbd_dev->mapping.features = 0;
 +      rbd_dev->mapping.read_only = true;
  }
  
  static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
        u64 segment;
        int ret;
  
 -      name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
 +      name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
        if (!name)
                return NULL;
        segment = offset >> rbd_dev->header.obj_order;
        return name;
  }
  
 +static void rbd_segment_name_free(const char *name)
 +{
 +      /* The explicit cast here is needed to drop the const qualifier */
 +
 +      kmem_cache_free(rbd_segment_name_cache, (void *)name);
 +}
 +
  static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
  {
        u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
@@@ -1080,37 -920,6 +1080,37 @@@ static void zero_bio_chain(struct bio *
        }
  }
  
 +/*
 + * similar to zero_bio_chain(), zeros data defined by a page array,
 + * starting at the given byte offset from the start of the array and
 + * continuing up to the given end offset.  The pages array is
 + * assumed to be big enough to hold all bytes up to the end.
 + */
 +static void zero_pages(struct page **pages, u64 offset, u64 end)
 +{
 +      struct page **page = &pages[offset >> PAGE_SHIFT];
 +
 +      rbd_assert(end > offset);
 +      rbd_assert(end - offset <= (u64)SIZE_MAX);
 +      while (offset < end) {
 +              size_t page_offset;
 +              size_t length;
 +              unsigned long flags;
 +              void *kaddr;
 +
 +              page_offset = (size_t)(offset & ~PAGE_MASK);
 +              length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
 +              local_irq_save(flags);
 +              kaddr = kmap_atomic(*page);
 +              memset(kaddr + page_offset, 0, length);
 +              kunmap_atomic(kaddr);
 +              local_irq_restore(flags);
 +
 +              offset += length;
 +              page++;
 +      }
 +}
 +
  /*
   * Clone a portion of a bio, starting at the given byte offset
   * and continuing for the number of bytes indicated.
@@@ -1143,7 -952,7 +1143,7 @@@ static struct bio *bio_clone_range(stru
        /* Find first affected segment... */
  
        resid = offset;
-       __bio_for_each_segment(bv, bio_src, idx, 0) {
+       bio_for_each_segment(bv, bio_src, idx) {
                if (resid < bv->bv_len)
                        break;
                resid -= bv->bv_len;
@@@ -1255,77 -1064,6 +1255,77 @@@ out_err
        return NULL;
  }
  
 +/*
 + * The default/initial value for all object request flags is 0.  For
 + * each flag, once its value is set to 1 it is never reset to 0
 + * again.
 + */
 +static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
 +{
 +      if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
 +              struct rbd_device *rbd_dev;
 +
 +              rbd_dev = obj_request->img_request->rbd_dev;
 +              rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
 +                      obj_request);
 +      }
 +}
 +
 +static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
 +{
 +      smp_mb();
 +      return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
 +}
 +
 +static void obj_request_done_set(struct rbd_obj_request *obj_request)
 +{
 +      if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
 +              struct rbd_device *rbd_dev = NULL;
 +
 +              if (obj_request_img_data_test(obj_request))
 +                      rbd_dev = obj_request->img_request->rbd_dev;
 +              rbd_warn(rbd_dev, "obj_request %p already marked done\n",
 +                      obj_request);
 +      }
 +}
 +
 +static bool obj_request_done_test(struct rbd_obj_request *obj_request)
 +{
 +      smp_mb();
 +      return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
 +}
 +
 +/*
 + * This sets the KNOWN flag after (possibly) setting the EXISTS
 + * flag.  The latter is set based on the "exists" value provided.
 + *
 + * Note that for our purposes once an object exists it never goes
 + * away again.  It's possible that the response from two existence
 + * checks are separated by the creation of the target object, and
 + * the first ("doesn't exist") response arrives *after* the second
 + * ("does exist").  In that case we ignore the second one.
 + */
 +static void obj_request_existence_set(struct rbd_obj_request *obj_request,
 +                              bool exists)
 +{
 +      if (exists)
 +              set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
 +      set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
 +      smp_mb();
 +}
 +
 +static bool obj_request_known_test(struct rbd_obj_request *obj_request)
 +{
 +      smp_mb();
 +      return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
 +}
 +
 +static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
 +{
 +      smp_mb();
 +      return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
 +}
 +
  static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
  {
        dout("%s: obj %p (was %d)\n", __func__, obj_request,
@@@ -1363,11 -1101,9 +1363,11 @@@ static inline void rbd_img_obj_request_
  {
        rbd_assert(obj_request->img_request == NULL);
  
 -      rbd_obj_request_get(obj_request);
 +      /* Image request now owns object's original reference */
        obj_request->img_request = img_request;
        obj_request->which = img_request->obj_request_count;
 +      rbd_assert(!obj_request_img_data_test(obj_request));
 +      obj_request_img_data_set(obj_request);
        rbd_assert(obj_request->which != BAD_WHICH);
        img_request->obj_request_count++;
        list_add_tail(&obj_request->links, &img_request->obj_requests);
@@@ -1387,7 -1123,6 +1387,7 @@@ static inline void rbd_img_obj_request_
        img_request->obj_request_count--;
        rbd_assert(obj_request->which == img_request->obj_request_count);
        obj_request->which = BAD_WHICH;
 +      rbd_assert(obj_request_img_data_test(obj_request));
        rbd_assert(obj_request->img_request == img_request);
        obj_request->img_request = NULL;
        obj_request->callback = NULL;
@@@ -1406,6 -1141,76 +1406,6 @@@ static bool obj_request_type_valid(enu
        }
  }
  
 -static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 -{
 -      struct ceph_osd_req_op *op;
 -      va_list args;
 -      size_t size;
 -
 -      op = kzalloc(sizeof (*op), GFP_NOIO);
 -      if (!op)
 -              return NULL;
 -      op->op = opcode;
 -      va_start(args, opcode);
 -      switch (opcode) {
 -      case CEPH_OSD_OP_READ:
 -      case CEPH_OSD_OP_WRITE:
 -              /* rbd_osd_req_op_create(READ, offset, length) */
 -              /* rbd_osd_req_op_create(WRITE, offset, length) */
 -              op->extent.offset = va_arg(args, u64);
 -              op->extent.length = va_arg(args, u64);
 -              if (opcode == CEPH_OSD_OP_WRITE)
 -                      op->payload_len = op->extent.length;
 -              break;
 -      case CEPH_OSD_OP_STAT:
 -              break;
 -      case CEPH_OSD_OP_CALL:
 -              /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
 -              op->cls.class_name = va_arg(args, char *);
 -              size = strlen(op->cls.class_name);
 -              rbd_assert(size <= (size_t) U8_MAX);
 -              op->cls.class_len = size;
 -              op->payload_len = size;
 -
 -              op->cls.method_name = va_arg(args, char *);
 -              size = strlen(op->cls.method_name);
 -              rbd_assert(size <= (size_t) U8_MAX);
 -              op->cls.method_len = size;
 -              op->payload_len += size;
 -
 -              op->cls.argc = 0;
 -              op->cls.indata = va_arg(args, void *);
 -              size = va_arg(args, size_t);
 -              rbd_assert(size <= (size_t) U32_MAX);
 -              op->cls.indata_len = (u32) size;
 -              op->payload_len += size;
 -              break;
 -      case CEPH_OSD_OP_NOTIFY_ACK:
 -      case CEPH_OSD_OP_WATCH:
 -              /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
 -              /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
 -              op->watch.cookie = va_arg(args, u64);
 -              op->watch.ver = va_arg(args, u64);
 -              op->watch.ver = cpu_to_le64(op->watch.ver);
 -              if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
 -                      op->watch.flag = (u8) 1;
 -              break;
 -      default:
 -              rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
 -              kfree(op);
 -              op = NULL;
 -              break;
 -      }
 -      va_end(args);
 -
 -      return op;
 -}
 -
 -static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
 -{
 -      kfree(op);
 -}
 -
  static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
                                struct rbd_obj_request *obj_request)
  {
  
  static void rbd_img_request_complete(struct rbd_img_request *img_request)
  {
 +
        dout("%s: img %p\n", __func__, img_request);
 +
 +      /*
 +       * If no error occurred, compute the aggregate transfer
 +       * count for the image request.  We could instead use
 +       * atomic64_cmpxchg() to update it as each object request
 +       * completes; not clear which way is better off hand.
 +       */
 +      if (!img_request->result) {
 +              struct rbd_obj_request *obj_request;
 +              u64 xferred = 0;
 +
 +              for_each_obj_request(img_request, obj_request)
 +                      xferred += obj_request->xferred;
 +              img_request->xferred = xferred;
 +      }
 +
        if (img_request->callback)
                img_request->callback(img_request);
        else
@@@ -1449,56 -1237,39 +1449,56 @@@ static int rbd_obj_request_wait(struct 
        return wait_for_completion_interruptible(&obj_request->completion);
  }
  
 -static void obj_request_done_init(struct rbd_obj_request *obj_request)
 +/*
 + * The default/initial value for all image request flags is 0.  Each
 + * is conditionally set to 1 at image request initialization time
 + * and currently never change thereafter.
 + */
 +static void img_request_write_set(struct rbd_img_request *img_request)
  {
 -      atomic_set(&obj_request->done, 0);
 -      smp_wmb();
 +      set_bit(IMG_REQ_WRITE, &img_request->flags);
 +      smp_mb();
  }
  
 -static void obj_request_done_set(struct rbd_obj_request *obj_request)
 +static bool img_request_write_test(struct rbd_img_request *img_request)
 +{
 +      smp_mb();
 +      return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
 +}
 +
 +static void img_request_child_set(struct rbd_img_request *img_request)
  {
 -      int done;
 +      set_bit(IMG_REQ_CHILD, &img_request->flags);
 +      smp_mb();
 +}
  
 -      done = atomic_inc_return(&obj_request->done);
 -      if (done > 1) {
 -              struct rbd_img_request *img_request = obj_request->img_request;
 -              struct rbd_device *rbd_dev;
 +static bool img_request_child_test(struct rbd_img_request *img_request)
 +{
 +      smp_mb();
 +      return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
 +}
  
 -              rbd_dev = img_request ? img_request->rbd_dev : NULL;
 -              rbd_warn(rbd_dev, "obj_request %p was already done\n",
 -                      obj_request);
 -      }
 +static void img_request_layered_set(struct rbd_img_request *img_request)
 +{
 +      set_bit(IMG_REQ_LAYERED, &img_request->flags);
 +      smp_mb();
  }
  
 -static bool obj_request_done_test(struct rbd_obj_request *obj_request)
 +static bool img_request_layered_test(struct rbd_img_request *img_request)
  {
        smp_mb();
 -      return atomic_read(&obj_request->done) != 0;
 +      return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
  }
  
  static void
  rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
  {
 +      u64 xferred = obj_request->xferred;
 +      u64 length = obj_request->length;
 +
        dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
                obj_request, obj_request->img_request, obj_request->result,
 -              obj_request->xferred, obj_request->length);
 +              xferred, length);
        /*
         * ENOENT means a hole in the image.  We zero-fill the
         * entire length of the request.  A short read also implies
         * update the xferred count to indicate the whole request
         * was satisfied.
         */
 -      BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
 +      rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
        if (obj_request->result == -ENOENT) {
 -              zero_bio_chain(obj_request->bio_list, 0);
 +              if (obj_request->type == OBJ_REQUEST_BIO)
 +                      zero_bio_chain(obj_request->bio_list, 0);
 +              else
 +                      zero_pages(obj_request->pages, 0, length);
                obj_request->result = 0;
 -              obj_request->xferred = obj_request->length;
 -      } else if (obj_request->xferred < obj_request->length &&
 -                      !obj_request->result) {
 -              zero_bio_chain(obj_request->bio_list, obj_request->xferred);
 -              obj_request->xferred = obj_request->length;
 +              obj_request->xferred = length;
 +      } else if (xferred < length && !obj_request->result) {
 +              if (obj_request->type == OBJ_REQUEST_BIO)
 +                      zero_bio_chain(obj_request->bio_list, xferred);
 +              else
 +                      zero_pages(obj_request->pages, xferred, length);
 +              obj_request->xferred = length;
        }
        obj_request_done_set(obj_request);
  }
@@@ -1542,23 -1308,9 +1542,23 @@@ static void rbd_osd_trivial_callback(st
  
  static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
  {
 -      dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
 -              obj_request->result, obj_request->xferred, obj_request->length);
 -      if (obj_request->img_request)
 +      struct rbd_img_request *img_request = NULL;
 +      struct rbd_device *rbd_dev = NULL;
 +      bool layered = false;
 +
 +      if (obj_request_img_data_test(obj_request)) {
 +              img_request = obj_request->img_request;
 +              layered = img_request && img_request_layered_test(img_request);
 +              rbd_dev = img_request->rbd_dev;
 +      }
 +
 +      dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
 +              obj_request, img_request, obj_request->result,
 +              obj_request->xferred, obj_request->length);
 +      if (layered && obj_request->result == -ENOENT &&
 +                      obj_request->img_offset < rbd_dev->parent_overlap)
 +              rbd_img_parent_read(obj_request);
 +      else if (img_request)
                rbd_img_obj_request_read_callback(obj_request);
        else
                obj_request_done_set(obj_request);
@@@ -1569,8 -1321,9 +1569,8 @@@ static void rbd_osd_write_callback(stru
        dout("%s: obj %p result %d %llu\n", __func__, obj_request,
                obj_request->result, obj_request->length);
        /*
 -       * There is no such thing as a successful short write.
 -       * Our xferred value is the number of bytes transferred
 -       * back.  Set it to our originally-requested length.
 +       * There is no such thing as a successful short write.  Set
 +       * the xferred count to our originally-requested length.
         */
        obj_request->xferred = obj_request->length;
        obj_request_done_set(obj_request);
@@@ -1594,25 -1347,22 +1594,25 @@@ static void rbd_osd_req_callback(struc
  
        dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
        rbd_assert(osd_req == obj_request->osd_req);
 -      rbd_assert(!!obj_request->img_request ^
 -                              (obj_request->which == BAD_WHICH));
 +      if (obj_request_img_data_test(obj_request)) {
 +              rbd_assert(obj_request->img_request);
 +              rbd_assert(obj_request->which != BAD_WHICH);
 +      } else {
 +              rbd_assert(obj_request->which == BAD_WHICH);
 +      }
  
        if (osd_req->r_result < 0)
                obj_request->result = osd_req->r_result;
 -      obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
  
 -      WARN_ON(osd_req->r_num_ops != 1);       /* For now */
 +      BUG_ON(osd_req->r_num_ops > 2);
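 +      /* a copyup request carries two ops; everything else has just one */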
  
        /*
         * We support a 64-bit length, but ultimately it has to be
         * passed to blk_end_request(), which takes an unsigned int.
         */
        obj_request->xferred = osd_req->r_reply_op_len[0];
 -      rbd_assert(obj_request->xferred < (u64) UINT_MAX);
 -      opcode = osd_req->r_request_ops[0].op;
 +      rbd_assert(obj_request->xferred < (u64)UINT_MAX);
 +      opcode = osd_req->r_ops[0].op;
        switch (opcode) {
        case CEPH_OSD_OP_READ:
                rbd_osd_read_callback(obj_request);
                rbd_obj_request_complete(obj_request);
  }
  
 +static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request = obj_request->img_request;
 +      struct ceph_osd_request *osd_req = obj_request->osd_req;
 +      u64 snap_id;
 +
 +      rbd_assert(osd_req != NULL);
 +
 +      snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
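 +      /* reads carry the mapped snapshot id, but no snap context or mtime */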
 +      ceph_osdc_build_request(osd_req, obj_request->offset,
 +                      NULL, snap_id, NULL);
 +}
 +
 +static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request = obj_request->img_request;
 +      struct ceph_osd_request *osd_req = obj_request->osd_req;
 +      struct ceph_snap_context *snapc;
 +      struct timespec mtime = CURRENT_TIME;
 +
 +      rbd_assert(osd_req != NULL);
 +
 +      snapc = img_request ? img_request->snapc : NULL;
 +      ceph_osdc_build_request(osd_req, obj_request->offset,
 +                      snapc, CEPH_NOSNAP, &mtime);
 +}
 +
  static struct ceph_osd_request *rbd_osd_req_create(
                                        struct rbd_device *rbd_dev,
                                        bool write_request,
 -                                      struct rbd_obj_request *obj_request,
 -                                      struct ceph_osd_req_op *op)
 +                                      struct rbd_obj_request *obj_request)
  {
 -      struct rbd_img_request *img_request = obj_request->img_request;
        struct ceph_snap_context *snapc = NULL;
        struct ceph_osd_client *osdc;
        struct ceph_osd_request *osd_req;
 -      struct timespec now;
 -      struct timespec *mtime;
 -      u64 snap_id = CEPH_NOSNAP;
 -      u64 offset = obj_request->offset;
 -      u64 length = obj_request->length;
  
 -      if (img_request) {
 -              rbd_assert(img_request->write_request == write_request);
 -              if (img_request->write_request)
 -                      snapc = img_request->snapc;
 -              else
 -                      snap_id = img_request->snap_id;
 +      if (obj_request_img_data_test(obj_request)) {
 +              struct rbd_img_request *img_request = obj_request->img_request;
 +
 +              rbd_assert(write_request ==
 +                              img_request_write_test(img_request));
 +              if (write_request)
 +                      snapc = img_request->snapc;
        }
  
        /* Allocate and initialize the request, for the single op */
        if (!osd_req)
                return NULL;    /* ENOMEM */
  
 -      rbd_assert(obj_request_type_valid(obj_request->type));
 -      switch (obj_request->type) {
 -      case OBJ_REQUEST_NODATA:
 -              break;          /* Nothing to do */
 -      case OBJ_REQUEST_BIO:
 -              rbd_assert(obj_request->bio_list != NULL);
 -              osd_req->r_bio = obj_request->bio_list;
 -              break;
 -      case OBJ_REQUEST_PAGES:
 -              osd_req->r_pages = obj_request->pages;
 -              osd_req->r_num_pages = obj_request->page_count;
 -              osd_req->r_page_alignment = offset & ~PAGE_MASK;
 -              break;
 -      }
 -
 -      if (write_request) {
 +      if (write_request)
                osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
 -              now = CURRENT_TIME;
 -              mtime = &now;
 -      } else {
 +      else
                osd_req->r_flags = CEPH_OSD_FLAG_READ;
 -              mtime = NULL;   /* not needed for reads */
 -              offset = 0;     /* These are not used... */
 -              length = 0;     /* ...for osd read requests */
 -      }
  
        osd_req->r_callback = rbd_osd_req_callback;
        osd_req->r_priv = obj_request;
  
        osd_req->r_file_layout = rbd_dev->layout;       /* struct */
  
 -      /* osd_req will get its own reference to snapc (if non-null) */
 +      return osd_req;
 +}
  
 -      ceph_osdc_build_request(osd_req, offset, length, 1, op,
 -                              snapc, snap_id, mtime);
 +/*
 + * Create a copyup osd request based on the information in the
 + * object request supplied.  A copyup request has two osd ops:
 + * a copyup method call and a "normal" write request.
 + */
 +static struct ceph_osd_request *
 +rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request;
 +      struct ceph_snap_context *snapc;
 +      struct rbd_device *rbd_dev;
 +      struct ceph_osd_client *osdc;
 +      struct ceph_osd_request *osd_req;
 +
 +      rbd_assert(obj_request_img_data_test(obj_request));
 +      img_request = obj_request->img_request;
 +      rbd_assert(img_request);
 +      rbd_assert(img_request_write_test(img_request));
 +
 +      /* Allocate and initialize the request, for the two ops */
 +
 +      snapc = img_request->snapc;
 +      rbd_dev = img_request->rbd_dev;
 +      osdc = &rbd_dev->rbd_client->client->osdc;
 +      osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
 +      if (!osd_req)
 +              return NULL;    /* ENOMEM */
 +
 +      osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
 +      osd_req->r_callback = rbd_osd_req_callback;
 +      osd_req->r_priv = obj_request;
 +
 +      osd_req->r_oid_len = strlen(obj_request->object_name);
 +      rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
 +      memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
 +
 +      osd_req->r_file_layout = rbd_dev->layout;       /* struct */
  
        return osd_req;
  }
  
 +
  static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
  {
        ceph_osdc_put_request(osd_req);
@@@ -1767,23 -1480,18 +1767,23 @@@ static struct rbd_obj_request *rbd_obj_
        rbd_assert(obj_request_type_valid(type));
  
        size = strlen(object_name) + 1;
 -      obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
 -      if (!obj_request)
 +      name = kmalloc(size, GFP_KERNEL);
 +      if (!name)
                return NULL;
  
 -      name = (char *)(obj_request + 1);
 +      obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
 +      if (!obj_request) {
 +              kfree(name);
 +              return NULL;
 +      }
 +
        obj_request->object_name = memcpy(name, object_name, size);
        obj_request->offset = offset;
        obj_request->length = length;
 +      obj_request->flags = 0;
        obj_request->which = BAD_WHICH;
        obj_request->type = type;
        INIT_LIST_HEAD(&obj_request->links);
 -      obj_request_done_init(obj_request);
        init_completion(&obj_request->completion);
        kref_init(&obj_request->kref);
  
@@@ -1822,9 -1530,7 +1822,9 @@@ static void rbd_obj_request_destroy(str
                break;
        }
  
 -      kfree(obj_request);
 +      kfree(obj_request->object_name);
 +      obj_request->object_name = NULL;
 +      kmem_cache_free(rbd_obj_request_cache, obj_request);
  }
  
  /*
  static struct rbd_img_request *rbd_img_request_create(
                                        struct rbd_device *rbd_dev,
                                        u64 offset, u64 length,
 -                                      bool write_request)
 +                                      bool write_request,
 +                                      bool child_request)
  {
        struct rbd_img_request *img_request;
 -      struct ceph_snap_context *snapc = NULL;
  
 -      img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
 +      img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
        if (!img_request)
                return NULL;
  
        if (write_request) {
                down_read(&rbd_dev->header_rwsem);
 -              snapc = ceph_get_snap_context(rbd_dev->header.snapc);
 +              ceph_get_snap_context(rbd_dev->header.snapc);
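 +              /* this reference is dropped again in rbd_img_request_destroy() */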
                up_read(&rbd_dev->header_rwsem);
 -              if (WARN_ON(!snapc)) {
 -                      kfree(img_request);
 -                      return NULL;    /* Shouldn't happen */
 -              }
        }
  
        img_request->rq = NULL;
        img_request->rbd_dev = rbd_dev;
        img_request->offset = offset;
        img_request->length = length;
 -      img_request->write_request = write_request;
 -      if (write_request)
 -              img_request->snapc = snapc;
 -      else
 +      img_request->flags = 0;
 +      if (write_request) {
 +              img_request_write_set(img_request);
 +              img_request->snapc = rbd_dev->header.snapc;
 +      } else {
                img_request->snap_id = rbd_dev->spec->snap_id;
 +      }
 +      if (child_request)
 +              img_request_child_set(img_request);
 +      if (rbd_dev->parent_spec)
 +              img_request_layered_set(img_request);
        spin_lock_init(&img_request->completion_lock);
        img_request->next_completion = 0;
        img_request->callback = NULL;
 +      img_request->result = 0;
        img_request->obj_request_count = 0;
        INIT_LIST_HEAD(&img_request->obj_requests);
        kref_init(&img_request->kref);
@@@ -1897,204 -1600,78 +1897,204 @@@ static void rbd_img_request_destroy(str
                rbd_img_obj_request_del(img_request, obj_request);
        rbd_assert(img_request->obj_request_count == 0);
  
 -      if (img_request->write_request)
 +      if (img_request_write_test(img_request))
                ceph_put_snap_context(img_request->snapc);
  
 -      kfree(img_request);
 +      if (img_request_child_test(img_request))
 +              rbd_obj_request_put(img_request->obj_request);
 +
 +      kmem_cache_free(rbd_img_request_cache, img_request);
 +}
 +
 +static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request;
 +      unsigned int xferred;
 +      int result;
 +      bool more;
 +
 +      rbd_assert(obj_request_img_data_test(obj_request));
 +      img_request = obj_request->img_request;
 +
 +      rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
 +      xferred = (unsigned int)obj_request->xferred;
 +      result = obj_request->result;
 +      if (result) {
 +              struct rbd_device *rbd_dev = img_request->rbd_dev;
 +
 +              rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
 +                      img_request_write_test(img_request) ? "write" : "read",
 +                      obj_request->length, obj_request->img_offset,
 +                      obj_request->offset);
 +              rbd_warn(rbd_dev, "  result %d xferred %x\n",
 +                      result, xferred);
 +              if (!img_request->result)
 +                      img_request->result = result;
 +      }
 +
 +      /* Image object requests don't own their page array */
 +
 +      if (obj_request->type == OBJ_REQUEST_PAGES) {
 +              obj_request->pages = NULL;
 +              obj_request->page_count = 0;
 +      }
 +
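 +      /*
 +       * For a child (parent read) image request there is no block
 +       * request to end; "more" just reports whether further object
 +       * requests remain.  For a top-level request, blk_end_request()
 +       * indicates whether the block layer request still has more
 +       * left to complete.
 +       */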
 +      if (img_request_child_test(img_request)) {
 +              rbd_assert(img_request->obj_request != NULL);
 +              more = obj_request->which < img_request->obj_request_count - 1;
 +      } else {
 +              rbd_assert(img_request->rq != NULL);
 +              more = blk_end_request(img_request->rq, result, xferred);
 +      }
 +
 +      return more;
 +}
 +
 +static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request;
 +      u32 which = obj_request->which;
 +      bool more = true;
 +
 +      rbd_assert(obj_request_img_data_test(obj_request));
 +      img_request = obj_request->img_request;
 +
 +      dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
 +      rbd_assert(img_request != NULL);
 +      rbd_assert(img_request->obj_request_count > 0);
 +      rbd_assert(which != BAD_WHICH);
 +      rbd_assert(which < img_request->obj_request_count);
 +      rbd_assert(which >= img_request->next_completion);
 +
 +      spin_lock_irq(&img_request->completion_lock);
 +      if (which != img_request->next_completion)
 +              goto out;
 +
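 +      /*
 +       * Object requests can complete out of order.  Walk forward
 +       * from this one, ending each request that has already
 +       * finished, and stop at the first one still outstanding;
 +       * that index becomes the new next_completion.
 +       */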
 +      for_each_obj_request_from(img_request, obj_request) {
 +              rbd_assert(more);
 +              rbd_assert(which < img_request->obj_request_count);
 +
 +              if (!obj_request_done_test(obj_request))
 +                      break;
 +              more = rbd_img_obj_end_request(obj_request);
 +              which++;
 +      }
 +
 +      rbd_assert(more ^ (which == img_request->obj_request_count));
 +      img_request->next_completion = which;
 +out:
 +      spin_unlock_irq(&img_request->completion_lock);
 +
 +      if (!more)
 +              rbd_img_request_complete(img_request);
  }
  
 -static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
 -                                      struct bio *bio_list)
 +/*
 + * Split up an image request into one or more object requests, each
 + * to a different object.  The "type" parameter indicates whether
 + * "data_desc" is the pointer to the head of a list of bio
 + * structures, or the base of a page array.  In either case this
 + * function assumes data_desc describes memory sufficient to hold
 + * all data described by the image request.
 + */
 +static int rbd_img_request_fill(struct rbd_img_request *img_request,
 +                                      enum obj_request_type type,
 +                                      void *data_desc)
  {
        struct rbd_device *rbd_dev = img_request->rbd_dev;
        struct rbd_obj_request *obj_request = NULL;
        struct rbd_obj_request *next_obj_request;
 -      unsigned int bio_offset;
 -      u64 image_offset;
 +      bool write_request = img_request_write_test(img_request);
 +      struct bio *bio_list;
 +      unsigned int bio_offset = 0;
 +      struct page **pages;
 +      u64 img_offset;
        u64 resid;
        u16 opcode;
  
 -      dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
 +      dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
 +              (int)type, data_desc);
  
 -      opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
 -                                            : CEPH_OSD_OP_READ;
 -      bio_offset = 0;
 -      image_offset = img_request->offset;
 -      rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
 +      opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
 +      img_offset = img_request->offset;
        resid = img_request->length;
        rbd_assert(resid > 0);
 +
 +      if (type == OBJ_REQUEST_BIO) {
 +              bio_list = data_desc;
 +              rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
 +      } else {
 +              rbd_assert(type == OBJ_REQUEST_PAGES);
 +              pages = data_desc;
 +      }
 +
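 +      /*
 +       * Walk the image extent one object (segment) at a time, giving
 +       * each object request only the bios or pages that cover its
 +       * part of the image request.
 +       */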
        while (resid) {
 +              struct ceph_osd_request *osd_req;
                const char *object_name;
 -              unsigned int clone_size;
 -              struct ceph_osd_req_op *op;
                u64 offset;
                u64 length;
  
 -              object_name = rbd_segment_name(rbd_dev, image_offset);
 +              object_name = rbd_segment_name(rbd_dev, img_offset);
                if (!object_name)
                        goto out_unwind;
 -              offset = rbd_segment_offset(rbd_dev, image_offset);
 -              length = rbd_segment_length(rbd_dev, image_offset, resid);
 +              offset = rbd_segment_offset(rbd_dev, img_offset);
 +              length = rbd_segment_length(rbd_dev, img_offset, resid);
                obj_request = rbd_obj_request_create(object_name,
 -                                              offset, length,
 -                                              OBJ_REQUEST_BIO);
 -              kfree(object_name);     /* object request has its own copy */
 +                                              offset, length, type);
 +              /* object request has its own copy of the object name */
 +              rbd_segment_name_free(object_name);
                if (!obj_request)
                        goto out_unwind;
  
 -              rbd_assert(length <= (u64) UINT_MAX);
 -              clone_size = (unsigned int) length;
 -              obj_request->bio_list = bio_chain_clone_range(&bio_list,
 -                                              &bio_offset, clone_size,
 -                                              GFP_ATOMIC);
 -              if (!obj_request->bio_list)
 -                      goto out_partial;
 +              if (type == OBJ_REQUEST_BIO) {
 +                      unsigned int clone_size;
 +
 +                      rbd_assert(length <= (u64)UINT_MAX);
 +                      clone_size = (unsigned int)length;
 +                      obj_request->bio_list =
 +                                      bio_chain_clone_range(&bio_list,
 +                                                              &bio_offset,
 +                                                              clone_size,
 +                                                              GFP_ATOMIC);
 +                      if (!obj_request->bio_list)
 +                              goto out_partial;
 +              } else {
 +                      unsigned int page_count;
 +
 +                      obj_request->pages = pages;
 +                      page_count = (u32)calc_pages_for(offset, length);
 +                      obj_request->page_count = page_count;
 +                      if ((offset + length) & ~PAGE_MASK)
 +                              page_count--;   /* more on last page */
 +                      pages += page_count;
 +              }
  
 -              /*
 -               * Build up the op to use in building the osd
 -               * request.  Note that the contents of the op are
 -               * copied by rbd_osd_req_create().
 -               */
 -              op = rbd_osd_req_op_create(opcode, offset, length);
 -              if (!op)
 -                      goto out_partial;
 -              obj_request->osd_req = rbd_osd_req_create(rbd_dev,
 -                                              img_request->write_request,
 -                                              obj_request, op);
 -              rbd_osd_req_op_destroy(op);
 -              if (!obj_request->osd_req)
 +              osd_req = rbd_osd_req_create(rbd_dev, write_request,
 +                                              obj_request);
 +              if (!osd_req)
                        goto out_partial;
 -              /* status and version are initially zero-filled */
 +              obj_request->osd_req = osd_req;
 +              obj_request->callback = rbd_img_obj_callback;
 +
 +              osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
 +                                              0, 0);
 +              if (type == OBJ_REQUEST_BIO)
 +                      osd_req_op_extent_osd_data_bio(osd_req, 0,
 +                                      obj_request->bio_list, length);
 +              else
 +                      osd_req_op_extent_osd_data_pages(osd_req, 0,
 +                                      obj_request->pages, length,
 +                                      offset & ~PAGE_MASK, false, false);
  
 +              if (write_request)
 +                      rbd_osd_req_format_write(obj_request);
 +              else
 +                      rbd_osd_req_format_read(obj_request);
 +
 +              obj_request->img_offset = img_offset;
                rbd_img_obj_request_add(img_request, obj_request);
  
 -              image_offset += length;
 +              img_offset += length;
                resid -= length;
        }
  
@@@ -2109,495 -1686,88 +2109,495 @@@ out_unwind
        return -ENOMEM;
  }
  
 -static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
 +static void
 +rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
  {
        struct rbd_img_request *img_request;
 -      u32 which = obj_request->which;
 -      bool more = true;
 +      struct rbd_device *rbd_dev;
 +      u64 length;
 +      u32 page_count;
  
 +      rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
 +      rbd_assert(obj_request_img_data_test(obj_request));
        img_request = obj_request->img_request;
 +      rbd_assert(img_request);
  
 -      dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
 +      rbd_dev = img_request->rbd_dev;
 +      rbd_assert(rbd_dev);
 +      length = (u64)1 << rbd_dev->header.obj_order;
 +      page_count = (u32)calc_pages_for(0, length);
 +
 +      rbd_assert(obj_request->copyup_pages);
 +      ceph_release_page_vector(obj_request->copyup_pages, page_count);
 +      obj_request->copyup_pages = NULL;
 +
 +      /*
 +       * We want the transfer count to reflect the size of the
 +       * original write request.  There is no such thing as a
 +       * successful short write, so if the request was successful
 +       * we can just set it to the originally-requested length.
 +       */
 +      if (!obj_request->result)
 +              obj_request->xferred = obj_request->length;
 +
 +      /* Finish up with the normal image object callback */
 +
 +      rbd_img_obj_callback(obj_request);
 +}
 +
 +static void
 +rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
 +{
 +      struct rbd_obj_request *orig_request;
 +      struct ceph_osd_request *osd_req;
 +      struct ceph_osd_client *osdc;
 +      struct rbd_device *rbd_dev;
 +      struct page **pages;
 +      int result;
 +      u64 obj_size;
 +      u64 xferred;
 +
 +      rbd_assert(img_request_child_test(img_request));
 +
 +      /* First get what we need from the image request */
 +
 +      pages = img_request->copyup_pages;
 +      rbd_assert(pages != NULL);
 +      img_request->copyup_pages = NULL;
 +
 +      orig_request = img_request->obj_request;
 +      rbd_assert(orig_request != NULL);
 +      rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
 +      result = img_request->result;
 +      obj_size = img_request->length;
 +      xferred = img_request->xferred;
 +
 +      rbd_dev = img_request->rbd_dev;
 +      rbd_assert(rbd_dev);
 +      rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
 +
 +      rbd_img_request_put(img_request);
 +
 +      if (result)
 +              goto out_err;
 +
 +      /* Allocate the new copyup osd request for the original request */
 +
 +      result = -ENOMEM;
 +      rbd_assert(!orig_request->osd_req);
 +      osd_req = rbd_osd_req_create_copyup(orig_request);
 +      if (!osd_req)
 +              goto out_err;
 +      orig_request->osd_req = osd_req;
 +      orig_request->copyup_pages = pages;
 +
 +      /* Initialize the copyup op */
 +
 +      osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
 +      osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
 +                                              false, false);
 +
 +      /* Then the original write request op */
 +
 +      osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
 +                                      orig_request->offset,
 +                                      orig_request->length, 0, 0);
 +      osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
 +                                      orig_request->length);
 +
 +      rbd_osd_req_format_write(orig_request);
 +
 +      /* All set, send it off. */
 +
 +      orig_request->callback = rbd_img_obj_copyup_callback;
 +      osdc = &rbd_dev->rbd_client->client->osdc;
 +      result = rbd_obj_request_submit(osdc, orig_request);
 +      if (!result)
 +              return;
 +out_err:
 +      /* Record the error code and complete the request */
 +
 +      orig_request->result = result;
 +      orig_request->xferred = 0;
 +      obj_request_done_set(orig_request);
 +      rbd_obj_request_complete(orig_request);
 +}
 +
 +/*
 + * Read from the parent image the range of data that covers the
 + * entire target of the given object request.  This is used for
 + * satisfying a layered image write request when the target of an
 + * object request from the image request does not exist.
 + *
 + * A page array big enough to hold the returned data is allocated
 + * and supplied to rbd_img_request_fill() as the "data descriptor."
 + * When the read completes, this page array will be transferred to
 + * the original object request for the copyup operation.
 + *
 + * If an error occurs, record it as the result of the original
 + * object request and mark it done so it gets completed.
 + */
 +static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request = NULL;
 +      struct rbd_img_request *parent_request = NULL;
 +      struct rbd_device *rbd_dev;
 +      u64 img_offset;
 +      u64 length;
 +      struct page **pages = NULL;
 +      u32 page_count;
 +      int result;
 +
 +      rbd_assert(obj_request_img_data_test(obj_request));
 +      rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
 +
 +      img_request = obj_request->img_request;
        rbd_assert(img_request != NULL);
 -      rbd_assert(img_request->rq != NULL);
 -      rbd_assert(img_request->obj_request_count > 0);
 -      rbd_assert(which != BAD_WHICH);
 -      rbd_assert(which < img_request->obj_request_count);
 -      rbd_assert(which >= img_request->next_completion);
 +      rbd_dev = img_request->rbd_dev;
 +      rbd_assert(rbd_dev->parent != NULL);
  
 -      spin_lock_irq(&img_request->completion_lock);
 -      if (which != img_request->next_completion)
 -              goto out;
 +      /*
 +       * First things first.  The original osd request is of no
 +       * use to us any more; we'll need a new one that can hold
 +       * the two ops in a copyup request.  We'll get that later,
 +       * but for now we can release the old one.
 +       */
 +      rbd_osd_req_destroy(obj_request->osd_req);
 +      obj_request->osd_req = NULL;
  
 -      for_each_obj_request_from(img_request, obj_request) {
 -              unsigned int xferred;
 -              int result;
 +      /*
 +       * Determine the byte range covered by the object in the
 +       * child image to which the original request was to be sent.
 +       */
 +      img_offset = obj_request->img_offset - obj_request->offset;
 +      length = (u64)1 << rbd_dev->header.obj_order;
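 +      /*
 +       * For example, with obj_order 22 (4 MiB objects), a request at
 +       * img_offset 5 MiB with offset 1 MiB into its object maps to
 +       * the object covering image bytes [4 MiB, 8 MiB).
 +       */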
  
 -              rbd_assert(more);
 -              rbd_assert(which < img_request->obj_request_count);
 +      /*
 +       * There is no defined parent data beyond the parent
 +       * overlap, so limit what we read at that boundary if
 +       * necessary.
 +       */
 +      if (img_offset + length > rbd_dev->parent_overlap) {
 +              rbd_assert(img_offset < rbd_dev->parent_overlap);
 +              length = rbd_dev->parent_overlap - img_offset;
 +      }
  
 -              if (!obj_request_done_test(obj_request))
 -                      break;
 +      /*
 +       * Allocate a page array big enough to receive the data read
 +       * from the parent.
 +       */
 +      page_count = (u32)calc_pages_for(0, length);
 +      pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
 +      if (IS_ERR(pages)) {
 +              result = PTR_ERR(pages);
 +              pages = NULL;
 +              goto out_err;
 +      }
  
 -              rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
 -              xferred = (unsigned int) obj_request->xferred;
 -              result = (int) obj_request->result;
 -              if (result)
 -                      rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
 -                              img_request->write_request ? "write" : "read",
 -                              result, xferred);
 +      result = -ENOMEM;
 +      parent_request = rbd_img_request_create(rbd_dev->parent,
 +                                              img_offset, length,
 +                                              false, true);
 +      if (!parent_request)
 +              goto out_err;
 +      rbd_obj_request_get(obj_request);
 +      parent_request->obj_request = obj_request;
  
 -              more = blk_end_request(img_request->rq, result, xferred);
 -              which++;
 +      result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
 +      if (result)
 +              goto out_err;
 +      parent_request->copyup_pages = pages;
 +
 +      parent_request->callback = rbd_img_obj_parent_read_full_callback;
 +      result = rbd_img_request_submit(parent_request);
 +      if (!result)
 +              return 0;
 +
 +      parent_request->copyup_pages = NULL;
 +      parent_request->obj_request = NULL;
 +      rbd_obj_request_put(obj_request);
 +out_err:
 +      if (pages)
 +              ceph_release_page_vector(pages, page_count);
 +      if (parent_request)
 +              rbd_img_request_put(parent_request);
 +      obj_request->result = result;
 +      obj_request->xferred = 0;
 +      obj_request_done_set(obj_request);
 +
 +      return result;
 +}
 +
 +static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_obj_request *orig_request;
 +      int result;
 +
 +      rbd_assert(!obj_request_img_data_test(obj_request));
 +
 +      /*
 +       * All we need from the object request is the original
 +       * request and the result of the STAT op.  Grab those, then
 +       * we're done with the request.
 +       */
 +      orig_request = obj_request->obj_request;
 +      obj_request->obj_request = NULL;
 +      rbd_assert(orig_request);
 +      rbd_assert(orig_request->img_request);
 +
 +      result = obj_request->result;
 +      obj_request->result = 0;
 +
 +      dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
 +              obj_request, orig_request, result,
 +              obj_request->xferred, obj_request->length);
 +      rbd_obj_request_put(obj_request);
 +
 +      rbd_assert(orig_request);
 +      rbd_assert(orig_request->img_request);
 +
 +      /*
 +       * Our only purpose here is to determine whether the object
 +       * exists, and we don't want to treat the non-existence as
 +       * an error.  If something else comes back, transfer the
 +       * error to the original request and complete it now.
 +       */
 +      if (!result) {
 +              obj_request_existence_set(orig_request, true);
 +      } else if (result == -ENOENT) {
 +              obj_request_existence_set(orig_request, false);
 +      } else if (result) {
 +              orig_request->result = result;
 +              goto out;
        }
  
 -      rbd_assert(more ^ (which == img_request->obj_request_count));
 -      img_request->next_completion = which;
 +      /*
 +       * Resubmit the original request now that we have recorded
 +       * whether the target object exists.
 +       */
 +      orig_request->result = rbd_img_obj_request_submit(orig_request);
  out:
 -      spin_unlock_irq(&img_request->completion_lock);
 +      if (orig_request->result)
 +              rbd_obj_request_complete(orig_request);
 +      rbd_obj_request_put(orig_request);
 +}
  
 -      if (!more)
 -              rbd_img_request_complete(img_request);
 +static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_obj_request *stat_request;
 +      struct rbd_device *rbd_dev;
 +      struct ceph_osd_client *osdc;
 +      struct page **pages = NULL;
 +      u32 page_count;
 +      size_t size;
 +      int ret;
 +
 +      /*
 +       * The response data for a STAT call consists of:
 +       *     le64 length;
 +       *     struct {
 +       *         le32 tv_sec;
 +       *         le32 tv_nsec;
 +       *     } mtime;
 +       */
 +      size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
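 +      /* 16 bytes in all, so the page vector below is a single page */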
 +      page_count = (u32)calc_pages_for(0, size);
 +      pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
 +      if (IS_ERR(pages))
 +              return PTR_ERR(pages);
 +
 +      ret = -ENOMEM;
 +      stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
 +                                                      OBJ_REQUEST_PAGES);
 +      if (!stat_request)
 +              goto out;
 +
 +      rbd_obj_request_get(obj_request);
 +      stat_request->obj_request = obj_request;
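 +      /*
 +       * This link lets the exists callback find the original object
 +       * request so it can record existence and resubmit it.
 +       */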
 +      stat_request->pages = pages;
 +      stat_request->page_count = page_count;
 +
 +      rbd_assert(obj_request->img_request);
 +      rbd_dev = obj_request->img_request->rbd_dev;
 +      stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
 +                                              stat_request);
 +      if (!stat_request->osd_req)
 +              goto out;
 +      stat_request->callback = rbd_img_obj_exists_callback;
 +
 +      osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
 +      osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
 +                                      false, false);
 +      rbd_osd_req_format_read(stat_request);
 +
 +      osdc = &rbd_dev->rbd_client->client->osdc;
 +      ret = rbd_obj_request_submit(osdc, stat_request);
 +out:
 +      if (ret)
 +              rbd_obj_request_put(obj_request);
 +
 +      return ret;
 +}
 +
 +static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request;
 +      struct rbd_device *rbd_dev;
 +      bool known;
 +
 +      rbd_assert(obj_request_img_data_test(obj_request));
 +
 +      img_request = obj_request->img_request;
 +      rbd_assert(img_request);
 +      rbd_dev = img_request->rbd_dev;
 +
 +      /*
 +       * Only writes to layered images need special handling.
 +       * Reads and non-layered writes are simple object requests.
 +       * Layered writes that start beyond the end of the overlap
 +       * with the parent have no parent data, so they too are
 +       * simple object requests.  Finally, if the target object is
 +       * known to already exist, its parent data has already been
 +       * copied, so a write to the object can also be handled as a
 +       * simple object request.
 +       */
 +      if (!img_request_write_test(img_request) ||
 +              !img_request_layered_test(img_request) ||
 +              rbd_dev->parent_overlap <= obj_request->img_offset ||
 +              ((known = obj_request_known_test(obj_request)) &&
 +                      obj_request_exists_test(obj_request))) {
 +
 +              struct rbd_device *rbd_dev;
 +              struct ceph_osd_client *osdc;
 +
 +              rbd_dev = obj_request->img_request->rbd_dev;
 +              osdc = &rbd_dev->rbd_client->client->osdc;
 +
 +              return rbd_obj_request_submit(osdc, obj_request);
 +      }
 +
 +      /*
 +       * It's a layered write.  The target object might exist but
 +       * we may not know that yet.  If we know it doesn't exist,
 +       * start by reading the data for the full target object from
 +       * the parent so we can use it for a copyup to the target.
 +       */
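 +      /*
 +       * Note that if we get here every branch of the condition above
 +       * evaluated false, so "known" was assigned by its last clause
 +       * and is safe to test.
 +       */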
 +      if (known)
 +              return rbd_img_obj_parent_read_full(obj_request);
 +
 +      /* We don't know whether the target exists.  Go find out. */
 +
 +      return rbd_img_obj_exists_submit(obj_request);
  }
  
  static int rbd_img_request_submit(struct rbd_img_request *img_request)
  {
 -      struct rbd_device *rbd_dev = img_request->rbd_dev;
 -      struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
 +      struct rbd_obj_request *next_obj_request;
  
        dout("%s: img %p\n", __func__, img_request);
 -      for_each_obj_request(img_request, obj_request) {
 +      for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
                int ret;
  
 -              obj_request->callback = rbd_img_obj_callback;
 -              ret = rbd_obj_request_submit(osdc, obj_request);
 +              ret = rbd_img_obj_request_submit(obj_request);
                if (ret)
                        return ret;
 -              /*
 -               * The image request has its own reference to each
 -               * of its object requests, so we can safely drop the
 -               * initial one here.
 -               */
 -              rbd_obj_request_put(obj_request);
        }
  
 -      return 0;
 +      return 0;
 +}
 +
 +static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
 +{
 +      struct rbd_obj_request *obj_request;
 +      struct rbd_device *rbd_dev;
 +      u64 obj_end;
 +
 +      rbd_assert(img_request_child_test(img_request));
 +
 +      obj_request = img_request->obj_request;
 +      rbd_assert(obj_request);
 +      rbd_assert(obj_request->img_request);
 +
 +      obj_request->result = img_request->result;
 +      if (obj_request->result)
 +              goto out;
 +
 +      /*
 +       * We need to zero anything beyond the parent overlap
 +       * boundary.  Since rbd_img_obj_request_read_callback()
 +       * will zero anything beyond the end of a short read, an
 +       * easy way to do this is to pretend the data from the
 +       * parent came up short--ending at the overlap boundary.
 +       */
 +      rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
 +      obj_end = obj_request->img_offset + obj_request->length;
 +      rbd_dev = obj_request->img_request->rbd_dev;
 +      if (obj_end > rbd_dev->parent_overlap) {
 +              u64 xferred = 0;
 +
 +              if (obj_request->img_offset < rbd_dev->parent_overlap)
 +                      xferred = rbd_dev->parent_overlap -
 +                                      obj_request->img_offset;
 +
 +              obj_request->xferred = min(img_request->xferred, xferred);
 +      } else {
 +              obj_request->xferred = img_request->xferred;
 +      }
 +out:
 +      rbd_img_request_put(img_request);
 +      rbd_img_obj_request_read_callback(obj_request);
 +      rbd_obj_request_complete(obj_request);
 +}
 +
 +static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_device *rbd_dev;
 +      struct rbd_img_request *img_request;
 +      int result;
 +
 +      rbd_assert(obj_request_img_data_test(obj_request));
 +      rbd_assert(obj_request->img_request != NULL);
 +      rbd_assert(obj_request->result == (s32) -ENOENT);
 +      rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
 +
 +      rbd_dev = obj_request->img_request->rbd_dev;
 +      rbd_assert(rbd_dev->parent != NULL);
 +      /* rbd_read_finish(obj_request, obj_request->length); */
 +      img_request = rbd_img_request_create(rbd_dev->parent,
 +                                              obj_request->img_offset,
 +                                              obj_request->length,
 +                                              false, true);
 +      result = -ENOMEM;
 +      if (!img_request)
 +              goto out_err;
 +
 +      rbd_obj_request_get(obj_request);
 +      img_request->obj_request = obj_request;
 +
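 +      /*
 +       * The child request is filled with the original request's bio
 +       * list, so data read from the parent lands directly in the
 +       * caller's buffers; rbd_img_parent_read_callback() then fixes
 +       * up xferred and completes the original object request.
 +       */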
 +      result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
 +                                      obj_request->bio_list);
 +      if (result)
 +              goto out_err;
 +
 +      img_request->callback = rbd_img_parent_read_callback;
 +      result = rbd_img_request_submit(img_request);
 +      if (result)
 +              goto out_err;
 +
 +      return;
 +out_err:
 +      if (img_request)
 +              rbd_img_request_put(img_request);
 +      obj_request->result = result;
 +      obj_request->xferred = 0;
 +      obj_request_done_set(obj_request);
  }
  
 -static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
 -                                 u64 ver, u64 notify_id)
 +static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
  {
        struct rbd_obj_request *obj_request;
 -      struct ceph_osd_req_op *op;
 -      struct ceph_osd_client *osdc;
 +      struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        int ret;
  
        obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
                return -ENOMEM;
  
        ret = -ENOMEM;
 -      op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
 -      if (!op)
 -              goto out;
 -      obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
 -                                              obj_request, op);
 -      rbd_osd_req_op_destroy(op);
 +      obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
        if (!obj_request->osd_req)
                goto out;
 -
 -      osdc = &rbd_dev->rbd_client->client->osdc;
        obj_request->callback = rbd_obj_request_put;
 +
 +      osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
 +                                      notify_id, 0, 0);
 +      rbd_osd_req_format_read(obj_request);
 +
        ret = rbd_obj_request_submit(osdc, obj_request);
  out:
        if (ret)
  static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
  {
        struct rbd_device *rbd_dev = (struct rbd_device *)data;
 -      u64 hver;
 -      int rc;
  
        if (!rbd_dev)
                return;
  
        dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
 -              rbd_dev->header_name, (unsigned long long) notify_id,
 -              (unsigned int) opcode);
 -      rc = rbd_dev_refresh(rbd_dev, &hver);
 -      if (rc)
 -              rbd_warn(rbd_dev, "got notification but failed to "
 -                         " update snaps: %d\n", rc);
 +              rbd_dev->header_name, (unsigned long long)notify_id,
 +              (unsigned int)opcode);
 +      (void)rbd_dev_refresh(rbd_dev);
  
 -      rbd_obj_notify_ack(rbd_dev, hver, notify_id);
 +      rbd_obj_notify_ack(rbd_dev, notify_id);
  }
  
  /*
@@@ -2646,6 -1823,7 +2646,6 @@@ static int rbd_dev_header_watch_sync(st
  {
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
 -      struct ceph_osd_req_op *op;
        int ret;
  
        rbd_assert(start ^ !!rbd_dev->watch_event);
        if (!obj_request)
                goto out_cancel;
  
 -      op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
 -                              rbd_dev->watch_event->cookie,
 -                              rbd_dev->header.obj_version, start);
 -      if (!op)
 -              goto out_cancel;
 -      obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
 -                                                      obj_request, op);
 -      rbd_osd_req_op_destroy(op);
 +      obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
        if (!obj_request->osd_req)
                goto out_cancel;
  
        else
                ceph_osdc_unregister_linger_request(osdc,
                                        rbd_dev->watch_request->osd_req);
 +
 +      osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
 +                              rbd_dev->watch_event->cookie, 0, start);
 +      rbd_osd_req_format_write(obj_request);
 +
        ret = rbd_obj_request_submit(osdc, obj_request);
        if (ret)
                goto out_cancel;
@@@ -2718,38 -1898,40 +2718,38 @@@ out_cancel
  }
  
  /*
 - * Synchronous osd object method call
 + * Synchronous osd object method call.  Returns the number of bytes
 + * returned in the inbound buffer, or a negative error code.
   */
  static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
                             const char *object_name,
                             const char *class_name,
                             const char *method_name,
 -                           const char *outbound,
 +                           const void *outbound,
                             size_t outbound_size,
 -                           char *inbound,
 -                           size_t inbound_size,
 -                           u64 *version)
 +                           void *inbound,
 +                           size_t inbound_size)
  {
 +      struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
 -      struct ceph_osd_client *osdc;
 -      struct ceph_osd_req_op *op;
        struct page **pages;
        u32 page_count;
        int ret;
  
        /*
 -       * Method calls are ultimately read operations but they
 -       * don't involve object data (so no offset or length).
 -       * The result should placed into the inbound buffer
 -       * provided.  They also supply outbound data--parameters for
 -       * the object method.  Currently if this is present it will
 -       * be a snapshot id.
 +       * Method calls are ultimately read operations.  The result
 +       * should be placed into the inbound buffer provided.  They
 +       * also supply outbound data--parameters for the object
 +       * method.  Currently if this is present it will be a
 +       * snapshot id.
         */
 -      page_count = (u32) calc_pages_for(0, inbound_size);
 +      page_count = (u32)calc_pages_for(0, inbound_size);
        pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
        if (IS_ERR(pages))
                return PTR_ERR(pages);
  
        ret = -ENOMEM;
 -      obj_request = rbd_obj_request_create(object_name, 0, 0,
 +      obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
                                                        OBJ_REQUEST_PAGES);
        if (!obj_request)
                goto out;
        obj_request->pages = pages;
        obj_request->page_count = page_count;
  
 -      op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
 -                                      method_name, outbound, outbound_size);
 -      if (!op)
 -              goto out;
 -      obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
 -                                              obj_request, op);
 -      rbd_osd_req_op_destroy(op);
 +      obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
        if (!obj_request->osd_req)
                goto out;
  
 -      osdc = &rbd_dev->rbd_client->client->osdc;
 +      osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
 +                                      class_name, method_name);
 +      if (outbound_size) {
 +              struct ceph_pagelist *pagelist;
 +
 +              pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
 +              if (!pagelist)
 +                      goto out;
 +
 +              ceph_pagelist_init(pagelist);
 +              ceph_pagelist_append(pagelist, outbound, outbound_size);
 +              osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
 +                                              pagelist);
 +      }
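 +      /* the method's reply data is returned in obj_request->pages */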
 +      osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
 +                                      obj_request->pages, inbound_size,
 +                                      0, false, false);
 +      rbd_osd_req_format_read(obj_request);
 +
        ret = rbd_obj_request_submit(osdc, obj_request);
        if (ret)
                goto out;
        ret = obj_request->result;
        if (ret < 0)
                goto out;
 -      ret = 0;
 +
 +      rbd_assert(obj_request->xferred < (u64)INT_MAX);
 +      ret = (int)obj_request->xferred;
        ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
 -      if (version)
 -              *version = obj_request->version;
  out:
        if (obj_request)
                rbd_obj_request_put(obj_request);
@@@ -2863,22 -2033,18 +2863,22 @@@ static void rbd_request_fn(struct reque
                }
  
                result = -EINVAL;
 -              if (WARN_ON(offset && length > U64_MAX - offset + 1))
 +              if (offset && length > U64_MAX - offset + 1) {
 +                      rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
 +                              offset, length);
                        goto end_request;       /* Shouldn't happen */
 +              }
  
                result = -ENOMEM;
                img_request = rbd_img_request_create(rbd_dev, offset, length,
 -                                                      write_request);
 +                                                      write_request, false);
                if (!img_request)
                        goto end_request;
  
                img_request->rq = rq;
  
 -              result = rbd_img_request_fill_bio(img_request, rq->bio);
 +              result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
 +                                              rq->bio);
                if (!result)
                        result = rbd_img_request_submit(img_request);
                if (result)
  end_request:
                spin_lock_irq(q->queue_lock);
                if (result < 0) {
 -                      rbd_warn(rbd_dev, "obj_request %s result %d\n",
 -                              write_request ? "write" : "read", result);
 +                      rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
 +                              write_request ? "write" : "read",
 +                              length, offset, result);
 +
                        __blk_end_request_all(rq, result);
                }
        }
@@@ -2948,22 -2112,22 +2948,22 @@@ static void rbd_free_disk(struct rbd_de
        if (!disk)
                return;
  
 -      if (disk->flags & GENHD_FL_UP)
 +      rbd_dev->disk = NULL;
 +      if (disk->flags & GENHD_FL_UP) {
                del_gendisk(disk);
 -      if (disk->queue)
 -              blk_cleanup_queue(disk->queue);
 +              if (disk->queue)
 +                      blk_cleanup_queue(disk->queue);
 +      }
        put_disk(disk);
  }
  
  static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
                                const char *object_name,
 -                              u64 offset, u64 length,
 -                              char *buf, u64 *version)
 +                              u64 offset, u64 length, void *buf)
  
  {
 -      struct ceph_osd_req_op *op;
 +      struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
 -      struct ceph_osd_client *osdc;
        struct page **pages = NULL;
        u32 page_count;
        size_t size;
        obj_request->pages = pages;
        obj_request->page_count = page_count;
  
 -      op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
 -      if (!op)
 -              goto out;
 -      obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
 -                                              obj_request, op);
 -      rbd_osd_req_op_destroy(op);
 +      obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
        if (!obj_request->osd_req)
                goto out;
  
 -      osdc = &rbd_dev->rbd_client->client->osdc;
 +      osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
 +                                      offset, length, 0, 0);
 +      osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
 +                                      obj_request->pages,
 +                                      obj_request->length,
 +                                      obj_request->offset & ~PAGE_MASK,
 +                                      false, false);
 +      rbd_osd_req_format_read(obj_request);
 +
        ret = rbd_obj_request_submit(osdc, obj_request);
        if (ret)
                goto out;
        rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
        size = (size_t) obj_request->xferred;
        ceph_copy_from_page_vector(pages, buf, 0, size);
 -      rbd_assert(size <= (size_t) INT_MAX);
 -      ret = (int) size;
 -      if (version)
 -              *version = obj_request->version;
 +      rbd_assert(size <= (size_t)INT_MAX);
 +      ret = (int)size;
  out:
        if (obj_request)
                rbd_obj_request_put(obj_request);
   * Returns a pointer-coded errno if a failure occurs.
   */
  static struct rbd_image_header_ondisk *
 -rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
 +rbd_dev_v1_header_read(struct rbd_device *rbd_dev)
  {
        struct rbd_image_header_ondisk *ondisk = NULL;
        u32 snap_count = 0;
                        return ERR_PTR(-ENOMEM);
  
                ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
 -                                     0, size,
 -                                     (char *) ondisk, version);
 +                                     0, size, ondisk);
                if (ret < 0)
                        goto out_err;
 -              if (WARN_ON((size_t) ret < size)) {
 +              if ((size_t)ret < size) {
                        rbd_warn(rbd_dev, "short header read (want %zd got %d)",
                                size, ret);
                        ret = -ENXIO;
@@@ -3095,36 -2259,46 +3095,36 @@@ static int rbd_read_header(struct rbd_d
                           struct rbd_image_header *header)
  {
        struct rbd_image_header_ondisk *ondisk;
 -      u64 ver = 0;
        int ret;
  
 -      ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
 +      ondisk = rbd_dev_v1_header_read(rbd_dev);
        if (IS_ERR(ondisk))
                return PTR_ERR(ondisk);
        ret = rbd_header_from_disk(header, ondisk);
 -      if (ret >= 0)
 -              header->obj_version = ver;
        kfree(ondisk);
  
        return ret;
  }
  
 -static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
 -{
 -      struct rbd_snap *snap;
 -      struct rbd_snap *next;
 -
 -      list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
 -              rbd_remove_snap_dev(snap);
 -}
 -
  static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
  {
 -      sector_t size;
 -
        if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
                return;
  
 -      size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
 -      dout("setting size to %llu sectors", (unsigned long long) size);
 -      rbd_dev->mapping.size = (u64) size;
 -      set_capacity(rbd_dev->disk, size);
 +      if (rbd_dev->mapping.size != rbd_dev->header.image_size) {
 +              sector_t size;
 +
 +              rbd_dev->mapping.size = rbd_dev->header.image_size;
 +              size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
 +              dout("setting size to %llu sectors", (unsigned long long)size);
 +              set_capacity(rbd_dev->disk, size);
 +      }
  }
  
  /*
   * only read the first part of the ondisk header, without the snaps info
   */
 -static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
 +static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev)
  {
        int ret;
        struct rbd_image_header h;
        /* osd requests may still refer to snapc */
        ceph_put_snap_context(rbd_dev->header.snapc);
  
 -      if (hver)
 -              *hver = h.obj_version;
 -      rbd_dev->header.obj_version = h.obj_version;
        rbd_dev->header.image_size = h.image_size;
        rbd_dev->header.snapc = h.snapc;
        rbd_dev->header.snap_names = h.snap_names;
        rbd_dev->header.snap_sizes = h.snap_sizes;
        /* Free the extra copy of the object prefix */
 -      WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
 +      if (strcmp(rbd_dev->header.object_prefix, h.object_prefix))
 +              rbd_warn(rbd_dev, "object prefix changed (ignoring)");
        kfree(h.object_prefix);
  
 -      ret = rbd_dev_snaps_update(rbd_dev);
 -      if (!ret)
 -              ret = rbd_dev_snaps_register(rbd_dev);
 -
        up_write(&rbd_dev->header_rwsem);
  
        return ret;
  }
  
 -static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
 +/*
 + * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
 + * has disappeared from the (just updated) snapshot context.
 + */
 +static void rbd_exists_validate(struct rbd_device *rbd_dev)
 +{
 +      u64 snap_id;
 +
 +      if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
 +              return;
 +
 +      snap_id = rbd_dev->spec->snap_id;
 +      if (snap_id == CEPH_NOSNAP)
 +              return;
 +
 +      if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
 +              clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 +}
 +
 +static int rbd_dev_refresh(struct rbd_device *rbd_dev)
  {
 +      u64 image_size;
        int ret;
  
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 +      image_size = rbd_dev->header.image_size;
        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        if (rbd_dev->image_format == 1)
 -              ret = rbd_dev_v1_refresh(rbd_dev, hver);
 +              ret = rbd_dev_v1_refresh(rbd_dev);
        else
 -              ret = rbd_dev_v2_refresh(rbd_dev, hver);
 +              ret = rbd_dev_v2_refresh(rbd_dev);
 +
 +      /* If it's a mapped snapshot, validate its EXISTS flag */
 +
 +      rbd_exists_validate(rbd_dev);
        mutex_unlock(&ctl_mutex);
 +      if (ret)
 +              rbd_warn(rbd_dev, "got notification but failed to "
 +                         "update snaps: %d\n", ret);
 +      if (image_size != rbd_dev->header.image_size)
 +              revalidate_disk(rbd_dev->disk);
  
        return ret;
  }
@@@ -3243,6 -2393,8 +3243,6 @@@ static int rbd_init_disk(struct rbd_dev
  
        rbd_dev->disk = disk;
  
 -      set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
 -
        return 0;
  out_disk:
        put_disk(disk);
@@@ -3263,9 -2415,13 +3263,9 @@@ static ssize_t rbd_size_show(struct dev
                             struct device_attribute *attr, char *buf)
  {
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 -      sector_t size;
  
 -      down_read(&rbd_dev->header_rwsem);
 -      size = get_capacity(rbd_dev->disk);
 -      up_read(&rbd_dev->header_rwsem);
 -
 -      return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
 +      return sprintf(buf, "%llu\n",
 +              (unsigned long long)rbd_dev->mapping.size);
  }
  
  /*
@@@ -3278,7 -2434,7 +3278,7 @@@ static ssize_t rbd_features_show(struc
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
  
        return sprintf(buf, "0x%016llx\n",
 -                      (unsigned long long) rbd_dev->mapping.features);
 +                      (unsigned long long)rbd_dev->mapping.features);
  }
  
  static ssize_t rbd_major_show(struct device *dev,
  {
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
  
 -      return sprintf(buf, "%d\n", rbd_dev->major);
 +      if (rbd_dev->major)
 +              return sprintf(buf, "%d\n", rbd_dev->major);
 +
 +      return sprintf(buf, "(none)\n");
 +
  }
  
  static ssize_t rbd_client_id_show(struct device *dev,
@@@ -3316,7 -2468,7 +3316,7 @@@ static ssize_t rbd_pool_id_show(struct 
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
  
        return sprintf(buf, "%llu\n",
 -              (unsigned long long) rbd_dev->spec->pool_id);
 +                      (unsigned long long) rbd_dev->spec->pool_id);
  }
  
  static ssize_t rbd_name_show(struct device *dev,
@@@ -3402,7 -2554,7 +3402,7 @@@ static ssize_t rbd_image_refresh(struc
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
        int ret;
  
 -      ret = rbd_dev_refresh(rbd_dev, NULL);
 +      ret = rbd_dev_refresh(rbd_dev);
  
        return ret < 0 ? ret : size;
  }
@@@ -3453,6 -2605,71 +3453,6 @@@ static struct device_type rbd_device_ty
        .release        = rbd_sysfs_dev_release,
  };
  
 -
 -/*
 -  sysfs - snapshots
 -*/
 -
 -static ssize_t rbd_snap_size_show(struct device *dev,
 -                                struct device_attribute *attr,
 -                                char *buf)
 -{
 -      struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
 -
 -      return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
 -}
 -
 -static ssize_t rbd_snap_id_show(struct device *dev,
 -                              struct device_attribute *attr,
 -                              char *buf)
 -{
 -      struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
 -
 -      return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
 -}
 -
 -static ssize_t rbd_snap_features_show(struct device *dev,
 -                              struct device_attribute *attr,
 -                              char *buf)
 -{
 -      struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
 -
 -      return sprintf(buf, "0x%016llx\n",
 -                      (unsigned long long) snap->features);
 -}
 -
 -static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
 -static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
 -static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
 -
 -static struct attribute *rbd_snap_attrs[] = {
 -      &dev_attr_snap_size.attr,
 -      &dev_attr_snap_id.attr,
 -      &dev_attr_snap_features.attr,
 -      NULL,
 -};
 -
 -static struct attribute_group rbd_snap_attr_group = {
 -      .attrs = rbd_snap_attrs,
 -};
 -
 -static void rbd_snap_dev_release(struct device *dev)
 -{
 -      struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
 -      kfree(snap->name);
 -      kfree(snap);
 -}
 -
 -static const struct attribute_group *rbd_snap_attr_groups[] = {
 -      &rbd_snap_attr_group,
 -      NULL
 -};
 -
 -static struct device_type rbd_snap_device_type = {
 -      .groups         = rbd_snap_attr_groups,
 -      .release        = rbd_snap_dev_release,
 -};
 -
  static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
  {
        kref_get(&spec->kref);
@@@ -3476,6 -2693,8 +3476,6 @@@ static struct rbd_spec *rbd_spec_alloc(
                return NULL;
        kref_init(&spec->kref);
  
 -      rbd_spec_put(rbd_spec_get(spec));       /* TEMPORARY */
 -
        return spec;
  }
  
@@@ -3502,6 -2721,7 +3502,6 @@@ static struct rbd_device *rbd_dev_creat
        spin_lock_init(&rbd_dev->lock);
        rbd_dev->flags = 0;
        INIT_LIST_HEAD(&rbd_dev->node);
 -      INIT_LIST_HEAD(&rbd_dev->snaps);
        init_rwsem(&rbd_dev->header_rwsem);
  
        rbd_dev->spec = spec;
  
  static void rbd_dev_destroy(struct rbd_device *rbd_dev)
  {
 -      rbd_spec_put(rbd_dev->parent_spec);
 -      kfree(rbd_dev->header_name);
        rbd_put_client(rbd_dev->rbd_client);
        rbd_spec_put(rbd_dev->spec);
        kfree(rbd_dev);
  }
  
 -static bool rbd_snap_registered(struct rbd_snap *snap)
 -{
 -      bool ret = snap->dev.type == &rbd_snap_device_type;
 -      bool reg = device_is_registered(&snap->dev);
 -
 -      rbd_assert(!ret ^ reg);
 -
 -      return ret;
 -}
 -
 -static void rbd_remove_snap_dev(struct rbd_snap *snap)
 -{
 -      list_del(&snap->node);
 -      if (device_is_registered(&snap->dev))
 -              device_unregister(&snap->dev);
 -}
 -
 -static int rbd_register_snap_dev(struct rbd_snap *snap,
 -                                struct device *parent)
 -{
 -      struct device *dev = &snap->dev;
 -      int ret;
 -
 -      dev->type = &rbd_snap_device_type;
 -      dev->parent = parent;
 -      dev->release = rbd_snap_dev_release;
 -      dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
 -      dout("%s: registering device for snapshot %s\n", __func__, snap->name);
 -
 -      ret = device_register(dev);
 -
 -      return ret;
 -}
 -
 -static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
 -                                              const char *snap_name,
 -                                              u64 snap_id, u64 snap_size,
 -                                              u64 snap_features)
 -{
 -      struct rbd_snap *snap;
 -      int ret;
 -
 -      snap = kzalloc(sizeof (*snap), GFP_KERNEL);
 -      if (!snap)
 -              return ERR_PTR(-ENOMEM);
 -
 -      ret = -ENOMEM;
 -      snap->name = kstrdup(snap_name, GFP_KERNEL);
 -      if (!snap->name)
 -              goto err;
 -
 -      snap->id = snap_id;
 -      snap->size = snap_size;
 -      snap->features = snap_features;
 -
 -      return snap;
 -
 -err:
 -      kfree(snap->name);
 -      kfree(snap);
 -
 -      return ERR_PTR(ret);
 -}
 -
 -static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
 -              u64 *snap_size, u64 *snap_features)
 -{
 -      char *snap_name;
 -
 -      rbd_assert(which < rbd_dev->header.snapc->num_snaps);
 -
 -      *snap_size = rbd_dev->header.snap_sizes[which];
 -      *snap_features = 0;     /* No features for v1 */
 -
 -      /* Skip over names until we find the one we are looking for */
 -
 -      snap_name = rbd_dev->header.snap_names;
 -      while (which--)
 -              snap_name += strlen(snap_name) + 1;
 -
 -      return snap_name;
 -}
 -
  /*
   * Get the size and object order for an image snapshot, or if
   * snap_id is CEPH_NOSNAP, gets this information for the base
@@@ -3541,21 -2846,18 +3541,21 @@@ static int _rbd_dev_v2_snap_size(struc
  
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_size",
 -                              (char *) &snapid, sizeof (snapid),
 -                              (char *) &size_buf, sizeof (size_buf), NULL);
 +                              &snapid, sizeof (snapid),
 +                              &size_buf, sizeof (size_buf));
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
 +      if (ret < sizeof (size_buf))
 +              return -ERANGE;
  
 -      *order = size_buf.order;
 +      if (order)
 +              *order = size_buf.order;
        *snap_size = le64_to_cpu(size_buf.size);
  
        dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
 -              (unsigned long long) snap_id, (unsigned int) *order,
 -              (unsigned long long) *snap_size);
 +              (unsigned long long)snap_id, (unsigned int)*order,
 +              (unsigned long long)*snap_size);
  
        return 0;
  }
@@@ -3578,16 -2880,17 +3578,16 @@@ static int rbd_dev_v2_object_prefix(str
                return -ENOMEM;
  
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 -                              "rbd", "get_object_prefix",
 -                              NULL, 0,
 -                              reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
 +                              "rbd", "get_object_prefix", NULL, 0,
 +                              reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out;
  
        p = reply_buf;
        rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
 -                                              p + RBD_OBJ_PREFIX_LEN_MAX,
 -                                              NULL, GFP_NOIO);
 +                                              p + ret, NULL, GFP_NOIO);
 +      ret = 0;
  
        if (IS_ERR(rbd_dev->header.object_prefix)) {
                ret = PTR_ERR(rbd_dev->header.object_prefix);
        } else {
                dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
        }
 -
  out:
        kfree(reply_buf);
  
@@@ -3608,30 -2912,29 +3608,30 @@@ static int _rbd_dev_v2_snap_features(st
        struct {
                __le64 features;
                __le64 incompat;
 -      } features_buf = { 0 };
 +      } __attribute__ ((packed)) features_buf = { 0 };
        u64 incompat;
        int ret;
  
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_features",
 -                              (char *) &snapid, sizeof (snapid),
 -                              (char *) &features_buf, sizeof (features_buf),
 -                              NULL);
 +                              &snapid, sizeof (snapid),
 +                              &features_buf, sizeof (features_buf));
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
 +      if (ret < sizeof (features_buf))
 +              return -ERANGE;
  
        incompat = le64_to_cpu(features_buf.incompat);
 -      if (incompat & ~RBD_FEATURES_ALL)
 +      if (incompat & ~RBD_FEATURES_SUPPORTED)
                return -ENXIO;
  
        *snap_features = le64_to_cpu(features_buf.features);
  
        dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
 -              (unsigned long long) snap_id,
 -              (unsigned long long) *snap_features,
 -              (unsigned long long) le64_to_cpu(features_buf.incompat));
 +              (unsigned long long)snap_id,
 +              (unsigned long long)*snap_features,
 +              (unsigned long long)le64_to_cpu(features_buf.incompat));
  
        return 0;
  }
@@@ -3671,15 -2974,15 +3671,15 @@@ static int rbd_dev_v2_parent_info(struc
        snapid = cpu_to_le64(CEPH_NOSNAP);
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_parent",
 -                              (char *) &snapid, sizeof (snapid),
 -                              (char *) reply_buf, size, NULL);
 +                              &snapid, sizeof (snapid),
 +                              reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out_err;
  
 -      ret = -ERANGE;
        p = reply_buf;
 -      end = (char *) reply_buf + size;
 +      end = reply_buf + ret;
 +      ret = -ERANGE;
        ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
        if (parent_spec->pool_id == CEPH_NOPOOL)
                goto out;       /* No parent?  No problem. */
        /* The ceph file layout needs to fit pool id in 32 bits */
  
        ret = -EIO;
 -      if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
 -              goto out;
 +      if (parent_spec->pool_id > (u64)U32_MAX) {
 +              rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
 +                      (unsigned long long)parent_spec->pool_id, U32_MAX);
 +              goto out_err;
 +      }
  
        image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
        if (IS_ERR(image_id)) {
@@@ -3714,56 -3014,6 +3714,56 @@@ out_err
        return ret;
  }
  
 +static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
 +{
 +      struct {
 +              __le64 stripe_unit;
 +              __le64 stripe_count;
 +      } __attribute__ ((packed)) striping_info_buf = { 0 };
 +      size_t size = sizeof (striping_info_buf);
 +      void *p;
 +      u64 obj_size;
 +      u64 stripe_unit;
 +      u64 stripe_count;
 +      int ret;
 +
 +      ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 +                              "rbd", "get_stripe_unit_count", NULL, 0,
 +                              (char *)&striping_info_buf, size);
 +      dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 +      if (ret < 0)
 +              return ret;
 +      if (ret < size)
 +              return -ERANGE;
 +
 +      /*
 +       * We don't actually support the "fancy striping" feature
 +       * (STRIPINGV2) yet, but if the striping sizes are the
 +       * defaults the behavior is the same as before.  So find
 +       * out, and only fail if the image has non-default values.
 +       */
 +      ret = -EINVAL;
 +      obj_size = (u64)1 << rbd_dev->header.obj_order;
 +      p = &striping_info_buf;
 +      stripe_unit = ceph_decode_64(&p);
 +      if (stripe_unit != obj_size) {
 +              rbd_warn(rbd_dev, "unsupported stripe unit "
 +                              "(got %llu want %llu)",
 +                              stripe_unit, obj_size);
 +              return -EINVAL;
 +      }
 +      stripe_count = ceph_decode_64(&p);
 +      if (stripe_count != 1) {
 +              rbd_warn(rbd_dev, "unsupported stripe count "
 +                              "(got %llu want 1)", stripe_count);
 +              return -EINVAL;
 +      }
 +      rbd_dev->header.stripe_unit = stripe_unit;
 +      rbd_dev->header.stripe_count = stripe_count;
 +
 +      return 0;
 +}
 +
  static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
  {
        size_t image_id_size;
                return NULL;
  
        p = image_id;
 -      end = (char *) image_id + image_id_size;
 -      ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
 +      end = image_id + image_id_size;
 +      ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
  
        size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
        reply_buf = kmalloc(size, GFP_KERNEL);
        ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
                                "rbd", "dir_get_name",
                                image_id, image_id_size,
 -                              (char *) reply_buf, size, NULL);
 +                              reply_buf, size);
        if (ret < 0)
                goto out;
        p = reply_buf;
 -      end = (char *) reply_buf + size;
 +      end = reply_buf + ret;
 +
        image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
        if (IS_ERR(image_name))
                image_name = NULL;
        return image_name;
  }
  
 +static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
 +{
 +      struct ceph_snap_context *snapc = rbd_dev->header.snapc;
 +      const char *snap_name;
 +      u32 which = 0;
 +
 +      /* Skip over names until we find the one we are looking for */
 +
 +      snap_name = rbd_dev->header.snap_names;
 +      while (which < snapc->num_snaps) {
 +              if (!strcmp(name, snap_name))
 +                      return snapc->snaps[which];
 +              snap_name += strlen(snap_name) + 1;
 +              which++;
 +      }
 +      return CEPH_NOSNAP;
 +}
 +
 +static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
 +{
 +      struct ceph_snap_context *snapc = rbd_dev->header.snapc;
 +      u32 which;
 +      bool found = false;
 +      u64 snap_id;
 +
 +      for (which = 0; !found && which < snapc->num_snaps; which++) {
 +              const char *snap_name;
 +
 +              snap_id = snapc->snaps[which];
 +              snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
 +              if (IS_ERR(snap_name))
 +                      break;
 +              found = !strcmp(name, snap_name);
 +              kfree(snap_name);
 +      }
 +      return found ? snap_id : CEPH_NOSNAP;
 +}
 +
  /*
 - * When a parent image gets probed, we only have the pool, image,
 - * and snapshot ids but not the names of any of them.  This call
 - * is made later to fill in those names.  It has to be done after
 - * rbd_dev_snaps_update() has completed because some of the
 - * information (in particular, snapshot name) is not available
 - * until then.
 + * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
 + * no snapshot by that name is found, or if an error occurs.
   */
 -static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
 +static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
  {
 -      struct ceph_osd_client *osdc;
 -      const char *name;
 -      void *reply_buf = NULL;
 +      if (rbd_dev->image_format == 1)
 +              return rbd_v1_snap_id_by_name(rbd_dev, name);
 +
 +      return rbd_v2_snap_id_by_name(rbd_dev, name);
 +}
 +
 +/*
 + * When an rbd image has a parent image, it is identified by the
 + * pool, image, and snapshot ids (not names).  This function fills
 + * in the names for those ids.  (It's OK if we can't figure out the
 + * name for an image id, but the pool and snapshot ids should always
 + * exist and have names.)  All names in an rbd spec are dynamically
 + * allocated.
 + *
 + * When an image being mapped (not a parent) is probed, we have the
 + * pool name and pool id, image name and image id, and the snapshot
 + * name.  The only thing we're missing is the snapshot id.
 + */
 +static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
 +{
 +      struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 +      struct rbd_spec *spec = rbd_dev->spec;
 +      const char *pool_name;
 +      const char *image_name;
 +      const char *snap_name;
        int ret;
  
 -      if (rbd_dev->spec->pool_name)
 -              return 0;       /* Already have the names */
 +      /*
 +       * An image being mapped will have the pool name (etc.), but
 +       * we need to look up the snapshot id.
 +       */
 +      if (spec->pool_name) {
 +              if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
 +                      u64 snap_id;
 +
 +                      snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
 +                      if (snap_id == CEPH_NOSNAP)
 +                              return -ENOENT;
 +                      spec->snap_id = snap_id;
 +              } else {
 +                      spec->snap_id = CEPH_NOSNAP;
 +              }
  
 -      /* Look up the pool name */
 +              return 0;
 +      }
  
 -      osdc = &rbd_dev->rbd_client->client->osdc;
 -      name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
 -      if (!name) {
 -              rbd_warn(rbd_dev, "there is no pool with id %llu",
 -                      rbd_dev->spec->pool_id);        /* Really a BUG() */
 +      /* Get the pool name; we have to make our own copy of this */
 +
 +      pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
 +      if (!pool_name) {
 +              rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
                return -EIO;
        }
 -
 -      rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
 -      if (!rbd_dev->spec->pool_name)
 +      pool_name = kstrdup(pool_name, GFP_KERNEL);
 +      if (!pool_name)
                return -ENOMEM;
  
        /* Fetch the image name; tolerate failure here */
  
 -      name = rbd_dev_image_name(rbd_dev);
 -      if (name)
 -              rbd_dev->spec->image_name = (char *) name;
 -      else
 +      image_name = rbd_dev_image_name(rbd_dev);
 +      if (!image_name)
                rbd_warn(rbd_dev, "unable to get image name");
  
 -      /* Look up the snapshot name. */
 +      /* Look up the snapshot name, and make a copy */
  
 -      name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
 -      if (!name) {
 -              rbd_warn(rbd_dev, "no snapshot with id %llu",
 -                      rbd_dev->spec->snap_id);        /* Really a BUG() */
 -              ret = -EIO;
 +      snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
 +      if (!snap_name) {
 +              ret = -ENOMEM;
                goto out_err;
        }
 -      rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
 -      if(!rbd_dev->spec->snap_name)
 -              goto out_err;
 +
 +      spec->pool_name = pool_name;
 +      spec->image_name = image_name;
 +      spec->snap_name = snap_name;
  
        return 0;
  out_err:
 -      kfree(reply_buf);
 -      kfree(rbd_dev->spec->pool_name);
 -      rbd_dev->spec->pool_name = NULL;
 +      kfree(image_name);
 +      kfree(pool_name);
  
        return ret;
  }
  
 -static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
 +static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
  {
        size_t size;
        int ret;
                return -ENOMEM;
  
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 -                              "rbd", "get_snapcontext",
 -                              NULL, 0,
 -                              reply_buf, size, ver);
 +                              "rbd", "get_snapcontext", NULL, 0,
 +                              reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out;
  
 -      ret = -ERANGE;
        p = reply_buf;
 -      end = (char *) reply_buf + size;
 +      end = reply_buf + ret;
 +      ret = -ERANGE;
        ceph_decode_64_safe(&p, end, seq, out);
        ceph_decode_32_safe(&p, end, snap_count, out);
  
        }
        if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
                goto out;
 +      ret = 0;
  
 -      size = sizeof (struct ceph_snap_context) +
 -                              snap_count * sizeof (snapc->snaps[0]);
 -      snapc = kmalloc(size, GFP_KERNEL);
 +      snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
        if (!snapc) {
                ret = -ENOMEM;
                goto out;
        }
 -
 -      atomic_set(&snapc->nref, 1);
        snapc->seq = seq;
 -      snapc->num_snaps = snap_count;
        for (i = 0; i < snap_count; i++)
                snapc->snaps[i] = ceph_decode_64(&p);
  
        rbd_dev->header.snapc = snapc;
  
        dout("  snap context seq = %llu, snap_count = %u\n",
 -              (unsigned long long) seq, (unsigned int) snap_count);
 -
 +              (unsigned long long)seq, (unsigned int)snap_count);
  out:
        kfree(reply_buf);
  
 -      return 0;
 +      return ret;
  }
  
 -static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
 +static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 +                                      u64 snap_id)
  {
        size_t size;
        void *reply_buf;
 -      __le64 snap_id;
 +      __le64 snapid;
        int ret;
        void *p;
        void *end;
        if (!reply_buf)
                return ERR_PTR(-ENOMEM);
  
 -      snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
 +      snapid = cpu_to_le64(snap_id);
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_snapshot_name",
 -                              (char *) &snap_id, sizeof (snap_id),
 -                              reply_buf, size, NULL);
 +                              &snapid, sizeof (snapid),
 +                              reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 -      if (ret < 0)
 -              goto out;
 -
 -      p = reply_buf;
 -      end = (char *) reply_buf + size;
 -      snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
 -      if (IS_ERR(snap_name)) {
 -              ret = PTR_ERR(snap_name);
 +      if (ret < 0) {
 +              snap_name = ERR_PTR(ret);
                goto out;
 -      } else {
 -              dout("  snap_id 0x%016llx snap_name = %s\n",
 -                      (unsigned long long) le64_to_cpu(snap_id), snap_name);
        }
 -      kfree(reply_buf);
 -
 -      return snap_name;
 -out:
 -      kfree(reply_buf);
 -
 -      return ERR_PTR(ret);
 -}
 -
 -static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
 -              u64 *snap_size, u64 *snap_features)
 -{
 -      u64 snap_id;
 -      u8 order;
 -      int ret;
  
 -      snap_id = rbd_dev->header.snapc->snaps[which];
 -      ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
 -      if (ret)
 -              return ERR_PTR(ret);
 -      ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
 -      if (ret)
 -              return ERR_PTR(ret);
 +      p = reply_buf;
 +      end = reply_buf + ret;
 +      snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
 +      if (IS_ERR(snap_name))
 +              goto out;
  
 -      return rbd_dev_v2_snap_name(rbd_dev, which);
 -}
 +      dout("  snap_id 0x%016llx snap_name = %s\n",
 +              (unsigned long long)snap_id, snap_name);
 +out:
 +      kfree(reply_buf);
  
 -static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
 -              u64 *snap_size, u64 *snap_features)
 -{
 -      if (rbd_dev->image_format == 1)
 -              return rbd_dev_v1_snap_info(rbd_dev, which,
 -                                      snap_size, snap_features);
 -      if (rbd_dev->image_format == 2)
 -              return rbd_dev_v2_snap_info(rbd_dev, which,
 -                                      snap_size, snap_features);
 -      return ERR_PTR(-EINVAL);
 +      return snap_name;
  }
  
 -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
 +static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev)
  {
        int ret;
 -      __u8 obj_order;
  
        down_write(&rbd_dev->header_rwsem);
  
 -      /* Grab old order first, to see if it changes */
 -
 -      obj_order = rbd_dev->header.obj_order,
        ret = rbd_dev_v2_image_size(rbd_dev);
        if (ret)
                goto out;
 -      if (rbd_dev->header.obj_order != obj_order) {
 -              ret = -EIO;
 -              goto out;
 -      }
        rbd_update_mapping_size(rbd_dev);
  
 -      ret = rbd_dev_v2_snap_context(rbd_dev, hver);
 +      ret = rbd_dev_v2_snap_context(rbd_dev);
        dout("rbd_dev_v2_snap_context returned %d\n", ret);
        if (ret)
                goto out;
 -      ret = rbd_dev_snaps_update(rbd_dev);
 -      dout("rbd_dev_snaps_update returned %d\n", ret);
 -      if (ret)
 -              goto out;
 -      ret = rbd_dev_snaps_register(rbd_dev);
 -      dout("rbd_dev_snaps_register returned %d\n", ret);
  out:
        up_write(&rbd_dev->header_rwsem);
  
        return ret;
  }
  
 -/*
 - * Scan the rbd device's current snapshot list and compare it to the
 - * newly-received snapshot context.  Remove any existing snapshots
 - * not present in the new snapshot context.  Add a new snapshot for
  - * any snapshots in the snapshot context not in the current list.
 - * And verify there are no changes to snapshots we already know
 - * about.
 - *
 - * Assumes the snapshots in the snapshot context are sorted by
 - * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 - * are also maintained in that order.)
 - */
 -static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 -{
 -      struct ceph_snap_context *snapc = rbd_dev->header.snapc;
 -      const u32 snap_count = snapc->num_snaps;
 -      struct list_head *head = &rbd_dev->snaps;
 -      struct list_head *links = head->next;
 -      u32 index = 0;
 -
 -      dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
 -      while (index < snap_count || links != head) {
 -              u64 snap_id;
 -              struct rbd_snap *snap;
 -              char *snap_name;
 -              u64 snap_size = 0;
 -              u64 snap_features = 0;
 -
 -              snap_id = index < snap_count ? snapc->snaps[index]
 -                                           : CEPH_NOSNAP;
 -              snap = links != head ? list_entry(links, struct rbd_snap, node)
 -                                   : NULL;
 -              rbd_assert(!snap || snap->id != CEPH_NOSNAP);
 -
 -              if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
 -                      struct list_head *next = links->next;
 -
 -                      /*
 -                       * A previously-existing snapshot is not in
 -                       * the new snap context.
 -                       *
 -                       * If the now missing snapshot is the one the
 -                       * image is mapped to, clear its exists flag
 -                       * so we can avoid sending any more requests
 -                       * to it.
 -                       */
 -                      if (rbd_dev->spec->snap_id == snap->id)
 -                              clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 -                      rbd_remove_snap_dev(snap);
 -                      dout("%ssnap id %llu has been removed\n",
 -                              rbd_dev->spec->snap_id == snap->id ?
 -                                                      "mapped " : "",
 -                              (unsigned long long) snap->id);
 -
 -                      /* Done with this list entry; advance */
 -
 -                      links = next;
 -                      continue;
 -              }
 -
 -              snap_name = rbd_dev_snap_info(rbd_dev, index,
 -                                      &snap_size, &snap_features);
 -              if (IS_ERR(snap_name))
 -                      return PTR_ERR(snap_name);
 -
 -              dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
 -                      (unsigned long long) snap_id);
 -              if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
 -                      struct rbd_snap *new_snap;
 -
 -                      /* We haven't seen this snapshot before */
 -
 -                      new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
 -                                      snap_id, snap_size, snap_features);
 -                      if (IS_ERR(new_snap)) {
 -                              int err = PTR_ERR(new_snap);
 -
 -                              dout("  failed to add dev, error %d\n", err);
 -
 -                              return err;
 -                      }
 -
 -                      /* New goes before existing, or at end of list */
 -
 -                      dout("  added dev%s\n", snap ? "" : " at end\n");
 -                      if (snap)
 -                              list_add_tail(&new_snap->node, &snap->node);
 -                      else
 -                              list_add_tail(&new_snap->node, head);
 -              } else {
 -                      /* Already have this one */
 -
 -                      dout("  already present\n");
 -
 -                      rbd_assert(snap->size == snap_size);
 -                      rbd_assert(!strcmp(snap->name, snap_name));
 -                      rbd_assert(snap->features == snap_features);
 -
 -                      /* Done with this list entry; advance */
 -
 -                      links = links->next;
 -              }
 -
 -              /* Advance to the next entry in the snapshot context */
 -
 -              index++;
 -      }
 -      dout("%s: done\n", __func__);
 -
 -      return 0;
 -}
 -
 -/*
 - * Scan the list of snapshots and register the devices for any that
 - * have not already been registered.
 - */
 -static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
 -{
 -      struct rbd_snap *snap;
 -      int ret = 0;
 -
 -      dout("%s:\n", __func__);
 -      if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
 -              return -EIO;
 -
 -      list_for_each_entry(snap, &rbd_dev->snaps, node) {
 -              if (!rbd_snap_registered(snap)) {
 -                      ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
 -                      if (ret < 0)
 -                              break;
 -              }
 -      }
 -      dout("%s: returning %d\n", __func__, ret);
 -
 -      return ret;
 -}
 -
  static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
  {
        struct device *dev;
        dev->bus = &rbd_bus_type;
        dev->type = &rbd_device_type;
        dev->parent = &rbd_root_dev;
 -      dev->release = rbd_dev_release;
 +      dev->release = rbd_dev_device_release;
        dev_set_name(dev, "%d", rbd_dev->dev_id);
        ret = device_register(dev);
  
@@@ -4299,7 -3672,6 +4299,7 @@@ static int rbd_add_parse_args(const cha
        size_t len;
        char *options;
        const char *mon_addrs;
 +      char *snap_name;
        size_t mon_addrs_size;
        struct rbd_spec *spec = NULL;
        struct rbd_options *rbd_opts = NULL;
                ret = -ENAMETOOLONG;
                goto out_err;
        }
 -      spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
 -      if (!spec->snap_name)
 +      snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
 +      if (!snap_name)
                goto out_mem;
 -      *(spec->snap_name + len) = '\0';
 +      *(snap_name + len) = '\0';
 +      spec->snap_name = snap_name;
  
        /* Initialize all rbd options to the defaults */
  
@@@ -4416,19 -3787,15 +4416,19 @@@ static int rbd_dev_image_id(struct rbd_
        size_t size;
        char *object_name;
        void *response;
 -      void *p;
 +      char *image_id;
  
        /*
         * When probing a parent image, the image id is already
         * known (and the image name likely is not).  There's no
 -       * need to fetch the image id again in this case.
 +       * need to fetch the image id again in this case.  We
 +       * do still need to set the image format though.
         */
 -      if (rbd_dev->spec->image_id)
 +      if (rbd_dev->spec->image_id) {
 +              rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
 +
                return 0;
 +      }
  
        /*
         * First, see if the format 2 image id file exists, and if
                goto out;
        }
  
 +      /* If it doesn't exist we'll assume it's a format 1 image */
 +
        ret = rbd_obj_method_sync(rbd_dev, object_name,
 -                              "rbd", "get_id",
 -                              NULL, 0,
 -                              response, RBD_IMAGE_ID_LEN_MAX, NULL);
 +                              "rbd", "get_id", NULL, 0,
 +                              response, RBD_IMAGE_ID_LEN_MAX);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 -      if (ret < 0)
 -              goto out;
 -
 -      p = response;
 -      rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
 -                                              p + RBD_IMAGE_ID_LEN_MAX,
 +      if (ret == -ENOENT) {
 +              image_id = kstrdup("", GFP_KERNEL);
 +              ret = image_id ? 0 : -ENOMEM;
 +              if (!ret)
 +                      rbd_dev->image_format = 1;
 +      } else if (ret > sizeof (__le32)) {
 +              void *p = response;
 +
 +              image_id = ceph_extract_encoded_string(&p, p + ret,
                                                NULL, GFP_NOIO);
 -      if (IS_ERR(rbd_dev->spec->image_id)) {
 -              ret = PTR_ERR(rbd_dev->spec->image_id);
 -              rbd_dev->spec->image_id = NULL;
 +              ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
 +              if (!ret)
 +                      rbd_dev->image_format = 2;
        } else {
 -              dout("image_id is %s\n", rbd_dev->spec->image_id);
 +              ret = -EINVAL;
 +      }
 +
 +      if (!ret) {
 +              rbd_dev->spec->image_id = image_id;
 +              dout("image_id is %s\n", image_id);
        }
  out:
        kfree(response);
        return ret;
  }
  
 -static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
 +/* Undo whatever state changes are made by v1 or v2 image probe */
 +
 +static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
  {
 -      int ret;
 -      size_t size;
 +      struct rbd_image_header *header;
  
 -      /* Version 1 images have no id; empty string is used */
 +      rbd_dev_remove_parent(rbd_dev);
 +      rbd_spec_put(rbd_dev->parent_spec);
 +      rbd_dev->parent_spec = NULL;
 +      rbd_dev->parent_overlap = 0;
  
 -      rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
 -      if (!rbd_dev->spec->image_id)
 -              return -ENOMEM;
 +      /* Free dynamic fields from the header, then zero it out */
  
 -      /* Record the header object name for this rbd image. */
 +      header = &rbd_dev->header;
 +      ceph_put_snap_context(header->snapc);
 +      kfree(header->snap_sizes);
 +      kfree(header->snap_names);
 +      kfree(header->object_prefix);
 +      memset(header, 0, sizeof (*header));
 +}
  
 -      size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
 -      rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
 -      if (!rbd_dev->header_name) {
 -              ret = -ENOMEM;
 -              goto out_err;
 -      }
 -      sprintf(rbd_dev->header_name, "%s%s",
 -              rbd_dev->spec->image_name, RBD_SUFFIX);
 +static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
 +{
 +      int ret;
  
        /* Populate rbd image metadata */
  
        rbd_dev->parent_spec = NULL;
        rbd_dev->parent_overlap = 0;
  
 -      rbd_dev->image_format = 1;
 -
        dout("discovered version 1 image, header name is %s\n",
                rbd_dev->header_name);
  
@@@ -4536,45 -3893,43 +4536,45 @@@ out_err
  
  static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
  {
 -      size_t size;
        int ret;
 -      u64 ver = 0;
 -
 -      /*
 -       * Image id was filled in by the caller.  Record the header
 -       * object name for this rbd image.
 -       */
 -      size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
 -      rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
 -      if (!rbd_dev->header_name)
 -              return -ENOMEM;
 -      sprintf(rbd_dev->header_name, "%s%s",
 -                      RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
 -
 -      /* Get the size and object order for the image */
  
        ret = rbd_dev_v2_image_size(rbd_dev);
 -      if (ret < 0)
 +      if (ret)
                goto out_err;
  
        /* Get the object prefix (a.k.a. block_name) for the image */
  
        ret = rbd_dev_v2_object_prefix(rbd_dev);
 -      if (ret < 0)
 +      if (ret)
                goto out_err;
  
        /* Get and check the features for the image */
  
        ret = rbd_dev_v2_features(rbd_dev);
 -      if (ret < 0)
 +      if (ret)
                goto out_err;
  
        /* If the image supports layering, get the parent info */
  
        if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
                ret = rbd_dev_v2_parent_info(rbd_dev);
 +              if (ret)
 +                      goto out_err;
 +
 +              /*
 +               * Don't print a warning for parent images.  We can
 +               * tell this is a parent because we won't know its
 +               * pool name yet (just its pool id).
 +               */
 +              if (rbd_dev->spec->pool_name)
 +                      rbd_warn(rbd_dev, "WARNING: kernel layering "
 +                                      "is EXPERIMENTAL!");
 +      }
 +
 +      /* If the image supports fancy striping, get its parameters */
 +
 +      if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
 +              ret = rbd_dev_v2_striping_info(rbd_dev);
                if (ret < 0)
                        goto out_err;
        }
  
        /* Get the snapshot context */
  
 -      ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
 +      ret = rbd_dev_v2_snap_context(rbd_dev);
        if (ret)
                goto out_err;
 -      rbd_dev->header.obj_version = ver;
 -
 -      rbd_dev->image_format = 2;
  
        dout("discovered version 2 image, header name is %s\n",
                rbd_dev->header_name);
@@@ -4606,54 -3964,22 +4606,54 @@@ out_err
        return ret;
  }
  
 -static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
 +static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
  {
 +      struct rbd_device *parent = NULL;
 +      struct rbd_spec *parent_spec;
 +      struct rbd_client *rbdc;
        int ret;
  
 -      /* no need to lock here, as rbd_dev is not registered yet */
 -      ret = rbd_dev_snaps_update(rbd_dev);
 -      if (ret)
 -              return ret;
 +      if (!rbd_dev->parent_spec)
 +              return 0;
 +      /*
 +       * We need to pass a reference to the client and the parent
 +       * spec when creating the parent rbd_dev.  Images related by
 +       * parent/child relationships always share both.
 +       */
 +      parent_spec = rbd_spec_get(rbd_dev->parent_spec);
 +      rbdc = __rbd_get_client(rbd_dev->rbd_client);
  
 -      ret = rbd_dev_probe_update_spec(rbd_dev);
 -      if (ret)
 -              goto err_out_snaps;
 +      ret = -ENOMEM;
 +      parent = rbd_dev_create(rbdc, parent_spec);
 +      if (!parent)
 +              goto out_err;
  
 -      ret = rbd_dev_set_mapping(rbd_dev);
 +      ret = rbd_dev_image_probe(parent);
 +      if (ret < 0)
 +              goto out_err;
 +      rbd_dev->parent = parent;
 +
 +      return 0;
 +out_err:
 +      if (parent) {
 +              rbd_spec_put(rbd_dev->parent_spec);
 +              kfree(rbd_dev->header_name);
 +              rbd_dev_destroy(parent);
 +      } else {
 +              rbd_put_client(rbdc);
 +              rbd_spec_put(parent_spec);
 +      }
 +
 +      return ret;
 +}
 +
 +static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
 +{
 +      int ret;
 +
 +      ret = rbd_dev_mapping_set(rbd_dev);
        if (ret)
 -              goto err_out_snaps;
 +              return ret;
  
        /* generate unique id: find highest unique id, add one */
        rbd_dev_id_get(rbd_dev);
        if (ret)
                goto err_out_disk;
  
 -      /*
 -       * At this point cleanup in the event of an error is the job
 -       * of the sysfs code (initiated by rbd_bus_del_dev()).
 -       */
 -      down_write(&rbd_dev->header_rwsem);
 -      ret = rbd_dev_snaps_register(rbd_dev);
 -      up_write(&rbd_dev->header_rwsem);
 -      if (ret)
 -              goto err_out_bus;
 -
 -      ret = rbd_dev_header_watch_sync(rbd_dev, 1);
 -      if (ret)
 -              goto err_out_bus;
 -
        /* Everything's ready.  Announce the disk to the world. */
  
 +      set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
 +      set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
        add_disk(rbd_dev->disk);
  
        pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
                (unsigned long long) rbd_dev->mapping.size);
  
        return ret;
 -err_out_bus:
 -      /* this will also clean up rest of rbd_dev stuff */
 -
 -      rbd_bus_del_dev(rbd_dev);
  
 -      return ret;
  err_out_disk:
        rbd_free_disk(rbd_dev);
  err_out_blkdev:
        unregister_blkdev(rbd_dev->major, rbd_dev->name);
  err_out_id:
        rbd_dev_id_put(rbd_dev);
 -err_out_snaps:
 -      rbd_remove_all_snaps(rbd_dev);
 +      rbd_dev_mapping_clear(rbd_dev);
  
        return ret;
  }
  
 +static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 +{
 +      struct rbd_spec *spec = rbd_dev->spec;
 +      size_t size;
 +
 +      /* Record the header object name for this rbd image. */
 +
 +      rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 +
 +      if (rbd_dev->image_format == 1)
 +              size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
 +      else
 +              size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
 +
 +      rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
 +      if (!rbd_dev->header_name)
 +              return -ENOMEM;
 +
 +      if (rbd_dev->image_format == 1)
 +              sprintf(rbd_dev->header_name, "%s%s",
 +                      spec->image_name, RBD_SUFFIX);
 +      else
 +              sprintf(rbd_dev->header_name, "%s%s",
 +                      RBD_HEADER_PREFIX, spec->image_id);
 +      return 0;
 +}
 +
 +static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 +{
 +      int ret;
 +
 +      rbd_dev_unprobe(rbd_dev);
 +      ret = rbd_dev_header_watch_sync(rbd_dev, 0);
 +      if (ret)
 +              rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
 +      kfree(rbd_dev->header_name);
 +      rbd_dev->header_name = NULL;
 +      rbd_dev->image_format = 0;
 +      kfree(rbd_dev->spec->image_id);
 +      rbd_dev->spec->image_id = NULL;
 +
 +      rbd_dev_destroy(rbd_dev);
 +}
 +
  /*
   * Probe for the existence of the header object for the given rbd
   * device.  For format 2 images this includes determining the image
   * id.
   */
 -static int rbd_dev_probe(struct rbd_device *rbd_dev)
 +static int rbd_dev_image_probe(struct rbd_device *rbd_dev)
  {
        int ret;
 +      int tmp;
  
        /*
         * Get the id from the image id object.  If it's not a
         */
        ret = rbd_dev_image_id(rbd_dev);
        if (ret)
 +              return ret;
 +      rbd_assert(rbd_dev->spec->image_id);
 +      rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 +
 +      ret = rbd_dev_header_name(rbd_dev);
 +      if (ret)
 +              goto err_out_format;
 +
 +      ret = rbd_dev_header_watch_sync(rbd_dev, 1);
 +      if (ret)
 +              goto out_header_name;
 +
 +      if (rbd_dev->image_format == 1)
                ret = rbd_dev_v1_probe(rbd_dev);
        else
                ret = rbd_dev_v2_probe(rbd_dev);
 -      if (ret) {
 -              dout("probe failed, returning %d\n", ret);
 -
 -              return ret;
 -      }
 +      if (ret)
 +              goto err_out_watch;
  
 -      ret = rbd_dev_probe_finish(rbd_dev);
 +      ret = rbd_dev_spec_update(rbd_dev);
        if (ret)
 -              rbd_header_free(&rbd_dev->header);
 +              goto err_out_probe;
 +
 +      ret = rbd_dev_probe_parent(rbd_dev);
 +      if (!ret)
 +              return 0;
 +
 +err_out_probe:
 +      rbd_dev_unprobe(rbd_dev);
 +err_out_watch:
 +      tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
 +      if (tmp)
 +              rbd_warn(rbd_dev, "unable to tear down watch request\n");
 +out_header_name:
 +      kfree(rbd_dev->header_name);
 +      rbd_dev->header_name = NULL;
 +err_out_format:
 +      rbd_dev->image_format = 0;
 +      kfree(rbd_dev->spec->image_id);
 +      rbd_dev->spec->image_id = NULL;
 +
 +      dout("probe failed, returning %d\n", ret);
  
        return ret;
  }
@@@ -4841,13 -4110,11 +4841,13 @@@ static ssize_t rbd_add(struct bus_type 
        rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
        if (rc < 0)
                goto err_out_client;
 -      spec->pool_id = (u64) rc;
 +      spec->pool_id = (u64)rc;
  
        /* The ceph file layout needs to fit pool id in 32 bits */
  
 -      if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
 +      if (spec->pool_id > (u64)U32_MAX) {
 +              rbd_warn(NULL, "pool id too large (%llu > %u)\n",
 +                              (unsigned long long)spec->pool_id, U32_MAX);
                rc = -EIO;
                goto err_out_client;
        }
        kfree(rbd_opts);
        rbd_opts = NULL;        /* done with this */
  
 -      rc = rbd_dev_probe(rbd_dev);
 +      rc = rbd_dev_image_probe(rbd_dev);
        if (rc < 0)
                goto err_out_rbd_dev;
  
 -      return count;
 +      rc = rbd_dev_device_setup(rbd_dev);
 +      if (!rc)
 +              return count;
 +
 +      rbd_dev_image_release(rbd_dev);
  err_out_rbd_dev:
        rbd_dev_destroy(rbd_dev);
  err_out_client:
@@@ -4885,7 -4148,7 +4885,7 @@@ err_out_module
  
        dout("Error adding device %s\n", buf);
  
 -      return (ssize_t) rc;
 +      return (ssize_t)rc;
  }
  
  static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
        return NULL;
  }
  
 -static void rbd_dev_release(struct device *dev)
 +static void rbd_dev_device_release(struct device *dev)
  {
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
  
 -      if (rbd_dev->watch_event)
 -              rbd_dev_header_watch_sync(rbd_dev, 0);
 -
 -      /* clean up and free blkdev */
        rbd_free_disk(rbd_dev);
 +      clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 +      rbd_dev_clear_mapping(rbd_dev);
        unregister_blkdev(rbd_dev->major, rbd_dev->name);
 -
 -      /* release allocated disk header fields */
 -      rbd_header_free(&rbd_dev->header);
 -
 -      /* done with the id, and with the rbd_dev */
 +      rbd_dev->major = 0;
        rbd_dev_id_put(rbd_dev);
 -      rbd_assert(rbd_dev->rbd_client != NULL);
 -      rbd_dev_destroy(rbd_dev);
 +      rbd_dev_mapping_clear(rbd_dev);
 +}
  
 -      /* release module ref */
 -      module_put(THIS_MODULE);
 +static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
 +{
 +      while (rbd_dev->parent) {
 +              struct rbd_device *first = rbd_dev;
 +              struct rbd_device *second = first->parent;
 +              struct rbd_device *third;
 +
 +              /*
 +               * Follow to the parent with no grandparent and
 +               * remove it.
 +               */
 +              while (second && (third = second->parent)) {
 +                      first = second;
 +                      second = third;
 +              }
 +              rbd_assert(second);
 +              rbd_dev_image_release(second);
 +              first->parent = NULL;
 +              first->parent_overlap = 0;
 +
 +              rbd_assert(first->parent_spec);
 +              rbd_spec_put(first->parent_spec);
 +              first->parent_spec = NULL;
 +      }
  }
  
  static ssize_t rbd_remove(struct bus_type *bus,
                          size_t count)
  {
        struct rbd_device *rbd_dev = NULL;
 -      int target_id, rc;
 +      int target_id;
        unsigned long ul;
 -      int ret = count;
 +      int ret;
  
 -      rc = strict_strtoul(buf, 10, &ul);
 -      if (rc)
 -              return rc;
 +      ret = strict_strtoul(buf, 10, &ul);
 +      if (ret)
 +              return ret;
  
        /* convert to int; abort if we lost anything in the conversion */
        target_id = (int) ul;
        spin_unlock_irq(&rbd_dev->lock);
        if (ret < 0)
                goto done;
 -
 -      rbd_remove_all_snaps(rbd_dev);
 +      ret = count;
        rbd_bus_del_dev(rbd_dev);
 -
 +      rbd_dev_image_release(rbd_dev);
 +      module_put(THIS_MODULE);
  done:
        mutex_unlock(&ctl_mutex);
  
@@@ -5013,56 -4260,6 +5013,56 @@@ static void rbd_sysfs_cleanup(void
        device_unregister(&rbd_root_dev);
  }
  
 +static int rbd_slab_init(void)
 +{
 +      rbd_assert(!rbd_img_request_cache);
 +      rbd_img_request_cache = kmem_cache_create("rbd_img_request",
 +                                      sizeof (struct rbd_img_request),
 +                                      __alignof__(struct rbd_img_request),
 +                                      0, NULL);
 +      if (!rbd_img_request_cache)
 +              return -ENOMEM;
 +
 +      rbd_assert(!rbd_obj_request_cache);
 +      rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
 +                                      sizeof (struct rbd_obj_request),
 +                                      __alignof__(struct rbd_obj_request),
 +                                      0, NULL);
 +      if (!rbd_obj_request_cache)
 +              goto out_err;
 +
 +      rbd_assert(!rbd_segment_name_cache);
 +      rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
 +                                      MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
 +      if (rbd_segment_name_cache)
 +              return 0;
 +out_err:
 +      if (rbd_obj_request_cache) {
 +              kmem_cache_destroy(rbd_obj_request_cache);
 +              rbd_obj_request_cache = NULL;
 +      }
 +
 +      kmem_cache_destroy(rbd_img_request_cache);
 +      rbd_img_request_cache = NULL;
 +
 +      return -ENOMEM;
 +}
 +
 +static void rbd_slab_exit(void)
 +{
 +      rbd_assert(rbd_segment_name_cache);
 +      kmem_cache_destroy(rbd_segment_name_cache);
 +      rbd_segment_name_cache = NULL;
 +
 +      rbd_assert(rbd_obj_request_cache);
 +      kmem_cache_destroy(rbd_obj_request_cache);
 +      rbd_obj_request_cache = NULL;
 +
 +      rbd_assert(rbd_img_request_cache);
 +      kmem_cache_destroy(rbd_img_request_cache);
 +      rbd_img_request_cache = NULL;
 +}
 +
  static int __init rbd_init(void)
  {
        int rc;
  
                return -EINVAL;
        }
 -      rc = rbd_sysfs_init();
 +      rc = rbd_slab_init();
        if (rc)
                return rc;
 -      pr_info("loaded " RBD_DRV_NAME_LONG "\n");
 -      return 0;
 +      rc = rbd_sysfs_init();
 +      if (rc)
 +              rbd_slab_exit();
 +      else
 +              pr_info("loaded " RBD_DRV_NAME_LONG "\n");
 +
 +      return rc;
  }
  
  static void __exit rbd_exit(void)
  {
        rbd_sysfs_cleanup();
 +      rbd_slab_exit();
  }
  
  module_init(rbd_init);
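
The rbd_slab_init()/rbd_slab_exit() pair added above follows the usual unwind idiom: create each cache in order, branch to an error label that destroys only what was already created when a later step fails, and tear everything down in reverse on module exit. A minimal user-space sketch of that idiom, with purely illustrative names and plain malloc()/free() standing in for the slab caches:

    #include <stdio.h>
    #include <stdlib.h>

    static char *buf_a, *buf_b, *buf_c;

    /* Acquire three resources in order; on failure release only what we got. */
    static int demo_init(void)
    {
            buf_a = malloc(64);
            if (!buf_a)
                    return -1;

            buf_b = malloc(64);
            if (!buf_b)
                    goto out_free_a;

            buf_c = malloc(64);
            if (!buf_c)
                    goto out_free_b;

            return 0;

    out_free_b:
            free(buf_b);
            buf_b = NULL;
    out_free_a:
            free(buf_a);
            buf_a = NULL;
            return -1;
    }

    /* Release in reverse order of acquisition. */
    static void demo_exit(void)
    {
            free(buf_c);
            free(buf_b);
            free(buf_a);
            buf_c = buf_b = buf_a = NULL;
    }

    int main(void)
    {
            if (demo_init())
                    return 1;
            puts("initialized");
            demo_exit();
            return 0;
    }
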
diff --combined drivers/md/md.c
index 6330c727396cd6071f85e1a20bb4103d6aeeb322,1d03ebde40b51885cd63950d695b19fe5b9214de..681d1099a2d58936864b3b63610a31f38a908219
@@@ -72,9 -72,6 +72,9 @@@ static DECLARE_WAIT_QUEUE_HEAD(resync_w
  static struct workqueue_struct *md_wq;
  static struct workqueue_struct *md_misc_wq;
  
 +static int remove_and_add_spares(struct mddev *mddev,
 +                               struct md_rdev *this);
 +
  #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
  
  /*
@@@ -197,21 -194,12 +197,12 @@@ void md_trim_bio(struct bio *bio, int o
        if (offset == 0 && size == bio->bi_size)
                return;
  
-       bio->bi_sector += offset;
-       bio->bi_size = size;
-       offset <<= 9;
        clear_bit(BIO_SEG_VALID, &bio->bi_flags);
  
-       while (bio->bi_idx < bio->bi_vcnt &&
-              bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
-               /* remove this whole bio_vec */
-               offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
-               bio->bi_idx++;
-       }
-       if (bio->bi_idx < bio->bi_vcnt) {
-               bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
-               bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
-       }
+       bio_advance(bio, offset << 9);
+       bio->bi_size = size;
        /* avoid any complications with bi_idx being non-zero*/
        if (bio->bi_idx) {
                memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
@@@ -1567,8 -1555,8 +1558,8 @@@ static int super_1_load(struct md_rdev 
                                             sector, count, 1) == 0)
                                return -EINVAL;
                }
 -      } else if (sb->bblog_offset == 0)
 -              rdev->badblocks.shift = -1;
 +      } else if (sb->bblog_offset != 0)
 +              rdev->badblocks.shift = 0;
  
        if (!refdev) {
                ret = 1;
@@@ -2414,11 -2402,6 +2405,11 @@@ static void md_update_sb(struct mddev 
        int nospares = 0;
        int any_badblocks_changed = 0;
  
 +      if (mddev->ro) {
 +              if (force_change)
 +                      set_bit(MD_CHANGE_DEVS, &mddev->flags);
 +              return;
 +      }
  repeat:
        /* First make sure individual recovery_offsets are correct */
        rdev_for_each(rdev, mddev) {
@@@ -2808,10 -2791,12 +2799,10 @@@ slot_store(struct md_rdev *rdev, const 
                /* personality does all needed checks */
                if (rdev->mddev->pers->hot_remove_disk == NULL)
                        return -EINVAL;
 -              err = rdev->mddev->pers->
 -                      hot_remove_disk(rdev->mddev, rdev);
 -              if (err)
 -                      return err;
 -              sysfs_unlink_rdev(rdev->mddev, rdev);
 -              rdev->raid_disk = -1;
 +              clear_bit(Blocked, &rdev->flags);
 +              remove_and_add_spares(rdev->mddev, rdev);
 +              if (rdev->raid_disk >= 0)
 +                      return -EBUSY;
                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
                md_wakeup_thread(rdev->mddev->thread);
        } else if (rdev->mddev->pers) {
@@@ -3227,7 -3212,7 +3218,7 @@@ int md_rdev_init(struct md_rdev *rdev
         * be used - I wonder if that matters
         */
        rdev->badblocks.count = 0;
 -      rdev->badblocks.shift = 0;
 +      rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
        rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
        seqlock_init(&rdev->badblocks.lock);
        if (rdev->badblocks.page == NULL)
@@@ -3299,6 -3284,9 +3290,6 @@@ static struct md_rdev *md_import_device
                        goto abort_free;
                }
        }
 -      if (super_format == -1)
 -              /* hot-add for 0.90, or non-persistent: so no badblocks */
 -              rdev->badblocks.shift = -1;
  
        return rdev;
  
@@@ -4228,6 -4216,8 +4219,6 @@@ action_show(struct mddev *mddev, char *
        return sprintf(page, "%s\n", type);
  }
  
 -static void reap_sync_thread(struct mddev *mddev);
 -
  static ssize_t
  action_store(struct mddev *mddev, const char *page, size_t len)
  {
        if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
                if (mddev->sync_thread) {
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 -                      reap_sync_thread(mddev);
 +                      md_reap_sync_thread(mddev);
                }
        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
                   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@@ -5280,7 -5270,7 +5271,7 @@@ static void __md_stop_writes(struct mdd
        if (mddev->sync_thread) {
                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 -              reap_sync_thread(mddev);
 +              md_reap_sync_thread(mddev);
        }
  
        del_timer_sync(&mddev->safemode_timer);
        bitmap_flush(mddev);
        md_super_wait(mddev);
  
 -      if (!mddev->in_sync || mddev->flags) {
 +      if (mddev->ro == 0 &&
 +          (!mddev->in_sync || mddev->flags)) {
                /* mark array as shutdown cleanly */
                mddev->in_sync = 1;
                md_update_sb(mddev, 1);
@@@ -5812,7 -5801,7 +5803,7 @@@ static int add_new_disk(struct mddev * 
                else
                        sysfs_notify_dirent_safe(rdev->sysfs_state);
  
 -              md_update_sb(mddev, 1);
 +              set_bit(MD_CHANGE_DEVS, &mddev->flags);
                if (mddev->degraded)
                        set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@@ -5879,9 -5868,6 +5870,9 @@@ static int hot_remove_disk(struct mdde
        if (!rdev)
                return -ENXIO;
  
 +      clear_bit(Blocked, &rdev->flags);
 +      remove_and_add_spares(mddev, rdev);
 +
        if (rdev->raid_disk >= 0)
                goto busy;
  
@@@ -6495,28 -6481,6 +6486,28 @@@ static int md_ioctl(struct block_devic
                err = md_set_readonly(mddev, bdev);
                goto done_unlock;
  
 +      case HOT_REMOVE_DISK:
 +              err = hot_remove_disk(mddev, new_decode_dev(arg));
 +              goto done_unlock;
 +
 +      case ADD_NEW_DISK:
 +              /* We can support ADD_NEW_DISK on read-only arrays
 +               * only if we are re-adding a preexisting device.
 +               * So require mddev->pers and MD_DISK_SYNC.
 +               */
 +              if (mddev->pers) {
 +                      mdu_disk_info_t info;
 +                      if (copy_from_user(&info, argp, sizeof(info)))
 +                              err = -EFAULT;
 +                      else if (!(info.state & (1<<MD_DISK_SYNC)))
 +                              /* Need to clear read-only for this */
 +                              break;
 +                      else
 +                              err = add_new_disk(mddev, &info);
 +                      goto done_unlock;
 +              }
 +              break;
 +
        case BLKROSET:
                if (get_user(ro, (int __user *)(arg))) {
                        err = -EFAULT;
                goto done_unlock;
        }
  
 -      case HOT_REMOVE_DISK:
 -              err = hot_remove_disk(mddev, new_decode_dev(arg));
 -              goto done_unlock;
 -
        case HOT_ADD_DISK:
                err = hot_add_disk(mddev, new_decode_dev(arg));
                goto done_unlock;
@@@ -6674,13 -6642,15 +6665,13 @@@ static int md_open(struct block_device 
        return err;
  }
  
 -static int md_release(struct gendisk *disk, fmode_t mode)
 +static void md_release(struct gendisk *disk, fmode_t mode)
  {
        struct mddev *mddev = disk->private_data;
  
        BUG_ON(!mddev);
        atomic_dec(&mddev->openers);
        mddev_put(mddev);
 -
 -      return 0;
  }
  
  static int md_media_changed(struct gendisk *disk)
@@@ -7665,16 -7635,14 +7656,16 @@@ void md_do_sync(struct md_thread *threa
  }
  EXPORT_SYMBOL_GPL(md_do_sync);
  
 -static int remove_and_add_spares(struct mddev *mddev)
 +static int remove_and_add_spares(struct mddev *mddev,
 +                               struct md_rdev *this)
  {
        struct md_rdev *rdev;
        int spares = 0;
        int removed = 0;
  
        rdev_for_each(rdev, mddev)
 -              if (rdev->raid_disk >= 0 &&
 +              if ((this == NULL || rdev == this) &&
 +                  rdev->raid_disk >= 0 &&
                    !test_bit(Blocked, &rdev->flags) &&
                    (test_bit(Faulty, &rdev->flags) ||
                     ! test_bit(In_sync, &rdev->flags)) &&
        if (removed && mddev->kobj.sd)
                sysfs_notify(&mddev->kobj, NULL, "degraded");
  
 +      if (this)
 +              goto no_add;
 +
        rdev_for_each(rdev, mddev) {
                if (rdev->raid_disk >= 0 &&
                    !test_bit(In_sync, &rdev->flags) &&
                    !test_bit(Faulty, &rdev->flags))
                        spares++;
 -              if (rdev->raid_disk < 0
 -                  && !test_bit(Faulty, &rdev->flags)) {
 -                      rdev->recovery_offset = 0;
 -                      if (mddev->pers->
 -                          hot_add_disk(mddev, rdev) == 0) {
 -                              if (sysfs_link_rdev(mddev, rdev))
 -                                      /* failure here is OK */;
 -                              spares++;
 -                              md_new_event(mddev);
 -                              set_bit(MD_CHANGE_DEVS, &mddev->flags);
 -                      }
 +              if (rdev->raid_disk >= 0)
 +                      continue;
 +              if (test_bit(Faulty, &rdev->flags))
 +                      continue;
 +              if (mddev->ro &&
 +                  rdev->saved_raid_disk < 0)
 +                      continue;
 +
 +              rdev->recovery_offset = 0;
 +              if (rdev->saved_raid_disk >= 0 && mddev->in_sync) {
 +                      spin_lock_irq(&mddev->write_lock);
 +                      if (mddev->in_sync)
 +                              /* OK, this device, which is in_sync,
 +                               * will definitely be noticed before
 +                               * the next write, so recovery isn't
 +                               * needed.
 +                               */
 +                              rdev->recovery_offset = mddev->recovery_cp;
 +                      spin_unlock_irq(&mddev->write_lock);
 +              }
 +              if (mddev->ro && rdev->recovery_offset != MaxSector)
 +                      /* not safe to add this disk now */
 +                      continue;
 +              if (mddev->pers->
 +                  hot_add_disk(mddev, rdev) == 0) {
 +                      if (sysfs_link_rdev(mddev, rdev))
 +                              /* failure here is OK */;
 +                      spares++;
 +                      md_new_event(mddev);
 +                      set_bit(MD_CHANGE_DEVS, &mddev->flags);
                }
        }
 +no_add:
        if (removed)
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
        return spares;
  }
  
 -static void reap_sync_thread(struct mddev *mddev)
 -{
 -      struct md_rdev *rdev;
 -
 -      /* resync has finished, collect result */
 -      md_unregister_thread(&mddev->sync_thread);
 -      if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 -          !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 -              /* success...*/
 -              /* activate any spares */
 -              if (mddev->pers->spare_active(mddev)) {
 -                      sysfs_notify(&mddev->kobj, NULL,
 -                                   "degraded");
 -                      set_bit(MD_CHANGE_DEVS, &mddev->flags);
 -              }
 -      }
 -      if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 -          mddev->pers->finish_reshape)
 -              mddev->pers->finish_reshape(mddev);
 -
 -      /* If array is no-longer degraded, then any saved_raid_disk
 -       * information must be scrapped.  Also if any device is now
 -       * In_sync we must scrape the saved_raid_disk for that device
 -       * do the superblock for an incrementally recovered device
 -       * written out.
 -       */
 -      rdev_for_each(rdev, mddev)
 -              if (!mddev->degraded ||
 -                  test_bit(In_sync, &rdev->flags))
 -                      rdev->saved_raid_disk = -1;
 -
 -      md_update_sb(mddev, 1);
 -      clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 -      clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 -      clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 -      clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
 -      clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 -      /* flag recovery needed just to double check */
 -      set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 -      sysfs_notify_dirent_safe(mddev->sysfs_action);
 -      md_new_event(mddev);
 -      if (mddev->event_work.func)
 -              queue_work(md_misc_wq, &mddev->event_work);
 -}
 -
  /*
   * This routine is regularly called by all per-raid-array threads to
   * deal with generic issues like resync and super-block update.
@@@ -7790,16 -7780,22 +7781,16 @@@ void md_check_recovery(struct mddev *md
                int spares = 0;
  
                if (mddev->ro) {
 -                      /* Only thing we do on a ro array is remove
 -                       * failed devices.
 +                      /* On a read-only array we can:
 +                       * - remove failed devices
 +                       * - add already-in_sync devices if the array itself
 +                       *   is in-sync.
 +                       * As we only add devices that are already in-sync,
 +                       * we can activate the spares immediately.
                         */
 -                      struct md_rdev *rdev;
 -                      rdev_for_each(rdev, mddev)
 -                              if (rdev->raid_disk >= 0 &&
 -                                  !test_bit(Blocked, &rdev->flags) &&
 -                                  test_bit(Faulty, &rdev->flags) &&
 -                                  atomic_read(&rdev->nr_pending)==0) {
 -                                      if (mddev->pers->hot_remove_disk(
 -                                                  mddev, rdev) == 0) {
 -                                              sysfs_unlink_rdev(mddev, rdev);
 -                                              rdev->raid_disk = -1;
 -                                      }
 -                              }
                        clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 +                      remove_and_add_spares(mddev, NULL);
 +                      mddev->pers->spare_active(mddev);
                        goto unlock;
                }
  
                        goto unlock;
                }
                if (mddev->sync_thread) {
 -                      reap_sync_thread(mddev);
 +                      md_reap_sync_thread(mddev);
                        goto unlock;
                }
                /* Set RUNNING before clearing NEEDED to avoid
                                goto unlock;
                        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
                        clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 -              } else if ((spares = remove_and_add_spares(mddev))) {
 +              } else if ((spares = remove_and_add_spares(mddev, NULL))) {
                        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
                        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
                        clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
        }
  }
  
 +void md_reap_sync_thread(struct mddev *mddev)
 +{
 +      struct md_rdev *rdev;
 +
 +      /* resync has finished, collect result */
 +      md_unregister_thread(&mddev->sync_thread);
 +      if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 +          !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 +              /* success...*/
 +              /* activate any spares */
 +              if (mddev->pers->spare_active(mddev)) {
 +                      sysfs_notify(&mddev->kobj, NULL,
 +                                   "degraded");
 +                      set_bit(MD_CHANGE_DEVS, &mddev->flags);
 +              }
 +      }
 +      if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 +          mddev->pers->finish_reshape)
 +              mddev->pers->finish_reshape(mddev);
 +
 +      /* If the array is no longer degraded, then any saved_raid_disk
 +       * information must be scrapped.  Also if any device is now
 +       * In_sync we must scrap the saved_raid_disk for that device
 +       * so the superblock for an incrementally recovered device
 +       * gets written out.
 +       */
 +      rdev_for_each(rdev, mddev)
 +              if (!mddev->degraded ||
 +                  test_bit(In_sync, &rdev->flags))
 +                      rdev->saved_raid_disk = -1;
 +
 +      md_update_sb(mddev, 1);
 +      clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 +      clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 +      clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 +      clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
 +      clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 +      /* flag recovery needed just to double check */
 +      set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 +      sysfs_notify_dirent_safe(mddev->sysfs_action);
 +      md_new_event(mddev);
 +      if (mddev->event_work.func)
 +              queue_work(md_misc_wq, &mddev->event_work);
 +}
 +
  void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
  {
        sysfs_notify_dirent_safe(rdev->sysfs_state);
@@@ -8682,7 -8633,6 +8673,7 @@@ EXPORT_SYMBOL(md_register_thread)
  EXPORT_SYMBOL(md_unregister_thread);
  EXPORT_SYMBOL(md_wakeup_thread);
  EXPORT_SYMBOL(md_check_recovery);
 +EXPORT_SYMBOL(md_reap_sync_thread);
  MODULE_LICENSE("GPL");
  MODULE_DESCRIPTION("MD RAID framework");
  MODULE_ALIAS("md");
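
The reworked remove_and_add_spares() above takes an optional second argument: passing NULL keeps the old behaviour of scanning every rdev, while passing a specific rdev restricts the pass to that one device, which is what hot_remove_disk() and slot_store() now rely on. A small stand-alone sketch of this nullable-filter pattern; the struct and function names below are invented for illustration and are not kernel APIs:

    #include <stdio.h>
    #include <stddef.h>

    struct item {
            int id;
            int faulty;
            int in_use;
    };

    /*
     * Retire faulty items.  If "only" is NULL every item is considered;
     * otherwise the pass is restricted to that one item.
     */
    static int retire_faulty(struct item *items, size_t n, struct item *only)
    {
            int removed = 0;
            size_t i;

            for (i = 0; i < n; i++) {
                    struct item *it = &items[i];

                    if (only && it != only)
                            continue;
                    if (it->in_use && it->faulty) {
                            it->in_use = 0;
                            removed++;
                    }
            }
            return removed;
    }

    int main(void)
    {
            struct item items[] = {
                    { .id = 0, .faulty = 1, .in_use = 1 },
                    { .id = 1, .faulty = 0, .in_use = 1 },
                    { .id = 2, .faulty = 1, .in_use = 1 },
            };

            /* Restrict the pass to item 2, then sweep everything. */
            printf("one: %d\n", retire_faulty(items, 3, &items[2]));
            printf("all: %d\n", retire_faulty(items, 3, NULL));
            return 0;
    }
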
diff --combined drivers/md/raid1.c
index 851023e2ba5d5296824a46bdc12482056de648a1,aeb4e3f74791b3f1d163aa19c8f3ed34cfa0f284..55951182af73680d3b7f40d32cac1302062dbe74
@@@ -92,7 -92,6 +92,6 @@@ static void r1bio_pool_free(void *r1_bi
  static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
  {
        struct pool_info *pi = data;
-       struct page *page;
        struct r1bio *r1_bio;
        struct bio *bio;
        int i, j;
                j = 1;
        while(j--) {
                bio = r1_bio->bios[j];
-               for (i = 0; i < RESYNC_PAGES; i++) {
-                       page = alloc_page(gfp_flags);
-                       if (unlikely(!page))
-                               goto out_free_pages;
+               bio->bi_vcnt = RESYNC_PAGES;
  
-                       bio->bi_io_vec[i].bv_page = page;
-                       bio->bi_vcnt = i+1;
-               }
+               if (bio_alloc_pages(bio, gfp_flags))
+                       goto out_free_bio;
        }
        /* If not user-requests, copy the page pointers to all bios */
        if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
  
        return r1_bio;
  
- out_free_pages:
-       for (j=0 ; j < pi->raid_disks; j++)
-               for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
-                       put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
-       j = -1;
  out_free_bio:
        while (++j < pi->raid_disks)
                bio_put(r1_bio->bios[j]);
@@@ -267,7 -257,7 +257,7 @@@ static void raid_end_bio_io(struct r1bi
                         (bio_data_dir(bio) == WRITE) ? "write" : "read",
                         (unsigned long long) bio->bi_sector,
                         (unsigned long long) bio->bi_sector +
-                        (bio->bi_size >> 9) - 1);
+                        bio_sectors(bio) - 1);
  
                call_bio_endio(r1_bio);
        }
@@@ -458,7 -448,7 +448,7 @@@ static void raid1_end_write_request(str
                                         " %llu-%llu\n",
                                         (unsigned long long) mbio->bi_sector,
                                         (unsigned long long) mbio->bi_sector +
-                                        (mbio->bi_size >> 9) - 1);
+                                        bio_sectors(mbio) - 1);
                                call_bio_endio(r1_bio);
                        }
                }
@@@ -925,7 -915,7 +915,7 @@@ static void alloc_behind_pages(struct b
        if (unlikely(!bvecs))
                return;
  
-       bio_for_each_segment(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i) {
                bvecs[i] = *bvec;
                bvecs[i].bv_page = alloc_page(GFP_NOIO);
                if (unlikely(!bvecs[i].bv_page))
@@@ -981,12 -971,7 +971,12 @@@ static void raid1_unplug(struct blk_plu
        while (bio) { /* submit pending writes */
                struct bio *next = bio->bi_next;
                bio->bi_next = NULL;
 -              generic_make_request(bio);
 +              if (unlikely((bio->bi_rw & REQ_DISCARD) &&
 +                  !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
 +                      /* Just ignore it */
 +                      bio_endio(bio, 0);
 +              else
 +                      generic_make_request(bio);
                bio = next;
        }
        kfree(plug);
@@@ -1023,7 -1008,7 +1013,7 @@@ static void make_request(struct mddev *
        md_write_start(mddev, bio); /* wait on superblock update early */
  
        if (bio_data_dir(bio) == WRITE &&
-           bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
+           bio_end_sector(bio) > mddev->suspend_lo &&
            bio->bi_sector < mddev->suspend_hi) {
                /* As the suspend_* range is controlled by
                 * userspace, we want an interruptible
                        flush_signals(current);
                        prepare_to_wait(&conf->wait_barrier,
                                        &w, TASK_INTERRUPTIBLE);
-                       if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
+                       if (bio_end_sector(bio) <= mddev->suspend_lo ||
                            bio->bi_sector >= mddev->suspend_hi)
                                break;
                        schedule();
        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
  
        r1_bio->master_bio = bio;
-       r1_bio->sectors = bio->bi_size >> 9;
+       r1_bio->sectors = bio_sectors(bio);
        r1_bio->state = 0;
        r1_bio->mddev = mddev;
        r1_bio->sector = bio->bi_sector;
@@@ -1132,7 -1117,7 +1122,7 @@@ read_again
                        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
  
                        r1_bio->master_bio = bio;
-                       r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+                       r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                        r1_bio->state = 0;
                        r1_bio->mddev = mddev;
                        r1_bio->sector = bio->bi_sector + sectors_handled;
                        struct bio_vec *bvec;
                        int j;
  
-                       /* Yes, I really want the '__' version so that
-                        * we clear any unused pointer in the io_vec, rather
-                        * than leave them unchanged.  This is important
-                        * because when we come to free the pages, we won't
-                        * know the original bi_idx, so we just free
-                        * them all
+                       /*
+                        * We trimmed the bio, so _all is legit
                         */
-                       __bio_for_each_segment(bvec, mbio, j, 0)
+                       bio_for_each_segment_all(bvec, mbio, j)
                                bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
                        if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
                                atomic_inc(&r1_bio->behind_remaining);
        /* Mustn't call r1_bio_write_done before this next test,
         * as it could result in the bio being freed.
         */
-       if (sectors_handled < (bio->bi_size >> 9)) {
+       if (sectors_handled < bio_sectors(bio)) {
                r1_bio_write_done(r1_bio);
                /* We need another r1_bio.  It has already been counted
                 * in bio->bi_phys_segments
                 */
                r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
                r1_bio->master_bio = bio;
-               r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+               r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                r1_bio->state = 0;
                r1_bio->mddev = mddev;
                r1_bio->sector = bio->bi_sector + sectors_handled;
@@@ -1867,7 -1848,7 +1853,7 @@@ static int process_checks(struct r1bio 
                struct bio *sbio = r1_bio->bios[i];
                int size;
  
-               if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+               if (sbio->bi_end_io != end_sync_read)
                        continue;
  
                if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
                        continue;
                }
                /* fixup the bio for reuse */
+               bio_reset(sbio);
                sbio->bi_vcnt = vcnt;
                sbio->bi_size = r1_bio->sectors << 9;
-               sbio->bi_idx = 0;
-               sbio->bi_phys_segments = 0;
-               sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               sbio->bi_flags |= 1 << BIO_UPTODATE;
-               sbio->bi_next = NULL;
                sbio->bi_sector = r1_bio->sector +
                        conf->mirrors[i].rdev->data_offset;
                sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+               sbio->bi_end_io = end_sync_read;
+               sbio->bi_private = r1_bio;
                size = sbio->bi_size;
                for (j = 0; j < vcnt ; j++) {
                        struct bio_vec *bi;
                        else
                                bi->bv_len = size;
                        size -= PAGE_SIZE;
-                       memcpy(page_address(bi->bv_page),
-                              page_address(pbio->bi_io_vec[j].bv_page),
-                              PAGE_SIZE);
                }
+               bio_copy_data(sbio, pbio);
        }
        return 0;
  }
@@@ -1952,7 -1931,7 +1936,7 @@@ static void sync_request_write(struct m
                wbio->bi_rw = WRITE;
                wbio->bi_end_io = end_sync_write;
                atomic_inc(&r1_bio->remaining);
-               md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
  
                generic_make_request(wbio);
        }
@@@ -2064,32 -2043,11 +2048,11 @@@ static void fix_read_error(struct r1con
        }
  }
  
- static void bi_complete(struct bio *bio, int error)
- {
-       complete((struct completion *)bio->bi_private);
- }
- static int submit_bio_wait(int rw, struct bio *bio)
- {
-       struct completion event;
-       rw |= REQ_SYNC;
-       init_completion(&event);
-       bio->bi_private = &event;
-       bio->bi_end_io = bi_complete;
-       submit_bio(rw, bio);
-       wait_for_completion(&event);
-       return test_bit(BIO_UPTODATE, &bio->bi_flags);
- }
  static int narrow_write_error(struct r1bio *r1_bio, int i)
  {
        struct mddev *mddev = r1_bio->mddev;
        struct r1conf *conf = mddev->private;
        struct md_rdev *rdev = conf->mirrors[i].rdev;
-       int vcnt, idx;
-       struct bio_vec *vec;
  
        /* bio has the data to be written to device 'i' where
         * we just recently had a write error.
                   & ~(sector_t)(block_sectors - 1))
                - sector;
  
-       if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-               vcnt = r1_bio->behind_page_count;
-               vec = r1_bio->behind_bvecs;
-               idx = 0;
-               while (vec[idx].bv_page == NULL)
-                       idx++;
-       } else {
-               vcnt = r1_bio->master_bio->bi_vcnt;
-               vec = r1_bio->master_bio->bi_io_vec;
-               idx = r1_bio->master_bio->bi_idx;
-       }
        while (sect_to_write) {
                struct bio *wbio;
                if (sectors > sect_to_write)
                        sectors = sect_to_write;
                /* Write at 'sector' for 'sectors'*/
  
-               wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
-               memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
-               wbio->bi_sector = r1_bio->sector;
+               if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+                       unsigned vcnt = r1_bio->behind_page_count;
+                       struct bio_vec *vec = r1_bio->behind_bvecs;
+                       while (!vec->bv_page) {
+                               vec++;
+                               vcnt--;
+                       }
+                       wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
+                       memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
+                       wbio->bi_vcnt = vcnt;
+               } else {
+                       wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+               }
                wbio->bi_rw = WRITE;
-               wbio->bi_vcnt = vcnt;
+               wbio->bi_sector = r1_bio->sector;
                wbio->bi_size = r1_bio->sectors << 9;
-               wbio->bi_idx = idx;
  
                md_trim_bio(wbio, sector - r1_bio->sector, sectors);
                wbio->bi_sector += rdev->data_offset;
@@@ -2289,8 -2249,7 +2254,7 @@@ read_more
                        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
  
                        r1_bio->master_bio = mbio;
-                       r1_bio->sectors = (mbio->bi_size >> 9)
-                                         - sectors_handled;
+                       r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
                        r1_bio->state = 0;
                        set_bit(R1BIO_ReadError, &r1_bio->state);
                        r1_bio->mddev = mddev;
@@@ -2464,18 -2423,7 +2428,7 @@@ static sector_t sync_request(struct mdd
        for (i = 0; i < conf->raid_disks * 2; i++) {
                struct md_rdev *rdev;
                bio = r1_bio->bios[i];
-               /* take from bio_init */
-               bio->bi_next = NULL;
-               bio->bi_flags &= ~(BIO_POOL_MASK-1);
-               bio->bi_flags |= 1 << BIO_UPTODATE;
-               bio->bi_rw = READ;
-               bio->bi_vcnt = 0;
-               bio->bi_idx = 0;
-               bio->bi_phys_segments = 0;
-               bio->bi_size = 0;
-               bio->bi_end_io = NULL;
-               bio->bi_private = NULL;
+               bio_reset(bio);
  
                rdev = rcu_dereference(conf->mirrors[i].rdev);
                if (rdev == NULL ||
@@@ -2906,7 -2854,6 +2859,7 @@@ static int stop(struct mddev *mddev
        if (conf->r1bio_pool)
                mempool_destroy(conf->r1bio_pool);
        kfree(conf->mirrors);
 +      safe_put_page(conf->tmppage);
        kfree(conf->poolinfo);
        kfree(conf);
        mddev->private = NULL;
diff --combined drivers/md/raid10.c
index 018741ba93104d9ad432d7524a136cf7e69b2227,e32e8b1042f8e57e5ee77f7162af708717f9e06c..59d4daa5f4c7a32c245ef954f24650fe75084117
@@@ -1133,12 -1133,7 +1133,12 @@@ static void raid10_unplug(struct blk_pl
        while (bio) { /* submit pending writes */
                struct bio *next = bio->bi_next;
                bio->bi_next = NULL;
 -              generic_make_request(bio);
 +              if (unlikely((bio->bi_rw & REQ_DISCARD) &&
 +                  !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
 +                      /* Just ignore it */
 +                      bio_endio(bio, 0);
 +              else
 +                      generic_make_request(bio);
                bio = next;
        }
        kfree(plug);
@@@ -1174,14 -1169,13 +1174,13 @@@ static void make_request(struct mddev *
        /* If this request crosses a chunk boundary, we need to
         * split it.  This will only happen for 1 PAGE (or less) requests.
         */
-       if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
+       if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
                     > chunk_sects
                     && (conf->geo.near_copies < conf->geo.raid_disks
                         || conf->prev.near_copies < conf->prev.raid_disks))) {
                struct bio_pair *bp;
                /* Sanity check -- queue functions should prevent this happening */
-               if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
-                   bio->bi_idx != 0)
+               if (bio_segments(bio) > 1)
                        goto bad_map;
                /* This is a one page bio that upper layers
                 * refuse to split for us, so we need to split it.
        bad_map:
                printk("md/raid10:%s: make_request bug: can't convert block across chunks"
                       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
-                      (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+                      (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
  
                bio_io_error(bio);
                return;
         */
        wait_barrier(conf);
  
-       sectors = bio->bi_size >> 9;
+       sectors = bio_sectors(bio);
        while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
            bio->bi_sector < conf->reshape_progress &&
            bio->bi_sector + sectors > conf->reshape_progress) {
@@@ -1331,8 -1325,7 +1330,7 @@@ read_again
                        r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
  
                        r10_bio->master_bio = bio;
-                       r10_bio->sectors = ((bio->bi_size >> 9)
-                                           - sectors_handled);
+                       r10_bio->sectors = bio_sectors(bio) - sectors_handled;
                        r10_bio->state = 0;
                        r10_bio->mddev = mddev;
                        r10_bio->sector = bio->bi_sector + sectors_handled;
@@@ -1574,7 -1567,7 +1572,7 @@@ retry_write
         * after checking if we need to go around again.
         */
  
-       if (sectors_handled < (bio->bi_size >> 9)) {
+       if (sectors_handled < bio_sectors(bio)) {
                one_write_done(r10_bio);
                /* We need another r10_bio.  It has already been counted
                 * in bio->bi_phys_segments.
                r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
  
                r10_bio->master_bio = bio;
-               r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+               r10_bio->sectors = bio_sectors(bio) - sectors_handled;
  
                r10_bio->mddev = mddev;
                r10_bio->sector = bio->bi_sector + sectors_handled;
@@@ -2084,13 -2077,10 +2082,10 @@@ static void sync_request_write(struct m
                 * First we need to fixup bv_offset, bv_len and
                 * bi_vecs, as the read request might have corrupted these
                 */
+               bio_reset(tbio);
                tbio->bi_vcnt = vcnt;
                tbio->bi_size = r10_bio->sectors << 9;
-               tbio->bi_idx = 0;
-               tbio->bi_phys_segments = 0;
-               tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               tbio->bi_flags |= 1 << BIO_UPTODATE;
-               tbio->bi_next = NULL;
                tbio->bi_rw = WRITE;
                tbio->bi_private = r10_bio;
                tbio->bi_sector = r10_bio->devs[i].addr;
                d = r10_bio->devs[i].devnum;
                atomic_inc(&conf->mirrors[d].rdev->nr_pending);
                atomic_inc(&r10_bio->remaining);
-               md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
  
                tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
                tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
                d = r10_bio->devs[i].devnum;
                atomic_inc(&r10_bio->remaining);
                md_sync_acct(conf->mirrors[d].replacement->bdev,
-                            tbio->bi_size >> 9);
+                            bio_sectors(tbio));
                generic_make_request(tbio);
        }
  
@@@ -2259,13 -2249,13 +2254,13 @@@ static void recovery_request_write(stru
        wbio2 = r10_bio->devs[1].repl_bio;
        if (wbio->bi_end_io) {
                atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-               md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
                generic_make_request(wbio);
        }
        if (wbio2 && wbio2->bi_end_io) {
                atomic_inc(&conf->mirrors[d].replacement->nr_pending);
                md_sync_acct(conf->mirrors[d].replacement->bdev,
-                            wbio2->bi_size >> 9);
+                            bio_sectors(wbio2));
                generic_make_request(wbio2);
        }
  }
@@@ -2536,25 -2526,6 +2531,6 @@@ static void fix_read_error(struct r10co
        }
  }
  
- static void bi_complete(struct bio *bio, int error)
- {
-       complete((struct completion *)bio->bi_private);
- }
- static int submit_bio_wait(int rw, struct bio *bio)
- {
-       struct completion event;
-       rw |= REQ_SYNC;
-       init_completion(&event);
-       bio->bi_private = &event;
-       bio->bi_end_io = bi_complete;
-       submit_bio(rw, bio);
-       wait_for_completion(&event);
-       return test_bit(BIO_UPTODATE, &bio->bi_flags);
- }
  static int narrow_write_error(struct r10bio *r10_bio, int i)
  {
        struct bio *bio = r10_bio->master_bio;
@@@ -2695,8 -2666,7 +2671,7 @@@ read_more
                r10_bio = mempool_alloc(conf->r10bio_pool,
                                        GFP_NOIO);
                r10_bio->master_bio = mbio;
-               r10_bio->sectors = (mbio->bi_size >> 9)
-                       - sectors_handled;
+               r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
                r10_bio->state = 0;
                set_bit(R10BIO_ReadError,
                        &r10_bio->state);
@@@ -2918,22 -2888,6 +2893,22 @@@ static sector_t sync_request(struct mdd
                if (init_resync(conf))
                        return 0;
  
 +      /*
 +       * Allow skipping a full rebuild for incremental assembly
 +       * of a clean array, like RAID1 does.
 +       */
 +      if (mddev->bitmap == NULL &&
 +          mddev->recovery_cp == MaxSector &&
 +          !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 +          conf->fullsync == 0) {
 +              *skipped = 1;
 +              max_sector = mddev->dev_sectors;
 +              if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
 +                  test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 +                      max_sector = mddev->resync_max_sectors;
 +              return max_sector - sector_nr;
 +      }
 +
   skipped:
        max_sector = mddev->dev_sectors;
        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
                                        }
                                }
                                bio = r10_bio->devs[0].bio;
+                               bio_reset(bio);
                                bio->bi_next = biolist;
                                biolist = bio;
                                bio->bi_private = r10_bio;
                                rdev = mirror->rdev;
                                if (!test_bit(In_sync, &rdev->flags)) {
                                        bio = r10_bio->devs[1].bio;
+                                       bio_reset(bio);
                                        bio->bi_next = biolist;
                                        biolist = bio;
                                        bio->bi_private = r10_bio;
                                if (rdev == NULL || bio == NULL ||
                                    test_bit(Faulty, &rdev->flags))
                                        break;
+                               bio_reset(bio);
                                bio->bi_next = biolist;
                                biolist = bio;
                                bio->bi_private = r10_bio;
                                r10_bio->devs[i].repl_bio->bi_end_io = NULL;
  
                        bio = r10_bio->devs[i].bio;
-                       bio->bi_end_io = NULL;
+                       bio_reset(bio);
                        clear_bit(BIO_UPTODATE, &bio->bi_flags);
                        if (conf->mirrors[d].rdev == NULL ||
                            test_bit(Faulty, &conf->mirrors[d].rdev->flags))
  
                        /* Need to set up for writing to the replacement */
                        bio = r10_bio->devs[i].repl_bio;
+                       bio_reset(bio);
                        clear_bit(BIO_UPTODATE, &bio->bi_flags);
  
                        sector = r10_bio->devs[i].addr;
                }
        }
  
-       for (bio = biolist; bio ; bio=bio->bi_next) {
-               bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               if (bio->bi_end_io)
-                       bio->bi_flags |= 1 << BIO_UPTODATE;
-               bio->bi_vcnt = 0;
-               bio->bi_idx = 0;
-               bio->bi_phys_segments = 0;
-               bio->bi_size = 0;
-       }
        nr_sectors = 0;
        if (sector_nr + max_sync < max_sector)
                max_sector = sector_nr + max_sync;
@@@ -3831,7 -3778,6 +3799,7 @@@ static int stop(struct mddev *mddev
  
        if (conf->r10bio_pool)
                mempool_destroy(conf->r10bio_pool);
 +      safe_put_page(conf->tmppage);
        kfree(conf->mirrors);
        kfree(conf);
        mddev->private = NULL;
@@@ -4411,7 -4357,6 +4379,6 @@@ read_more
        read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
        read_bio->bi_flags |= 1 << BIO_UPTODATE;
        read_bio->bi_vcnt = 0;
-       read_bio->bi_idx = 0;
        read_bio->bi_size = 0;
        r10_bio->master_bio = read_bio;
        r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
                }
                if (!rdev2 || test_bit(Faulty, &rdev2->flags))
                        continue;
+               bio_reset(b);
                b->bi_bdev = rdev2->bdev;
                b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
                b->bi_private = r10_bio;
                b->bi_end_io = end_reshape_write;
                b->bi_rw = WRITE;
-               b->bi_flags &= ~(BIO_POOL_MASK - 1);
-               b->bi_flags |= 1 << BIO_UPTODATE;
                b->bi_next = blist;
-               b->bi_vcnt = 0;
-               b->bi_idx = 0;
-               b->bi_size = 0;
                blist = b;
        }
  
diff --combined drivers/md/raid5.c
index 4a7be455d6d86ceb6bda86a332b81d036db52dee,2fefb9f2198e2269bafa26d62e60eb96dbb63aa0..9359828ffe264d3313ee77de993ea4c5147f1205
@@@ -90,7 -90,7 +90,7 @@@ static inline struct hlist_head *stripe
   */
  static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
  {
-       int sectors = bio->bi_size >> 9;
+       int sectors = bio_sectors(bio);
        if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
                return bio->bi_next;
        else
@@@ -184,8 -184,6 +184,8 @@@ static void return_io(struct bio *retur
                return_bi = bi->bi_next;
                bi->bi_next = NULL;
                bi->bi_size = 0;
 +              trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 +                                       bi, 0);
                bio_endio(bi, 0);
                bi = return_bi;
        }
@@@ -569,14 -567,6 +569,6 @@@ static void ops_run_io(struct stripe_he
                bi = &sh->dev[i].req;
                rbi = &sh->dev[i].rreq; /* For writing to replacement */
  
-               bi->bi_rw = rw;
-               rbi->bi_rw = rw;
-               if (rw & WRITE) {
-                       bi->bi_end_io = raid5_end_write_request;
-                       rbi->bi_end_io = raid5_end_write_request;
-               } else
-                       bi->bi_end_io = raid5_end_read_request;
                rcu_read_lock();
                rrdev = rcu_dereference(conf->disks[i].replacement);
                smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
  
                        set_bit(STRIPE_IO_STARTED, &sh->state);
  
+                       bio_reset(bi);
                        bi->bi_bdev = rdev->bdev;
+                       bi->bi_rw = rw;
+                       bi->bi_end_io = (rw & WRITE)
+                               ? raid5_end_write_request
+                               : raid5_end_read_request;
+                       bi->bi_private = sh;
                        pr_debug("%s: for %llu schedule op %ld on disc %d\n",
                                __func__, (unsigned long long)sh->sector,
                                bi->bi_rw, i);
                        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
                                bi->bi_rw |= REQ_FLUSH;
  
-                       bi->bi_flags = 1 << BIO_UPTODATE;
-                       bi->bi_idx = 0;
                        bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        bi->bi_io_vec[0].bv_offset = 0;
                        bi->bi_size = STRIPE_SIZE;
-                       bi->bi_next = NULL;
                        if (rrdev)
                                set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
  
  
                        set_bit(STRIPE_IO_STARTED, &sh->state);
  
+                       bio_reset(rbi);
                        rbi->bi_bdev = rrdev->bdev;
+                       rbi->bi_rw = rw;
+                       BUG_ON(!(rw & WRITE));
+                       rbi->bi_end_io = raid5_end_write_request;
+                       rbi->bi_private = sh;
                        pr_debug("%s: for %llu schedule op %ld on "
                                 "replacement disc %d\n",
                                __func__, (unsigned long long)sh->sector,
                        else
                                rbi->bi_sector = (sh->sector
                                                  + rrdev->data_offset);
-                       rbi->bi_flags = 1 << BIO_UPTODATE;
-                       rbi->bi_idx = 0;
                        rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        rbi->bi_io_vec[0].bv_offset = 0;
                        rbi->bi_size = STRIPE_SIZE;
-                       rbi->bi_next = NULL;
                        if (conf->mddev->gendisk)
                                trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
                                                      rbi, disk_devt(conf->mddev->gendisk),
@@@ -1887,15 -1884,8 +1886,15 @@@ static void raid5_end_write_request(str
                                        &rdev->mddev->recovery);
                } else if (is_badblock(rdev, sh->sector,
                                       STRIPE_SECTORS,
 -                                     &first_bad, &bad_sectors))
 +                                     &first_bad, &bad_sectors)) {
                        set_bit(R5_MadeGood, &sh->dev[i].flags);
 +                      if (test_bit(R5_ReadError, &sh->dev[i].flags))
 +                              /* That was a successful write so make
 +                               * sure it looks like we already did
 +                               * a re-write.
 +                               */
 +                              set_bit(R5_ReWrite, &sh->dev[i].flags);
 +              }
        }
        rdev_dec_pending(rdev, conf->mddev);
  
@@@ -2402,11 -2392,11 +2401,11 @@@ static int add_stripe_bio(struct stripe
        } else
                bip = &sh->dev[dd_idx].toread;
        while (*bip && (*bip)->bi_sector < bi->bi_sector) {
-               if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
+               if (bio_end_sector(*bip) > bi->bi_sector)
                        goto overlap;
                bip = & (*bip)->bi_next;
        }
-       if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
+       if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
                goto overlap;
  
        BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
                     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
                             bi && bi->bi_sector <= sector;
                     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
-                       if (bi->bi_sector + (bi->bi_size>>9) >= sector)
-                               sector = bi->bi_sector + (bi->bi_size>>9);
+                       if (bio_end_sector(bi) >= sector)
+                               sector = bio_end_sector(bi);
                }
                if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
                        set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
@@@ -3849,7 -3839,7 +3848,7 @@@ static int in_chunk_boundary(struct mdd
  {
        sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
        unsigned int chunk_sectors = mddev->chunk_sectors;
-       unsigned int bio_sectors = bio->bi_size >> 9;
+       unsigned int bio_sectors = bio_sectors(bio);
  
        if (mddev->new_chunk_sectors < mddev->chunk_sectors)
                chunk_sectors = mddev->new_chunk_sectors;
@@@ -3923,8 -3913,6 +3922,8 @@@ static void raid5_align_endio(struct bi
        rdev_dec_pending(rdev, conf->mddev);
  
        if (!error && uptodate) {
 +              trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
 +                                       raid_bi, 0);
                bio_endio(raid_bi, 0);
                if (atomic_dec_and_test(&conf->active_aligned_reads))
                        wake_up(&conf->wait_for_stripe);
@@@ -3941,7 -3929,7 +3940,7 @@@ static int bio_fits_rdev(struct bio *bi
  {
        struct request_queue *q = bdev_get_queue(bi->bi_bdev);
  
-       if ((bi->bi_size>>9) > queue_max_sectors(q))
+       if (bio_sectors(bi) > queue_max_sectors(q))
                return 0;
        blk_recount_segments(q, bi);
        if (bi->bi_phys_segments > queue_max_segments(q))
@@@ -3988,7 -3976,7 +3987,7 @@@ static int chunk_aligned_read(struct md
                                                    0,
                                                    &dd_idx, NULL);
  
-       end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
+       end_sector = bio_end_sector(align_bi);
        rcu_read_lock();
        rdev = rcu_dereference(conf->disks[dd_idx].replacement);
        if (!rdev || test_bit(Faulty, &rdev->flags) ||
                align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
  
                if (!bio_fits_rdev(align_bi) ||
-                   is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
+                   is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
                                &first_bad, &bad_sectors)) {
                        /* too big in some way, or has a known bad block */
                        bio_put(align_bi);
@@@ -4273,7 -4261,7 +4272,7 @@@ static void make_request(struct mddev *
        }
  
        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-       last_sector = bi->bi_sector + (bi->bi_size>>9);
+       last_sector = bio_end_sector(bi);
        bi->bi_next = NULL;
        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
  
                if ( rw == WRITE )
                        md_write_end(mddev);
  
 +              trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 +                                       bi, 0);
                bio_endio(bi, 0);
        }
  }
@@@ -4679,10 -4665,9 +4678,10 @@@ static inline sector_t sync_request(str
                *skipped = 1;
                return rv;
        }
 -      if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
 -          !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 -          !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
 +      if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 +          !conf->fullsync &&
 +          !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
 +          sync_blocks >= STRIPE_SECTORS) {
                /* we can skip this block, and probably more */
                sync_blocks /= STRIPE_SECTORS;
                *skipped = 1;
@@@ -4739,7 -4724,7 +4738,7 @@@ static int  retry_aligned_read(struct r
        logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        sector = raid5_compute_sector(conf, logical_sector,
                                      0, &dd_idx, NULL);
-       last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+       last_sector = bio_end_sector(raid_bio);
  
        for (; logical_sector < last_sector;
             logical_sector += STRIPE_SECTORS,
                handled++;
        }
        remaining = raid5_dec_bi_active_stripes(raid_bio);
 -      if (remaining == 0)
 +      if (remaining == 0) {
 +              trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
 +                                       raid_bio, 0);
                bio_endio(raid_bio, 0);
 +      }
        if (atomic_dec_and_test(&conf->active_aligned_reads))
                wake_up(&conf->wait_for_stripe);
        return handled;
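
Much of the raid1/raid10/raid5 churn above is mechanical: open-coded uses of bio->bi_size >> 9 and bio->bi_sector + (bio->bi_size >> 9) become bio_sectors() and bio_end_sector(), keeping callers out of the bio's fields ahead of the immutable-biovec work. A stand-alone sketch of the same accessor idea over a simplified extent structure (not the kernel's struct bio):

    #include <stdio.h>

    /* Simplified stand-in: a byte-sized extent starting at a 512-byte sector. */
    struct extent {
            unsigned long long start_sector;
            unsigned int bytes;
    };

    static inline unsigned int extent_sectors(const struct extent *e)
    {
            return e->bytes >> 9;           /* bytes -> 512-byte sectors */
    }

    static inline unsigned long long extent_end_sector(const struct extent *e)
    {
            return e->start_sector + extent_sectors(e);
    }

    int main(void)
    {
            struct extent e = { .start_sector = 2048, .bytes = 8192 };

            printf("%u sectors, ends at %llu\n",
                   extent_sectors(&e), extent_end_sector(&e));
            return 0;
    }
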
diff --combined drivers/message/fusion/mptsas.c
index ffee6f781e30f6a25537bba1ed44f379aee137fd,2bb01546df0bcdf5063b09ccf55da5d17dbf838d..dd239bdbfcb4a0877db2ab49aa3c27a81eec7dd1
@@@ -1977,7 -1977,7 +1977,7 @@@ done
  static struct scsi_host_template mptsas_driver_template = {
        .module                         = THIS_MODULE,
        .proc_name                      = "mptsas",
 -      .proc_info                      = mptscsih_proc_info,
 +      .show_info                      = mptscsih_show_info,
        .name                           = "MPT SAS Host",
        .info                           = mptscsih_info,
        .queuecommand                   = mptsas_qcmd,
@@@ -2235,10 -2235,10 +2235,10 @@@ static int mptsas_smp_handler(struct Sc
        }
  
        /* do we need to support multiple segments? */
-       if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
+       if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) {
                printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u %u, rsp %u %u\n",
-                   ioc->name, __func__, req->bio->bi_vcnt, blk_rq_bytes(req),
-                   rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
+                   ioc->name, __func__, bio_segments(req->bio), blk_rq_bytes(req),
+                   bio_segments(rsp->bio), blk_rq_bytes(rsp));
                return -EINVAL;
        }
  
index 07ba32b07fb05d427cb18e62807ef40152577406,12d08b4529e9d3b1292ed317ee641bdfd7abddd5..6eca019bcf30a50edfab1a80daf1b351d2320474
@@@ -26,7 -26,7 +26,7 @@@
  #define DCSS_BUS_ID_SIZE 20
  
  static int dcssblk_open(struct block_device *bdev, fmode_t mode);
 -static int dcssblk_release(struct gendisk *disk, fmode_t mode);
 +static void dcssblk_release(struct gendisk *disk, fmode_t mode);
  static void dcssblk_make_request(struct request_queue *q, struct bio *bio);
  static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
                                 void **kaddr, unsigned long *pfn);
@@@ -781,15 -781,16 +781,15 @@@ out
        return rc;
  }
  
 -static int
 +static void
  dcssblk_release(struct gendisk *disk, fmode_t mode)
  {
        struct dcssblk_dev_info *dev_info = disk->private_data;
        struct segment_info *entry;
 -      int rc;
  
        if (!dev_info) {
 -              rc = -ENODEV;
 -              goto out;
 +              WARN_ON(1);
 +              return;
        }
        down_write(&dcssblk_devices_sem);
        if (atomic_dec_and_test(&dev_info->use_count)
                dev_info->save_pending = 0;
        }
        up_write(&dcssblk_devices_sem);
 -      rc = 0;
 -out:
 -      return rc;
  }
  
  static void
@@@ -822,8 -826,7 +822,7 @@@ dcssblk_make_request(struct request_que
        if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0)
                /* Request is not page-aligned. */
                goto fail;
-       if (((bio->bi_size >> 9) + bio->bi_sector)
-                       > get_capacity(bio->bi_bdev->bd_disk)) {
+       if (bio_end_sector(bio) > get_capacity(bio->bi_bdev->bd_disk)) {
                /* Request beyond end of DCSS segment. */
                goto fail;
        }
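
The open-coded sector arithmetic replaced above and below relies on two accessors from include/linux/bio.h in this kernel generation; paraphrased here for reference, not part of this diff:

	#define bio_sectors(bio)	((bio)->bi_size >> 9)
	#define bio_end_sector(bio)	((bio)->bi_sector + bio_sectors(bio))

so bio_end_sector(bio) names the first sector past the end of the request, replacing the hand-rolled bi_sector + (bi_size >> 9).
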
index 55cbd018015997bba484bfe5ffdbacabcc7e43db,7af776737b40ea346fb7586bdfea1236fb8b92a0..f42b0e15410f8a52e0a570664b04547d459e7aec
@@@ -235,17 -235,6 +235,17 @@@ static void sas_set_ex_phy(struct domai
        linkrate  = phy->linkrate;
        memcpy(sas_addr, phy->attached_sas_addr, SAS_ADDR_SIZE);
  
 +      /* Handle vacant phy - rest of dr data is not valid so skip it */
 +      if (phy->phy_state == PHY_VACANT) {
 +              memset(phy->attached_sas_addr, 0, SAS_ADDR_SIZE);
 +              phy->attached_dev_type = NO_DEVICE;
 +              if (!test_bit(SAS_HA_ATA_EH_ACTIVE, &ha->state)) {
 +                      phy->phy_id = phy_id;
 +                      goto skip;
 +              } else
 +                      goto out;
 +      }
 +
        phy->attached_dev_type = to_dev_type(dr);
        if (test_bit(SAS_HA_ATA_EH_ACTIVE, &ha->state))
                goto out;
        phy->phy->maximum_linkrate = dr->pmax_linkrate;
        phy->phy->negotiated_linkrate = phy->linkrate;
  
 + skip:
        if (new_phy)
                if (sas_phy_add(phy->phy)) {
                        sas_phy_free(phy->phy);
@@@ -400,7 -388,7 +400,7 @@@ int sas_ex_phy_discover(struct domain_d
        if (!disc_req)
                return -ENOMEM;
  
 -      disc_resp = alloc_smp_req(DISCOVER_RESP_SIZE);
 +      disc_resp = alloc_smp_resp(DISCOVER_RESP_SIZE);
        if (!disc_resp) {
                kfree(disc_req);
                return -ENOMEM;
@@@ -2163,10 -2151,10 +2163,10 @@@ int sas_smp_handler(struct Scsi_Host *s
        }
  
        /* do we need to support multiple segments? */
-       if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
+       if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) {
                printk("%s: multiple segments req %u %u, rsp %u %u\n",
-                      __func__, req->bio->bi_vcnt, blk_rq_bytes(req),
-                      rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
+                      __func__, bio_segments(req->bio), blk_rq_bytes(req),
+                      bio_segments(rsp->bio), blk_rq_bytes(rsp));
                return -EINVAL;
        }
  
diff --combined fs/bio.c
index 954d73124b411a733891f00394c44a1ffaefd0b1,9238a54b562c9bb0b672f4bd6359871e192c21d8..94bbc04dba77053bb47d3d8b793a3a8218f0a0d2
+++ b/fs/bio.c
@@@ -19,7 -19,6 +19,7 @@@
  #include <linux/swap.h>
  #include <linux/bio.h>
  #include <linux/blkdev.h>
 +#include <linux/uio.h>
  #include <linux/iocontext.h>
  #include <linux/slab.h>
  #include <linux/init.h>
@@@ -161,12 -160,12 +161,12 @@@ unsigned int bvec_nr_vecs(unsigned shor
        return bvec_slabs[idx].nr_vecs;
  }
  
- void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
+ void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
  {
        BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
  
        if (idx == BIOVEC_MAX_IDX)
-               mempool_free(bv, bs->bvec_pool);
+               mempool_free(bv, pool);
        else {
                struct biovec_slab *bvs = bvec_slabs + idx;
  
        }
  }
  
- struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
-                             struct bio_set *bs)
+ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
+                          mempool_t *pool)
  {
        struct bio_vec *bvl;
  
         */
        if (*idx == BIOVEC_MAX_IDX) {
  fallback:
-               bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
+               bvl = mempool_alloc(pool, gfp_mask);
        } else {
                struct biovec_slab *bvs = bvec_slabs + *idx;
                gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
@@@ -253,8 -252,8 +253,8 @@@ static void bio_free(struct bio *bio
        __bio_free(bio);
  
        if (bs) {
-               if (bio_has_allocated_vec(bio))
-                       bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
+               if (bio_flagged(bio, BIO_OWNS_VEC))
+                       bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
  
                /*
                 * If we have front padding, adjust the bio pointer before freeing
@@@ -298,6 -297,54 +298,54 @@@ void bio_reset(struct bio *bio
  }
  EXPORT_SYMBOL(bio_reset);
  
+ static void bio_alloc_rescue(struct work_struct *work)
+ {
+       struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
+       struct bio *bio;
+       while (1) {
+               spin_lock(&bs->rescue_lock);
+               bio = bio_list_pop(&bs->rescue_list);
+               spin_unlock(&bs->rescue_lock);
+               if (!bio)
+                       break;
+               generic_make_request(bio);
+       }
+ }
+ static void punt_bios_to_rescuer(struct bio_set *bs)
+ {
+       struct bio_list punt, nopunt;
+       struct bio *bio;
+       /*
+        * In order to guarantee forward progress we must punt only bios that
+        * were allocated from this bio_set; otherwise, if there was a bio on
+        * there for a stacking driver higher up in the stack, processing it
+        * could require allocating bios from this bio_set, and doing that from
+        * our own rescuer would be bad.
+        *
+        * Since bio lists are singly linked, pop them all instead of trying to
+        * remove from the middle of the list:
+        */
+       bio_list_init(&punt);
+       bio_list_init(&nopunt);
+       while ((bio = bio_list_pop(current->bio_list)))
+               bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+       *current->bio_list = nopunt;
+       spin_lock(&bs->rescue_lock);
+       bio_list_merge(&bs->rescue_list, &punt);
+       spin_unlock(&bs->rescue_lock);
+       queue_work(bs->rescue_workqueue, &bs->rescue_work);
+ }
  /**
   * bio_alloc_bioset - allocate a bio for I/O
   * @gfp_mask:   the GFP_ mask given to the slab allocator
   *   previously allocated bio for IO before attempting to allocate a new one.
   *   Failure to do so can cause deadlocks under memory pressure.
   *
+  *   Note that when running under generic_make_request() (i.e. any block
+  *   driver), bios are not submitted until after you return - see the code in
+  *   generic_make_request() that converts recursion into iteration, to prevent
+  *   stack overflows.
+  *
+  *   This would normally mean allocating multiple bios under
+  *   generic_make_request() would be susceptible to deadlocks, but we have
+  *   deadlock avoidance code that resubmits any blocked bios from a rescuer
+  *   thread.
+  *
+  *   However, we do not guarantee forward progress for allocations from other
+  *   mempools. Doing multiple allocations from the same mempool under
+  *   generic_make_request() should be avoided - instead, use bio_set's front_pad
+  *   for per bio allocations.
+  *
   *   RETURNS:
   *   Pointer to new bio on success, NULL on failure.
   */
  struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
  {
+       gfp_t saved_gfp = gfp_mask;
        unsigned front_pad;
        unsigned inline_vecs;
        unsigned long idx = BIO_POOL_NONE;
                front_pad = 0;
                inline_vecs = nr_iovecs;
        } else {
+               /*
+                * generic_make_request() converts recursion to iteration; this
+                * means if we're running beneath it, any bios we allocate and
+                * submit will not be submitted (and thus freed) until after we
+                * return.
+                *
+                * This exposes us to a potential deadlock if we allocate
+                * multiple bios from the same bio_set() while running
+                * underneath generic_make_request(). If we were to allocate
+                * multiple bios (say a stacking block driver that was splitting
+                * bios), we would deadlock if we exhausted the mempool's
+                * reserve.
+                *
+                * We solve this, and guarantee forward progress, with a rescuer
+                * workqueue per bio_set. If we go to allocate and there are
+                * bios on current->bio_list, we first try the allocation
+                * without __GFP_WAIT; if that fails, we punt those bios we
+                * would be blocking to the rescuer workqueue before we retry
+                * with the original gfp_flags.
+                */
+               if (current->bio_list && !bio_list_empty(current->bio_list))
+                       gfp_mask &= ~__GFP_WAIT;
                p = mempool_alloc(bs->bio_pool, gfp_mask);
+               if (!p && gfp_mask != saved_gfp) {
+                       punt_bios_to_rescuer(bs);
+                       gfp_mask = saved_gfp;
+                       p = mempool_alloc(bs->bio_pool, gfp_mask);
+               }
                front_pad = bs->front_pad;
                inline_vecs = BIO_INLINE_VECS;
        }
        bio_init(bio);
  
        if (nr_iovecs > inline_vecs) {
-               bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
+               bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+               if (!bvl && gfp_mask != saved_gfp) {
+                       punt_bios_to_rescuer(bs);
+                       gfp_mask = saved_gfp;
+                       bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+               }
                if (unlikely(!bvl))
                        goto err_free;
+               bio->bi_flags |= 1 << BIO_OWNS_VEC;
        } else if (nr_iovecs) {
                bvl = bio->bi_inline_vecs;
        }
@@@ -653,6 -754,181 +755,181 @@@ int bio_add_page(struct bio *bio, struc
  }
  EXPORT_SYMBOL(bio_add_page);
  
+ struct submit_bio_ret {
+       struct completion event;
+       int error;
+ };
+ static void submit_bio_wait_endio(struct bio *bio, int error)
+ {
+       struct submit_bio_ret *ret = bio->bi_private;
+       ret->error = error;
+       complete(&ret->event);
+ }
+ /**
+  * submit_bio_wait - submit a bio, and wait until it completes
+  * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
+  * @bio: The &struct bio which describes the I/O
+  *
+  * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
+  * bio_endio() on failure.
+  */
+ int submit_bio_wait(int rw, struct bio *bio)
+ {
+       struct submit_bio_ret ret;
+       rw |= REQ_SYNC;
+       init_completion(&ret.event);
+       bio->bi_private = &ret;
+       bio->bi_end_io = submit_bio_wait_endio;
+       submit_bio(rw, bio);
+       wait_for_completion(&ret.event);
+       return ret.error;
+ }
+ EXPORT_SYMBOL(submit_bio_wait);
+ /**
+  * bio_advance - increment/complete a bio by some number of bytes
+  * @bio:      bio to advance
+  * @bytes:    number of bytes to complete
+  *
+  * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
+  * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
+  * be updated on the last bvec as well.
+  *
+  * @bio will then represent the remaining, uncompleted portion of the io.
+  */
+ void bio_advance(struct bio *bio, unsigned bytes)
+ {
+       if (bio_integrity(bio))
+               bio_integrity_advance(bio, bytes);
+       bio->bi_sector += bytes >> 9;
+       bio->bi_size -= bytes;
+       if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK)
+               return;
+       while (bytes) {
+               if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
+                       WARN_ONCE(1, "bio idx %d >= vcnt %d\n",
+                                 bio->bi_idx, bio->bi_vcnt);
+                       break;
+               }
+               if (bytes >= bio_iovec(bio)->bv_len) {
+                       bytes -= bio_iovec(bio)->bv_len;
+                       bio->bi_idx++;
+               } else {
+                       bio_iovec(bio)->bv_len -= bytes;
+                       bio_iovec(bio)->bv_offset += bytes;
+                       bytes = 0;
+               }
+       }
+ }
+ EXPORT_SYMBOL(bio_advance);
+ /**
+  * bio_alloc_pages - allocates a single page for each bvec in a bio
+  * @bio: bio to allocate pages for
+  * @gfp_mask: flags for allocation
+  *
+  * Allocates pages up to @bio->bi_vcnt.
+  *
+  * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
+  * freed.
+  */
+ int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+ {
+       int i;
+       struct bio_vec *bv;
+       bio_for_each_segment_all(bv, bio, i) {
+               bv->bv_page = alloc_page(gfp_mask);
+               if (!bv->bv_page) {
+                       while (--bv >= bio->bi_io_vec)
+                               __free_page(bv->bv_page);
+                       return -ENOMEM;
+               }
+       }
+       return 0;
+ }
+ EXPORT_SYMBOL(bio_alloc_pages);
+ /**
+  * bio_copy_data - copy contents of data buffers from one chain of bios to
+  * another
+  * @src: source bio list
+  * @dst: destination bio list
+  *
+  * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
+  * @src and @dst as linked lists of bios.
+  *
+  * Stops when it reaches the end of either @src or @dst - that is, copies
+  * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+  */
+ void bio_copy_data(struct bio *dst, struct bio *src)
+ {
+       struct bio_vec *src_bv, *dst_bv;
+       unsigned src_offset, dst_offset, bytes;
+       void *src_p, *dst_p;
+       src_bv = bio_iovec(src);
+       dst_bv = bio_iovec(dst);
+       src_offset = src_bv->bv_offset;
+       dst_offset = dst_bv->bv_offset;
+       while (1) {
+               if (src_offset == src_bv->bv_offset + src_bv->bv_len) {
+                       src_bv++;
+                       if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) {
+                               src = src->bi_next;
+                               if (!src)
+                                       break;
+                               src_bv = bio_iovec(src);
+                       }
+                       src_offset = src_bv->bv_offset;
+               }
+               if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) {
+                       dst_bv++;
+                       if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) {
+                               dst = dst->bi_next;
+                               if (!dst)
+                                       break;
+                               dst_bv = bio_iovec(dst);
+                       }
+                       dst_offset = dst_bv->bv_offset;
+               }
+               bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset,
+                           src_bv->bv_offset + src_bv->bv_len - src_offset);
+               src_p = kmap_atomic(src_bv->bv_page);
+               dst_p = kmap_atomic(dst_bv->bv_page);
+               memcpy(dst_p + dst_bv->bv_offset,
+                      src_p + src_bv->bv_offset,
+                      bytes);
+               kunmap_atomic(dst_p);
+               kunmap_atomic(src_p);
+               src_offset += bytes;
+               dst_offset += bytes;
+       }
+ }
+ EXPORT_SYMBOL(bio_copy_data);
  struct bio_map_data {
        struct bio_vec *iovecs;
        struct sg_iovec *sgvecs;
@@@ -715,7 -991,7 +992,7 @@@ static int __bio_copy_iov(struct bio *b
        int iov_idx = 0;
        unsigned int iov_off = 0;
  
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                char *bv_addr = page_address(bvec->bv_page);
                unsigned int bv_len = iovecs[i].bv_len;
  
@@@ -897,7 -1173,7 +1174,7 @@@ struct bio *bio_copy_user_iov(struct re
        return bio;
  cleanup:
        if (!map_data)
-               bio_for_each_segment(bvec, bio, i)
+               bio_for_each_segment_all(bvec, bio, i)
                        __free_page(bvec->bv_page);
  
        bio_put(bio);
@@@ -1111,7 -1387,7 +1388,7 @@@ static void __bio_unmap_user(struct bi
        /*
         * make sure we dirty pages we wrote to
         */
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                if (bio_data_dir(bio) == READ)
                        set_page_dirty_lock(bvec->bv_page);
  
@@@ -1217,7 -1493,7 +1494,7 @@@ static void bio_copy_kern_endio(struct 
        int i;
        char *p = bmd->sgvecs[0].iov_base;
  
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                char *addr = page_address(bvec->bv_page);
                int len = bmd->iovecs[i].bv_len;
  
@@@ -1257,7 -1533,7 +1534,7 @@@ struct bio *bio_copy_kern(struct reques
        if (!reading) {
                void *p = data;
  
-               bio_for_each_segment(bvec, bio, i) {
+               bio_for_each_segment_all(bvec, bio, i) {
                        char *addr = page_address(bvec->bv_page);
  
                        memcpy(addr, p, bvec->bv_len);
@@@ -1302,11 -1578,11 +1579,11 @@@ EXPORT_SYMBOL(bio_copy_kern)
   */
  void bio_set_pages_dirty(struct bio *bio)
  {
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        int i;
  
-       for (i = 0; i < bio->bi_vcnt; i++) {
-               struct page *page = bvec[i].bv_page;
+       bio_for_each_segment_all(bvec, bio, i) {
+               struct page *page = bvec->bv_page;
  
                if (page && !PageCompound(page))
                        set_page_dirty_lock(page);
  
  static void bio_release_pages(struct bio *bio)
  {
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        int i;
  
-       for (i = 0; i < bio->bi_vcnt; i++) {
-               struct page *page = bvec[i].bv_page;
+       bio_for_each_segment_all(bvec, bio, i) {
+               struct page *page = bvec->bv_page;
  
                if (page)
                        put_page(page);
@@@ -1368,16 -1644,16 +1645,16 @@@ static void bio_dirty_fn(struct work_st
  
  void bio_check_pages_dirty(struct bio *bio)
  {
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        int nr_clean_pages = 0;
        int i;
  
-       for (i = 0; i < bio->bi_vcnt; i++) {
-               struct page *page = bvec[i].bv_page;
+       bio_for_each_segment_all(bvec, bio, i) {
+               struct page *page = bvec->bv_page;
  
                if (PageDirty(page) || PageCompound(page)) {
                        page_cache_release(page);
-                       bvec[i].bv_page = NULL;
+                       bvec->bv_page = NULL;
                } else {
                        nr_clean_pages++;
                }
@@@ -1429,6 -1705,8 +1706,6 @@@ void bio_endio(struct bio *bio, int err
        else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                error = -EIO;
  
 -      trace_block_bio_complete(bio, error);
 -
        if (bio->bi_end_io)
                bio->bi_end_io(bio, error);
  }
@@@ -1478,8 -1756,7 +1755,7 @@@ struct bio_pair *bio_split(struct bio *
        trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
                                bi->bi_sector + first_sectors);
  
-       BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
-       BUG_ON(bi->bi_idx != 0);
+       BUG_ON(bio_segments(bi) > 1);
        atomic_set(&bp->cnt, 3);
        bp->error = 0;
        bp->bio1 = *bi;
        bp->bio1.bi_size = first_sectors << 9;
  
        if (bi->bi_vcnt != 0) {
-               bp->bv1 = bi->bi_io_vec[0];
-               bp->bv2 = bi->bi_io_vec[0];
+               bp->bv1 = *bio_iovec(bi);
+               bp->bv2 = *bio_iovec(bi);
  
                if (bio_is_rw(bi)) {
                        bp->bv2.bv_offset += first_sectors << 9;
@@@ -1542,7 -1819,7 +1818,7 @@@ sector_t bio_sector_offset(struct bio *
        if (index >= bio->bi_idx)
                index = bio->bi_vcnt - 1;
  
-       __bio_for_each_segment(bv, bio, i, 0) {
+       bio_for_each_segment_all(bv, bio, i) {
                if (i == index) {
                        if (offset > bv->bv_offset)
                                sectors += (offset - bv->bv_offset) / sector_sz;
@@@ -1560,29 -1837,25 +1836,25 @@@ EXPORT_SYMBOL(bio_sector_offset)
   * create memory pools for biovec's in a bio_set.
   * use the global biovec slabs created for general use.
   */
- static int biovec_create_pools(struct bio_set *bs, int pool_entries)
+ mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
  {
        struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
  
-       bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
-       if (!bs->bvec_pool)
-               return -ENOMEM;
-       return 0;
- }
- static void biovec_free_pools(struct bio_set *bs)
- {
-       mempool_destroy(bs->bvec_pool);
+       return mempool_create_slab_pool(pool_entries, bp->slab);
  }
  
  void bioset_free(struct bio_set *bs)
  {
+       if (bs->rescue_workqueue)
+               destroy_workqueue(bs->rescue_workqueue);
        if (bs->bio_pool)
                mempool_destroy(bs->bio_pool);
  
+       if (bs->bvec_pool)
+               mempool_destroy(bs->bvec_pool);
        bioset_integrity_free(bs);
-       biovec_free_pools(bs);
        bio_put_slab(bs);
  
        kfree(bs);
@@@ -1613,6 -1886,10 +1885,10 @@@ struct bio_set *bioset_create(unsigned 
  
        bs->front_pad = front_pad;
  
+       spin_lock_init(&bs->rescue_lock);
+       bio_list_init(&bs->rescue_list);
+       INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
        bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
        if (!bs->bio_slab) {
                kfree(bs);
        if (!bs->bio_pool)
                goto bad;
  
-       if (!biovec_create_pools(bs, pool_size))
-               return bs;
+       bs->bvec_pool = biovec_create_pool(bs, pool_size);
+       if (!bs->bvec_pool)
+               goto bad;
+       bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
+       if (!bs->rescue_workqueue)
+               goto bad;
  
+       return bs;
  bad:
        bioset_free(bs);
        return NULL;
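
The fs/bio.c helpers read most naturally from the caller's side. A minimal, hypothetical user of submit_bio_wait() with the 3.10-era bi_bdev/bi_sector fields (the function name and error handling are illustrative only):

	static int read_page_sync(struct block_device *bdev, sector_t sector,
				  struct page *page)
	{
		struct bio *bio;
		int ret;

		bio = bio_alloc(GFP_KERNEL, 1);		/* room for one bvec */
		if (!bio)
			return -ENOMEM;

		bio->bi_bdev = bdev;
		bio->bi_sector = sector;
		bio_add_page(bio, page, PAGE_SIZE, 0);

		ret = submit_bio_wait(READ, bio);	/* sleeps until bi_end_io fires */
		bio_put(bio);
		return ret;	/* 0, or the error passed to bio_endio() */
	}
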
diff --combined fs/block_dev.c
index d9871c1f08949082ab5175f09c50668f961b9512,dc7f9836fb5e69630dfd93b64202514476a033bd..2091db8cdd783a2287ce9165a7223cfe59cf5a2b
@@@ -27,7 -27,6 +27,7 @@@
  #include <linux/namei.h>
  #include <linux/log2.h>
  #include <linux/cleancache.h>
 +#include <linux/aio.h>
  #include <asm/uaccess.h>
  #include "internal.h"
  
@@@ -552,7 -551,6 +552,7 @@@ struct block_device *bdgrab(struct bloc
        ihold(bdev->bd_inode);
        return bdev;
  }
 +EXPORT_SYMBOL(bdgrab);
  
  long nr_blockdev_pages(void)
  {
@@@ -618,9 -616,11 +618,9 @@@ void bd_forget(struct inode *inode
        struct block_device *bdev = NULL;
  
        spin_lock(&bdev_lock);
 -      if (inode->i_bdev) {
 -              if (!sb_is_blkdev_sb(inode->i_sb))
 -                      bdev = inode->i_bdev;
 -              __bd_forget(inode);
 -      }
 +      if (!sb_is_blkdev_sb(inode->i_sb))
 +              bdev = inode->i_bdev;
 +      __bd_forget(inode);
        spin_unlock(&bdev_lock);
  
        if (bdev)
@@@ -1046,7 -1046,7 +1046,7 @@@ void bd_set_size(struct block_device *b
  }
  EXPORT_SYMBOL(bd_set_size);
  
 -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
  
  /*
   * bd_mutex locking:
@@@ -1401,8 -1401,9 +1401,8 @@@ static int blkdev_open(struct inode * i
        return blkdev_get(bdev, filp->f_mode, filp);
  }
  
 -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
  {
 -      int ret = 0;
        struct gendisk *disk = bdev->bd_disk;
        struct block_device *victim = NULL;
  
        }
        if (bdev->bd_contains == bdev) {
                if (disk->fops->release)
 -                      ret = disk->fops->release(disk, mode);
 +                      disk->fops->release(disk, mode);
        }
        if (!bdev->bd_openers) {
                struct module *owner = disk->fops->owner;
        bdput(bdev);
        if (victim)
                __blkdev_put(victim, mode, 1);
 -      return ret;
  }
  
 -int blkdev_put(struct block_device *bdev, fmode_t mode)
 +void blkdev_put(struct block_device *bdev, fmode_t mode)
  {
        mutex_lock(&bdev->bd_mutex);
  
  
        mutex_unlock(&bdev->bd_mutex);
  
 -      return __blkdev_put(bdev, mode, 0);
 +      __blkdev_put(bdev, mode, 0);
  }
  EXPORT_SYMBOL(blkdev_put);
  
  static int blkdev_close(struct inode * inode, struct file * filp)
  {
        struct block_device *bdev = I_BDEV(filp->f_mapping->host);
 -
 -      return blkdev_put(bdev, filp->f_mode);
 +      blkdev_put(bdev, filp->f_mode);
 +      return 0;
  }
  
  static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
@@@ -1556,7 -1558,7 +1556,7 @@@ static ssize_t blkdev_aio_read(struct k
                return 0;
  
        size -= pos;
-       if (size < INT_MAX)
+       if (size < iocb->ki_left)
                nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
        return generic_file_aio_read(iocb, iov, nr_segs, pos);
  }
diff --combined fs/buffer.c
index bc1fe14aaa3e4583aa351298fc9faa4f42d2d6a9,ecd3792ae0e9f886429cace691d68d8284662741..d2a4d1bb2d57aec3999e494d52c4f765a0ae48e8
@@@ -865,6 -865,8 +865,6 @@@ try_again
  
                /* Link the buffer to its page */
                set_bh_page(bh, page, offset);
 -
 -              init_buffer(bh, NULL, NULL);
        }
        return head;
  /*
@@@ -2947,7 -2949,7 +2947,7 @@@ static void guard_bh_eod(int rw, struc
        }
  }
  
 -int submit_bh(int rw, struct buffer_head * bh)
 +int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
  {
        struct bio *bio;
        int ret = 0;
        bio->bi_io_vec[0].bv_offset = bh_offset(bh);
  
        bio->bi_vcnt = 1;
-       bio->bi_idx = 0;
        bio->bi_size = bh->b_size;
  
        bio->bi_end_io = end_bio_bh_io_sync;
        bio->bi_private = bh;
 +      bio->bi_flags |= bio_flags;
  
        /* Take care of bh's that straddle the end of the device */
        guard_bh_eod(rw, bio, bh);
  
 +      if (buffer_meta(bh))
 +              rw |= REQ_META;
 +      if (buffer_prio(bh))
 +              rw |= REQ_PRIO;
 +
        bio_get(bio);
        submit_bio(rw, bio);
  
        bio_put(bio);
        return ret;
  }
 +EXPORT_SYMBOL_GPL(_submit_bh);
 +
 +int submit_bh(int rw, struct buffer_head *bh)
 +{
 +      return _submit_bh(rw, bh, 0);
 +}
  EXPORT_SYMBOL(submit_bh);
  
  /**
diff --combined fs/direct-io.c
index 51d16e067d6815909907b6ccb33bba7d7ec72ab5,38484b08a39ac93ef024a72b9b2833b84e6c44f2..7ab90f5081eebc4ab8b0de88bef8d0b6310ed113
@@@ -37,7 -37,6 +37,7 @@@
  #include <linux/uio.h>
  #include <linux/atomic.h>
  #include <linux/prefetch.h>
 +#include <linux/aio.h>
  
  /*
   * How many user pages to map in one call to get_user_pages().  This determines
@@@ -442,8 -441,8 +442,8 @@@ static struct bio *dio_await_one(struc
  static int dio_bio_complete(struct dio *dio, struct bio *bio)
  {
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-       struct bio_vec *bvec = bio->bi_io_vec;
-       int page_no;
+       struct bio_vec *bvec;
+       unsigned i;
  
        if (!uptodate)
                dio->io_error = -EIO;
        if (dio->is_async && dio->rw == READ) {
                bio_check_pages_dirty(bio);     /* transfers ownership */
        } else {
-               for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
-                       struct page *page = bvec[page_no].bv_page;
+               bio_for_each_segment_all(bvec, bio, i) {
+                       struct page *page = bvec->bv_page;
  
                        if (dio->rw == READ && !PageCompound(page))
                                set_page_dirty_lock(page);
@@@ -673,6 -672,12 +673,6 @@@ static inline int dio_send_cur_page(str
                if (sdio->final_block_in_bio != sdio->cur_page_block ||
                    cur_offset != bio_next_offset)
                        dio_bio_submit(dio, sdio);
 -              /*
 -               * Submit now if the underlying fs is about to perform a
 -               * metadata read
 -               */
 -              else if (sdio->boundary)
 -                      dio_bio_submit(dio, sdio);
        }
  
        if (sdio->bio == NULL) {
@@@ -732,6 -737,16 +732,6 @@@ submit_page_section(struct dio *dio, st
            sdio->cur_page_block +
            (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
                sdio->cur_page_len += len;
 -
 -              /*
 -               * If sdio->boundary then we want to schedule the IO now to
 -               * avoid metadata seeks.
 -               */
 -              if (sdio->boundary) {
 -                      ret = dio_send_cur_page(dio, sdio, map_bh);
 -                      page_cache_release(sdio->cur_page);
 -                      sdio->cur_page = NULL;
 -              }
                goto out;
        }
  
                page_cache_release(sdio->cur_page);
                sdio->cur_page = NULL;
                if (ret)
 -                      goto out;
 +                      return ret;
        }
  
        page_cache_get(page);           /* It is in dio */
        sdio->cur_page_block = blocknr;
        sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
  out:
 +      /*
 +       * If sdio->boundary then we want to schedule the IO now to
 +       * avoid metadata seeks.
 +       */
 +      if (sdio->boundary) {
 +              ret = dio_send_cur_page(dio, sdio, map_bh);
 +              dio_bio_submit(dio, sdio);
 +              page_cache_release(sdio->cur_page);
 +              sdio->cur_page = NULL;
 +      }
        return ret;
  }
  
@@@ -964,8 -969,7 +964,8 @@@ do_holes
                        this_chunk_bytes = this_chunk_blocks << blkbits;
                        BUG_ON(this_chunk_bytes == 0);
  
 -                      sdio->boundary = buffer_boundary(map_bh);
 +                      if (this_chunk_blocks == sdio->blocks_available)
 +                              sdio->boundary = buffer_boundary(map_bh);
                        ret = submit_page_section(dio, sdio, page,
                                                  offset_in_page,
                                                  this_chunk_bytes,
diff --combined fs/fs-writeback.c
index 798d4458a4d3a5798a7b04858d82a5f031849bab,8067d3719e94eb194a8ce5e0a50fd49b7705384a..3be57189efd5b3a8005321f02e40971af9429cf6
@@@ -22,7 -22,6 +22,6 @@@
  #include <linux/mm.h>
  #include <linux/pagemap.h>
  #include <linux/kthread.h>
- #include <linux/freezer.h>
  #include <linux/writeback.h>
  #include <linux/blkdev.h>
  #include <linux/backing-dev.h>
@@@ -88,20 -87,6 +87,6 @@@ static inline struct inode *wb_inode(st
  #define CREATE_TRACE_POINTS
  #include <trace/events/writeback.h>
  
- /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
- static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
- {
-       if (bdi->wb.task) {
-               wake_up_process(bdi->wb.task);
-       } else {
-               /*
-                * The bdi thread isn't there, wake up the forker thread which
-                * will create and run it.
-                */
-               wake_up_process(default_backing_dev_info.wb.task);
-       }
- }
  static void bdi_queue_work(struct backing_dev_info *bdi,
                           struct wb_writeback_work *work)
  {
  
        spin_lock_bh(&bdi->wb_lock);
        list_add_tail(&work->list, &bdi->work_list);
-       if (!bdi->wb.task)
-               trace_writeback_nothread(bdi, work);
-       bdi_wakeup_flusher(bdi);
        spin_unlock_bh(&bdi->wb_lock);
+       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
  }
  
  static void
@@@ -127,10 -111,8 +111,8 @@@ __bdi_start_writeback(struct backing_de
         */
        work = kzalloc(sizeof(*work), GFP_ATOMIC);
        if (!work) {
-               if (bdi->wb.task) {
-                       trace_writeback_nowork(bdi);
-                       wake_up_process(bdi->wb.task);
-               }
+               trace_writeback_nowork(bdi);
+               mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
                return;
        }
  
@@@ -177,9 -159,7 +159,7 @@@ void bdi_start_background_writeback(str
         * writeback as soon as there is no other work to do.
         */
        trace_writeback_wake_background(bdi);
-       spin_lock_bh(&bdi->wb_lock);
-       bdi_wakeup_flusher(bdi);
-       spin_unlock_bh(&bdi->wb_lock);
+       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
  }
  
  /*
@@@ -1020,67 -1000,48 +1000,49 @@@ long wb_do_writeback(struct bdi_writeba
  
  /*
   * Handle writeback of dirty data for the device backed by this bdi. Also
-  * wakes up periodically and does kupdated style flushing.
+  * reschedules periodically and does kupdated style flushing.
   */
- int bdi_writeback_thread(void *data)
+ void bdi_writeback_workfn(struct work_struct *work)
  {
-       struct bdi_writeback *wb = data;
+       struct bdi_writeback *wb = container_of(to_delayed_work(work),
+                                               struct bdi_writeback, dwork);
        struct backing_dev_info *bdi = wb->bdi;
        long pages_written;
  
 +      set_worker_desc("flush-%s", dev_name(bdi->dev));
        current->flags |= PF_SWAPWRITE;
-       set_freezable();
-       wb->last_active = jiffies;
-       /*
-        * Our parent may run at a different priority, just set us to normal
-        */
-       set_user_nice(current, 0);
-       trace_writeback_thread_start(bdi);
  
-       while (!kthread_freezable_should_stop(NULL)) {
+       if (likely(!current_is_workqueue_rescuer() ||
+                  list_empty(&bdi->bdi_list))) {
                /*
-                * Remove own delayed wake-up timer, since we are already awake
-                * and we'll take care of the periodic write-back.
+                * The normal path.  Keep writing back @bdi until its
+                * work_list is empty.  Note that this path is also taken
+                * if @bdi is shutting down even when we're running off the
+                * rescuer as work_list needs to be drained.
                 */
-               del_timer(&wb->wakeup_timer);
-               pages_written = wb_do_writeback(wb, 0);
+               do {
+                       pages_written = wb_do_writeback(wb, 0);
+                       trace_writeback_pages_written(pages_written);
+               } while (!list_empty(&bdi->work_list));
+       } else {
+               /*
+                * bdi_wq can't get enough workers and we're running off
+                * the emergency worker.  Don't hog it.  Hopefully, 1024 is
+                * enough for efficient IO.
+                */
+               pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+                                                   WB_REASON_FORKER_THREAD);
                trace_writeback_pages_written(pages_written);
-               if (pages_written)
-                       wb->last_active = jiffies;
-               set_current_state(TASK_INTERRUPTIBLE);
-               if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
-                       __set_current_state(TASK_RUNNING);
-                       continue;
-               }
-               if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-                       schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-               else {
-                       /*
-                        * We have nothing to do, so can go sleep without any
-                        * timeout and save power. When a work is queued or
-                        * something is made dirty - we will be woken up.
-                        */
-                       schedule();
-               }
        }
  
-       /* Flush any work that raced with us exiting */
-       if (!list_empty(&bdi->work_list))
-               wb_do_writeback(wb, 1);
+       if (!list_empty(&bdi->work_list) ||
+           (wb_has_dirty_io(wb) && dirty_writeback_interval))
+               queue_delayed_work(bdi_wq, &wb->dwork,
+                       msecs_to_jiffies(dirty_writeback_interval * 10));
  
-       trace_writeback_thread_stop(bdi);
-       return 0;
+       current->flags &= ~PF_SWAPWRITE;
  }
  
  /*
   * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
   * the whole world.
diff --combined fs/gfs2/lops.c
index 7318abf9d0fb863165857fb8adc897ad0f5e7c9e,5c37ef982390eb361684ba796776edbb64938c97..c5fa758fd8446e1938036be9cdedaf75e2bc552b
@@@ -53,8 -53,8 +53,8 @@@ void gfs2_pin(struct gfs2_sbd *sdp, str
         * to in-place disk block, remove it from the AIL.
         */
        spin_lock(&sdp->sd_ail_lock);
 -      if (bd->bd_ail)
 -              list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
 +      if (bd->bd_tr)
 +              list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list);
        spin_unlock(&sdp->sd_ail_lock);
        get_bh(bh);
        atomic_inc(&sdp->sd_log_pinned);
@@@ -94,7 -94,7 +94,7 @@@ static void maybe_release_space(struct 
   */
  
  static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 -                     struct gfs2_ail *ai)
 +                     struct gfs2_trans *tr)
  {
        struct gfs2_bufdata *bd = bh->b_private;
  
                maybe_release_space(bd);
  
        spin_lock(&sdp->sd_ail_lock);
 -      if (bd->bd_ail) {
 +      if (bd->bd_tr) {
                list_del(&bd->bd_ail_st_list);
                brelse(bh);
        } else {
                list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
                atomic_inc(&gl->gl_ail_count);
        }
 -      bd->bd_ail = ai;
 -      list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
 +      bd->bd_tr = tr;
 +      list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list);
        spin_unlock(&sdp->sd_ail_lock);
  
        clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
@@@ -300,7 -300,7 +300,7 @@@ static struct bio *gfs2_log_get_bio(str
        u64 nblk;
  
        if (bio) {
-               nblk = bio->bi_sector + bio_sectors(bio);
+               nblk = bio_end_sector(bio);
                nblk >>= sdp->sd_fsb2bb_shift;
                if (blkno == nblk)
                        return bio;
@@@ -480,22 -480,17 +480,22 @@@ static void buf_lo_before_commit(struc
                           &sdp->sd_log_le_buf, 0);
  }
  
 -static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 +static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
  {
        struct list_head *head = &sdp->sd_log_le_buf;
        struct gfs2_bufdata *bd;
  
 +      if (tr == NULL) {
 +              gfs2_assert(sdp, list_empty(head));
 +              return;
 +      }
 +
        while (!list_empty(head)) {
                bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
                list_del_init(&bd->bd_list);
                sdp->sd_log_num_buf--;
  
 -              gfs2_unpin(sdp, bd->bd_bh, ai);
 +              gfs2_unpin(sdp, bd->bd_bh, tr);
        }
        gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
  }
@@@ -618,7 -613,7 +618,7 @@@ static void revoke_lo_before_commit(str
        gfs2_log_write_page(sdp, page);
  }
  
 -static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
  {
        struct list_head *head = &sdp->sd_log_le_revoke;
        struct gfs2_bufdata *bd;
@@@ -796,21 -791,16 +796,21 @@@ static void databuf_lo_after_scan(struc
                jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
  }
  
 -static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
  {
        struct list_head *head = &sdp->sd_log_le_databuf;
        struct gfs2_bufdata *bd;
  
 +      if (tr == NULL) {
 +              gfs2_assert(sdp, list_empty(head));
 +              return;
 +      }
 +
        while (!list_empty(head)) {
                bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
                list_del_init(&bd->bd_list);
                sdp->sd_log_num_databuf--;
 -              gfs2_unpin(sdp, bd->bd_bh, ai);
 +              gfs2_unpin(sdp, bd->bd_bh, tr);
        }
        gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
  }
diff --combined fs/jfs/jfs_logmgr.c
index cbe48ea9318eea1ce44cae4bf3190911edf6c4c4,8ae5e350da430132ca37c94ff8e22d6053b76877..c57499dca89c5a3910bcefc5af951179aa693f24
@@@ -1058,8 -1058,7 +1058,8 @@@ static int lmLogSync(struct jfs_log * l
   */
  void jfs_syncpt(struct jfs_log *log, int hard_sync)
  {     LOG_LOCK(log);
 -      lmLogSync(log, hard_sync);
 +      if (!test_bit(log_QUIESCE, &log->flag))
 +              lmLogSync(log, hard_sync);
        LOG_UNLOCK(log);
  }
  
@@@ -2005,7 -2004,6 +2005,6 @@@ static int lbmRead(struct jfs_log * log
        bio->bi_io_vec[0].bv_offset = bp->l_offset;
  
        bio->bi_vcnt = 1;
-       bio->bi_idx = 0;
        bio->bi_size = LOGPSIZE;
  
        bio->bi_end_io = lbmIODone;
@@@ -2146,7 -2144,6 +2145,6 @@@ static void lbmStartIO(struct lbuf * bp
        bio->bi_io_vec[0].bv_offset = bp->l_offset;
  
        bio->bi_vcnt = 1;
-       bio->bi_idx = 0;
        bio->bi_size = LOGPSIZE;
  
        bio->bi_end_io = lbmIODone;
index 22990cf4439d2e8b91fedc0be9d254985f7ab64a,e8de67053cd4b15264c599644e1dc012273604fd..fa1abeb45b7602a4f0c1a4098f05f63d7a075281
@@@ -111,13 -111,13 +111,14 @@@ struct bio 
  #define BIO_FS_INTEGRITY 9    /* fs owns integrity data, not block layer */
  #define BIO_QUIET     10      /* Make BIO Quiet */
  #define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */
 +#define BIO_SNAP_STABLE       12      /* bio data must be snapshotted during write */
  
  /*
   * Flags starting here get preserved by bio_reset() - this includes
   * BIO_POOL_IDX()
   */
 -#define BIO_RESET_BITS        12
 -#define BIO_OWNS_VEC  12      /* bio_free() should free bvec */
 +#define BIO_RESET_BITS        13
++#define BIO_OWNS_VEC  13      /* bio_free() should free bvec */
  
  #define bio_flagged(bio, flag)        ((bio)->bi_flags & (1 << (flag)))
  
@@@ -176,6 -176,7 +177,7 @@@ enum rq_flag_bits 
        __REQ_IO_STAT,          /* account I/O stat */
        __REQ_MIXED_MERGE,      /* merge of different types, fail separately */
        __REQ_KERNEL,           /* direct IO to kernel pages */
+       __REQ_PM,               /* runtime pm request */
        __REQ_NR_BITS,          /* stops here */
  };
  
         REQ_SECURE)
  #define REQ_CLONE_MASK                REQ_COMMON_MASK
  
+ #define BIO_NO_ADVANCE_ITER_MASK      (REQ_DISCARD|REQ_WRITE_SAME)
  /* This mask is used for both bio and request merge checking */
  #define REQ_NOMERGE_FLAGS \
        (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
  #define REQ_MIXED_MERGE               (1 << __REQ_MIXED_MERGE)
  #define REQ_SECURE            (1 << __REQ_SECURE)
  #define REQ_KERNEL            (1 << __REQ_KERNEL)
+ #define REQ_PM                        (1 << __REQ_PM)
  
  #endif /* __LINUX_BLK_TYPES_H */
diff --combined include/linux/blkdev.h
index e38cfe77f7f017ddc9347d66f224b986b86abf45,6189bf26b53d6c5eeb7d153fbc59aeeb51392a2e..2fdb4a451b49bd626d9415b231c76b7ac927cf69
@@@ -361,6 -361,12 +361,12 @@@ struct request_queue 
         */
        struct kobject kobj;
  
+ #ifdef CONFIG_PM_RUNTIME
+       struct device           *dev;
+       int                     rpm_status;
+       unsigned int            nr_pending;
+ #endif
        /*
         * queue settings
         */
@@@ -838,7 -844,7 +844,7 @@@ static inline unsigned int blk_queue_ge
                                                     unsigned int cmd_flags)
  {
        if (unlikely(cmd_flags & REQ_DISCARD))
-               return q->limits.max_discard_sectors;
+               return min(q->limits.max_discard_sectors, UINT_MAX >> 9);
  
        if (unlikely(cmd_flags & REQ_WRITE_SAME))
                return q->limits.max_write_same_sectors;
@@@ -960,6 -966,27 +966,27 @@@ struct request_queue *blk_alloc_queue(g
  struct request_queue *blk_alloc_queue_node(gfp_t, int);
  extern void blk_put_queue(struct request_queue *);
  
+ /*
+  * block layer runtime pm functions
+  */
+ #ifdef CONFIG_PM_RUNTIME
+ extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev);
+ extern int blk_pre_runtime_suspend(struct request_queue *q);
+ extern void blk_post_runtime_suspend(struct request_queue *q, int err);
+ extern void blk_pre_runtime_resume(struct request_queue *q);
+ extern void blk_post_runtime_resume(struct request_queue *q, int err);
+ #else
+ static inline void blk_pm_runtime_init(struct request_queue *q,
+       struct device *dev) {}
+ static inline int blk_pre_runtime_suspend(struct request_queue *q)
+ {
+       return -ENOSYS;
+ }
+ static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {}
+ static inline void blk_pre_runtime_resume(struct request_queue *q) {}
+ static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
+ #endif
  /*
   * blk_plug permits building a queue of related requests by holding the I/O
   * fragments for a short period. This allows merging of sequential requests
@@@ -1484,7 -1511,7 +1511,7 @@@ static inline bool blk_integrity_is_ini
  
  struct block_device_operations {
        int (*open) (struct block_device *, fmode_t);
 -      int (*release) (struct gendisk *, fmode_t);
 +      void (*release) (struct gendisk *, fmode_t);
        int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*direct_access) (struct block_device *, sector_t,
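
The runtime PM hooks above are intended to be driven from a low-level driver's dev_pm_ops callbacks; the actual SCSI wiring sits in James' tree on top of this. A rough sketch of the call pattern, with the driver names and dev_get_drvdata() plumbing invented for illustration:

	/* at probe: blk_pm_runtime_init(q, dev) ties the queue to the device */

	static int mydrv_runtime_suspend(struct device *dev)
	{
		struct request_queue *q = dev_get_drvdata(dev);
		int err = blk_pre_runtime_suspend(q);	/* refuses if requests are pending */

		if (err)
			return err;
		/* ... put the hardware to sleep ... */
		blk_post_runtime_suspend(q, 0);
		return 0;
	}

	static int mydrv_runtime_resume(struct device *dev)
	{
		struct request_queue *q = dev_get_drvdata(dev);

		blk_pre_runtime_resume(q);
		/* ... wake the hardware ... */
		blk_post_runtime_resume(q, 0);	/* lets queued requests run again */
		return 0;
	}
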
index 9c1467357b03c616967cd193efab6506e3e5adff,5a28843725dfa96a70468b09cdc960f8abaf8697..60ae7c3db912de7e068452de1a1c1978cad0a662
@@@ -244,7 -244,7 +244,7 @@@ TRACE_EVENT(block_bio_bounce
                __entry->dev            = bio->bi_bdev ?
                                          bio->bi_bdev->bd_dev : 0;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),
  
  /**
   * block_bio_complete - completed all work on the block operation
 + * @q: queue holding the block operation
   * @bio: block operation completed
   * @error: io error value
   *
   */
  TRACE_EVENT(block_bio_complete,
  
 -      TP_PROTO(struct bio *bio, int error),
 +      TP_PROTO(struct request_queue *q, struct bio *bio, int error),
  
 -      TP_ARGS(bio, error),
 +      TP_ARGS(q, bio, error),
  
        TP_STRUCT__entry(
                __field( dev_t,         dev             )
        ),
  
        TP_fast_assign(
 -              __entry->dev            = bio->bi_bdev ?
 -                                        bio->bi_bdev->bd_dev : 0;
 +              __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                __entry->error          = error;
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
        ),
@@@ -309,7 -309,7 +309,7 @@@ DECLARE_EVENT_CLASS(block_bio_merge
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),
@@@ -376,7 -376,7 +376,7 @@@ TRACE_EVENT(block_bio_queue
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),
@@@ -404,7 -404,7 +404,7 @@@ DECLARE_EVENT_CLASS(block_get_rq
        TP_fast_assign(
                __entry->dev            = bio ? bio->bi_bdev->bd_dev : 0;
                __entry->sector         = bio ? bio->bi_sector : 0;
-               __entry->nr_sector      = bio ? bio->bi_size >> 9 : 0;
+               __entry->nr_sector      = bio ? bio_sectors(bio) : 0;
                blk_fill_rwbs(__entry->rwbs,
                              bio ? bio->bi_rw : 0, __entry->nr_sector);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
@@@ -580,7 -580,7 +580,7 @@@ TRACE_EVENT(block_bio_remap
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                __entry->old_dev        = dev;
                __entry->old_sector     = from;
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
diff --combined kernel/relay.c
index eef0d113b79ed22734b980b076e05c3aee73a64c,a0d200012adb6dc9c40f6587992c9d756b79b325..b91488ba2e5a7edb6e3cbdc5876adaf9ae7f2791
@@@ -234,7 -234,6 +234,6 @@@ static void relay_destroy_buf(struct rc
  static void relay_remove_buf(struct kref *kref)
  {
        struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
-       buf->chan->cb->remove_buf_file(buf->dentry);
        relay_destroy_buf(buf);
  }
  
@@@ -484,6 -483,7 +483,7 @@@ static void relay_close_buf(struct rcha
  {
        buf->finalized = 1;
        del_timer_sync(&buf->timer);
+       buf->chan->cb->remove_buf_file(buf->dentry);
        kref_put(&buf->kref, relay_remove_buf);
  }
  
@@@ -588,7 -588,7 +588,7 @@@ struct rchan *relay_open(const char *ba
        chan->version = RELAYFS_CHANNEL_VERSION;
        chan->n_subbufs = n_subbufs;
        chan->subbuf_size = subbuf_size;
 -      chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
 +      chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs);
        chan->parent = parent;
        chan->private_data = private_data;
        if (base_filename) {
@@@ -1099,7 -1099,8 +1099,7 @@@ static size_t relay_file_read_end_pos(s
  static int subbuf_read_actor(size_t read_start,
                             struct rchan_buf *buf,
                             size_t avail,
 -                           read_descriptor_t *desc,
 -                           read_actor_t actor)
 +                           read_descriptor_t *desc)
  {
        void *from;
        int ret = 0;
  typedef int (*subbuf_actor_t) (size_t read_start,
                               struct rchan_buf *buf,
                               size_t avail,
 -                             read_descriptor_t *desc,
 -                             read_actor_t actor);
 +                             read_descriptor_t *desc);
  
  /*
   *    relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
   */
  static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
                                        subbuf_actor_t subbuf_actor,
 -                                      read_actor_t actor,
                                        read_descriptor_t *desc)
  {
        struct rchan_buf *buf = filp->private_data;
                        break;
  
                avail = min(desc->count, avail);
 -              ret = subbuf_actor(read_start, buf, avail, desc, actor);
 +              ret = subbuf_actor(read_start, buf, avail, desc);
                if (desc->error < 0)
                        break;
  
@@@ -1171,7 -1174,8 +1171,7 @@@ static ssize_t relay_file_read(struct f
        desc.count = count;
        desc.arg.buf = buffer;
        desc.error = 0;
 -      return relay_file_read_subbufs(filp, ppos, subbuf_read_actor,
 -                                     NULL, &desc);
 +      return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
  }
  
  static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
diff --combined mm/bounce.c
index a5c2ec3589cb94934e8654821b0876d28e1f5510,f5326b24d65d88c1051ee9547b0a66373372940c..c9f0a4339a7dafc2ba7295e49ad8fcdda8fa13de
@@@ -101,7 -101,7 +101,7 @@@ static void copy_to_high_bio_irq(struc
        struct bio_vec *tovec, *fromvec;
        int i;
  
-       __bio_for_each_segment(tovec, to, i, 0) {
+       bio_for_each_segment(tovec, to, i) {
                fromvec = from->bi_io_vec + i;
  
                /*
@@@ -134,7 -134,7 +134,7 @@@ static void bounce_end_io(struct bio *b
        /*
         * free up bounce indirect pages used
         */
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                org_vec = bio_orig->bi_io_vec + i;
                if (bvec->bv_page == org_vec->bv_page)
                        continue;
@@@ -181,13 -181,32 +181,13 @@@ static void bounce_end_io_read_isa(stru
  #ifdef CONFIG_NEED_BOUNCE_POOL
  static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
  {
 -      struct page *page;
 -      struct backing_dev_info *bdi;
 -      struct address_space *mapping;
 -      struct bio_vec *from;
 -      int i;
 -
        if (bio_data_dir(bio) != WRITE)
                return 0;
  
        if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
                return 0;
  
 -      /*
 -       * Based on the first page that has a valid mapping, decide whether or
 -       * not we have to employ bounce buffering to guarantee stable pages.
 -       */
 -      bio_for_each_segment(from, bio, i) {
 -              page = from->bv_page;
 -              mapping = page_mapping(page);
 -              if (!mapping)
 -                      continue;
 -              bdi = mapping->backing_dev_info;
 -              return mapping->host->i_sb->s_flags & MS_SNAP_STABLE;
 -      }
 -
 -      return 0;
 +      return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
  }
  #else
  static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
  static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
                               mempool_t *pool, int force)
  {
-       struct page *page;
-       struct bio *bio = NULL;
-       int i, rw = bio_data_dir(*bio_orig);
+       struct bio *bio;
+       int rw = bio_data_dir(*bio_orig);
        struct bio_vec *to, *from;
+       unsigned i;
  
-       bio_for_each_segment(from, *bio_orig, i) {
-               page = from->bv_page;
+       bio_for_each_segment(from, *bio_orig, i)
+               if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
+                       goto bounce;
  
-               /*
-                * is destination page below bounce pfn?
-                */
-               if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
-                       continue;
+       return;
+ bounce:
+       bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
  
-               /*
-                * irk, bounce it
-                */
-               if (!bio) {
-                       unsigned int cnt = (*bio_orig)->bi_vcnt;
+       bio_for_each_segment_all(to, bio, i) {
+               struct page *page = to->bv_page;
  
-                       bio = bio_alloc(GFP_NOIO, cnt);
-                       memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec));
-               }
-                       
-               to = bio->bi_io_vec + i;
+               if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
+                       continue;
  
-               to->bv_page = mempool_alloc(pool, q->bounce_gfp);
-               to->bv_len = from->bv_len;
-               to->bv_offset = from->bv_offset;
                inc_zone_page_state(to->bv_page, NR_BOUNCE);
+               to->bv_page = mempool_alloc(pool, q->bounce_gfp);
  
                if (rw == WRITE) {
                        char *vto, *vfrom;
  
-                       flush_dcache_page(from->bv_page);
+                       flush_dcache_page(page);
                        vto = page_address(to->bv_page) + to->bv_offset;
-                       vfrom = kmap(from->bv_page) + from->bv_offset;
+                       vfrom = kmap_atomic(page) + to->bv_offset;
                        memcpy(vto, vfrom, to->bv_len);
-                       kunmap(from->bv_page);
+                       kunmap_atomic(vfrom);
                }
        }
  
-       /*
-        * no pages bounced
-        */
-       if (!bio)
-               return;
        trace_block_bio_bounce(q, *bio_orig);
  
-       /*
-        * at least one page was bounced, fill in possible non-highmem
-        * pages
-        */
-       __bio_for_each_segment(from, *bio_orig, i, 0) {
-               to = bio_iovec_idx(bio, i);
-               if (!to->bv_page) {
-                       to->bv_page = from->bv_page;
-                       to->bv_len = from->bv_len;
-                       to->bv_offset = from->bv_offset;
-               }
-       }
-       bio->bi_bdev = (*bio_orig)->bi_bdev;
        bio->bi_flags |= (1 << BIO_BOUNCED);
-       bio->bi_sector = (*bio_orig)->bi_sector;
-       bio->bi_rw = (*bio_orig)->bi_rw;
-       bio->bi_vcnt = (*bio_orig)->bi_vcnt;
-       bio->bi_idx = (*bio_orig)->bi_idx;
-       bio->bi_size = (*bio_orig)->bi_size;
  
        if (pool == page_pool) {
                bio->bi_end_io = bounce_end_io_write;
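
For context, the two bvec iterators used above differ in scope. A minimal
sketch of the distinction (bounce_iter_sketch() and its parameters are
illustrative names; this assumes the index-based iterators this series
still uses):

static void bounce_iter_sketch(struct bio *bounce_clone, struct bio *orig)
{
        struct bio_vec *bvec;
        int i;

        /*
         * Every allocated bvec, from index 0: only valid for a bio this
         * code owns, such as the clone freed in bounce_end_io().
         */
        bio_for_each_segment_all(bvec, bounce_clone, i)
                (void)bvec;     /* release per-segment resources here */

        /*
         * Starts at bi_idx, i.e. only the segments not yet completed,
         * which is what copy_to_high_bio_irq() wants to walk.
         */
        bio_for_each_segment(bvec, orig, i)
                (void)bvec;     /* copy data back for pending segments */
}
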
diff --combined mm/page_io.c
index 06a8842a6ec612dbda8071f94d15423326a375f4,8d3c0c088105e2e6127eb1d27fcd2a88dd0193c3..a8a3ef45fed753b68ac1cc4a94c9260979a37879
@@@ -20,7 -20,6 +20,7 @@@
  #include <linux/buffer_head.h>
  #include <linux/writeback.h>
  #include <linux/frontswap.h>
 +#include <linux/aio.h>
  #include <asm/pgtable.h>
  
  static struct bio *get_swap_bio(gfp_t gfp_flags,
                bio->bi_io_vec[0].bv_len = PAGE_SIZE;
                bio->bi_io_vec[0].bv_offset = 0;
                bio->bi_vcnt = 1;
-               bio->bi_idx = 0;
                bio->bi_size = PAGE_SIZE;
                bio->bi_end_io = end_io;
        }
        return bio;
  }
  
 -static void end_swap_bio_write(struct bio *bio, int err)
 +void end_swap_bio_write(struct bio *bio, int err)
  {
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        struct page *page = bio->bi_io_vec[0].bv_page;
@@@ -186,7 -184,9 +185,7 @@@ bad_bmap
   */
  int swap_writepage(struct page *page, struct writeback_control *wbc)
  {
 -      struct bio *bio;
 -      int ret = 0, rw = WRITE;
 -      struct swap_info_struct *sis = page_swap_info(page);
 +      int ret = 0;
  
        if (try_to_free_swap(page)) {
                unlock_page(page);
                end_page_writeback(page);
                goto out;
        }
 +      ret = __swap_writepage(page, wbc, end_swap_bio_write);
 +out:
 +      return ret;
 +}
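
For context, splitting swap_writepage() around the __swap_writepage()
variant introduced just below lets a caller reuse the swap write plumbing
with its own completion handler. A minimal sketch (my_end_swap_write() and
my_writepage() are illustrative names, not part of this series):

static void my_end_swap_write(struct bio *bio, int err)
{
        /* backend-specific accounting would go here */
        end_swap_bio_write(bio, err);   /* then fall back to the default */
}

static int my_writepage(struct page *page, struct writeback_control *wbc)
{
        /*
         * Same duties as swap_writepage(), minus the early checks such
         * as try_to_free_swap() that it performs before handing off.
         */
        return __swap_writepage(page, wbc, my_end_swap_write);
}
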
 +
 +int __swap_writepage(struct page *page, struct writeback_control *wbc,
 +      void (*end_write_func)(struct bio *, int))
 +{
 +      struct bio *bio;
 +      int ret = 0, rw = WRITE;
 +      struct swap_info_struct *sis = page_swap_info(page);
  
        if (sis->flags & SWP_FILE) {
                struct kiocb kiocb;
                kiocb.ki_left = PAGE_SIZE;
                kiocb.ki_nbytes = PAGE_SIZE;
  
 +              set_page_writeback(page);
                unlock_page(page);
                ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
                                                &kiocb, &iov,
                if (ret == PAGE_SIZE) {
                        count_vm_event(PSWPOUT);
                        ret = 0;
 +              } else {
 +                      /*
 +                       * In the case of swap-over-nfs, this can be a
 +                       * temporary failure if the system has limited
 +                       * memory for allocating transmit buffers.
 +                       * Mark the page dirty and avoid
 +                       * rotate_reclaimable_page; rate-limit the
 +                       * messages, but do not flag PageError as in the
 +                       * normal direct-to-bio case, since the failure
 +                       * may be temporary.
 +                       */
 +                      set_page_dirty(page);
 +                      ClearPageReclaim(page);
 +                      pr_err_ratelimited("Write error on dio swapfile (%Lu)\n",
 +                              page_file_offset(page));
                }
 +              end_page_writeback(page);
                return ret;
        }
  
 -      bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
 +      bio = get_swap_bio(GFP_NOIO, page, end_write_func);
        if (bio == NULL) {
                set_page_dirty(page);
                unlock_page(page);