Merge branch 'for-3.10/core' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 8 May 2013 17:13:35 +0000 (10:13 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 8 May 2013 17:13:35 +0000 (10:13 -0700)
Pull block core updates from Jens Axboe:

 - The major bit is Kent's prep work for immutable bio vecs (a short
   sketch of the new bvec iterator follows the shortlog below).

 - Stable-candidate fix for a scheduling-while-atomic bug in the queue
   bypass operation.

 - Fix for the hang that occurs when merging discard bios pushes
   rq->datalen past its 32-bit unsigned limit.

 - Tejun's changes to convert the writeback thread pool to the generic
   workqueue mechanism.

 - Runtime PM framework; the SCSI patches exist on top of these in
   James' tree.

 - A few random fixes.

* 'for-3.10/core' of git://git.kernel.dk/linux-block: (40 commits)
  relay: move remove_buf_file inside relay_close_buf
  partitions/efi.c: replace useless kzalloc's by kmalloc's
  fs/block_dev.c: fix iov_shorten() criteria in blkdev_aio_read()
  block: fix max discard sectors limit
  blkcg: fix "scheduling while atomic" in blk_queue_bypass_start
  Documentation: cfq-iosched: update documentation help for cfq tunables
  writeback: expose the bdi_wq workqueue
  writeback: replace custom worker pool implementation with unbound workqueue
  writeback: remove unused bdi_pending_list
  aoe: Fix unitialized var usage
  bio-integrity: Add explicit field for owner of bip_buf
  block: Add an explicit bio flag for bios that own their bvec
  block: Add bio_alloc_pages()
  block: Convert some code to bio_for_each_segment_all()
  block: Add bio_for_each_segment_all()
  bounce: Refactor __blk_queue_bounce to not use bi_io_vec
  raid1: use bio_copy_data()
  pktcdvd: Use bio_reset() in disabled code to kill bi_idx usage
  pktcdvd: use bio_copy_data()
  block: Add bio_copy_data()
  ...

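A minimal sketch (not part of this merge) of the new per-segment iterator
added by "block: Add bio_for_each_segment_all()" above. Unlike
bio_for_each_segment(), which starts at bi_idx, bio_for_each_segment_all()
walks every bvec from index 0 and is therefore only safe on bios whose bvec
array the caller owns; the helper name zero_owned_bio is invented for
illustration.

#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/string.h>

/* Zero the payload of a bio whose bvecs we own. */
static void zero_owned_bio(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	bio_for_each_segment_all(bvec, bio, i) {
		void *kaddr = kmap_atomic(bvec->bv_page);

		memset(kaddr + bvec->bv_offset, 0, bvec->bv_len);
		kunmap_atomic(kaddr);
	}
}
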
25 files changed:
block/blk-core.c
drivers/block/aoe/aoecmd.c
drivers/block/floppy.c
drivers/block/pktcdvd.c
drivers/block/rbd.c
drivers/md/md.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/message/fusion/mptsas.c
drivers/s390/block/dcssblk.c
drivers/scsi/libsas/sas_expander.c
fs/bio.c
fs/block_dev.c
fs/buffer.c
fs/direct-io.c
fs/fs-writeback.c
fs/gfs2/lops.c
fs/jfs/jfs_logmgr.c
include/linux/blk_types.h
include/linux/blkdev.h
include/trace/events/block.h
kernel/relay.c
mm/bounce.c
mm/page_io.c

diff --combined block/blk-core.c
index 7c288358a745ad2312e93080201e341cf2b69207,f224d1793ee5e5975f427cc9259e587970f19884..33c33bc99ddd5546e6ba30ce267cb436d0328c51
@@@ -30,6 -30,7 +30,7 @@@
  #include <linux/list_sort.h>
  #include <linux/delay.h>
  #include <linux/ratelimit.h>
+ #include <linux/pm_runtime.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/block.h>
@@@ -39,7 -40,6 +40,7 @@@
  
  EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
  EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
  EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
  
  DEFINE_IDA(blk_queue_ida);
@@@ -159,20 -159,10 +160,10 @@@ static void req_bio_endio(struct reques
        else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                error = -EIO;
  
-       if (unlikely(nbytes > bio->bi_size)) {
-               printk(KERN_ERR "%s: want %u bytes done, %u left\n",
-                      __func__, nbytes, bio->bi_size);
-               nbytes = bio->bi_size;
-       }
        if (unlikely(rq->cmd_flags & REQ_QUIET))
                set_bit(BIO_QUIET, &bio->bi_flags);
  
-       bio->bi_size -= nbytes;
-       bio->bi_sector += (nbytes >> 9);
-       if (bio_integrity(bio))
-               bio_integrity_advance(bio, nbytes);
+       bio_advance(bio, nbytes);
  
        /* don't actually finish bio if it's part of flush sequence */
        if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
@@@ -1264,6 -1254,16 +1255,16 @@@ void part_round_stats(int cpu, struct h
  }
  EXPORT_SYMBOL_GPL(part_round_stats);
  
+ #ifdef CONFIG_PM_RUNTIME
+ static void blk_pm_put_request(struct request *rq)
+ {
+       if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending)
+               pm_runtime_mark_last_busy(rq->q->dev);
+ }
+ #else
+ static inline void blk_pm_put_request(struct request *rq) {}
+ #endif
  /*
   * queue lock must be held
   */
@@@ -1274,6 -1274,8 +1275,8 @@@ void __blk_put_request(struct request_q
        if (unlikely(--req->ref_count))
                return;
  
+       blk_pm_put_request(req);
        elv_completed_request(q, req);
  
        /* this is a bio leak */
@@@ -1597,7 -1599,7 +1600,7 @@@ static void handle_bad_sector(struct bi
        printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
                        bdevname(bio->bi_bdev, b),
                        bio->bi_rw,
-                       (unsigned long long)bio->bi_sector + bio_sectors(bio),
+                       (unsigned long long)bio_end_sector(bio),
                        (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
  
        set_bit(BIO_EOF, &bio->bi_flags);
@@@ -2053,6 -2055,28 +2056,28 @@@ static void blk_account_io_done(struct 
        }
  }
  
+ #ifdef CONFIG_PM_RUNTIME
+ /*
+  * Don't process normal requests when queue is suspended
+  * or in the process of suspending/resuming
+  */
+ static struct request *blk_pm_peek_request(struct request_queue *q,
+                                          struct request *rq)
+ {
+       if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
+           (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM))))
+               return NULL;
+       else
+               return rq;
+ }
+ #else
+ static inline struct request *blk_pm_peek_request(struct request_queue *q,
+                                                 struct request *rq)
+ {
+       return rq;
+ }
+ #endif
  /**
   * blk_peek_request - peek at the top of a request queue
   * @q: request queue to peek at
@@@ -2075,6 -2099,11 +2100,11 @@@ struct request *blk_peek_request(struc
        int ret;
  
        while ((rq = __elv_next_request(q)) != NULL) {
+               rq = blk_pm_peek_request(q, rq);
+               if (!rq)
+                       break;
                if (!(rq->cmd_flags & REQ_STARTED)) {
                        /*
                         * This is the first time the device driver
@@@ -2253,8 -2282,7 +2283,7 @@@ EXPORT_SYMBOL(blk_fetch_request)
   **/
  bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
  {
-       int total_bytes, bio_nbytes, next_idx = 0;
-       struct bio *bio;
+       int total_bytes;
  
        if (!req->bio)
                return false;
  
        blk_account_io_completion(req, nr_bytes);
  
-       total_bytes = bio_nbytes = 0;
-       while ((bio = req->bio) != NULL) {
-               int nbytes;
+       total_bytes = 0;
+       while (req->bio) {
+               struct bio *bio = req->bio;
+               unsigned bio_bytes = min(bio->bi_size, nr_bytes);
  
-               if (nr_bytes >= bio->bi_size) {
+               if (bio_bytes == bio->bi_size)
                        req->bio = bio->bi_next;
-                       nbytes = bio->bi_size;
-                       req_bio_endio(req, bio, nbytes, error);
-                       next_idx = 0;
-                       bio_nbytes = 0;
-               } else {
-                       int idx = bio->bi_idx + next_idx;
  
-                       if (unlikely(idx >= bio->bi_vcnt)) {
-                               blk_dump_rq_flags(req, "__end_that");
-                               printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n",
-                                      __func__, idx, bio->bi_vcnt);
-                               break;
-                       }
+               req_bio_endio(req, bio, bio_bytes, error);
  
-                       nbytes = bio_iovec_idx(bio, idx)->bv_len;
-                       BIO_BUG_ON(nbytes > bio->bi_size);
+               total_bytes += bio_bytes;
+               nr_bytes -= bio_bytes;
  
-                       /*
-                        * not a complete bvec done
-                        */
-                       if (unlikely(nbytes > nr_bytes)) {
-                               bio_nbytes += nr_bytes;
-                               total_bytes += nr_bytes;
-                               break;
-                       }
-                       /*
-                        * advance to the next vector
-                        */
-                       next_idx++;
-                       bio_nbytes += nbytes;
-               }
-               total_bytes += nbytes;
-               nr_bytes -= nbytes;
-               bio = req->bio;
-               if (bio) {
-                       /*
-                        * end more in this run, or just return 'not-done'
-                        */
-                       if (unlikely(nr_bytes <= 0))
-                               break;
-               }
+               if (!nr_bytes)
+                       break;
        }
  
        /*
                return false;
        }
  
-       /*
-        * if the request wasn't completed, update state
-        */
-       if (bio_nbytes) {
-               req_bio_endio(req, bio, bio_nbytes, error);
-               bio->bi_idx += next_idx;
-               bio_iovec(bio)->bv_offset += nr_bytes;
-               bio_iovec(bio)->bv_len -= nr_bytes;
-       }
        req->__data_len -= total_bytes;
        req->buffer = bio_data(req->bio);
  
@@@ -3046,6 -3029,149 +3030,149 @@@ void blk_finish_plug(struct blk_plug *p
  }
  EXPORT_SYMBOL(blk_finish_plug);
  
+ #ifdef CONFIG_PM_RUNTIME
+ /**
+  * blk_pm_runtime_init - Block layer runtime PM initialization routine
+  * @q: the queue of the device
+  * @dev: the device the queue belongs to
+  *
+  * Description:
+  *    Initialize runtime-PM-related fields for @q and start auto suspend for
+  *    @dev. Drivers that want to take advantage of request-based runtime PM
+  *    should call this function after @dev has been initialized, and its
+  *    request queue @q has been allocated, and runtime PM for it can not happen
+  *    yet(either due to disabled/forbidden or its usage_count > 0). In most
+  *    cases, driver should call this function before any I/O has taken place.
+  *
+  *    This function takes care of setting up using auto suspend for the device,
+  *    the autosuspend delay is set to -1 to make runtime suspend impossible
+  *    until an updated value is either set by user or by driver. Drivers do
+  *    not need to touch other autosuspend settings.
+  *
+  *    The block layer runtime PM is request based, so only works for drivers
+  *    that use request as their IO unit instead of those directly use bio's.
+  */
+ void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
+ {
+       q->dev = dev;
+       q->rpm_status = RPM_ACTIVE;
+       pm_runtime_set_autosuspend_delay(q->dev, -1);
+       pm_runtime_use_autosuspend(q->dev);
+ }
+ EXPORT_SYMBOL(blk_pm_runtime_init);
+ /**
+  * blk_pre_runtime_suspend - Pre runtime suspend check
+  * @q: the queue of the device
+  *
+  * Description:
+  *    This function will check if runtime suspend is allowed for the device
+  *    by examining if there are any requests pending in the queue. If there
+  *    are requests pending, the device can not be runtime suspended; otherwise,
+  *    the queue's status will be updated to SUSPENDING and the driver can
+  *    proceed to suspend the device.
+  *
+  *    For the not allowed case, we mark last busy for the device so that
+  *    runtime PM core will try to autosuspend it some time later.
+  *
+  *    This function should be called near the start of the device's
+  *    runtime_suspend callback.
+  *
+  * Return:
+  *    0               - OK to runtime suspend the device
+  *    -EBUSY  - Device should not be runtime suspended
+  */
+ int blk_pre_runtime_suspend(struct request_queue *q)
+ {
+       int ret = 0;
+       spin_lock_irq(q->queue_lock);
+       if (q->nr_pending) {
+               ret = -EBUSY;
+               pm_runtime_mark_last_busy(q->dev);
+       } else {
+               q->rpm_status = RPM_SUSPENDING;
+       }
+       spin_unlock_irq(q->queue_lock);
+       return ret;
+ }
+ EXPORT_SYMBOL(blk_pre_runtime_suspend);
+ /**
+  * blk_post_runtime_suspend - Post runtime suspend processing
+  * @q: the queue of the device
+  * @err: return value of the device's runtime_suspend function
+  *
+  * Description:
+  *    Update the queue's runtime status according to the return value of the
+  *    device's runtime suspend function and mark last busy for the device so
+  *    that PM core will try to auto suspend the device at a later time.
+  *
+  *    This function should be called near the end of the device's
+  *    runtime_suspend callback.
+  */
+ void blk_post_runtime_suspend(struct request_queue *q, int err)
+ {
+       spin_lock_irq(q->queue_lock);
+       if (!err) {
+               q->rpm_status = RPM_SUSPENDED;
+       } else {
+               q->rpm_status = RPM_ACTIVE;
+               pm_runtime_mark_last_busy(q->dev);
+       }
+       spin_unlock_irq(q->queue_lock);
+ }
+ EXPORT_SYMBOL(blk_post_runtime_suspend);
+ /**
+  * blk_pre_runtime_resume - Pre runtime resume processing
+  * @q: the queue of the device
+  *
+  * Description:
+  *    Update the queue's runtime status to RESUMING in preparation for the
+  *    runtime resume of the device.
+  *
+  *    This function should be called near the start of the device's
+  *    runtime_resume callback.
+  */
+ void blk_pre_runtime_resume(struct request_queue *q)
+ {
+       spin_lock_irq(q->queue_lock);
+       q->rpm_status = RPM_RESUMING;
+       spin_unlock_irq(q->queue_lock);
+ }
+ EXPORT_SYMBOL(blk_pre_runtime_resume);
+ /**
+  * blk_post_runtime_resume - Post runtime resume processing
+  * @q: the queue of the device
+  * @err: return value of the device's runtime_resume function
+  *
+  * Description:
+  *    Update the queue's runtime status according to the return value of the
+  *    device's runtime_resume function. If it is successfully resumed, process
+  *    the requests that are queued into the device's queue when it is resuming
+  *    and then mark last busy and initiate autosuspend for it.
+  *
+  *    This function should be called near the end of the device's
+  *    runtime_resume callback.
+  */
+ void blk_post_runtime_resume(struct request_queue *q, int err)
+ {
+       spin_lock_irq(q->queue_lock);
+       if (!err) {
+               q->rpm_status = RPM_ACTIVE;
+               __blk_run_queue(q);
+               pm_runtime_mark_last_busy(q->dev);
+               pm_runtime_autosuspend(q->dev);
+       } else {
+               q->rpm_status = RPM_SUSPENDED;
+       }
+       spin_unlock_irq(q->queue_lock);
+ }
+ EXPORT_SYMBOL(blk_post_runtime_resume);
+ #endif
  int __init blk_dev_init(void)
  {
        BUILD_BUG_ON(__REQ_NR_BITS > 8 *
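
The kernel-doc added above spells out when a driver is expected to call the
new blk_pm_* hooks. A minimal sketch of that wiring, assuming a hypothetical
driver: foo_dev, foo_device_suspend() and foo_device_resume() are invented
stand-ins for real driver state and hardware routines, and the 5000 ms
autosuspend delay is just an example value.

#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/pm_runtime.h>

/* Hypothetical driver state; only the request queue matters here. */
struct foo_dev {
	struct request_queue *queue;
};

/* Stand-ins for the driver's real hardware power routines. */
static int foo_device_suspend(struct foo_dev *foo) { return 0; }
static int foo_device_resume(struct foo_dev *foo) { return 0; }

static int foo_runtime_suspend(struct device *dev)
{
	struct foo_dev *foo = dev_get_drvdata(dev);
	int err;

	/* Returns -EBUSY (and marks last busy) if requests are pending. */
	err = blk_pre_runtime_suspend(foo->queue);
	if (err)
		return err;

	err = foo_device_suspend(foo);
	blk_post_runtime_suspend(foo->queue, err);
	return err;
}

static int foo_runtime_resume(struct device *dev)
{
	struct foo_dev *foo = dev_get_drvdata(dev);
	int err;

	blk_pre_runtime_resume(foo->queue);
	err = foo_device_resume(foo);
	blk_post_runtime_resume(foo->queue, err);
	return err;
}

/*
 * At probe time, once the queue has been allocated and before any I/O:
 *
 *	blk_pm_runtime_init(foo->queue, dev);
 *	pm_runtime_set_autosuspend_delay(dev, 5000);
 */
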
diff --combined drivers/block/aoe/aoecmd.c
index 92b6d7c51e39590b3780f17c88737009b7becbbf,af96ca171238e1f9ca6499ca9d3f3f5f5194d648..5efed089a702d501f3b0de2e85997362555d0a09
@@@ -51,9 -51,8 +51,9 @@@ new_skb(ulong len
  {
        struct sk_buff *skb;
  
 -      skb = alloc_skb(len, GFP_ATOMIC);
 +      skb = alloc_skb(len + MAX_HEADER, GFP_ATOMIC);
        if (skb) {
 +              skb_reserve(skb, MAX_HEADER);
                skb_reset_mac_header(skb);
                skb_reset_network_header(skb);
                skb->protocol = __constant_htons(ETH_P_AOE);
@@@ -928,7 -927,7 +928,7 @@@ bufinit(struct buf *buf, struct reques
        buf->resid = bio->bi_size;
        buf->sector = bio->bi_sector;
        bio_pageinc(bio);
-       buf->bv = bv = &bio->bi_io_vec[bio->bi_idx];
+       buf->bv = bv = bio_iovec(bio);
        buf->bv_resid = bv->bv_len;
        WARN_ON(buf->bv_resid == 0);
  }
diff --combined drivers/block/floppy.c
index c49e85608101e6fb97b90b7a1b014fb0cbbd2ac4,83232639034eda69da39d8f84eb60e8f63339538..04ceb7e2fadd6ca075d20ecd844c39bf1da07ff3
@@@ -3601,7 -3601,7 +3601,7 @@@ static void __init config_types(void
                pr_cont("\n");
  }
  
 -static int floppy_release(struct gendisk *disk, fmode_t mode)
 +static void floppy_release(struct gendisk *disk, fmode_t mode)
  {
        int drive = (long)disk->private_data;
  
                opened_bdev[drive] = NULL;
        mutex_unlock(&open_lock);
        mutex_unlock(&floppy_mutex);
 -
 -      return 0;
  }
  
  /*
@@@ -3775,7 -3777,6 +3775,6 @@@ static int __floppy_read_block_0(struc
        bio_vec.bv_len = size;
        bio_vec.bv_offset = 0;
        bio.bi_vcnt = 1;
-       bio.bi_idx = 0;
        bio.bi_size = size;
        bio.bi_bdev = bdev;
        bio.bi_sector = 0;
diff --combined drivers/block/pktcdvd.c
index 9f2d348f7115424e3bda72d1f8ce61418f4f03d1,11190424536a95a8cf1ca3eb84a46fc396930e69..3c08983e600a0a15e1380e9de1e6f50714fe3976
@@@ -901,7 -901,7 +901,7 @@@ static void pkt_iosched_process_queue(s
                        pd->iosched.successive_reads += bio->bi_size >> 10;
                else {
                        pd->iosched.successive_reads = 0;
-                       pd->iosched.last_write = bio->bi_sector + bio_sectors(bio);
+                       pd->iosched.last_write = bio_end_sector(bio);
                }
                if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
                        if (pd->read_speed == pd->write_speed) {
@@@ -947,31 -947,6 +947,6 @@@ static int pkt_set_segment_merging(stru
        }
  }
  
- /*
-  * Copy CD_FRAMESIZE bytes from src_bio into a destination page
-  */
- static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct page *dst_page, int dst_offs)
- {
-       unsigned int copy_size = CD_FRAMESIZE;
-       while (copy_size > 0) {
-               struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg);
-               void *vfrom = kmap_atomic(src_bvl->bv_page) +
-                       src_bvl->bv_offset + offs;
-               void *vto = page_address(dst_page) + dst_offs;
-               int len = min_t(int, copy_size, src_bvl->bv_len - offs);
-               BUG_ON(len < 0);
-               memcpy(vto, vfrom, len);
-               kunmap_atomic(vfrom);
-               seg++;
-               offs = 0;
-               dst_offs += len;
-               copy_size -= len;
-       }
- }
  /*
   * Copy all data for this packet to pkt->pages[], so that
   * a) The number of required segments for the write bio is minimized, which
@@@ -1181,16 -1156,15 +1156,15 @@@ static int pkt_start_recovery(struct pa
        new_sector = new_block * (CD_FRAMESIZE >> 9);
        pkt->sector = new_sector;
  
+       bio_reset(pkt->bio);
+       pkt->bio->bi_bdev = pd->bdev;
+       pkt->bio->bi_rw = REQ_WRITE;
        pkt->bio->bi_sector = new_sector;
-       pkt->bio->bi_next = NULL;
-       pkt->bio->bi_flags = 1 << BIO_UPTODATE;
-       pkt->bio->bi_idx = 0;
+       pkt->bio->bi_size = pkt->frames * CD_FRAMESIZE;
+       pkt->bio->bi_vcnt = pkt->frames;
  
-       BUG_ON(pkt->bio->bi_rw != REQ_WRITE);
-       BUG_ON(pkt->bio->bi_vcnt != pkt->frames);
-       BUG_ON(pkt->bio->bi_size != pkt->frames * CD_FRAMESIZE);
-       BUG_ON(pkt->bio->bi_end_io != pkt_end_io_packet_write);
-       BUG_ON(pkt->bio->bi_private != pkt);
+       pkt->bio->bi_end_io = pkt_end_io_packet_write;
+       pkt->bio->bi_private = pkt;
  
        drop_super(sb);
        return 1;
@@@ -1325,55 -1299,35 +1299,35 @@@ try_next_bio
   */
  static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
  {
-       struct bio *bio;
        int f;
-       int frames_write;
        struct bio_vec *bvec = pkt->w_bio->bi_io_vec;
  
+       bio_reset(pkt->w_bio);
+       pkt->w_bio->bi_sector = pkt->sector;
+       pkt->w_bio->bi_bdev = pd->bdev;
+       pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
+       pkt->w_bio->bi_private = pkt;
+       /* XXX: locking? */
        for (f = 0; f < pkt->frames; f++) {
                bvec[f].bv_page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
                bvec[f].bv_offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
+               if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
+                       BUG();
        }
+       VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);
  
        /*
         * Fill-in bvec with data from orig_bios.
         */
-       frames_write = 0;
        spin_lock(&pkt->lock);
-       bio_list_for_each(bio, &pkt->orig_bios) {
-               int segment = bio->bi_idx;
-               int src_offs = 0;
-               int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9);
-               int num_frames = bio->bi_size / CD_FRAMESIZE;
-               BUG_ON(first_frame < 0);
-               BUG_ON(first_frame + num_frames > pkt->frames);
-               for (f = first_frame; f < first_frame + num_frames; f++) {
-                       struct bio_vec *src_bvl = bio_iovec_idx(bio, segment);
-                       while (src_offs >= src_bvl->bv_len) {
-                               src_offs -= src_bvl->bv_len;
-                               segment++;
-                               BUG_ON(segment >= bio->bi_vcnt);
-                               src_bvl = bio_iovec_idx(bio, segment);
-                       }
+       bio_copy_data(pkt->w_bio, pkt->orig_bios.head);
  
-                       if (src_bvl->bv_len - src_offs >= CD_FRAMESIZE) {
-                               bvec[f].bv_page = src_bvl->bv_page;
-                               bvec[f].bv_offset = src_bvl->bv_offset + src_offs;
-                       } else {
-                               pkt_copy_bio_data(bio, segment, src_offs,
-                                                 bvec[f].bv_page, bvec[f].bv_offset);
-                       }
-                       src_offs += CD_FRAMESIZE;
-                       frames_write++;
-               }
-       }
        pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
        spin_unlock(&pkt->lock);
  
        VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n",
-               frames_write, (unsigned long long)pkt->sector);
-       BUG_ON(frames_write != pkt->write_size);
+               pkt->write_size, (unsigned long long)pkt->sector);
  
        if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) {
                pkt_make_local_copy(pkt, bvec);
        }
  
        /* Start the write request */
-       bio_reset(pkt->w_bio);
-       pkt->w_bio->bi_sector = pkt->sector;
-       pkt->w_bio->bi_bdev = pd->bdev;
-       pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
-       pkt->w_bio->bi_private = pkt;
-       for (f = 0; f < pkt->frames; f++)
-               if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
-                       BUG();
-       VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);
        atomic_set(&pkt->io_wait, 1);
        pkt->w_bio->bi_rw = WRITE;
        pkt_queue_bio(pd, pkt->w_bio);
@@@ -2376,9 -2320,10 +2320,9 @@@ out
        return ret;
  }
  
 -static int pkt_close(struct gendisk *disk, fmode_t mode)
 +static void pkt_close(struct gendisk *disk, fmode_t mode)
  {
        struct pktcdvd_device *pd = disk->private_data;
 -      int ret = 0;
  
        mutex_lock(&pktcdvd_mutex);
        mutex_lock(&ctl_mutex);
        }
        mutex_unlock(&ctl_mutex);
        mutex_unlock(&pktcdvd_mutex);
 -      return ret;
  }
  
  
@@@ -2431,7 -2377,7 +2375,7 @@@ static void pkt_make_request(struct req
                cloned_bio->bi_bdev = pd->bdev;
                cloned_bio->bi_private = psd;
                cloned_bio->bi_end_io = pkt_end_io_read_cloned;
-               pd->stats.secs_r += bio->bi_size >> 9;
+               pd->stats.secs_r += bio_sectors(bio);
                pkt_queue_bio(pd, cloned_bio);
                return;
        }
        zone = ZONE(bio->bi_sector, pd);
        VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n",
                (unsigned long long)bio->bi_sector,
-               (unsigned long long)(bio->bi_sector + bio_sectors(bio)));
+               (unsigned long long)bio_end_sector(bio));
  
        /* Check if we have to split the bio */
        {
                sector_t last_zone;
                int first_sectors;
  
-               last_zone = ZONE(bio->bi_sector + bio_sectors(bio) - 1, pd);
+               last_zone = ZONE(bio_end_sector(bio) - 1, pd);
                if (last_zone != zone) {
                        BUG_ON(last_zone != zone + pd->settings.size);
                        first_sectors = last_zone - bio->bi_sector;
@@@ -2646,7 -2592,7 +2590,7 @@@ static int pkt_seq_show(struct seq_fil
  
  static int pkt_seq_open(struct inode *inode, struct file *file)
  {
 -      return single_open(file, pkt_seq_show, PDE(inode)->data);
 +      return single_open(file, pkt_seq_show, PDE_DATA(inode));
  }
  
  static const struct file_operations pkt_proc_fops = {
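
A minimal sketch (not part of this merge) mirroring the pktcdvd conversion
above: build a write bio over driver-owned pages with bio_reset() and
bio_add_page(), then fill it from the original bio with bio_copy_data(dst,
src). The function name fill_write_bio and the pages/nframes parameters are
invented for illustration.

#include <linux/bio.h>
#include <linux/blkdev.h>

static void fill_write_bio(struct bio *w_bio, struct bio *orig,
			   struct block_device *bdev,
			   struct page **pages, int nframes)
{
	int f;

	bio_reset(w_bio);
	w_bio->bi_sector = orig->bi_sector;
	w_bio->bi_bdev = bdev;		/* bio_add_page() needs the queue */
	for (f = 0; f < nframes; f++)
		if (!bio_add_page(w_bio, pages[f], PAGE_SIZE, 0))
			break;

	/* Copy the payload of the original bio into the new one. */
	bio_copy_data(w_bio, orig);
}
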
diff --combined drivers/block/rbd.c
index 22ffd5dcb1681da2b48b110e54e22f69457aa508,6b2b039c191fc5da53418371f290babdc5cd02c1..ca63104136e0db46d0248aa290c355483989ec2e
@@@ -1,4 -1,3 +1,4 @@@
 +
  /*
     rbd.c -- Export ceph rados objects as a Linux block device
  
  #include <linux/ceph/mon_client.h>
  #include <linux/ceph/decode.h>
  #include <linux/parser.h>
 +#include <linux/bsearch.h>
  
  #include <linux/kernel.h>
  #include <linux/device.h>
  #include <linux/module.h>
  #include <linux/fs.h>
  #include <linux/blkdev.h>
 +#include <linux/slab.h>
  
  #include "rbd_types.h"
  
  #define       SECTOR_SHIFT    9
  #define       SECTOR_SIZE     (1ULL << SECTOR_SHIFT)
  
 -/* It might be useful to have these defined elsewhere */
 -
 -#define       U8_MAX  ((u8)   (~0U))
 -#define       U16_MAX ((u16)  (~0U))
 -#define       U32_MAX ((u32)  (~0U))
 -#define       U64_MAX ((u64)  (~0ULL))
 -
  #define RBD_DRV_NAME "rbd"
  #define RBD_DRV_NAME_LONG "rbd (rados block device)"
  
@@@ -68,8 -72,6 +68,8 @@@
  
  #define RBD_SNAP_HEAD_NAME    "-"
  
 +#define       BAD_SNAP_INDEX  U32_MAX         /* invalid index into snap array */
 +
  /* This allows a single page to hold an image name sent by OSD */
  #define RBD_IMAGE_NAME_LEN_MAX        (PAGE_SIZE - sizeof (__le32) - 1)
  #define RBD_IMAGE_ID_LEN_MAX  64
  
  /* Feature bits */
  
 -#define RBD_FEATURE_LAYERING      1
 +#define RBD_FEATURE_LAYERING  (1<<0)
 +#define RBD_FEATURE_STRIPINGV2        (1<<1)
 +#define RBD_FEATURES_ALL \
 +          (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
  
  /* Features supported by this (client software) implementation. */
  
 -#define RBD_FEATURES_ALL          (0)
 +#define RBD_FEATURES_SUPPORTED        (RBD_FEATURES_ALL)
  
  /*
   * An RBD device name will be "rbd#", where the "rbd" comes from
@@@ -113,8 -112,7 +113,8 @@@ struct rbd_image_header 
        char *snap_names;
        u64 *snap_sizes;
  
 -      u64 obj_version;
 +      u64 stripe_unit;
 +      u64 stripe_count;
  };
  
  /*
   */
  struct rbd_spec {
        u64             pool_id;
 -      char            *pool_name;
 +      const char      *pool_name;
  
 -      char            *image_id;
 -      char            *image_name;
 +      const char      *image_id;
 +      const char      *image_name;
  
        u64             snap_id;
 -      char            *snap_name;
 +      const char      *snap_name;
  
        struct kref     kref;
  };
@@@ -176,44 -174,13 +176,44 @@@ enum obj_request_type 
        OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
  };
  
 +enum obj_req_flags {
 +      OBJ_REQ_DONE,           /* completion flag: not done = 0, done = 1 */
 +      OBJ_REQ_IMG_DATA,       /* object usage: standalone = 0, image = 1 */
 +      OBJ_REQ_KNOWN,          /* EXISTS flag valid: no = 0, yes = 1 */
 +      OBJ_REQ_EXISTS,         /* target exists: no = 0, yes = 1 */
 +};
 +
  struct rbd_obj_request {
        const char              *object_name;
        u64                     offset;         /* object start byte */
        u64                     length;         /* bytes from offset */
 +      unsigned long           flags;
  
 -      struct rbd_img_request  *img_request;
 -      struct list_head        links;          /* img_request->obj_requests */
 +      /*
 +       * An object request associated with an image will have its
 +       * img_data flag set; a standalone object request will not.
 +       *
 +       * A standalone object request will have which == BAD_WHICH
 +       * and a null obj_request pointer.
 +       *
 +       * An object request initiated in support of a layered image
 +       * object (to check for its existence before a write) will
 +       * have which == BAD_WHICH and a non-null obj_request pointer.
 +       *
 +       * Finally, an object request for rbd image data will have
 +       * which != BAD_WHICH, and will have a non-null img_request
 +       * pointer.  The value of which will be in the range
 +       * 0..(img_request->obj_request_count-1).
 +       */
 +      union {
 +              struct rbd_obj_request  *obj_request;   /* STAT op */
 +              struct {
 +                      struct rbd_img_request  *img_request;
 +                      u64                     img_offset;
 +                      /* links for img_request->obj_requests list */
 +                      struct list_head        links;
 +              };
 +      };
        u32                     which;          /* posn image request list */
  
        enum obj_request_type   type;
                        u32             page_count;
                };
        };
 +      struct page             **copyup_pages;
  
        struct ceph_osd_request *osd_req;
  
        u64                     xferred;        /* bytes transferred */
 -      u64                     version;
        int                     result;
 -      atomic_t                done;
  
        rbd_obj_callback_t      callback;
        struct completion       completion;
        struct kref             kref;
  };
  
 +enum img_req_flags {
 +      IMG_REQ_WRITE,          /* I/O direction: read = 0, write = 1 */
 +      IMG_REQ_CHILD,          /* initiator: block = 0, child image = 1 */
 +      IMG_REQ_LAYERED,        /* ENOENT handling: normal = 0, layered = 1 */
 +};
 +
  struct rbd_img_request {
 -      struct request          *rq;
        struct rbd_device       *rbd_dev;
        u64                     offset; /* starting image byte offset */
        u64                     length; /* byte count from offset */
 -      bool                    write_request;  /* false for read */
 +      unsigned long           flags;
        union {
 +              u64                     snap_id;        /* for reads */
                struct ceph_snap_context *snapc;        /* for writes */
 -              u64             snap_id;                /* for reads */
        };
 +      union {
 +              struct request          *rq;            /* block request */
 +              struct rbd_obj_request  *obj_request;   /* obj req initiator */
 +      };
 +      struct page             **copyup_pages;
        spinlock_t              completion_lock;/* protects next_completion */
        u32                     next_completion;
        rbd_img_callback_t      callback;
 +      u64                     xferred;/* aggregate bytes transferred */
 +      int                     result; /* first nonzero obj_request result */
  
        u32                     obj_request_count;
        struct list_head        obj_requests;   /* rbd_obj_request structs */
  #define for_each_obj_request_safe(ireq, oreq, n) \
        list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
  
 -struct rbd_snap {
 -      struct  device          dev;
 -      const char              *name;
 -      u64                     size;
 -      struct list_head        node;
 -      u64                     id;
 -      u64                     features;
 -};
 -
  struct rbd_mapping {
        u64                     size;
        u64                     features;
@@@ -311,7 -276,6 +311,7 @@@ struct rbd_device 
  
        struct rbd_spec         *parent_spec;
        u64                     parent_overlap;
 +      struct rbd_device       *parent;
  
        /* protects updating the header */
        struct rw_semaphore     header_rwsem;
  
        struct list_head        node;
  
 -      /* list of snapshots */
 -      struct list_head        snaps;
 -
        /* sysfs related */
        struct device           dev;
        unsigned long           open_count;     /* protected by lock */
@@@ -345,21 -312,16 +345,21 @@@ static DEFINE_SPINLOCK(rbd_dev_list_loc
  static LIST_HEAD(rbd_client_list);            /* clients */
  static DEFINE_SPINLOCK(rbd_client_list_lock);
  
 -static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
 -static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
 +/* Slab caches for frequently-allocated structures */
  
 -static void rbd_dev_release(struct device *dev);
 -static void rbd_remove_snap_dev(struct rbd_snap *snap);
 +static struct kmem_cache      *rbd_img_request_cache;
 +static struct kmem_cache      *rbd_obj_request_cache;
 +static struct kmem_cache      *rbd_segment_name_cache;
 +
 +static int rbd_img_request_submit(struct rbd_img_request *img_request);
 +
 +static void rbd_dev_device_release(struct device *dev);
  
  static ssize_t rbd_add(struct bus_type *bus, const char *buf,
                       size_t count);
  static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
                          size_t count);
 +static int rbd_dev_image_probe(struct rbd_device *rbd_dev);
  
  static struct bus_attribute rbd_bus_attrs[] = {
        __ATTR(add, S_IWUSR, NULL, rbd_add),
@@@ -421,19 -383,8 +421,19 @@@ void rbd_warn(struct rbd_device *rbd_de
  #  define rbd_assert(expr)    ((void) 0)
  #endif /* !RBD_DEBUG */
  
 -static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
 -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
 +static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
 +static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
 +static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
 +
 +static int rbd_dev_refresh(struct rbd_device *rbd_dev);
 +static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev);
 +static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 +                                      u64 snap_id);
 +static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 +                              u8 *order, u64 *snap_size);
 +static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 +              u64 *snap_features);
 +static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
  
  static int rbd_open(struct block_device *bdev, fmode_t mode)
  {
        return 0;
  }
  
 -static int rbd_release(struct gendisk *disk, fmode_t mode)
 +static void rbd_release(struct gendisk *disk, fmode_t mode)
  {
        struct rbd_device *rbd_dev = disk->private_data;
        unsigned long open_count_before;
        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        put_device(&rbd_dev->dev);
        mutex_unlock(&ctl_mutex);
 -
 -      return 0;
  }
  
  static const struct block_device_operations rbd_bd_ops = {
@@@ -531,13 -484,6 +531,13 @@@ out_opt
        return ERR_PTR(ret);
  }
  
 +static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
 +{
 +      kref_get(&rbdc->kref);
 +
 +      return rbdc;
 +}
 +
  /*
   * Find a ceph client with specific addr and configuration.  If
   * found, bump its reference count.
@@@ -553,8 -499,7 +553,8 @@@ static struct rbd_client *rbd_client_fi
        spin_lock(&rbd_client_list_lock);
        list_for_each_entry(client_node, &rbd_client_list, node) {
                if (!ceph_compare_options(ceph_opts, client_node->client)) {
 -                      kref_get(&client_node->kref);
 +                      __rbd_get_client(client_node);
 +
                        found = true;
                        break;
                }
@@@ -777,6 -722,7 +777,6 @@@ static int rbd_header_from_disk(struct 
                        header->snap_sizes[i] =
                                le64_to_cpu(ondisk->snaps[i].image_size);
        } else {
 -              WARN_ON(ondisk->snap_names_len);
                header->snap_names = NULL;
                header->snap_sizes = NULL;
        }
        /* Allocate and fill in the snapshot context */
  
        header->image_size = le64_to_cpu(ondisk->image_size);
 -      size = sizeof (struct ceph_snap_context);
 -      size += snap_count * sizeof (header->snapc->snaps[0]);
 -      header->snapc = kzalloc(size, GFP_KERNEL);
 +
 +      header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
        if (!header->snapc)
                goto out_err;
 -
 -      atomic_set(&header->snapc->nref, 1);
        header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
 -      header->snapc->num_snaps = snap_count;
        for (i = 0; i < snap_count; i++)
 -              header->snapc->snaps[i] =
 -                      le64_to_cpu(ondisk->snaps[i].id);
 +              header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id);
  
        return 0;
  
@@@ -810,174 -761,70 +810,174 @@@ out_err
        return -ENOMEM;
  }
  
 -static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 +static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
 +{
 +      const char *snap_name;
 +
 +      rbd_assert(which < rbd_dev->header.snapc->num_snaps);
 +
 +      /* Skip over names until we find the one we are looking for */
 +
 +      snap_name = rbd_dev->header.snap_names;
 +      while (which--)
 +              snap_name += strlen(snap_name) + 1;
 +
 +      return kstrdup(snap_name, GFP_KERNEL);
 +}
 +
 +/*
 + * Snapshot id comparison function for use with qsort()/bsearch().
 + * Note that result is for snapshots in *descending* order.
 + */
 +static int snapid_compare_reverse(const void *s1, const void *s2)
 +{
 +      u64 snap_id1 = *(u64 *)s1;
 +      u64 snap_id2 = *(u64 *)s2;
 +
 +      if (snap_id1 < snap_id2)
 +              return 1;
 +      return snap_id1 == snap_id2 ? 0 : -1;
 +}
 +
 +/*
 + * Search a snapshot context to see if the given snapshot id is
 + * present.
 + *
 + * Returns the position of the snapshot id in the array if it's found,
 + * or BAD_SNAP_INDEX otherwise.
 + *
 + * Note: The snapshot array is in kept sorted (by the osd) in
 + * reverse order, highest snapshot id first.
 + */
 +static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
 +{
 +      struct ceph_snap_context *snapc = rbd_dev->header.snapc;
 +      u64 *found;
 +
 +      found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
 +                              sizeof (snap_id), snapid_compare_reverse);
 +
 +      return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
 +}
 +
 +static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
 +                                      u64 snap_id)
  {
 -      struct rbd_snap *snap;
 +      u32 which;
  
 +      which = rbd_dev_snap_index(rbd_dev, snap_id);
 +      if (which == BAD_SNAP_INDEX)
 +              return NULL;
 +
 +      return _rbd_dev_v1_snap_name(rbd_dev, which);
 +}
 +
 +static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 +{
        if (snap_id == CEPH_NOSNAP)
                return RBD_SNAP_HEAD_NAME;
  
 -      list_for_each_entry(snap, &rbd_dev->snaps, node)
 -              if (snap_id == snap->id)
 -                      return snap->name;
 +      rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 +      if (rbd_dev->image_format == 1)
 +              return rbd_dev_v1_snap_name(rbd_dev, snap_id);
  
 -      return NULL;
 +      return rbd_dev_v2_snap_name(rbd_dev, snap_id);
  }
  
 -static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
 +static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 +                              u64 *snap_size)
  {
 +      rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 +      if (snap_id == CEPH_NOSNAP) {
 +              *snap_size = rbd_dev->header.image_size;
 +      } else if (rbd_dev->image_format == 1) {
 +              u32 which;
  
 -      struct rbd_snap *snap;
 +              which = rbd_dev_snap_index(rbd_dev, snap_id);
 +              if (which == BAD_SNAP_INDEX)
 +                      return -ENOENT;
 +
 +              *snap_size = rbd_dev->header.snap_sizes[which];
 +      } else {
 +              u64 size = 0;
 +              int ret;
  
 -      list_for_each_entry(snap, &rbd_dev->snaps, node) {
 -              if (!strcmp(snap_name, snap->name)) {
 -                      rbd_dev->spec->snap_id = snap->id;
 -                      rbd_dev->mapping.size = snap->size;
 -                      rbd_dev->mapping.features = snap->features;
 +              ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
 +              if (ret)
 +                      return ret;
  
 -                      return 0;
 -              }
 +              *snap_size = size;
        }
 +      return 0;
 +}
 +
 +static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 +                      u64 *snap_features)
 +{
 +      rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 +      if (snap_id == CEPH_NOSNAP) {
 +              *snap_features = rbd_dev->header.features;
 +      } else if (rbd_dev->image_format == 1) {
 +              *snap_features = 0;     /* No features for format 1 */
 +      } else {
 +              u64 features = 0;
 +              int ret;
 +
 +              ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
 +              if (ret)
 +                      return ret;
  
 -      return -ENOENT;
 +              *snap_features = features;
 +      }
 +      return 0;
  }
  
 -static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
 +static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
  {
 +      const char *snap_name = rbd_dev->spec->snap_name;
 +      u64 snap_id;
 +      u64 size = 0;
 +      u64 features = 0;
        int ret;
  
 -      if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
 -                  sizeof (RBD_SNAP_HEAD_NAME))) {
 -              rbd_dev->spec->snap_id = CEPH_NOSNAP;
 -              rbd_dev->mapping.size = rbd_dev->header.image_size;
 -              rbd_dev->mapping.features = rbd_dev->header.features;
 -              ret = 0;
 +      if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) {
 +              snap_id = rbd_snap_id_by_name(rbd_dev, snap_name);
 +              if (snap_id == CEPH_NOSNAP)
 +                      return -ENOENT;
        } else {
 -              ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
 -              if (ret < 0)
 -                      goto done;
 -              rbd_dev->mapping.read_only = true;
 +              snap_id = CEPH_NOSNAP;
        }
 -      set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
  
 -done:
 -      return ret;
 +      ret = rbd_snap_size(rbd_dev, snap_id, &size);
 +      if (ret)
 +              return ret;
 +      ret = rbd_snap_features(rbd_dev, snap_id, &features);
 +      if (ret)
 +              return ret;
 +
 +      rbd_dev->mapping.size = size;
 +      rbd_dev->mapping.features = features;
 +
 +      /* If we are mapping a snapshot it must be marked read-only */
 +
 +      if (snap_id != CEPH_NOSNAP)
 +              rbd_dev->mapping.read_only = true;
 +
 +      return 0;
  }
  
 -static void rbd_header_free(struct rbd_image_header *header)
 +static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
  {
 -      kfree(header->object_prefix);
 -      header->object_prefix = NULL;
 -      kfree(header->snap_sizes);
 -      header->snap_sizes = NULL;
 -      kfree(header->snap_names);
 -      header->snap_names = NULL;
 -      ceph_put_snap_context(header->snapc);
 -      header->snapc = NULL;
 +      rbd_dev->mapping.size = 0;
 +      rbd_dev->mapping.features = 0;
 +      rbd_dev->mapping.read_only = true;
 +}
 +
 +static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev)
 +{
 +      rbd_dev->mapping.size = 0;
 +      rbd_dev->mapping.features = 0;
 +      rbd_dev->mapping.read_only = true;
  }
  
  static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
        u64 segment;
        int ret;
  
 -      name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
 +      name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
        if (!name)
                return NULL;
        segment = offset >> rbd_dev->header.obj_order;
        return name;
  }
  
 +static void rbd_segment_name_free(const char *name)
 +{
 +      /* The explicit cast here is needed to drop the const qualifier */
 +
 +      kmem_cache_free(rbd_segment_name_cache, (void *)name);
 +}
 +
  static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
  {
        u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
@@@ -1080,37 -920,6 +1080,37 @@@ static void zero_bio_chain(struct bio *
        }
  }
  
 +/*
 + * similar to zero_bio_chain(), zeros data defined by a page array,
 + * starting at the given byte offset from the start of the array and
 + * continuing up to the given end offset.  The pages array is
 + * assumed to be big enough to hold all bytes up to the end.
 + */
 +static void zero_pages(struct page **pages, u64 offset, u64 end)
 +{
 +      struct page **page = &pages[offset >> PAGE_SHIFT];
 +
 +      rbd_assert(end > offset);
 +      rbd_assert(end - offset <= (u64)SIZE_MAX);
 +      while (offset < end) {
 +              size_t page_offset;
 +              size_t length;
 +              unsigned long flags;
 +              void *kaddr;
 +
 +              page_offset = (size_t)(offset & ~PAGE_MASK);
 +              length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
 +              local_irq_save(flags);
 +              kaddr = kmap_atomic(*page);
 +              memset(kaddr + page_offset, 0, length);
 +              kunmap_atomic(kaddr);
 +              local_irq_restore(flags);
 +
 +              offset += length;
 +              page++;
 +      }
 +}
 +
  /*
   * Clone a portion of a bio, starting at the given byte offset
   * and continuing for the number of bytes indicated.
@@@ -1143,7 -952,7 +1143,7 @@@ static struct bio *bio_clone_range(stru
        /* Find first affected segment... */
  
        resid = offset;
-       __bio_for_each_segment(bv, bio_src, idx, 0) {
+       bio_for_each_segment(bv, bio_src, idx) {
                if (resid < bv->bv_len)
                        break;
                resid -= bv->bv_len;
@@@ -1255,77 -1064,6 +1255,77 @@@ out_err
        return NULL;
  }
  
 +/*
 + * The default/initial value for all object request flags is 0.  For
 + * each flag, once its value is set to 1 it is never reset to 0
 + * again.
 + */
 +static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
 +{
 +      if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
 +              struct rbd_device *rbd_dev;
 +
 +              rbd_dev = obj_request->img_request->rbd_dev;
 +              rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
 +                      obj_request);
 +      }
 +}
 +
 +static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
 +{
 +      smp_mb();
 +      return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
 +}
 +
 +static void obj_request_done_set(struct rbd_obj_request *obj_request)
 +{
 +      if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
 +              struct rbd_device *rbd_dev = NULL;
 +
 +              if (obj_request_img_data_test(obj_request))
 +                      rbd_dev = obj_request->img_request->rbd_dev;
 +              rbd_warn(rbd_dev, "obj_request %p already marked done\n",
 +                      obj_request);
 +      }
 +}
 +
 +static bool obj_request_done_test(struct rbd_obj_request *obj_request)
 +{
 +      smp_mb();
 +      return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
 +}
 +
 +/*
 + * This sets the KNOWN flag after (possibly) setting the EXISTS
 + * flag.  The latter is set based on the "exists" value provided.
 + *
 + * Note that for our purposes once an object exists it never goes
 + * away again.  It's possible that the response from two existence
 + * checks are separated by the creation of the target object, and
 + * the first ("doesn't exist") response arrives *after* the second
 + * ("does exist").  In that case we ignore the second one.
 + */
 +static void obj_request_existence_set(struct rbd_obj_request *obj_request,
 +                              bool exists)
 +{
 +      if (exists)
 +              set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
 +      set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
 +      smp_mb();
 +}
 +
 +static bool obj_request_known_test(struct rbd_obj_request *obj_request)
 +{
 +      smp_mb();
 +      return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
 +}
 +
 +static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
 +{
 +      smp_mb();
 +      return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
 +}
 +
  static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
  {
        dout("%s: obj %p (was %d)\n", __func__, obj_request,
@@@ -1363,11 -1101,9 +1363,11 @@@ static inline void rbd_img_obj_request_
  {
        rbd_assert(obj_request->img_request == NULL);
  
 -      rbd_obj_request_get(obj_request);
 +      /* Image request now owns object's original reference */
        obj_request->img_request = img_request;
        obj_request->which = img_request->obj_request_count;
 +      rbd_assert(!obj_request_img_data_test(obj_request));
 +      obj_request_img_data_set(obj_request);
        rbd_assert(obj_request->which != BAD_WHICH);
        img_request->obj_request_count++;
        list_add_tail(&obj_request->links, &img_request->obj_requests);
@@@ -1387,7 -1123,6 +1387,7 @@@ static inline void rbd_img_obj_request_
        img_request->obj_request_count--;
        rbd_assert(obj_request->which == img_request->obj_request_count);
        obj_request->which = BAD_WHICH;
 +      rbd_assert(obj_request_img_data_test(obj_request));
        rbd_assert(obj_request->img_request == img_request);
        obj_request->img_request = NULL;
        obj_request->callback = NULL;
@@@ -1406,6 -1141,76 +1406,6 @@@ static bool obj_request_type_valid(enu
        }
  }
  
 -static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 -{
 -      struct ceph_osd_req_op *op;
 -      va_list args;
 -      size_t size;
 -
 -      op = kzalloc(sizeof (*op), GFP_NOIO);
 -      if (!op)
 -              return NULL;
 -      op->op = opcode;
 -      va_start(args, opcode);
 -      switch (opcode) {
 -      case CEPH_OSD_OP_READ:
 -      case CEPH_OSD_OP_WRITE:
 -              /* rbd_osd_req_op_create(READ, offset, length) */
 -              /* rbd_osd_req_op_create(WRITE, offset, length) */
 -              op->extent.offset = va_arg(args, u64);
 -              op->extent.length = va_arg(args, u64);
 -              if (opcode == CEPH_OSD_OP_WRITE)
 -                      op->payload_len = op->extent.length;
 -              break;
 -      case CEPH_OSD_OP_STAT:
 -              break;
 -      case CEPH_OSD_OP_CALL:
 -              /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
 -              op->cls.class_name = va_arg(args, char *);
 -              size = strlen(op->cls.class_name);
 -              rbd_assert(size <= (size_t) U8_MAX);
 -              op->cls.class_len = size;
 -              op->payload_len = size;
 -
 -              op->cls.method_name = va_arg(args, char *);
 -              size = strlen(op->cls.method_name);
 -              rbd_assert(size <= (size_t) U8_MAX);
 -              op->cls.method_len = size;
 -              op->payload_len += size;
 -
 -              op->cls.argc = 0;
 -              op->cls.indata = va_arg(args, void *);
 -              size = va_arg(args, size_t);
 -              rbd_assert(size <= (size_t) U32_MAX);
 -              op->cls.indata_len = (u32) size;
 -              op->payload_len += size;
 -              break;
 -      case CEPH_OSD_OP_NOTIFY_ACK:
 -      case CEPH_OSD_OP_WATCH:
 -              /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
 -              /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
 -              op->watch.cookie = va_arg(args, u64);
 -              op->watch.ver = va_arg(args, u64);
 -              op->watch.ver = cpu_to_le64(op->watch.ver);
 -              if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
 -                      op->watch.flag = (u8) 1;
 -              break;
 -      default:
 -              rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
 -              kfree(op);
 -              op = NULL;
 -              break;
 -      }
 -      va_end(args);
 -
 -      return op;
 -}
 -
 -static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
 -{
 -      kfree(op);
 -}
 -
  static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
                                struct rbd_obj_request *obj_request)
  {
  
  static void rbd_img_request_complete(struct rbd_img_request *img_request)
  {
 +
        dout("%s: img %p\n", __func__, img_request);
 +
 +      /*
 +       * If no error occurred, compute the aggregate transfer
 +       * count for the image request.  We could instead use
 +       * atomic64_cmpxchg() to update it as each object request
 +       * completes; not clear which way is better off hand.
 +       */
 +      if (!img_request->result) {
 +              struct rbd_obj_request *obj_request;
 +              u64 xferred = 0;
 +
 +              for_each_obj_request(img_request, obj_request)
 +                      xferred += obj_request->xferred;
 +              img_request->xferred = xferred;
 +      }
 +
        if (img_request->callback)
                img_request->callback(img_request);
        else
@@@ -1449,56 -1237,39 +1449,56 @@@ static int rbd_obj_request_wait(struct 
        return wait_for_completion_interruptible(&obj_request->completion);
  }
  
 -static void obj_request_done_init(struct rbd_obj_request *obj_request)
 +/*
 + * The default/initial value for all image request flags is 0.  Each
 + * is conditionally set to 1 at image request initialization time
 + * and currently never change thereafter.
 + */
 +static void img_request_write_set(struct rbd_img_request *img_request)
  {
 -      atomic_set(&obj_request->done, 0);
 -      smp_wmb();
 +      set_bit(IMG_REQ_WRITE, &img_request->flags);
 +      smp_mb();
  }
  
 -static void obj_request_done_set(struct rbd_obj_request *obj_request)
 +static bool img_request_write_test(struct rbd_img_request *img_request)
 +{
 +      smp_mb();
 +      return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
 +}
 +
 +static void img_request_child_set(struct rbd_img_request *img_request)
  {
 -      int done;
 +      set_bit(IMG_REQ_CHILD, &img_request->flags);
 +      smp_mb();
 +}
  
 -      done = atomic_inc_return(&obj_request->done);
 -      if (done > 1) {
 -              struct rbd_img_request *img_request = obj_request->img_request;
 -              struct rbd_device *rbd_dev;
 +static bool img_request_child_test(struct rbd_img_request *img_request)
 +{
 +      smp_mb();
 +      return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
 +}
  
 -              rbd_dev = img_request ? img_request->rbd_dev : NULL;
 -              rbd_warn(rbd_dev, "obj_request %p was already done\n",
 -                      obj_request);
 -      }
 +static void img_request_layered_set(struct rbd_img_request *img_request)
 +{
 +      set_bit(IMG_REQ_LAYERED, &img_request->flags);
 +      smp_mb();
  }
  
 -static bool obj_request_done_test(struct rbd_obj_request *obj_request)
 +static bool img_request_layered_test(struct rbd_img_request *img_request)
  {
        smp_mb();
 -      return atomic_read(&obj_request->done) != 0;
 +      return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
  }
  
  static void
  rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
  {
 +      u64 xferred = obj_request->xferred;
 +      u64 length = obj_request->length;
 +
        dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
                obj_request, obj_request->img_request, obj_request->result,
 -              obj_request->xferred, obj_request->length);
 +              xferred, length);
        /*
         * ENOENT means a hole in the image.  We zero-fill the
         * entire length of the request.  A short read also implies
         * update the xferred count to indicate the whole request
         * was satisfied.
         */
 -      BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
 +      rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
        if (obj_request->result == -ENOENT) {
 -              zero_bio_chain(obj_request->bio_list, 0);
 +              if (obj_request->type == OBJ_REQUEST_BIO)
 +                      zero_bio_chain(obj_request->bio_list, 0);
 +              else
 +                      zero_pages(obj_request->pages, 0, length);
                obj_request->result = 0;
 -              obj_request->xferred = obj_request->length;
 -      } else if (obj_request->xferred < obj_request->length &&
 -                      !obj_request->result) {
 -              zero_bio_chain(obj_request->bio_list, obj_request->xferred);
 -              obj_request->xferred = obj_request->length;
 +              obj_request->xferred = length;
 +      } else if (xferred < length && !obj_request->result) {
 +              if (obj_request->type == OBJ_REQUEST_BIO)
 +                      zero_bio_chain(obj_request->bio_list, xferred);
 +              else
 +                      zero_pages(obj_request->pages, xferred, length);
 +              obj_request->xferred = length;
        }
        obj_request_done_set(obj_request);
  }
@@@ -1542,23 -1308,9 +1542,23 @@@ static void rbd_osd_trivial_callback(st
  
  static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
  {
 -      dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
 -              obj_request->result, obj_request->xferred, obj_request->length);
 -      if (obj_request->img_request)
 +      struct rbd_img_request *img_request = NULL;
 +      struct rbd_device *rbd_dev = NULL;
 +      bool layered = false;
 +
 +      if (obj_request_img_data_test(obj_request)) {
 +              img_request = obj_request->img_request;
 +              layered = img_request && img_request_layered_test(img_request);
 +              rbd_dev = img_request->rbd_dev;
 +      }
 +
 +      dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
 +              obj_request, img_request, obj_request->result,
 +              obj_request->xferred, obj_request->length);
 +      if (layered && obj_request->result == -ENOENT &&
 +                      obj_request->img_offset < rbd_dev->parent_overlap)
 +              rbd_img_parent_read(obj_request);
 +      else if (img_request)
                rbd_img_obj_request_read_callback(obj_request);
        else
                obj_request_done_set(obj_request);
@@@ -1569,8 -1321,9 +1569,8 @@@ static void rbd_osd_write_callback(stru
        dout("%s: obj %p result %d %llu\n", __func__, obj_request,
                obj_request->result, obj_request->length);
        /*
 -       * There is no such thing as a successful short write.
 -       * Our xferred value is the number of bytes transferred
 -       * back.  Set it to our originally-requested length.
 +       * There is no such thing as a successful short write.  Set
 +       * the xferred count to our originally-requested length.
         */
        obj_request->xferred = obj_request->length;
        obj_request_done_set(obj_request);
@@@ -1594,25 -1347,22 +1594,25 @@@ static void rbd_osd_req_callback(struc
  
        dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
        rbd_assert(osd_req == obj_request->osd_req);
 -      rbd_assert(!!obj_request->img_request ^
 -                              (obj_request->which == BAD_WHICH));
 +      if (obj_request_img_data_test(obj_request)) {
 +              rbd_assert(obj_request->img_request);
 +              rbd_assert(obj_request->which != BAD_WHICH);
 +      } else {
 +              rbd_assert(obj_request->which == BAD_WHICH);
 +      }
  
        if (osd_req->r_result < 0)
                obj_request->result = osd_req->r_result;
 -      obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
  
 -      WARN_ON(osd_req->r_num_ops != 1);       /* For now */
 +      BUG_ON(osd_req->r_num_ops > 2);
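 +      /* a copyup request carries two ops; everything else has just one */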
  
        /*
         * We support a 64-bit length, but ultimately it has to be
         * passed to blk_end_request(), which takes an unsigned int.
         */
        obj_request->xferred = osd_req->r_reply_op_len[0];
 -      rbd_assert(obj_request->xferred < (u64) UINT_MAX);
 -      opcode = osd_req->r_request_ops[0].op;
 +      rbd_assert(obj_request->xferred < (u64)UINT_MAX);
 +      opcode = osd_req->r_ops[0].op;
        switch (opcode) {
        case CEPH_OSD_OP_READ:
                rbd_osd_read_callback(obj_request);
                rbd_obj_request_complete(obj_request);
  }
  
 +static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request = obj_request->img_request;
 +      struct ceph_osd_request *osd_req = obj_request->osd_req;
 +      u64 snap_id;
 +
 +      rbd_assert(osd_req != NULL);
 +
 +      snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
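 +      /* reads carry the mapped snapshot id, but no snap context or mtime */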
 +      ceph_osdc_build_request(osd_req, obj_request->offset,
 +                      NULL, snap_id, NULL);
 +}
 +
 +static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request = obj_request->img_request;
 +      struct ceph_osd_request *osd_req = obj_request->osd_req;
 +      struct ceph_snap_context *snapc;
 +      struct timespec mtime = CURRENT_TIME;
 +
 +      rbd_assert(osd_req != NULL);
 +
 +      snapc = img_request ? img_request->snapc : NULL;
 +      ceph_osdc_build_request(osd_req, obj_request->offset,
 +                      snapc, CEPH_NOSNAP, &mtime);
 +}
 +
  static struct ceph_osd_request *rbd_osd_req_create(
                                        struct rbd_device *rbd_dev,
                                        bool write_request,
 -                                      struct rbd_obj_request *obj_request,
 -                                      struct ceph_osd_req_op *op)
 +                                      struct rbd_obj_request *obj_request)
  {
 -      struct rbd_img_request *img_request = obj_request->img_request;
        struct ceph_snap_context *snapc = NULL;
        struct ceph_osd_client *osdc;
        struct ceph_osd_request *osd_req;
 -      struct timespec now;
 -      struct timespec *mtime;
 -      u64 snap_id = CEPH_NOSNAP;
 -      u64 offset = obj_request->offset;
 -      u64 length = obj_request->length;
  
 -      if (img_request) {
 -              rbd_assert(img_request->write_request == write_request);
 -              if (img_request->write_request)
 -                      snapc = img_request->snapc;
 -              else
 -                      snap_id = img_request->snap_id;
 +      if (obj_request_img_data_test(obj_request)) {
 +              struct rbd_img_request *img_request = obj_request->img_request;
 +
 +              rbd_assert(write_request ==
 +                              img_request_write_test(img_request));
 +              if (write_request)
 +                      snapc = img_request->snapc;
        }
  
        /* Allocate and initialize the request, for the single op */
        if (!osd_req)
                return NULL;    /* ENOMEM */
  
 -      rbd_assert(obj_request_type_valid(obj_request->type));
 -      switch (obj_request->type) {
 -      case OBJ_REQUEST_NODATA:
 -              break;          /* Nothing to do */
 -      case OBJ_REQUEST_BIO:
 -              rbd_assert(obj_request->bio_list != NULL);
 -              osd_req->r_bio = obj_request->bio_list;
 -              break;
 -      case OBJ_REQUEST_PAGES:
 -              osd_req->r_pages = obj_request->pages;
 -              osd_req->r_num_pages = obj_request->page_count;
 -              osd_req->r_page_alignment = offset & ~PAGE_MASK;
 -              break;
 -      }
 -
 -      if (write_request) {
 +      if (write_request)
                osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
 -              now = CURRENT_TIME;
 -              mtime = &now;
 -      } else {
 +      else
                osd_req->r_flags = CEPH_OSD_FLAG_READ;
 -              mtime = NULL;   /* not needed for reads */
 -              offset = 0;     /* These are not used... */
 -              length = 0;     /* ...for osd read requests */
 -      }
  
        osd_req->r_callback = rbd_osd_req_callback;
        osd_req->r_priv = obj_request;
  
        osd_req->r_file_layout = rbd_dev->layout;       /* struct */
  
 -      /* osd_req will get its own reference to snapc (if non-null) */
 +      return osd_req;
 +}
  
 -      ceph_osdc_build_request(osd_req, offset, length, 1, op,
 -                              snapc, snap_id, mtime);
 +/*
 + * Create a copyup osd request based on the information in the
 + * object request supplied.  A copyup request has two osd ops:
 + * a copyup method call and a "normal" write request.
 + */
 +static struct ceph_osd_request *
 +rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request;
 +      struct ceph_snap_context *snapc;
 +      struct rbd_device *rbd_dev;
 +      struct ceph_osd_client *osdc;
 +      struct ceph_osd_request *osd_req;
 +
 +      rbd_assert(obj_request_img_data_test(obj_request));
 +      img_request = obj_request->img_request;
 +      rbd_assert(img_request);
 +      rbd_assert(img_request_write_test(img_request));
 +
 +      /* Allocate and initialize the request, for the two ops */
 +
 +      snapc = img_request->snapc;
 +      rbd_dev = img_request->rbd_dev;
 +      osdc = &rbd_dev->rbd_client->client->osdc;
 +      osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
 +      if (!osd_req)
 +              return NULL;    /* ENOMEM */
 +
 +      osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
 +      osd_req->r_callback = rbd_osd_req_callback;
 +      osd_req->r_priv = obj_request;
 +
 +      osd_req->r_oid_len = strlen(obj_request->object_name);
 +      rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
 +      memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
 +
 +      osd_req->r_file_layout = rbd_dev->layout;       /* struct */
  
        return osd_req;
  }
  
 +
  static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
  {
        ceph_osdc_put_request(osd_req);
@@@ -1767,23 -1480,18 +1767,23 @@@ static struct rbd_obj_request *rbd_obj_
        rbd_assert(obj_request_type_valid(type));
  
        size = strlen(object_name) + 1;
 -      obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
 -      if (!obj_request)
 +      name = kmalloc(size, GFP_KERNEL);
 +      if (!name)
                return NULL;
  
 -      name = (char *)(obj_request + 1);
 +      obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
 +      if (!obj_request) {
 +              kfree(name);
 +              return NULL;
 +      }
 +
        obj_request->object_name = memcpy(name, object_name, size);
        obj_request->offset = offset;
        obj_request->length = length;
 +      obj_request->flags = 0;
        obj_request->which = BAD_WHICH;
        obj_request->type = type;
        INIT_LIST_HEAD(&obj_request->links);
 -      obj_request_done_init(obj_request);
        init_completion(&obj_request->completion);
        kref_init(&obj_request->kref);
  
@@@ -1822,9 -1530,7 +1822,9 @@@ static void rbd_obj_request_destroy(str
                break;
        }
  
 -      kfree(obj_request);
 +      kfree(obj_request->object_name);
 +      obj_request->object_name = NULL;
 +      kmem_cache_free(rbd_obj_request_cache, obj_request);
  }
  
  /*
  static struct rbd_img_request *rbd_img_request_create(
                                        struct rbd_device *rbd_dev,
                                        u64 offset, u64 length,
 -                                      bool write_request)
 +                                      bool write_request,
 +                                      bool child_request)
  {
        struct rbd_img_request *img_request;
 -      struct ceph_snap_context *snapc = NULL;
  
 -      img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
 +      img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
        if (!img_request)
                return NULL;
  
        if (write_request) {
                down_read(&rbd_dev->header_rwsem);
 -              snapc = ceph_get_snap_context(rbd_dev->header.snapc);
 +              ceph_get_snap_context(rbd_dev->header.snapc);
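 +              /* this reference is dropped again in rbd_img_request_destroy() */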
                up_read(&rbd_dev->header_rwsem);
 -              if (WARN_ON(!snapc)) {
 -                      kfree(img_request);
 -                      return NULL;    /* Shouldn't happen */
 -              }
        }
  
        img_request->rq = NULL;
        img_request->rbd_dev = rbd_dev;
        img_request->offset = offset;
        img_request->length = length;
 -      img_request->write_request = write_request;
 -      if (write_request)
 -              img_request->snapc = snapc;
 -      else
 +      img_request->flags = 0;
 +      if (write_request) {
 +              img_request_write_set(img_request);
 +              img_request->snapc = rbd_dev->header.snapc;
 +      } else {
                img_request->snap_id = rbd_dev->spec->snap_id;
 +      }
 +      if (child_request)
 +              img_request_child_set(img_request);
 +      if (rbd_dev->parent_spec)
 +              img_request_layered_set(img_request);
        spin_lock_init(&img_request->completion_lock);
        img_request->next_completion = 0;
        img_request->callback = NULL;
 +      img_request->result = 0;
        img_request->obj_request_count = 0;
        INIT_LIST_HEAD(&img_request->obj_requests);
        kref_init(&img_request->kref);
@@@ -1897,204 -1600,78 +1897,204 @@@ static void rbd_img_request_destroy(str
                rbd_img_obj_request_del(img_request, obj_request);
        rbd_assert(img_request->obj_request_count == 0);
  
 -      if (img_request->write_request)
 +      if (img_request_write_test(img_request))
                ceph_put_snap_context(img_request->snapc);
  
 -      kfree(img_request);
 +      if (img_request_child_test(img_request))
 +              rbd_obj_request_put(img_request->obj_request);
 +
 +      kmem_cache_free(rbd_img_request_cache, img_request);
 +}
 +
 +static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request;
 +      unsigned int xferred;
 +      int result;
 +      bool more;
 +
 +      rbd_assert(obj_request_img_data_test(obj_request));
 +      img_request = obj_request->img_request;
 +
 +      rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
 +      xferred = (unsigned int)obj_request->xferred;
 +      result = obj_request->result;
 +      if (result) {
 +              struct rbd_device *rbd_dev = img_request->rbd_dev;
 +
 +              rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
 +                      img_request_write_test(img_request) ? "write" : "read",
 +                      obj_request->length, obj_request->img_offset,
 +                      obj_request->offset);
 +              rbd_warn(rbd_dev, "  result %d xferred %x\n",
 +                      result, xferred);
 +              if (!img_request->result)
 +                      img_request->result = result;
 +      }
 +
 +      /* Image object requests don't own their page array */
 +
 +      if (obj_request->type == OBJ_REQUEST_PAGES) {
 +              obj_request->pages = NULL;
 +              obj_request->page_count = 0;
 +      }
 +
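 +      /*
 +       * For a child (parent read) image request there is no block
 +       * request to end; "more" just reports whether further object
 +       * requests remain.  For a top-level request, blk_end_request()
 +       * indicates whether the block layer request still has more
 +       * left to complete.
 +       */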
 +      if (img_request_child_test(img_request)) {
 +              rbd_assert(img_request->obj_request != NULL);
 +              more = obj_request->which < img_request->obj_request_count - 1;
 +      } else {
 +              rbd_assert(img_request->rq != NULL);
 +              more = blk_end_request(img_request->rq, result, xferred);
 +      }
 +
 +      return more;
 +}
 +
 +static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request;
 +      u32 which = obj_request->which;
 +      bool more = true;
 +
 +      rbd_assert(obj_request_img_data_test(obj_request));
 +      img_request = obj_request->img_request;
 +
 +      dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
 +      rbd_assert(img_request != NULL);
 +      rbd_assert(img_request->obj_request_count > 0);
 +      rbd_assert(which != BAD_WHICH);
 +      rbd_assert(which < img_request->obj_request_count);
 +      rbd_assert(which >= img_request->next_completion);
 +
 +      spin_lock_irq(&img_request->completion_lock);
 +      if (which != img_request->next_completion)
 +              goto out;
 +
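 +      /*
 +       * Object requests can complete out of order.  Walk forward
 +       * from this one, ending each request that has already
 +       * finished, and stop at the first one still outstanding;
 +       * that index becomes the new next_completion.
 +       */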
 +      for_each_obj_request_from(img_request, obj_request) {
 +              rbd_assert(more);
 +              rbd_assert(which < img_request->obj_request_count);
 +
 +              if (!obj_request_done_test(obj_request))
 +                      break;
 +              more = rbd_img_obj_end_request(obj_request);
 +              which++;
 +      }
 +
 +      rbd_assert(more ^ (which == img_request->obj_request_count));
 +      img_request->next_completion = which;
 +out:
 +      spin_unlock_irq(&img_request->completion_lock);
 +
 +      if (!more)
 +              rbd_img_request_complete(img_request);
  }
  
 -static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
 -                                      struct bio *bio_list)
 +/*
 + * Split up an image request into one or more object requests, each
 + * to a different object.  The "type" parameter indicates whether
 + * "data_desc" is the pointer to the head of a list of bio
 + * structures, or the base of a page array.  In either case this
 + * function assumes data_desc describes memory sufficient to hold
 + * all data described by the image request.
 + */
 +static int rbd_img_request_fill(struct rbd_img_request *img_request,
 +                                      enum obj_request_type type,
 +                                      void *data_desc)
  {
        struct rbd_device *rbd_dev = img_request->rbd_dev;
        struct rbd_obj_request *obj_request = NULL;
        struct rbd_obj_request *next_obj_request;
 -      unsigned int bio_offset;
 -      u64 image_offset;
 +      bool write_request = img_request_write_test(img_request);
 +      struct bio *bio_list;
 +      unsigned int bio_offset = 0;
 +      struct page **pages;
 +      u64 img_offset;
        u64 resid;
        u16 opcode;
  
 -      dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
 +      dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
 +              (int)type, data_desc);
  
 -      opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
 -                                            : CEPH_OSD_OP_READ;
 -      bio_offset = 0;
 -      image_offset = img_request->offset;
 -      rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
 +      opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
 +      img_offset = img_request->offset;
        resid = img_request->length;
        rbd_assert(resid > 0);
 +
 +      if (type == OBJ_REQUEST_BIO) {
 +              bio_list = data_desc;
 +              rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
 +      } else {
 +              rbd_assert(type == OBJ_REQUEST_PAGES);
 +              pages = data_desc;
 +      }
 +
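 +      /*
 +       * Walk the image extent one object (segment) at a time, giving
 +       * each object request only the bios or pages that cover its
 +       * part of the image request.
 +       */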
        while (resid) {
 +              struct ceph_osd_request *osd_req;
                const char *object_name;
 -              unsigned int clone_size;
 -              struct ceph_osd_req_op *op;
                u64 offset;
                u64 length;
  
 -              object_name = rbd_segment_name(rbd_dev, image_offset);
 +              object_name = rbd_segment_name(rbd_dev, img_offset);
                if (!object_name)
                        goto out_unwind;
 -              offset = rbd_segment_offset(rbd_dev, image_offset);
 -              length = rbd_segment_length(rbd_dev, image_offset, resid);
 +              offset = rbd_segment_offset(rbd_dev, img_offset);
 +              length = rbd_segment_length(rbd_dev, img_offset, resid);
                obj_request = rbd_obj_request_create(object_name,
 -                                              offset, length,
 -                                              OBJ_REQUEST_BIO);
 -              kfree(object_name);     /* object request has its own copy */
 +                                              offset, length, type);
 +              /* object request has its own copy of the object name */
 +              rbd_segment_name_free(object_name);
                if (!obj_request)
                        goto out_unwind;
  
 -              rbd_assert(length <= (u64) UINT_MAX);
 -              clone_size = (unsigned int) length;
 -              obj_request->bio_list = bio_chain_clone_range(&bio_list,
 -                                              &bio_offset, clone_size,
 -                                              GFP_ATOMIC);
 -              if (!obj_request->bio_list)
 -                      goto out_partial;
 +              if (type == OBJ_REQUEST_BIO) {
 +                      unsigned int clone_size;
 +
 +                      rbd_assert(length <= (u64)UINT_MAX);
 +                      clone_size = (unsigned int)length;
 +                      obj_request->bio_list =
 +                                      bio_chain_clone_range(&bio_list,
 +                                                              &bio_offset,
 +                                                              clone_size,
 +                                                              GFP_ATOMIC);
 +                      if (!obj_request->bio_list)
 +                              goto out_partial;
 +              } else {
 +                      unsigned int page_count;
 +
 +                      obj_request->pages = pages;
 +                      page_count = (u32)calc_pages_for(offset, length);
 +                      obj_request->page_count = page_count;
 +                      if ((offset + length) & ~PAGE_MASK)
 +                              page_count--;   /* more on last page */
 +                      pages += page_count;
 +              }
  
 -              /*
 -               * Build up the op to use in building the osd
 -               * request.  Note that the contents of the op are
 -               * copied by rbd_osd_req_create().
 -               */
 -              op = rbd_osd_req_op_create(opcode, offset, length);
 -              if (!op)
 -                      goto out_partial;
 -              obj_request->osd_req = rbd_osd_req_create(rbd_dev,
 -                                              img_request->write_request,
 -                                              obj_request, op);
 -              rbd_osd_req_op_destroy(op);
 -              if (!obj_request->osd_req)
 +              osd_req = rbd_osd_req_create(rbd_dev, write_request,
 +                                              obj_request);
 +              if (!osd_req)
                        goto out_partial;
 -              /* status and version are initially zero-filled */
 +              obj_request->osd_req = osd_req;
 +              obj_request->callback = rbd_img_obj_callback;
 +
 +              osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
 +                                              0, 0);
 +              if (type == OBJ_REQUEST_BIO)
 +                      osd_req_op_extent_osd_data_bio(osd_req, 0,
 +                                      obj_request->bio_list, length);
 +              else
 +                      osd_req_op_extent_osd_data_pages(osd_req, 0,
 +                                      obj_request->pages, length,
 +                                      offset & ~PAGE_MASK, false, false);
  
 +              if (write_request)
 +                      rbd_osd_req_format_write(obj_request);
 +              else
 +                      rbd_osd_req_format_read(obj_request);
 +
 +              obj_request->img_offset = img_offset;
                rbd_img_obj_request_add(img_request, obj_request);
  
 -              image_offset += length;
 +              img_offset += length;
                resid -= length;
        }
  
@@@ -2109,495 -1686,88 +2109,495 @@@ out_unwind
        return -ENOMEM;
  }
  
 -static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
 +static void
 +rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
  {
        struct rbd_img_request *img_request;
 -      u32 which = obj_request->which;
 -      bool more = true;
 +      struct rbd_device *rbd_dev;
 +      u64 length;
 +      u32 page_count;
  
 +      rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
 +      rbd_assert(obj_request_img_data_test(obj_request));
        img_request = obj_request->img_request;
 +      rbd_assert(img_request);
  
 -      dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
 +      rbd_dev = img_request->rbd_dev;
 +      rbd_assert(rbd_dev);
 +      length = (u64)1 << rbd_dev->header.obj_order;
 +      page_count = (u32)calc_pages_for(0, length);
 +
 +      rbd_assert(obj_request->copyup_pages);
 +      ceph_release_page_vector(obj_request->copyup_pages, page_count);
 +      obj_request->copyup_pages = NULL;
 +
 +      /*
 +       * We want the transfer count to reflect the size of the
 +       * original write request.  There is no such thing as a
 +       * successful short write, so if the request was successful
 +       * we can just set it to the originally-requested length.
 +       */
 +      if (!obj_request->result)
 +              obj_request->xferred = obj_request->length;
 +
 +      /* Finish up with the normal image object callback */
 +
 +      rbd_img_obj_callback(obj_request);
 +}
 +
 +static void
 +rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
 +{
 +      struct rbd_obj_request *orig_request;
 +      struct ceph_osd_request *osd_req;
 +      struct ceph_osd_client *osdc;
 +      struct rbd_device *rbd_dev;
 +      struct page **pages;
 +      int result;
 +      u64 obj_size;
 +      u64 xferred;
 +
 +      rbd_assert(img_request_child_test(img_request));
 +
 +      /* First get what we need from the image request */
 +
 +      pages = img_request->copyup_pages;
 +      rbd_assert(pages != NULL);
 +      img_request->copyup_pages = NULL;
 +
 +      orig_request = img_request->obj_request;
 +      rbd_assert(orig_request != NULL);
 +      rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
 +      result = img_request->result;
 +      obj_size = img_request->length;
 +      xferred = img_request->xferred;
 +
 +      rbd_dev = img_request->rbd_dev;
 +      rbd_assert(rbd_dev);
 +      rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
 +
 +      rbd_img_request_put(img_request);
 +
 +      if (result)
 +              goto out_err;
 +
 +      /* Allocate the new copyup osd request for the original request */
 +
 +      result = -ENOMEM;
 +      rbd_assert(!orig_request->osd_req);
 +      osd_req = rbd_osd_req_create_copyup(orig_request);
 +      if (!osd_req)
 +              goto out_err;
 +      orig_request->osd_req = osd_req;
 +      orig_request->copyup_pages = pages;
 +
 +      /* Initialize the copyup op */
 +
 +      osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
 +      osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
 +                                              false, false);
 +
 +      /* Then the original write request op */
 +
 +      osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
 +                                      orig_request->offset,
 +                                      orig_request->length, 0, 0);
 +      osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
 +                                      orig_request->length);
 +
 +      rbd_osd_req_format_write(orig_request);
 +
 +      /* All set, send it off. */
 +
 +      orig_request->callback = rbd_img_obj_copyup_callback;
 +      osdc = &rbd_dev->rbd_client->client->osdc;
 +      result = rbd_obj_request_submit(osdc, orig_request);
 +      if (!result)
 +              return;
 +out_err:
 +      /* Record the error code and complete the request */
 +
 +      orig_request->result = result;
 +      orig_request->xferred = 0;
 +      obj_request_done_set(orig_request);
 +      rbd_obj_request_complete(orig_request);
 +}
 +
 +/*
 + * Read from the parent image the range of data that covers the
 + * entire target of the given object request.  This is used for
 + * satisfying a layered image write request when the target of an
 + * object request from the image request does not exist.
 + *
 + * A page array big enough to hold the returned data is allocated
 + * and supplied to rbd_img_request_fill() as the "data descriptor."
 + * When the read completes, this page array will be transferred to
 + * the original object request for the copyup operation.
 + *
 + * If an error occurs, record it as the result of the original
 + * object request and mark it done so it gets completed.
 + */
 +static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request = NULL;
 +      struct rbd_img_request *parent_request = NULL;
 +      struct rbd_device *rbd_dev;
 +      u64 img_offset;
 +      u64 length;
 +      struct page **pages = NULL;
 +      u32 page_count;
 +      int result;
 +
 +      rbd_assert(obj_request_img_data_test(obj_request));
 +      rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
 +
 +      img_request = obj_request->img_request;
        rbd_assert(img_request != NULL);
 -      rbd_assert(img_request->rq != NULL);
 -      rbd_assert(img_request->obj_request_count > 0);
 -      rbd_assert(which != BAD_WHICH);
 -      rbd_assert(which < img_request->obj_request_count);
 -      rbd_assert(which >= img_request->next_completion);
 +      rbd_dev = img_request->rbd_dev;
 +      rbd_assert(rbd_dev->parent != NULL);
  
 -      spin_lock_irq(&img_request->completion_lock);
 -      if (which != img_request->next_completion)
 -              goto out;
 +      /*
 +       * First things first.  The original osd request is of no
 +       * use to us any more; we'll need a new one that can hold
 +       * the two ops in a copyup request.  We'll get that later,
 +       * but for now we can release the old one.
 +       */
 +      rbd_osd_req_destroy(obj_request->osd_req);
 +      obj_request->osd_req = NULL;
  
 -      for_each_obj_request_from(img_request, obj_request) {
 -              unsigned int xferred;
 -              int result;
 +      /*
 +       * Determine the byte range covered by the object in the
 +       * child image to which the original request was to be sent.
 +       */
 +      img_offset = obj_request->img_offset - obj_request->offset;
 +      length = (u64)1 << rbd_dev->header.obj_order;
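 +      /*
 +       * For example, with obj_order 22 (4 MiB objects), a request at
 +       * img_offset 5 MiB with offset 1 MiB into its object maps to
 +       * the object covering image bytes [4 MiB, 8 MiB).
 +       */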
  
 -              rbd_assert(more);
 -              rbd_assert(which < img_request->obj_request_count);
 +      /*
 +       * There is no defined parent data beyond the parent
 +       * overlap, so limit what we read at that boundary if
 +       * necessary.
 +       */
 +      if (img_offset + length > rbd_dev->parent_overlap) {
 +              rbd_assert(img_offset < rbd_dev->parent_overlap);
 +              length = rbd_dev->parent_overlap - img_offset;
 +      }
  
 -              if (!obj_request_done_test(obj_request))
 -                      break;
 +      /*
 +       * Allocate a page array big enough to receive the data read
 +       * from the parent.
 +       */
 +      page_count = (u32)calc_pages_for(0, length);
 +      pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
 +      if (IS_ERR(pages)) {
 +              result = PTR_ERR(pages);
 +              pages = NULL;
 +              goto out_err;
 +      }
  
 -              rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
 -              xferred = (unsigned int) obj_request->xferred;
 -              result = (int) obj_request->result;
 -              if (result)
 -                      rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
 -                              img_request->write_request ? "write" : "read",
 -                              result, xferred);
 +      result = -ENOMEM;
 +      parent_request = rbd_img_request_create(rbd_dev->parent,
 +                                              img_offset, length,
 +                                              false, true);
 +      if (!parent_request)
 +              goto out_err;
 +      rbd_obj_request_get(obj_request);
 +      parent_request->obj_request = obj_request;
  
 -              more = blk_end_request(img_request->rq, result, xferred);
 -              which++;
 +      result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
 +      if (result)
 +              goto out_err;
 +      parent_request->copyup_pages = pages;
 +
 +      parent_request->callback = rbd_img_obj_parent_read_full_callback;
 +      result = rbd_img_request_submit(parent_request);
 +      if (!result)
 +              return 0;
 +
 +      parent_request->copyup_pages = NULL;
 +      parent_request->obj_request = NULL;
 +      rbd_obj_request_put(obj_request);
 +out_err:
 +      if (pages)
 +              ceph_release_page_vector(pages, page_count);
 +      if (parent_request)
 +              rbd_img_request_put(parent_request);
 +      obj_request->result = result;
 +      obj_request->xferred = 0;
 +      obj_request_done_set(obj_request);
 +
 +      return result;
 +}
 +
 +static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_obj_request *orig_request;
 +      int result;
 +
 +      rbd_assert(!obj_request_img_data_test(obj_request));
 +
 +      /*
 +       * All we need from the object request is the original
 +       * request and the result of the STAT op.  Grab those, then
 +       * we're done with the request.
 +       */
 +      orig_request = obj_request->obj_request;
 +      obj_request->obj_request = NULL;
 +      rbd_assert(orig_request);
 +      rbd_assert(orig_request->img_request);
 +
 +      result = obj_request->result;
 +      obj_request->result = 0;
 +
 +      dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
 +              obj_request, orig_request, result,
 +              obj_request->xferred, obj_request->length);
 +      rbd_obj_request_put(obj_request);
 +
 +      rbd_assert(orig_request);
 +      rbd_assert(orig_request->img_request);
 +
 +      /*
 +       * Our only purpose here is to determine whether the object
 +       * exists, and we don't want to treat the non-existence as
 +       * an error.  If something else comes back, transfer the
 +       * error to the original request and complete it now.
 +       */
 +      if (!result) {
 +              obj_request_existence_set(orig_request, true);
 +      } else if (result == -ENOENT) {
 +              obj_request_existence_set(orig_request, false);
 +      } else if (result) {
 +              orig_request->result = result;
 +              goto out;
        }
  
 -      rbd_assert(more ^ (which == img_request->obj_request_count));
 -      img_request->next_completion = which;
 +      /*
 +       * Resubmit the original request now that we have recorded
 +       * whether the target object exists.
 +       */
 +      orig_request->result = rbd_img_obj_request_submit(orig_request);
  out:
 -      spin_unlock_irq(&img_request->completion_lock);
 +      if (orig_request->result)
 +              rbd_obj_request_complete(orig_request);
 +      rbd_obj_request_put(orig_request);
 +}
  
 -      if (!more)
 -              rbd_img_request_complete(img_request);
 +static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_obj_request *stat_request;
 +      struct rbd_device *rbd_dev;
 +      struct ceph_osd_client *osdc;
 +      struct page **pages = NULL;
 +      u32 page_count;
 +      size_t size;
 +      int ret;
 +
 +      /*
 +       * The response data for a STAT call consists of:
 +       *     le64 length;
 +       *     struct {
 +       *         le32 tv_sec;
 +       *         le32 tv_nsec;
 +       *     } mtime;
 +       */
 +      size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
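 +      /* 16 bytes in all, so the page vector below is a single page */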
 +      page_count = (u32)calc_pages_for(0, size);
 +      pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
 +      if (IS_ERR(pages))
 +              return PTR_ERR(pages);
 +
 +      ret = -ENOMEM;
 +      stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
 +                                                      OBJ_REQUEST_PAGES);
 +      if (!stat_request)
 +              goto out;
 +
 +      rbd_obj_request_get(obj_request);
 +      stat_request->obj_request = obj_request;
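 +      /*
 +       * This link lets the exists callback find the original object
 +       * request so it can record existence and resubmit it.
 +       */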
 +      stat_request->pages = pages;
 +      stat_request->page_count = page_count;
 +
 +      rbd_assert(obj_request->img_request);
 +      rbd_dev = obj_request->img_request->rbd_dev;
 +      stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
 +                                              stat_request);
 +      if (!stat_request->osd_req)
 +              goto out;
 +      stat_request->callback = rbd_img_obj_exists_callback;
 +
 +      osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
 +      osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
 +                                      false, false);
 +      rbd_osd_req_format_read(stat_request);
 +
 +      osdc = &rbd_dev->rbd_client->client->osdc;
 +      ret = rbd_obj_request_submit(osdc, stat_request);
 +out:
 +      if (ret)
 +              rbd_obj_request_put(obj_request);
 +
 +      return ret;
 +}
 +
 +static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_img_request *img_request;
 +      struct rbd_device *rbd_dev;
 +      bool known;
 +
 +      rbd_assert(obj_request_img_data_test(obj_request));
 +
 +      img_request = obj_request->img_request;
 +      rbd_assert(img_request);
 +      rbd_dev = img_request->rbd_dev;
 +
 +      /*
 +       * Only writes to layered images need special handling.
 +       * Reads and non-layered writes are simple object requests.
 +       * Layered writes that start beyond the end of the overlap
 +       * with the parent have no parent data, so they too are
 +       * simple object requests.  Finally, if the target object is
 +       * known to already exist, its parent data has already been
 +       * copied, so a write to the object can also be handled as a
 +       * simple object request.
 +       */
 +      if (!img_request_write_test(img_request) ||
 +              !img_request_layered_test(img_request) ||
 +              rbd_dev->parent_overlap <= obj_request->img_offset ||
 +              ((known = obj_request_known_test(obj_request)) &&
 +                      obj_request_exists_test(obj_request))) {
 +
 +              struct rbd_device *rbd_dev;
 +              struct ceph_osd_client *osdc;
 +
 +              rbd_dev = obj_request->img_request->rbd_dev;
 +              osdc = &rbd_dev->rbd_client->client->osdc;
 +
 +              return rbd_obj_request_submit(osdc, obj_request);
 +      }
 +
 +      /*
 +       * It's a layered write.  The target object might exist but
 +       * we may not know that yet.  If we know it doesn't exist,
 +       * start by reading the data for the full target object from
 +       * the parent so we can use it for a copyup to the target.
 +       */
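 +      /*
 +       * Note that if we get here every branch of the condition above
 +       * evaluated false, so "known" was assigned by its last clause
 +       * and is safe to test.
 +       */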
 +      if (known)
 +              return rbd_img_obj_parent_read_full(obj_request);
 +
 +      /* We don't know whether the target exists.  Go find out. */
 +
 +      return rbd_img_obj_exists_submit(obj_request);
  }
  
  static int rbd_img_request_submit(struct rbd_img_request *img_request)
  {
 -      struct rbd_device *rbd_dev = img_request->rbd_dev;
 -      struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
 +      struct rbd_obj_request *next_obj_request;
  
        dout("%s: img %p\n", __func__, img_request);
 -      for_each_obj_request(img_request, obj_request) {
 +      for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
                int ret;
  
 -              obj_request->callback = rbd_img_obj_callback;
 -              ret = rbd_obj_request_submit(osdc, obj_request);
 +              ret = rbd_img_obj_request_submit(obj_request);
                if (ret)
                        return ret;
 -              /*
 -               * The image request has its own reference to each
 -               * of its object requests, so we can safely drop the
 -               * initial one here.
 -               */
 -              rbd_obj_request_put(obj_request);
        }
  
 -      return 0;
 +      return 0;
 +}
 +
 +static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
 +{
 +      struct rbd_obj_request *obj_request;
 +      struct rbd_device *rbd_dev;
 +      u64 obj_end;
 +
 +      rbd_assert(img_request_child_test(img_request));
 +
 +      obj_request = img_request->obj_request;
 +      rbd_assert(obj_request);
 +      rbd_assert(obj_request->img_request);
 +
 +      obj_request->result = img_request->result;
 +      if (obj_request->result)
 +              goto out;
 +
 +      /*
 +       * We need to zero anything beyond the parent overlap
 +       * boundary.  Since rbd_img_obj_request_read_callback()
 +       * will zero anything beyond the end of a short read, an
 +       * easy way to do this is to pretend the data from the
 +       * parent came up short--ending at the overlap boundary.
 +       */
 +      rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
 +      obj_end = obj_request->img_offset + obj_request->length;
 +      rbd_dev = obj_request->img_request->rbd_dev;
 +      if (obj_end > rbd_dev->parent_overlap) {
 +              u64 xferred = 0;
 +
 +              if (obj_request->img_offset < rbd_dev->parent_overlap)
 +                      xferred = rbd_dev->parent_overlap -
 +                                      obj_request->img_offset;
 +
 +              obj_request->xferred = min(img_request->xferred, xferred);
 +      } else {
 +              obj_request->xferred = img_request->xferred;
 +      }
 +out:
 +      rbd_img_request_put(img_request);
 +      rbd_img_obj_request_read_callback(obj_request);
 +      rbd_obj_request_complete(obj_request);
 +}
 +
 +static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
 +{
 +      struct rbd_device *rbd_dev;
 +      struct rbd_img_request *img_request;
 +      int result;
 +
 +      rbd_assert(obj_request_img_data_test(obj_request));
 +      rbd_assert(obj_request->img_request != NULL);
 +      rbd_assert(obj_request->result == (s32) -ENOENT);
 +      rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
 +
 +      rbd_dev = obj_request->img_request->rbd_dev;
 +      rbd_assert(rbd_dev->parent != NULL);
 +      /* rbd_read_finish(obj_request, obj_request->length); */
 +      img_request = rbd_img_request_create(rbd_dev->parent,
 +                                              obj_request->img_offset,
 +                                              obj_request->length,
 +                                              false, true);
 +      result = -ENOMEM;
 +      if (!img_request)
 +              goto out_err;
 +
 +      rbd_obj_request_get(obj_request);
 +      img_request->obj_request = obj_request;
 +
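 +      /*
 +       * The child request is filled with the original request's bio
 +       * list, so data read from the parent lands directly in the
 +       * caller's buffers; rbd_img_parent_read_callback() then fixes
 +       * up xferred and completes the original object request.
 +       */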
 +      result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
 +                                      obj_request->bio_list);
 +      if (result)
 +              goto out_err;
 +
 +      img_request->callback = rbd_img_parent_read_callback;
 +      result = rbd_img_request_submit(img_request);
 +      if (result)
 +              goto out_err;
 +
 +      return;
 +out_err:
 +      if (img_request)
 +              rbd_img_request_put(img_request);
 +      obj_request->result = result;
 +      obj_request->xferred = 0;
 +      obj_request_done_set(obj_request);
  }
  
 -static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
 -                                 u64 ver, u64 notify_id)
 +static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
  {
        struct rbd_obj_request *obj_request;
 -      struct ceph_osd_req_op *op;
 -      struct ceph_osd_client *osdc;
 +      struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        int ret;
  
        obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
                return -ENOMEM;
  
        ret = -ENOMEM;
 -      op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
 -      if (!op)
 -              goto out;
 -      obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
 -                                              obj_request, op);
 -      rbd_osd_req_op_destroy(op);
 +      obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
        if (!obj_request->osd_req)
                goto out;
 -
 -      osdc = &rbd_dev->rbd_client->client->osdc;
        obj_request->callback = rbd_obj_request_put;
 +
 +      osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
 +                                      notify_id, 0, 0);
 +      rbd_osd_req_format_read(obj_request);
 +
        ret = rbd_obj_request_submit(osdc, obj_request);
  out:
        if (ret)
  static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
  {
        struct rbd_device *rbd_dev = (struct rbd_device *)data;
 -      u64 hver;
 -      int rc;
  
        if (!rbd_dev)
                return;
  
        dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
 -              rbd_dev->header_name, (unsigned long long) notify_id,
 -              (unsigned int) opcode);
 -      rc = rbd_dev_refresh(rbd_dev, &hver);
 -      if (rc)
 -              rbd_warn(rbd_dev, "got notification but failed to "
 -                         " update snaps: %d\n", rc);
 +              rbd_dev->header_name, (unsigned long long)notify_id,
 +              (unsigned int)opcode);
 +      (void)rbd_dev_refresh(rbd_dev);
  
 -      rbd_obj_notify_ack(rbd_dev, hver, notify_id);
 +      rbd_obj_notify_ack(rbd_dev, notify_id);
  }
  
  /*
@@@ -2646,6 -1823,7 +2646,6 @@@ static int rbd_dev_header_watch_sync(st
  {
        struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
 -      struct ceph_osd_req_op *op;
        int ret;
  
        rbd_assert(start ^ !!rbd_dev->watch_event);
        if (!obj_request)
                goto out_cancel;
  
 -      op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
 -                              rbd_dev->watch_event->cookie,
 -                              rbd_dev->header.obj_version, start);
 -      if (!op)
 -              goto out_cancel;
 -      obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
 -                                                      obj_request, op);
 -      rbd_osd_req_op_destroy(op);
 +      obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
        if (!obj_request->osd_req)
                goto out_cancel;
  
        else
                ceph_osdc_unregister_linger_request(osdc,
                                        rbd_dev->watch_request->osd_req);
 +
 +      osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
 +                              rbd_dev->watch_event->cookie, 0, start);
 +      rbd_osd_req_format_write(obj_request);
 +
        ret = rbd_obj_request_submit(osdc, obj_request);
        if (ret)
                goto out_cancel;
@@@ -2718,38 -1898,40 +2718,38 @@@ out_cancel
  }
  
  /*
 - * Synchronous osd object method call
 + * Synchronous osd object method call.  Returns the number of bytes
 + * returned in the inbound buffer, or a negative error code.
   */
  static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
                             const char *object_name,
                             const char *class_name,
                             const char *method_name,
 -                           const char *outbound,
 +                           const void *outbound,
                             size_t outbound_size,
 -                           char *inbound,
 -                           size_t inbound_size,
 -                           u64 *version)
 +                           void *inbound,
 +                           size_t inbound_size)
  {
 +      struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
 -      struct ceph_osd_client *osdc;
 -      struct ceph_osd_req_op *op;
        struct page **pages;
        u32 page_count;
        int ret;
  
        /*
 -       * Method calls are ultimately read operations but they
 -       * don't involve object data (so no offset or length).
 -       * The result should placed into the inbound buffer
 -       * provided.  They also supply outbound data--parameters for
 -       * the object method.  Currently if this is present it will
 -       * be a snapshot id.
 +       * Method calls are ultimately read operations.  The result
 +       * should be placed into the inbound buffer provided.  They
 +       * also supply outbound data--parameters for the object
 +       * method.  Currently if this is present it will be a
 +       * snapshot id.
         */
 -      page_count = (u32) calc_pages_for(0, inbound_size);
 +      page_count = (u32)calc_pages_for(0, inbound_size);
        pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
        if (IS_ERR(pages))
                return PTR_ERR(pages);
  
        ret = -ENOMEM;
 -      obj_request = rbd_obj_request_create(object_name, 0, 0,
 +      obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
                                                        OBJ_REQUEST_PAGES);
        if (!obj_request)
                goto out;
        obj_request->pages = pages;
        obj_request->page_count = page_count;
  
 -      op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
 -                                      method_name, outbound, outbound_size);
 -      if (!op)
 -              goto out;
 -      obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
 -                                              obj_request, op);
 -      rbd_osd_req_op_destroy(op);
 +      obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
        if (!obj_request->osd_req)
                goto out;
  
 -      osdc = &rbd_dev->rbd_client->client->osdc;
 +      osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
 +                                      class_name, method_name);
 +      if (outbound_size) {
 +              struct ceph_pagelist *pagelist;
 +
 +              pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
 +              if (!pagelist)
 +                      goto out;
 +
 +              ceph_pagelist_init(pagelist);
 +              ceph_pagelist_append(pagelist, outbound, outbound_size);
 +              osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
 +                                              pagelist);
 +      }
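 +      /* the method's reply data is returned in obj_request->pages */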
 +      osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
 +                                      obj_request->pages, inbound_size,
 +                                      0, false, false);
 +      rbd_osd_req_format_read(obj_request);
 +
        ret = rbd_obj_request_submit(osdc, obj_request);
        if (ret)
                goto out;
        ret = obj_request->result;
        if (ret < 0)
                goto out;
 -      ret = 0;
 +
 +      rbd_assert(obj_request->xferred < (u64)INT_MAX);
 +      ret = (int)obj_request->xferred;
        ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
 -      if (version)
 -              *version = obj_request->version;
  out:
        if (obj_request)
                rbd_obj_request_put(obj_request);
@@@ -2863,22 -2033,18 +2863,22 @@@ static void rbd_request_fn(struct reque
                }
  
                result = -EINVAL;
 -              if (WARN_ON(offset && length > U64_MAX - offset + 1))
 +              if (offset && length > U64_MAX - offset + 1) {
 +                      rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
 +                              offset, length);
                        goto end_request;       /* Shouldn't happen */
 +              }
  
                result = -ENOMEM;
                img_request = rbd_img_request_create(rbd_dev, offset, length,
 -                                                      write_request);
 +                                                      write_request, false);
                if (!img_request)
                        goto end_request;
  
                img_request->rq = rq;
  
 -              result = rbd_img_request_fill_bio(img_request, rq->bio);
 +              result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
 +                                              rq->bio);
                if (!result)
                        result = rbd_img_request_submit(img_request);
                if (result)
  end_request:
                spin_lock_irq(q->queue_lock);
                if (result < 0) {
 -                      rbd_warn(rbd_dev, "obj_request %s result %d\n",
 -                              write_request ? "write" : "read", result);
 +                      rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
 +                              write_request ? "write" : "read",
 +                              length, offset, result);
 +
                        __blk_end_request_all(rq, result);
                }
        }
@@@ -2948,22 -2112,22 +2948,22 @@@ static void rbd_free_disk(struct rbd_de
        if (!disk)
                return;
  
 -      if (disk->flags & GENHD_FL_UP)
 +      rbd_dev->disk = NULL;
 +      if (disk->flags & GENHD_FL_UP) {
                del_gendisk(disk);
 -      if (disk->queue)
 -              blk_cleanup_queue(disk->queue);
 +              if (disk->queue)
 +                      blk_cleanup_queue(disk->queue);
 +      }
        put_disk(disk);
  }
  
  static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
                                const char *object_name,
 -                              u64 offset, u64 length,
 -                              char *buf, u64 *version)
 +                              u64 offset, u64 length, void *buf)
  
  {
 -      struct ceph_osd_req_op *op;
 +      struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
        struct rbd_obj_request *obj_request;
 -      struct ceph_osd_client *osdc;
        struct page **pages = NULL;
        u32 page_count;
        size_t size;
        obj_request->pages = pages;
        obj_request->page_count = page_count;
  
 -      op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
 -      if (!op)
 -              goto out;
 -      obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
 -                                              obj_request, op);
 -      rbd_osd_req_op_destroy(op);
 +      obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
        if (!obj_request->osd_req)
                goto out;
  
 -      osdc = &rbd_dev->rbd_client->client->osdc;
 +      osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
 +                                      offset, length, 0, 0);
 +      osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
 +                                      obj_request->pages,
 +                                      obj_request->length,
 +                                      obj_request->offset & ~PAGE_MASK,
 +                                      false, false);
 +      rbd_osd_req_format_read(obj_request);
 +
        ret = rbd_obj_request_submit(osdc, obj_request);
        if (ret)
                goto out;
        rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
        size = (size_t) obj_request->xferred;
        ceph_copy_from_page_vector(pages, buf, 0, size);
 -      rbd_assert(size <= (size_t) INT_MAX);
 -      ret = (int) size;
 -      if (version)
 -              *version = obj_request->version;
 +      rbd_assert(size <= (size_t)INT_MAX);
 +      ret = (int)size;
  out:
        if (obj_request)
                rbd_obj_request_put(obj_request);
   * Returns a pointer-coded errno if a failure occurs.
   */
  static struct rbd_image_header_ondisk *
 -rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
 +rbd_dev_v1_header_read(struct rbd_device *rbd_dev)
  {
        struct rbd_image_header_ondisk *ondisk = NULL;
        u32 snap_count = 0;
                        return ERR_PTR(-ENOMEM);
  
                ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
 -                                     0, size,
 -                                     (char *) ondisk, version);
 +                                     0, size, ondisk);
                if (ret < 0)
                        goto out_err;
 -              if (WARN_ON((size_t) ret < size)) {
 +              if ((size_t)ret < size) {
                        rbd_warn(rbd_dev, "short header read (want %zd got %d)",
                                size, ret);
                        ret = -ENXIO;
@@@ -3095,36 -2259,46 +3095,36 @@@ static int rbd_read_header(struct rbd_d
                           struct rbd_image_header *header)
  {
        struct rbd_image_header_ondisk *ondisk;
 -      u64 ver = 0;
        int ret;
  
 -      ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
 +      ondisk = rbd_dev_v1_header_read(rbd_dev);
        if (IS_ERR(ondisk))
                return PTR_ERR(ondisk);
        ret = rbd_header_from_disk(header, ondisk);
 -      if (ret >= 0)
 -              header->obj_version = ver;
        kfree(ondisk);
  
        return ret;
  }
  
 -static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
 -{
 -      struct rbd_snap *snap;
 -      struct rbd_snap *next;
 -
 -      list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
 -              rbd_remove_snap_dev(snap);
 -}
 -
  static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
  {
 -      sector_t size;
 -
        if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
                return;
  
 -      size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
 -      dout("setting size to %llu sectors", (unsigned long long) size);
 -      rbd_dev->mapping.size = (u64) size;
 -      set_capacity(rbd_dev->disk, size);
 +      if (rbd_dev->mapping.size != rbd_dev->header.image_size) {
 +              sector_t size;
 +
 +              rbd_dev->mapping.size = rbd_dev->header.image_size;
 +              size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
 +              dout("setting size to %llu sectors", (unsigned long long)size);
 +              set_capacity(rbd_dev->disk, size);
 +      }
  }
  
  /*
   * only read the first part of the ondisk header, without the snaps info
   */
 -static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
 +static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev)
  {
        int ret;
        struct rbd_image_header h;
        /* osd requests may still refer to snapc */
        ceph_put_snap_context(rbd_dev->header.snapc);
  
 -      if (hver)
 -              *hver = h.obj_version;
 -      rbd_dev->header.obj_version = h.obj_version;
        rbd_dev->header.image_size = h.image_size;
        rbd_dev->header.snapc = h.snapc;
        rbd_dev->header.snap_names = h.snap_names;
        rbd_dev->header.snap_sizes = h.snap_sizes;
        /* Free the extra copy of the object prefix */
 -      WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
 +      if (strcmp(rbd_dev->header.object_prefix, h.object_prefix))
 +              rbd_warn(rbd_dev, "object prefix changed (ignoring)");
        kfree(h.object_prefix);
  
 -      ret = rbd_dev_snaps_update(rbd_dev);
 -      if (!ret)
 -              ret = rbd_dev_snaps_register(rbd_dev);
 -
        up_write(&rbd_dev->header_rwsem);
  
        return ret;
  }
  
 -static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
 +/*
 + * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
 + * has disappeared from the (just updated) snapshot context.
 + */
 +static void rbd_exists_validate(struct rbd_device *rbd_dev)
 +{
 +      u64 snap_id;
 +
 +      if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
 +              return;
 +
 +      snap_id = rbd_dev->spec->snap_id;
 +      if (snap_id == CEPH_NOSNAP)
 +              return;
 +
 +      if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
 +              clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 +}
 +
 +static int rbd_dev_refresh(struct rbd_device *rbd_dev)
  {
 +      u64 image_size;
        int ret;
  
        rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 +      image_size = rbd_dev->header.image_size;
        mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
        if (rbd_dev->image_format == 1)
 -              ret = rbd_dev_v1_refresh(rbd_dev, hver);
 +              ret = rbd_dev_v1_refresh(rbd_dev);
        else
 -              ret = rbd_dev_v2_refresh(rbd_dev, hver);
 +              ret = rbd_dev_v2_refresh(rbd_dev);
 +
 +      /* If it's a mapped snapshot, validate its EXISTS flag */
 +
 +      rbd_exists_validate(rbd_dev);
        mutex_unlock(&ctl_mutex);
 +      if (ret)
 +              rbd_warn(rbd_dev, "got notification but failed to "
 +                         "update snaps: %d\n", ret);
 +      if (image_size != rbd_dev->header.image_size)
 +              revalidate_disk(rbd_dev->disk);
  
        return ret;
  }
@@@ -3243,6 -2393,8 +3243,6 @@@ static int rbd_init_disk(struct rbd_dev
  
        rbd_dev->disk = disk;
  
 -      set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
 -
        return 0;
  out_disk:
        put_disk(disk);
@@@ -3263,9 -2415,13 +3263,9 @@@ static ssize_t rbd_size_show(struct dev
                             struct device_attribute *attr, char *buf)
  {
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 -      sector_t size;
  
 -      down_read(&rbd_dev->header_rwsem);
 -      size = get_capacity(rbd_dev->disk);
 -      up_read(&rbd_dev->header_rwsem);
 -
 -      return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
 +      return sprintf(buf, "%llu\n",
 +              (unsigned long long)rbd_dev->mapping.size);
  }
  
  /*
@@@ -3278,7 -2434,7 +3278,7 @@@ static ssize_t rbd_features_show(struc
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
  
        return sprintf(buf, "0x%016llx\n",
 -                      (unsigned long long) rbd_dev->mapping.features);
 +                      (unsigned long long)rbd_dev->mapping.features);
  }
  
  static ssize_t rbd_major_show(struct device *dev,
  {
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
  
 -      return sprintf(buf, "%d\n", rbd_dev->major);
 +      if (rbd_dev->major)
 +              return sprintf(buf, "%d\n", rbd_dev->major);
 +
 +      return sprintf(buf, "(none)\n");
 +
  }
  
  static ssize_t rbd_client_id_show(struct device *dev,
@@@ -3316,7 -2468,7 +3316,7 @@@ static ssize_t rbd_pool_id_show(struct 
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
  
        return sprintf(buf, "%llu\n",
 -              (unsigned long long) rbd_dev->spec->pool_id);
 +                      (unsigned long long) rbd_dev->spec->pool_id);
  }
  
  static ssize_t rbd_name_show(struct device *dev,
@@@ -3402,7 -2554,7 +3402,7 @@@ static ssize_t rbd_image_refresh(struc
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
        int ret;
  
 -      ret = rbd_dev_refresh(rbd_dev, NULL);
 +      ret = rbd_dev_refresh(rbd_dev);
  
        return ret < 0 ? ret : size;
  }
@@@ -3453,6 -2605,71 +3453,6 @@@ static struct device_type rbd_device_ty
        .release        = rbd_sysfs_dev_release,
  };
  
 -
 -/*
 -  sysfs - snapshots
 -*/
 -
 -static ssize_t rbd_snap_size_show(struct device *dev,
 -                                struct device_attribute *attr,
 -                                char *buf)
 -{
 -      struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
 -
 -      return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
 -}
 -
 -static ssize_t rbd_snap_id_show(struct device *dev,
 -                              struct device_attribute *attr,
 -                              char *buf)
 -{
 -      struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
 -
 -      return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
 -}
 -
 -static ssize_t rbd_snap_features_show(struct device *dev,
 -                              struct device_attribute *attr,
 -                              char *buf)
 -{
 -      struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
 -
 -      return sprintf(buf, "0x%016llx\n",
 -                      (unsigned long long) snap->features);
 -}
 -
 -static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
 -static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
 -static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
 -
 -static struct attribute *rbd_snap_attrs[] = {
 -      &dev_attr_snap_size.attr,
 -      &dev_attr_snap_id.attr,
 -      &dev_attr_snap_features.attr,
 -      NULL,
 -};
 -
 -static struct attribute_group rbd_snap_attr_group = {
 -      .attrs = rbd_snap_attrs,
 -};
 -
 -static void rbd_snap_dev_release(struct device *dev)
 -{
 -      struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
 -      kfree(snap->name);
 -      kfree(snap);
 -}
 -
 -static const struct attribute_group *rbd_snap_attr_groups[] = {
 -      &rbd_snap_attr_group,
 -      NULL
 -};
 -
 -static struct device_type rbd_snap_device_type = {
 -      .groups         = rbd_snap_attr_groups,
 -      .release        = rbd_snap_dev_release,
 -};
 -
  static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
  {
        kref_get(&spec->kref);
@@@ -3476,6 -2693,8 +3476,6 @@@ static struct rbd_spec *rbd_spec_alloc(
                return NULL;
        kref_init(&spec->kref);
  
 -      rbd_spec_put(rbd_spec_get(spec));       /* TEMPORARY */
 -
        return spec;
  }
  
@@@ -3502,6 -2721,7 +3502,6 @@@ static struct rbd_device *rbd_dev_creat
        spin_lock_init(&rbd_dev->lock);
        rbd_dev->flags = 0;
        INIT_LIST_HEAD(&rbd_dev->node);
 -      INIT_LIST_HEAD(&rbd_dev->snaps);
        init_rwsem(&rbd_dev->header_rwsem);
  
        rbd_dev->spec = spec;
  
  static void rbd_dev_destroy(struct rbd_device *rbd_dev)
  {
 -      rbd_spec_put(rbd_dev->parent_spec);
 -      kfree(rbd_dev->header_name);
        rbd_put_client(rbd_dev->rbd_client);
        rbd_spec_put(rbd_dev->spec);
        kfree(rbd_dev);
  }
  
 -static bool rbd_snap_registered(struct rbd_snap *snap)
 -{
 -      bool ret = snap->dev.type == &rbd_snap_device_type;
 -      bool reg = device_is_registered(&snap->dev);
 -
 -      rbd_assert(!ret ^ reg);
 -
 -      return ret;
 -}
 -
 -static void rbd_remove_snap_dev(struct rbd_snap *snap)
 -{
 -      list_del(&snap->node);
 -      if (device_is_registered(&snap->dev))
 -              device_unregister(&snap->dev);
 -}
 -
 -static int rbd_register_snap_dev(struct rbd_snap *snap,
 -                                struct device *parent)
 -{
 -      struct device *dev = &snap->dev;
 -      int ret;
 -
 -      dev->type = &rbd_snap_device_type;
 -      dev->parent = parent;
 -      dev->release = rbd_snap_dev_release;
 -      dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
 -      dout("%s: registering device for snapshot %s\n", __func__, snap->name);
 -
 -      ret = device_register(dev);
 -
 -      return ret;
 -}
 -
 -static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
 -                                              const char *snap_name,
 -                                              u64 snap_id, u64 snap_size,
 -                                              u64 snap_features)
 -{
 -      struct rbd_snap *snap;
 -      int ret;
 -
 -      snap = kzalloc(sizeof (*snap), GFP_KERNEL);
 -      if (!snap)
 -              return ERR_PTR(-ENOMEM);
 -
 -      ret = -ENOMEM;
 -      snap->name = kstrdup(snap_name, GFP_KERNEL);
 -      if (!snap->name)
 -              goto err;
 -
 -      snap->id = snap_id;
 -      snap->size = snap_size;
 -      snap->features = snap_features;
 -
 -      return snap;
 -
 -err:
 -      kfree(snap->name);
 -      kfree(snap);
 -
 -      return ERR_PTR(ret);
 -}
 -
 -static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
 -              u64 *snap_size, u64 *snap_features)
 -{
 -      char *snap_name;
 -
 -      rbd_assert(which < rbd_dev->header.snapc->num_snaps);
 -
 -      *snap_size = rbd_dev->header.snap_sizes[which];
 -      *snap_features = 0;     /* No features for v1 */
 -
 -      /* Skip over names until we find the one we are looking for */
 -
 -      snap_name = rbd_dev->header.snap_names;
 -      while (which--)
 -              snap_name += strlen(snap_name) + 1;
 -
 -      return snap_name;
 -}
 -
  /*
   * Get the size and object order for an image snapshot, or if
   * snap_id is CEPH_NOSNAP, gets this information for the base
@@@ -3541,21 -2846,18 +3541,21 @@@ static int _rbd_dev_v2_snap_size(struc
  
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_size",
 -                              (char *) &snapid, sizeof (snapid),
 -                              (char *) &size_buf, sizeof (size_buf), NULL);
 +                              &snapid, sizeof (snapid),
 +                              &size_buf, sizeof (size_buf));
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
 +      if (ret < sizeof (size_buf))
 +              return -ERANGE;
  
 -      *order = size_buf.order;
 +      if (order)
 +              *order = size_buf.order;
        *snap_size = le64_to_cpu(size_buf.size);
  
        dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
 -              (unsigned long long) snap_id, (unsigned int) *order,
 -              (unsigned long long) *snap_size);
 +              (unsigned long long)snap_id, (unsigned int)*order,
 +              (unsigned long long)*snap_size);
  
        return 0;
  }
@@@ -3578,16 -2880,17 +3578,16 @@@ static int rbd_dev_v2_object_prefix(str
                return -ENOMEM;
  
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 -                              "rbd", "get_object_prefix",
 -                              NULL, 0,
 -                              reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
 +                              "rbd", "get_object_prefix", NULL, 0,
 +                              reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out;
  
        p = reply_buf;
        rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
 -                                              p + RBD_OBJ_PREFIX_LEN_MAX,
 -                                              NULL, GFP_NOIO);
 +                                              p + ret, NULL, GFP_NOIO);
 +      ret = 0;
  
        if (IS_ERR(rbd_dev->header.object_prefix)) {
                ret = PTR_ERR(rbd_dev->header.object_prefix);
        } else {
                dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
        }
 -
  out:
        kfree(reply_buf);
  
@@@ -3608,30 -2912,29 +3608,30 @@@ static int _rbd_dev_v2_snap_features(st
        struct {
                __le64 features;
                __le64 incompat;
 -      } features_buf = { 0 };
 +      } __attribute__ ((packed)) features_buf = { 0 };
        u64 incompat;
        int ret;
  
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_features",
 -                              (char *) &snapid, sizeof (snapid),
 -                              (char *) &features_buf, sizeof (features_buf),
 -                              NULL);
 +                              &snapid, sizeof (snapid),
 +                              &features_buf, sizeof (features_buf));
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;
 +      if (ret < sizeof (features_buf))
 +              return -ERANGE;
  
        incompat = le64_to_cpu(features_buf.incompat);
 -      if (incompat & ~RBD_FEATURES_ALL)
 +      if (incompat & ~RBD_FEATURES_SUPPORTED)
                return -ENXIO;
  
        *snap_features = le64_to_cpu(features_buf.features);
  
        dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
 -              (unsigned long long) snap_id,
 -              (unsigned long long) *snap_features,
 -              (unsigned long long) le64_to_cpu(features_buf.incompat));
 +              (unsigned long long)snap_id,
 +              (unsigned long long)*snap_features,
 +              (unsigned long long)le64_to_cpu(features_buf.incompat));
  
        return 0;
  }
@@@ -3671,15 -2974,15 +3671,15 @@@ static int rbd_dev_v2_parent_info(struc
        snapid = cpu_to_le64(CEPH_NOSNAP);
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_parent",
 -                              (char *) &snapid, sizeof (snapid),
 -                              (char *) reply_buf, size, NULL);
 +                              &snapid, sizeof (snapid),
 +                              reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out_err;
  
 -      ret = -ERANGE;
        p = reply_buf;
 -      end = (char *) reply_buf + size;
 +      end = reply_buf + ret;
 +      ret = -ERANGE;
        ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
        if (parent_spec->pool_id == CEPH_NOPOOL)
                goto out;       /* No parent?  No problem. */
        /* The ceph file layout needs to fit pool id in 32 bits */
  
        ret = -EIO;
 -      if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
 -              goto out;
 +      if (parent_spec->pool_id > (u64)U32_MAX) {
 +              rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
 +                      (unsigned long long)parent_spec->pool_id, U32_MAX);
 +              goto out_err;
 +      }
  
        image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
        if (IS_ERR(image_id)) {
@@@ -3714,56 -3014,6 +3714,56 @@@ out_err
        return ret;
  }
  
 +static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
 +{
 +      struct {
 +              __le64 stripe_unit;
 +              __le64 stripe_count;
 +      } __attribute__ ((packed)) striping_info_buf = { 0 };
 +      size_t size = sizeof (striping_info_buf);
 +      void *p;
 +      u64 obj_size;
 +      u64 stripe_unit;
 +      u64 stripe_count;
 +      int ret;
 +
 +      ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 +                              "rbd", "get_stripe_unit_count", NULL, 0,
 +                              (char *)&striping_info_buf, size);
 +      dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 +      if (ret < 0)
 +              return ret;
 +      if (ret < size)
 +              return -ERANGE;
 +
 +      /*
 +       * We don't actually support the "fancy striping" feature
 +       * (STRIPINGV2) yet, but if the striping sizes are the
 +       * defaults the behavior is the same as before.  So find
 +       * out, and only fail if the image has non-default values.
 +       */
 +      ret = -EINVAL;
 +      obj_size = (u64)1 << rbd_dev->header.obj_order;
 +      p = &striping_info_buf;
 +      stripe_unit = ceph_decode_64(&p);
 +      if (stripe_unit != obj_size) {
 +              rbd_warn(rbd_dev, "unsupported stripe unit "
 +                              "(got %llu want %llu)",
 +                              stripe_unit, obj_size);
 +              return -EINVAL;
 +      }
 +      stripe_count = ceph_decode_64(&p);
 +      if (stripe_count != 1) {
 +              rbd_warn(rbd_dev, "unsupported stripe count "
 +                              "(got %llu want 1)", stripe_count);
 +              return -EINVAL;
 +      }
 +      rbd_dev->header.stripe_unit = stripe_unit;
 +      rbd_dev->header.stripe_count = stripe_count;
 +
 +      return 0;
 +}
 +
  static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
  {
        size_t image_id_size;
                return NULL;
  
        p = image_id;
 -      end = (char *) image_id + image_id_size;
 -      ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
 +      end = image_id + image_id_size;
 +      ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
  
        size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
        reply_buf = kmalloc(size, GFP_KERNEL);
        ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
                                "rbd", "dir_get_name",
                                image_id, image_id_size,
 -                              (char *) reply_buf, size, NULL);
 +                              reply_buf, size);
        if (ret < 0)
                goto out;
        p = reply_buf;
 -      end = (char *) reply_buf + size;
 +      end = reply_buf + ret;
 +
        image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
        if (IS_ERR(image_name))
                image_name = NULL;
        return image_name;
  }
  
 +static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
 +{
 +      struct ceph_snap_context *snapc = rbd_dev->header.snapc;
 +      const char *snap_name;
 +      u32 which = 0;
 +
 +      /* Skip over names until we find the one we are looking for */
 +
 +      snap_name = rbd_dev->header.snap_names;
 +      while (which < snapc->num_snaps) {
 +              if (!strcmp(name, snap_name))
 +                      return snapc->snaps[which];
 +              snap_name += strlen(snap_name) + 1;
 +              which++;
 +      }
 +      return CEPH_NOSNAP;
 +}
 +
 +static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
 +{
 +      struct ceph_snap_context *snapc = rbd_dev->header.snapc;
 +      u32 which;
 +      bool found = false;
 +      u64 snap_id;
 +
 +      for (which = 0; !found && which < snapc->num_snaps; which++) {
 +              const char *snap_name;
 +
 +              snap_id = snapc->snaps[which];
 +              snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
 +              if (IS_ERR(snap_name))
 +                      break;
 +              found = !strcmp(name, snap_name);
 +              kfree(snap_name);
 +      }
 +      return found ? snap_id : CEPH_NOSNAP;
 +}
 +
  /*
 - * When a parent image gets probed, we only have the pool, image,
 - * and snapshot ids but not the names of any of them.  This call
 - * is made later to fill in those names.  It has to be done after
 - * rbd_dev_snaps_update() has completed because some of the
 - * information (in particular, snapshot name) is not available
 - * until then.
 + * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
 + * no snapshot by that name is found, or if an error occurs.
   */
 -static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
 +static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
  {
 -      struct ceph_osd_client *osdc;
 -      const char *name;
 -      void *reply_buf = NULL;
 +      if (rbd_dev->image_format == 1)
 +              return rbd_v1_snap_id_by_name(rbd_dev, name);
 +
 +      return rbd_v2_snap_id_by_name(rbd_dev, name);
 +}
 +
 +/*
 + * When an rbd image has a parent image, it is identified by the
 + * pool, image, and snapshot ids (not names).  This function fills
 + * in the names for those ids.  (It's OK if we can't figure out the
 + * name for an image id, but the pool and snapshot ids should always
 + * exist and have names.)  All names in an rbd spec are dynamically
 + * allocated.
 + *
 + * When an image being mapped (not a parent) is probed, we have the
 + * pool name and pool id, image name and image id, and the snapshot
 + * name.  The only thing we're missing is the snapshot id.
 + */
 +static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
 +{
 +      struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 +      struct rbd_spec *spec = rbd_dev->spec;
 +      const char *pool_name;
 +      const char *image_name;
 +      const char *snap_name;
        int ret;
  
 -      if (rbd_dev->spec->pool_name)
 -              return 0;       /* Already have the names */
 +      /*
 +       * An image being mapped will have the pool name (etc.), but
 +       * we need to look up the snapshot id.
 +       */
 +      if (spec->pool_name) {
 +              if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
 +                      u64 snap_id;
 +
 +                      snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
 +                      if (snap_id == CEPH_NOSNAP)
 +                              return -ENOENT;
 +                      spec->snap_id = snap_id;
 +              } else {
 +                      spec->snap_id = CEPH_NOSNAP;
 +              }
  
 -      /* Look up the pool name */
 +              return 0;
 +      }
  
 -      osdc = &rbd_dev->rbd_client->client->osdc;
 -      name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
 -      if (!name) {
 -              rbd_warn(rbd_dev, "there is no pool with id %llu",
 -                      rbd_dev->spec->pool_id);        /* Really a BUG() */
 +      /* Get the pool name; we have to make our own copy of this */
 +
 +      pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
 +      if (!pool_name) {
 +              rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
                return -EIO;
        }
 -
 -      rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
 -      if (!rbd_dev->spec->pool_name)
 +      pool_name = kstrdup(pool_name, GFP_KERNEL);
 +      if (!pool_name)
                return -ENOMEM;
  
        /* Fetch the image name; tolerate failure here */
  
 -      name = rbd_dev_image_name(rbd_dev);
 -      if (name)
 -              rbd_dev->spec->image_name = (char *) name;
 -      else
 +      image_name = rbd_dev_image_name(rbd_dev);
 +      if (!image_name)
                rbd_warn(rbd_dev, "unable to get image name");
  
 -      /* Look up the snapshot name. */
 +      /* Look up the snapshot name, and make a copy */
  
 -      name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
 -      if (!name) {
 -              rbd_warn(rbd_dev, "no snapshot with id %llu",
 -                      rbd_dev->spec->snap_id);        /* Really a BUG() */
 -              ret = -EIO;
 +      snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
 +      if (!snap_name) {
 +              ret = -ENOMEM;
                goto out_err;
        }
 -      rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
 -      if(!rbd_dev->spec->snap_name)
 -              goto out_err;
 +
 +      spec->pool_name = pool_name;
 +      spec->image_name = image_name;
 +      spec->snap_name = snap_name;
  
        return 0;
  out_err:
 -      kfree(reply_buf);
 -      kfree(rbd_dev->spec->pool_name);
 -      rbd_dev->spec->pool_name = NULL;
 +      kfree(image_name);
 +      kfree(pool_name);
  
        return ret;
  }
  
 -static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
 +static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
  {
        size_t size;
        int ret;
                return -ENOMEM;
  
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 -                              "rbd", "get_snapcontext",
 -                              NULL, 0,
 -                              reply_buf, size, ver);
 +                              "rbd", "get_snapcontext", NULL, 0,
 +                              reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out;
  
 -      ret = -ERANGE;
        p = reply_buf;
 -      end = (char *) reply_buf + size;
 +      end = reply_buf + ret;
 +      ret = -ERANGE;
        ceph_decode_64_safe(&p, end, seq, out);
        ceph_decode_32_safe(&p, end, snap_count, out);
  
        }
        if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
                goto out;
 +      ret = 0;
  
 -      size = sizeof (struct ceph_snap_context) +
 -                              snap_count * sizeof (snapc->snaps[0]);
 -      snapc = kmalloc(size, GFP_KERNEL);
 +      snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
        if (!snapc) {
                ret = -ENOMEM;
                goto out;
        }
 -
 -      atomic_set(&snapc->nref, 1);
        snapc->seq = seq;
 -      snapc->num_snaps = snap_count;
        for (i = 0; i < snap_count; i++)
                snapc->snaps[i] = ceph_decode_64(&p);
  
        rbd_dev->header.snapc = snapc;
  
        dout("  snap context seq = %llu, snap_count = %u\n",
 -              (unsigned long long) seq, (unsigned int) snap_count);
 -
 +              (unsigned long long)seq, (unsigned int)snap_count);
  out:
        kfree(reply_buf);
  
 -      return 0;
 +      return ret;
  }
  
 -static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
 +static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 +                                      u64 snap_id)
  {
        size_t size;
        void *reply_buf;
 -      __le64 snap_id;
 +      __le64 snapid;
        int ret;
        void *p;
        void *end;
        if (!reply_buf)
                return ERR_PTR(-ENOMEM);
  
 -      snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
 +      snapid = cpu_to_le64(snap_id);
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_snapshot_name",
 -                              (char *) &snap_id, sizeof (snap_id),
 -                              reply_buf, size, NULL);
 +                              &snapid, sizeof (snapid),
 +                              reply_buf, size);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 -      if (ret < 0)
 -              goto out;
 -
 -      p = reply_buf;
 -      end = (char *) reply_buf + size;
 -      snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
 -      if (IS_ERR(snap_name)) {
 -              ret = PTR_ERR(snap_name);
 +      if (ret < 0) {
 +              snap_name = ERR_PTR(ret);
                goto out;
 -      } else {
 -              dout("  snap_id 0x%016llx snap_name = %s\n",
 -                      (unsigned long long) le64_to_cpu(snap_id), snap_name);
        }
 -      kfree(reply_buf);
 -
 -      return snap_name;
 -out:
 -      kfree(reply_buf);
 -
 -      return ERR_PTR(ret);
 -}
 -
 -static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
 -              u64 *snap_size, u64 *snap_features)
 -{
 -      u64 snap_id;
 -      u8 order;
 -      int ret;
  
 -      snap_id = rbd_dev->header.snapc->snaps[which];
 -      ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
 -      if (ret)
 -              return ERR_PTR(ret);
 -      ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
 -      if (ret)
 -              return ERR_PTR(ret);
 +      p = reply_buf;
 +      end = reply_buf + ret;
 +      snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
 +      if (IS_ERR(snap_name))
 +              goto out;
  
 -      return rbd_dev_v2_snap_name(rbd_dev, which);
 -}
 +      dout("  snap_id 0x%016llx snap_name = %s\n",
 +              (unsigned long long)snap_id, snap_name);
 +out:
 +      kfree(reply_buf);
  
 -static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
 -              u64 *snap_size, u64 *snap_features)
 -{
 -      if (rbd_dev->image_format == 1)
 -              return rbd_dev_v1_snap_info(rbd_dev, which,
 -                                      snap_size, snap_features);
 -      if (rbd_dev->image_format == 2)
 -              return rbd_dev_v2_snap_info(rbd_dev, which,
 -                                      snap_size, snap_features);
 -      return ERR_PTR(-EINVAL);
 +      return snap_name;
  }
  
 -static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
 +static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev)
  {
        int ret;
 -      __u8 obj_order;
  
        down_write(&rbd_dev->header_rwsem);
  
 -      /* Grab old order first, to see if it changes */
 -
 -      obj_order = rbd_dev->header.obj_order,
        ret = rbd_dev_v2_image_size(rbd_dev);
        if (ret)
                goto out;
 -      if (rbd_dev->header.obj_order != obj_order) {
 -              ret = -EIO;
 -              goto out;
 -      }
        rbd_update_mapping_size(rbd_dev);
  
 -      ret = rbd_dev_v2_snap_context(rbd_dev, hver);
 +      ret = rbd_dev_v2_snap_context(rbd_dev);
        dout("rbd_dev_v2_snap_context returned %d\n", ret);
        if (ret)
                goto out;
 -      ret = rbd_dev_snaps_update(rbd_dev);
 -      dout("rbd_dev_snaps_update returned %d\n", ret);
 -      if (ret)
 -              goto out;
 -      ret = rbd_dev_snaps_register(rbd_dev);
 -      dout("rbd_dev_snaps_register returned %d\n", ret);
  out:
        up_write(&rbd_dev->header_rwsem);
  
        return ret;
  }
  
 -/*
 - * Scan the rbd device's current snapshot list and compare it to the
 - * newly-received snapshot context.  Remove any existing snapshots
 - * not present in the new snapshot context.  Add a new snapshot for
  - * any snapshots in the snapshot context not in the current list.
 - * And verify there are no changes to snapshots we already know
 - * about.
 - *
 - * Assumes the snapshots in the snapshot context are sorted by
 - * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 - * are also maintained in that order.)
 - */
 -static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
 -{
 -      struct ceph_snap_context *snapc = rbd_dev->header.snapc;
 -      const u32 snap_count = snapc->num_snaps;
 -      struct list_head *head = &rbd_dev->snaps;
 -      struct list_head *links = head->next;
 -      u32 index = 0;
 -
 -      dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
 -      while (index < snap_count || links != head) {
 -              u64 snap_id;
 -              struct rbd_snap *snap;
 -              char *snap_name;
 -              u64 snap_size = 0;
 -              u64 snap_features = 0;
 -
 -              snap_id = index < snap_count ? snapc->snaps[index]
 -                                           : CEPH_NOSNAP;
 -              snap = links != head ? list_entry(links, struct rbd_snap, node)
 -                                   : NULL;
 -              rbd_assert(!snap || snap->id != CEPH_NOSNAP);
 -
 -              if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
 -                      struct list_head *next = links->next;
 -
 -                      /*
 -                       * A previously-existing snapshot is not in
 -                       * the new snap context.
 -                       *
 -                       * If the now missing snapshot is the one the
 -                       * image is mapped to, clear its exists flag
 -                       * so we can avoid sending any more requests
 -                       * to it.
 -                       */
 -                      if (rbd_dev->spec->snap_id == snap->id)
 -                              clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 -                      rbd_remove_snap_dev(snap);
 -                      dout("%ssnap id %llu has been removed\n",
 -                              rbd_dev->spec->snap_id == snap->id ?
 -                                                      "mapped " : "",
 -                              (unsigned long long) snap->id);
 -
 -                      /* Done with this list entry; advance */
 -
 -                      links = next;
 -                      continue;
 -              }
 -
 -              snap_name = rbd_dev_snap_info(rbd_dev, index,
 -                                      &snap_size, &snap_features);
 -              if (IS_ERR(snap_name))
 -                      return PTR_ERR(snap_name);
 -
 -              dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
 -                      (unsigned long long) snap_id);
 -              if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
 -                      struct rbd_snap *new_snap;
 -
 -                      /* We haven't seen this snapshot before */
 -
 -                      new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
 -                                      snap_id, snap_size, snap_features);
 -                      if (IS_ERR(new_snap)) {
 -                              int err = PTR_ERR(new_snap);
 -
 -                              dout("  failed to add dev, error %d\n", err);
 -
 -                              return err;
 -                      }
 -
 -                      /* New goes before existing, or at end of list */
 -
 -                      dout("  added dev%s\n", snap ? "" : " at end\n");
 -                      if (snap)
 -                              list_add_tail(&new_snap->node, &snap->node);
 -                      else
 -                              list_add_tail(&new_snap->node, head);
 -              } else {
 -                      /* Already have this one */
 -
 -                      dout("  already present\n");
 -
 -                      rbd_assert(snap->size == snap_size);
 -                      rbd_assert(!strcmp(snap->name, snap_name));
 -                      rbd_assert(snap->features == snap_features);
 -
 -                      /* Done with this list entry; advance */
 -
 -                      links = links->next;
 -              }
 -
 -              /* Advance to the next entry in the snapshot context */
 -
 -              index++;
 -      }
 -      dout("%s: done\n", __func__);
 -
 -      return 0;
 -}
 -
 -/*
 - * Scan the list of snapshots and register the devices for any that
 - * have not already been registered.
 - */
 -static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
 -{
 -      struct rbd_snap *snap;
 -      int ret = 0;
 -
 -      dout("%s:\n", __func__);
 -      if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
 -              return -EIO;
 -
 -      list_for_each_entry(snap, &rbd_dev->snaps, node) {
 -              if (!rbd_snap_registered(snap)) {
 -                      ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
 -                      if (ret < 0)
 -                              break;
 -              }
 -      }
 -      dout("%s: returning %d\n", __func__, ret);
 -
 -      return ret;
 -}
 -
  static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
  {
        struct device *dev;
        dev->bus = &rbd_bus_type;
        dev->type = &rbd_device_type;
        dev->parent = &rbd_root_dev;
 -      dev->release = rbd_dev_release;
 +      dev->release = rbd_dev_device_release;
        dev_set_name(dev, "%d", rbd_dev->dev_id);
        ret = device_register(dev);
  
@@@ -4299,7 -3672,6 +4299,7 @@@ static int rbd_add_parse_args(const cha
        size_t len;
        char *options;
        const char *mon_addrs;
 +      char *snap_name;
        size_t mon_addrs_size;
        struct rbd_spec *spec = NULL;
        struct rbd_options *rbd_opts = NULL;
                ret = -ENAMETOOLONG;
                goto out_err;
        }
 -      spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
 -      if (!spec->snap_name)
 +      snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
 +      if (!snap_name)
                goto out_mem;
 -      *(spec->snap_name + len) = '\0';
 +      *(snap_name + len) = '\0';
 +      spec->snap_name = snap_name;
  
        /* Initialize all rbd options to the defaults */
  
@@@ -4416,19 -3787,15 +4416,19 @@@ static int rbd_dev_image_id(struct rbd_
        size_t size;
        char *object_name;
        void *response;
 -      void *p;
 +      char *image_id;
  
        /*
         * When probing a parent image, the image id is already
         * known (and the image name likely is not).  There's no
 -       * need to fetch the image id again in this case.
 +       * need to fetch the image id again in this case.  We
 +       * do still need to set the image format though.
         */
 -      if (rbd_dev->spec->image_id)
 +      if (rbd_dev->spec->image_id) {
 +              rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
 +
                return 0;
 +      }
  
        /*
         * First, see if the format 2 image id file exists, and if
                goto out;
        }
  
 +      /* If it doesn't exist we'll assume it's a format 1 image */
 +
        ret = rbd_obj_method_sync(rbd_dev, object_name,
 -                              "rbd", "get_id",
 -                              NULL, 0,
 -                              response, RBD_IMAGE_ID_LEN_MAX, NULL);
 +                              "rbd", "get_id", NULL, 0,
 +                              response, RBD_IMAGE_ID_LEN_MAX);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
 -      if (ret < 0)
 -              goto out;
 -
 -      p = response;
 -      rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
 -                                              p + RBD_IMAGE_ID_LEN_MAX,
 +      if (ret == -ENOENT) {
 +              image_id = kstrdup("", GFP_KERNEL);
 +              ret = image_id ? 0 : -ENOMEM;
 +              if (!ret)
 +                      rbd_dev->image_format = 1;
 +      } else if (ret > sizeof (__le32)) {
 +              void *p = response;
 +
 +              image_id = ceph_extract_encoded_string(&p, p + ret,
                                                NULL, GFP_NOIO);
 -      if (IS_ERR(rbd_dev->spec->image_id)) {
 -              ret = PTR_ERR(rbd_dev->spec->image_id);
 -              rbd_dev->spec->image_id = NULL;
 +              ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
 +              if (!ret)
 +                      rbd_dev->image_format = 2;
        } else {
 -              dout("image_id is %s\n", rbd_dev->spec->image_id);
 +              ret = -EINVAL;
 +      }
 +
 +      if (!ret) {
 +              rbd_dev->spec->image_id = image_id;
 +              dout("image_id is %s\n", image_id);
        }
  out:
        kfree(response);
        return ret;
  }
  
 -static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
 +/* Undo whatever state changes are made by v1 or v2 image probe */
 +
 +static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
  {
 -      int ret;
 -      size_t size;
 +      struct rbd_image_header *header;
  
 -      /* Version 1 images have no id; empty string is used */
 +      rbd_dev_remove_parent(rbd_dev);
 +      rbd_spec_put(rbd_dev->parent_spec);
 +      rbd_dev->parent_spec = NULL;
 +      rbd_dev->parent_overlap = 0;
  
 -      rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
 -      if (!rbd_dev->spec->image_id)
 -              return -ENOMEM;
 +      /* Free dynamic fields from the header, then zero it out */
  
 -      /* Record the header object name for this rbd image. */
 +      header = &rbd_dev->header;
 +      ceph_put_snap_context(header->snapc);
 +      kfree(header->snap_sizes);
 +      kfree(header->snap_names);
 +      kfree(header->object_prefix);
 +      memset(header, 0, sizeof (*header));
 +}
  
 -      size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
 -      rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
 -      if (!rbd_dev->header_name) {
 -              ret = -ENOMEM;
 -              goto out_err;
 -      }
 -      sprintf(rbd_dev->header_name, "%s%s",
 -              rbd_dev->spec->image_name, RBD_SUFFIX);
 +static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
 +{
 +      int ret;
  
        /* Populate rbd image metadata */
  
        rbd_dev->parent_spec = NULL;
        rbd_dev->parent_overlap = 0;
  
 -      rbd_dev->image_format = 1;
 -
        dout("discovered version 1 image, header name is %s\n",
                rbd_dev->header_name);
  
@@@ -4536,45 -3893,43 +4536,45 @@@ out_err
  
  static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
  {
 -      size_t size;
        int ret;
 -      u64 ver = 0;
 -
 -      /*
 -       * Image id was filled in by the caller.  Record the header
 -       * object name for this rbd image.
 -       */
 -      size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
 -      rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
 -      if (!rbd_dev->header_name)
 -              return -ENOMEM;
 -      sprintf(rbd_dev->header_name, "%s%s",
 -                      RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
 -
 -      /* Get the size and object order for the image */
  
        ret = rbd_dev_v2_image_size(rbd_dev);
 -      if (ret < 0)
 +      if (ret)
                goto out_err;
  
        /* Get the object prefix (a.k.a. block_name) for the image */
  
        ret = rbd_dev_v2_object_prefix(rbd_dev);
 -      if (ret < 0)
 +      if (ret)
                goto out_err;
  
        /* Get and check the features for the image */
  
        ret = rbd_dev_v2_features(rbd_dev);
 -      if (ret < 0)
 +      if (ret)
                goto out_err;
  
        /* If the image supports layering, get the parent info */
  
        if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
                ret = rbd_dev_v2_parent_info(rbd_dev);
 +              if (ret)
 +                      goto out_err;
 +
 +              /*
 +               * Don't print a warning for parent images.  We can
 +               * tell this is a parent because we won't know its
 +               * pool name yet (just its pool id).
 +               */
 +              if (rbd_dev->spec->pool_name)
 +                      rbd_warn(rbd_dev, "WARNING: kernel layering "
 +                                      "is EXPERIMENTAL!");
 +      }
 +
 +      /* If the image supports fancy striping, get its parameters */
 +
 +      if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
 +              ret = rbd_dev_v2_striping_info(rbd_dev);
                if (ret < 0)
                        goto out_err;
        }
  
        /* Get the snapshot context */
  
 -      ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
 +      ret = rbd_dev_v2_snap_context(rbd_dev);
        if (ret)
                goto out_err;
 -      rbd_dev->header.obj_version = ver;
 -
 -      rbd_dev->image_format = 2;
  
        dout("discovered version 2 image, header name is %s\n",
                rbd_dev->header_name);
@@@ -4606,54 -3964,22 +4606,54 @@@ out_err
        return ret;
  }
  
 -static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
 +static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
  {
 +      struct rbd_device *parent = NULL;
 +      struct rbd_spec *parent_spec;
 +      struct rbd_client *rbdc;
        int ret;
  
 -      /* no need to lock here, as rbd_dev is not registered yet */
 -      ret = rbd_dev_snaps_update(rbd_dev);
 -      if (ret)
 -              return ret;
 +      if (!rbd_dev->parent_spec)
 +              return 0;
 +      /*
 +       * We need to pass a reference to the client and the parent
 +       * spec when creating the parent rbd_dev.  Images related by
 +       * parent/child relationships always share both.
 +       */
 +      parent_spec = rbd_spec_get(rbd_dev->parent_spec);
 +      rbdc = __rbd_get_client(rbd_dev->rbd_client);
  
 -      ret = rbd_dev_probe_update_spec(rbd_dev);
 -      if (ret)
 -              goto err_out_snaps;
 +      ret = -ENOMEM;
 +      parent = rbd_dev_create(rbdc, parent_spec);
 +      if (!parent)
 +              goto out_err;
  
 -      ret = rbd_dev_set_mapping(rbd_dev);
 +      ret = rbd_dev_image_probe(parent);
 +      if (ret < 0)
 +              goto out_err;
 +      rbd_dev->parent = parent;
 +
 +      return 0;
 +out_err:
 +      if (parent) {
 +              rbd_spec_put(rbd_dev->parent_spec);
 +              kfree(rbd_dev->header_name);
 +              rbd_dev_destroy(parent);
 +      } else {
 +              rbd_put_client(rbdc);
 +              rbd_spec_put(parent_spec);
 +      }
 +
 +      return ret;
 +}
 +
 +static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
 +{
 +      int ret;
 +
 +      ret = rbd_dev_mapping_set(rbd_dev);
        if (ret)
 -              goto err_out_snaps;
 +              return ret;
  
        /* generate unique id: find highest unique id, add one */
        rbd_dev_id_get(rbd_dev);
        if (ret)
                goto err_out_disk;
  
 -      /*
 -       * At this point cleanup in the event of an error is the job
 -       * of the sysfs code (initiated by rbd_bus_del_dev()).
 -       */
 -      down_write(&rbd_dev->header_rwsem);
 -      ret = rbd_dev_snaps_register(rbd_dev);
 -      up_write(&rbd_dev->header_rwsem);
 -      if (ret)
 -              goto err_out_bus;
 -
 -      ret = rbd_dev_header_watch_sync(rbd_dev, 1);
 -      if (ret)
 -              goto err_out_bus;
 -
        /* Everything's ready.  Announce the disk to the world. */
  
 +      set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
 +      set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
        add_disk(rbd_dev->disk);
  
        pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
                (unsigned long long) rbd_dev->mapping.size);
  
        return ret;
 -err_out_bus:
 -      /* this will also clean up rest of rbd_dev stuff */
 -
 -      rbd_bus_del_dev(rbd_dev);
  
 -      return ret;
  err_out_disk:
        rbd_free_disk(rbd_dev);
  err_out_blkdev:
        unregister_blkdev(rbd_dev->major, rbd_dev->name);
  err_out_id:
        rbd_dev_id_put(rbd_dev);
 -err_out_snaps:
 -      rbd_remove_all_snaps(rbd_dev);
 +      rbd_dev_mapping_clear(rbd_dev);
  
        return ret;
  }
  
 +static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 +{
 +      struct rbd_spec *spec = rbd_dev->spec;
 +      size_t size;
 +
 +      /* Record the header object name for this rbd image. */
 +
 +      rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 +
 +      if (rbd_dev->image_format == 1)
 +              size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
 +      else
 +              size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
 +
 +      rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
 +      if (!rbd_dev->header_name)
 +              return -ENOMEM;
 +
 +      if (rbd_dev->image_format == 1)
 +              sprintf(rbd_dev->header_name, "%s%s",
 +                      spec->image_name, RBD_SUFFIX);
 +      else
 +              sprintf(rbd_dev->header_name, "%s%s",
 +                      RBD_HEADER_PREFIX, spec->image_id);
 +      return 0;
 +}
 +
 +static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 +{
 +      int ret;
 +
 +      rbd_dev_unprobe(rbd_dev);
 +      ret = rbd_dev_header_watch_sync(rbd_dev, 0);
 +      if (ret)
 +              rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
 +      kfree(rbd_dev->header_name);
 +      rbd_dev->header_name = NULL;
 +      rbd_dev->image_format = 0;
 +      kfree(rbd_dev->spec->image_id);
 +      rbd_dev->spec->image_id = NULL;
 +
 +      rbd_dev_destroy(rbd_dev);
 +}
 +
  /*
   * Probe for the existence of the header object for the given rbd
   * device.  For format 2 images this includes determining the image
   * id.
   */
 -static int rbd_dev_probe(struct rbd_device *rbd_dev)
 +static int rbd_dev_image_probe(struct rbd_device *rbd_dev)
  {
        int ret;
 +      int tmp;
  
        /*
         * Get the id from the image id object.  If it's not a
         */
        ret = rbd_dev_image_id(rbd_dev);
        if (ret)
 +              return ret;
 +      rbd_assert(rbd_dev->spec->image_id);
 +      rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 +
 +      ret = rbd_dev_header_name(rbd_dev);
 +      if (ret)
 +              goto err_out_format;
 +
 +      ret = rbd_dev_header_watch_sync(rbd_dev, 1);
 +      if (ret)
 +              goto out_header_name;
 +
 +      if (rbd_dev->image_format == 1)
                ret = rbd_dev_v1_probe(rbd_dev);
        else
                ret = rbd_dev_v2_probe(rbd_dev);
 -      if (ret) {
 -              dout("probe failed, returning %d\n", ret);
 -
 -              return ret;
 -      }
 +      if (ret)
 +              goto err_out_watch;
  
 -      ret = rbd_dev_probe_finish(rbd_dev);
 +      ret = rbd_dev_spec_update(rbd_dev);
        if (ret)
 -              rbd_header_free(&rbd_dev->header);
 +              goto err_out_probe;
 +
 +      ret = rbd_dev_probe_parent(rbd_dev);
 +      if (!ret)
 +              return 0;
 +
 +err_out_probe:
 +      rbd_dev_unprobe(rbd_dev);
 +err_out_watch:
 +      tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
 +      if (tmp)
 +              rbd_warn(rbd_dev, "unable to tear down watch request\n");
 +out_header_name:
 +      kfree(rbd_dev->header_name);
 +      rbd_dev->header_name = NULL;
 +err_out_format:
 +      rbd_dev->image_format = 0;
 +      kfree(rbd_dev->spec->image_id);
 +      rbd_dev->spec->image_id = NULL;
 +
 +      dout("probe failed, returning %d\n", ret);
  
        return ret;
  }
@@@ -4841,13 -4110,11 +4841,13 @@@ static ssize_t rbd_add(struct bus_type 
        rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
        if (rc < 0)
                goto err_out_client;
 -      spec->pool_id = (u64) rc;
 +      spec->pool_id = (u64)rc;
  
        /* The ceph file layout needs to fit pool id in 32 bits */
  
 -      if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
 +      if (spec->pool_id > (u64)U32_MAX) {
 +              rbd_warn(NULL, "pool id too large (%llu > %u)\n",
 +                              (unsigned long long)spec->pool_id, U32_MAX);
                rc = -EIO;
                goto err_out_client;
        }
        kfree(rbd_opts);
        rbd_opts = NULL;        /* done with this */
  
 -      rc = rbd_dev_probe(rbd_dev);
 +      rc = rbd_dev_image_probe(rbd_dev);
        if (rc < 0)
                goto err_out_rbd_dev;
  
 -      return count;
 +      rc = rbd_dev_device_setup(rbd_dev);
 +      if (!rc)
 +              return count;
 +
 +      rbd_dev_image_release(rbd_dev);
  err_out_rbd_dev:
        rbd_dev_destroy(rbd_dev);
  err_out_client:
@@@ -4885,7 -4148,7 +4885,7 @@@ err_out_module
  
        dout("Error adding device %s\n", buf);
  
 -      return (ssize_t) rc;
 +      return (ssize_t)rc;
  }
  
  static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
        return NULL;
  }
  
 -static void rbd_dev_release(struct device *dev)
 +static void rbd_dev_device_release(struct device *dev)
  {
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
  
 -      if (rbd_dev->watch_event)
 -              rbd_dev_header_watch_sync(rbd_dev, 0);
 -
 -      /* clean up and free blkdev */
        rbd_free_disk(rbd_dev);
 +      clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
 +      rbd_dev_clear_mapping(rbd_dev);
        unregister_blkdev(rbd_dev->major, rbd_dev->name);
 -
 -      /* release allocated disk header fields */
 -      rbd_header_free(&rbd_dev->header);
 -
 -      /* done with the id, and with the rbd_dev */
 +      rbd_dev->major = 0;
        rbd_dev_id_put(rbd_dev);
 -      rbd_assert(rbd_dev->rbd_client != NULL);
 -      rbd_dev_destroy(rbd_dev);
 +      rbd_dev_mapping_clear(rbd_dev);
 +}
  
 -      /* release module ref */
 -      module_put(THIS_MODULE);
 +static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
 +{
 +      while (rbd_dev->parent) {
 +              struct rbd_device *first = rbd_dev;
 +              struct rbd_device *second = first->parent;
 +              struct rbd_device *third;
 +
 +              /*
 +               * Follow to the parent with no grandparent and
 +               * remove it.
 +               */
 +              while (second && (third = second->parent)) {
 +                      first = second;
 +                      second = third;
 +              }
 +              rbd_assert(second);
 +              rbd_dev_image_release(second);
 +              first->parent = NULL;
 +              first->parent_overlap = 0;
 +
 +              rbd_assert(first->parent_spec);
 +              rbd_spec_put(first->parent_spec);
 +              first->parent_spec = NULL;
 +      }
  }
  
  static ssize_t rbd_remove(struct bus_type *bus,
                          size_t count)
  {
        struct rbd_device *rbd_dev = NULL;
 -      int target_id, rc;
 +      int target_id;
        unsigned long ul;
 -      int ret = count;
 +      int ret;
  
 -      rc = strict_strtoul(buf, 10, &ul);
 -      if (rc)
 -              return rc;
 +      ret = strict_strtoul(buf, 10, &ul);
 +      if (ret)
 +              return ret;
  
        /* convert to int; abort if we lost anything in the conversion */
        target_id = (int) ul;
        spin_unlock_irq(&rbd_dev->lock);
        if (ret < 0)
                goto done;
 -
 -      rbd_remove_all_snaps(rbd_dev);
 +      ret = count;
        rbd_bus_del_dev(rbd_dev);
 -
 +      rbd_dev_image_release(rbd_dev);
 +      module_put(THIS_MODULE);
  done:
        mutex_unlock(&ctl_mutex);
  
@@@ -5013,56 -4260,6 +5013,56 @@@ static void rbd_sysfs_cleanup(void
        device_unregister(&rbd_root_dev);
  }
  
 +static int rbd_slab_init(void)
 +{
 +      rbd_assert(!rbd_img_request_cache);
 +      rbd_img_request_cache = kmem_cache_create("rbd_img_request",
 +                                      sizeof (struct rbd_img_request),
 +                                      __alignof__(struct rbd_img_request),
 +                                      0, NULL);
 +      if (!rbd_img_request_cache)
 +              return -ENOMEM;
 +
 +      rbd_assert(!rbd_obj_request_cache);
 +      rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
 +                                      sizeof (struct rbd_obj_request),
 +                                      __alignof__(struct rbd_obj_request),
 +                                      0, NULL);
 +      if (!rbd_obj_request_cache)
 +              goto out_err;
 +
 +      rbd_assert(!rbd_segment_name_cache);
 +      rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
 +                                      MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
 +      if (rbd_segment_name_cache)
 +              return 0;
 +out_err:
 +      if (rbd_obj_request_cache) {
 +              kmem_cache_destroy(rbd_obj_request_cache);
 +              rbd_obj_request_cache = NULL;
 +      }
 +
 +      kmem_cache_destroy(rbd_img_request_cache);
 +      rbd_img_request_cache = NULL;
 +
 +      return -ENOMEM;
 +}
 +
 +static void rbd_slab_exit(void)
 +{
 +      rbd_assert(rbd_segment_name_cache);
 +      kmem_cache_destroy(rbd_segment_name_cache);
 +      rbd_segment_name_cache = NULL;
 +
 +      rbd_assert(rbd_obj_request_cache);
 +      kmem_cache_destroy(rbd_obj_request_cache);
 +      rbd_obj_request_cache = NULL;
 +
 +      rbd_assert(rbd_img_request_cache);
 +      kmem_cache_destroy(rbd_img_request_cache);
 +      rbd_img_request_cache = NULL;
 +}
 +
  static int __init rbd_init(void)
  {
        int rc;
  
                return -EINVAL;
        }
 -      rc = rbd_sysfs_init();
 +      rc = rbd_slab_init();
        if (rc)
                return rc;
 -      pr_info("loaded " RBD_DRV_NAME_LONG "\n");
 -      return 0;
 +      rc = rbd_sysfs_init();
 +      if (rc)
 +              rbd_slab_exit();
 +      else
 +              pr_info("loaded " RBD_DRV_NAME_LONG "\n");
 +
 +      return rc;
  }
  
  static void __exit rbd_exit(void)
  {
        rbd_sysfs_cleanup();
 +      rbd_slab_exit();
  }
  
  module_init(rbd_init);
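
The rbd_slab_init()/rbd_slab_exit() pair added above follows the usual unwind idiom: create each cache in order, branch to an error label that destroys only what was already created when a later step fails, and tear everything down in reverse on module exit. A minimal user-space sketch of that idiom, with purely illustrative names and plain malloc()/free() standing in for the slab caches:

    #include <stdio.h>
    #include <stdlib.h>

    static char *buf_a, *buf_b, *buf_c;

    /* Acquire three resources in order; on failure release only what we got. */
    static int demo_init(void)
    {
            buf_a = malloc(64);
            if (!buf_a)
                    return -1;

            buf_b = malloc(64);
            if (!buf_b)
                    goto out_free_a;

            buf_c = malloc(64);
            if (!buf_c)
                    goto out_free_b;

            return 0;

    out_free_b:
            free(buf_b);
            buf_b = NULL;
    out_free_a:
            free(buf_a);
            buf_a = NULL;
            return -1;
    }

    /* Release in reverse order of acquisition. */
    static void demo_exit(void)
    {
            free(buf_c);
            free(buf_b);
            free(buf_a);
            buf_c = buf_b = buf_a = NULL;
    }

    int main(void)
    {
            if (demo_init())
                    return 1;
            puts("initialized");
            demo_exit();
            return 0;
    }
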
diff --combined drivers/md/md.c
index 6330c727396cd6071f85e1a20bb4103d6aeeb322,1d03ebde40b51885cd63950d695b19fe5b9214de..681d1099a2d58936864b3b63610a31f38a908219
@@@ -72,9 -72,6 +72,9 @@@ static DECLARE_WAIT_QUEUE_HEAD(resync_w
  static struct workqueue_struct *md_wq;
  static struct workqueue_struct *md_misc_wq;
  
 +static int remove_and_add_spares(struct mddev *mddev,
 +                               struct md_rdev *this);
 +
  #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
  
  /*
@@@ -197,21 -194,12 +197,12 @@@ void md_trim_bio(struct bio *bio, int o
        if (offset == 0 && size == bio->bi_size)
                return;
  
-       bio->bi_sector += offset;
-       bio->bi_size = size;
-       offset <<= 9;
        clear_bit(BIO_SEG_VALID, &bio->bi_flags);
  
-       while (bio->bi_idx < bio->bi_vcnt &&
-              bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
-               /* remove this whole bio_vec */
-               offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
-               bio->bi_idx++;
-       }
-       if (bio->bi_idx < bio->bi_vcnt) {
-               bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
-               bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
-       }
+       bio_advance(bio, offset << 9);
+       bio->bi_size = size;
        /* avoid any complications with bi_idx being non-zero*/
        if (bio->bi_idx) {
                memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
@@@ -1567,8 -1555,8 +1558,8 @@@ static int super_1_load(struct md_rdev 
                                             sector, count, 1) == 0)
                                return -EINVAL;
                }
 -      } else if (sb->bblog_offset == 0)
 -              rdev->badblocks.shift = -1;
 +      } else if (sb->bblog_offset != 0)
 +              rdev->badblocks.shift = 0;
  
        if (!refdev) {
                ret = 1;
@@@ -2414,11 -2402,6 +2405,11 @@@ static void md_update_sb(struct mddev 
        int nospares = 0;
        int any_badblocks_changed = 0;
  
 +      if (mddev->ro) {
 +              if (force_change)
 +                      set_bit(MD_CHANGE_DEVS, &mddev->flags);
 +              return;
 +      }
  repeat:
        /* First make sure individual recovery_offsets are correct */
        rdev_for_each(rdev, mddev) {
@@@ -2808,10 -2791,12 +2799,10 @@@ slot_store(struct md_rdev *rdev, const 
                /* personality does all needed checks */
                if (rdev->mddev->pers->hot_remove_disk == NULL)
                        return -EINVAL;
 -              err = rdev->mddev->pers->
 -                      hot_remove_disk(rdev->mddev, rdev);
 -              if (err)
 -                      return err;
 -              sysfs_unlink_rdev(rdev->mddev, rdev);
 -              rdev->raid_disk = -1;
 +              clear_bit(Blocked, &rdev->flags);
 +              remove_and_add_spares(rdev->mddev, rdev);
 +              if (rdev->raid_disk >= 0)
 +                      return -EBUSY;
                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
                md_wakeup_thread(rdev->mddev->thread);
        } else if (rdev->mddev->pers) {
@@@ -3227,7 -3212,7 +3218,7 @@@ int md_rdev_init(struct md_rdev *rdev
         * be used - I wonder if that matters
         */
        rdev->badblocks.count = 0;
 -      rdev->badblocks.shift = 0;
 +      rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
        rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
        seqlock_init(&rdev->badblocks.lock);
        if (rdev->badblocks.page == NULL)
@@@ -3299,6 -3284,9 +3290,6 @@@ static struct md_rdev *md_import_device
                        goto abort_free;
                }
        }
 -      if (super_format == -1)
 -              /* hot-add for 0.90, or non-persistent: so no badblocks */
 -              rdev->badblocks.shift = -1;
  
        return rdev;
  
@@@ -4228,6 -4216,8 +4219,6 @@@ action_show(struct mddev *mddev, char *
        return sprintf(page, "%s\n", type);
  }
  
 -static void reap_sync_thread(struct mddev *mddev);
 -
  static ssize_t
  action_store(struct mddev *mddev, const char *page, size_t len)
  {
        if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
                if (mddev->sync_thread) {
                        set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 -                      reap_sync_thread(mddev);
 +                      md_reap_sync_thread(mddev);
                }
        } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
                   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@@ -5280,7 -5270,7 +5271,7 @@@ static void __md_stop_writes(struct mdd
        if (mddev->sync_thread) {
                set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 -              reap_sync_thread(mddev);
 +              md_reap_sync_thread(mddev);
        }
  
        del_timer_sync(&mddev->safemode_timer);
        bitmap_flush(mddev);
        md_super_wait(mddev);
  
 -      if (!mddev->in_sync || mddev->flags) {
 +      if (mddev->ro == 0 &&
 +          (!mddev->in_sync || mddev->flags)) {
                /* mark array as shutdown cleanly */
                mddev->in_sync = 1;
                md_update_sb(mddev, 1);
@@@ -5812,7 -5801,7 +5803,7 @@@ static int add_new_disk(struct mddev * 
                else
                        sysfs_notify_dirent_safe(rdev->sysfs_state);
  
 -              md_update_sb(mddev, 1);
 +              set_bit(MD_CHANGE_DEVS, &mddev->flags);
                if (mddev->degraded)
                        set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@@ -5879,9 -5868,6 +5870,9 @@@ static int hot_remove_disk(struct mdde
        if (!rdev)
                return -ENXIO;
  
 +      clear_bit(Blocked, &rdev->flags);
 +      remove_and_add_spares(mddev, rdev);
 +
        if (rdev->raid_disk >= 0)
                goto busy;
  
@@@ -6495,28 -6481,6 +6486,28 @@@ static int md_ioctl(struct block_devic
                err = md_set_readonly(mddev, bdev);
                goto done_unlock;
  
 +      case HOT_REMOVE_DISK:
 +              err = hot_remove_disk(mddev, new_decode_dev(arg));
 +              goto done_unlock;
 +
 +      case ADD_NEW_DISK:
 +              /* We can support ADD_NEW_DISK on read-only arrays
 +               * only if we are re-adding a preexisting device.
 +               * So require mddev->pers and MD_DISK_SYNC.
 +               */
 +              if (mddev->pers) {
 +                      mdu_disk_info_t info;
 +                      if (copy_from_user(&info, argp, sizeof(info)))
 +                              err = -EFAULT;
 +                      else if (!(info.state & (1<<MD_DISK_SYNC)))
 +                              /* Need to clear read-only for this */
 +                              break;
 +                      else
 +                              err = add_new_disk(mddev, &info);
 +                      goto done_unlock;
 +              }
 +              break;
 +
        case BLKROSET:
                if (get_user(ro, (int __user *)(arg))) {
                        err = -EFAULT;
                goto done_unlock;
        }
  
 -      case HOT_REMOVE_DISK:
 -              err = hot_remove_disk(mddev, new_decode_dev(arg));
 -              goto done_unlock;
 -
        case HOT_ADD_DISK:
                err = hot_add_disk(mddev, new_decode_dev(arg));
                goto done_unlock;
@@@ -6674,13 -6642,15 +6665,13 @@@ static int md_open(struct block_device 
        return err;
  }
  
 -static int md_release(struct gendisk *disk, fmode_t mode)
 +static void md_release(struct gendisk *disk, fmode_t mode)
  {
        struct mddev *mddev = disk->private_data;
  
        BUG_ON(!mddev);
        atomic_dec(&mddev->openers);
        mddev_put(mddev);
 -
 -      return 0;
  }
  
  static int md_media_changed(struct gendisk *disk)
@@@ -7665,16 -7635,14 +7656,16 @@@ void md_do_sync(struct md_thread *threa
  }
  EXPORT_SYMBOL_GPL(md_do_sync);
  
 -static int remove_and_add_spares(struct mddev *mddev)
 +static int remove_and_add_spares(struct mddev *mddev,
 +                               struct md_rdev *this)
  {
        struct md_rdev *rdev;
        int spares = 0;
        int removed = 0;
  
        rdev_for_each(rdev, mddev)
 -              if (rdev->raid_disk >= 0 &&
 +              if ((this == NULL || rdev == this) &&
 +                  rdev->raid_disk >= 0 &&
                    !test_bit(Blocked, &rdev->flags) &&
                    (test_bit(Faulty, &rdev->flags) ||
                     ! test_bit(In_sync, &rdev->flags)) &&
        if (removed && mddev->kobj.sd)
                sysfs_notify(&mddev->kobj, NULL, "degraded");
  
 +      if (this)
 +              goto no_add;
 +
        rdev_for_each(rdev, mddev) {
                if (rdev->raid_disk >= 0 &&
                    !test_bit(In_sync, &rdev->flags) &&
                    !test_bit(Faulty, &rdev->flags))
                        spares++;
 -              if (rdev->raid_disk < 0
 -                  && !test_bit(Faulty, &rdev->flags)) {
 -                      rdev->recovery_offset = 0;
 -                      if (mddev->pers->
 -                          hot_add_disk(mddev, rdev) == 0) {
 -                              if (sysfs_link_rdev(mddev, rdev))
 -                                      /* failure here is OK */;
 -                              spares++;
 -                              md_new_event(mddev);
 -                              set_bit(MD_CHANGE_DEVS, &mddev->flags);
 -                      }
 +              if (rdev->raid_disk >= 0)
 +                      continue;
 +              if (test_bit(Faulty, &rdev->flags))
 +                      continue;
 +              if (mddev->ro &&
 +                  rdev->saved_raid_disk < 0)
 +                      continue;
 +
 +              rdev->recovery_offset = 0;
 +              if (rdev->saved_raid_disk >= 0 && mddev->in_sync) {
 +                      spin_lock_irq(&mddev->write_lock);
 +                      if (mddev->in_sync)
 +                              /* OK, this device, which is in_sync,
 +                               * will definitely be noticed before
 +                               * the next write, so recovery isn't
 +                               * needed.
 +                               */
 +                              rdev->recovery_offset = mddev->recovery_cp;
 +                      spin_unlock_irq(&mddev->write_lock);
 +              }
 +              if (mddev->ro && rdev->recovery_offset != MaxSector)
 +                      /* not safe to add this disk now */
 +                      continue;
 +              if (mddev->pers->
 +                  hot_add_disk(mddev, rdev) == 0) {
 +                      if (sysfs_link_rdev(mddev, rdev))
 +                              /* failure here is OK */;
 +                      spares++;
 +                      md_new_event(mddev);
 +                      set_bit(MD_CHANGE_DEVS, &mddev->flags);
                }
        }
 +no_add:
        if (removed)
                set_bit(MD_CHANGE_DEVS, &mddev->flags);
        return spares;
  }
  
 -static void reap_sync_thread(struct mddev *mddev)
 -{
 -      struct md_rdev *rdev;
 -
 -      /* resync has finished, collect result */
 -      md_unregister_thread(&mddev->sync_thread);
 -      if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 -          !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 -              /* success...*/
 -              /* activate any spares */
 -              if (mddev->pers->spare_active(mddev)) {
 -                      sysfs_notify(&mddev->kobj, NULL,
 -                                   "degraded");
 -                      set_bit(MD_CHANGE_DEVS, &mddev->flags);
 -              }
 -      }
 -      if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 -          mddev->pers->finish_reshape)
 -              mddev->pers->finish_reshape(mddev);
 -
 -      /* If array is no-longer degraded, then any saved_raid_disk
 -       * information must be scrapped.  Also if any device is now
 -       * In_sync we must scrape the saved_raid_disk for that device
 -       * do the superblock for an incrementally recovered device
 -       * written out.
 -       */
 -      rdev_for_each(rdev, mddev)
 -              if (!mddev->degraded ||
 -                  test_bit(In_sync, &rdev->flags))
 -                      rdev->saved_raid_disk = -1;
 -
 -      md_update_sb(mddev, 1);
 -      clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 -      clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 -      clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 -      clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
 -      clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 -      /* flag recovery needed just to double check */
 -      set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 -      sysfs_notify_dirent_safe(mddev->sysfs_action);
 -      md_new_event(mddev);
 -      if (mddev->event_work.func)
 -              queue_work(md_misc_wq, &mddev->event_work);
 -}
 -
  /*
   * This routine is regularly called by all per-raid-array threads to
   * deal with generic issues like resync and super-block update.
@@@ -7790,16 -7780,22 +7781,16 @@@ void md_check_recovery(struct mddev *md
                int spares = 0;
  
                if (mddev->ro) {
 -                      /* Only thing we do on a ro array is remove
 -                       * failed devices.
 +                      /* On a read-only array we can:
 +                       * - remove failed devices
 +                       * - add already-in_sync devices if the array itself
 +                       *   is in-sync.
 +                       * As we only add devices that are already in-sync,
 +                       * we can activate the spares immediately.
                         */
 -                      struct md_rdev *rdev;
 -                      rdev_for_each(rdev, mddev)
 -                              if (rdev->raid_disk >= 0 &&
 -                                  !test_bit(Blocked, &rdev->flags) &&
 -                                  test_bit(Faulty, &rdev->flags) &&
 -                                  atomic_read(&rdev->nr_pending)==0) {
 -                                      if (mddev->pers->hot_remove_disk(
 -                                                  mddev, rdev) == 0) {
 -                                              sysfs_unlink_rdev(mddev, rdev);
 -                                              rdev->raid_disk = -1;
 -                                      }
 -                              }
                        clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 +                      remove_and_add_spares(mddev, NULL);
 +                      mddev->pers->spare_active(mddev);
                        goto unlock;
                }
  
                        goto unlock;
                }
                if (mddev->sync_thread) {
 -                      reap_sync_thread(mddev);
 +                      md_reap_sync_thread(mddev);
                        goto unlock;
                }
                /* Set RUNNING before clearing NEEDED to avoid
                                goto unlock;
                        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
                        clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 -              } else if ((spares = remove_and_add_spares(mddev))) {
 +              } else if ((spares = remove_and_add_spares(mddev, NULL))) {
                        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
                        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
                        clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
        }
  }
  
 +void md_reap_sync_thread(struct mddev *mddev)
 +{
 +      struct md_rdev *rdev;
 +
 +      /* resync has finished, collect result */
 +      md_unregister_thread(&mddev->sync_thread);
 +      if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 +          !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 +              /* success...*/
 +              /* activate any spares */
 +              if (mddev->pers->spare_active(mddev)) {
 +                      sysfs_notify(&mddev->kobj, NULL,
 +                                   "degraded");
 +                      set_bit(MD_CHANGE_DEVS, &mddev->flags);
 +              }
 +      }
 +      if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 +          mddev->pers->finish_reshape)
 +              mddev->pers->finish_reshape(mddev);
 +
 +      /* If the array is no longer degraded, then any saved_raid_disk
 +       * information must be scrapped.  Also if any device is now
 +       * In_sync we must scrap the saved_raid_disk for that device
 +       * so the superblock for an incrementally recovered device
 +       * gets written out.
 +       */
 +      rdev_for_each(rdev, mddev)
 +              if (!mddev->degraded ||
 +                  test_bit(In_sync, &rdev->flags))
 +                      rdev->saved_raid_disk = -1;
 +
 +      md_update_sb(mddev, 1);
 +      clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 +      clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 +      clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 +      clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
 +      clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 +      /* flag recovery needed just to double check */
 +      set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 +      sysfs_notify_dirent_safe(mddev->sysfs_action);
 +      md_new_event(mddev);
 +      if (mddev->event_work.func)
 +              queue_work(md_misc_wq, &mddev->event_work);
 +}
 +
  void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
  {
        sysfs_notify_dirent_safe(rdev->sysfs_state);
@@@ -8682,7 -8633,6 +8673,7 @@@ EXPORT_SYMBOL(md_register_thread)
  EXPORT_SYMBOL(md_unregister_thread);
  EXPORT_SYMBOL(md_wakeup_thread);
  EXPORT_SYMBOL(md_check_recovery);
 +EXPORT_SYMBOL(md_reap_sync_thread);
  MODULE_LICENSE("GPL");
  MODULE_DESCRIPTION("MD RAID framework");
  MODULE_ALIAS("md");
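
The reworked remove_and_add_spares() above takes an optional second argument: passing NULL keeps the old behaviour of scanning every rdev, while passing a specific rdev restricts the pass to that one device, which is what hot_remove_disk() and slot_store() now rely on. A small stand-alone sketch of this nullable-filter pattern; the struct and function names below are invented for illustration and are not kernel APIs:

    #include <stdio.h>
    #include <stddef.h>

    struct item {
            int id;
            int faulty;
            int in_use;
    };

    /*
     * Retire faulty items.  If "only" is NULL every item is considered;
     * otherwise the pass is restricted to that one item.
     */
    static int retire_faulty(struct item *items, size_t n, struct item *only)
    {
            int removed = 0;
            size_t i;

            for (i = 0; i < n; i++) {
                    struct item *it = &items[i];

                    if (only && it != only)
                            continue;
                    if (it->in_use && it->faulty) {
                            it->in_use = 0;
                            removed++;
                    }
            }
            return removed;
    }

    int main(void)
    {
            struct item items[] = {
                    { .id = 0, .faulty = 1, .in_use = 1 },
                    { .id = 1, .faulty = 0, .in_use = 1 },
                    { .id = 2, .faulty = 1, .in_use = 1 },
            };

            /* Restrict the pass to item 2, then sweep everything. */
            printf("one: %d\n", retire_faulty(items, 3, &items[2]));
            printf("all: %d\n", retire_faulty(items, 3, NULL));
            return 0;
    }
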
diff --combined drivers/md/raid1.c
index 851023e2ba5d5296824a46bdc12482056de648a1,aeb4e3f74791b3f1d163aa19c8f3ed34cfa0f284..55951182af73680d3b7f40d32cac1302062dbe74
@@@ -92,7 -92,6 +92,6 @@@ static void r1bio_pool_free(void *r1_bi
  static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
  {
        struct pool_info *pi = data;
-       struct page *page;
        struct r1bio *r1_bio;
        struct bio *bio;
        int i, j;
                j = 1;
        while(j--) {
                bio = r1_bio->bios[j];
-               for (i = 0; i < RESYNC_PAGES; i++) {
-                       page = alloc_page(gfp_flags);
-                       if (unlikely(!page))
-                               goto out_free_pages;
+               bio->bi_vcnt = RESYNC_PAGES;
  
-                       bio->bi_io_vec[i].bv_page = page;
-                       bio->bi_vcnt = i+1;
-               }
+               if (bio_alloc_pages(bio, gfp_flags))
+                       goto out_free_bio;
        }
        /* If not user-requests, copy the page pointers to all bios */
        if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
  
        return r1_bio;
  
- out_free_pages:
-       for (j=0 ; j < pi->raid_disks; j++)
-               for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
-                       put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
-       j = -1;
  out_free_bio:
        while (++j < pi->raid_disks)
                bio_put(r1_bio->bios[j]);
@@@ -267,7 -257,7 +257,7 @@@ static void raid_end_bio_io(struct r1bi
                         (bio_data_dir(bio) == WRITE) ? "write" : "read",
                         (unsigned long long) bio->bi_sector,
                         (unsigned long long) bio->bi_sector +
-                        (bio->bi_size >> 9) - 1);
+                        bio_sectors(bio) - 1);
  
                call_bio_endio(r1_bio);
        }
@@@ -458,7 -448,7 +448,7 @@@ static void raid1_end_write_request(str
                                         " %llu-%llu\n",
                                         (unsigned long long) mbio->bi_sector,
                                         (unsigned long long) mbio->bi_sector +
-                                        (mbio->bi_size >> 9) - 1);
+                                        bio_sectors(mbio) - 1);
                                call_bio_endio(r1_bio);
                        }
                }
@@@ -925,7 -915,7 +915,7 @@@ static void alloc_behind_pages(struct b
        if (unlikely(!bvecs))
                return;
  
-       bio_for_each_segment(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i) {
                bvecs[i] = *bvec;
                bvecs[i].bv_page = alloc_page(GFP_NOIO);
                if (unlikely(!bvecs[i].bv_page))
@@@ -981,12 -971,7 +971,12 @@@ static void raid1_unplug(struct blk_plu
        while (bio) { /* submit pending writes */
                struct bio *next = bio->bi_next;
                bio->bi_next = NULL;
 -              generic_make_request(bio);
 +              if (unlikely((bio->bi_rw & REQ_DISCARD) &&
 +                  !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
 +                      /* Just ignore it */
 +                      bio_endio(bio, 0);
 +              else
 +                      generic_make_request(bio);
                bio = next;
        }
        kfree(plug);
@@@ -1023,7 -1008,7 +1013,7 @@@ static void make_request(struct mddev *
        md_write_start(mddev, bio); /* wait on superblock update early */
  
        if (bio_data_dir(bio) == WRITE &&
-           bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
+           bio_end_sector(bio) > mddev->suspend_lo &&
            bio->bi_sector < mddev->suspend_hi) {
                /* As the suspend_* range is controlled by
                 * userspace, we want an interruptible
                        flush_signals(current);
                        prepare_to_wait(&conf->wait_barrier,
                                        &w, TASK_INTERRUPTIBLE);
-                       if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
+                       if (bio_end_sector(bio) <= mddev->suspend_lo ||
                            bio->bi_sector >= mddev->suspend_hi)
                                break;
                        schedule();
        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
  
        r1_bio->master_bio = bio;
-       r1_bio->sectors = bio->bi_size >> 9;
+       r1_bio->sectors = bio_sectors(bio);
        r1_bio->state = 0;
        r1_bio->mddev = mddev;
        r1_bio->sector = bio->bi_sector;
@@@ -1132,7 -1117,7 +1122,7 @@@ read_again
                        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
  
                        r1_bio->master_bio = bio;
-                       r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+                       r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                        r1_bio->state = 0;
                        r1_bio->mddev = mddev;
                        r1_bio->sector = bio->bi_sector + sectors_handled;
                        struct bio_vec *bvec;
                        int j;
  
-                       /* Yes, I really want the '__' version so that
-                        * we clear any unused pointer in the io_vec, rather
-                        * than leave them unchanged.  This is important
-                        * because when we come to free the pages, we won't
-                        * know the original bi_idx, so we just free
-                        * them all
+                       /*
+                        * We trimmed the bio, so _all is legit
                         */
-                       __bio_for_each_segment(bvec, mbio, j, 0)
+                       bio_for_each_segment_all(bvec, mbio, j)
                                bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
                        if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
                                atomic_inc(&r1_bio->behind_remaining);
        /* Mustn't call r1_bio_write_done before this next test,
         * as it could result in the bio being freed.
         */
-       if (sectors_handled < (bio->bi_size >> 9)) {
+       if (sectors_handled < bio_sectors(bio)) {
                r1_bio_write_done(r1_bio);
                /* We need another r1_bio.  It has already been counted
                 * in bio->bi_phys_segments
                 */
                r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
                r1_bio->master_bio = bio;
-               r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+               r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                r1_bio->state = 0;
                r1_bio->mddev = mddev;
                r1_bio->sector = bio->bi_sector + sectors_handled;
@@@ -1867,7 -1848,7 +1853,7 @@@ static int process_checks(struct r1bio 
                struct bio *sbio = r1_bio->bios[i];
                int size;
  
-               if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+               if (sbio->bi_end_io != end_sync_read)
                        continue;
  
                if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
                        continue;
                }
                /* fixup the bio for reuse */
+               bio_reset(sbio);
                sbio->bi_vcnt = vcnt;
                sbio->bi_size = r1_bio->sectors << 9;
-               sbio->bi_idx = 0;
-               sbio->bi_phys_segments = 0;
-               sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               sbio->bi_flags |= 1 << BIO_UPTODATE;
-               sbio->bi_next = NULL;
                sbio->bi_sector = r1_bio->sector +
                        conf->mirrors[i].rdev->data_offset;
                sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+               sbio->bi_end_io = end_sync_read;
+               sbio->bi_private = r1_bio;
                size = sbio->bi_size;
                for (j = 0; j < vcnt ; j++) {
                        struct bio_vec *bi;
                        else
                                bi->bv_len = size;
                        size -= PAGE_SIZE;
-                       memcpy(page_address(bi->bv_page),
-                              page_address(pbio->bi_io_vec[j].bv_page),
-                              PAGE_SIZE);
                }
+               bio_copy_data(sbio, pbio);
        }
        return 0;
  }
@@@ -1952,7 -1931,7 +1936,7 @@@ static void sync_request_write(struct m
                wbio->bi_rw = WRITE;
                wbio->bi_end_io = end_sync_write;
                atomic_inc(&r1_bio->remaining);
-               md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
  
                generic_make_request(wbio);
        }
@@@ -2064,32 -2043,11 +2048,11 @@@ static void fix_read_error(struct r1con
        }
  }
  
- static void bi_complete(struct bio *bio, int error)
- {
-       complete((struct completion *)bio->bi_private);
- }
- static int submit_bio_wait(int rw, struct bio *bio)
- {
-       struct completion event;
-       rw |= REQ_SYNC;
-       init_completion(&event);
-       bio->bi_private = &event;
-       bio->bi_end_io = bi_complete;
-       submit_bio(rw, bio);
-       wait_for_completion(&event);
-       return test_bit(BIO_UPTODATE, &bio->bi_flags);
- }
  static int narrow_write_error(struct r1bio *r1_bio, int i)
  {
        struct mddev *mddev = r1_bio->mddev;
        struct r1conf *conf = mddev->private;
        struct md_rdev *rdev = conf->mirrors[i].rdev;
-       int vcnt, idx;
-       struct bio_vec *vec;
  
        /* bio has the data to be written to device 'i' where
         * we just recently had a write error.
                   & ~(sector_t)(block_sectors - 1))
                - sector;
  
-       if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-               vcnt = r1_bio->behind_page_count;
-               vec = r1_bio->behind_bvecs;
-               idx = 0;
-               while (vec[idx].bv_page == NULL)
-                       idx++;
-       } else {
-               vcnt = r1_bio->master_bio->bi_vcnt;
-               vec = r1_bio->master_bio->bi_io_vec;
-               idx = r1_bio->master_bio->bi_idx;
-       }
        while (sect_to_write) {
                struct bio *wbio;
                if (sectors > sect_to_write)
                        sectors = sect_to_write;
                /* Write at 'sector' for 'sectors'*/
  
-               wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
-               memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
-               wbio->bi_sector = r1_bio->sector;
+               if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+                       unsigned vcnt = r1_bio->behind_page_count;
+                       struct bio_vec *vec = r1_bio->behind_bvecs;
+                       while (!vec->bv_page) {
+                               vec++;
+                               vcnt--;
+                       }
+                       wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
+                       memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
+                       wbio->bi_vcnt = vcnt;
+               } else {
+                       wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+               }
                wbio->bi_rw = WRITE;
-               wbio->bi_vcnt = vcnt;
+               wbio->bi_sector = r1_bio->sector;
                wbio->bi_size = r1_bio->sectors << 9;
-               wbio->bi_idx = idx;
  
                md_trim_bio(wbio, sector - r1_bio->sector, sectors);
                wbio->bi_sector += rdev->data_offset;
@@@ -2289,8 -2249,7 +2254,7 @@@ read_more
                        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
  
                        r1_bio->master_bio = mbio;
-                       r1_bio->sectors = (mbio->bi_size >> 9)
-                                         - sectors_handled;
+                       r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
                        r1_bio->state = 0;
                        set_bit(R1BIO_ReadError, &r1_bio->state);
                        r1_bio->mddev = mddev;
@@@ -2464,18 -2423,7 +2428,7 @@@ static sector_t sync_request(struct mdd
        for (i = 0; i < conf->raid_disks * 2; i++) {
                struct md_rdev *rdev;
                bio = r1_bio->bios[i];
-               /* take from bio_init */
-               bio->bi_next = NULL;
-               bio->bi_flags &= ~(BIO_POOL_MASK-1);
-               bio->bi_flags |= 1 << BIO_UPTODATE;
-               bio->bi_rw = READ;
-               bio->bi_vcnt = 0;
-               bio->bi_idx = 0;
-               bio->bi_phys_segments = 0;
-               bio->bi_size = 0;
-               bio->bi_end_io = NULL;
-               bio->bi_private = NULL;
+               bio_reset(bio);
  
                rdev = rcu_dereference(conf->mirrors[i].rdev);
                if (rdev == NULL ||
@@@ -2906,7 -2854,6 +2859,7 @@@ static int stop(struct mddev *mddev
        if (conf->r1bio_pool)
                mempool_destroy(conf->r1bio_pool);
        kfree(conf->mirrors);
 +      safe_put_page(conf->tmppage);
        kfree(conf->poolinfo);
        kfree(conf);
        mddev->private = NULL;
diff --combined drivers/md/raid10.c
index 018741ba93104d9ad432d7524a136cf7e69b2227,e32e8b1042f8e57e5ee77f7162af708717f9e06c..59d4daa5f4c7a32c245ef954f24650fe75084117
@@@ -1133,12 -1133,7 +1133,12 @@@ static void raid10_unplug(struct blk_pl
        while (bio) { /* submit pending writes */
                struct bio *next = bio->bi_next;
                bio->bi_next = NULL;
 -              generic_make_request(bio);
 +              if (unlikely((bio->bi_rw & REQ_DISCARD) &&
 +                  !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
 +                      /* Just ignore it */
 +                      bio_endio(bio, 0);
 +              else
 +                      generic_make_request(bio);
                bio = next;
        }
        kfree(plug);
@@@ -1174,14 -1169,13 +1174,13 @@@ static void make_request(struct mddev *
        /* If this request crosses a chunk boundary, we need to
         * split it.  This will only happen for 1 PAGE (or less) requests.
         */
-       if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
+       if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
                     > chunk_sects
                     && (conf->geo.near_copies < conf->geo.raid_disks
                         || conf->prev.near_copies < conf->prev.raid_disks))) {
                struct bio_pair *bp;
                /* Sanity check -- queue functions should prevent this happening */
-               if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
-                   bio->bi_idx != 0)
+               if (bio_segments(bio) > 1)
                        goto bad_map;
                /* This is a one page bio that upper layers
                 * refuse to split for us, so we need to split it.
        bad_map:
                printk("md/raid10:%s: make_request bug: can't convert block across chunks"
                       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
-                      (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+                      (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
  
                bio_io_error(bio);
                return;
         */
        wait_barrier(conf);
  
-       sectors = bio->bi_size >> 9;
+       sectors = bio_sectors(bio);
        while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
            bio->bi_sector < conf->reshape_progress &&
            bio->bi_sector + sectors > conf->reshape_progress) {
@@@ -1331,8 -1325,7 +1330,7 @@@ read_again
                        r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
  
                        r10_bio->master_bio = bio;
-                       r10_bio->sectors = ((bio->bi_size >> 9)
-                                           - sectors_handled);
+                       r10_bio->sectors = bio_sectors(bio) - sectors_handled;
                        r10_bio->state = 0;
                        r10_bio->mddev = mddev;
                        r10_bio->sector = bio->bi_sector + sectors_handled;
@@@ -1574,7 -1567,7 +1572,7 @@@ retry_write
         * after checking if we need to go around again.
         */
  
-       if (sectors_handled < (bio->bi_size >> 9)) {
+       if (sectors_handled < bio_sectors(bio)) {
                one_write_done(r10_bio);
                /* We need another r10_bio.  It has already been counted
                 * in bio->bi_phys_segments.
                r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
  
                r10_bio->master_bio = bio;
-               r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+               r10_bio->sectors = bio_sectors(bio) - sectors_handled;
  
                r10_bio->mddev = mddev;
                r10_bio->sector = bio->bi_sector + sectors_handled;
@@@ -2084,13 -2077,10 +2082,10 @@@ static void sync_request_write(struct m
                 * First we need to fixup bv_offset, bv_len and
                 * bi_vecs, as the read request might have corrupted these
                 */
+               bio_reset(tbio);
                tbio->bi_vcnt = vcnt;
                tbio->bi_size = r10_bio->sectors << 9;
-               tbio->bi_idx = 0;
-               tbio->bi_phys_segments = 0;
-               tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               tbio->bi_flags |= 1 << BIO_UPTODATE;
-               tbio->bi_next = NULL;
                tbio->bi_rw = WRITE;
                tbio->bi_private = r10_bio;
                tbio->bi_sector = r10_bio->devs[i].addr;
                d = r10_bio->devs[i].devnum;
                atomic_inc(&conf->mirrors[d].rdev->nr_pending);
                atomic_inc(&r10_bio->remaining);
-               md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
  
                tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
                tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
                d = r10_bio->devs[i].devnum;
                atomic_inc(&r10_bio->remaining);
                md_sync_acct(conf->mirrors[d].replacement->bdev,
-                            tbio->bi_size >> 9);
+                            bio_sectors(tbio));
                generic_make_request(tbio);
        }
  
@@@ -2259,13 -2249,13 +2254,13 @@@ static void recovery_request_write(stru
        wbio2 = r10_bio->devs[1].repl_bio;
        if (wbio->bi_end_io) {
                atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-               md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
                generic_make_request(wbio);
        }
        if (wbio2 && wbio2->bi_end_io) {
                atomic_inc(&conf->mirrors[d].replacement->nr_pending);
                md_sync_acct(conf->mirrors[d].replacement->bdev,
-                            wbio2->bi_size >> 9);
+                            bio_sectors(wbio2));
                generic_make_request(wbio2);
        }
  }
@@@ -2536,25 -2526,6 +2531,6 @@@ static void fix_read_error(struct r10co
        }
  }
  
- static void bi_complete(struct bio *bio, int error)
- {
-       complete((struct completion *)bio->bi_private);
- }
- static int submit_bio_wait(int rw, struct bio *bio)
- {
-       struct completion event;
-       rw |= REQ_SYNC;
-       init_completion(&event);
-       bio->bi_private = &event;
-       bio->bi_end_io = bi_complete;
-       submit_bio(rw, bio);
-       wait_for_completion(&event);
-       return test_bit(BIO_UPTODATE, &bio->bi_flags);
- }
  static int narrow_write_error(struct r10bio *r10_bio, int i)
  {
        struct bio *bio = r10_bio->master_bio;
@@@ -2695,8 -2666,7 +2671,7 @@@ read_more
                r10_bio = mempool_alloc(conf->r10bio_pool,
                                        GFP_NOIO);
                r10_bio->master_bio = mbio;
-               r10_bio->sectors = (mbio->bi_size >> 9)
-                       - sectors_handled;
+               r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
                r10_bio->state = 0;
                set_bit(R10BIO_ReadError,
                        &r10_bio->state);
@@@ -2918,22 -2888,6 +2893,22 @@@ static sector_t sync_request(struct mdd
                if (init_resync(conf))
                        return 0;
  
 +      /*
 +       * Allow skipping a full rebuild for incremental assembly
 +       * of a clean array, like RAID1 does.
 +       */
 +      if (mddev->bitmap == NULL &&
 +          mddev->recovery_cp == MaxSector &&
 +          !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 +          conf->fullsync == 0) {
 +              *skipped = 1;
 +              max_sector = mddev->dev_sectors;
 +              if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
 +                  test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 +                      max_sector = mddev->resync_max_sectors;
 +              return max_sector - sector_nr;
 +      }
 +
   skipped:
        max_sector = mddev->dev_sectors;
        if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
                                        }
                                }
                                bio = r10_bio->devs[0].bio;
+                               bio_reset(bio);
                                bio->bi_next = biolist;
                                biolist = bio;
                                bio->bi_private = r10_bio;
                                rdev = mirror->rdev;
                                if (!test_bit(In_sync, &rdev->flags)) {
                                        bio = r10_bio->devs[1].bio;
+                                       bio_reset(bio);
                                        bio->bi_next = biolist;
                                        biolist = bio;
                                        bio->bi_private = r10_bio;
                                if (rdev == NULL || bio == NULL ||
                                    test_bit(Faulty, &rdev->flags))
                                        break;
+                               bio_reset(bio);
                                bio->bi_next = biolist;
                                biolist = bio;
                                bio->bi_private = r10_bio;
                                r10_bio->devs[i].repl_bio->bi_end_io = NULL;
  
                        bio = r10_bio->devs[i].bio;
-                       bio->bi_end_io = NULL;
+                       bio_reset(bio);
                        clear_bit(BIO_UPTODATE, &bio->bi_flags);
                        if (conf->mirrors[d].rdev == NULL ||
                            test_bit(Faulty, &conf->mirrors[d].rdev->flags))
  
                        /* Need to set up for writing to the replacement */
                        bio = r10_bio->devs[i].repl_bio;
+                       bio_reset(bio);
                        clear_bit(BIO_UPTODATE, &bio->bi_flags);
  
                        sector = r10_bio->devs[i].addr;
                }
        }
  
-       for (bio = biolist; bio ; bio=bio->bi_next) {
-               bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               if (bio->bi_end_io)
-                       bio->bi_flags |= 1 << BIO_UPTODATE;
-               bio->bi_vcnt = 0;
-               bio->bi_idx = 0;
-               bio->bi_phys_segments = 0;
-               bio->bi_size = 0;
-       }
        nr_sectors = 0;
        if (sector_nr + max_sync < max_sector)
                max_sector = sector_nr + max_sync;
@@@ -3831,7 -3778,6 +3799,7 @@@ static int stop(struct mddev *mddev
  
        if (conf->r10bio_pool)
                mempool_destroy(conf->r10bio_pool);
 +      safe_put_page(conf->tmppage);
        kfree(conf->mirrors);
        kfree(conf);
        mddev->private = NULL;
@@@ -4411,7 -4357,6 +4379,6 @@@ read_more
        read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
        read_bio->bi_flags |= 1 << BIO_UPTODATE;
        read_bio->bi_vcnt = 0;
-       read_bio->bi_idx = 0;
        read_bio->bi_size = 0;
        r10_bio->master_bio = read_bio;
        r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
                }
                if (!rdev2 || test_bit(Faulty, &rdev2->flags))
                        continue;
+               bio_reset(b);
                b->bi_bdev = rdev2->bdev;
                b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
                b->bi_private = r10_bio;
                b->bi_end_io = end_reshape_write;
                b->bi_rw = WRITE;
-               b->bi_flags &= ~(BIO_POOL_MASK - 1);
-               b->bi_flags |= 1 << BIO_UPTODATE;
                b->bi_next = blist;
-               b->bi_vcnt = 0;
-               b->bi_idx = 0;
-               b->bi_size = 0;
                blist = b;
        }
  
diff --combined drivers/md/raid5.c
index 4a7be455d6d86ceb6bda86a332b81d036db52dee,2fefb9f2198e2269bafa26d62e60eb96dbb63aa0..9359828ffe264d3313ee77de993ea4c5147f1205
@@@ -90,7 -90,7 +90,7 @@@ static inline struct hlist_head *stripe
   */
  static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
  {
-       int sectors = bio->bi_size >> 9;
+       int sectors = bio_sectors(bio);
        if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
                return bio->bi_next;
        else
@@@ -184,8 -184,6 +184,8 @@@ static void return_io(struct bio *retur
                return_bi = bi->bi_next;
                bi->bi_next = NULL;
                bi->bi_size = 0;
 +              trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 +                                       bi, 0);
                bio_endio(bi, 0);
                bi = return_bi;
        }
@@@ -569,14 -567,6 +569,6 @@@ static void ops_run_io(struct stripe_he
                bi = &sh->dev[i].req;
                rbi = &sh->dev[i].rreq; /* For writing to replacement */
  
-               bi->bi_rw = rw;
-               rbi->bi_rw = rw;
-               if (rw & WRITE) {
-                       bi->bi_end_io = raid5_end_write_request;
-                       rbi->bi_end_io = raid5_end_write_request;
-               } else
-                       bi->bi_end_io = raid5_end_read_request;
                rcu_read_lock();
                rrdev = rcu_dereference(conf->disks[i].replacement);
                smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
  
                        set_bit(STRIPE_IO_STARTED, &sh->state);
  
+                       bio_reset(bi);
                        bi->bi_bdev = rdev->bdev;
+                       bi->bi_rw = rw;
+                       bi->bi_end_io = (rw & WRITE)
+                               ? raid5_end_write_request
+                               : raid5_end_read_request;
+                       bi->bi_private = sh;
                        pr_debug("%s: for %llu schedule op %ld on disc %d\n",
                                __func__, (unsigned long long)sh->sector,
                                bi->bi_rw, i);
                        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
                                bi->bi_rw |= REQ_FLUSH;
  
-                       bi->bi_flags = 1 << BIO_UPTODATE;
-                       bi->bi_idx = 0;
                        bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        bi->bi_io_vec[0].bv_offset = 0;
                        bi->bi_size = STRIPE_SIZE;
-                       bi->bi_next = NULL;
                        if (rrdev)
                                set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
  
  
                        set_bit(STRIPE_IO_STARTED, &sh->state);
  
+                       bio_reset(rbi);
                        rbi->bi_bdev = rrdev->bdev;
+                       rbi->bi_rw = rw;
+                       BUG_ON(!(rw & WRITE));
+                       rbi->bi_end_io = raid5_end_write_request;
+                       rbi->bi_private = sh;
                        pr_debug("%s: for %llu schedule op %ld on "
                                 "replacement disc %d\n",
                                __func__, (unsigned long long)sh->sector,
                        else
                                rbi->bi_sector = (sh->sector
                                                  + rrdev->data_offset);
-                       rbi->bi_flags = 1 << BIO_UPTODATE;
-                       rbi->bi_idx = 0;
                        rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        rbi->bi_io_vec[0].bv_offset = 0;
                        rbi->bi_size = STRIPE_SIZE;
-                       rbi->bi_next = NULL;
                        if (conf->mddev->gendisk)
                                trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
                                                      rbi, disk_devt(conf->mddev->gendisk),
@@@ -1887,15 -1884,8 +1886,15 @@@ static void raid5_end_write_request(str
                                        &rdev->mddev->recovery);
                } else if (is_badblock(rdev, sh->sector,
                                       STRIPE_SECTORS,
 -                                     &first_bad, &bad_sectors))
 +                                     &first_bad, &bad_sectors)) {
                        set_bit(R5_MadeGood, &sh->dev[i].flags);
 +                      if (test_bit(R5_ReadError, &sh->dev[i].flags))
 +                              /* That was a successful write so make
 +                               * sure it looks like we already did
 +                               * a re-write.
 +                               */
 +                              set_bit(R5_ReWrite, &sh->dev[i].flags);
 +              }
        }
        rdev_dec_pending(rdev, conf->mddev);
  
@@@ -2402,11 -2392,11 +2401,11 @@@ static int add_stripe_bio(struct stripe
        } else
                bip = &sh->dev[dd_idx].toread;
        while (*bip && (*bip)->bi_sector < bi->bi_sector) {
-               if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
+               if (bio_end_sector(*bip) > bi->bi_sector)
                        goto overlap;
                bip = & (*bip)->bi_next;
        }
-       if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
+       if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
                goto overlap;
  
        BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
                     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
                             bi && bi->bi_sector <= sector;
                     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
-                       if (bi->bi_sector + (bi->bi_size>>9) >= sector)
-                               sector = bi->bi_sector + (bi->bi_size>>9);
+                       if (bio_end_sector(bi) >= sector)
+                               sector = bio_end_sector(bi);
                }
                if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
                        set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
@@@ -3849,7 -3839,7 +3848,7 @@@ static int in_chunk_boundary(struct mdd
  {
        sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
        unsigned int chunk_sectors = mddev->chunk_sectors;
-       unsigned int bio_sectors = bio->bi_size >> 9;
+       unsigned int bio_sectors = bio_sectors(bio);
  
        if (mddev->new_chunk_sectors < mddev->chunk_sectors)
                chunk_sectors = mddev->new_chunk_sectors;
@@@ -3923,8 -3913,6 +3922,8 @@@ static void raid5_align_endio(struct bi
        rdev_dec_pending(rdev, conf->mddev);
  
        if (!error && uptodate) {
 +              trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
 +                                       raid_bi, 0);
                bio_endio(raid_bi, 0);
                if (atomic_dec_and_test(&conf->active_aligned_reads))
                        wake_up(&conf->wait_for_stripe);
@@@ -3941,7 -3929,7 +3940,7 @@@ static int bio_fits_rdev(struct bio *bi
  {
        struct request_queue *q = bdev_get_queue(bi->bi_bdev);
  
-       if ((bi->bi_size>>9) > queue_max_sectors(q))
+       if (bio_sectors(bi) > queue_max_sectors(q))
                return 0;
        blk_recount_segments(q, bi);
        if (bi->bi_phys_segments > queue_max_segments(q))
@@@ -3988,7 -3976,7 +3987,7 @@@ static int chunk_aligned_read(struct md
                                                    0,
                                                    &dd_idx, NULL);
  
-       end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
+       end_sector = bio_end_sector(align_bi);
        rcu_read_lock();
        rdev = rcu_dereference(conf->disks[dd_idx].replacement);
        if (!rdev || test_bit(Faulty, &rdev->flags) ||
                align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
  
                if (!bio_fits_rdev(align_bi) ||
-                   is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
+                   is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
                                &first_bad, &bad_sectors)) {
                        /* too big in some way, or has a known bad block */
                        bio_put(align_bi);
@@@ -4273,7 -4261,7 +4272,7 @@@ static void make_request(struct mddev *
        }
  
        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-       last_sector = bi->bi_sector + (bi->bi_size>>9);
+       last_sector = bio_end_sector(bi);
        bi->bi_next = NULL;
        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
  
                if ( rw == WRITE )
                        md_write_end(mddev);
  
 +              trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 +                                       bi, 0);
                bio_endio(bi, 0);
        }
  }
@@@ -4679,10 -4665,9 +4678,10 @@@ static inline sector_t sync_request(str
                *skipped = 1;
                return rv;
        }
 -      if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
 -          !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 -          !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
 +      if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 +          !conf->fullsync &&
 +          !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
 +          sync_blocks >= STRIPE_SECTORS) {
                /* we can skip this block, and probably more */
                sync_blocks /= STRIPE_SECTORS;
                *skipped = 1;
@@@ -4739,7 -4724,7 +4738,7 @@@ static int  retry_aligned_read(struct r
        logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        sector = raid5_compute_sector(conf, logical_sector,
                                      0, &dd_idx, NULL);
-       last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+       last_sector = bio_end_sector(raid_bio);
  
        for (; logical_sector < last_sector;
             logical_sector += STRIPE_SECTORS,
                handled++;
        }
        remaining = raid5_dec_bi_active_stripes(raid_bio);
 -      if (remaining == 0)
 +      if (remaining == 0) {
 +              trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
 +                                       raid_bio, 0);
                bio_endio(raid_bio, 0);
 +      }
        if (atomic_dec_and_test(&conf->active_aligned_reads))
                wake_up(&conf->wait_for_stripe);
        return handled;
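
Much of the raid1/raid10/raid5 churn above is mechanical: open-coded uses of bio->bi_size >> 9 and bio->bi_sector + (bio->bi_size >> 9) become bio_sectors() and bio_end_sector(), keeping callers out of the bio's fields ahead of the immutable-biovec work. A stand-alone sketch of the same accessor idea over a simplified extent structure (not the kernel's struct bio):

    #include <stdio.h>

    /* Simplified stand-in: a byte-sized extent starting at a 512-byte sector. */
    struct extent {
            unsigned long long start_sector;
            unsigned int bytes;
    };

    static inline unsigned int extent_sectors(const struct extent *e)
    {
            return e->bytes >> 9;           /* bytes -> 512-byte sectors */
    }

    static inline unsigned long long extent_end_sector(const struct extent *e)
    {
            return e->start_sector + extent_sectors(e);
    }

    int main(void)
    {
            struct extent e = { .start_sector = 2048, .bytes = 8192 };

            printf("%u sectors, ends at %llu\n",
                   extent_sectors(&e), extent_end_sector(&e));
            return 0;
    }
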
diff --combined drivers/message/fusion/mptsas.c
index ffee6f781e30f6a25537bba1ed44f379aee137fd,2bb01546df0bcdf5063b09ccf55da5d17dbf838d..dd239bdbfcb4a0877db2ab49aa3c27a81eec7dd1
@@@ -1977,7 -1977,7 +1977,7 @@@ done
  static struct scsi_host_template mptsas_driver_template = {
        .module                         = THIS_MODULE,
        .proc_name                      = "mptsas",
 -      .proc_info                      = mptscsih_proc_info,
 +      .show_info                      = mptscsih_show_info,
        .name                           = "MPT SAS Host",
        .info                           = mptscsih_info,
        .queuecommand                   = mptsas_qcmd,
@@@ -2235,10 -2235,10 +2235,10 @@@ static int mptsas_smp_handler(struct Sc
        }
  
        /* do we need to support multiple segments? */
-       if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
+       if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) {
                printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u %u, rsp %u %u\n",
-                   ioc->name, __func__, req->bio->bi_vcnt, blk_rq_bytes(req),
-                   rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
+                   ioc->name, __func__, bio_segments(req->bio), blk_rq_bytes(req),
+                   bio_segments(rsp->bio), blk_rq_bytes(rsp));
                return -EINVAL;
        }
  
index 07ba32b07fb05d427cb18e62807ef40152577406,12d08b4529e9d3b1292ed317ee641bdfd7abddd5..6eca019bcf30a50edfab1a80daf1b351d2320474
@@@ -26,7 -26,7 +26,7 @@@
  #define DCSS_BUS_ID_SIZE 20
  
  static int dcssblk_open(struct block_device *bdev, fmode_t mode);
 -static int dcssblk_release(struct gendisk *disk, fmode_t mode);
 +static void dcssblk_release(struct gendisk *disk, fmode_t mode);
  static void dcssblk_make_request(struct request_queue *q, struct bio *bio);
  static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
                                 void **kaddr, unsigned long *pfn);
@@@ -781,15 -781,16 +781,15 @@@ out
        return rc;
  }
  
 -static int
 +static void
  dcssblk_release(struct gendisk *disk, fmode_t mode)
  {
        struct dcssblk_dev_info *dev_info = disk->private_data;
        struct segment_info *entry;
 -      int rc;
  
        if (!dev_info) {
 -              rc = -ENODEV;
 -              goto out;
 +              WARN_ON(1);
 +              return;
        }
        down_write(&dcssblk_devices_sem);
        if (atomic_dec_and_test(&dev_info->use_count)
                dev_info->save_pending = 0;
        }
        up_write(&dcssblk_devices_sem);
 -      rc = 0;
 -out:
 -      return rc;
  }
  
  static void
@@@ -822,8 -826,7 +822,7 @@@ dcssblk_make_request(struct request_que
        if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0)
                /* Request is not page-aligned. */
                goto fail;
-       if (((bio->bi_size >> 9) + bio->bi_sector)
-                       > get_capacity(bio->bi_bdev->bd_disk)) {
+       if (bio_end_sector(bio) > get_capacity(bio->bi_bdev->bd_disk)) {
                /* Request beyond end of DCSS segment. */
                goto fail;
        }
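
The open-coded sector arithmetic replaced above and below relies on two accessors from include/linux/bio.h in this kernel generation; paraphrased here for reference, not part of this diff:

	#define bio_sectors(bio)	((bio)->bi_size >> 9)
	#define bio_end_sector(bio)	((bio)->bi_sector + bio_sectors(bio))

so bio_end_sector(bio) names the first sector past the end of the request, replacing the hand-rolled bi_sector + (bi_size >> 9).
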
index 55cbd018015997bba484bfe5ffdbacabcc7e43db,7af776737b40ea346fb7586bdfea1236fb8b92a0..f42b0e15410f8a52e0a570664b04547d459e7aec
@@@ -235,17 -235,6 +235,17 @@@ static void sas_set_ex_phy(struct domai
        linkrate  = phy->linkrate;
        memcpy(sas_addr, phy->attached_sas_addr, SAS_ADDR_SIZE);
  
 +      /* Handle vacant phy - rest of dr data is not valid so skip it */
 +      if (phy->phy_state == PHY_VACANT) {
 +              memset(phy->attached_sas_addr, 0, SAS_ADDR_SIZE);
 +              phy->attached_dev_type = NO_DEVICE;
 +              if (!test_bit(SAS_HA_ATA_EH_ACTIVE, &ha->state)) {
 +                      phy->phy_id = phy_id;
 +                      goto skip;
 +              } else
 +                      goto out;
 +      }
 +
        phy->attached_dev_type = to_dev_type(dr);
        if (test_bit(SAS_HA_ATA_EH_ACTIVE, &ha->state))
                goto out;
        phy->phy->maximum_linkrate = dr->pmax_linkrate;
        phy->phy->negotiated_linkrate = phy->linkrate;
  
 + skip:
        if (new_phy)
                if (sas_phy_add(phy->phy)) {
                        sas_phy_free(phy->phy);
@@@ -400,7 -388,7 +400,7 @@@ int sas_ex_phy_discover(struct domain_d
        if (!disc_req)
                return -ENOMEM;
  
 -      disc_resp = alloc_smp_req(DISCOVER_RESP_SIZE);
 +      disc_resp = alloc_smp_resp(DISCOVER_RESP_SIZE);
        if (!disc_resp) {
                kfree(disc_req);
                return -ENOMEM;
@@@ -2163,10 -2151,10 +2163,10 @@@ int sas_smp_handler(struct Scsi_Host *s
        }
  
        /* do we need to support multiple segments? */
-       if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
+       if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) {
                printk("%s: multiple segments req %u %u, rsp %u %u\n",
-                      __func__, req->bio->bi_vcnt, blk_rq_bytes(req),
-                      rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
+                      __func__, bio_segments(req->bio), blk_rq_bytes(req),
+                      bio_segments(rsp->bio), blk_rq_bytes(rsp));
                return -EINVAL;
        }
  
diff --combined fs/bio.c
index 954d73124b411a733891f00394c44a1ffaefd0b1,9238a54b562c9bb0b672f4bd6359871e192c21d8..94bbc04dba77053bb47d3d8b793a3a8218f0a0d2
+++ b/fs/bio.c
@@@ -19,7 -19,6 +19,7 @@@
  #include <linux/swap.h>
  #include <linux/bio.h>
  #include <linux/blkdev.h>
 +#include <linux/uio.h>
  #include <linux/iocontext.h>
  #include <linux/slab.h>
  #include <linux/init.h>
@@@ -161,12 -160,12 +161,12 @@@ unsigned int bvec_nr_vecs(unsigned shor
        return bvec_slabs[idx].nr_vecs;
  }
  
- void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
+ void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
  {
        BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
  
        if (idx == BIOVEC_MAX_IDX)
-               mempool_free(bv, bs->bvec_pool);
+               mempool_free(bv, pool);
        else {
                struct biovec_slab *bvs = bvec_slabs + idx;
  
        }
  }
  
- struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
-                             struct bio_set *bs)
+ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
+                          mempool_t *pool)
  {
        struct bio_vec *bvl;
  
         */
        if (*idx == BIOVEC_MAX_IDX) {
  fallback:
-               bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
+               bvl = mempool_alloc(pool, gfp_mask);
        } else {
                struct biovec_slab *bvs = bvec_slabs + *idx;
                gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
@@@ -253,8 -252,8 +253,8 @@@ static void bio_free(struct bio *bio
        __bio_free(bio);
  
        if (bs) {
-               if (bio_has_allocated_vec(bio))
-                       bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
+               if (bio_flagged(bio, BIO_OWNS_VEC))
+                       bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
  
                /*
                 * If we have front padding, adjust the bio pointer before freeing
@@@ -298,6 -297,54 +298,54 @@@ void bio_reset(struct bio *bio
  }
  EXPORT_SYMBOL(bio_reset);
  
+ static void bio_alloc_rescue(struct work_struct *work)
+ {
+       struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
+       struct bio *bio;
+       while (1) {
+               spin_lock(&bs->rescue_lock);
+               bio = bio_list_pop(&bs->rescue_list);
+               spin_unlock(&bs->rescue_lock);
+               if (!bio)
+                       break;
+               generic_make_request(bio);
+       }
+ }
+ static void punt_bios_to_rescuer(struct bio_set *bs)
+ {
+       struct bio_list punt, nopunt;
+       struct bio *bio;
+       /*
+        * In order to guarantee forward progress we must punt only bios that
+        * were allocated from this bio_set; otherwise, if there was a bio on
+        * there for a stacking driver higher up in the stack, processing it
+        * could require allocating bios from this bio_set, and doing that from
+        * our own rescuer would be bad.
+        *
+        * Since bio lists are singly linked, pop them all instead of trying to
+        * remove from the middle of the list:
+        */
+       bio_list_init(&punt);
+       bio_list_init(&nopunt);
+       while ((bio = bio_list_pop(current->bio_list)))
+               bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+       *current->bio_list = nopunt;
+       spin_lock(&bs->rescue_lock);
+       bio_list_merge(&bs->rescue_list, &punt);
+       spin_unlock(&bs->rescue_lock);
+       queue_work(bs->rescue_workqueue, &bs->rescue_work);
+ }
  /**
   * bio_alloc_bioset - allocate a bio for I/O
   * @gfp_mask:   the GFP_ mask given to the slab allocator
   *   previously allocated bio for IO before attempting to allocate a new one.
   *   Failure to do so can cause deadlocks under memory pressure.
   *
+  *   Note that when running under generic_make_request() (i.e. any block
+  *   driver), bios are not submitted until after you return - see the code in
+  *   generic_make_request() that converts recursion into iteration, to prevent
+  *   stack overflows.
+  *
+  *   This would normally mean allocating multiple bios under
+  *   generic_make_request() would be susceptible to deadlocks, but we have
+  *   deadlock avoidance code that resubmits any blocked bios from a rescuer
+  *   thread.
+  *
+  *   However, we do not guarantee forward progress for allocations from other
+  *   mempools. Doing multiple allocations from the same mempool under
+  *   generic_make_request() should be avoided - instead, use bio_set's front_pad
+  *   for per bio allocations.
+  *
   *   RETURNS:
   *   Pointer to new bio on success, NULL on failure.
   */
  struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
  {
+       gfp_t saved_gfp = gfp_mask;
        unsigned front_pad;
        unsigned inline_vecs;
        unsigned long idx = BIO_POOL_NONE;
                front_pad = 0;
                inline_vecs = nr_iovecs;
        } else {
+               /*
+                * generic_make_request() converts recursion to iteration; this
+                * means if we're running beneath it, any bios we allocate and
+                * submit will not be submitted (and thus freed) until after we
+                * return.
+                *
+                * This exposes us to a potential deadlock if we allocate
+                * multiple bios from the same bio_set() while running
+                * underneath generic_make_request(). If we were to allocate
+                * multiple bios (say a stacking block driver that was splitting
+                * bios), we would deadlock if we exhausted the mempool's
+                * reserve.
+                *
+                * We solve this, and guarantee forward progress, with a rescuer
+                * workqueue per bio_set. If we go to allocate and there are
+                * bios on current->bio_list, we first try the allocation
+                * without __GFP_WAIT; if that fails, we punt those bios we
+                * would be blocking to the rescuer workqueue before we retry
+                * with the original gfp_flags.
+                */
+               if (current->bio_list && !bio_list_empty(current->bio_list))
+                       gfp_mask &= ~__GFP_WAIT;
                p = mempool_alloc(bs->bio_pool, gfp_mask);
+               if (!p && gfp_mask != saved_gfp) {
+                       punt_bios_to_rescuer(bs);
+                       gfp_mask = saved_gfp;
+                       p = mempool_alloc(bs->bio_pool, gfp_mask);
+               }
                front_pad = bs->front_pad;
                inline_vecs = BIO_INLINE_VECS;
        }
        bio_init(bio);
  
        if (nr_iovecs > inline_vecs) {
-               bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
+               bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+               if (!bvl && gfp_mask != saved_gfp) {
+                       punt_bios_to_rescuer(bs);
+                       gfp_mask = saved_gfp;
+                       bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+               }
                if (unlikely(!bvl))
                        goto err_free;
+               bio->bi_flags |= 1 << BIO_OWNS_VEC;
        } else if (nr_iovecs) {
                bvl = bio->bi_inline_vecs;
        }
@@@ -653,6 -754,181 +755,181 @@@ int bio_add_page(struct bio *bio, struc
  }
  EXPORT_SYMBOL(bio_add_page);
  
+ struct submit_bio_ret {
+       struct completion event;
+       int error;
+ };
+ static void submit_bio_wait_endio(struct bio *bio, int error)
+ {
+       struct submit_bio_ret *ret = bio->bi_private;
+       ret->error = error;
+       complete(&ret->event);
+ }
+ /**
+  * submit_bio_wait - submit a bio, and wait until it completes
+  * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
+  * @bio: The &struct bio which describes the I/O
+  *
+  * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
+  * bio_endio() on failure.
+  */
+ int submit_bio_wait(int rw, struct bio *bio)
+ {
+       struct submit_bio_ret ret;
+       rw |= REQ_SYNC;
+       init_completion(&ret.event);
+       bio->bi_private = &ret;
+       bio->bi_end_io = submit_bio_wait_endio;
+       submit_bio(rw, bio);
+       wait_for_completion(&ret.event);
+       return ret.error;
+ }
+ EXPORT_SYMBOL(submit_bio_wait);
+ /**
+  * bio_advance - increment/complete a bio by some number of bytes
+  * @bio:      bio to advance
+  * @bytes:    number of bytes to complete
+  *
+  * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
+  * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
+  * be updated on the last bvec as well.
+  *
+  * @bio will then represent the remaining, uncompleted portion of the io.
+  */
+ void bio_advance(struct bio *bio, unsigned bytes)
+ {
+       if (bio_integrity(bio))
+               bio_integrity_advance(bio, bytes);
+       bio->bi_sector += bytes >> 9;
+       bio->bi_size -= bytes;
+       if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK)
+               return;
+       while (bytes) {
+               if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
+                       WARN_ONCE(1, "bio idx %d >= vcnt %d\n",
+                                 bio->bi_idx, bio->bi_vcnt);
+                       break;
+               }
+               if (bytes >= bio_iovec(bio)->bv_len) {
+                       bytes -= bio_iovec(bio)->bv_len;
+                       bio->bi_idx++;
+               } else {
+                       bio_iovec(bio)->bv_len -= bytes;
+                       bio_iovec(bio)->bv_offset += bytes;
+                       bytes = 0;
+               }
+       }
+ }
+ EXPORT_SYMBOL(bio_advance);
+ /**
+  * bio_alloc_pages - allocates a single page for each bvec in a bio
+  * @bio: bio to allocate pages for
+  * @gfp_mask: flags for allocation
+  *
+  * Allocates pages up to @bio->bi_vcnt.
+  *
+  * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
+  * freed.
+  */
+ int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+ {
+       int i;
+       struct bio_vec *bv;
+       bio_for_each_segment_all(bv, bio, i) {
+               bv->bv_page = alloc_page(gfp_mask);
+               if (!bv->bv_page) {
+                       while (--bv >= bio->bi_io_vec)
+                               __free_page(bv->bv_page);
+                       return -ENOMEM;
+               }
+       }
+       return 0;
+ }
+ EXPORT_SYMBOL(bio_alloc_pages);
+ /**
+  * bio_copy_data - copy contents of data buffers from one chain of bios to
+  * another
+  * @src: source bio list
+  * @dst: destination bio list
+  *
+  * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
+  * @src and @dst as linked lists of bios.
+  *
+  * Stops when it reaches the end of either @src or @dst - that is, copies
+  * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+  */
+ void bio_copy_data(struct bio *dst, struct bio *src)
+ {
+       struct bio_vec *src_bv, *dst_bv;
+       unsigned src_offset, dst_offset, bytes;
+       void *src_p, *dst_p;
+       src_bv = bio_iovec(src);
+       dst_bv = bio_iovec(dst);
+       src_offset = src_bv->bv_offset;
+       dst_offset = dst_bv->bv_offset;
+       while (1) {
+               if (src_offset == src_bv->bv_offset + src_bv->bv_len) {
+                       src_bv++;
+                       if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) {
+                               src = src->bi_next;
+                               if (!src)
+                                       break;
+                               src_bv = bio_iovec(src);
+                       }
+                       src_offset = src_bv->bv_offset;
+               }
+               if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) {
+                       dst_bv++;
+                       if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) {
+                               dst = dst->bi_next;
+                               if (!dst)
+                                       break;
+                               dst_bv = bio_iovec(dst);
+                       }
+                       dst_offset = dst_bv->bv_offset;
+               }
+               bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset,
+                           src_bv->bv_offset + src_bv->bv_len - src_offset);
+               src_p = kmap_atomic(src_bv->bv_page);
+               dst_p = kmap_atomic(dst_bv->bv_page);
+               memcpy(dst_p + dst_bv->bv_offset,
+                      src_p + src_bv->bv_offset,
+                      bytes);
+               kunmap_atomic(dst_p);
+               kunmap_atomic(src_p);
+               src_offset += bytes;
+               dst_offset += bytes;
+       }
+ }
+ EXPORT_SYMBOL(bio_copy_data);
  struct bio_map_data {
        struct bio_vec *iovecs;
        struct sg_iovec *sgvecs;
@@@ -715,7 -991,7 +992,7 @@@ static int __bio_copy_iov(struct bio *b
        int iov_idx = 0;
        unsigned int iov_off = 0;
  
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                char *bv_addr = page_address(bvec->bv_page);
                unsigned int bv_len = iovecs[i].bv_len;
  
@@@ -897,7 -1173,7 +1174,7 @@@ struct bio *bio_copy_user_iov(struct re
        return bio;
  cleanup:
        if (!map_data)
-               bio_for_each_segment(bvec, bio, i)
+               bio_for_each_segment_all(bvec, bio, i)
                        __free_page(bvec->bv_page);
  
        bio_put(bio);
@@@ -1111,7 -1387,7 +1388,7 @@@ static void __bio_unmap_user(struct bi
        /*
         * make sure we dirty pages we wrote to
         */
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                if (bio_data_dir(bio) == READ)
                        set_page_dirty_lock(bvec->bv_page);
  
@@@ -1217,7 -1493,7 +1494,7 @@@ static void bio_copy_kern_endio(struct 
        int i;
        char *p = bmd->sgvecs[0].iov_base;
  
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                char *addr = page_address(bvec->bv_page);
                int len = bmd->iovecs[i].bv_len;
  
@@@ -1257,7 -1533,7 +1534,7 @@@ struct bio *bio_copy_kern(struct reques
        if (!reading) {
                void *p = data;
  
-               bio_for_each_segment(bvec, bio, i) {
+               bio_for_each_segment_all(bvec, bio, i) {
                        char *addr = page_address(bvec->bv_page);
  
                        memcpy(addr, p, bvec->bv_len);
@@@ -1302,11 -1578,11 +1579,11 @@@ EXPORT_SYMBOL(bio_copy_kern)
   */
  void bio_set_pages_dirty(struct bio *bio)
  {
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        int i;
  
-       for (i = 0; i < bio->bi_vcnt; i++) {
-               struct page *page = bvec[i].bv_page;
+       bio_for_each_segment_all(bvec, bio, i) {
+               struct page *page = bvec->bv_page;
  
                if (page && !PageCompound(page))
                        set_page_dirty_lock(page);
  
  static void bio_release_pages(struct bio *bio)
  {
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        int i;
  
-       for (i = 0; i < bio->bi_vcnt; i++) {
-               struct page *page = bvec[i].bv_page;
+       bio_for_each_segment_all(bvec, bio, i) {
+               struct page *page = bvec->bv_page;
  
                if (page)
                        put_page(page);
@@@ -1368,16 -1644,16 +1645,16 @@@ static void bio_dirty_fn(struct work_st
  
  void bio_check_pages_dirty(struct bio *bio)
  {
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        int nr_clean_pages = 0;
        int i;
  
-       for (i = 0; i < bio->bi_vcnt; i++) {
-               struct page *page = bvec[i].bv_page;
+       bio_for_each_segment_all(bvec, bio, i) {
+               struct page *page = bvec->bv_page;
  
                if (PageDirty(page) || PageCompound(page)) {
                        page_cache_release(page);
-                       bvec[i].bv_page = NULL;
+                       bvec->bv_page = NULL;
                } else {
                        nr_clean_pages++;
                }
@@@ -1429,6 -1705,8 +1706,6 @@@ void bio_endio(struct bio *bio, int err
        else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                error = -EIO;
  
 -      trace_block_bio_complete(bio, error);
 -
        if (bio->bi_end_io)
                bio->bi_end_io(bio, error);
  }
@@@ -1478,8 -1756,7 +1755,7 @@@ struct bio_pair *bio_split(struct bio *
        trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
                                bi->bi_sector + first_sectors);
  
-       BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
-       BUG_ON(bi->bi_idx != 0);
+       BUG_ON(bio_segments(bi) > 1);
        atomic_set(&bp->cnt, 3);
        bp->error = 0;
        bp->bio1 = *bi;
        bp->bio1.bi_size = first_sectors << 9;
  
        if (bi->bi_vcnt != 0) {
-               bp->bv1 = bi->bi_io_vec[0];
-               bp->bv2 = bi->bi_io_vec[0];
+               bp->bv1 = *bio_iovec(bi);
+               bp->bv2 = *bio_iovec(bi);
  
                if (bio_is_rw(bi)) {
                        bp->bv2.bv_offset += first_sectors << 9;
@@@ -1542,7 -1819,7 +1818,7 @@@ sector_t bio_sector_offset(struct bio *
        if (index >= bio->bi_idx)
                index = bio->bi_vcnt - 1;
  
-       __bio_for_each_segment(bv, bio, i, 0) {
+       bio_for_each_segment_all(bv, bio, i) {
                if (i == index) {
                        if (offset > bv->bv_offset)
                                sectors += (offset - bv->bv_offset) / sector_sz;
@@@ -1560,29 -1837,25 +1836,25 @@@ EXPORT_SYMBOL(bio_sector_offset)
   * create memory pools for biovec's in a bio_set.
   * use the global biovec slabs created for general use.
   */
- static int biovec_create_pools(struct bio_set *bs, int pool_entries)
+ mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
  {
        struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
  
-       bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
-       if (!bs->bvec_pool)
-               return -ENOMEM;
-       return 0;
- }
- static void biovec_free_pools(struct bio_set *bs)
- {
-       mempool_destroy(bs->bvec_pool);
+       return mempool_create_slab_pool(pool_entries, bp->slab);
  }
  
  void bioset_free(struct bio_set *bs)
  {
+       if (bs->rescue_workqueue)
+               destroy_workqueue(bs->rescue_workqueue);
        if (bs->bio_pool)
                mempool_destroy(bs->bio_pool);
  
+       if (bs->bvec_pool)
+               mempool_destroy(bs->bvec_pool);
        bioset_integrity_free(bs);
-       biovec_free_pools(bs);
        bio_put_slab(bs);
  
        kfree(bs);
@@@ -1613,6 -1886,10 +1885,10 @@@ struct bio_set *bioset_create(unsigned 
  
        bs->front_pad = front_pad;
  
+       spin_lock_init(&bs->rescue_lock);
+       bio_list_init(&bs->rescue_list);
+       INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
        bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
        if (!bs->bio_slab) {
                kfree(bs);
        if (!bs->bio_pool)
                goto bad;
  
-       if (!biovec_create_pools(bs, pool_size))
-               return bs;
+       bs->bvec_pool = biovec_create_pool(bs, pool_size);
+       if (!bs->bvec_pool)
+               goto bad;
+       bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
+       if (!bs->rescue_workqueue)
+               goto bad;
  
+       return bs;
  bad:
        bioset_free(bs);
        return NULL;
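
The fs/bio.c helpers read most naturally from the caller's side. A minimal, hypothetical user of submit_bio_wait() with the 3.10-era bi_bdev/bi_sector fields (the function name and error handling are illustrative only):

	static int read_page_sync(struct block_device *bdev, sector_t sector,
				  struct page *page)
	{
		struct bio *bio;
		int ret;

		bio = bio_alloc(GFP_KERNEL, 1);		/* room for one bvec */
		if (!bio)
			return -ENOMEM;

		bio->bi_bdev = bdev;
		bio->bi_sector = sector;
		bio_add_page(bio, page, PAGE_SIZE, 0);

		ret = submit_bio_wait(READ, bio);	/* sleeps until bi_end_io fires */
		bio_put(bio);
		return ret;	/* 0, or the error passed to bio_endio() */
	}
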
diff --combined fs/block_dev.c
index d9871c1f08949082ab5175f09c50668f961b9512,dc7f9836fb5e69630dfd93b64202514476a033bd..2091db8cdd783a2287ce9165a7223cfe59cf5a2b
@@@ -27,7 -27,6 +27,7 @@@
  #include <linux/namei.h>
  #include <linux/log2.h>
  #include <linux/cleancache.h>
 +#include <linux/aio.h>
  #include <asm/uaccess.h>
  #include "internal.h"
  
@@@ -552,7 -551,6 +552,7 @@@ struct block_device *bdgrab(struct bloc
        ihold(bdev->bd_inode);
        return bdev;
  }
 +EXPORT_SYMBOL(bdgrab);
  
  long nr_blockdev_pages(void)
  {
@@@ -618,9 -616,11 +618,9 @@@ void bd_forget(struct inode *inode
        struct block_device *bdev = NULL;
  
        spin_lock(&bdev_lock);
 -      if (inode->i_bdev) {
 -              if (!sb_is_blkdev_sb(inode->i_sb))
 -                      bdev = inode->i_bdev;
 -              __bd_forget(inode);
 -      }
 +      if (!sb_is_blkdev_sb(inode->i_sb))
 +              bdev = inode->i_bdev;
 +      __bd_forget(inode);
        spin_unlock(&bdev_lock);
  
        if (bdev)
@@@ -1046,7 -1046,7 +1046,7 @@@ void bd_set_size(struct block_device *b
  }
  EXPORT_SYMBOL(bd_set_size);
  
 -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
  
  /*
   * bd_mutex locking:
@@@ -1401,8 -1401,9 +1401,8 @@@ static int blkdev_open(struct inode * i
        return blkdev_get(bdev, filp->f_mode, filp);
  }
  
 -static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 +static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
  {
 -      int ret = 0;
        struct gendisk *disk = bdev->bd_disk;
        struct block_device *victim = NULL;
  
        }
        if (bdev->bd_contains == bdev) {
                if (disk->fops->release)
 -                      ret = disk->fops->release(disk, mode);
 +                      disk->fops->release(disk, mode);
        }
        if (!bdev->bd_openers) {
                struct module *owner = disk->fops->owner;
        bdput(bdev);
        if (victim)
                __blkdev_put(victim, mode, 1);
 -      return ret;
  }
  
 -int blkdev_put(struct block_device *bdev, fmode_t mode)
 +void blkdev_put(struct block_device *bdev, fmode_t mode)
  {
        mutex_lock(&bdev->bd_mutex);
  
  
        mutex_unlock(&bdev->bd_mutex);
  
 -      return __blkdev_put(bdev, mode, 0);
 +      __blkdev_put(bdev, mode, 0);
  }
  EXPORT_SYMBOL(blkdev_put);
  
  static int blkdev_close(struct inode * inode, struct file * filp)
  {
        struct block_device *bdev = I_BDEV(filp->f_mapping->host);
 -
 -      return blkdev_put(bdev, filp->f_mode);
 +      blkdev_put(bdev, filp->f_mode);
 +      return 0;
  }
  
  static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
@@@ -1556,7 -1558,7 +1556,7 @@@ static ssize_t blkdev_aio_read(struct k
                return 0;
  
        size -= pos;
-       if (size < INT_MAX)
+       if (size < iocb->ki_left)
                nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
        return generic_file_aio_read(iocb, iov, nr_segs, pos);
  }
diff --combined fs/buffer.c
index bc1fe14aaa3e4583aa351298fc9faa4f42d2d6a9,ecd3792ae0e9f886429cace691d68d8284662741..d2a4d1bb2d57aec3999e494d52c4f765a0ae48e8
@@@ -865,6 -865,8 +865,6 @@@ try_again
  
                /* Link the buffer to its page */
                set_bh_page(bh, page, offset);
 -
 -              init_buffer(bh, NULL, NULL);
        }
        return head;
  /*
@@@ -2947,7 -2949,7 +2947,7 @@@ static void guard_bh_eod(int rw, struc
        }
  }
  
 -int submit_bh(int rw, struct buffer_head * bh)
 +int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
  {
        struct bio *bio;
        int ret = 0;
        bio->bi_io_vec[0].bv_offset = bh_offset(bh);
  
        bio->bi_vcnt = 1;
-       bio->bi_idx = 0;
        bio->bi_size = bh->b_size;
  
        bio->bi_end_io = end_bio_bh_io_sync;
        bio->bi_private = bh;
 +      bio->bi_flags |= bio_flags;
  
        /* Take care of bh's that straddle the end of the device */
        guard_bh_eod(rw, bio, bh);
  
 +      if (buffer_meta(bh))
 +              rw |= REQ_META;
 +      if (buffer_prio(bh))
 +              rw |= REQ_PRIO;
 +
        bio_get(bio);
        submit_bio(rw, bio);
  
        bio_put(bio);
        return ret;
  }
 +EXPORT_SYMBOL_GPL(_submit_bh);
 +
 +int submit_bh(int rw, struct buffer_head *bh)
 +{
 +      return _submit_bh(rw, bh, 0);
 +}
  EXPORT_SYMBOL(submit_bh);
  
  /**
diff --combined fs/direct-io.c
index 51d16e067d6815909907b6ccb33bba7d7ec72ab5,38484b08a39ac93ef024a72b9b2833b84e6c44f2..7ab90f5081eebc4ab8b0de88bef8d0b6310ed113
@@@ -37,7 -37,6 +37,7 @@@
  #include <linux/uio.h>
  #include <linux/atomic.h>
  #include <linux/prefetch.h>
 +#include <linux/aio.h>
  
  /*
   * How many user pages to map in one call to get_user_pages().  This determines
@@@ -442,8 -441,8 +442,8 @@@ static struct bio *dio_await_one(struc
  static int dio_bio_complete(struct dio *dio, struct bio *bio)
  {
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-       struct bio_vec *bvec = bio->bi_io_vec;
-       int page_no;
+       struct bio_vec *bvec;
+       unsigned i;
  
        if (!uptodate)
                dio->io_error = -EIO;
        if (dio->is_async && dio->rw == READ) {
                bio_check_pages_dirty(bio);     /* transfers ownership */
        } else {
-               for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
-                       struct page *page = bvec[page_no].bv_page;
+               bio_for_each_segment_all(bvec, bio, i) {
+                       struct page *page = bvec->bv_page;
  
                        if (dio->rw == READ && !PageCompound(page))
                                set_page_dirty_lock(page);
@@@ -673,6 -672,12 +673,6 @@@ static inline int dio_send_cur_page(str
                if (sdio->final_block_in_bio != sdio->cur_page_block ||
                    cur_offset != bio_next_offset)
                        dio_bio_submit(dio, sdio);
 -              /*
 -               * Submit now if the underlying fs is about to perform a
 -               * metadata read
 -               */
 -              else if (sdio->boundary)
 -                      dio_bio_submit(dio, sdio);
        }
  
        if (sdio->bio == NULL) {
@@@ -732,6 -737,16 +732,6 @@@ submit_page_section(struct dio *dio, st
            sdio->cur_page_block +
            (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
                sdio->cur_page_len += len;
 -
 -              /*
 -               * If sdio->boundary then we want to schedule the IO now to
 -               * avoid metadata seeks.
 -               */
 -              if (sdio->boundary) {
 -                      ret = dio_send_cur_page(dio, sdio, map_bh);
 -                      page_cache_release(sdio->cur_page);
 -                      sdio->cur_page = NULL;
 -              }
                goto out;
        }
  
                page_cache_release(sdio->cur_page);
                sdio->cur_page = NULL;
                if (ret)
 -                      goto out;
 +                      return ret;
        }
  
        page_cache_get(page);           /* It is in dio */
        sdio->cur_page_block = blocknr;
        sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
  out:
 +      /*
 +       * If sdio->boundary then we want to schedule the IO now to
 +       * avoid metadata seeks.
 +       */
 +      if (sdio->boundary) {
 +              ret = dio_send_cur_page(dio, sdio, map_bh);
 +              dio_bio_submit(dio, sdio);
 +              page_cache_release(sdio->cur_page);
 +              sdio->cur_page = NULL;
 +      }
        return ret;
  }
  
@@@ -964,8 -969,7 +964,8 @@@ do_holes
                        this_chunk_bytes = this_chunk_blocks << blkbits;
                        BUG_ON(this_chunk_bytes == 0);
  
 -                      sdio->boundary = buffer_boundary(map_bh);
 +                      if (this_chunk_blocks == sdio->blocks_available)
 +                              sdio->boundary = buffer_boundary(map_bh);
                        ret = submit_page_section(dio, sdio, page,
                                                  offset_in_page,
                                                  this_chunk_bytes,
diff --combined fs/fs-writeback.c
index 798d4458a4d3a5798a7b04858d82a5f031849bab,8067d3719e94eb194a8ce5e0a50fd49b7705384a..3be57189efd5b3a8005321f02e40971af9429cf6
@@@ -22,7 -22,6 +22,6 @@@
  #include <linux/mm.h>
  #include <linux/pagemap.h>
  #include <linux/kthread.h>
- #include <linux/freezer.h>
  #include <linux/writeback.h>
  #include <linux/blkdev.h>
  #include <linux/backing-dev.h>
@@@ -88,20 -87,6 +87,6 @@@ static inline struct inode *wb_inode(st
  #define CREATE_TRACE_POINTS
  #include <trace/events/writeback.h>
  
- /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
- static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
- {
-       if (bdi->wb.task) {
-               wake_up_process(bdi->wb.task);
-       } else {
-               /*
-                * The bdi thread isn't there, wake up the forker thread which
-                * will create and run it.
-                */
-               wake_up_process(default_backing_dev_info.wb.task);
-       }
- }
  static void bdi_queue_work(struct backing_dev_info *bdi,
                           struct wb_writeback_work *work)
  {
  
        spin_lock_bh(&bdi->wb_lock);
        list_add_tail(&work->list, &bdi->work_list);
-       if (!bdi->wb.task)
-               trace_writeback_nothread(bdi, work);
-       bdi_wakeup_flusher(bdi);
        spin_unlock_bh(&bdi->wb_lock);
+       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
  }
  
  static void
@@@ -127,10 -111,8 +111,8 @@@ __bdi_start_writeback(struct backing_de
         */
        work = kzalloc(sizeof(*work), GFP_ATOMIC);
        if (!work) {
-               if (bdi->wb.task) {
-                       trace_writeback_nowork(bdi);
-                       wake_up_process(bdi->wb.task);
-               }
+               trace_writeback_nowork(bdi);
+               mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
                return;
        }
  
@@@ -177,9 -159,7 +159,7 @@@ void bdi_start_background_writeback(str
         * writeback as soon as there is no other work to do.
         */
        trace_writeback_wake_background(bdi);
-       spin_lock_bh(&bdi->wb_lock);
-       bdi_wakeup_flusher(bdi);
-       spin_unlock_bh(&bdi->wb_lock);
+       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
  }
  
  /*
@@@ -1020,67 -1000,48 +1000,49 @@@ long wb_do_writeback(struct bdi_writeba
  
  /*
   * Handle writeback of dirty data for the device backed by this bdi. Also
-  * wakes up periodically and does kupdated style flushing.
+  * reschedules periodically and does kupdated style flushing.
   */
- int bdi_writeback_thread(void *data)
+ void bdi_writeback_workfn(struct work_struct *work)
  {
-       struct bdi_writeback *wb = data;
+       struct bdi_writeback *wb = container_of(to_delayed_work(work),
+                                               struct bdi_writeback, dwork);
        struct backing_dev_info *bdi = wb->bdi;
        long pages_written;
  
 +      set_worker_desc("flush-%s", dev_name(bdi->dev));
        current->flags |= PF_SWAPWRITE;
-       set_freezable();
-       wb->last_active = jiffies;
-       /*
-        * Our parent may run at a different priority, just set us to normal
-        */
-       set_user_nice(current, 0);
-       trace_writeback_thread_start(bdi);
  
-       while (!kthread_freezable_should_stop(NULL)) {
+       if (likely(!current_is_workqueue_rescuer() ||
+                  list_empty(&bdi->bdi_list))) {
                /*
-                * Remove own delayed wake-up timer, since we are already awake
-                * and we'll take care of the periodic write-back.
+                * The normal path.  Keep writing back @bdi until its
+                * work_list is empty.  Note that this path is also taken
+                * if @bdi is shutting down even when we're running off the
+                * rescuer as work_list needs to be drained.
                 */
-               del_timer(&wb->wakeup_timer);
-               pages_written = wb_do_writeback(wb, 0);
+               do {
+                       pages_written = wb_do_writeback(wb, 0);
+                       trace_writeback_pages_written(pages_written);
+               } while (!list_empty(&bdi->work_list));
+       } else {
+               /*
+                * bdi_wq can't get enough workers and we're running off
+                * the emergency worker.  Don't hog it.  Hopefully, 1024 is
+                * enough for efficient IO.
+                */
+               pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+                                                   WB_REASON_FORKER_THREAD);
                trace_writeback_pages_written(pages_written);
-               if (pages_written)
-                       wb->last_active = jiffies;
-               set_current_state(TASK_INTERRUPTIBLE);
-               if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
-                       __set_current_state(TASK_RUNNING);
-                       continue;
-               }
-               if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-                       schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-               else {
-                       /*
-                        * We have nothing to do, so can go sleep without any
-                        * timeout and save power. When a work is queued or
-                        * something is made dirty - we will be woken up.
-                        */
-                       schedule();
-               }
        }
  
-       /* Flush any work that raced with us exiting */
-       if (!list_empty(&bdi->work_list))
-               wb_do_writeback(wb, 1);
+       if (!list_empty(&bdi->work_list) ||
+           (wb_has_dirty_io(wb) && dirty_writeback_interval))
+               queue_delayed_work(bdi_wq, &wb->dwork,
+                       msecs_to_jiffies(dirty_writeback_interval * 10));
  
-       trace_writeback_thread_stop(bdi);
-       return 0;
+       current->flags &= ~PF_SWAPWRITE;
  }
  
  /*
   * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
   * the whole world.
diff --combined fs/gfs2/lops.c
index 7318abf9d0fb863165857fb8adc897ad0f5e7c9e,5c37ef982390eb361684ba796776edbb64938c97..c5fa758fd8446e1938036be9cdedaf75e2bc552b
@@@ -53,8 -53,8 +53,8 @@@ void gfs2_pin(struct gfs2_sbd *sdp, str
         * to in-place disk block, remove it from the AIL.
         */
        spin_lock(&sdp->sd_ail_lock);
 -      if (bd->bd_ail)
 -              list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
 +      if (bd->bd_tr)
 +              list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list);
        spin_unlock(&sdp->sd_ail_lock);
        get_bh(bh);
        atomic_inc(&sdp->sd_log_pinned);
@@@ -94,7 -94,7 +94,7 @@@ static void maybe_release_space(struct 
   */
  
  static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 -                     struct gfs2_ail *ai)
 +                     struct gfs2_trans *tr)
  {
        struct gfs2_bufdata *bd = bh->b_private;
  
                maybe_release_space(bd);
  
        spin_lock(&sdp->sd_ail_lock);
 -      if (bd->bd_ail) {
 +      if (bd->bd_tr) {
                list_del(&bd->bd_ail_st_list);
                brelse(bh);
        } else {
                list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
                atomic_inc(&gl->gl_ail_count);
        }
 -      bd->bd_ail = ai;
 -      list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
 +      bd->bd_tr = tr;
 +      list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list);
        spin_unlock(&sdp->sd_ail_lock);
  
        clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
@@@ -300,7 -300,7 +300,7 @@@ static struct bio *gfs2_log_get_bio(str
        u64 nblk;
  
        if (bio) {
-               nblk = bio->bi_sector + bio_sectors(bio);
+               nblk = bio_end_sector(bio);
                nblk >>= sdp->sd_fsb2bb_shift;
                if (blkno == nblk)
                        return bio;
@@@ -480,22 -480,17 +480,22 @@@ static void buf_lo_before_commit(struc
                           &sdp->sd_log_le_buf, 0);
  }
  
 -static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 +static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
  {
        struct list_head *head = &sdp->sd_log_le_buf;
        struct gfs2_bufdata *bd;
  
 +      if (tr == NULL) {
 +              gfs2_assert(sdp, list_empty(head));
 +              return;
 +      }
 +
        while (!list_empty(head)) {
                bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
                list_del_init(&bd->bd_list);
                sdp->sd_log_num_buf--;
  
 -              gfs2_unpin(sdp, bd->bd_bh, ai);
 +              gfs2_unpin(sdp, bd->bd_bh, tr);
        }
        gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
  }
@@@ -618,7 -613,7 +618,7 @@@ static void revoke_lo_before_commit(str
        gfs2_log_write_page(sdp, page);
  }
  
 -static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
  {
        struct list_head *head = &sdp->sd_log_le_revoke;
        struct gfs2_bufdata *bd;
@@@ -796,21 -791,16 +796,21 @@@ static void databuf_lo_after_scan(struc
                jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
  }
  
 -static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
  {
        struct list_head *head = &sdp->sd_log_le_databuf;
        struct gfs2_bufdata *bd;
  
 +      if (tr == NULL) {
 +              gfs2_assert(sdp, list_empty(head));
 +              return;
 +      }
 +
        while (!list_empty(head)) {
                bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
                list_del_init(&bd->bd_list);
                sdp->sd_log_num_databuf--;
 -              gfs2_unpin(sdp, bd->bd_bh, ai);
 +              gfs2_unpin(sdp, bd->bd_bh, tr);
        }
        gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
  }
diff --combined fs/jfs/jfs_logmgr.c
index cbe48ea9318eea1ce44cae4bf3190911edf6c4c4,8ae5e350da430132ca37c94ff8e22d6053b76877..c57499dca89c5a3910bcefc5af951179aa693f24
@@@ -1058,8 -1058,7 +1058,8 @@@ static int lmLogSync(struct jfs_log * l
   */
  void jfs_syncpt(struct jfs_log *log, int hard_sync)
  {     LOG_LOCK(log);
 -      lmLogSync(log, hard_sync);
 +      if (!test_bit(log_QUIESCE, &log->flag))
 +              lmLogSync(log, hard_sync);
        LOG_UNLOCK(log);
  }
  
@@@ -2005,7 -2004,6 +2005,6 @@@ static int lbmRead(struct jfs_log * log
        bio->bi_io_vec[0].bv_offset = bp->l_offset;
  
        bio->bi_vcnt = 1;
-       bio->bi_idx = 0;
        bio->bi_size = LOGPSIZE;
  
        bio->bi_end_io = lbmIODone;
@@@ -2146,7 -2144,6 +2145,6 @@@ static void lbmStartIO(struct lbuf * bp
        bio->bi_io_vec[0].bv_offset = bp->l_offset;
  
        bio->bi_vcnt = 1;
-       bio->bi_idx = 0;
        bio->bi_size = LOGPSIZE;
  
        bio->bi_end_io = lbmIODone;
index 22990cf4439d2e8b91fedc0be9d254985f7ab64a,e8de67053cd4b15264c599644e1dc012273604fd..fa1abeb45b7602a4f0c1a4098f05f63d7a075281
@@@ -111,13 -111,13 +111,14 @@@ struct bio 
  #define BIO_FS_INTEGRITY 9    /* fs owns integrity data, not block layer */
  #define BIO_QUIET     10      /* Make BIO Quiet */
  #define BIO_MAPPED_INTEGRITY 11/* integrity metadata has been remapped */
 +#define BIO_SNAP_STABLE       12      /* bio data must be snapshotted during write */
  
  /*
   * Flags starting here get preserved by bio_reset() - this includes
   * BIO_POOL_IDX()
   */
 -#define BIO_RESET_BITS        12
 -#define BIO_OWNS_VEC  12      /* bio_free() should free bvec */
 +#define BIO_RESET_BITS        13
++#define BIO_OWNS_VEC  13      /* bio_free() should free bvec */
  
  #define bio_flagged(bio, flag)        ((bio)->bi_flags & (1 << (flag)))
  
@@@ -176,6 -176,7 +177,7 @@@ enum rq_flag_bits 
        __REQ_IO_STAT,          /* account I/O stat */
        __REQ_MIXED_MERGE,      /* merge of different types, fail separately */
        __REQ_KERNEL,           /* direct IO to kernel pages */
+       __REQ_PM,               /* runtime pm request */
        __REQ_NR_BITS,          /* stops here */
  };
  
         REQ_SECURE)
  #define REQ_CLONE_MASK                REQ_COMMON_MASK
  
+ #define BIO_NO_ADVANCE_ITER_MASK      (REQ_DISCARD|REQ_WRITE_SAME)
  /* This mask is used for both bio and request merge checking */
  #define REQ_NOMERGE_FLAGS \
        (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
  #define REQ_MIXED_MERGE               (1 << __REQ_MIXED_MERGE)
  #define REQ_SECURE            (1 << __REQ_SECURE)
  #define REQ_KERNEL            (1 << __REQ_KERNEL)
+ #define REQ_PM                        (1 << __REQ_PM)
  
  #endif /* __LINUX_BLK_TYPES_H */
diff --combined include/linux/blkdev.h
index e38cfe77f7f017ddc9347d66f224b986b86abf45,6189bf26b53d6c5eeb7d153fbc59aeeb51392a2e..2fdb4a451b49bd626d9415b231c76b7ac927cf69
@@@ -361,6 -361,12 +361,12 @@@ struct request_queue 
         */
        struct kobject kobj;
  
+ #ifdef CONFIG_PM_RUNTIME
+       struct device           *dev;
+       int                     rpm_status;
+       unsigned int            nr_pending;
+ #endif
        /*
         * queue settings
         */
@@@ -838,7 -844,7 +844,7 @@@ static inline unsigned int blk_queue_ge
                                                     unsigned int cmd_flags)
  {
        if (unlikely(cmd_flags & REQ_DISCARD))
-               return q->limits.max_discard_sectors;
+               return min(q->limits.max_discard_sectors, UINT_MAX >> 9);
  
        if (unlikely(cmd_flags & REQ_WRITE_SAME))
                return q->limits.max_write_same_sectors;
@@@ -960,6 -966,27 +966,27 @@@ struct request_queue *blk_alloc_queue(g
  struct request_queue *blk_alloc_queue_node(gfp_t, int);
  extern void blk_put_queue(struct request_queue *);
  
+ /*
+  * block layer runtime pm functions
+  */
+ #ifdef CONFIG_PM_RUNTIME
+ extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev);
+ extern int blk_pre_runtime_suspend(struct request_queue *q);
+ extern void blk_post_runtime_suspend(struct request_queue *q, int err);
+ extern void blk_pre_runtime_resume(struct request_queue *q);
+ extern void blk_post_runtime_resume(struct request_queue *q, int err);
+ #else
+ static inline void blk_pm_runtime_init(struct request_queue *q,
+       struct device *dev) {}
+ static inline int blk_pre_runtime_suspend(struct request_queue *q)
+ {
+       return -ENOSYS;
+ }
+ static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {}
+ static inline void blk_pre_runtime_resume(struct request_queue *q) {}
+ static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
+ #endif
  /*
   * blk_plug permits building a queue of related requests by holding the I/O
   * fragments for a short period. This allows merging of sequential requests
@@@ -1484,7 -1511,7 +1511,7 @@@ static inline bool blk_integrity_is_ini
  
  struct block_device_operations {
        int (*open) (struct block_device *, fmode_t);
 -      int (*release) (struct gendisk *, fmode_t);
 +      void (*release) (struct gendisk *, fmode_t);
        int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
        int (*direct_access) (struct block_device *, sector_t,
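
The runtime PM hooks above are intended to be driven from a low-level driver's dev_pm_ops callbacks; the actual SCSI wiring sits in James' tree on top of this. A rough sketch of the call pattern, with the driver names and dev_get_drvdata() plumbing invented for illustration:

	/* at probe: blk_pm_runtime_init(q, dev) ties the queue to the device */

	static int mydrv_runtime_suspend(struct device *dev)
	{
		struct request_queue *q = dev_get_drvdata(dev);
		int err = blk_pre_runtime_suspend(q);	/* refuses if requests are pending */

		if (err)
			return err;
		/* ... put the hardware to sleep ... */
		blk_post_runtime_suspend(q, 0);
		return 0;
	}

	static int mydrv_runtime_resume(struct device *dev)
	{
		struct request_queue *q = dev_get_drvdata(dev);

		blk_pre_runtime_resume(q);
		/* ... wake the hardware ... */
		blk_post_runtime_resume(q, 0);	/* lets queued requests run again */
		return 0;
	}
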
index 9c1467357b03c616967cd193efab6506e3e5adff,5a28843725dfa96a70468b09cdc960f8abaf8697..60ae7c3db912de7e068452de1a1c1978cad0a662
@@@ -244,7 -244,7 +244,7 @@@ TRACE_EVENT(block_bio_bounce
                __entry->dev            = bio->bi_bdev ?
                                          bio->bi_bdev->bd_dev : 0;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),
  
  /**
   * block_bio_complete - completed all work on the block operation
 + * @q: queue holding the block operation
   * @bio: block operation completed
   * @error: io error value
   *
   */
  TRACE_EVENT(block_bio_complete,
  
 -      TP_PROTO(struct bio *bio, int error),
 +      TP_PROTO(struct request_queue *q, struct bio *bio, int error),
  
 -      TP_ARGS(bio, error),
 +      TP_ARGS(q, bio, error),
  
        TP_STRUCT__entry(
                __field( dev_t,         dev             )
        ),
  
        TP_fast_assign(
 -              __entry->dev            = bio->bi_bdev ?
 -                                        bio->bi_bdev->bd_dev : 0;
 +              __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                __entry->error          = error;
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
        ),
@@@ -309,7 -309,7 +309,7 @@@ DECLARE_EVENT_CLASS(block_bio_merge
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),
@@@ -376,7 -376,7 +376,7 @@@ TRACE_EVENT(block_bio_queue
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),
@@@ -404,7 -404,7 +404,7 @@@ DECLARE_EVENT_CLASS(block_get_rq
        TP_fast_assign(
                __entry->dev            = bio ? bio->bi_bdev->bd_dev : 0;
                __entry->sector         = bio ? bio->bi_sector : 0;
-               __entry->nr_sector      = bio ? bio->bi_size >> 9 : 0;
+               __entry->nr_sector      = bio ? bio_sectors(bio) : 0;
                blk_fill_rwbs(__entry->rwbs,
                              bio ? bio->bi_rw : 0, __entry->nr_sector);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
@@@ -580,7 -580,7 +580,7 @@@ TRACE_EVENT(block_bio_remap
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                __entry->old_dev        = dev;
                __entry->old_sector     = from;
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
diff --combined kernel/relay.c
index eef0d113b79ed22734b980b076e05c3aee73a64c,a0d200012adb6dc9c40f6587992c9d756b79b325..b91488ba2e5a7edb6e3cbdc5876adaf9ae7f2791
@@@ -234,7 -234,6 +234,6 @@@ static void relay_destroy_buf(struct rc
  static void relay_remove_buf(struct kref *kref)
  {
        struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
-       buf->chan->cb->remove_buf_file(buf->dentry);
        relay_destroy_buf(buf);
  }
  
@@@ -484,6 -483,7 +483,7 @@@ static void relay_close_buf(struct rcha
  {
        buf->finalized = 1;
        del_timer_sync(&buf->timer);
+       buf->chan->cb->remove_buf_file(buf->dentry);
        kref_put(&buf->kref, relay_remove_buf);
  }
  
@@@ -588,7 -588,7 +588,7 @@@ struct rchan *relay_open(const char *ba
        chan->version = RELAYFS_CHANNEL_VERSION;
        chan->n_subbufs = n_subbufs;
        chan->subbuf_size = subbuf_size;
 -      chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
 +      chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs);
        chan->parent = parent;
        chan->private_data = private_data;
        if (base_filename) {
@@@ -1099,7 -1099,8 +1099,7 @@@ static size_t relay_file_read_end_pos(s
  static int subbuf_read_actor(size_t read_start,
                             struct rchan_buf *buf,
                             size_t avail,
 -                           read_descriptor_t *desc,
 -                           read_actor_t actor)
 +                           read_descriptor_t *desc)
  {
        void *from;
        int ret = 0;
  typedef int (*subbuf_actor_t) (size_t read_start,
                               struct rchan_buf *buf,
                               size_t avail,
 -                             read_descriptor_t *desc,
 -                             read_actor_t actor);
 +                             read_descriptor_t *desc);
  
  /*
   *    relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
   */
  static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
                                        subbuf_actor_t subbuf_actor,
 -                                      read_actor_t actor,
                                        read_descriptor_t *desc)
  {
        struct rchan_buf *buf = filp->private_data;
                        break;
  
                avail = min(desc->count, avail);
 -              ret = subbuf_actor(read_start, buf, avail, desc, actor);
 +              ret = subbuf_actor(read_start, buf, avail, desc);
                if (desc->error < 0)
                        break;
  
@@@ -1171,7 -1174,8 +1171,7 @@@ static ssize_t relay_file_read(struct f
        desc.count = count;
        desc.arg.buf = buffer;
        desc.error = 0;
 -      return relay_file_read_subbufs(filp, ppos, subbuf_read_actor,
 -                                     NULL, &desc);
 +      return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
  }
  
  static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
diff --combined mm/bounce.c
index a5c2ec3589cb94934e8654821b0876d28e1f5510,f5326b24d65d88c1051ee9547b0a66373372940c..c9f0a4339a7dafc2ba7295e49ad8fcdda8fa13de
@@@ -101,7 -101,7 +101,7 @@@ static void copy_to_high_bio_irq(struc
        struct bio_vec *tovec, *fromvec;
        int i;
  
-       __bio_for_each_segment(tovec, to, i, 0) {
+       bio_for_each_segment(tovec, to, i) {
                fromvec = from->bi_io_vec + i;
  
                /*
@@@ -134,7 -134,7 +134,7 @@@ static void bounce_end_io(struct bio *b
        /*
         * free up bounce indirect pages used
         */
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                org_vec = bio_orig->bi_io_vec + i;
                if (bvec->bv_page == org_vec->bv_page)
                        continue;
@@@ -181,13 -181,32 +181,13 @@@ static void bounce_end_io_read_isa(stru
  #ifdef CONFIG_NEED_BOUNCE_POOL
  static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
  {
 -      struct page *page;
 -      struct backing_dev_info *bdi;
 -      struct address_space *mapping;
 -      struct bio_vec *from;
 -      int i;
 -
        if (bio_data_dir(bio) != WRITE)
                return 0;
  
        if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
                return 0;
  
 -      /*
 -       * Based on the first page that has a valid mapping, decide whether or
 -       * not we have to employ bounce buffering to guarantee stable pages.
 -       */
 -      bio_for_each_segment(from, bio, i) {
 -              page = from->bv_page;
 -              mapping = page_mapping(page);
 -              if (!mapping)
 -                      continue;
 -              bdi = mapping->backing_dev_info;
 -              return mapping->host->i_sb->s_flags & MS_SNAP_STABLE;
 -      }
 -
 -      return 0;
 +      return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
  }
  #else
  static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
  static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
                               mempool_t *pool, int force)
  {
-       struct page *page;
-       struct bio *bio = NULL;
-       int i, rw = bio_data_dir(*bio_orig);
+       struct bio *bio;
+       int rw = bio_data_dir(*bio_orig);
        struct bio_vec *to, *from;
+       unsigned i;
  
-       bio_for_each_segment(from, *bio_orig, i) {
-               page = from->bv_page;
+       bio_for_each_segment(from, *bio_orig, i)
+               if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
+                       goto bounce;
  
-               /*
-                * is destination page below bounce pfn?
-                */
-               if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
-                       continue;
+       return;
+ bounce:
+       bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
  
-               /*
-                * irk, bounce it
-                */
-               if (!bio) {
-                       unsigned int cnt = (*bio_orig)->bi_vcnt;
+       bio_for_each_segment_all(to, bio, i) {
+               struct page *page = to->bv_page;
  
-                       bio = bio_alloc(GFP_NOIO, cnt);
-                       memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec));
-               }
-                       
-               to = bio->bi_io_vec + i;
+               if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
+                       continue;
  
-               to->bv_page = mempool_alloc(pool, q->bounce_gfp);
-               to->bv_len = from->bv_len;
-               to->bv_offset = from->bv_offset;
                inc_zone_page_state(to->bv_page, NR_BOUNCE);
+               to->bv_page = mempool_alloc(pool, q->bounce_gfp);
  
                if (rw == WRITE) {
                        char *vto, *vfrom;
  
-                       flush_dcache_page(from->bv_page);
+                       flush_dcache_page(page);
                        vto = page_address(to->bv_page) + to->bv_offset;
-                       vfrom = kmap(from->bv_page) + from->bv_offset;
+                       vfrom = kmap_atomic(page) + to->bv_offset;
                        memcpy(vto, vfrom, to->bv_len);
-                       kunmap(from->bv_page);
+                       kunmap_atomic(vfrom);
                }
        }
  
-       /*
-        * no pages bounced
-        */
-       if (!bio)
-               return;
        trace_block_bio_bounce(q, *bio_orig);
  
-       /*
-        * at least one page was bounced, fill in possible non-highmem
-        * pages
-        */
-       __bio_for_each_segment(from, *bio_orig, i, 0) {
-               to = bio_iovec_idx(bio, i);
-               if (!to->bv_page) {
-                       to->bv_page = from->bv_page;
-                       to->bv_len = from->bv_len;
-                       to->bv_offset = from->bv_offset;
-               }
-       }
-       bio->bi_bdev = (*bio_orig)->bi_bdev;
        bio->bi_flags |= (1 << BIO_BOUNCED);
-       bio->bi_sector = (*bio_orig)->bi_sector;
-       bio->bi_rw = (*bio_orig)->bi_rw;
-       bio->bi_vcnt = (*bio_orig)->bi_vcnt;
-       bio->bi_idx = (*bio_orig)->bi_idx;
-       bio->bi_size = (*bio_orig)->bi_size;
  
        if (pool == page_pool) {
                bio->bi_end_io = bounce_end_io_write;
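
For context, the two bvec iterators used above differ in scope. A minimal
sketch of the distinction (bounce_iter_sketch() and its parameters are
illustrative names; this assumes the index-based iterators this series
still uses):

static void bounce_iter_sketch(struct bio *bounce_clone, struct bio *orig)
{
        struct bio_vec *bvec;
        int i;

        /*
         * Every allocated bvec, from index 0: only valid for a bio this
         * code owns, such as the clone freed in bounce_end_io().
         */
        bio_for_each_segment_all(bvec, bounce_clone, i)
                (void)bvec;     /* release per-segment resources here */

        /*
         * Starts at bi_idx, i.e. only the segments not yet completed,
         * which is what copy_to_high_bio_irq() wants to walk.
         */
        bio_for_each_segment(bvec, orig, i)
                (void)bvec;     /* copy data back for pending segments */
}
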
diff --combined mm/page_io.c
index 06a8842a6ec612dbda8071f94d15423326a375f4,8d3c0c088105e2e6127eb1d27fcd2a88dd0193c3..a8a3ef45fed753b68ac1cc4a94c9260979a37879
@@@ -20,7 -20,6 +20,7 @@@
  #include <linux/buffer_head.h>
  #include <linux/writeback.h>
  #include <linux/frontswap.h>
 +#include <linux/aio.h>
  #include <asm/pgtable.h>
  
  static struct bio *get_swap_bio(gfp_t gfp_flags,
                bio->bi_io_vec[0].bv_len = PAGE_SIZE;
                bio->bi_io_vec[0].bv_offset = 0;
                bio->bi_vcnt = 1;
-               bio->bi_idx = 0;
                bio->bi_size = PAGE_SIZE;
                bio->bi_end_io = end_io;
        }
        return bio;
  }
  
 -static void end_swap_bio_write(struct bio *bio, int err)
 +void end_swap_bio_write(struct bio *bio, int err)
  {
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        struct page *page = bio->bi_io_vec[0].bv_page;
@@@ -186,7 -184,9 +185,7 @@@ bad_bmap
   */
  int swap_writepage(struct page *page, struct writeback_control *wbc)
  {
 -      struct bio *bio;
 -      int ret = 0, rw = WRITE;
 -      struct swap_info_struct *sis = page_swap_info(page);
 +      int ret = 0;
  
        if (try_to_free_swap(page)) {
                unlock_page(page);
                end_page_writeback(page);
                goto out;
        }
 +      ret = __swap_writepage(page, wbc, end_swap_bio_write);
 +out:
 +      return ret;
 +}
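
For context, splitting swap_writepage() around the __swap_writepage()
variant introduced just below lets a caller reuse the swap write plumbing
with its own completion handler. A minimal sketch (my_end_swap_write() and
my_writepage() are illustrative names, not part of this series):

static void my_end_swap_write(struct bio *bio, int err)
{
        /* backend-specific accounting would go here */
        end_swap_bio_write(bio, err);   /* then fall back to the default */
}

static int my_writepage(struct page *page, struct writeback_control *wbc)
{
        /*
         * Same duties as swap_writepage(), minus the early checks such
         * as try_to_free_swap() that it performs before handing off.
         */
        return __swap_writepage(page, wbc, my_end_swap_write);
}
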
 +
 +int __swap_writepage(struct page *page, struct writeback_control *wbc,
 +      void (*end_write_func)(struct bio *, int))
 +{
 +      struct bio *bio;
 +      int ret = 0, rw = WRITE;
 +      struct swap_info_struct *sis = page_swap_info(page);
  
        if (sis->flags & SWP_FILE) {
                struct kiocb kiocb;
                kiocb.ki_left = PAGE_SIZE;
                kiocb.ki_nbytes = PAGE_SIZE;
  
 +              set_page_writeback(page);
                unlock_page(page);
                ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
                                                &kiocb, &iov,
                if (ret == PAGE_SIZE) {
                        count_vm_event(PSWPOUT);
                        ret = 0;
 +              } else {
 +                      /*
 +                       * In the case of swap-over-nfs, this can be a
 +                       * temporary failure if the system has limited
 +                       * memory for allocating transmit buffers.
 +                       * Mark the page dirty and avoid
 +                       * rotate_reclaimable_page; rate-limit the
 +                       * messages, but do not flag PageError as in the
 +                       * normal direct-to-bio case, since the failure
 +                       * may be temporary.
 +                       */
 +                      set_page_dirty(page);
 +                      ClearPageReclaim(page);
 +                      pr_err_ratelimited("Write error on dio swapfile (%Lu)\n",
 +                              page_file_offset(page));
                }
 +              end_page_writeback(page);
                return ret;
        }
  
 -      bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
 +      bio = get_swap_bio(GFP_NOIO, page, end_write_func);
        if (bio == NULL) {
                set_page_dirty(page);
                unlock_page(page);