Merge branch 'for-4.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a3455d216a1d19125280720ce45bbc309b4f749a..556484cf5d9338d3383a23de7d484df2b5348d73 100644
@@ -68,7 +68,7 @@ void btrfs_leak_debug_check(void)
                pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
                       state->start, state->end, state->state,
                       extent_state_in_tree(state),
-                      atomic_read(&state->refs));
+                      refcount_read(&state->refs));
                list_del(&state->leak_list);
                kmem_cache_free(extent_state_cache, state);
        }
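A minimal sketch of the refcount_t pattern these hunks adopt, on a hypothetical object rather than btrfs code: unlike a bare atomic_t, refcount_t saturates and WARNs on overflow and underflow instead of silently wrapping.

#include <linux/refcount.h>
#include <linux/slab.h>

/* hypothetical refcounted object, for illustration only */
struct demo_obj {
	refcount_t refs;
};

static struct demo_obj *demo_get(struct demo_obj *obj)
{
	refcount_inc(&obj->refs);	/* saturates and WARNs on overflow */
	return obj;
}

static void demo_put(struct demo_obj *obj)
{
	/* returns true only when the last reference is dropped */
	if (refcount_dec_and_test(&obj->refs))
		kfree(obj);
}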
@@ -87,19 +87,9 @@ void btrfs_leak_debug_check(void)
 static inline void __btrfs_debug_check_extent_io_range(const char *caller,
                struct extent_io_tree *tree, u64 start, u64 end)
 {
-       struct inode *inode;
-       u64 isize;
-
-       if (!tree->mapping)
-               return;
-
-       inode = tree->mapping->host;
-       isize = i_size_read(inode);
-       if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
-               btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
-                   "%s: ino %llu isize %llu odd range [%llu,%llu]",
-                       caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
-       }
+       if (tree->ops && tree->ops->check_extent_io_range)
+               tree->ops->check_extent_io_range(tree->private_data, caller,
+                                                start, end);
 }
 #else
 #define btrfs_leak_debug_add(new, head)        do {} while (0)
@@ -154,9 +144,9 @@ static noinline void flush_write_bio(void *data);
 static inline struct btrfs_fs_info *
 tree_fs_info(struct extent_io_tree *tree)
 {
-       if (!tree->mapping)
-               return NULL;
-       return btrfs_sb(tree->mapping->host->i_sb);
+       if (tree->ops)
+               return tree->ops->tree_fs_info(tree->private_data);
+       return NULL;
 }
 
 int __init extent_io_init(void)
@@ -174,7 +164,8 @@ int __init extent_io_init(void)
                goto free_state_cache;
 
        btrfs_bioset = bioset_create(BIO_POOL_SIZE,
-                                    offsetof(struct btrfs_io_bio, bio));
+                                    offsetof(struct btrfs_io_bio, bio),
+                                    BIOSET_NEED_BVECS);
        if (!btrfs_bioset)
                goto free_buffer_cache;
 
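BIOSET_NEED_BVECS is the flags argument bioset_create() grew in this cycle; it asks the bioset to mempool-back bio_vec allocations as well, so bios that add pages keep their forward-progress guarantee. A sketch of the front-padding idiom behind the offsetof() argument, with a hypothetical container (btrfs_io_bio() is the real btrfs counterpart of demo_io_bio()):

#include <linux/bio.h>
#include <linux/kernel.h>

/* hypothetical per-bio container; the embedded bio must come last */
struct demo_io_bio {
	unsigned int mirror_num;
	struct bio bio;
};

static struct bio_set *demo_bioset;

static int __init demo_bioset_setup(void)
{
	/* front_pad reserves room before each bio for the container */
	demo_bioset = bioset_create(BIO_POOL_SIZE,
				    offsetof(struct demo_io_bio, bio),
				    BIOSET_NEED_BVECS);
	return demo_bioset ? 0 : -ENOMEM;
}

/* recover the container from a bio allocated out of demo_bioset */
static inline struct demo_io_bio *demo_io_bio(struct bio *bio)
{
	return container_of(bio, struct demo_io_bio, bio);
}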
@@ -213,13 +204,13 @@ void extent_io_exit(void)
 }
 
 void extent_io_tree_init(struct extent_io_tree *tree,
-                        struct address_space *mapping)
+                        void *private_data)
 {
        tree->state = RB_ROOT;
        tree->ops = NULL;
        tree->dirty_bytes = 0;
        spin_lock_init(&tree->lock);
-       tree->mapping = mapping;
+       tree->private_data = private_data;
 }
 
 static struct extent_state *alloc_extent_state(gfp_t mask)
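This hunk is the heart of the decoupling: extent_io_tree no longer points at a struct address_space, it carries an opaque private_data and routes inode-specific work through extent_io_ops hooks (tree_fs_info, check_extent_io_range, set_range_writeback, and the *_hook callbacks below). A stripped-down sketch of the indirection, with hypothetical names:

/* hypothetical mirror of the ops/private_data indirection in this diff */
struct demo_tree_ops {
	struct btrfs_fs_info *(*tree_fs_info)(void *private_data);
};

struct demo_tree {
	const struct demo_tree_ops *ops;
	void *private_data;	/* opaque; btrfs passes the inode here */
};

static struct btrfs_fs_info *demo_tree_fs_info(struct demo_tree *tree)
{
	/* trees without ops (not inode-backed) simply have no fs_info */
	if (tree->ops)
		return tree->ops->tree_fs_info(tree->private_data);
	return NULL;
}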
@@ -238,7 +229,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
        state->failrec = NULL;
        RB_CLEAR_NODE(&state->rb_node);
        btrfs_leak_debug_add(&state->leak_list, &states);
-       atomic_set(&state->refs, 1);
+       refcount_set(&state->refs, 1);
        init_waitqueue_head(&state->wq);
        trace_alloc_extent_state(state, mask, _RET_IP_);
        return state;
@@ -248,7 +239,7 @@ void free_extent_state(struct extent_state *state)
 {
        if (!state)
                return;
-       if (atomic_dec_and_test(&state->refs)) {
+       if (refcount_dec_and_test(&state->refs)) {
                WARN_ON(extent_state_in_tree(state));
                btrfs_leak_debug_del(&state->leak_list);
                trace_free_extent_state(state, _RET_IP_);
@@ -369,8 +360,7 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
                     struct extent_state *other)
 {
        if (tree->ops && tree->ops->merge_extent_hook)
-               tree->ops->merge_extent_hook(tree->mapping->host, new,
-                                            other);
+               tree->ops->merge_extent_hook(tree->private_data, new, other);
 }
 
 /*
@@ -421,15 +411,14 @@ static void set_state_cb(struct extent_io_tree *tree,
                         struct extent_state *state, unsigned *bits)
 {
        if (tree->ops && tree->ops->set_bit_hook)
-               tree->ops->set_bit_hook(tree->mapping->host, state, bits);
+               tree->ops->set_bit_hook(tree->private_data, state, bits);
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
                           struct extent_state *state, unsigned *bits)
 {
        if (tree->ops && tree->ops->clear_bit_hook)
-               tree->ops->clear_bit_hook(BTRFS_I(tree->mapping->host),
-                               state, bits);
+               tree->ops->clear_bit_hook(tree->private_data, state, bits);
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
@@ -478,7 +467,7 @@ static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
                     u64 split)
 {
        if (tree->ops && tree->ops->split_extent_hook)
-               tree->ops->split_extent_hook(tree->mapping->host, orig, split);
+               tree->ops->split_extent_hook(tree->private_data, orig, split);
 }
 
 /*
@@ -641,7 +630,7 @@ again:
                if (cached && extent_state_in_tree(cached) &&
                    cached->start <= start && cached->end > start) {
                        if (clear)
-                               atomic_dec(&cached->refs);
+                               refcount_dec(&cached->refs);
                        state = cached;
                        goto hit_next;
                }
@@ -793,7 +782,7 @@ process_node:
 
                if (state->state & bits) {
                        start = state->start;
-                       atomic_inc(&state->refs);
+                       refcount_inc(&state->refs);
                        wait_on_state(tree, state);
                        free_extent_state(state);
                        goto again;
@@ -834,7 +823,7 @@ static void cache_state_if_flags(struct extent_state *state,
        if (cached_ptr && !(*cached_ptr)) {
                if (!flags || (state->state & flags)) {
                        *cached_ptr = state;
-                       atomic_inc(&state->refs);
+                       refcount_inc(&state->refs);
                }
        }
 }
@@ -1402,17 +1391,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
  */
 static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
-       unsigned long index = start >> PAGE_SHIFT;
-       unsigned long end_index = end >> PAGE_SHIFT;
-       struct page *page;
-
-       while (index <= end_index) {
-               page = find_get_page(tree->mapping, index);
-               BUG_ON(!page); /* Pages should be in the extent_io_tree */
-               set_page_writeback(page);
-               put_page(page);
-               index++;
-       }
+       tree->ops->set_range_writeback(tree->private_data, start, end);
 }
 
 /* find the first state struct with 'bits' set after 'start', and
@@ -1538,7 +1517,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
                if (!found) {
                        *start = state->start;
                        *cached_state = state;
-                       atomic_inc(&state->refs);
+                       refcount_inc(&state->refs);
                }
                found++;
                *end = state->end;
@@ -1961,11 +1940,12 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
                SetPageUptodate(page);
 }
 
-int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
+int free_io_failure(struct extent_io_tree *failure_tree,
+                   struct extent_io_tree *io_tree,
+                   struct io_failure_record *rec)
 {
        int ret;
        int err = 0;
-       struct extent_io_tree *failure_tree = &inode->io_failure_tree;
 
        set_state_failrec(failure_tree, rec->start, NULL);
        ret = clear_extent_bits(failure_tree, rec->start,
@@ -1974,7 +1954,7 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
        if (ret)
                err = ret;
 
-       ret = clear_extent_bits(&inode->io_tree, rec->start,
+       ret = clear_extent_bits(io_tree, rec->start,
                                rec->start + rec->len - 1,
                                EXTENT_DAMAGED);
        if (ret && !err)
@@ -1994,29 +1974,21 @@ int free_io_failure(struct btrfs_inode *inode, struct io_failure_record *rec)
  * currently, there can be no more than two copies of every data bit. thus,
  * exactly one rewrite is required.
  */
-int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
-               u64 logical, struct page *page,
-               unsigned int pg_offset, int mirror_num)
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
+                     u64 length, u64 logical, struct page *page,
+                     unsigned int pg_offset, int mirror_num)
 {
-       struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct bio *bio;
        struct btrfs_device *dev;
        u64 map_length = 0;
        u64 sector;
        struct btrfs_bio *bbio = NULL;
-       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
        int ret;
 
        ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
        BUG_ON(!mirror_num);
 
-       /* we can't repair anything in raid56 yet */
-       if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
-               return 0;
-
-       bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
-       if (!bio)
-               return -EIO;
+       bio = btrfs_io_bio_alloc(1);
        bio->bi_iter.bi_size = 0;
        map_length = length;
 
@@ -2026,17 +1998,35 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
         * read repair operation.
         */
        btrfs_bio_counter_inc_blocked(fs_info);
-       ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
-                             &map_length, &bbio, mirror_num);
-       if (ret) {
-               btrfs_bio_counter_dec(fs_info);
-               bio_put(bio);
-               return -EIO;
+       if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) {
+               /*
+                * Note that we don't use BTRFS_MAP_WRITE because it's supposed
+                * to update all raid stripes, but here we just want to correct
+                * the bad stripe, thus BTRFS_MAP_READ is abused to only get
+                * the bad stripe's dev and sector.
+                */
+               ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+                                     &map_length, &bbio, 0);
+               if (ret) {
+                       btrfs_bio_counter_dec(fs_info);
+                       bio_put(bio);
+                       return -EIO;
+               }
+               ASSERT(bbio->mirror_num == 1);
+       } else {
+               ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
+                                     &map_length, &bbio, mirror_num);
+               if (ret) {
+                       btrfs_bio_counter_dec(fs_info);
+                       bio_put(bio);
+                       return -EIO;
+               }
+               BUG_ON(mirror_num != bbio->mirror_num);
        }
-       BUG_ON(mirror_num != bbio->mirror_num);
-       sector = bbio->stripes[mirror_num-1].physical >> 9;
+
+       sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
        bio->bi_iter.bi_sector = sector;
-       dev = bbio->stripes[mirror_num-1].dev;
+       dev = bbio->stripes[bbio->mirror_num - 1].dev;
        btrfs_put_bbio(bbio);
        if (!dev || !dev->bdev || !dev->writeable) {
                btrfs_bio_counter_dec(fs_info);
@@ -2057,7 +2047,7 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length,
 
        btrfs_info_rl_in_rcu(fs_info,
                "read error corrected: ino %llu off %llu (dev %s sector %llu)",
-                                 btrfs_ino(inode), start,
+                                 ino, start,
                                  rcu_str_deref(dev->name), sector);
        btrfs_bio_counter_dec(fs_info);
        bio_put(bio);
@@ -2077,8 +2067,7 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
        for (i = 0; i < num_pages; i++) {
                struct page *p = eb->pages[i];
 
-               ret = repair_io_failure(BTRFS_I(fs_info->btree_inode), start,
-                                       PAGE_SIZE, start, p,
+               ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
                                        start - page_offset(p), mirror_num);
                if (ret)
                        break;
@@ -2092,24 +2081,24 @@ int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
  */
-int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page,
-                    unsigned int pg_offset)
+int clean_io_failure(struct btrfs_fs_info *fs_info,
+                    struct extent_io_tree *failure_tree,
+                    struct extent_io_tree *io_tree, u64 start,
+                    struct page *page, u64 ino, unsigned int pg_offset)
 {
        u64 private;
        struct io_failure_record *failrec;
-       struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct extent_state *state;
        int num_copies;
        int ret;
 
        private = 0;
-       ret = count_range_bits(&inode->io_failure_tree, &private,
-                               (u64)-1, 1, EXTENT_DIRTY, 0);
+       ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
+                              EXTENT_DIRTY, 0);
        if (!ret)
                return 0;
 
-       ret = get_state_failrec(&inode->io_failure_tree, start,
-                       &failrec);
+       ret = get_state_failrec(failure_tree, start, &failrec);
        if (ret)
                return 0;
 
@@ -2125,25 +2114,25 @@ int clean_io_failure(struct btrfs_inode *inode, u64 start, struct page *page,
        if (fs_info->sb->s_flags & MS_RDONLY)
                goto out;
 
-       spin_lock(&inode->io_tree.lock);
-       state = find_first_extent_bit_state(&inode->io_tree,
+       spin_lock(&io_tree->lock);
+       state = find_first_extent_bit_state(io_tree,
                                            failrec->start,
                                            EXTENT_LOCKED);
-       spin_unlock(&inode->io_tree.lock);
+       spin_unlock(&io_tree->lock);
 
        if (state && state->start <= failrec->start &&
            state->end >= failrec->start + failrec->len - 1) {
                num_copies = btrfs_num_copies(fs_info, failrec->logical,
                                              failrec->len);
                if (num_copies > 1)  {
-                       repair_io_failure(inode, start, failrec->len,
-                                         failrec->logical, page,
-                                         pg_offset, failrec->failed_mirror);
+                       repair_io_failure(fs_info, ino, start, failrec->len,
+                                         failrec->logical, page, pg_offset,
+                                         failrec->failed_mirror);
                }
        }
 
 out:
-       free_io_failure(inode, failrec);
+       free_io_failure(failure_tree, io_tree, failrec);
 
        return 0;
 }
@@ -2343,10 +2332,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
        struct btrfs_io_bio *btrfs_failed_bio;
        struct btrfs_io_bio *btrfs_bio;
 
-       bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
-       if (!bio)
-               return NULL;
-
+       bio = btrfs_io_bio_alloc(1);
        bio->bi_end_io = endio_func;
        bio->bi_iter.bi_sector = failrec->logical >> 9;
        bio->bi_bdev = fs_info->fs_devices->latest_bdev;
@@ -2384,8 +2370,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
        struct io_failure_record *failrec;
        struct inode *inode = page->mapping->host;
        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+       struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
        struct bio *bio;
        int read_mode = 0;
+       blk_status_t status;
        int ret;
 
        BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
@@ -2396,7 +2384,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 
        ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
        if (!ret) {
-               free_io_failure(BTRFS_I(inode), failrec);
+               free_io_failure(failure_tree, tree, failrec);
                return -EIO;
        }
 
@@ -2409,7 +2397,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
                                      (int)phy_offset, failed_bio->bi_end_io,
                                      NULL);
        if (!bio) {
-               free_io_failure(BTRFS_I(inode), failrec);
+               free_io_failure(failure_tree, tree, failrec);
                return -EIO;
        }
        bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
@@ -2418,11 +2406,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
                "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
                read_mode, failrec->this_mirror, failrec->in_validation);
 
-       ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror,
+       status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
                                         failrec->bio_flags, 0);
-       if (ret) {
-               free_io_failure(BTRFS_I(inode), failrec);
+       if (status) {
+               free_io_failure(failure_tree, tree, failrec);
                bio_put(bio);
+               ret = blk_status_to_errno(status);
        }
 
        return ret;
@@ -2445,7 +2434,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
        if (!uptodate) {
                ClearPageUptodate(page);
                SetPageError(page);
-               ret = ret < 0 ? ret : -EIO;
+               ret = err < 0 ? err : -EIO;
                mapping_set_error(page->mapping, ret);
        }
 }
@@ -2461,6 +2450,7 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
  */
 static void end_bio_extent_writepage(struct bio *bio)
 {
+       int error = blk_status_to_errno(bio->bi_status);
        struct bio_vec *bvec;
        u64 start;
        u64 end;
@@ -2490,7 +2480,7 @@ static void end_bio_extent_writepage(struct bio *bio)
                start = page_offset(page);
                end = start + bvec->bv_offset + bvec->bv_len - 1;
 
-               end_extent_writepage(page, bio->bi_error, start, end);
+               end_extent_writepage(page, error, start, end);
                end_page_writeback(page);
        }
 
@@ -2523,9 +2513,9 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
 static void end_bio_extent_readpage(struct bio *bio)
 {
        struct bio_vec *bvec;
-       int uptodate = !bio->bi_error;
+       int uptodate = !bio->bi_status;
        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
-       struct extent_io_tree *tree;
+       struct extent_io_tree *tree, *failure_tree;
        u64 offset = 0;
        u64 start;
        u64 end;
@@ -2543,9 +2533,10 @@ static void end_bio_extent_readpage(struct bio *bio)
 
                btrfs_debug(fs_info,
                        "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
-                       (u64)bio->bi_iter.bi_sector, bio->bi_error,
+                       (u64)bio->bi_iter.bi_sector, bio->bi_status,
                        io_bio->mirror_num);
                tree = &BTRFS_I(inode)->io_tree;
+               failure_tree = &BTRFS_I(inode)->io_failure_tree;
 
                /* We always issue full-page reads, but if some block
                 * in a page fails to read, blk_update_request() will
@@ -2575,8 +2566,10 @@ static void end_bio_extent_readpage(struct bio *bio)
                        if (ret)
                                uptodate = 0;
                        else
-                               clean_io_failure(BTRFS_I(inode), start,
-                                               page, 0);
+                               clean_io_failure(BTRFS_I(inode)->root->fs_info,
+                                                failure_tree, tree, start,
+                                                page,
+                                                btrfs_ino(BTRFS_I(inode)), 0);
                }
 
                if (likely(uptodate))
@@ -2602,7 +2595,7 @@ static void end_bio_extent_readpage(struct bio *bio)
                                ret = bio_readpage_error(bio, offset, page,
                                                         start, end, mirror);
                                if (ret == 0) {
-                                       uptodate = !bio->bi_error;
+                                       uptodate = !bio->bi_status;
                                        offset += len;
                                        continue;
                                }
@@ -2660,77 +2653,80 @@ readpage_ok:
                endio_readpage_release_extent(tree, extent_start, extent_len,
                                              uptodate);
        if (io_bio->end_io)
-               io_bio->end_io(io_bio, bio->bi_error);
+               io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status));
        bio_put(bio);
 }
 
 /*
- * this allocates from the btrfs_bioset.  We're returning a bio right now
- * but you can call btrfs_io_bio for the appropriate container_of magic
+ * Initialize the members up to but not including 'bio'. Use this after
+ * allocating a new bio with bio_alloc_bioset(), which does not initialize
+ * the bytes outside of 'bio' because __GFP_ZERO is not supported.
  */
-struct bio *
-btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
-               gfp_t gfp_flags)
+static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
 {
-       struct btrfs_io_bio *btrfs_bio;
-       struct bio *bio;
-
-       bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
+       memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
+}
 
-       if (bio == NULL && (current->flags & PF_MEMALLOC)) {
-               while (!bio && (nr_vecs /= 2)) {
-                       bio = bio_alloc_bioset(gfp_flags,
-                                              nr_vecs, btrfs_bioset);
-               }
-       }
+/*
+ * The following helpers allocate a bio. As it's backed by a bioset, it'll
+ * never fail.  We're returning a bio right now but you can call btrfs_io_bio
+ * for the appropriate container_of magic
+ */
+struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
+{
+       struct bio *bio;
 
-       if (bio) {
-               bio->bi_bdev = bdev;
-               bio->bi_iter.bi_sector = first_sector;
-               btrfs_bio = btrfs_io_bio(bio);
-               btrfs_bio->csum = NULL;
-               btrfs_bio->csum_allocated = NULL;
-               btrfs_bio->end_io = NULL;
-       }
+       bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset);
+       bio->bi_bdev = bdev;
+       bio->bi_iter.bi_sector = first_byte >> 9;
+       btrfs_io_bio_init(btrfs_io_bio(bio));
        return bio;
 }
 
-struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
+struct bio *btrfs_bio_clone(struct bio *bio)
 {
        struct btrfs_io_bio *btrfs_bio;
        struct bio *new;
 
-       new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
-       if (new) {
-               btrfs_bio = btrfs_io_bio(new);
-               btrfs_bio->csum = NULL;
-               btrfs_bio->csum_allocated = NULL;
-               btrfs_bio->end_io = NULL;
-       }
+       /* Bio allocation backed by a bioset does not fail */
+       new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset);
+       btrfs_bio = btrfs_io_bio(new);
+       btrfs_io_bio_init(btrfs_bio);
+       btrfs_bio->iter = bio->bi_iter;
        return new;
 }
 
-/* this also allocates from the btrfs_bioset */
-struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
 {
-       struct btrfs_io_bio *btrfs_bio;
        struct bio *bio;
 
-       bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
-       if (bio) {
-               btrfs_bio = btrfs_io_bio(bio);
-               btrfs_bio->csum = NULL;
-               btrfs_bio->csum_allocated = NULL;
-               btrfs_bio->end_io = NULL;
-       }
+       /* Bio allocation backed by a bioset does not fail */
+       bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset);
+       btrfs_io_bio_init(btrfs_io_bio(bio));
        return bio;
 }
 
+struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
+{
+       struct bio *bio;
+       struct btrfs_io_bio *btrfs_bio;
+
+       /* this will never fail when it's backed by a bioset */
+       bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset);
+       ASSERT(bio);
+
+       btrfs_bio = btrfs_io_bio(bio);
+       btrfs_io_bio_init(btrfs_bio);
+
+       bio_trim(bio, offset >> 9, size >> 9);
+       btrfs_bio->iter = bio->bi_iter;
+       return bio;
+}
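For illustration, a hedged sketch of a caller using the reworked helpers, loosely mirroring repair_io_failure() above; since allocation is bioset-backed, the old NULL checks disappear (demo_build_repair_bio() is hypothetical):

#include <linux/bio.h>

static struct bio *demo_build_repair_bio(struct block_device *bdev,
					 struct page *page,
					 unsigned int pg_offset,
					 sector_t sector)
{
	struct bio *bio = btrfs_io_bio_alloc(1);	/* cannot return NULL */

	bio->bi_bdev = bdev;
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, pg_offset);
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC);
	return bio;
}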
 
 static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
                                       unsigned long bio_flags)
 {
-       int ret = 0;
+       blk_status_t ret = 0;
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct page *page = bvec->bv_page;
        struct extent_io_tree *tree = bio->bi_private;
@@ -2742,13 +2738,13 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
        bio_get(bio);
 
        if (tree->ops)
-               ret = tree->ops->submit_bio_hook(page->mapping->host, bio,
+               ret = tree->ops->submit_bio_hook(tree->private_data, bio,
                                           mirror_num, bio_flags, start);
        else
                btrfsic_submit_bio(bio);
 
        bio_put(bio);
-       return ret;
+       return blk_status_to_errno(ret);
 }
 
 static int merge_bio(struct extent_io_tree *tree, struct page *page,
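These hunks belong to the tree-wide switch from bio->bi_error (an errno) to bio->bi_status (a blk_status_t); code that still returns errnos converts at the boundary, as submit_one_bio() now does. A minimal sketch of the round-trip using only the two conversion helpers:

#include <linux/blkdev.h>

static int demo_status_roundtrip(void)
{
	/* errno -> blk_status_t: -EIO maps to BLK_STS_IOERR */
	blk_status_t sts = errno_to_blk_status(-EIO);

	/* blk_status_t -> errno at an int-returning boundary */
	return blk_status_to_errno(sts);
}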
@@ -2805,14 +2801,11 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree,
                }
        }
 
-       bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES,
-                       GFP_NOFS | __GFP_HIGH);
-       if (!bio)
-               return -ENOMEM;
-
+       bio = btrfs_bio_alloc(bdev, sector << 9);
        bio_add_page(bio, page, page_size, offset);
        bio->bi_end_io = end_io_func;
        bio->bi_private = tree;
+       bio->bi_write_hint = page->mapping->host->i_write_hint;
        bio_set_op_attrs(bio, op, op_flags);
        if (wbc) {
                wbc_init_bio(wbc, bio);
@@ -2859,7 +2852,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
                em = *em_cached;
                if (extent_map_in_tree(em) && start >= em->start &&
                    start < extent_map_end(em)) {
-                       atomic_inc(&em->refs);
+                       refcount_inc(&em->refs);
                        return em;
                }
 
@@ -2870,7 +2863,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
        em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
        if (em_cached && !IS_ERR_OR_NULL(em)) {
                BUG_ON(*em_cached);
-               atomic_inc(&em->refs);
+               refcount_inc(&em->refs);
                *em_cached = em;
        }
        return em;
@@ -3694,7 +3687,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
                BUG_ON(!eb);
                done = atomic_dec_and_test(&eb->io_pages);
 
-               if (bio->bi_error ||
+               if (bio->bi_status ||
                    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
                        ClearPageUptodate(page);
                        set_btree_ioerr(page);
@@ -3744,7 +3737,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
                 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
                 */
                start = btrfs_item_nr_offset(nritems);
-               end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb);
+               end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb);
                memzero_extent_buffer(eb, start, end - start);
        }
 
@@ -4364,6 +4357,119 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
        return NULL;
 }
 
+/*
+ * Cache for the previous fiemap extent.
+ *
+ * Will be used for merging fiemap extents.
+ */
+struct fiemap_cache {
+       u64 offset;
+       u64 phys;
+       u64 len;
+       u32 flags;
+       bool cached;
+};
+
+/*
+ * Helper to submit a fiemap extent.
+ *
+ * Will try to merge the current fiemap extent specified by @offset, @phys,
+ * @len and @flags with the cached one.
+ * Only when we fail to merge is the cached extent submitted as a
+ * fiemap extent.
+ *
+ * Return value is the same as fiemap_fill_next_extent().
+ */
+static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
+                               struct fiemap_cache *cache,
+                               u64 offset, u64 phys, u64 len, u32 flags)
+{
+       int ret = 0;
+
+       if (!cache->cached)
+               goto assign;
+
+       /*
+        * Sanity check: extent_fiemap() should have ensured that the new
+        * fiemap extent won't overlap with the cached one.
+        * Not recoverable.
+        *
+        * NOTE: Physical addresses can overlap, due to compression.
+        */
+       if (cache->offset + cache->len > offset) {
+               WARN_ON(1);
+               return -EINVAL;
+       }
+
+       /*
+        * Only merge fiemap extents if
+        * 1) Their logical addresses are contiguous
+        *
+        * 2) Their physical addresses are contiguous
+        *    So truly compressed (physical size smaller than logical size)
+        *    extents won't get merged with each other
+        *
+        * 3) They share the same flags except FIEMAP_EXTENT_LAST
+        *    So a regular extent won't get merged with a prealloc extent
+        */
+       if (cache->offset + cache->len == offset &&
+           cache->phys + cache->len == phys &&
+           (cache->flags & ~FIEMAP_EXTENT_LAST) ==
+                       (flags & ~FIEMAP_EXTENT_LAST)) {
+               cache->len += len;
+               cache->flags |= flags;
+               goto try_submit_last;
+       }
+
+       /* Not mergeable, need to submit cached one */
+       ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+                                     cache->len, cache->flags);
+       cache->cached = false;
+       if (ret)
+               return ret;
+assign:
+       cache->cached = true;
+       cache->offset = offset;
+       cache->phys = phys;
+       cache->len = len;
+       cache->flags = flags;
+try_submit_last:
+       if (cache->flags & FIEMAP_EXTENT_LAST) {
+               ret = fiemap_fill_next_extent(fieinfo, cache->offset,
+                               cache->phys, cache->len, cache->flags);
+               cache->cached = false;
+       }
+       return ret;
+}
+
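A worked trace of the merge rule above, with hypothetical numbers (comment form, not btrfs code):

/*
 * emit_fiemap_extent(fieinfo, &cache, 0,    4096, 4096, 0);
 *     -> cache empty, extent cached: offset 0, phys 4096, len 4096
 *
 * emit_fiemap_extent(fieinfo, &cache, 4096, 8192, 4096, 0);
 *     -> logically and physically contiguous, same flags: merged,
 *        cache now covers offset 0, phys 4096, len 8192
 *
 * emit_fiemap_extent(fieinfo, &cache, 8192, 0, 4096, FIEMAP_EXTENT_ENCODED);
 *     -> phys not contiguous and flags differ: the cached 8K record is
 *        flushed via fiemap_fill_next_extent(), then this one is cached
 */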
+/*
+ * Emit the last fiemap cache
+ *
+ * The last fiemap extent may still be cached in the following case:
+ * 0                 4k                    8k
+ * |<- Fiemap range ->|
+ * |<------------  First extent ----------->|
+ *
+ * In this case, the first extent range will be cached but not emitted.
+ * So we must emit it before ending extent_fiemap().
+ */
+static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
+                                 struct fiemap_extent_info *fieinfo,
+                                 struct fiemap_cache *cache)
+{
+       int ret;
+
+       if (!cache->cached)
+               return 0;
+
+       ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
+                                     cache->len, cache->flags);
+       cache->cached = false;
+       if (ret > 0)
+               ret = 0;
+       return ret;
+}
+
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len, get_extent_t *get_extent)
 {
@@ -4381,6 +4487,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        struct extent_state *cached_state = NULL;
        struct btrfs_path *path;
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct fiemap_cache cache = { 0 };
        int end = 0;
        u64 em_start = 0;
        u64 em_len = 0;
@@ -4560,8 +4667,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        flags |= FIEMAP_EXTENT_LAST;
                        end = 1;
                }
-               ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
-                                             em_len, flags);
+               ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
+                                          em_len, flags);
                if (ret) {
                        if (ret == 1)
                                ret = 0;
@@ -4569,6 +4676,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                }
        }
 out_free:
+       if (!ret)
+               ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache);
        free_extent_map(em);
 out:
        btrfs_free_path(path);