Merge branch 'error-handling' into for-linus
author     Chris Mason <chris.mason@oracle.com>   Thu, 29 Mar 2012 00:31:37 +0000 (20:31 -0400)
committer  Chris Mason <chris.mason@oracle.com>   Thu, 29 Mar 2012 00:31:37 +0000 (20:31 -0400)
Conflicts:
fs/btrfs/ctree.c
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/inode.c
fs/btrfs/scrub.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
12 files changed:
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/inode-item.c
fs/btrfs/inode.c
fs/btrfs/reada.c
fs/btrfs/scrub.c
fs/btrfs/struct-funcs.c
fs/btrfs/volumes.c

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e697afd18159f5477d2089543c9adfd1ac3c9475..e801f226d7e028b72ca08b5726ed409215c27b84 100644
@@ -156,10 +156,23 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 {
        struct extent_buffer *eb;
 
-       rcu_read_lock();
-       eb = rcu_dereference(root->node);
-       extent_buffer_get(eb);
-       rcu_read_unlock();
+       while (1) {
+               rcu_read_lock();
+               eb = rcu_dereference(root->node);
+
+               /*
+                * RCU really hurts here: we could free the root node because
+                * it was COWed, but we may not see the new root node yet.  So
+                * do the inc_not_zero dance; if it fails, synchronize_rcu()
+                * and try again.
+                */
+               if (atomic_inc_not_zero(&eb->refs)) {
+                       rcu_read_unlock();
+                       break;
+               }
+               rcu_read_unlock();
+               synchronize_rcu();
+       }
        return eb;
 }
 
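The loop above is the standard pattern for taking a reference on an RCU-protected object whose refcount can hit zero concurrently: atomic_inc_not_zero() refuses to resurrect a dying buffer, and synchronize_rcu() waits out the grace period before the root pointer is re-read. A minimal userspace sketch of the inc-not-zero primitive itself, using C11 atomics (illustration only, not the kernel implementation):

	#include <stdatomic.h>
	#include <stdbool.h>

	struct obj {
		atomic_int refs;
	};

	/* Take a reference only if the object is still alive (refs > 0),
	 * mirroring the kernel's atomic_inc_not_zero(). */
	static bool obj_tryget(struct obj *o)
	{
		int r = atomic_load(&o->refs);

		while (r != 0) {
			/* on CAS failure, r is reloaded with the current value */
			if (atomic_compare_exchange_weak(&o->refs, &r, r + 1))
				return true;
		}
		return false;	/* refs hit zero: the object is being torn down */
	}
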
@@ -514,7 +527,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
-       free_extent_buffer(buf);
+       free_extent_buffer_stale(buf);
        btrfs_mark_buffer_dirty(cow);
        *cow_ret = cow;
        return 0;
@@ -974,7 +987,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                root_sub_used(root, mid->len);
                btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
                /* once for the root ptr */
-               free_extent_buffer(mid);
+               free_extent_buffer_stale(mid);
                return 0;
        }
        if (btrfs_header_nritems(mid) >
@@ -1028,7 +1041,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                        del_ptr(trans, root, path, level + 1, pslot + 1);
                        root_sub_used(root, right->len);
                        btrfs_free_tree_block(trans, root, right, 0, 1, 0);
-                       free_extent_buffer(right);
+                       free_extent_buffer_stale(right);
                        right = NULL;
                } else {
                        struct btrfs_disk_key right_key;
@@ -1070,7 +1083,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                del_ptr(trans, root, path, level + 1, pslot);
                root_sub_used(root, mid->len);
                btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
-               free_extent_buffer(mid);
+               free_extent_buffer_stale(mid);
                mid = NULL;
        } else {
                /* update the parent key to reflect our changes */
@@ -1396,7 +1409,8 @@ static noinline int reada_for_balance(struct btrfs_root *root,
  * if lowest_unlock is 1, level 0 won't be unlocked
  */
 static noinline void unlock_up(struct btrfs_path *path, int level,
-                              int lowest_unlock)
+                              int lowest_unlock, int min_write_lock_level,
+                              int *write_lock_level)
 {
        int i;
        int skip_level = level;
@@ -1428,6 +1442,11 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
                if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
                        btrfs_tree_unlock_rw(t, path->locks[i]);
                        path->locks[i] = 0;
+                       if (write_lock_level &&
+                           i > min_write_lock_level &&
+                           i <= *write_lock_level) {
+                               *write_lock_level = i - 1;
+                       }
                }
        }
 }
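For readers tracking the locking changes: the two new arguments let the caller's write_lock_level shrink as upper levels get unlocked. A hedged restatement of just that demotion rule, pulled out into a hypothetical helper:

	/* We just dropped the lock at level 'i'.  If the search believed it
	 * held write locks down to *write_lock_level, and 'i' sits inside
	 * that range but above the floor min_write_lock_level, lower the
	 * belief so later passes can take read locks there instead. */
	static void demote_write_lock_level(int i, int min_write_lock_level,
					    int *write_lock_level)
	{
		if (write_lock_level &&
		    i > min_write_lock_level &&
		    i <= *write_lock_level)
			*write_lock_level = i - 1;
	}
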
@@ -1651,6 +1670,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        /* everything at write_lock_level or lower must be write locked */
        int write_lock_level = 0;
        u8 lowest_level = 0;
+       int min_write_lock_level;
 
        lowest_level = p->lowest_level;
        WARN_ON(lowest_level && ins_len > 0);
@@ -1678,6 +1698,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        if (cow && (p->keep_locks || p->lowest_level))
                write_lock_level = BTRFS_MAX_LEVEL;
 
+       min_write_lock_level = write_lock_level;
+
 again:
        /*
         * we try very hard to do read locks on the root
@@ -1809,7 +1831,8 @@ cow_done:
                                goto again;
                        }
 
-                       unlock_up(p, level, lowest_unlock);
+                       unlock_up(p, level, lowest_unlock,
+                                 min_write_lock_level, &write_lock_level);
 
                        if (level == lowest_level) {
                                if (dec)
@@ -1871,7 +1894,8 @@ cow_done:
                                }
                        }
                        if (!p->search_for_split)
-                               unlock_up(p, level, lowest_unlock);
+                               unlock_up(p, level, lowest_unlock,
+                                         min_write_lock_level, &write_lock_level);
                        goto done;
                }
        }
@@ -2320,6 +2344,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 {
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *upper = path->nodes[1];
+       struct btrfs_map_token token;
        struct btrfs_disk_key disk_key;
        int slot;
        u32 i;
@@ -2331,6 +2356,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
        u32 data_end;
        u32 this_item_size;
 
+       btrfs_init_map_token(&token);
+
        if (empty)
                nr = 0;
        else
@@ -2408,8 +2435,8 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
        push_space = BTRFS_LEAF_DATA_SIZE(root);
        for (i = 0; i < right_nritems; i++) {
                item = btrfs_item_nr(right, i);
-               push_space -= btrfs_item_size(right, item);
-               btrfs_set_item_offset(right, item, push_space);
+               push_space -= btrfs_token_item_size(right, item, &token);
+               btrfs_set_token_item_offset(right, item, push_space, &token);
        }
 
        left_nritems -= push_items;
@@ -2539,6 +2566,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        int ret = 0;
        u32 this_item_size;
        u32 old_left_item_size;
+       struct btrfs_map_token token;
+
+       btrfs_init_map_token(&token);
 
        if (empty)
                nr = min(right_nritems, max_slot);
@@ -2599,9 +2629,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 
                item = btrfs_item_nr(left, i);
 
-               ioff = btrfs_item_offset(left, item);
-               btrfs_set_item_offset(left, item,
-                     ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
+               ioff = btrfs_token_item_offset(left, item, &token);
+               btrfs_set_token_item_offset(left, item,
+                     ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
+                     &token);
        }
        btrfs_set_header_nritems(left, old_left_nritems + push_items);
 
@@ -2631,8 +2662,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        for (i = 0; i < right_nritems; i++) {
                item = btrfs_item_nr(right, i);
 
-               push_space = push_space - btrfs_item_size(right, item);
-               btrfs_set_item_offset(right, item, push_space);
+               push_space = push_space - btrfs_token_item_size(right,
+                                                               item, &token);
+               btrfs_set_token_item_offset(right, item, push_space, &token);
        }
 
        btrfs_mark_buffer_dirty(left);
@@ -2748,6 +2780,9 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
        int rt_data_off;
        int i;
        struct btrfs_disk_key disk_key;
+       struct btrfs_map_token token;
+
+       btrfs_init_map_token(&token);
 
        nritems = nritems - mid;
        btrfs_set_header_nritems(right, nritems);
@@ -2769,8 +2804,9 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
                struct btrfs_item *item = btrfs_item_nr(right, i);
                u32 ioff;
 
-               ioff = btrfs_item_offset(right, item);
-               btrfs_set_item_offset(right, item, ioff + rt_data_off);
+               ioff = btrfs_token_item_offset(right, item, &token);
+               btrfs_set_token_item_offset(right, item,
+                                           ioff + rt_data_off, &token);
        }
 
        btrfs_set_header_nritems(l, mid);
@@ -3246,6 +3282,9 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
        unsigned int old_size;
        unsigned int size_diff;
        int i;
+       struct btrfs_map_token token;
+
+       btrfs_init_map_token(&token);
 
        leaf = path->nodes[0];
        slot = path->slots[0];
@@ -3272,8 +3311,9 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
                u32 ioff;
                item = btrfs_item_nr(leaf, i);
 
-               ioff = btrfs_item_offset(leaf, item);
-               btrfs_set_item_offset(leaf, item, ioff + size_diff);
+               ioff = btrfs_token_item_offset(leaf, item, &token);
+               btrfs_set_token_item_offset(leaf, item,
+                                           ioff + size_diff, &token);
        }
 
        /* shift the data */
@@ -3342,6 +3382,9 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
        unsigned int old_data;
        unsigned int old_size;
        int i;
+       struct btrfs_map_token token;
+
+       btrfs_init_map_token(&token);
 
        leaf = path->nodes[0];
 
@@ -3371,8 +3414,9 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
                u32 ioff;
                item = btrfs_item_nr(leaf, i);
 
-               ioff = btrfs_item_offset(leaf, item);
-               btrfs_set_item_offset(leaf, item, ioff - data_size);
+               ioff = btrfs_token_item_offset(leaf, item, &token);
+               btrfs_set_token_item_offset(leaf, item,
+                                           ioff - data_size, &token);
        }
 
        /* shift the data */
@@ -3414,6 +3458,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
        unsigned int data_end;
        struct btrfs_disk_key disk_key;
        struct btrfs_key found_key;
+       struct btrfs_map_token token;
+
+       btrfs_init_map_token(&token);
 
        for (i = 0; i < nr; i++) {
                if (total_size + data_size[i] + sizeof(struct btrfs_item) >
@@ -3479,8 +3526,9 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
                        u32 ioff;
 
                        item = btrfs_item_nr(leaf, i);
-                       ioff = btrfs_item_offset(leaf, item);
-                       btrfs_set_item_offset(leaf, item, ioff - total_data);
+                       ioff = btrfs_token_item_offset(leaf, item, &token);
+                       btrfs_set_token_item_offset(leaf, item,
+                                                   ioff - total_data, &token);
                }
                /* shift the items */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
@@ -3507,9 +3555,10 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
                btrfs_set_item_key(leaf, &disk_key, slot + i);
                item = btrfs_item_nr(leaf, slot + i);
-               btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+               btrfs_set_token_item_offset(leaf, item,
+                                           data_end - data_size[i], &token);
                data_end -= data_size[i];
-               btrfs_set_item_size(leaf, item, data_size[i]);
+               btrfs_set_token_item_size(leaf, item, data_size[i], &token);
        }
        btrfs_set_header_nritems(leaf, nritems + nr);
        btrfs_mark_buffer_dirty(leaf);
@@ -3547,6 +3596,9 @@ void setup_items_for_insert(struct btrfs_trans_handle *trans,
        struct btrfs_disk_key disk_key;
        struct extent_buffer *leaf;
        int slot;
+       struct btrfs_map_token token;
+
+       btrfs_init_map_token(&token);
 
        leaf = path->nodes[0];
        slot = path->slots[0];
@@ -3578,8 +3630,9 @@ void setup_items_for_insert(struct btrfs_trans_handle *trans,
                        u32 ioff;
 
                        item = btrfs_item_nr(leaf, i);
-                       ioff = btrfs_item_offset(leaf, item);
-                       btrfs_set_item_offset(leaf, item, ioff - total_data);
+                       ioff = btrfs_token_item_offset(leaf, item, &token);
+                       btrfs_set_token_item_offset(leaf, item,
+                                                   ioff - total_data, &token);
                }
                /* shift the items */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
@@ -3598,9 +3651,10 @@ void setup_items_for_insert(struct btrfs_trans_handle *trans,
                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
                btrfs_set_item_key(leaf, &disk_key, slot + i);
                item = btrfs_item_nr(leaf, slot + i);
-               btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+               btrfs_set_token_item_offset(leaf, item,
+                                           data_end - data_size[i], &token);
                data_end -= data_size[i];
-               btrfs_set_item_size(leaf, item, data_size[i]);
+               btrfs_set_token_item_size(leaf, item, data_size[i], &token);
        }
 
        btrfs_set_header_nritems(leaf, nritems + nr);
@@ -3740,7 +3794,9 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
 
        root_sub_used(root, leaf->len);
 
+       extent_buffer_get(leaf);
        btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
+       free_extent_buffer_stale(leaf);
 }
 /*
  * delete the item at the leaf level in path.  If that empties
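The extra reference taken around btrfs_free_tree_block() in the hunk above is the new lifetime pattern for freed tree blocks; the same three lines, annotated (a paraphrase of the hunk, not new behavior):

	extent_buffer_get(leaf);	/* pin: keep the eb valid across the free */
	btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
	free_extent_buffer_stale(leaf);	/* unpin and mark stale, so a later
					 * allocation of the same bytenr gets a
					 * fresh buffer instead of this one */
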
@@ -3757,6 +3813,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        int wret;
        int i;
        u32 nritems;
+       struct btrfs_map_token token;
+
+       btrfs_init_map_token(&token);
 
        leaf = path->nodes[0];
        last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
@@ -3778,8 +3837,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                        u32 ioff;
 
                        item = btrfs_item_nr(leaf, i);
-                       ioff = btrfs_item_offset(leaf, item);
-                       btrfs_set_item_offset(leaf, item, ioff + dsize);
+                       ioff = btrfs_token_item_offset(leaf, item, &token);
+                       btrfs_set_token_item_offset(leaf, item,
+                                                   ioff + dsize, &token);
                }
 
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -4013,7 +4073,7 @@ find_next_key:
                path->slots[level] = slot;
                if (level == path->lowest_level) {
                        ret = 0;
-                       unlock_up(path, level, 1);
+                       unlock_up(path, level, 1, 0, NULL);
                        goto out;
                }
                btrfs_set_path_blocking(path);
@@ -4024,7 +4084,7 @@ find_next_key:
 
                path->locks[level - 1] = BTRFS_READ_LOCK;
                path->nodes[level - 1] = cur;
-               unlock_up(path, level, 1);
+               unlock_up(path, level, 1, 0, NULL);
                btrfs_clear_path_blocking(path, NULL, 0);
        }
 out:
@@ -4260,7 +4320,7 @@ again:
        }
        ret = 0;
 done:
-       unlock_up(path, 0, 1);
+       unlock_up(path, 0, 1, 0, NULL);
        path->leave_spinning = old_spinning;
        if (!old_spinning)
                btrfs_set_path_blocking(path);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b6ebea5582c62a7d12a656563874d04d8ffb471c..ed2d196f7a844df675651a387c7e295ebd2743b1 100644
@@ -48,6 +48,8 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAGIC "_BHRfS_M"
 
+#define BTRFS_MAX_MIRRORS 2
+
 #define BTRFS_MAX_LEVEL 8
 
 #define BTRFS_COMPAT_EXTENT_TREE_V0
@@ -137,6 +139,12 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
 
+/*
+ * the max metadata block size.  This limit is somewhat artificial,
+ * but the memmove costs go through the roof for larger blocks.
+ */
+#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
+
 /*
  * we can actually store much bigger names, but lets not confuse the rest
  * of linux
@@ -461,6 +469,19 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL  (1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS    (1ULL << 2)
 #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO    (1ULL << 3)
+/*
+ * Some patches floated around with a second compression method;
+ * let's reserve the incompat bit here for when they land.  Note we
+ * don't actually support it, we're just reserving the number.
+ */
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2  (1ULL << 4)
+
+/*
+ * Older kernels tried to do bigger metadata blocks, but the
+ * code was pretty buggy.  Let's not let them try anymore.
+ */
+#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA    (1ULL << 5)
 
 #define BTRFS_FEATURE_COMPAT_SUPP              0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP           0ULL
@@ -468,6 +489,7 @@ struct btrfs_super_block {
        (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |         \
         BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |        \
         BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |          \
+        BTRFS_FEATURE_INCOMPAT_BIG_METADATA |          \
         BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
 
 /*
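Adding BTRFS_FEATURE_INCOMPAT_BIG_METADATA to the SUPP mask matters because mount rejects any on-disk incompat bit outside that mask. A hedged sketch of that gate, simplified from the usual open_ctree() rejection of unknown incompat bits:

	u64 features;

	features = btrfs_super_incompat_flags(disk_super) &
		   ~BTRFS_FEATURE_INCOMPAT_SUPP;
	if (features) {
		/* the filesystem uses a feature this kernel doesn't know */
		err = -EINVAL;
		goto fail_alloc;
	}
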
@@ -1527,6 +1549,17 @@ struct btrfs_ioctl_defrag_range_args {
 
 #define BTRFS_INODE_ROOT_ITEM_INIT     (1 << 31)
 
+struct btrfs_map_token {
+       struct extent_buffer *eb;
+       char *kaddr;
+       unsigned long offset;
+};
+
+static inline void btrfs_init_map_token(struct btrfs_map_token *token)
+{
+       memset(token, 0, sizeof(*token));
+}
+
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
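The token is a one-entry cache of the last kernel mapping used on an extent buffer, so loops that touch many items on the same leaf page (as in the ctree.c hunks above) skip the per-access page lookup. A minimal userspace sketch of the idea, with simplified names (not the btrfs code):

	#define PAGE_SIZE 4096UL

	struct buffer {
		char **pages;		/* backing pages of the buffer */
	};

	struct map_token {
		struct buffer *buf;	/* buffer the cached mapping is for */
		char *kaddr;		/* address of the cached page */
		unsigned long page_off;	/* buffer offset that page starts at */
	};

	static char *token_map(struct buffer *buf, unsigned long off,
			       struct map_token *tok)
	{
		unsigned long page_start = off & ~(PAGE_SIZE - 1);

		/* hit: same buffer, same page as last time */
		if (tok->buf == buf && tok->page_off == page_start)
			return tok->kaddr + (off - page_start);

		/* miss: do the lookup once and remember it */
		tok->buf = buf;
		tok->page_off = page_start;
		tok->kaddr = buf->pages[off / PAGE_SIZE];
		return tok->kaddr + (off - page_start);
	}
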
@@ -1550,20 +1583,22 @@ struct btrfs_ioctl_defrag_range_args {
 #ifndef BTRFS_SETGET_FUNCS
 #define BTRFS_SETGET_FUNCS(name, type, member, bits)                   \
 u##bits btrfs_##name(struct extent_buffer *eb, type *s);               \
+u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, struct btrfs_map_token *token);          \
+void btrfs_set_token_##name(struct extent_buffer *eb, type *s, u##bits val, struct btrfs_map_token *token);\
 void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
 #endif
 
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)            \
 static inline u##bits btrfs_##name(struct extent_buffer *eb)           \
 {                                                                      \
-       type *p = page_address(eb->first_page);                         \
+       type *p = page_address(eb->pages[0]);                           \
        u##bits res = le##bits##_to_cpu(p->member);                     \
        return res;                                                     \
 }                                                                      \
 static inline void btrfs_set_##name(struct extent_buffer *eb,          \
                                    u##bits val)                        \
 {                                                                      \
-       type *p = page_address(eb->first_page);                         \
+       type *p = page_address(eb->pages[0]);                           \
        p->member = cpu_to_le##bits(val);                               \
 }
 
@@ -2467,8 +2502,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  u64 num_bytes, u64 min_alloc_size,
                                  u64 empty_size, u64 hint_byte,
-                                 u64 search_end, struct btrfs_key *ins,
-                                 u64 data);
+                                 struct btrfs_key *ins, u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fe087847c8e75b05721bc7bbeb195e30ec1a5b54..7b55eee15a51265e3d670cafa57c2772b8f0614a 100644
@@ -333,7 +333,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 
        lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
                         0, &cached_state);
-       if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
+       if (extent_buffer_uptodate(eb) &&
            btrfs_header_generation(eb) == parent_transid) {
                ret = 0;
                goto out;
@@ -344,7 +344,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
                       (unsigned long long)parent_transid,
                       (unsigned long long)btrfs_header_generation(eb));
        ret = 1;
-       clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
+       clear_extent_buffer_uptodate(eb);
 out:
        unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
                             &cached_state, GFP_NOFS);
@@ -360,9 +360,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
                                          u64 start, u64 parent_transid)
 {
        struct extent_io_tree *io_tree;
+       int failed = 0;
        int ret;
        int num_copies = 0;
        int mirror_num = 0;
+       int failed_mirror = 0;
 
        clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
        io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
@@ -370,9 +372,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
                ret = read_extent_buffer_pages(io_tree, eb, start,
                                               WAIT_COMPLETE,
                                               btree_get_extent, mirror_num);
-               if (!ret &&
-                   !verify_parent_transid(io_tree, eb, parent_transid))
-                       return ret;
+               if (!ret && !verify_parent_transid(io_tree, eb, parent_transid))
+                       break;
 
                /*
                 * This buffer's crc is fine, but its contents are corrupted, so
@@ -380,18 +381,31 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
                 * any less wrong.
                 */
                if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
-                       return ret;
+                       break;
+
+               if (!failed_mirror) {
+                       failed = 1;
+                       printk(KERN_ERR "failed mirror was %d\n", eb->failed_mirror);
+                       failed_mirror = eb->failed_mirror;
+               }
 
                num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
                                              eb->start, eb->len);
                if (num_copies == 1)
-                       return ret;
+                       break;
 
                mirror_num++;
+               if (mirror_num == failed_mirror)
+                       mirror_num++;
+
                if (mirror_num > num_copies)
-                       return ret;
+                       break;
        }
-       return -EIO;
+
+       if (failed && !ret)
+               repair_eb_io_failure(root, eb, failed_mirror);
+
+       return ret;
 }
 
 /*
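The rewritten loop implements a simple retry policy: try each mirror in turn, remember the mirror that produced the first failure so it is never retried, and once a later mirror reads clean, repair the bad copy. The policy in isolation, with a hypothetical read_mirror() stand-in for the kernel pieces:

	#include <stdbool.h>

	static bool read_mirror(int mirror);	/* hypothetical: true on clean read */

	/* Try mirrors 1..num_copies, skipping failed_mirror (the copy whose
	 * error got us here).  Returns the mirror that read clean, or 0 if
	 * every copy is bad. */
	static int retry_mirrors(int num_copies, int failed_mirror)
	{
		int mirror;

		for (mirror = 1; mirror <= num_copies; mirror++) {
			if (mirror == failed_mirror)
				continue;
			if (read_mirror(mirror))
				return mirror;
		}
		return 0;
	}
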
@@ -404,59 +418,28 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
        struct extent_io_tree *tree;
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 found_start;
-       unsigned long len;
        struct extent_buffer *eb;
-       int ret = -EIO;
 
        tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-       if (page->private == EXTENT_PAGE_PRIVATE) {
-               WARN_ON(1);
-               goto out;
-       }
-       if (!page->private) {
-               WARN_ON(1);
-               goto out;
-       }
-       len = page->private >> 2;
-       WARN_ON(len == 0);
-
-       eb = alloc_extent_buffer(tree, start, len, page);
-       if (eb == NULL) {
-               WARN_ON(1);
-               ret = -ENOMEM;
-               goto out;
-       }
-       ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
-                                            btrfs_header_generation(eb));
-       if (ret) {
-               btrfs_printk(root->fs_info, KERN_WARNING
-                            "Failed to checksum dirty buffer @ %llu[%lu]\n",
-                             start, len);
-               goto err;
-       }
-       WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
-
-       ret = -EIO;
+       eb = (struct extent_buffer *)page->private;
+       if (page != eb->pages[0])
+               return 0;
        found_start = btrfs_header_bytenr(eb);
        if (found_start != start) {
                WARN_ON(1);
-               goto err;
+               return 0;
        }
-       if (eb->first_page != page) {
+       if (eb->pages[0] != page) {
                WARN_ON(1);
-               goto err;
+               return 0;
        }
        if (!PageUptodate(page)) {
                WARN_ON(1);
-               goto err;
+               return 0;
        }
        csum_tree_block(root, eb, 0);
-       ret = 0;
-err:
-       free_extent_buffer(eb);
-out:
-       return ret;
+       return 0;
 }
 
 static int check_tree_block_fsid(struct btrfs_root *root,
@@ -545,34 +528,74 @@ static noinline int check_leaf(struct btrfs_root *root,
        return 0;
 }
 
+struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
+                                      struct page *page, int max_walk)
+{
+       struct extent_buffer *eb;
+       u64 start = page_offset(page);
+       u64 target = start;
+       u64 min_start;
+
+       if (start < max_walk)
+               min_start = 0;
+       else
+               min_start = start - max_walk;
+
+       while (start >= min_start) {
+               eb = find_extent_buffer(tree, start, 0);
+               if (eb) {
+                       /*
+                        * we found an extent buffer and it contains our page
+                        * we found an extent buffer and it contains our
+                        * page.  Hooray!
+                       if (eb->start <= target &&
+                           eb->start + eb->len > target)
+                               return eb;
+
+                       /* we found an extent buffer that wasn't for us */
+                       free_extent_buffer(eb);
+                       return NULL;
+               }
+               if (start == 0)
+                       break;
+               start -= PAGE_CACHE_SIZE;
+       }
+       return NULL;
+}
+
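A hedged usage sketch: with metadata blocks now allowed to span several pages, the buffer covering a page may start up to one block before it, hence the backwards walk (hypothetical caller):

	/* find the extent buffer covering 'page', looking back at most one
	 * maximum-sized metadata block */
	eb = find_eb_for_page(tree, page, BTRFS_MAX_METADATA_BLOCKSIZE);
	if (eb) {
		/* ... use eb ... */
		free_extent_buffer(eb);	/* drop the ref the lookup took */
	}
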
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                               struct extent_state *state)
 {
        struct extent_io_tree *tree;
        u64 found_start;
        int found_level;
-       unsigned long len;
        struct extent_buffer *eb;
        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
        int ret = 0;
+       int reads_done;
 
-       tree = &BTRFS_I(page->mapping->host)->io_tree;
-       if (page->private == EXTENT_PAGE_PRIVATE)
-               goto out;
        if (!page->private)
                goto out;
 
-       len = page->private >> 2;
-       WARN_ON(len == 0);
+       tree = &BTRFS_I(page->mapping->host)->io_tree;
+       eb = (struct extent_buffer *)page->private;
+
+       /* the pending IO might have been the only thing that kept this buffer
+        * in memory.  Make sure we have a ref for all the other checks.
+        */
+       extent_buffer_get(eb);
+
+       reads_done = atomic_dec_and_test(&eb->io_pages);
+       if (!reads_done)
+               goto err;
 
-       eb = alloc_extent_buffer(tree, start, len, page);
-       if (eb == NULL) {
+       if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
                ret = -EIO;
-               goto out;
+               goto err;
        }
 
        found_start = btrfs_header_bytenr(eb);
-       if (found_start != start) {
+       if (found_start != eb->start) {
                printk_ratelimited(KERN_INFO "btrfs bad tree block start "
                               "%llu %llu\n",
                               (unsigned long long)found_start,
@@ -580,13 +603,6 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                ret = -EIO;
                goto err;
        }
-       if (eb->first_page != page) {
-               printk(KERN_INFO "btrfs bad first page %lu %lu\n",
-                      eb->first_page->index, page->index);
-               WARN_ON(1);
-               ret = -EIO;
-               goto err;
-       }
        if (check_tree_block_fsid(root, eb)) {
                printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
                               (unsigned long long)eb->start);
@@ -614,48 +630,31 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                ret = -EIO;
        }
 
-       end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
-       end = eb->start + end - 1;
+       if (!ret)
+               set_extent_buffer_uptodate(eb);
 err:
        if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
                clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
                btree_readahead_hook(root, eb, eb->start, ret);
        }
 
+       if (ret)
+               clear_extent_buffer_uptodate(eb);
        free_extent_buffer(eb);
 out:
        return ret;
 }
 
-static int btree_io_failed_hook(struct bio *failed_bio,
-                        struct page *page, u64 start, u64 end,
-                        int mirror_num, struct extent_state *state)
+static int btree_io_failed_hook(struct page *page, int failed_mirror)
 {
-       struct extent_io_tree *tree;
-       unsigned long len;
        struct extent_buffer *eb;
        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 
-       tree = &BTRFS_I(page->mapping->host)->io_tree;
-       if (page->private == EXTENT_PAGE_PRIVATE)
-               goto out;
-       if (!page->private)
-               goto out;
-
-       len = page->private >> 2;
-       WARN_ON(len == 0);
-
-       eb = alloc_extent_buffer(tree, start, len, page);
-       if (eb == NULL)
-               goto out;
-
-       if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
-               clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+       eb = (struct extent_buffer *)page->private;
+       set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+       eb->failed_mirror = failed_mirror;
+       if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
                btree_readahead_hook(root, eb, eb->start, -EIO);
-       }
-       free_extent_buffer(eb);
-
-out:
        return -EIO;    /* we fixed nothing */
 }
 
@@ -868,15 +867,16 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 {
        int ret;
 
-       ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, bio, 1);
-       if (ret)
-               return ret;
-
        if (!(rw & REQ_WRITE)) {
+
                /*
                 * called for a read, do the setup so that checksum validation
                 * can happen in the async kernel threads
                 */
+               ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+                                         bio, 1);
+               if (ret)
+                       return ret;
                return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
                                     mirror_num, 0);
        }
@@ -914,34 +914,6 @@ static int btree_migratepage(struct address_space *mapping,
 }
 #endif
 
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
-{
-       struct extent_io_tree *tree;
-       struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-       struct extent_buffer *eb;
-       int was_dirty;
-
-       tree = &BTRFS_I(page->mapping->host)->io_tree;
-       if (!(current->flags & PF_MEMALLOC)) {
-               return extent_write_full_page(tree, page,
-                                             btree_get_extent, wbc);
-       }
-
-       redirty_page_for_writepage(wbc, page);
-       eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
-       WARN_ON(!eb);
-
-       was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
-       if (!was_dirty) {
-               spin_lock(&root->fs_info->delalloc_lock);
-               root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
-               spin_unlock(&root->fs_info->delalloc_lock);
-       }
-       free_extent_buffer(eb);
-
-       unlock_page(page);
-       return 0;
-}
 
 static int btree_writepages(struct address_space *mapping,
                            struct writeback_control *wbc)
@@ -961,7 +933,7 @@ static int btree_writepages(struct address_space *mapping,
                if (num_dirty < thresh)
                        return 0;
        }
-       return extent_writepages(tree, mapping, btree_get_extent, wbc);
+       return btree_write_cache_pages(mapping, wbc);
 }
 
 static int btree_readpage(struct file *file, struct page *page)
@@ -973,16 +945,8 @@ static int btree_readpage(struct file *file, struct page *page)
 
 static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 {
-       struct extent_io_tree *tree;
-       struct extent_map_tree *map;
-       int ret;
-
        if (PageWriteback(page) || PageDirty(page))
                return 0;
-
-       tree = &BTRFS_I(page->mapping->host)->io_tree;
-       map = &BTRFS_I(page->mapping->host)->extent_tree;
-
        /*
         * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
         * slab allocation from alloc_extent_state down the callchain where
@@ -990,18 +954,7 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
         */
        gfp_flags &= ~GFP_SLAB_BUG_MASK;
 
-       ret = try_release_extent_state(map, tree, page, gfp_flags);
-       if (!ret)
-               return 0;
-
-       ret = try_release_extent_buffer(tree, page);
-       if (ret == 1) {
-               ClearPagePrivate(page);
-               set_page_private(page, 0);
-               page_cache_release(page);
-       }
-
-       return ret;
+       return try_release_extent_buffer(page, gfp_flags);
 }
 
 static void btree_invalidatepage(struct page *page, unsigned long offset)
@@ -1019,15 +972,28 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
        }
 }
 
+static int btree_set_page_dirty(struct page *page)
+{
+       struct extent_buffer *eb;
+
+       BUG_ON(!PagePrivate(page));
+       eb = (struct extent_buffer *)page->private;
+       BUG_ON(!eb);
+       BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+       BUG_ON(!atomic_read(&eb->refs));
+       btrfs_assert_tree_locked(eb);
+       return __set_page_dirty_nobuffers(page);
+}
+
 static const struct address_space_operations btree_aops = {
        .readpage       = btree_readpage,
-       .writepage      = btree_writepage,
        .writepages     = btree_writepages,
        .releasepage    = btree_releasepage,
        .invalidatepage = btree_invalidatepage,
 #ifdef CONFIG_MIGRATION
        .migratepage    = btree_migratepage,
 #endif
+       .set_page_dirty = btree_set_page_dirty,
 };
 
 int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -1070,7 +1036,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
        if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
                free_extent_buffer(buf);
                return -EIO;
-       } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+       } else if (extent_buffer_uptodate(buf)) {
                *eb = buf;
        } else {
                free_extent_buffer(buf);
@@ -1095,20 +1061,20 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
        struct extent_buffer *eb;
 
        eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
-                                bytenr, blocksize, NULL);
+                                bytenr, blocksize);
        return eb;
 }
 
 
 int btrfs_write_tree_block(struct extent_buffer *buf)
 {
-       return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+       return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
                                        buf->start + buf->len - 1);
 }
 
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
-       return filemap_fdatawait_range(buf->first_page->mapping,
+       return filemap_fdatawait_range(buf->pages[0]->mapping,
                                       buf->start, buf->start + buf->len - 1);
 }
 
@@ -1123,9 +1089,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
                return NULL;
 
        ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
-
-       if (ret == 0)
-               set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
        return buf;
 
 }
@@ -1133,7 +1096,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      struct extent_buffer *buf)
 {
-       struct inode *btree_inode = root->fs_info->btree_inode;
        if (btrfs_header_generation(buf) ==
            root->fs_info->running_transaction->transid) {
                btrfs_assert_tree_locked(buf);
@@ -1155,8 +1117,7 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
                /* ugh, clear_extent_buffer_dirty needs to lock the page */
                btrfs_set_lock_blocking(buf);
-               clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
-                                         buf);
+               clear_extent_buffer_dirty(buf);
        }
 }
 
@@ -1539,41 +1500,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
        return 0;
 }
 
-static int bio_ready_for_csum(struct bio *bio)
-{
-       u64 length = 0;
-       u64 buf_len = 0;
-       u64 start = 0;
-       struct page *page;
-       struct extent_io_tree *io_tree = NULL;
-       struct bio_vec *bvec;
-       int i;
-       int ret;
-
-       bio_for_each_segment(bvec, bio, i) {
-               page = bvec->bv_page;
-               if (page->private == EXTENT_PAGE_PRIVATE) {
-                       length += bvec->bv_len;
-                       continue;
-               }
-               if (!page->private) {
-                       length += bvec->bv_len;
-                       continue;
-               }
-               length = bvec->bv_len;
-               buf_len = page->private >> 2;
-               start = page_offset(page) + bvec->bv_offset;
-               io_tree = &BTRFS_I(page->mapping->host)->io_tree;
-       }
-       /* are we fully contained in this bio? */
-       if (buf_len <= length)
-               return 1;
-
-       ret = extent_range_uptodate(io_tree, start + length,
-                                   start + buf_len - 1);
-       return ret;
-}
-
 /*
  * called by the kthread helper functions to finally call the bio end_io
  * functions.  This is where read checksum verification actually happens
@@ -1589,17 +1515,6 @@ static void end_workqueue_fn(struct btrfs_work *work)
        bio = end_io_wq->bio;
        fs_info = end_io_wq->info;
 
-       /* metadata bio reads are special because the whole tree block must
-        * be checksummed at once.  This makes sure the entire block is in
-        * ram and up to date before trying to verify things.  For
-        * blocksize <= pagesize, it is basically a noop
-        */
-       if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
-           !bio_ready_for_csum(bio)) {
-               btrfs_queue_worker(&fs_info->endio_meta_workers,
-                                  &end_io_wq->work);
-               return;
-       }
        error = end_io_wq->error;
        bio->bi_private = end_io_wq->private;
        bio->bi_end_io = end_io_wq->end_io;
@@ -2073,6 +1988,7 @@ int open_ctree(struct super_block *sb,
        RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
        extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
                             fs_info->btree_inode->i_mapping);
+       BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
        extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
 
        BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
@@ -2171,10 +2087,38 @@ int open_ctree(struct super_block *sb,
                goto fail_alloc;
        }
 
+       if (btrfs_super_leafsize(disk_super) !=
+           btrfs_super_nodesize(disk_super)) {
+               printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+                      "blocksizes don't match.  node %d leaf %d\n",
+                      btrfs_super_nodesize(disk_super),
+                      btrfs_super_leafsize(disk_super));
+               err = -EINVAL;
+               goto fail_alloc;
+       }
+       if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+               printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+                      "blocksize (%d) was too large\n",
+                      btrfs_super_leafsize(disk_super));
+               err = -EINVAL;
+               goto fail_alloc;
+       }
+
        features = btrfs_super_incompat_flags(disk_super);
        features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
        if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
                features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+
+       /*
+        * flag our filesystem as having big metadata blocks if
+        * they are bigger than the page size
+        */
+       if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+               if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+                       printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
+               features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+       }
+
        btrfs_set_super_incompat_flags(disk_super, features);
 
        features = btrfs_super_compat_ro_flags(disk_super) &
@@ -3196,10 +3140,9 @@ int close_ctree(struct btrfs_root *root)
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 {
        int ret;
-       struct inode *btree_inode = buf->first_page->mapping->host;
+       struct inode *btree_inode = buf->pages[0]->mapping->host;
 
-       ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
-                                    NULL);
+       ret = extent_buffer_uptodate(buf);
        if (!ret)
                return ret;
 
@@ -3210,16 +3153,13 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 {
-       struct inode *btree_inode = buf->first_page->mapping->host;
-       return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
-                                         buf);
+       return set_extent_buffer_uptodate(buf);
 }
 
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 {
-       struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+       struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
        u64 transid = btrfs_header_generation(buf);
-       struct inode *btree_inode = root->fs_info->btree_inode;
        int was_dirty;
 
        btrfs_assert_tree_locked(buf);
@@ -3231,8 +3171,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
                        (unsigned long long)root->fs_info->generation);
                WARN_ON(1);
        }
-       was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
-                                           buf);
+       was_dirty = set_extent_buffer_dirty(buf);
        if (!was_dirty) {
                spin_lock(&root->fs_info->delalloc_lock);
                root->fs_info->dirty_metadata_bytes += buf->len;
@@ -3286,12 +3225,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 {
-       struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-       int ret;
-       ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
-       if (ret == 0)
-               set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
-       return ret;
+       struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+       return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 }
 
 static int btree_lock_page_hook(struct page *page, void *data,
@@ -3299,17 +3234,21 @@ static int btree_lock_page_hook(struct page *page, void *data,
 {
        struct inode *inode = page->mapping->host;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_buffer *eb;
-       unsigned long len;
-       u64 bytenr = page_offset(page);
 
-       if (page->private == EXTENT_PAGE_PRIVATE)
+       /*
+        * We culled this eb but the page is still hanging out on the mapping,
+        * carry on.
+        */
+       if (!PagePrivate(page))
                goto out;
 
-       len = page->private >> 2;
-       eb = find_extent_buffer(io_tree, bytenr, len);
-       if (!eb)
+       eb = (struct extent_buffer *)page->private;
+       if (!eb) {
+               WARN_ON(1);
+               goto out;
+       }
+       if (page != eb->pages[0])
                goto out;
 
        if (!btrfs_try_tree_write_lock(eb)) {
@@ -3328,7 +3267,6 @@ static int btree_lock_page_hook(struct page *page, void *data,
        }
 
        btrfs_tree_unlock(eb);
-       free_extent_buffer(eb);
 out:
        if (!trylock_page(page)) {
                flush_fn(data);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4b3f1eedced046ea04894b5ac113b3516e5ee9de..8b304e3537c48dffe49131a17d2d42814b1137f4 100644
@@ -5074,10 +5074,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
                        if (ret)
                                goto abort;
-               } else {
-                       invalidate_mapping_pages(info->btree_inode->i_mapping,
-                            bytenr >> PAGE_CACHE_SHIFT,
-                            (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
                }
 
                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
@@ -5321,11 +5317,10 @@ static int get_block_group_index(struct btrfs_block_group_cache *cache)
 }
 
 enum btrfs_loop_type {
-       LOOP_FIND_IDEAL = 0,
-       LOOP_CACHING_NOWAIT = 1,
-       LOOP_CACHING_WAIT = 2,
-       LOOP_ALLOC_CHUNK = 3,
-       LOOP_NO_EMPTY_SIZE = 4,
+       LOOP_CACHING_NOWAIT = 0,
+       LOOP_CACHING_WAIT = 1,
+       LOOP_ALLOC_CHUNK = 2,
+       LOOP_NO_EMPTY_SIZE = 3,
 };
 
 /*
@@ -5339,7 +5334,6 @@ enum btrfs_loop_type {
 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *orig_root,
                                     u64 num_bytes, u64 empty_size,
-                                    u64 search_start, u64 search_end,
                                     u64 hint_byte, struct btrfs_key *ins,
                                     u64 data)
 {
@@ -5348,6 +5342,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        struct btrfs_free_cluster *last_ptr = NULL;
        struct btrfs_block_group_cache *block_group = NULL;
        struct btrfs_block_group_cache *used_block_group;
+       u64 search_start = 0;
        int empty_cluster = 2 * 1024 * 1024;
        int allowed_chunk_alloc = 0;
        int done_chunk_alloc = 0;
@@ -5361,8 +5356,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        bool failed_alloc = false;
        bool use_cluster = true;
        bool have_caching_bg = false;
-       u64 ideal_cache_percent = 0;
-       u64 ideal_cache_offset = 0;
 
        WARN_ON(num_bytes < root->sectorsize);
        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -5412,7 +5405,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
                empty_cluster = 0;
 
        if (search_start == hint_byte) {
-ideal_cache:
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
                used_block_group = block_group;
@@ -5424,8 +5416,7 @@ ideal_cache:
                 * picked out then we don't care that the block group is cached.
                 */
                if (block_group && block_group_bits(block_group, data) &&
-                   (block_group->cached != BTRFS_CACHE_NO ||
-                    search_start == ideal_cache_offset)) {
+                   block_group->cached != BTRFS_CACHE_NO) {
                        down_read(&space_info->groups_sem);
                        if (list_empty(&block_group->list) ||
                            block_group->ro) {
@@ -5479,45 +5470,13 @@ search:
 have_block_group:
                cached = block_group_cache_done(block_group);
                if (unlikely(!cached)) {
-                       u64 free_percent;
-
                        found_uncached_bg = true;
                        ret = cache_block_group(block_group, trans,
-                                               orig_root, 1);
-                       BUG_ON(ret < 0); /* -ENOMEM */
-                       if (block_group->cached == BTRFS_CACHE_FINISHED)
-                               goto alloc;
-
-                       free_percent = btrfs_block_group_used(&block_group->item);
-                       free_percent *= 100;
-                       free_percent = div64_u64(free_percent,
-                                                block_group->key.offset);
-                       free_percent = 100 - free_percent;
-                       if (free_percent > ideal_cache_percent &&
-                           likely(!block_group->ro)) {
-                               ideal_cache_offset = block_group->key.objectid;
-                               ideal_cache_percent = free_percent;
-                       }
-
-                       /*
-                        * The caching workers are limited to 2 threads, so we
-                        * can queue as much work as we care to.
-                        */
-                       if (loop > LOOP_FIND_IDEAL) {
-                               ret = cache_block_group(block_group, trans,
-                                                       orig_root, 0);
-                               BUG_ON(ret); /* -ENOMEM */
-                       }
-
-                       /*
-                        * If loop is set for cached only, try the next block
-                        * group.
-                        */
-                       if (loop == LOOP_FIND_IDEAL)
-                               goto loop;
+                                               orig_root, 0);
+                       BUG_ON(ret < 0);
+                       ret = 0;
                }
 
-alloc:
                if (unlikely(block_group->ro))
                        goto loop;
 
@@ -5668,11 +5627,6 @@ unclustered_alloc:
                }
 checks:
                search_start = stripe_align(root, offset);
-               /* move on to the next group */
-               if (search_start + num_bytes >= search_end) {
-                       btrfs_add_free_space(used_block_group, offset, num_bytes);
-                       goto loop;
-               }
 
                /* move on to the next group */
                if (search_start + num_bytes >
@@ -5723,9 +5677,7 @@ loop:
        if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
                goto search;
 
-       /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
-        *                      for them to make caching progress.  Also
-        *                      determine the best possible bg to cache
+       /*
         * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
         *                      caching kthreads as we move along
         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
@@ -5735,45 +5687,7 @@ loop:
         */
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
                index = 0;
-               if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
-                       found_uncached_bg = false;
-                       loop++;
-                       if (!ideal_cache_percent)
-                               goto search;
-
-                       /*
-                        * 1 of the following 2 things have happened so far
-                        *
-                        * 1) We found an ideal block group for caching that
-                        * is mostly full and will cache quickly, so we might
-                        * as well wait for it.
-                        *
-                        * 2) We searched for cached only and we didn't find
-                        * anything, and we didn't start any caching kthreads
-                        * either, so chances are we will loop through and
-                        * start a couple caching kthreads, and then come back
-                        * around and just wait for them.  This will be slower
-                        * because we will have 2 caching kthreads reading at
-                        * the same time when we could have just started one
-                        * and waited for it to get far enough to give us an
-                        * allocation, so go ahead and go to the wait caching
-                        * loop.
-                        */
-                       loop = LOOP_CACHING_WAIT;
-                       search_start = ideal_cache_offset;
-                       ideal_cache_percent = 0;
-                       goto ideal_cache;
-               } else if (loop == LOOP_FIND_IDEAL) {
-                       /*
-                        * Didn't find a uncached bg, wait on anything we find
-                        * next.
-                        */
-                       loop = LOOP_CACHING_WAIT;
-                       goto search;
-               }
-
                loop++;
-
                if (loop == LOOP_ALLOC_CHUNK) {
                       if (allowed_chunk_alloc) {
                                ret = do_chunk_alloc(trans, root, num_bytes +
@@ -5866,12 +5780,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 num_bytes, u64 min_alloc_size,
                         u64 empty_size, u64 hint_byte,
-                        u64 search_end, struct btrfs_key *ins,
-                        u64 data)
+                        struct btrfs_key *ins, u64 data)
 {
        bool final_tried = false;
        int ret;
-       u64 search_start = 0;
 
        data = btrfs_get_alloc_profile(root, data);
 again:
@@ -5891,8 +5803,7 @@ again:
 
        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(trans, root, num_bytes, empty_size,
-                              search_start, search_end, hint_byte,
-                              ins, data);
+                              hint_byte, ins, data);
 
        if (ret == -ENOSPC) {
                if (!final_tried) {
@@ -6191,6 +6102,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
        btrfs_tree_lock(buf);
        clean_tree_block(trans, root, buf);
+       clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
        btrfs_set_lock_blocking(buf);
        btrfs_set_buffer_uptodate(buf);
@@ -6298,7 +6210,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                return ERR_CAST(block_rsv);
 
        ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
-                                  empty_size, hint, (u64)-1, &ins, 0);
+                                  empty_size, hint, &ins, 0);
        if (ret) {
                unuse_block_rsv(root->fs_info, block_rsv, blocksize);
                return ERR_PTR(ret);
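The hunks above finish an API cleanup: the search_end value threaded into find_free_extent() was effectively unused, so it is dropped from btrfs_reserve_extent() and from every caller (the inode.c hunks later in this commit do the same on the calling side). A before/after sketch of a call site, taken directly from the shape of this diff:

        /* before: a search_end of (u64)-1 was passed through but ignored */
        ret = btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
                                   empty_size, hint_byte, (u64)-1, &ins, data);

        /* after: the dead parameter is gone, nothing else changes */
        ret = btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
                                   empty_size, hint_byte, &ins, data);
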
index 4c3ce7a0a7a4bb7a494cc498289b0bae025df76b..0c3ec003f273a1b9c1ec55008f5f86d795a9de86 100644 (file)
@@ -19,6 +19,7 @@
 #include "btrfs_inode.h"
 #include "volumes.h"
 #include "check-integrity.h"
+#include "locking.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -53,6 +54,7 @@ struct extent_page_data {
        unsigned int sync_io:1;
 };
 
+static noinline void flush_write_bio(void *data);
 static inline struct btrfs_fs_info *
 tree_fs_info(struct extent_io_tree *tree)
 {
@@ -1929,6 +1931,26 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
        return 0;
 }
 
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+                        int mirror_num)
+{
+       struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+       u64 start = eb->start;
+       unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
+       int ret;
+
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = extent_buffer_page(eb, i);
+               ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+                                       start, p, mirror_num);
+               if (ret)
+                       break;
+               start += PAGE_CACHE_SIZE;
+       }
+
+       return ret;
+}
+
 /*
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
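repair_eb_io_failure() added above is a thin page-by-page wrapper around repair_io_failure(): it walks every page backing a metadata extent buffer and rewrites each one onto the mirror that failed, using the data the caller already verified. A hedged sketch of the intended use on the btree read-retry path (the loop and the read_from_mirror() helper here are illustrative, not the actual disk-io.c code):

        /*
         * Illustrative only: once a re-read of 'eb' from some other
         * mirror succeeds, scribble the good data over the bad copy.
         */
        for (mirror = 1; mirror <= num_copies; mirror++) {
                if (mirror == eb->failed_mirror)
                        continue;
                if (read_from_mirror(root, eb, mirror) == 0) {
                        repair_eb_io_failure(root, eb, eb->failed_mirror);
                        break;
                }
        }
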
@@ -2275,6 +2297,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
        u64 start;
        u64 end;
        int whole_page;
+       int failed_mirror;
        int ret;
 
        if (err)
@@ -2321,9 +2344,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                        else
                                clean_io_failure(start, page);
                }
-               if (!uptodate) {
-                       int failed_mirror;
+
+               if (!uptodate)
                        failed_mirror = (int)(unsigned long)bio->bi_bdev;
+
+               if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
+                       ret = tree->ops->readpage_io_failed_hook(page, failed_mirror);
+                       if (!ret && !err &&
+                           test_bit(BIO_UPTODATE, &bio->bi_flags))
+                               uptodate = 1;
+               } else if (!uptodate) {
                        /*
                         * The generic bio_readpage_error handles errors the
                         * following way: If possible, new read requests are
@@ -2337,7 +2367,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                        ret = bio_readpage_error(bio, page, start, end,
                                                        failed_mirror, NULL);
                        if (ret == 0) {
-error_handled:
                                uptodate =
                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
                                if (err)
@@ -2345,16 +2374,9 @@ error_handled:
                                uncache_state(&cached);
                                continue;
                        }
-                       if (tree->ops && tree->ops->readpage_io_failed_hook) {
-                               ret = tree->ops->readpage_io_failed_hook(
-                                                       bio, page, start, end,
-                                                       failed_mirror, state);
-                               if (ret == 0)
-                                       goto error_handled;
-                       }
                }
 
-               if (uptodate) {
+               if (uptodate && tree->track_uptodate) {
                        set_extent_uptodate(tree, start, end, &cached,
                                            GFP_ATOMIC);
                }
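This hunk reorders the failure handling in end_bio_extent_readpage(): the tree's readpage_io_failed_hook now runs first (metadata retries live there), and only pages without a hook fall through to the generic bio_readpage_error() machinery. Per the header change later in this commit, the hook shrinks to two arguments; a minimal conforming hook might look like the sketch below (illustrative, not the actual btree hook in disk-io.c):

        static int example_io_failed_hook(struct page *page, int failed_mirror)
        {
                struct extent_buffer *eb;

                /* metadata pages now carry their eb in page->private */
                if (!PagePrivate(page))
                        return -EIO;
                eb = (struct extent_buffer *)page->private;

                /* remember the bad mirror so the retry can avoid it */
                eb->failed_mirror = failed_mirror;
                return -EIO;    /* non-zero: page is still not uptodate */
        }
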
@@ -2507,19 +2529,24 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
        return ret;
 }
 
-void set_page_extent_mapped(struct page *page)
+void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
 {
        if (!PagePrivate(page)) {
                SetPagePrivate(page);
                page_cache_get(page);
-               set_page_private(page, EXTENT_PAGE_PRIVATE);
+               set_page_private(page, (unsigned long)eb);
+       } else {
+               WARN_ON(page->private != (unsigned long)eb);
        }
 }
 
-static void set_page_extent_head(struct page *page, unsigned long len)
+void set_page_extent_mapped(struct page *page)
 {
-       WARN_ON(!PagePrivate(page));
-       set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
+       if (!PagePrivate(page)) {
+               SetPagePrivate(page);
+               page_cache_get(page);
+               set_page_private(page, EXTENT_PAGE_PRIVATE);
+       }
 }
 
 /*
@@ -3008,6 +3035,275 @@ done_unlocked:
        return 0;
 }
 
+static int eb_wait(void *word)
+{
+       io_schedule();
+       return 0;
+}
+
+static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+       wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
+                   TASK_UNINTERRUPTIBLE);
+}
+
+static int lock_extent_buffer_for_io(struct extent_buffer *eb,
+                                    struct btrfs_fs_info *fs_info,
+                                    struct extent_page_data *epd)
+{
+       unsigned long i, num_pages;
+       int flush = 0;
+       int ret = 0;
+
+       if (!btrfs_try_tree_write_lock(eb)) {
+               flush = 1;
+               flush_write_bio(epd);
+               btrfs_tree_lock(eb);
+       }
+
+       if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+               btrfs_tree_unlock(eb);
+               if (!epd->sync_io)
+                       return 0;
+               if (!flush) {
+                       flush_write_bio(epd);
+                       flush = 1;
+               }
+               while (1) {
+                       wait_on_extent_buffer_writeback(eb);
+                       btrfs_tree_lock(eb);
+                       if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
+                               break;
+                       btrfs_tree_unlock(eb);
+               }
+       }
+
+       if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+               set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+               btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+               spin_lock(&fs_info->delalloc_lock);
+               if (fs_info->dirty_metadata_bytes >= eb->len)
+                       fs_info->dirty_metadata_bytes -= eb->len;
+               else
+                       WARN_ON(1);
+               spin_unlock(&fs_info->delalloc_lock);
+               ret = 1;
+       }
+
+       btrfs_tree_unlock(eb);
+
+       if (!ret)
+               return ret;
+
+       num_pages = num_extent_pages(eb->start, eb->len);
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = extent_buffer_page(eb, i);
+
+               if (!trylock_page(p)) {
+                       if (!flush) {
+                               flush_write_bio(epd);
+                               flush = 1;
+                       }
+                       lock_page(p);
+               }
+       }
+
+       return ret;
+}
+
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+       clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+       smp_mb__after_clear_bit();
+       wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+{
+       int uptodate = err == 0;
+       struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+       struct extent_buffer *eb;
+       int done;
+
+       do {
+               struct page *page = bvec->bv_page;
+
+               bvec--;
+               eb = (struct extent_buffer *)page->private;
+               BUG_ON(!eb);
+               done = atomic_dec_and_test(&eb->io_pages);
+
+               if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+                       set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+                       ClearPageUptodate(page);
+                       SetPageError(page);
+               }
+
+               end_page_writeback(page);
+
+               if (!done)
+                       continue;
+
+               end_extent_buffer_writeback(eb);
+       } while (bvec >= bio->bi_io_vec);
+
+       bio_put(bio);
+}
+
+static int write_one_eb(struct extent_buffer *eb,
+                       struct btrfs_fs_info *fs_info,
+                       struct writeback_control *wbc,
+                       struct extent_page_data *epd)
+{
+       struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+       u64 offset = eb->start;
+       unsigned long i, num_pages;
+       int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
+       int ret;
+
+       clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+       num_pages = num_extent_pages(eb->start, eb->len);
+       atomic_set(&eb->io_pages, num_pages);
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = extent_buffer_page(eb, i);
+
+               clear_page_dirty_for_io(p);
+               set_page_writeback(p);
+               ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
+                                        PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+                                        -1, end_bio_extent_buffer_writepage,
+                                        0, 0, 0);
+               if (ret) {
+                       set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+                       SetPageError(p);
+                       if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
+                               end_extent_buffer_writeback(eb);
+                       ret = -EIO;
+                       break;
+               }
+               offset += PAGE_CACHE_SIZE;
+               update_nr_written(p, wbc, 1);
+               unlock_page(p);
+       }
+
+       if (unlikely(ret)) {
+               for (; i < num_pages; i++) {
+                       struct page *p = extent_buffer_page(eb, i);
+                       unlock_page(p);
+               }
+       }
+
+       return ret;
+}
+
+int btree_write_cache_pages(struct address_space *mapping,
+                                  struct writeback_control *wbc)
+{
+       struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
+       struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+       struct extent_buffer *eb, *prev_eb = NULL;
+       struct extent_page_data epd = {
+               .bio = NULL,
+               .tree = tree,
+               .extent_locked = 0,
+               .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+       };
+       int ret = 0;
+       int done = 0;
+       int nr_to_write_done = 0;
+       struct pagevec pvec;
+       int nr_pages;
+       pgoff_t index;
+       pgoff_t end;            /* Inclusive */
+       int scanned = 0;
+       int tag;
+
+       pagevec_init(&pvec, 0);
+       if (wbc->range_cyclic) {
+               index = mapping->writeback_index; /* Start from prev offset */
+               end = -1;
+       } else {
+               index = wbc->range_start >> PAGE_CACHE_SHIFT;
+               end = wbc->range_end >> PAGE_CACHE_SHIFT;
+               scanned = 1;
+       }
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag = PAGECACHE_TAG_TOWRITE;
+       else
+               tag = PAGECACHE_TAG_DIRTY;
+retry:
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag_pages_for_writeback(mapping, index, end);
+       while (!done && !nr_to_write_done && (index <= end) &&
+              (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+               unsigned i;
+
+               scanned = 1;
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+
+                       if (!PagePrivate(page))
+                               continue;
+
+                       if (!wbc->range_cyclic && page->index > end) {
+                               done = 1;
+                               break;
+                       }
+
+                       eb = (struct extent_buffer *)page->private;
+                       if (!eb) {
+                               WARN_ON(1);
+                               continue;
+                       }
+
+                       if (eb == prev_eb)
+                               continue;
+
+                       if (!atomic_inc_not_zero(&eb->refs)) {
+                               WARN_ON(1);
+                               continue;
+                       }
+
+                       prev_eb = eb;
+                       ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+                       if (!ret) {
+                               free_extent_buffer(eb);
+                               continue;
+                       }
+
+                       ret = write_one_eb(eb, fs_info, wbc, &epd);
+                       if (ret) {
+                               done = 1;
+                               free_extent_buffer(eb);
+                               break;
+                       }
+                       free_extent_buffer(eb);
+
+                       /*
+                        * the filesystem may choose to bump up nr_to_write.
+                        * We have to make sure to honor the new nr_to_write
+                        * at any time
+                        */
+                       nr_to_write_done = wbc->nr_to_write <= 0;
+               }
+               pagevec_release(&pvec);
+               cond_resched();
+       }
+       if (!scanned && !done) {
+               /*
+                * We hit the last page and there is more work to be done: wrap
+                * back to the start of the file
+                */
+               scanned = 1;
+               index = 0;
+               goto retry;
+       }
+       flush_write_bio(&epd);
+       return ret;
+}
+
 /**
  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
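btree_write_cache_pages() above deliberately mirrors the tagged-pagevec walk of write_cache_pages() documented below, but locks and submits whole extent buffers (lock_extent_buffer_for_io() plus write_one_eb()) instead of individual pages. It is meant to back the btree inode's ->writepages; roughly as follows (a simplified stand-in for the disk-io.c wiring, not a verbatim copy):

        static int btree_writepages(struct address_space *mapping,
                                    struct writeback_control *wbc)
        {
                /*
                 * A real implementation may skip background writeback
                 * while dirty_metadata_bytes is small; omitted here.
                 */
                return btree_write_cache_pages(mapping, wbc);
        }
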
@@ -3592,26 +3888,7 @@ out:
 inline struct page *extent_buffer_page(struct extent_buffer *eb,
                                              unsigned long i)
 {
-       struct page *p;
-       struct address_space *mapping;
-
-       if (i == 0)
-               return eb->first_page;
-       i += eb->start >> PAGE_CACHE_SHIFT;
-       mapping = eb->first_page->mapping;
-       if (!mapping)
-               return NULL;
-
-       /*
-        * extent_buffer_page is only called after pinning the page
-        * by increasing the reference count.  So we know the page must
-        * be in the radix tree.
-        */
-       rcu_read_lock();
-       p = radix_tree_lookup(&mapping->page_tree, i);
-       rcu_read_unlock();
-
-       return p;
+       return eb->pages[i];
 }
 
 inline unsigned long num_extent_pages(u64 start, u64 len)
@@ -3620,6 +3897,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len)
                (start >> PAGE_CACHE_SHIFT);
 }
 
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+#if LEAK_DEBUG
+       unsigned long flags;
+       spin_lock_irqsave(&leak_lock, flags);
+       list_del(&eb->leak_list);
+       spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+       if (eb->pages && eb->pages != eb->inline_pages)
+               kfree(eb->pages);
+       kmem_cache_free(extent_buffer_cache, eb);
+}
+
 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
                                                   u64 start,
                                                   unsigned long len,
@@ -3635,6 +3925,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
                return NULL;
        eb->start = start;
        eb->len = len;
+       eb->tree = tree;
        rwlock_init(&eb->lock);
        atomic_set(&eb->write_locks, 0);
        atomic_set(&eb->read_locks, 0);
@@ -3651,20 +3942,32 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
        list_add(&eb->leak_list, &buffers);
        spin_unlock_irqrestore(&leak_lock, flags);
 #endif
+       spin_lock_init(&eb->refs_lock);
        atomic_set(&eb->refs, 1);
+       atomic_set(&eb->io_pages, 0);
+
+       if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
+               struct page **pages;
+               int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
+                       PAGE_CACHE_SHIFT;
+               pages = kzalloc(num_pages * sizeof(struct page *), mask);
+               if (!pages) {
+                       __free_extent_buffer(eb);
+                       return NULL;
+               }
+               eb->pages = pages;
+       } else {
+               eb->pages = eb->inline_pages;
+       }
 
        return eb;
 }
 
-static void __free_extent_buffer(struct extent_buffer *eb)
+static int extent_buffer_under_io(struct extent_buffer *eb)
 {
-#if LEAK_DEBUG
-       unsigned long flags;
-       spin_lock_irqsave(&leak_lock, flags);
-       list_del(&eb->leak_list);
-       spin_unlock_irqrestore(&leak_lock, flags);
-#endif
-       kmem_cache_free(extent_buffer_cache, eb);
+       return (atomic_read(&eb->io_pages) ||
+               test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+               test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
 }
 
 /*
@@ -3676,8 +3979,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
        unsigned long index;
        struct page *page;
 
-       if (!eb->first_page)
-               return;
+       BUG_ON(extent_buffer_under_io(eb));
 
        index = num_extent_pages(eb->start, eb->len);
        if (start_idx >= index)
@@ -3686,8 +3988,34 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
        do {
                index--;
                page = extent_buffer_page(eb, index);
-               if (page)
+               if (page) {
+                       spin_lock(&page->mapping->private_lock);
+                       /*
+                        * We do this since we'll remove the pages after we've
+                        * removed the eb from the radix tree, so we could race
+                        * and have this page now attached to the new eb.  So
+                        * only clear page_private if it's still connected to
+                        * this eb.
+                        */
+                       if (PagePrivate(page) &&
+                           page->private == (unsigned long)eb) {
+                               BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+                               BUG_ON(PageDirty(page));
+                               BUG_ON(PageWriteback(page));
+                               /*
+                                * We need to make sure we haven't been
+                                * attached to a new eb.
+                                */
+                               ClearPagePrivate(page);
+                               set_page_private(page, 0);
+                               /* One for the page private */
+                               page_cache_release(page);
+                       }
+                       spin_unlock(&page->mapping->private_lock);
+
+                       /* One for when we alloced the page */
                        page_cache_release(page);
+               }
        } while (index != start_idx);
 }
 
@@ -3700,9 +4028,50 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
        __free_extent_buffer(eb);
 }
 
+static void check_buffer_tree_ref(struct extent_buffer *eb)
+{
+       /* The ref bit is tricky.  We have to make sure it is set
+        * if the buffer is dirty.  Otherwise the code that frees
+        * a buffer can end up dropping a dirty page.
+        *
+        * Once the ref bit is set, it won't go away while the
+        * buffer is dirty or in writeback, and it also won't
+        * go away while we have the reference count on the
+        * eb bumped.
+        *
+        * We can't just set the ref bit without bumping the
+        * ref on the eb because free_extent_buffer might
+        * see the ref bit and try to clear it.  If this happens
+        * free_extent_buffer might end up dropping our original
+        * ref by mistake and freeing the page before we are able
+        * to add one more ref.
+        *
+        * So bump the ref count first, then set the bit.  If someone
+        * beat us to it, drop the ref we added.
+        */
+       if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+               atomic_inc(&eb->refs);
+               if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+                       atomic_dec(&eb->refs);
+       }
+}
+
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+{
+       unsigned long num_pages, i;
+
+       check_buffer_tree_ref(eb);
+
+       num_pages = num_extent_pages(eb->start, eb->len);
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = extent_buffer_page(eb, i);
+               mark_page_accessed(p);
+       }
+}
+
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
-                                         u64 start, unsigned long len,
-                                         struct page *page0)
+                                         u64 start, unsigned long len)
 {
        unsigned long num_pages = num_extent_pages(start, len);
        unsigned long i;
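The comment in check_buffer_tree_ref() above describes a general lock-free idiom: take a reference before publishing the flag, and give the reference back if another thread won the publish race, so the object can never be freed in the window between the two steps. A user-space analogue in C11 atomics, purely to make the ordering concrete (names here are illustrative):

        #include <stdatomic.h>

        struct obj {
                atomic_int refs;
                atomic_int tree_ref; /* stands in for EXTENT_BUFFER_TREE_REF */
        };

        static void take_tree_ref(struct obj *o)
        {
                if (!atomic_load(&o->tree_ref)) {
                        /* bump first so a concurrent put can't free us... */
                        atomic_fetch_add(&o->refs, 1);
                        /* ...then publish; the loser drops its extra ref */
                        if (atomic_exchange(&o->tree_ref, 1))
                                atomic_fetch_sub(&o->refs, 1);
                }
        }
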
@@ -3718,7 +4087,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
        if (eb && atomic_inc_not_zero(&eb->refs)) {
                rcu_read_unlock();
-               mark_page_accessed(eb->first_page);
+               mark_extent_buffer_accessed(eb);
                return eb;
        }
        rcu_read_unlock();
@@ -3727,32 +4096,43 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        if (!eb)
                return NULL;
 
-       if (page0) {
-               eb->first_page = page0;
-               i = 1;
-               index++;
-               page_cache_get(page0);
-               mark_page_accessed(page0);
-               set_page_extent_mapped(page0);
-               set_page_extent_head(page0, len);
-               uptodate = PageUptodate(page0);
-       } else {
-               i = 0;
-       }
-       for (; i < num_pages; i++, index++) {
+       for (i = 0; i < num_pages; i++, index++) {
                p = find_or_create_page(mapping, index, GFP_NOFS);
                if (!p) {
                        WARN_ON(1);
                        goto free_eb;
                }
-               set_page_extent_mapped(p);
-               mark_page_accessed(p);
-               if (i == 0) {
-                       eb->first_page = p;
-                       set_page_extent_head(p, len);
-               } else {
-                       set_page_private(p, EXTENT_PAGE_PRIVATE);
+
+               spin_lock(&mapping->private_lock);
+               if (PagePrivate(p)) {
+                       /*
+                        * We could have already allocated an eb for this page
+                        * and attached one, so let's see if we can get a ref
+                        * on the existing eb.  If we can, we know it's good
+                        * and we can just return that one; otherwise we can
+                        * safely overwrite page->private.
+                        */
+                       exists = (struct extent_buffer *)p->private;
+                       if (atomic_inc_not_zero(&exists->refs)) {
+                               spin_unlock(&mapping->private_lock);
+                               unlock_page(p);
+                               mark_extent_buffer_accessed(exists);
+                               goto free_eb;
+                       }
+
+                       /*
+                        * Do this so attach doesn't complain; we also need to
+                        * drop the ref the old eb held on this page.
+                        */
+                       ClearPagePrivate(p);
+                       WARN_ON(PageDirty(p));
+                       page_cache_release(p);
                }
+               attach_extent_buffer_page(eb, p);
+               spin_unlock(&mapping->private_lock);
+               WARN_ON(PageDirty(p));
+               mark_page_accessed(p);
+               eb->pages[i] = p;
                if (!PageUptodate(p))
                        uptodate = 0;
 
@@ -3760,12 +4140,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                 * see below about how we avoid a nasty race with release page
                 * and why we unlock later
                 */
-               if (i != 0)
-                       unlock_page(p);
        }
        if (uptodate)
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
+again:
        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
        if (ret)
                goto free_eb;
@@ -3775,14 +4153,21 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        if (ret == -EEXIST) {
                exists = radix_tree_lookup(&tree->buffer,
                                                start >> PAGE_CACHE_SHIFT);
-               /* add one reference for the caller */
-               atomic_inc(&exists->refs);
+               if (!atomic_inc_not_zero(&exists->refs)) {
+                       spin_unlock(&tree->buffer_lock);
+                       radix_tree_preload_end();
+                       exists = NULL;
+                       goto again;
+               }
                spin_unlock(&tree->buffer_lock);
                radix_tree_preload_end();
+               mark_extent_buffer_accessed(exists);
                goto free_eb;
        }
        /* add one reference for the tree */
-       atomic_inc(&eb->refs);
+       spin_lock(&eb->refs_lock);
+       check_buffer_tree_ref(eb);
+       spin_unlock(&eb->refs_lock);
        spin_unlock(&tree->buffer_lock);
        radix_tree_preload_end();
 
@@ -3795,15 +4180,20 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
         * after the extent buffer is in the radix tree so
         * it doesn't get lost
         */
-       set_page_extent_mapped(eb->first_page);
-       set_page_extent_head(eb->first_page, eb->len);
-       if (!page0)
-               unlock_page(eb->first_page);
+       SetPageChecked(eb->pages[0]);
+       for (i = 1; i < num_pages; i++) {
+               p = extent_buffer_page(eb, i);
+               ClearPageChecked(p);
+               unlock_page(p);
+       }
+       unlock_page(eb->pages[0]);
        return eb;
 
 free_eb:
-       if (eb->first_page && !page0)
-               unlock_page(eb->first_page);
+       for (i = 0; i < num_pages; i++) {
+               if (eb->pages[i])
+                       unlock_page(eb->pages[i]);
+       }
 
        if (!atomic_dec_and_test(&eb->refs))
                return exists;
@@ -3820,7 +4210,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
        if (eb && atomic_inc_not_zero(&eb->refs)) {
                rcu_read_unlock();
-               mark_page_accessed(eb->first_page);
+               mark_extent_buffer_accessed(eb);
                return eb;
        }
        rcu_read_unlock();
@@ -3828,19 +4218,71 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
        return NULL;
 }
 
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+       struct extent_buffer *eb =
+                       container_of(head, struct extent_buffer, rcu_head);
+
+       __free_extent_buffer(eb);
+}
+
+/* Expects to have eb->refs_lock already held */
+static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
+{
+       WARN_ON(atomic_read(&eb->refs) == 0);
+       if (atomic_dec_and_test(&eb->refs)) {
+               struct extent_io_tree *tree = eb->tree;
+
+               spin_unlock(&eb->refs_lock);
+
+               spin_lock(&tree->buffer_lock);
+               radix_tree_delete(&tree->buffer,
+                                 eb->start >> PAGE_CACHE_SHIFT);
+               spin_unlock(&tree->buffer_lock);
+
+               /* Should be safe to release our pages at this point */
+               btrfs_release_extent_buffer_page(eb, 0);
+
+               call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+               return;
+       }
+       spin_unlock(&eb->refs_lock);
+}
+
 void free_extent_buffer(struct extent_buffer *eb)
 {
        if (!eb)
                return;
 
-       if (!atomic_dec_and_test(&eb->refs))
+       spin_lock(&eb->refs_lock);
+       if (atomic_read(&eb->refs) == 2 &&
+           test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+           !extent_buffer_under_io(eb) &&
+           test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+               atomic_dec(&eb->refs);
+
+       /*
+        * I know this is terrible, but it's temporary until we stop tracking
+        * the uptodate bits and such for the extent buffers.
+        */
+       release_extent_buffer(eb, GFP_ATOMIC);
+}
+
+void free_extent_buffer_stale(struct extent_buffer *eb)
+{
+       if (!eb)
                return;
 
-       WARN_ON(1);
+       spin_lock(&eb->refs_lock);
+       set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+
+       if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+           test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+               atomic_dec(&eb->refs);
+       release_extent_buffer(eb, GFP_NOFS);
 }
 
-void clear_extent_buffer_dirty(struct extent_io_tree *tree,
-                             struct extent_buffer *eb)
+void clear_extent_buffer_dirty(struct extent_buffer *eb)
 {
        unsigned long i;
        unsigned long num_pages;
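free_extent_buffer_stale() is the new way to drop a buffer that will never be read again (a COW'ed or deleted tree block): it sets EXTENT_BUFFER_STALE and, when the caller's ref plus the tree's ref are the only two left and no I/O is in flight, steals the TREE_REF so the buffer can actually die. The ctree.c hunks at the top of this commit switch the free paths over; the calling pattern, in brief:

        /* old copy replaced by COW (see __btrfs_cow_block above): */
        free_extent_buffer_stale(buf);  /* buf will never be read again */

        /* a buffer we only inspected keeps the ordinary put: */
        free_extent_buffer(eb);
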
@@ -3856,10 +4298,6 @@ void clear_extent_buffer_dirty(struct extent_io_tree *tree,
                lock_page(page);
                WARN_ON(!PagePrivate(page));
 
-               set_page_extent_mapped(page);
-               if (i == 0)
-                       set_page_extent_head(page, eb->len);
-
                clear_page_dirty_for_io(page);
                spin_lock_irq(&page->mapping->tree_lock);
                if (!PageDirty(page)) {
@@ -3871,23 +4309,29 @@ void clear_extent_buffer_dirty(struct extent_io_tree *tree,
                ClearPageError(page);
                unlock_page(page);
        }
+       WARN_ON(atomic_read(&eb->refs) == 0);
 }
 
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
-                            struct extent_buffer *eb)
+int set_extent_buffer_dirty(struct extent_buffer *eb)
 {
        unsigned long i;
        unsigned long num_pages;
        int was_dirty = 0;
 
+       check_buffer_tree_ref(eb);
+
        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+
        num_pages = num_extent_pages(eb->start, eb->len);
+       WARN_ON(atomic_read(&eb->refs) == 0);
+       WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+
        for (i = 0; i < num_pages; i++)
-               __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+               set_page_dirty(extent_buffer_page(eb, i));
        return was_dirty;
 }
 
-static int __eb_straddles_pages(u64 start, u64 len)
+static int range_straddles_pages(u64 start, u64 len)
 {
        if (len < PAGE_CACHE_SIZE)
                return 1;
@@ -3898,25 +4342,14 @@ static int __eb_straddles_pages(u64 start, u64 len)
        return 0;
 }
 
-static int eb_straddles_pages(struct extent_buffer *eb)
-{
-       return __eb_straddles_pages(eb->start, eb->len);
-}
-
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-                               struct extent_buffer *eb,
-                               struct extent_state **cached_state)
+int clear_extent_buffer_uptodate(struct extent_buffer *eb)
 {
        unsigned long i;
        struct page *page;
        unsigned long num_pages;
 
-       num_pages = num_extent_pages(eb->start, eb->len);
        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
-       clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-                             cached_state, GFP_NOFS);
-
+       num_pages = num_extent_pages(eb->start, eb->len);
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                if (page)
@@ -3925,27 +4358,16 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
        return 0;
 }
 
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
-                               struct extent_buffer *eb)
+int set_extent_buffer_uptodate(struct extent_buffer *eb)
 {
        unsigned long i;
        struct page *page;
        unsigned long num_pages;
 
+       set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
        num_pages = num_extent_pages(eb->start, eb->len);
-
-       if (eb_straddles_pages(eb)) {
-               set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-                                   NULL, GFP_NOFS);
-       }
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-               if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-                   ((i == num_pages - 1) &&
-                    ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-                       check_page_uptodate(tree, page);
-                       continue;
-               }
                SetPageUptodate(page);
        }
        return 0;
@@ -3960,7 +4382,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
        int uptodate;
        unsigned long index;
 
-       if (__eb_straddles_pages(start, end - start + 1)) {
+       if (range_straddles_pages(start, end - start + 1)) {
                ret = test_range_bit(tree, start, end,
                                     EXTENT_UPTODATE, 1, NULL);
                if (ret)
@@ -3982,35 +4404,9 @@ int extent_range_uptodate(struct extent_io_tree *tree,
        return pg_uptodate;
 }
 
-int extent_buffer_uptodate(struct extent_io_tree *tree,
-                          struct extent_buffer *eb,
-                          struct extent_state *cached_state)
+int extent_buffer_uptodate(struct extent_buffer *eb)
 {
-       int ret = 0;
-       unsigned long num_pages;
-       unsigned long i;
-       struct page *page;
-       int pg_uptodate = 1;
-
-       if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
-               return 1;
-
-       if (eb_straddles_pages(eb)) {
-               ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-                                  EXTENT_UPTODATE, 1, cached_state);
-               if (ret)
-                       return ret;
-       }
-
-       num_pages = num_extent_pages(eb->start, eb->len);
-       for (i = 0; i < num_pages; i++) {
-               page = extent_buffer_page(eb, i);
-               if (!PageUptodate(page)) {
-                       pg_uptodate = 0;
-                       break;
-               }
-       }
-       return pg_uptodate;
+       return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 }
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
@@ -4024,21 +4420,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
        int ret = 0;
        int locked_pages = 0;
        int all_uptodate = 1;
-       int inc_all_pages = 0;
        unsigned long num_pages;
+       unsigned long num_reads = 0;
        struct bio *bio = NULL;
        unsigned long bio_flags = 0;
 
        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                return 0;
 
-       if (eb_straddles_pages(eb)) {
-               if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-                                  EXTENT_UPTODATE, 1, NULL)) {
-                       return 0;
-               }
-       }
-
        if (start) {
                WARN_ON(start < eb->start);
                start_i = (start >> PAGE_CACHE_SHIFT) -
@@ -4057,8 +4446,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                        lock_page(page);
                }
                locked_pages++;
-               if (!PageUptodate(page))
+               if (!PageUptodate(page)) {
+                       num_reads++;
                        all_uptodate = 0;
+               }
        }
        if (all_uptodate) {
                if (start_i == 0)
@@ -4066,20 +4457,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                goto unlock_exit;
        }
 
+       clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+       eb->failed_mirror = 0;
+       atomic_set(&eb->io_pages, num_reads);
        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-
-               WARN_ON(!PagePrivate(page));
-
-               set_page_extent_mapped(page);
-               if (i == 0)
-                       set_page_extent_head(page, eb->len);
-
-               if (inc_all_pages)
-                       page_cache_get(page);
                if (!PageUptodate(page)) {
-                       if (start_i == 0)
-                               inc_all_pages = 1;
                        ClearPageError(page);
                        err = __extent_read_full_page(tree, page,
                                                      get_extent, &bio,
@@ -4107,8 +4490,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                        ret = -EIO;
        }
 
-       if (!ret)
-               set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
        return ret;
 
 unlock_exit:
@@ -4350,15 +4731,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
 {
        char *dst_kaddr = page_address(dst_page);
        char *src_kaddr;
+       int must_memmove = 0;
 
        if (dst_page != src_page) {
                src_kaddr = page_address(src_page);
        } else {
                src_kaddr = dst_kaddr;
-               BUG_ON(areas_overlap(src_off, dst_off, len));
+               if (areas_overlap(src_off, dst_off, len))
+                       must_memmove = 1;
        }
 
-       memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+       if (must_memmove)
+               memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+       else
+               memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
 }
 
 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
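copy_pages() now degrades to memmove() only when source and destination land on the same page and actually overlap, keeping the cheaper memcpy() for the common disjoint case (and memmove_extent_buffer() below relies on copy direction instead of the old areas_overlap() BUG_ON). A small user-space demonstration of why the overlap test matters, illustrative only:

        #include <stdio.h>
        #include <string.h>

        int main(void)
        {
                char buf[16] = "abcdefgh";

                /* overlapping shift by two bytes: memmove copes */
                memmove(buf + 2, buf, 6);
                printf("%s\n", buf);    /* prints "ababcdef" */

                /*
                 * memcpy(buf + 2, buf, 6) is undefined behavior here,
                 * which is exactly what the removed BUG_ON guarded.
                 */
                return 0;
        }
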
@@ -4428,7 +4814,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
                       "len %lu len %lu\n", dst_offset, len, dst->len);
                BUG_ON(1);
        }
-       if (!areas_overlap(src_offset, dst_offset, len)) {
+       if (dst_offset < src_offset) {
                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
                return;
        }
@@ -4454,47 +4840,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
        }
 }
 
-static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
-{
-       struct extent_buffer *eb =
-                       container_of(head, struct extent_buffer, rcu_head);
-
-       btrfs_release_extent_buffer(eb);
-}
-
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
+int try_release_extent_buffer(struct page *page, gfp_t mask)
 {
-       u64 start = page_offset(page);
        struct extent_buffer *eb;
-       int ret = 1;
 
-       spin_lock(&tree->buffer_lock);
-       eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-       if (!eb) {
-               spin_unlock(&tree->buffer_lock);
-               return ret;
+       /*
+        * We need to make sure nobody is attaching this page to an eb right
+        * now.
+        */
+       spin_lock(&page->mapping->private_lock);
+       if (!PagePrivate(page)) {
+               spin_unlock(&page->mapping->private_lock);
+               return 1;
        }
 
-       if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-               ret = 0;
-               goto out;
-       }
+       eb = (struct extent_buffer *)page->private;
+       BUG_ON(!eb);
 
        /*
-        * set @eb->refs to 0 if it is already 1, and then release the @eb.
-        * Or go back.
+        * This is a little awful but should be ok; we need to make sure that
+        * the eb doesn't disappear out from under us while we're looking at
+        * this page.
         */
-       if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
-               ret = 0;
-               goto out;
+       spin_lock(&eb->refs_lock);
+       if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+               spin_unlock(&eb->refs_lock);
+               spin_unlock(&page->mapping->private_lock);
+               return 0;
        }
+       spin_unlock(&page->mapping->private_lock);
 
-       radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-out:
-       spin_unlock(&tree->buffer_lock);
+       if ((mask & GFP_NOFS) == GFP_NOFS)
+               mask = GFP_NOFS;
 
-       /* at this point we can safely release the extent buffer */
-       if (atomic_read(&eb->refs) == 0)
-               call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
-       return ret;
+       /*
+        * If tree ref isn't set then we know the ref on this eb is a real ref,
+        * so just return; this page will likely be freed soon anyway.
+        */
+       if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+               spin_unlock(&eb->refs_lock);
+               return 0;
+       }
+       release_extent_buffer(eb, mask);
+
+       return 1;
 }
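try_release_extent_buffer() now starts from the page instead of the tree: page->private points straight at the eb, and clearing EXTENT_BUFFER_TREE_REF is what allows the final reference to be dropped. A hedged sketch of how a ->releasepage implementation would use the new signature (simplified, not the verbatim disk-io.c version):

        static int btree_releasepage(struct page *page, gfp_t gfp_flags)
        {
                /* pages under dirty or in-flight ebs must stay put */
                if (PageWriteback(page) || PageDirty(page))
                        return 0;
                return try_release_extent_buffer(page, gfp_flags);
        }
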
index 3a171c2592762d71d8cf1cf6472736989752ed3a..faf10eb57f75eb29edf1a8fb42468fb7924b666a 100644 (file)
 #define EXTENT_BUFFER_DIRTY 2
 #define EXTENT_BUFFER_CORRUPT 3
 #define EXTENT_BUFFER_READAHEAD 4      /* this got triggered by readahead */
+#define EXTENT_BUFFER_TREE_REF 5
+#define EXTENT_BUFFER_STALE 6
+#define EXTENT_BUFFER_WRITEBACK 7
+#define EXTENT_BUFFER_IOERR 8
 
 /* these are flags for extent_clear_unlock_delalloc */
 #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -54,6 +58,7 @@
 #define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
 
 struct extent_state;
+struct btrfs_root;
 
 typedef        int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
                                       struct bio *bio, int mirror_num,
@@ -69,9 +74,7 @@ struct extent_io_ops {
                              size_t size, struct bio *bio,
                              unsigned long bio_flags);
        int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
-       int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
-                                      u64 start, u64 end, int failed_mirror,
-                                      struct extent_state *state);
+       int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
        int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
                                        u64 start, u64 end,
                                       struct extent_state *state);
@@ -97,6 +100,7 @@ struct extent_io_tree {
        struct radix_tree_root buffer;
        struct address_space *mapping;
        u64 dirty_bytes;
+       int track_uptodate;
        spinlock_t lock;
        spinlock_t buffer_lock;
        struct extent_io_ops *ops;
@@ -119,16 +123,21 @@ struct extent_state {
        struct list_head leak_list;
 };
 
+#define INLINE_EXTENT_BUFFER_PAGES 16
+#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
 struct extent_buffer {
        u64 start;
        unsigned long len;
        unsigned long map_start;
        unsigned long map_len;
-       struct page *first_page;
        unsigned long bflags;
+       struct extent_io_tree *tree;
+       spinlock_t refs_lock;
+       atomic_t refs;
+       atomic_t io_pages;
+       int failed_mirror;
        struct list_head leak_list;
        struct rcu_head rcu_head;
-       atomic_t refs;
        pid_t lock_owner;
 
        /* count of read lock holders on the extent buffer */
@@ -152,6 +161,9 @@ struct extent_buffer {
         * to unlock
         */
        wait_queue_head_t read_lock_wq;
+       wait_queue_head_t lock_wq;
+       struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES];
+       struct page **pages;
 };
 
 static inline void extent_set_compress_type(unsigned long *bio_flags,
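A quick sanity check on the new constants: with 4 KiB pages, INLINE_EXTENT_BUFFER_PAGES = 16 gives MAX_INLINE_EXTENT_BUFFER_SIZE = 16 * 4096 = 64 KiB, so any metadata block up to 64 KiB uses the embedded inline_pages[] array and pays no extra allocation; only larger buffers fall back to a separately allocated pointer array. That is the split __alloc_extent_buffer() makes in the extent_io.c hunk earlier in this commit, restated here for illustration:

        /* illustrative restatement of the allocation choice */
        if (len > MAX_INLINE_EXTENT_BUFFER_SIZE)
                eb->pages = kzalloc(num_pages * sizeof(struct page *), mask);
        else
                eb->pages = eb->inline_pages;   /* no extra allocation */
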
@@ -178,7 +190,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 int try_release_extent_mapping(struct extent_map_tree *map,
                               struct extent_io_tree *tree, struct page *page,
                               gfp_t mask);
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
+int try_release_extent_buffer(struct page *page, gfp_t mask);
 int try_release_extent_state(struct extent_map_tree *map,
                             struct extent_io_tree *tree, struct page *page,
                             gfp_t mask);
@@ -239,6 +251,8 @@ int extent_writepages(struct extent_io_tree *tree,
                      struct address_space *mapping,
                      get_extent_t *get_extent,
                      struct writeback_control *wbc);
+int btree_write_cache_pages(struct address_space *mapping,
+                           struct writeback_control *wbc);
 int extent_readpages(struct extent_io_tree *tree,
                     struct address_space *mapping,
                     struct list_head *pages, unsigned nr_pages,
@@ -250,11 +264,11 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
 void set_page_extent_mapped(struct page *page);
 
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
-                                         u64 start, unsigned long len,
-                                         struct page *page0);
+                                         u64 start, unsigned long len);
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
                                         u64 start, unsigned long len);
 void free_extent_buffer(struct extent_buffer *eb);
+void free_extent_buffer_stale(struct extent_buffer *eb);
 #define WAIT_NONE      0
 #define WAIT_COMPLETE  1
 #define WAIT_PAGE_LOCK 2
@@ -287,18 +301,11 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 void memset_extent_buffer(struct extent_buffer *eb, char c,
                          unsigned long start, unsigned long len);
 void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
-void clear_extent_buffer_dirty(struct extent_io_tree *tree,
-                             struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
-                            struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
-                              struct extent_buffer *eb);
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-                               struct extent_buffer *eb,
-                               struct extent_state **cached_state);
-int extent_buffer_uptodate(struct extent_io_tree *tree,
-                          struct extent_buffer *eb,
-                          struct extent_state *cached_state);
+void clear_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_buffer *eb);
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
                      unsigned long min_len, char **map,
                      unsigned long *map_start,
@@ -319,4 +326,6 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
                        u64 length, u64 logical, struct page *page,
                        int mirror_num);
 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+                        int mirror_num);
 #endif
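The net effect of these prototype changes: per-buffer state now lives entirely in eb->bflags, so the extent_io_tree and cached_state plumbing drops out of every uptodate/dirty helper. A before/after sketch for a typical caller:

        /* before this commit: */
        set_extent_buffer_uptodate(io_tree, eb);
        if (extent_buffer_uptodate(io_tree, eb, cached_state))
                ...

        /* after: the buffer is self-describing */
        set_extent_buffer_uptodate(eb);
        if (extent_buffer_uptodate(eb))
                ...
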
index 7ec58bd7c50d8b7cf5cadd2ef826bdcdac0b7d5d..a13cf1a96c73ca00f4baa453f864029c03525fe4 100644 (file)
@@ -19,6 +19,7 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "print-tree.h"
 
 static int find_name_in_backref(struct btrfs_path *path, const char *name,
                         int name_len, struct btrfs_inode_ref **ref_ret)
index d6420cca9c8d65b66b1c3acdc5be85e88b254c0f..eb6aec7bbacb5de71966d27e10809ad917ed80ff 100644 (file)
@@ -658,8 +658,7 @@ retry:
                        ret = btrfs_reserve_extent(trans, root,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
-                                          0, alloc_hint,
-                                          (u64)-1, &ins, 1);
+                                          0, alloc_hint, &ins, 1);
                        if (ret)
                                btrfs_abort_transaction(trans, root, ret);
                        btrfs_end_transaction(trans, root);
@@ -884,7 +883,7 @@ static noinline int cow_file_range(struct inode *inode,
                cur_alloc_size = disk_num_bytes;
                ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
                                           root->sectorsize, 0, alloc_hint,
-                                          (u64)-1, &ins, 1);
+                                          &ins, 1);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, root, ret);
                        goto out_unlock;
@@ -5574,7 +5573,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 
        alloc_hint = get_extent_allocation_hint(inode, start, len);
        ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
-                                  alloc_hint, (u64)-1, &ins, 1);
+                                  alloc_hint, &ins, 1);
        if (ret) {
                em = ERR_PTR(ret);
                goto out;
@@ -6939,6 +6938,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        extent_map_tree_init(&ei->extent_tree);
        extent_io_tree_init(&ei->io_tree, &inode->i_data);
        extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
+       ei->io_tree.track_uptodate = 1;
+       ei->io_failure_tree.track_uptodate = 1;
        mutex_init(&ei->log_mutex);
        mutex_init(&ei->delalloc_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -7480,7 +7481,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                }
 
                ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
-                                          0, *alloc_hint, (u64)-1, &ins, 1);
+                                          0, *alloc_hint, &ins, 1);
                if (ret) {
                        if (own_trans)
                                btrfs_end_transaction(trans, root);
index 22db04550f6a03ff92580e801b860cc4ed757ebc..dc5d33146fdbb9f4fb3cf899d78a97ca361b6ac8 100644 (file)
@@ -54,7 +54,6 @@
  * than the 2 started one after another.
  */
 
-#define MAX_MIRRORS 2
 #define MAX_IN_FLIGHT 6
 
 struct reada_extctl {
@@ -71,7 +70,7 @@ struct reada_extent {
        struct list_head        extctl;
        struct kref             refcnt;
        spinlock_t              lock;
-       struct reada_zone       *zones[MAX_MIRRORS];
+       struct reada_zone       *zones[BTRFS_MAX_MIRRORS];
        int                     nzones;
        struct btrfs_device     *scheduled_for;
 };
@@ -84,7 +83,8 @@ struct reada_zone {
        spinlock_t              lock;
        int                     locked;
        struct btrfs_device     *device;
-       struct btrfs_device     *devs[MAX_MIRRORS]; /* full list, incl self */
+       struct btrfs_device     *devs[BTRFS_MAX_MIRRORS]; /* full list, incl
+                                                          * self */
        int                     ndevs;
        struct kref             refcnt;
 };
@@ -365,9 +365,9 @@ again:
        if (ret || !bbio || length < blocksize)
                goto error;
 
-       if (bbio->num_stripes > MAX_MIRRORS) {
+       if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
                printk(KERN_ERR "btrfs readahead: more than %d copies not "
-                               "supported", MAX_MIRRORS);
+                               "supported", BTRFS_MAX_MIRRORS);
                goto error;
        }
 
index 0209d8a9ae39904cbbfb1e0c832214b9b48cde12..07e59d97551a28bcfb477a98152a8f2ee89e56cf 100644 (file)
  * Future enhancements:
  *  - In case an unrepairable extent is encountered, track which files are
  *    affected and report them
- *  - In case of a read error on files with nodatasum, map the file and read
- *    the extent to trigger a writeback of the good copy
  *  - track and record media errors, throw out bad devices
  *  - add a mode to also read unallocated space
  */
 
-struct scrub_bio;
-struct scrub_page;
+struct scrub_block;
 struct scrub_dev;
-static void scrub_bio_end_io(struct bio *bio, int err);
-static void scrub_checksum(struct btrfs_work *work);
-static int scrub_checksum_data(struct scrub_dev *sdev,
-                              struct scrub_page *spag, void *buffer);
-static int scrub_checksum_tree_block(struct scrub_dev *sdev,
-                                    struct scrub_page *spag, u64 logical,
-                                    void *buffer);
-static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
-static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
-static void scrub_fixup_end_io(struct bio *bio, int err);
-static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
-                         struct page *page);
-static void scrub_fixup(struct scrub_bio *sbio, int ix);
 
 #define SCRUB_PAGES_PER_BIO    16      /* 64k per bio */
 #define SCRUB_BIOS_PER_DEV     16      /* 1 MB per device in flight */
+#define SCRUB_MAX_PAGES_PER_BLOCK      16      /* 64k per node/leaf/sector */
 
 struct scrub_page {
+       struct scrub_block      *sblock;
+       struct page             *page;
+       struct block_device     *bdev;
        u64                     flags;  /* extent flags */
        u64                     generation;
-       int                     mirror_num;
-       int                     have_csum;
+       u64                     logical;
+       u64                     physical;
+       struct {
+               unsigned int    mirror_num:8;
+               unsigned int    have_csum:1;
+               unsigned int    io_error:1;
+       };
        u8                      csum[BTRFS_CSUM_SIZE];
 };
 
@@ -77,12 +70,25 @@ struct scrub_bio {
        int                     err;
        u64                     logical;
        u64                     physical;
-       struct scrub_page       spag[SCRUB_PAGES_PER_BIO];
-       u64                     count;
+       struct scrub_page       *pagev[SCRUB_PAGES_PER_BIO];
+       int                     page_count;
        int                     next_free;
        struct btrfs_work       work;
 };
 
+struct scrub_block {
+       struct scrub_page       pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+       int                     page_count;
+       atomic_t                outstanding_pages;
+       atomic_t                ref_count; /* free mem on transition to zero */
+       struct scrub_dev        *sdev;
+       struct {
+               unsigned int    header_error:1;
+               unsigned int    checksum_error:1;
+               unsigned int    no_io_error_seen:1;
+       };
+};
+
 struct scrub_dev {
        struct scrub_bio        *bios[SCRUB_BIOS_PER_DEV];
        struct btrfs_device     *dev;
@@ -96,6 +102,10 @@ struct scrub_dev {
        struct list_head        csum_list;
        atomic_t                cancel_req;
        int                     readonly;
+       int                     pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
+       u32                     sectorsize;
+       u32                     nodesize;
+       u32                     leafsize;
        /*
         * statistics
         */
@@ -124,6 +134,43 @@ struct scrub_warning {
        int                     scratch_bufsize;
 };
 
+
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
+static int scrub_setup_recheck_block(struct scrub_dev *sdev,
+                                    struct btrfs_mapping_tree *map_tree,
+                                    u64 length, u64 logical,
+                                    struct scrub_block *sblock);
+static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
+                              struct scrub_block *sblock, int is_metadata,
+                              int have_csum, u8 *csum, u64 generation,
+                              u16 csum_size);
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+                                        struct scrub_block *sblock,
+                                        int is_metadata, int have_csum,
+                                        const u8 *csum, u64 generation,
+                                        u16 csum_size);
+static void scrub_complete_bio_end_io(struct bio *bio, int err);
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+                                            struct scrub_block *sblock_good,
+                                            int force_write);
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+                                           struct scrub_block *sblock_good,
+                                           int page_num, int force_write);
+static int scrub_checksum_data(struct scrub_block *sblock);
+static int scrub_checksum_tree_block(struct scrub_block *sblock);
+static int scrub_checksum_super(struct scrub_block *sblock);
+static void scrub_block_get(struct scrub_block *sblock);
+static void scrub_block_put(struct scrub_block *sblock);
+static int scrub_add_page_to_bio(struct scrub_dev *sdev,
+                                struct scrub_page *spage);
+static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
+                      u64 physical, u64 flags, u64 gen, int mirror_num,
+                      u8 *csum, int force);
+static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_block_complete(struct scrub_block *sblock);
+
+
 static void scrub_free_csums(struct scrub_dev *sdev)
 {
        while (!list_empty(&sdev->csum_list)) {
@@ -135,23 +182,6 @@ static void scrub_free_csums(struct scrub_dev *sdev)
        }
 }
 
-static void scrub_free_bio(struct bio *bio)
-{
-       int i;
-       struct page *last_page = NULL;
-
-       if (!bio)
-               return;
-
-       for (i = 0; i < bio->bi_vcnt; ++i) {
-               if (bio->bi_io_vec[i].bv_page == last_page)
-                       continue;
-               last_page = bio->bi_io_vec[i].bv_page;
-               __free_page(last_page);
-       }
-       bio_put(bio);
-}
-
 static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
 {
        int i;
@@ -159,13 +189,23 @@ static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
        if (!sdev)
                return;
 
+       /* this can happen when scrub is cancelled */
+       if (sdev->curr != -1) {
+               struct scrub_bio *sbio = sdev->bios[sdev->curr];
+
+               for (i = 0; i < sbio->page_count; i++) {
+                       BUG_ON(!sbio->pagev[i]);
+                       BUG_ON(!sbio->pagev[i]->page);
+                       scrub_block_put(sbio->pagev[i]->sblock);
+               }
+               bio_put(sbio->bio);
+       }
+
        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
                struct scrub_bio *sbio = sdev->bios[i];
 
                if (!sbio)
                        break;
-
-               scrub_free_bio(sbio->bio);
                kfree(sbio);
        }
 
@@ -179,11 +219,16 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
        struct scrub_dev *sdev;
        int             i;
        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+       int pages_per_bio;
 
+       pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
+                             bio_get_nr_vecs(dev->bdev));
        sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
        if (!sdev)
                goto nomem;
        sdev->dev = dev;
+       sdev->pages_per_bio = pages_per_bio;
+       sdev->curr = -1;
        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
                struct scrub_bio *sbio;
 
@@ -194,8 +239,8 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 
                sbio->index = i;
                sbio->sdev = sdev;
-               sbio->count = 0;
-               sbio->work.func = scrub_checksum;
+               sbio->page_count = 0;
+               sbio->work.func = scrub_bio_end_io_worker;
 
                if (i != SCRUB_BIOS_PER_DEV-1)
                        sdev->bios[i]->next_free = i + 1;
@@ -203,7 +248,9 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
                        sdev->bios[i]->next_free = -1;
        }
        sdev->first_free = 0;
-       sdev->curr = -1;
+       sdev->nodesize = dev->dev_root->nodesize;
+       sdev->leafsize = dev->dev_root->leafsize;
+       sdev->sectorsize = dev->dev_root->sectorsize;
        atomic_set(&sdev->in_flight, 0);
        atomic_set(&sdev->fixup_cnt, 0);
        atomic_set(&sdev->cancel_req, 0);
@@ -294,10 +341,9 @@ err:
        return 0;
 }
 
-static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
-                               int ix)
+static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 {
-       struct btrfs_device *dev = sbio->sdev->dev;
+       struct btrfs_device *dev = sblock->sdev->dev;
        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
        struct btrfs_path *path;
        struct btrfs_key found_key;
@@ -316,8 +362,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
 
        swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
        swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
-       swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
-       swarn.logical = sbio->logical + ix * PAGE_SIZE;
+       BUG_ON(sblock->page_count < 1);
+       swarn.sector = (sblock->pagev[0].physical) >> 9;
+       swarn.logical = sblock->pagev[0].logical;
        swarn.errstr = errstr;
        swarn.dev = dev;
        swarn.msg_bufsize = bufsize;
@@ -342,7 +389,8 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
                do {
                        ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
                                                        &ref_root, &ref_level);
-                       printk(KERN_WARNING "%s at logical %llu on dev %s, "
+                       printk(KERN_WARNING
+                               "btrfs: %s at logical %llu on dev %s, "
                                "sector %llu: metadata %s (level %d) in tree "
                                "%llu\n", errstr, swarn.logical, dev->name,
                                (unsigned long long)swarn.sector,
@@ -531,9 +579,9 @@ out:
                spin_lock(&sdev->stat_lock);
                ++sdev->stat.uncorrectable_errors;
                spin_unlock(&sdev->stat_lock);
-               printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
-                                       "(nodatasum) error at logical %llu\n",
-                                       fixup->logical);
+               printk_ratelimited(KERN_ERR
+                       "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+                       (unsigned long long)fixup->logical, sdev->dev->name);
        }
 
        btrfs_free_path(path);
@@ -550,91 +598,168 @@ out:
 }
 
 /*
- * scrub_recheck_error gets called when either verification of the page
- * failed or the bio failed to read, e.g. with EIO. In the latter case,
- * recheck_error gets called for every page in the bio, even though only
- * one may be bad
+ * scrub_handle_errored_block gets called when either verification of the
+ * pages failed or the bio failed to read, e.g. with EIO. In the latter
+ * case, this function handles all pages in the bio, even though only one
+ * may be bad.
+ * The goal of this function is to repair the errored block by using the
+ * contents of one of the mirrors.
  */
-static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 {
-       struct scrub_dev *sdev = sbio->sdev;
-       u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+       struct scrub_dev *sdev = sblock_to_check->sdev;
+       struct btrfs_fs_info *fs_info;
+       u64 length;
+       u64 logical;
+       u64 generation;
+       unsigned int failed_mirror_index;
+       unsigned int is_metadata;
+       unsigned int have_csum;
+       u8 *csum;
+       struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+       struct scrub_block *sblock_bad;
+       int ret;
+       int mirror_index;
+       int page_num;
+       int success;
        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
-                                       DEFAULT_RATELIMIT_BURST);
+                                     DEFAULT_RATELIMIT_BURST);
+
+       BUG_ON(sblock_to_check->page_count < 1);
+       fs_info = sdev->dev->dev_root->fs_info;
+       length = sblock_to_check->page_count * PAGE_SIZE;
+       logical = sblock_to_check->pagev[0].logical;
+       generation = sblock_to_check->pagev[0].generation;
+       BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
+       failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
+       is_metadata = !(sblock_to_check->pagev[0].flags &
+                       BTRFS_EXTENT_FLAG_DATA);
+       have_csum = sblock_to_check->pagev[0].have_csum;
+       csum = sblock_to_check->pagev[0].csum;
 
-       if (sbio->err) {
-               if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
-                                  sbio->bio->bi_io_vec[ix].bv_page) == 0) {
-                       if (scrub_fixup_check(sbio, ix) == 0)
-                               return 0;
-               }
-               if (__ratelimit(&_rs))
-                       scrub_print_warning("i/o error", sbio, ix);
-       } else {
-               if (__ratelimit(&_rs))
-                       scrub_print_warning("checksum error", sbio, ix);
+       /*
+        * Read all mirrors one after the other. This includes
+        * re-reading the extent or metadata block that failed (the
+        * one that caused this fixup code to be called), this time
+        * page by page, in order to know, for every mirror, which
+        * pages caused I/O errors and which ones are good.
+        * The goal is to handle the situation when more than one
+        * mirror contains I/O errors, but the errors do not
+        * overlap, i.e. the data can be repaired by selecting, for
+        * each page, a mirror without an I/O error on that
+        * particular page. One example (with blocks >= 2 * PAGE_SIZE)
+        * would be that mirror #1 has an I/O error on the first page,
+        * the second page is good, and mirror #2 has an I/O error on
+        * the second page, but the first page is good.
+        * Then the first page of the first mirror can be repaired by
+        * taking the first page of the second mirror, and the
+        * second page of the second mirror can be repaired by
+        * copying the contents of the 2nd page of the 1st mirror.
+        * One more note: if the pages of one mirror contain I/O
+        * errors, the checksum of that mirror cannot be verified. In
+        * order to get the best data for repairing, the first attempt
+        * is to find a mirror without I/O errors and with a valid
+        * checksum. Only if this is not possible are the pages picked
+        * from mirrors with I/O errors, without considering the
+        * checksum. In the latter case, the checksum of the repaired
+        * area is verified at the end in order to keep the
+        * statistics correct.
+        */
+
+       sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
+                                    sizeof(*sblocks_for_recheck),
+                                    GFP_NOFS);
+       if (!sblocks_for_recheck) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.malloc_errors++;
+               sdev->stat.read_errors++;
+               sdev->stat.uncorrectable_errors++;
+               spin_unlock(&sdev->stat_lock);
+               goto out;
        }
 
-       spin_lock(&sdev->stat_lock);
-       ++sdev->stat.read_errors;
-       spin_unlock(&sdev->stat_lock);
+       /* setup the context, map the logical blocks and alloc the pages */
+       ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
+                                       logical, sblocks_for_recheck);
+       if (ret) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.read_errors++;
+               sdev->stat.uncorrectable_errors++;
+               spin_unlock(&sdev->stat_lock);
+               goto out;
+       }
+       BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
+       sblock_bad = sblocks_for_recheck + failed_mirror_index;
 
-       scrub_fixup(sbio, ix);
-       return 1;
-}
+       /* build and submit the bios for the failed mirror, check checksums */
+       ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+                                 csum, generation, sdev->csum_size);
+       if (ret) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.read_errors++;
+               sdev->stat.uncorrectable_errors++;
+               spin_unlock(&sdev->stat_lock);
+               goto out;
+       }
 
-static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
-{
-       int ret = 1;
-       struct page *page;
-       void *buffer;
-       u64 flags = sbio->spag[ix].flags;
+       if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
+           sblock_bad->no_io_error_seen) {
+               /*
+                * the error disappeared after reading page by page, or
+                * the area was part of a huge bio and other parts of the
+                * bio caused I/O errors, or the block layer merged several
+                * read requests into one and the error was caused by a
+                * different bio (usually one of the latter two cases is
+                * the reason)
+                */
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.unverified_errors++;
+               spin_unlock(&sdev->stat_lock);
 
-       page = sbio->bio->bi_io_vec[ix].bv_page;
-       buffer = kmap_atomic(page, KM_USER0);
-       if (flags & BTRFS_EXTENT_FLAG_DATA) {
-               ret = scrub_checksum_data(sbio->sdev,
-                                         sbio->spag + ix, buffer);
-       } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-               ret = scrub_checksum_tree_block(sbio->sdev,
-                                               sbio->spag + ix,
-                                               sbio->logical + ix * PAGE_SIZE,
-                                               buffer);
-       } else {
-               WARN_ON(1);
+               goto out;
        }
-       kunmap_atomic(buffer, KM_USER0);
 
-       return ret;
-}
+       if (!sblock_bad->no_io_error_seen) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.read_errors++;
+               spin_unlock(&sdev->stat_lock);
+               if (__ratelimit(&_rs))
+                       scrub_print_warning("i/o error", sblock_to_check);
+       } else if (sblock_bad->checksum_error) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.csum_errors++;
+               spin_unlock(&sdev->stat_lock);
+               if (__ratelimit(&_rs))
+                       scrub_print_warning("checksum error", sblock_to_check);
+       } else if (sblock_bad->header_error) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.verify_errors++;
+               spin_unlock(&sdev->stat_lock);
+               if (__ratelimit(&_rs))
+                       scrub_print_warning("checksum/header error",
+                                           sblock_to_check);
+       }
 
-static void scrub_fixup_end_io(struct bio *bio, int err)
-{
-       complete((struct completion *)bio->bi_private);
-}
+       if (sdev->readonly)
+               goto did_not_correct_error;
+
+       if (!is_metadata && !have_csum) {
+               struct scrub_fixup_nodatasum *fixup_nodatasum;
 
-static void scrub_fixup(struct scrub_bio *sbio, int ix)
-{
-       struct scrub_dev *sdev = sbio->sdev;
-       struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
-       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
-       struct btrfs_bio *bbio = NULL;
-       struct scrub_fixup_nodatasum *fixup;
-       u64 logical = sbio->logical + ix * PAGE_SIZE;
-       u64 length;
-       int i;
-       int ret;
-       DECLARE_COMPLETION_ONSTACK(complete);
-
-       if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
-           (sbio->spag[ix].have_csum == 0)) {
-               fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
-               if (!fixup)
-                       goto uncorrectable;
-               fixup->sdev = sdev;
-               fixup->logical = logical;
-               fixup->root = fs_info->extent_root;
-               fixup->mirror_num = sbio->spag[ix].mirror_num;
+               /*
+                * !is_metadata and !have_csum mean that the data might
+                * not be COW'ed and might be modified concurrently.
+                * The general strategy of working on the commit root
+                * does not help in the case when COW is not used.
+                */
+               fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
+               if (!fixup_nodatasum)
+                       goto did_not_correct_error;
+               fixup_nodatasum->sdev = sdev;
+               fixup_nodatasum->logical = logical;
+               fixup_nodatasum->root = fs_info->extent_root;
+               fixup_nodatasum->mirror_num = failed_mirror_index + 1;
                /*
                 * increment scrubs_running to prevent cancel requests from
                 * completing as long as a fixup worker is running. we must also
@@ -649,235 +774,529 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
                atomic_inc(&fs_info->scrubs_paused);
                mutex_unlock(&fs_info->scrub_lock);
                atomic_inc(&sdev->fixup_cnt);
-               fixup->work.func = scrub_fixup_nodatasum;
-               btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
-               return;
+               fixup_nodatasum->work.func = scrub_fixup_nodatasum;
+               btrfs_queue_worker(&fs_info->scrub_workers,
+                                  &fixup_nodatasum->work);
+               goto out;
        }
 
-       length = PAGE_SIZE;
-       ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
-                             &bbio, 0);
-       if (ret || !bbio || length < PAGE_SIZE) {
-               printk(KERN_ERR
-                      "scrub_fixup: btrfs_map_block failed us for %llu\n",
-                      (unsigned long long)logical);
-               WARN_ON(1);
-               kfree(bbio);
-               return;
+       /*
+        * now build and submit the bios for the other mirrors, check
+        * checksums
+        */
+       for (mirror_index = 0;
+            mirror_index < BTRFS_MAX_MIRRORS &&
+            sblocks_for_recheck[mirror_index].page_count > 0;
+            mirror_index++) {
+               if (mirror_index == failed_mirror_index)
+                       continue;
+
+               /* build and submit the bios, check checksums */
+               ret = scrub_recheck_block(fs_info,
+                                         sblocks_for_recheck + mirror_index,
+                                         is_metadata, have_csum, csum,
+                                         generation, sdev->csum_size);
+               if (ret)
+                       goto did_not_correct_error;
        }
 
-       if (bbio->num_stripes == 1)
-               /* there aren't any replicas */
-               goto uncorrectable;
+       /*
+        * First try to pick the mirror that is completely without I/O
+        * errors and also does not have a checksum error.
+        * If one is found, and if a checksum is present, the full block
+        * that is known to contain an error is rewritten. Afterwards
+        * the block is known to be corrected.
+        * If a mirror is found which is completely correct, but no
+        * checksum is present, only those pages are rewritten that had
+        * an I/O error in the block to be repaired, since it cannot be
+        * determined which copy of the other pages is better (and it
+        * could otherwise happen that a correct page would be
+        * overwritten by a bad one).
+        */
+       for (mirror_index = 0;
+            mirror_index < BTRFS_MAX_MIRRORS &&
+            sblocks_for_recheck[mirror_index].page_count > 0;
+            mirror_index++) {
+               struct scrub_block *sblock_other = sblocks_for_recheck +
+                                                  mirror_index;
+
+               if (!sblock_other->header_error &&
+                   !sblock_other->checksum_error &&
+                   sblock_other->no_io_error_seen) {
+                       int force_write = is_metadata || have_csum;
+
+                       ret = scrub_repair_block_from_good_copy(sblock_bad,
+                                                               sblock_other,
+                                                               force_write);
+                       if (0 == ret)
+                               goto corrected_error;
+               }
+       }
 
        /*
-        * first find a good copy
+        * In case of I/O errors in the area that is supposed to be
+        * repaired, continue by picking good copies of those pages:
+        * select good pages from other mirrors to rewrite the bad
+        * pages in the area to fix. Afterwards verify the checksum of
+        * the block that is supposed to be repaired. This verification
+        * step is only done for the purpose of statistics counting and
+        * for the final scrub report, i.e. whether errors remain.
+        * A perfect algorithm could make use of the checksum and try
+        * all possible combinations of pages from the different mirrors
+        * until the checksum verification succeeds. For example, when
+        * the 2nd page of mirror #1 has an I/O error, and the 2nd page
+        * of mirror #2 is readable but the final checksum test fails,
+        * then the 2nd page of mirror #3 could be tried to see whether
+        * the final checksum then succeeds. But this would be a rare
+        * exception and is therefore not implemented. At least it is
+        * avoided that a good copy gets overwritten by a bad one.
+        * A more useful improvement would be to pick the sectors
+        * without I/O errors based on the sector size (512 bytes on
+        * legacy disks) instead of on PAGE_SIZE. Then maybe 512 bytes
+        * of one mirror could be repaired by taking 512 bytes from a
+        * different mirror, even if other 512-byte sectors in the same
+        * PAGE_SIZE area are unreadable.
+        */
-       for (i = 0; i < bbio->num_stripes; ++i) {
-               if (i + 1 == sbio->spag[ix].mirror_num)
-                       continue;
 
-               if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
-                                  bbio->stripes[i].physical >> 9,
-                                  sbio->bio->bi_io_vec[ix].bv_page)) {
-                       /* I/O-error, this is not a good copy */
+       /* can only fix I/O errors from here on */
+       if (sblock_bad->no_io_error_seen)
+               goto did_not_correct_error;
+
+       success = 1;
+       for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+               struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+
+               if (!page_bad->io_error)
                        continue;
+
+               for (mirror_index = 0;
+                    mirror_index < BTRFS_MAX_MIRRORS &&
+                    sblocks_for_recheck[mirror_index].page_count > 0;
+                    mirror_index++) {
+                       struct scrub_block *sblock_other = sblocks_for_recheck +
+                                                          mirror_index;
+                       struct scrub_page *page_other = sblock_other->pagev +
+                                                       page_num;
+
+                       if (!page_other->io_error) {
+                               ret = scrub_repair_page_from_good_copy(
+                                       sblock_bad, sblock_other, page_num, 0);
+                               if (0 == ret) {
+                                       page_bad->io_error = 0;
+                                       break; /* succeeded for this page */
+                               }
+                       }
                }
 
-               if (scrub_fixup_check(sbio, ix) == 0)
-                       break;
+               if (page_bad->io_error) {
+                       /* did not find a mirror to copy the page from */
+                       success = 0;
+               }
        }
-       if (i == bbio->num_stripes)
-               goto uncorrectable;
 
-       if (!sdev->readonly) {
-               /*
-                * bi_io_vec[ix].bv_page now contains good data, write it back
-                */
-               if (scrub_fixup_io(WRITE, sdev->dev->bdev,
-                                  (sbio->physical + ix * PAGE_SIZE) >> 9,
-                                  sbio->bio->bi_io_vec[ix].bv_page)) {
-                       /* I/O-error, writeback failed, give up */
-                       goto uncorrectable;
+       if (success) {
+               if (is_metadata || have_csum) {
+                       /*
+                        * need to verify the checksum now that all
+                        * sectors on disk are repaired (the write
+                        * request for data to be repaired is on its way).
+                        * Just be lazy and use scrub_recheck_block()
+                        * which re-reads the data before the checksum
+                        * is verified, but most likely the data comes out
+                        * of the page cache.
+                        */
+                       ret = scrub_recheck_block(fs_info, sblock_bad,
+                                                 is_metadata, have_csum, csum,
+                                                 generation, sdev->csum_size);
+                       if (!ret && !sblock_bad->header_error &&
+                           !sblock_bad->checksum_error &&
+                           sblock_bad->no_io_error_seen)
+                               goto corrected_error;
+                       else
+                               goto did_not_correct_error;
+               } else {
+corrected_error:
+                       spin_lock(&sdev->stat_lock);
+                       sdev->stat.corrected_errors++;
+                       spin_unlock(&sdev->stat_lock);
+                       printk_ratelimited(KERN_ERR
+                               "btrfs: fixed up error at logical %llu on dev %s\n",
+                               (unsigned long long)logical, sdev->dev->name);
                }
+       } else {
+did_not_correct_error:
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.uncorrectable_errors++;
+               spin_unlock(&sdev->stat_lock);
+               printk_ratelimited(KERN_ERR
+                       "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
+                       (unsigned long long)logical, sdev->dev->name);
        }
 
-       kfree(bbio);
-       spin_lock(&sdev->stat_lock);
-       ++sdev->stat.corrected_errors;
-       spin_unlock(&sdev->stat_lock);
+out:
+       if (sblocks_for_recheck) {
+               for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
+                    mirror_index++) {
+                       struct scrub_block *sblock = sblocks_for_recheck +
+                                                    mirror_index;
+                       int page_index;
+
+                       for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
+                            page_index++)
+                               if (sblock->pagev[page_index].page)
+                                       __free_page(
+                                               sblock->pagev[page_index].page);
+               }
+               kfree(sblocks_for_recheck);
+       }
 
-       printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
-                              (unsigned long long)logical);
-       return;
+       return 0;
+}
 
-uncorrectable:
-       kfree(bbio);
-       spin_lock(&sdev->stat_lock);
-       ++sdev->stat.uncorrectable_errors;
-       spin_unlock(&sdev->stat_lock);
+static int scrub_setup_recheck_block(struct scrub_dev *sdev,
+                                    struct btrfs_mapping_tree *map_tree,
+                                    u64 length, u64 logical,
+                                    struct scrub_block *sblocks_for_recheck)
+{
+       int page_index;
+       int mirror_index;
+       int ret;
+
+       /*
+        * note: the three members sdev, ref_count and outstanding_pages
+        * are not used (and not set) in the blocks that are used for
+        * the recheck procedure
+        */
+
+       page_index = 0;
+       while (length > 0) {
+               u64 sublen = min_t(u64, length, PAGE_SIZE);
+               u64 mapped_length = sublen;
+               struct btrfs_bio *bbio = NULL;
+
+               /*
+                * with a length of PAGE_SIZE, each returned stripe
+                * represents one mirror
+                */
+               ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
+                                     &bbio, 0);
+               if (ret || !bbio || mapped_length < sublen) {
+                       kfree(bbio);
+                       return -EIO;
+               }
+
+               BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
+               for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
+                    mirror_index++) {
+                       struct scrub_block *sblock;
+                       struct scrub_page *page;
+
+                       if (mirror_index >= BTRFS_MAX_MIRRORS)
+                               continue;
+
+                       sblock = sblocks_for_recheck + mirror_index;
+                       page = sblock->pagev + page_index;
+                       page->logical = logical;
+                       page->physical = bbio->stripes[mirror_index].physical;
+                       page->bdev = bbio->stripes[mirror_index].dev->bdev;
+                       page->mirror_num = mirror_index + 1;
+                       page->page = alloc_page(GFP_NOFS);
+                       if (!page->page) {
+                               spin_lock(&sdev->stat_lock);
+                               sdev->stat.malloc_errors++;
+                               spin_unlock(&sdev->stat_lock);
+                               return -ENOMEM;
+                       }
+                       sblock->page_count++;
+               }
+               kfree(bbio);
+               length -= sublen;
+               logical += sublen;
+               page_index++;
+       }
 
-       printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
-                               "logical %llu\n", (unsigned long long)logical);
+       return 0;
 }
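
scrub_setup_recheck_block() walks the logical range in PAGE_SIZE steps and maps each step once, treating every returned stripe as one mirror. A minimal sketch of that range walk, with a hypothetical map_fn standing in for btrfs_map_block() plus the per-mirror page setup:

#include <stdint.h>

#define PAGE_SZ 4096u

/* hypothetical hook: stands in for btrfs_map_block() + per-mirror setup */
typedef int (*map_fn)(uint64_t logical, uint64_t sublen, int page_index);

static int walk_range(uint64_t logical, uint64_t length, map_fn map_one)
{
	int page_index = 0;

	while (length > 0) {
		/* one mapping call per page-sized chunk, tail may be short */
		uint64_t sublen = length < PAGE_SZ ? length : PAGE_SZ;
		int ret = map_one(logical, sublen, page_index);

		if (ret)
			return ret;
		length  -= sublen;
		logical += sublen;
		page_index++;
	}
	return 0;
}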
 
-static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
-                        struct page *page)
+/*
+ * this function checks the on-disk data for checksum errors, header
+ * errors and read I/O errors. If any I/O error happens, the exact pages
+ * that hit the error are marked as bad. The goal is to enable scrub to
+ * take, from any of the mirrors, the pages that are free of errors, so
+ * that the pages that are in error in the mirror just handled can be
+ * repaired.
+ */
+static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
+                              struct scrub_block *sblock, int is_metadata,
+                              int have_csum, u8 *csum, u64 generation,
+                              u16 csum_size)
 {
-       struct bio *bio = NULL;
-       int ret;
-       DECLARE_COMPLETION_ONSTACK(complete);
+       int page_num;
 
-       bio = bio_alloc(GFP_NOFS, 1);
-       bio->bi_bdev = bdev;
-       bio->bi_sector = sector;
-       bio_add_page(bio, page, PAGE_SIZE, 0);
-       bio->bi_end_io = scrub_fixup_end_io;
-       bio->bi_private = &complete;
-       btrfsic_submit_bio(rw, bio);
+       sblock->no_io_error_seen = 1;
+       sblock->header_error = 0;
+       sblock->checksum_error = 0;
 
-       /* this will also unplug the queue */
-       wait_for_completion(&complete);
+       for (page_num = 0; page_num < sblock->page_count; page_num++) {
+               struct bio *bio;
+               int ret;
+               struct scrub_page *page = sblock->pagev + page_num;
+               DECLARE_COMPLETION_ONSTACK(complete);
+
+               BUG_ON(!page->page);
+               bio = bio_alloc(GFP_NOFS, 1);
+               bio->bi_bdev = page->bdev;
+               bio->bi_sector = page->physical >> 9;
+               bio->bi_end_io = scrub_complete_bio_end_io;
+               bio->bi_private = &complete;
+
+               ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
+               if (PAGE_SIZE != ret) {
+                       bio_put(bio);
+                       return -EIO;
+               }
+               btrfsic_submit_bio(READ, bio);
 
-       ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
-       bio_put(bio);
-       return ret;
+               /* this will also unplug the queue */
+               wait_for_completion(&complete);
+
+               page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+               if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+                       sblock->no_io_error_seen = 0;
+               bio_put(bio);
+       }
+
+       if (sblock->no_io_error_seen)
+               scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
+                                            have_csum, csum, generation,
+                                            csum_size);
+
+       return 0;
 }
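
scrub_recheck_block() turns an asynchronous bio into a synchronous read by pointing bi_end_io at a routine that signals an on-stack completion and then waiting on it. A user-space analogue of that pattern, sketched with pthreads (fake_end_io stands in for the bio end_io callback):

#include <pthread.h>
#include <stdio.h>

/* minimal stand-in for DECLARE_COMPLETION_ONSTACK / complete / wait */
struct completion {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	int             done;
};

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

/* the "end_io" callback just signals the waiter, as in the patch */
static void *fake_end_io(void *arg)
{
	complete(arg);
	return NULL;
}

int main(void)
{
	struct completion c;
	pthread_t t;

	pthread_mutex_init(&c.lock, NULL);
	pthread_cond_init(&c.cond, NULL);
	c.done = 0;

	pthread_create(&t, NULL, fake_end_io, &c);	/* "submit" */
	wait_for_completion(&c);			/* block until end_io runs */
	pthread_join(t, NULL);
	puts("io done");
	return 0;
}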
 
-static void scrub_bio_end_io(struct bio *bio, int err)
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+                                        struct scrub_block *sblock,
+                                        int is_metadata, int have_csum,
+                                        const u8 *csum, u64 generation,
+                                        u16 csum_size)
 {
-       struct scrub_bio *sbio = bio->bi_private;
-       struct scrub_dev *sdev = sbio->sdev;
-       struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+       int page_num;
+       u8 calculated_csum[BTRFS_CSUM_SIZE];
+       u32 crc = ~(u32)0;
+       struct btrfs_root *root = fs_info->extent_root;
+       void *mapped_buffer;
+
+       BUG_ON(!sblock->pagev[0].page);
+       if (is_metadata) {
+               struct btrfs_header *h;
+
+               mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0);
+               h = (struct btrfs_header *)mapped_buffer;
+
+               if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
+                   generation != le64_to_cpu(h->generation) ||
+                   memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
+                   memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+                          BTRFS_UUID_SIZE))
+                       sblock->header_error = 1;
+               csum = h->csum;
+       } else {
+               if (!have_csum)
+                       return;
 
-       sbio->err = err;
-       sbio->bio = bio;
+               mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0);
+       }
 
-       btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+       for (page_num = 0;;) {
+               if (page_num == 0 && is_metadata)
+                       crc = btrfs_csum_data(root,
+                               ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
+                               crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
+               else
+                       crc = btrfs_csum_data(root, mapped_buffer, crc,
+                                             PAGE_SIZE);
+
+               kunmap_atomic(mapped_buffer, KM_USER0);
+               page_num++;
+               if (page_num >= sblock->page_count)
+                       break;
+               BUG_ON(!sblock->pagev[page_num].page);
+
+               mapped_buffer = kmap_atomic(sblock->pagev[page_num].page,
+                                           KM_USER0);
+       }
+
+       btrfs_csum_final(crc, calculated_csum);
+       if (memcmp(calculated_csum, csum, csum_size))
+               sblock->checksum_error = 1;
 }
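
scrub_recheck_block_checksum() accumulates one checksum across all pages of a block, skipping the checksum bytes embedded at the start of the first page for metadata. A sketch of that accumulation in user space, using zlib's crc32() merely as a stand-in for the kernel's crc32c (btrfs seeds with ~0 and inverts in btrfs_csum_final(), which this sketch does not reproduce):

#include <stddef.h>
#include <zlib.h>	/* crc32() as a stand-in for btrfs's crc32c */

#define PAGE_SZ 4096
#define CSUM_SZ 4	/* btrfs crc32c checksums occupy 4 bytes */

/*
 * Accumulate one checksum across several page-sized buffers, skipping
 * the stored checksum at the start of the first page, the same way the
 * tree-block path above skips BTRFS_CSUM_SIZE.
 */
static unsigned long csum_pages(const unsigned char *pages[], int npages)
{
	unsigned long crc = crc32(0L, Z_NULL, 0);

	for (int i = 0; i < npages; i++) {
		const unsigned char *p = pages[i];
		size_t len = PAGE_SZ;

		if (i == 0) {		/* first page holds the stored csum */
			p   += CSUM_SZ;
			len -= CSUM_SZ;
		}
		crc = crc32(crc, p, len);
	}
	return crc;
}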
 
-static void scrub_checksum(struct btrfs_work *work)
+static void scrub_complete_bio_end_io(struct bio *bio, int err)
 {
-       struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
-       struct scrub_dev *sdev = sbio->sdev;
-       struct page *page;
-       void *buffer;
-       int i;
-       u64 flags;
-       u64 logical;
-       int ret;
+       complete((struct completion *)bio->bi_private);
+}
 
-       if (sbio->err) {
-               ret = 0;
-               for (i = 0; i < sbio->count; ++i)
-                       ret |= scrub_recheck_error(sbio, i);
-               if (!ret) {
-                       spin_lock(&sdev->stat_lock);
-                       ++sdev->stat.unverified_errors;
-                       spin_unlock(&sdev->stat_lock);
-               }
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+                                            struct scrub_block *sblock_good,
+                                            int force_write)
+{
+       int page_num;
+       int ret = 0;
 
-               sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
-               sbio->bio->bi_phys_segments = 0;
-               sbio->bio->bi_idx = 0;
+       for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+               int ret_sub;
 
-               for (i = 0; i < sbio->count; i++) {
-                       struct bio_vec *bi;
-                       bi = &sbio->bio->bi_io_vec[i];
-                       bi->bv_offset = 0;
-                       bi->bv_len = PAGE_SIZE;
-               }
-               goto out;
+               ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
+                                                          sblock_good,
+                                                          page_num,
+                                                          force_write);
+               if (ret_sub)
+                       ret = ret_sub;
        }
-       for (i = 0; i < sbio->count; ++i) {
-               page = sbio->bio->bi_io_vec[i].bv_page;
-               buffer = kmap_atomic(page, KM_USER0);
-               flags = sbio->spag[i].flags;
-               logical = sbio->logical + i * PAGE_SIZE;
-               ret = 0;
-               if (flags & BTRFS_EXTENT_FLAG_DATA) {
-                       ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
-               } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-                       ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
-                                                       logical, buffer);
-               } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
-                       BUG_ON(i);
-                       (void)scrub_checksum_super(sbio, buffer);
-               } else {
-                       WARN_ON(1);
-               }
-               kunmap_atomic(buffer, KM_USER0);
-               if (ret) {
-                       ret = scrub_recheck_error(sbio, i);
-                       if (!ret) {
-                               spin_lock(&sdev->stat_lock);
-                               ++sdev->stat.unverified_errors;
-                               spin_unlock(&sdev->stat_lock);
-                       }
+
+       return ret;
+}
+
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+                                           struct scrub_block *sblock_good,
+                                           int page_num, int force_write)
+{
+       struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+       struct scrub_page *page_good = sblock_good->pagev + page_num;
+
+       BUG_ON(sblock_bad->pagev[page_num].page == NULL);
+       BUG_ON(sblock_good->pagev[page_num].page == NULL);
+       if (force_write || sblock_bad->header_error ||
+           sblock_bad->checksum_error || page_bad->io_error) {
+               struct bio *bio;
+               int ret;
+               DECLARE_COMPLETION_ONSTACK(complete);
+
+               bio = bio_alloc(GFP_NOFS, 1);
+               bio->bi_bdev = page_bad->bdev;
+               bio->bi_sector = page_bad->physical >> 9;
+               bio->bi_end_io = scrub_complete_bio_end_io;
+               bio->bi_private = &complete;
+
+               ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
+               if (PAGE_SIZE != ret) {
+                       bio_put(bio);
+                       return -EIO;
                }
+               btrfsic_submit_bio(WRITE, bio);
+
+               /* this will also unplug the queue */
+               wait_for_completion(&complete);
+               bio_put(bio);
        }
 
-out:
-       scrub_free_bio(sbio->bio);
-       sbio->bio = NULL;
-       spin_lock(&sdev->list_lock);
-       sbio->next_free = sdev->first_free;
-       sdev->first_free = sbio->index;
-       spin_unlock(&sdev->list_lock);
-       atomic_dec(&sdev->in_flight);
-       wake_up(&sdev->list_wait);
+       return 0;
 }
 
-static int scrub_checksum_data(struct scrub_dev *sdev,
-                              struct scrub_page *spag, void *buffer)
+static void scrub_checksum(struct scrub_block *sblock)
 {
+       u64 flags;
+       int ret;
+
+       BUG_ON(sblock->page_count < 1);
+       flags = sblock->pagev[0].flags;
+       ret = 0;
+       if (flags & BTRFS_EXTENT_FLAG_DATA)
+               ret = scrub_checksum_data(sblock);
+       else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+               ret = scrub_checksum_tree_block(sblock);
+       else if (flags & BTRFS_EXTENT_FLAG_SUPER)
+               (void)scrub_checksum_super(sblock);
+       else
+               WARN_ON(1);
+       if (ret)
+               scrub_handle_errored_block(sblock);
+}
+
+static int scrub_checksum_data(struct scrub_block *sblock)
+{
+       struct scrub_dev *sdev = sblock->sdev;
        u8 csum[BTRFS_CSUM_SIZE];
+       u8 *on_disk_csum;
+       struct page *page;
+       void *buffer;
        u32 crc = ~(u32)0;
        int fail = 0;
        struct btrfs_root *root = sdev->dev->dev_root;
+       u64 len;
+       int index;
 
-       if (!spag->have_csum)
+       BUG_ON(sblock->page_count < 1);
+       if (!sblock->pagev[0].have_csum)
                return 0;
 
-       crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
+       on_disk_csum = sblock->pagev[0].csum;
+       page = sblock->pagev[0].page;
+       buffer = kmap_atomic(page, KM_USER0);
+
+       len = sdev->sectorsize;
+       index = 0;
+       for (;;) {
+               u64 l = min_t(u64, len, PAGE_SIZE);
+
+               crc = btrfs_csum_data(root, buffer, crc, l);
+               kunmap_atomic(buffer, KM_USER0);
+               len -= l;
+               if (len == 0)
+                       break;
+               index++;
+               BUG_ON(index >= sblock->page_count);
+               BUG_ON(!sblock->pagev[index].page);
+               page = sblock->pagev[index].page;
+               buffer = kmap_atomic(page, KM_USER0);
+       }
+
        btrfs_csum_final(crc, csum);
-       if (memcmp(csum, spag->csum, sdev->csum_size))
+       if (memcmp(csum, on_disk_csum, sdev->csum_size))
                fail = 1;
 
-       spin_lock(&sdev->stat_lock);
-       ++sdev->stat.data_extents_scrubbed;
-       sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
-       if (fail)
+       if (fail) {
+               spin_lock(&sdev->stat_lock);
                ++sdev->stat.csum_errors;
-       spin_unlock(&sdev->stat_lock);
+               spin_unlock(&sdev->stat_lock);
+       }
 
        return fail;
 }
 
-static int scrub_checksum_tree_block(struct scrub_dev *sdev,
-                                    struct scrub_page *spag, u64 logical,
-                                    void *buffer)
+static int scrub_checksum_tree_block(struct scrub_block *sblock)
 {
+       struct scrub_dev *sdev = sblock->sdev;
        struct btrfs_header *h;
        struct btrfs_root *root = sdev->dev->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
-       u8 csum[BTRFS_CSUM_SIZE];
+       u8 calculated_csum[BTRFS_CSUM_SIZE];
+       u8 on_disk_csum[BTRFS_CSUM_SIZE];
+       struct page *page;
+       void *mapped_buffer;
+       u64 mapped_size;
+       void *p;
        u32 crc = ~(u32)0;
        int fail = 0;
        int crc_fail = 0;
+       u64 len;
+       int index;
+
+       BUG_ON(sblock->page_count < 1);
+       page = sblock->pagev[0].page;
+       mapped_buffer = kmap_atomic(page, KM_USER0);
+       h = (struct btrfs_header *)mapped_buffer;
+       memcpy(on_disk_csum, h->csum, sdev->csum_size);
 
        /*
         * we don't use the getter functions here, as we
         * a) don't have an extent buffer and
         * b) the page is already kmapped
         */
-       h = (struct btrfs_header *)buffer;
 
-       if (logical != le64_to_cpu(h->bytenr))
+       if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
                ++fail;
 
-       if (spag->generation != le64_to_cpu(h->generation))
+       if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
                ++fail;
 
        if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -887,51 +1306,99 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev,
                   BTRFS_UUID_SIZE))
                ++fail;
 
-       crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
-                             PAGE_SIZE - BTRFS_CSUM_SIZE);
-       btrfs_csum_final(crc, csum);
-       if (memcmp(csum, h->csum, sdev->csum_size))
+       BUG_ON(sdev->nodesize != sdev->leafsize);
+       len = sdev->nodesize - BTRFS_CSUM_SIZE;
+       mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+       p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+       index = 0;
+       for (;;) {
+               u64 l = min_t(u64, len, mapped_size);
+
+               crc = btrfs_csum_data(root, p, crc, l);
+               kunmap_atomic(mapped_buffer, KM_USER0);
+               len -= l;
+               if (len == 0)
+                       break;
+               index++;
+               BUG_ON(index >= sblock->page_count);
+               BUG_ON(!sblock->pagev[index].page);
+               page = sblock->pagev[index].page;
+               mapped_buffer = kmap_atomic(page, KM_USER0);
+               mapped_size = PAGE_SIZE;
+               p = mapped_buffer;
+       }
+
+       btrfs_csum_final(crc, calculated_csum);
+       if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
                ++crc_fail;
 
-       spin_lock(&sdev->stat_lock);
-       ++sdev->stat.tree_extents_scrubbed;
-       sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
-       if (crc_fail)
-               ++sdev->stat.csum_errors;
-       if (fail)
-               ++sdev->stat.verify_errors;
-       spin_unlock(&sdev->stat_lock);
+       if (crc_fail || fail) {
+               spin_lock(&sdev->stat_lock);
+               if (crc_fail)
+                       ++sdev->stat.csum_errors;
+               if (fail)
+                       ++sdev->stat.verify_errors;
+               spin_unlock(&sdev->stat_lock);
+       }
 
        return fail || crc_fail;
 }
 
-static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
+static int scrub_checksum_super(struct scrub_block *sblock)
 {
        struct btrfs_super_block *s;
-       u64 logical;
-       struct scrub_dev *sdev = sbio->sdev;
+       struct scrub_dev *sdev = sblock->sdev;
        struct btrfs_root *root = sdev->dev->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
-       u8 csum[BTRFS_CSUM_SIZE];
+       u8 calculated_csum[BTRFS_CSUM_SIZE];
+       u8 on_disk_csum[BTRFS_CSUM_SIZE];
+       struct page *page;
+       void *mapped_buffer;
+       u64 mapped_size;
+       void *p;
        u32 crc = ~(u32)0;
        int fail = 0;
+       u64 len;
+       int index;
 
-       s = (struct btrfs_super_block *)buffer;
-       logical = sbio->logical;
+       BUG_ON(sblock->page_count < 1);
+       page = sblock->pagev[0].page;
+       mapped_buffer = kmap_atomic(page, KM_USER0);
+       s = (struct btrfs_super_block *)mapped_buffer;
+       memcpy(on_disk_csum, s->csum, sdev->csum_size);
 
-       if (logical != le64_to_cpu(s->bytenr))
+       if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
                ++fail;
 
-       if (sbio->spag[0].generation != le64_to_cpu(s->generation))
+       if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
                ++fail;
 
        if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
                ++fail;
 
-       crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
-                             PAGE_SIZE - BTRFS_CSUM_SIZE);
-       btrfs_csum_final(crc, csum);
-       if (memcmp(csum, s->csum, sbio->sdev->csum_size))
+       len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
+       mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+       p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+       index = 0;
+       for (;;) {
+               u64 l = min_t(u64, len, mapped_size);
+
+               crc = btrfs_csum_data(root, p, crc, l);
+               kunmap_atomic(mapped_buffer, KM_USER0);
+               len -= l;
+               if (len == 0)
+                       break;
+               index++;
+               BUG_ON(index >= sblock->page_count);
+               BUG_ON(!sblock->pagev[index].page);
+               page = sblock->pagev[index].page;
+               mapped_buffer = kmap_atomic(page, KM_USER0);
+               mapped_size = PAGE_SIZE;
+               p = mapped_buffer;
+       }
+
+       btrfs_csum_final(crc, calculated_csum);
+       if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
                ++fail;
 
        if (fail) {
@@ -948,6 +1415,23 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
        return fail;
 }
 
+static void scrub_block_get(struct scrub_block *sblock)
+{
+       atomic_inc(&sblock->ref_count);
+}
+
+static void scrub_block_put(struct scrub_block *sblock)
+{
+       if (atomic_dec_and_test(&sblock->ref_count)) {
+               int i;
+
+               for (i = 0; i < sblock->page_count; i++)
+                       if (sblock->pagev[i].page)
+                               __free_page(sblock->pagev[i].page);
+               kfree(sblock);
+       }
+}
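
scrub_block_get()/scrub_block_put() implement the classic free-on-last-put refcount. A self-contained sketch of the same pattern with C11 atomics (hypothetical blk type):

#include <stdatomic.h>
#include <stdlib.h>

/* hypothetical object following the get/put pattern above */
struct blk {
	atomic_int ref_count;
	/* ... payload ... */
};

static struct blk *blk_alloc(void)
{
	struct blk *b = calloc(1, sizeof(*b));

	if (b)
		atomic_store(&b->ref_count, 1); /* caller owns one ref */
	return b;
}

static void blk_get(struct blk *b)
{
	atomic_fetch_add(&b->ref_count, 1);
}

static void blk_put(struct blk *b)
{
	/* free on the 1 -> 0 transition, like scrub_block_put() */
	if (atomic_fetch_sub(&b->ref_count, 1) == 1)
		free(b);
}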
+
 static void scrub_submit(struct scrub_dev *sdev)
 {
        struct scrub_bio *sbio;
@@ -956,19 +1440,17 @@ static void scrub_submit(struct scrub_dev *sdev)
                return;
 
        sbio = sdev->bios[sdev->curr];
-       sbio->err = 0;
        sdev->curr = -1;
        atomic_inc(&sdev->in_flight);
 
        btrfsic_submit_bio(READ, sbio->bio);
 }
 
-static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
-                     u64 physical, u64 flags, u64 gen, int mirror_num,
-                     u8 *csum, int force)
+static int scrub_add_page_to_bio(struct scrub_dev *sdev,
+                                struct scrub_page *spage)
 {
+       struct scrub_block *sblock = spage->sblock;
        struct scrub_bio *sbio;
-       struct page *page;
        int ret;
 
 again:
@@ -981,7 +1463,7 @@ again:
                if (sdev->curr != -1) {
                        sdev->first_free = sdev->bios[sdev->curr]->next_free;
                        sdev->bios[sdev->curr]->next_free = -1;
-                       sdev->bios[sdev->curr]->count = 0;
+                       sdev->bios[sdev->curr]->page_count = 0;
                        spin_unlock(&sdev->list_lock);
                } else {
                        spin_unlock(&sdev->list_lock);
@@ -989,53 +1471,200 @@ again:
                }
        }
        sbio = sdev->bios[sdev->curr];
-       if (sbio->count == 0) {
+       if (sbio->page_count == 0) {
                struct bio *bio;
 
-               sbio->physical = physical;
-               sbio->logical = logical;
-               bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
-               if (!bio)
-                       return -ENOMEM;
+               sbio->physical = spage->physical;
+               sbio->logical = spage->logical;
+               bio = sbio->bio;
+               if (!bio) {
+                       bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio);
+                       if (!bio)
+                               return -ENOMEM;
+                       sbio->bio = bio;
+               }
 
                bio->bi_private = sbio;
                bio->bi_end_io = scrub_bio_end_io;
                bio->bi_bdev = sdev->dev->bdev;
-               bio->bi_sector = sbio->physical >> 9;
+               bio->bi_sector = spage->physical >> 9;
                sbio->err = 0;
-               sbio->bio = bio;
-       } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
-                  sbio->logical + sbio->count * PAGE_SIZE != logical) {
+       } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+                  spage->physical ||
+                  sbio->logical + sbio->page_count * PAGE_SIZE !=
+                  spage->logical) {
                scrub_submit(sdev);
                goto again;
        }
-       sbio->spag[sbio->count].flags = flags;
-       sbio->spag[sbio->count].generation = gen;
-       sbio->spag[sbio->count].have_csum = 0;
-       sbio->spag[sbio->count].mirror_num = mirror_num;
-
-       page = alloc_page(GFP_NOFS);
-       if (!page)
-               return -ENOMEM;
 
-       ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
-       if (!ret) {
-               __free_page(page);
+       sbio->pagev[sbio->page_count] = spage;
+       ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+       if (ret != PAGE_SIZE) {
+               if (sbio->page_count < 1) {
+                       bio_put(sbio->bio);
+                       sbio->bio = NULL;
+                       return -EIO;
+               }
                scrub_submit(sdev);
                goto again;
        }
 
-       if (csum) {
-               sbio->spag[sbio->count].have_csum = 1;
-               memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
+       scrub_block_get(sblock); /* one for the added page */
+       atomic_inc(&sblock->outstanding_pages);
+       sbio->page_count++;
+       if (sbio->page_count == sdev->pages_per_bio)
+               scrub_submit(sdev);
+
+       return 0;
+}
+
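Note on the pattern above: scrub_add_page_to_bio() only grows the current bio while the pages stay physically and logically contiguous; any gap, or a full bio, forces a submit followed by a retry against a fresh bio. A minimal user-space sketch of that accumulate/flush pattern, with illustrative names (batch, add_page) rather than kernel API:

    #include <stdio.h>

    #define BATCH_MAX   4
    #define PAGE_SZ     4096ULL

    struct batch {
            unsigned long long start;  /* physical offset of first page */
            int count;                 /* pages currently queued */
    };

    static void flush(struct batch *b)
    {
            if (b->count)
                    printf("submit %d page(s) at %llu\n", b->count, b->start);
            b->count = 0;
    }

    static void add_page(struct batch *b, unsigned long long physical)
    {
    again:
            if (b->count == 0) {
                    b->start = physical;            /* fresh batch */
            } else if (b->start + b->count * PAGE_SZ != physical) {
                    flush(b);                       /* discontiguous: submit */
                    goto again;                     /* and retry this page */
            }
            b->count++;
            if (b->count == BATCH_MAX)
                    flush(b);                       /* full: submit eagerly */
    }

    int main(void)
    {
            struct batch b = { 0 };

            add_page(&b, 0 * PAGE_SZ);
            add_page(&b, 1 * PAGE_SZ);
            add_page(&b, 8 * PAGE_SZ);  /* gap forces a flush */
            add_page(&b, 9 * PAGE_SZ);
            flush(&b);                  /* drain the tail */
            return 0;
    }

The goto-based retry mirrors the again: label above: after a flush, the same page is re-evaluated against an empty batch.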
+static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
+                      u64 physical, u64 flags, u64 gen, int mirror_num,
+                      u8 *csum, int force)
+{
+       struct scrub_block *sblock;
+       int index;
+
+       sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+       if (!sblock) {
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.malloc_errors++;
+               spin_unlock(&sdev->stat_lock);
+               return -ENOMEM;
+       }
+
+       /* one ref inside this function, plus one for each page added later */
+       atomic_set(&sblock->ref_count, 1);
+       sblock->sdev = sdev;
+       sblock->no_io_error_seen = 1;
+
+       for (index = 0; len > 0; index++) {
+               struct scrub_page *spage = sblock->pagev + index;
+               u64 l = min_t(u64, len, PAGE_SIZE);
+
+               BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+               spage->page = alloc_page(GFP_NOFS);
+               if (!spage->page) {
+                       spin_lock(&sdev->stat_lock);
+                       sdev->stat.malloc_errors++;
+                       spin_unlock(&sdev->stat_lock);
+                       while (index > 0) {
+                               index--;
+                               __free_page(sblock->pagev[index].page);
+                       }
+                       kfree(sblock);
+                       return -ENOMEM;
+               }
+               spage->sblock = sblock;
+               spage->bdev = sdev->dev->bdev;
+               spage->flags = flags;
+               spage->generation = gen;
+               spage->logical = logical;
+               spage->physical = physical;
+               spage->mirror_num = mirror_num;
+               if (csum) {
+                       spage->have_csum = 1;
+                       memcpy(spage->csum, csum, sdev->csum_size);
+               } else {
+                       spage->have_csum = 0;
+               }
+               sblock->page_count++;
+               len -= l;
+               logical += l;
+               physical += l;
+       }
+
+       BUG_ON(sblock->page_count == 0);
+       for (index = 0; index < sblock->page_count; index++) {
+               struct scrub_page *spage = sblock->pagev + index;
+               int ret;
+
+               ret = scrub_add_page_to_bio(sdev, spage);
+               if (ret) {
+                       scrub_block_put(sblock);
+                       return ret;
+               }
        }
-       ++sbio->count;
-       if (sbio->count == SCRUB_PAGES_PER_BIO || force)
+
+       if (force)
                scrub_submit(sdev);
 
+       /* last ref frees, either here or in the bio completion of the last page */
+       scrub_block_put(sblock);
        return 0;
 }
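The refcounting here is worth spelling out: scrub_pages() holds one reference for itself, scrub_add_page_to_bio() takes one more per queued page, and whoever drops the last reference frees the block. A compact user-space sketch of that scheme, assuming C11 atomics (struct block and the helpers are illustrative names, not kernel API):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct block {
            atomic_int refs;
            atomic_int outstanding_pages;
    };

    static void block_get(struct block *b)
    {
            atomic_fetch_add(&b->refs, 1);
    }

    static void block_put(struct block *b)
    {
            /* the last reference frees, no matter who drops it */
            if (atomic_fetch_sub(&b->refs, 1) == 1) {
                    printf("last reference dropped, freeing block\n");
                    free(b);
            }
    }

    static void page_done(struct block *b)
    {
            /* the last completed page triggers verification */
            if (atomic_fetch_sub(&b->outstanding_pages, 1) == 1)
                    printf("all pages completed, verify block\n");
            block_put(b);                   /* drop the per-page reference */
    }

    int main(void)
    {
            struct block *b = malloc(sizeof(*b));
            int i;

            atomic_init(&b->refs, 1);       /* the creator's reference */
            atomic_init(&b->outstanding_pages, 0);
            for (i = 0; i < 3; i++) {       /* one extra ref per queued page */
                    block_get(b);
                    atomic_fetch_add(&b->outstanding_pages, 1);
            }
            block_put(b);   /* creator done; in-flight pages keep it alive */
            for (i = 0; i < 3; i++)
                    page_done(b);           /* the last completion frees it */
            return 0;
    }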
 
+static void scrub_bio_end_io(struct bio *bio, int err)
+{
+       struct scrub_bio *sbio = bio->bi_private;
+       struct scrub_dev *sdev = sbio->sdev;
+       struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+
+       sbio->err = err;
+       sbio->bio = bio;
+
+       btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+}
+
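scrub_bio_end_io() runs in bio completion context, so it only records the outcome and defers the real work; the worker below then recovers the enclosing scrub_bio from the queued member with container_of(). A self-contained sketch of that recovery idiom (struct sbio and worker are illustrative names):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct work {
            int pending;        /* placeholder queue state */
    };

    struct sbio {
            int index;
            int err;
            struct work work;   /* this member is what gets queued */
    };

    static void worker(struct work *w)
    {
            /* recover the enclosing object from the embedded member */
            struct sbio *sbio = container_of(w, struct sbio, work);

            printf("worker sees sbio %d, err=%d\n", sbio->index, sbio->err);
    }

    int main(void)
    {
            struct sbio sbio = { .index = 7, .err = 0 };

            worker(&sbio.work);  /* only the embedded member crosses the queue */
            return 0;
    }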
+static void scrub_bio_end_io_worker(struct btrfs_work *work)
+{
+       struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+       struct scrub_dev *sdev = sbio->sdev;
+       int i;
+
+       BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
+       if (sbio->err) {
+               for (i = 0; i < sbio->page_count; i++) {
+                       struct scrub_page *spage = sbio->pagev[i];
+
+                       spage->io_error = 1;
+                       spage->sblock->no_io_error_seen = 0;
+               }
+       }
+
+       /* now complete the scrub_block items that have all pages completed */
+       for (i = 0; i < sbio->page_count; i++) {
+               struct scrub_page *spage = sbio->pagev[i];
+               struct scrub_block *sblock = spage->sblock;
+
+               if (atomic_dec_and_test(&sblock->outstanding_pages))
+                       scrub_block_complete(sblock);
+               scrub_block_put(sblock);
+       }
+
+       if (sbio->err) {
+               /*
+                * restore the bio to a pristine state (uptodate, index 0,
+                * full-page bvecs); it is not obvious why this is needed
+                * before the bio_put below
+                */
+               sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+               sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
+               sbio->bio->bi_phys_segments = 0;
+               sbio->bio->bi_idx = 0;
+
+               for (i = 0; i < sbio->page_count; i++) {
+                       struct bio_vec *bi;
+                       bi = &sbio->bio->bi_io_vec[i];
+                       bi->bv_offset = 0;
+                       bi->bv_len = PAGE_SIZE;
+               }
+       }
+
+       bio_put(sbio->bio);
+       sbio->bio = NULL;
+       spin_lock(&sdev->list_lock);
+       sbio->next_free = sdev->first_free;
+       sdev->first_free = sbio->index;
+       spin_unlock(&sdev->list_lock);
+       atomic_dec(&sdev->in_flight);
+       wake_up(&sdev->list_wait);
+}
+
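The tail of the worker returns the sbio to a free list that links slots by array index: next_free chains the free slots, first_free is the head, and -1 terminates the chain. A minimal sketch of that index-linked free list (names are illustrative; the spinlock that guards it in the kernel is omitted here):

    #include <stdio.h>

    #define NBIOS 4

    struct slot {
            int next_free;
    };

    static struct slot slots[NBIOS];
    static int first_free = -1;

    static void slot_release(int i)         /* push slot i onto the list */
    {
            slots[i].next_free = first_free;
            first_free = i;
    }

    static int slot_acquire(void)           /* pop the head slot, or -1 */
    {
            int i = first_free;

            if (i != -1) {
                    first_free = slots[i].next_free;
                    slots[i].next_free = -1;
            }
            return i;
    }

    int main(void)
    {
            int i;

            for (i = NBIOS - 1; i >= 0; i--)
                    slot_release(i);        /* initially all slots are free */
            printf("got %d\n", slot_acquire());
            printf("got %d\n", slot_acquire());
            slot_release(0);
            printf("got %d\n", slot_acquire()); /* most recently freed wins */
            return 0;
    }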
+static void scrub_block_complete(struct scrub_block *sblock)
+{
+       if (!sblock->no_io_error_seen)
+               scrub_handle_errored_block(sblock);
+       else
+               scrub_checksum(sblock);
+}
+
 static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
                           u8 *csum)
 {
@@ -1043,7 +1672,6 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
        int ret = 0;
        unsigned long i;
        unsigned long num_sectors;
-       u32 sectorsize = sdev->dev->dev_root->sectorsize;
 
        while (!list_empty(&sdev->csum_list)) {
                sum = list_first_entry(&sdev->csum_list,
@@ -1061,7 +1689,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
        if (!sum)
                return 0;
 
-       num_sectors = sum->len / sectorsize;
+       num_sectors = sum->len / sdev->sectorsize;
        for (i = 0; i < num_sectors; ++i) {
                if (sum->sums[i].bytenr == logical) {
                        memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
@@ -1082,9 +1710,28 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
 {
        int ret;
        u8 csum[BTRFS_CSUM_SIZE];
+       u32 blocksize;
+
+       if (flags & BTRFS_EXTENT_FLAG_DATA) {
+               blocksize = sdev->sectorsize;
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.data_extents_scrubbed++;
+               sdev->stat.data_bytes_scrubbed += len;
+               spin_unlock(&sdev->stat_lock);
+       } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+               BUG_ON(sdev->nodesize != sdev->leafsize);
+               blocksize = sdev->nodesize;
+               spin_lock(&sdev->stat_lock);
+               sdev->stat.tree_extents_scrubbed++;
+               sdev->stat.tree_bytes_scrubbed += len;
+               spin_unlock(&sdev->stat_lock);
+       } else {
+               blocksize = sdev->sectorsize;
+               BUG_ON(1);
+       }
 
        while (len) {
-               u64 l = min_t(u64, len, PAGE_SIZE);
+               u64 l = min_t(u64, len, blocksize);
                int have_csum = 0;
 
                if (flags & BTRFS_EXTENT_FLAG_DATA) {
@@ -1093,8 +1740,8 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
                        if (have_csum == 0)
                                ++sdev->stat.no_csum;
                }
-               ret = scrub_page(sdev, logical, l, physical, flags, gen,
-                                mirror_num, have_csum ? csum : NULL, 0);
+               ret = scrub_pages(sdev, logical, l, physical, flags, gen,
+                                 mirror_num, have_csum ? csum : NULL, 0);
                if (ret)
                        return ret;
                len -= l;
@@ -1159,6 +1806,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
        if (!path)
                return -ENOMEM;
 
+       /*
+        * work on the commit root. The related disk blocks are static
+        * as long as COW is applied. This means it is safe to rewrite
+        * them in order to repair disk errors without any race
+        * conditions.
+        */
        path->search_commit_root = 1;
        path->skip_locking = 1;
 
@@ -1512,11 +2164,11 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
 
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
-               if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+               if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
                        break;
 
-               ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
-                                BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
+               ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+                                 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
                if (ret)
                        return ret;
        }
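The loop above visits the fixed superblock mirror locations and stops once a copy would extend past the end of the device (hence the >= to > fix). A small sketch of the placement that btrfs_sb_offset() computes, 64KiB, 64MiB and 256GiB; the constants follow the btrfs headers, the rest is illustrative:

    #include <stdio.h>

    #define SUPER_INFO_OFFSET  (64 * 1024ULL)
    #define SUPER_MIRROR_MAX   3
    #define SUPER_MIRROR_SHIFT 12
    #define SUPER_INFO_SIZE    4096ULL

    static unsigned long long sb_offset(int mirror)
    {
            unsigned long long start = 16 * 1024ULL;

            /* copies are spaced exponentially: 64KiB, 64MiB, 256GiB */
            if (mirror)
                    return start << (SUPER_MIRROR_SHIFT * mirror);
            return SUPER_INFO_OFFSET;
    }

    int main(void)
    {
            unsigned long long total_bytes = 1ULL << 31; /* a 2 GiB device */
            int i;

            for (i = 0; i < SUPER_MIRROR_MAX; i++) {
                    unsigned long long bytenr = sb_offset(i);

                    if (bytenr + SUPER_INFO_SIZE > total_bytes)
                            break;          /* this copy would not fit */
                    printf("mirror %d at %llu\n", i, bytenr);
            }
            return 0;
    }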
@@ -1575,10 +2227,30 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
        /*
         * check some assumptions
         */
-       if (root->sectorsize != PAGE_SIZE ||
-           root->sectorsize != root->leafsize ||
-           root->sectorsize != root->nodesize) {
-               printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
+       if (root->nodesize != root->leafsize) {
+               printk(KERN_ERR
+                      "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
+                      root->nodesize, root->leafsize);
+               return -EINVAL;
+       }
+
+       if (root->nodesize > BTRFS_STRIPE_LEN) {
+               /*
+                * in this case scrub is unable to calculate the
+                * checksum the way it is currently implemented. Do
+                * not handle this situation at all because it is not
+                * expected to ever happen.
+                */
+               printk(KERN_ERR
+                      "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
+                      root->nodesize, BTRFS_STRIPE_LEN);
+               return -EINVAL;
+       }
+
+       if (root->sectorsize != PAGE_SIZE) {
+               /* not supported for data w/o checksums */
+               printk(KERN_ERR
+                      "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %llu) fails\n",
+                      root->sectorsize, (unsigned long long)PAGE_SIZE);
                return -EINVAL;
        }
 
@@ -1732,6 +2404,7 @@ int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
 
        return 0;
 }
+
 int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
index bc1f6ad18442bc728a10e9919b91162af1b60b4d..c6ffa58124192a9e069b1464d863b33300bcf6e9 100644 (file)
@@ -44,8 +44,9 @@
 #define BTRFS_SETGET_FUNCS(name, type, member, bits)                   \
 u##bits btrfs_##name(struct extent_buffer *eb, type *s);               \
 void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \
-u##bits btrfs_##name(struct extent_buffer *eb,                         \
-                                  type *s)                             \
+void btrfs_set_token_##name(struct extent_buffer *eb, type *s,         \
+                           u##bits val, struct btrfs_map_token *token); \
+u##bits btrfs_token_##name(struct extent_buffer *eb,                           \
+                          type *s, struct btrfs_map_token *token)      \
 {                                                                      \
        unsigned long part_offset = (unsigned long)s;                   \
        unsigned long offset = part_offset + offsetof(type, member);    \
@@ -54,9 +55,18 @@ u##bits btrfs_##name(struct extent_buffer *eb,                               \
        char *kaddr;                                            \
        unsigned long map_start;                                \
        unsigned long map_len;                                  \
+       unsigned long mem_len = sizeof(((type *)0)->member);    \
        u##bits res;                                            \
+       if (token && token->kaddr && token->offset <= offset && \
+           token->eb == eb &&                                  \
+          (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \
+               kaddr = token->kaddr;                           \
+               p = (type *)(kaddr + part_offset - token->offset);  \
+               res = le##bits##_to_cpu(p->member);             \
+               return res;                                     \
+       }                                                       \
        err = map_private_extent_buffer(eb, offset,             \
-                       sizeof(((type *)0)->member),            \
+                       mem_len,                                \
                        &kaddr, &map_start, &map_len);          \
        if (err) {                                              \
                __le##bits leres;                               \
@@ -65,10 +75,15 @@ u##bits btrfs_##name(struct extent_buffer *eb,                              \
        }                                                       \
        p = (type *)(kaddr + part_offset - map_start);          \
        res = le##bits##_to_cpu(p->member);                     \
+       if (token) {                                            \
+               token->kaddr = kaddr;                           \
+               token->offset = map_start;                      \
+               token->eb = eb;                                 \
+       }                                                       \
        return res;                                             \
 }                                                                      \
-void btrfs_set_##name(struct extent_buffer *eb,                                \
-                                   type *s, u##bits val)               \
+void btrfs_set_token_##name(struct extent_buffer *eb, type *s,         \
+                           u##bits val, struct btrfs_map_token *token) \
 {                                                                      \
        unsigned long part_offset = (unsigned long)s;                   \
        unsigned long offset = part_offset + offsetof(type, member);    \
@@ -77,8 +92,17 @@ void btrfs_set_##name(struct extent_buffer *eb,                              \
        char *kaddr;                                            \
        unsigned long map_start;                                \
        unsigned long map_len;                                  \
+       unsigned long mem_len = sizeof(((type *)0)->member);    \
+       if (token && token->kaddr && token->offset <= offset && \
+           token->eb == eb &&                                  \
+          (token->offset + PAGE_CACHE_SIZE >= offset + mem_len)) { \
+               kaddr = token->kaddr;                           \
+               p = (type *)(kaddr + part_offset - token->offset);  \
+               p->member = cpu_to_le##bits(val);               \
+               return;                                         \
+       }                                                       \
        err = map_private_extent_buffer(eb, offset,             \
-                       sizeof(((type *)0)->member),            \
+                       mem_len,                                \
                        &kaddr, &map_start, &map_len);          \
        if (err) {                                              \
                __le##bits val2;                                \
@@ -88,7 +112,22 @@ void btrfs_set_##name(struct extent_buffer *eb,                             \
        }                                                       \
        p = (type *)(kaddr + part_offset - map_start);          \
        p->member = cpu_to_le##bits(val);                       \
-}
+       if (token) {                                            \
+               token->kaddr = kaddr;                           \
+               token->offset = map_start;                      \
+               token->eb = eb;                                 \
+       }                                                       \
+}                                                              \
+void btrfs_set_##name(struct extent_buffer *eb,                        \
+                     type *s, u##bits val)                     \
+{                                                              \
+       btrfs_set_token_##name(eb, s, val, NULL);               \
+}                                                              \
+u##bits btrfs_##name(struct extent_buffer *eb,                 \
+                     type *s)                                  \
+{                                                              \
+       return btrfs_token_##name(eb, s, NULL);                 \
+}                                                              \
 
 #include "ctree.h"
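The token variants generated above cache the last mapped range (kaddr, offset, eb), so consecutive accessors that hit the same page skip the map_private_extent_buffer() call; the plain btrfs_##name()/btrfs_set_##name() wrappers simply pass a NULL token. A user-space sketch of that caching, with map_page() standing in for the expensive mapping step (all names are illustrative):

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SZ 4096UL

    struct token {
            char *kaddr;            /* cached mapping, NULL if empty */
            unsigned long offset;   /* buffer offset the mapping starts at */
    };

    static char buffer[2 * PAGE_SZ]; /* stands in for an extent buffer */
    static unsigned long nr_maps;    /* counts "expensive" mappings */

    /* stand-in for map_private_extent_buffer(): map the page at @offset */
    static char *map_page(unsigned long offset, unsigned long *map_start)
    {
            nr_maps++;
            *map_start = offset & ~(PAGE_SZ - 1);
            return buffer + *map_start;
    }

    static unsigned int get_u32(unsigned long offset, struct token *tok)
    {
            unsigned long map_start;
            unsigned int res;
            char *kaddr;

            if (tok->kaddr && tok->offset <= offset &&
                tok->offset + PAGE_SZ >= offset + sizeof(res)) {
                    kaddr = tok->kaddr;     /* cache hit: no remap */
                    map_start = tok->offset;
            } else {
                    kaddr = map_page(offset, &map_start);
                    tok->kaddr = kaddr;     /* remember for next time */
                    tok->offset = map_start;
            }
            memcpy(&res, kaddr + offset - map_start, sizeof(res));
            return res;
    }

    int main(void)
    {
            struct token tok = { 0 };

            memset(buffer, 0x42, sizeof(buffer));
            get_u32(0, &tok);
            get_u32(8, &tok);           /* same page: served from token */
            get_u32(PAGE_SZ + 4, &tok); /* new page: one more mapping */
            printf("mappings performed: %lu\n", nr_maps); /* prints 2 */
            return 0;
    }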
 
index 57305e88ea827b4bc65e47bc13787bfa70b40054..d64cd6cbdbb65075ddddc98bc923503bec320302 100644 (file)
@@ -4434,7 +4434,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
         * to silence the warning eg. on PowerPC 64.
         */
        if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
-               SetPageUptodate(sb->first_page);
+               SetPageUptodate(sb->pages[0]);
 
        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
        array_size = btrfs_super_sys_array_size(super_copy);