Merge git://git.jan-o-sch.net/btrfs-unstable into for-linus
author Chris Mason <chris.mason@oracle.com>
Thu, 29 Mar 2012 00:33:40 +0000 (20:33 -0400)
committer Chris Mason <chris.mason@oracle.com>
Thu, 29 Mar 2012 00:33:40 +0000 (20:33 -0400)
Conflicts:
fs/btrfs/transaction.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/backref.c
fs/btrfs/ioctl.c
fs/btrfs/scrub.c
fs/btrfs/super.c

diff --combined fs/btrfs/backref.c
index 4c79547f4a0c6db8eb65179ae7221c714304a9df,56136d9046f099b702d77e95cdf160ce8af85e2c..f4e90748940abf6c1f36f3186177e4640bd24546
@@@ -116,6 -116,7 +116,7 @@@ add_parent
   * to a logical address
   */
  static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+                                       int search_commit_root,
                                        struct __prelim_ref *ref,
                                        struct ulist *parents)
  {
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+       path->search_commit_root = !!search_commit_root;
  
        root_key.objectid = ref->root_id;
        root_key.type = BTRFS_ROOT_ITEM_KEY;
@@@ -188,6 -190,7 +190,7 @@@ out
   * resolve all indirect backrefs from the list
   */
  static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+                                  int search_commit_root,
                                   struct list_head *head)
  {
        int err;
                        continue;
                if (ref->count == 0)
                        continue;
-               err = __resolve_indirect_ref(fs_info, ref, parents);
+               err = __resolve_indirect_ref(fs_info, search_commit_root,
+                                            ref, parents);
                if (err) {
                        if (ret == 0)
                                ret = err;
@@@ -586,6 -590,7 +590,7 @@@ static int find_parent_nodes(struct btr
        struct btrfs_delayed_ref_head *head;
        int info_level = 0;
        int ret;
+       int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
        struct list_head prefs_delayed;
        struct list_head prefs;
        struct __prelim_ref *ref;
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+       path->search_commit_root = !!search_commit_root;
  
        /*
         * grab both a lock on the path and a lock on the delayed ref head.
@@@ -614,35 -620,39 +620,39 @@@ again
                goto out;
        BUG_ON(ret == 0);
  
-       /*
-        * look if there are updates for this ref queued and lock the head
-        */
-       delayed_refs = &trans->transaction->delayed_refs;
-       spin_lock(&delayed_refs->lock);
-       head = btrfs_find_delayed_ref_head(trans, bytenr);
-       if (head) {
-               if (!mutex_trylock(&head->mutex)) {
-                       atomic_inc(&head->node.refs);
-                       spin_unlock(&delayed_refs->lock);
-                       btrfs_release_path(path);
-                       /*
-                        * Mutex was contended, block until it's
-                        * released and try again
-                        */
-                       mutex_lock(&head->mutex);
-                       mutex_unlock(&head->mutex);
-                       btrfs_put_delayed_ref(&head->node);
-                       goto again;
-               }
-               ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
-               if (ret) {
-                       spin_unlock(&delayed_refs->lock);
-                       goto out;
+       if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+               /*
+                * look if there are updates for this ref queued and lock the
+                * head
+                */
+               delayed_refs = &trans->transaction->delayed_refs;
+               spin_lock(&delayed_refs->lock);
+               head = btrfs_find_delayed_ref_head(trans, bytenr);
+               if (head) {
+                       if (!mutex_trylock(&head->mutex)) {
+                               atomic_inc(&head->node.refs);
+                               spin_unlock(&delayed_refs->lock);
+                               btrfs_release_path(path);
+                               /*
+                                * Mutex was contended, block until it's
+                                * released and try again
+                                */
+                               mutex_lock(&head->mutex);
+                               mutex_unlock(&head->mutex);
+                               btrfs_put_delayed_ref(&head->node);
+                               goto again;
+                       }
+                       ret = __add_delayed_refs(head, seq, &info_key,
+                                                &prefs_delayed);
+                       if (ret) {
+                               spin_unlock(&delayed_refs->lock);
+                               goto out;
+                       }
                }
+               spin_unlock(&delayed_refs->lock);
        }
-       spin_unlock(&delayed_refs->lock);
  
        if (path->slots[0]) {
                struct extent_buffer *leaf;
        if (ret)
                goto out;
  
-       ret = __resolve_indirect_refs(fs_info, &prefs);
+       ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs);
        if (ret)
                goto out;
  
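The hunks above make find_parent_nodes() derive search_commit_root from the
transaction handle itself: callers that want to search the commit root pass a
sentinel value instead of a real handle. A minimal sketch of the mechanism,
assuming the sentinel is a cast null pointer defined in backref.h (that
definition is not part of this diff):

	/* assumed definition, consistent with the pointer comparisons above */
	#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)

	/* with no real transaction there are no delayed refs to merge in,
	 * so the delayed-ref head lookup is skipped and the btrfs_path is
	 * pointed at the commit root instead */
	int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
	path->search_commit_root = !!search_commit_root;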
@@@ -1074,8 -1084,7 +1084,7 @@@ int tree_backref_for_extent(unsigned lo
        return 0;
  }
  
- static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
-                               struct btrfs_path *path, u64 logical,
+ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical,
                                u64 orig_extent_item_objectid,
                                u64 extent_item_pos, u64 root,
                                iterate_extent_inodes_t *iterate, void *ctx)
   * calls iterate() for every inode that references the extent identified by
   * the given parameters.
   * when the iterator function returns a non-zero value, iteration stops.
-  * path is guaranteed to be in released state when iterate() is called.
   */
  int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
-                               struct btrfs_path *path,
                                u64 extent_item_objectid, u64 extent_item_pos,
+                               int search_commit_root,
                                iterate_extent_inodes_t *iterate, void *ctx)
  {
        int ret;
        struct list_head data_refs = LIST_HEAD_INIT(data_refs);
        struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
        struct btrfs_trans_handle *trans;
-       struct ulist *refs;
-       struct ulist *roots;
+       struct ulist *refs = NULL;
+       struct ulist *roots = NULL;
        struct ulist_node *ref_node = NULL;
        struct ulist_node *root_node = NULL;
        struct seq_list seq_elem;
-       struct btrfs_delayed_ref_root *delayed_refs;
-       trans = btrfs_join_transaction(fs_info->extent_root);
-       if (IS_ERR(trans))
-               return PTR_ERR(trans);
+       struct btrfs_delayed_ref_root *delayed_refs = NULL;
  
        pr_debug("resolving all inodes for extent %llu\n",
                        extent_item_objectid);
  
-       delayed_refs = &trans->transaction->delayed_refs;
-       spin_lock(&delayed_refs->lock);
-       btrfs_get_delayed_seq(delayed_refs, &seq_elem);
-       spin_unlock(&delayed_refs->lock);
+       if (search_commit_root) {
+               trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
+       } else {
+               trans = btrfs_join_transaction(fs_info->extent_root);
+               if (IS_ERR(trans))
+                       return PTR_ERR(trans);
+               delayed_refs = &trans->transaction->delayed_refs;
+               spin_lock(&delayed_refs->lock);
+               btrfs_get_delayed_seq(delayed_refs, &seq_elem);
+               spin_unlock(&delayed_refs->lock);
+       }
  
        ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
                                   extent_item_pos, seq_elem.seq,
                while (!ret && (root_node = ulist_next(roots, root_node))) {
                        pr_debug("root %llu references leaf %llu\n",
                                        root_node->val, ref_node->val);
-                       ret = iterate_leaf_refs(fs_info, path, ref_node->val,
+                       ret = iterate_leaf_refs(fs_info, ref_node->val,
                                                extent_item_objectid,
                                                extent_item_pos, root_node->val,
                                                iterate, ctx);
        ulist_free(refs);
        ulist_free(roots);
  out:
-       btrfs_put_delayed_seq(delayed_refs, &seq_elem);
-       btrfs_end_transaction(trans, fs_info->extent_root);
+       if (!search_commit_root) {
+               btrfs_put_delayed_seq(delayed_refs, &seq_elem);
+               btrfs_end_transaction(trans, fs_info->extent_root);
+       }
        return ret;
  }
  
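For illustration, a caller-side sketch of the reworked iterate_extent_inodes()
interface. The callback type follows iterate_extent_inodes_t as used by
build_ino_list() and scrub_print_warning_inode() elsewhere in this merge; the
helper below is hypothetical:

	/* hypothetical callback: count every inode referencing the extent;
	 * returning non-zero would stop the iteration early */
	static int count_inode_refs(u64 inum, u64 offset, u64 root, void *ctx)
	{
		u64 *count = ctx;

		(*count)++;
		return 0;
	}

	/* search_commit_root=1 resolves against the commit root without
	 * joining a transaction, as the scrub caller below does */
	u64 count = 0;
	ret = iterate_extent_inodes(fs_info, extent_item_objectid,
				    extent_item_pos, 1 /* search_commit_root */,
				    count_inode_refs, &count);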
@@@ -1210,6 -1225,7 +1225,7 @@@ int iterate_inodes_from_logical(u64 log
        int ret;
        u64 extent_item_pos;
        struct btrfs_key found_key;
+       int search_commit_root = path->search_commit_root;
  
        ret = extent_from_logical(fs_info, logical, path,
                                        &found_key);
                return ret;
  
        extent_item_pos = logical - found_key.objectid;
-       ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
-                                       extent_item_pos, iterate, ctx);
+       ret = iterate_extent_inodes(fs_info, found_key.objectid,
+                                       extent_item_pos, search_commit_root,
+                                       iterate, ctx);
  
        return ret;
  }
@@@ -1342,6 -1359,12 +1359,6 @@@ int paths_from_inode(u64 inum, struct i
                                inode_to_path, ipath);
  }
  
 -/*
 - * allocates space to return multiple file system paths for an inode.
 - * total_bytes to allocate are passed, note that space usable for actual path
 - * information will be total_bytes - sizeof(struct inode_fs_paths).
 - * the returned pointer must be freed with free_ipath() in the end.
 - */
  struct btrfs_data_container *init_data_container(u32 total_bytes)
  {
        struct btrfs_data_container *data;
@@@ -1397,6 -1420,5 +1414,6 @@@ struct inode_fs_paths *init_ipath(s32 t
  
  void free_ipath(struct inode_fs_paths *ipath)
  {
 +      kfree(ipath->fspath);
        kfree(ipath);
  }
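The free_ipath() change plugs a small leak: init_ipath() allocates the fspath
data container separately (via init_data_container()), so both allocations
must be released. A usage sketch under that assumption:

	struct inode_fs_paths *ipath;

	/* allocates both the inode_fs_paths struct and ipath->fspath */
	ipath = init_ipath(4096, fs_root, path);
	if (IS_ERR(ipath))
		return PTR_ERR(ipath);
	/* ... resolve paths, e.g. via paths_from_inode(inum, ipath) ... */
	free_ipath(ipath);	/* now frees ipath->fspath as well */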
diff --combined fs/btrfs/ioctl.c
index 20580920071470168a645db08fe60f84166ad5af,013c6371e3e8d707ba2020cd385b2f5067adabaa..a979ab7d396746413348fa7ad3ee72e32d59da0e
@@@ -206,7 -206,7 +206,7 @@@ static int btrfs_ioctl_setflags(struct 
                }
        }
  
 -      ret = mnt_want_write(file->f_path.mnt);
 +      ret = mnt_want_write_file(file);
        if (ret)
                goto out_unlock;
  
                inode->i_flags = i_oldflags;
        }
  
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
   out_unlock:
        mutex_unlock(&inode->i_mutex);
        return ret;
@@@ -286,13 -286,14 +286,13 @@@ static int btrfs_ioctl_getversion(struc
  
  static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
 -      struct btrfs_fs_info *fs_info = root->fs_info;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
        struct btrfs_device *device;
        struct request_queue *q;
        struct fstrim_range range;
        u64 minlen = ULLONG_MAX;
        u64 num_devices = 0;
 -      u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
 +      u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
        int ret;
  
        if (!capable(CAP_SYS_ADMIN))
  
        range.len = min(range.len, total_bytes - range.start);
        range.minlen = max(range.minlen, minlen);
 -      ret = btrfs_trim_fs(root, &range);
 +      ret = btrfs_trim_fs(fs_info->tree_root, &range);
        if (ret < 0)
                return ret;
  
@@@ -425,37 -426,22 +425,37 @@@ static noinline int create_subvol(struc
  
        key.offset = (u64)-1;
        new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
 -      BUG_ON(IS_ERR(new_root));
 +      if (IS_ERR(new_root)) {
 +              btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
 +              ret = PTR_ERR(new_root);
 +              goto fail;
 +      }
  
        btrfs_record_root_in_trans(trans, new_root);
  
        ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
 +      if (ret) {
 +              /* We potentially lose an unused inode item here */
 +              btrfs_abort_transaction(trans, root, ret);
 +              goto fail;
 +      }
 +
        /*
         * insert the directory item
         */
        ret = btrfs_set_inode_index(dir, &index);
 -      BUG_ON(ret);
 +      if (ret) {
 +              btrfs_abort_transaction(trans, root, ret);
 +              goto fail;
 +      }
  
        ret = btrfs_insert_dir_item(trans, root,
                                    name, namelen, dir, &key,
                                    BTRFS_FT_DIR, index);
 -      if (ret)
 +      if (ret) {
 +              btrfs_abort_transaction(trans, root, ret);
                goto fail;
 +      }
  
        btrfs_i_size_write(dir, dir->i_size + namelen * 2);
        ret = btrfs_update_inode(trans, root, dir);
@@@ -812,9 -798,9 +812,9 @@@ static int should_defrag_range(struct i
  
        if (!em) {
                /* get the big lock and read metadata off disk */
 -              lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
 +              lock_extent(io_tree, start, start + len - 1);
                em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 -              unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
 +              unlock_extent(io_tree, start, start + len - 1);
  
                if (IS_ERR(em))
                        return 0;
@@@ -902,10 -888,10 +902,10 @@@ again
                page_start = page_offset(page);
                page_end = page_start + PAGE_CACHE_SIZE - 1;
                while (1) {
 -                      lock_extent(tree, page_start, page_end, GFP_NOFS);
 +                      lock_extent(tree, page_start, page_end);
                        ordered = btrfs_lookup_ordered_extent(inode,
                                                              page_start);
 -                      unlock_extent(tree, page_start, page_end, GFP_NOFS);
 +                      unlock_extent(tree, page_start, page_end);
                        if (!ordered)
                                break;
  
        page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
  
        lock_extent_bits(&BTRFS_I(inode)->io_tree,
 -                       page_start, page_end - 1, 0, &cached_state,
 -                       GFP_NOFS);
 +                       page_start, page_end - 1, 0, &cached_state);
        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
                          page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
                          EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
@@@ -1896,7 -1883,7 +1896,7 @@@ static noinline int btrfs_ioctl_snap_de
                goto out;
        }
  
 -      err = mnt_want_write(file->f_path.mnt);
 +      err = mnt_want_write_file(file);
        if (err)
                goto out;
  
                                dest->root_key.objectid,
                                dentry->d_name.name,
                                dentry->d_name.len);
 -      BUG_ON(ret);
 +      if (ret) {
 +              err = ret;
 +              btrfs_abort_transaction(trans, root, ret);
 +              goto out_end_trans;
 +      }
  
        btrfs_record_root_in_trans(trans, dest);
  
                ret = btrfs_insert_orphan_item(trans,
                                        root->fs_info->tree_root,
                                        dest->root_key.objectid);
 -              BUG_ON(ret);
 +              if (ret) {
 +                      btrfs_abort_transaction(trans, root, ret);
 +                      err = ret;
 +                      goto out_end_trans;
 +              }
        }
 -
 +out_end_trans:
        ret = btrfs_end_transaction(trans, root);
 -      BUG_ON(ret);
 +      if (ret && !err)
 +              err = ret;
        inode->i_flags |= S_DEAD;
  out_up_write:
        up_write(&root->fs_info->subvol_sem);
@@@ -2021,7 -1999,7 +2021,7 @@@ out_dput
        dput(dentry);
  out_unlock_dir:
        mutex_unlock(&dir->i_mutex);
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
  out:
        kfree(vol_args);
        return err;
@@@ -2037,7 -2015,7 +2037,7 @@@ static int btrfs_ioctl_defrag(struct fi
        if (btrfs_root_readonly(root))
                return -EROFS;
  
 -      ret = mnt_want_write(file->f_path.mnt);
 +      ret = mnt_want_write_file(file);
        if (ret)
                return ret;
  
                ret = -EINVAL;
        }
  out:
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
        return ret;
  }
  
@@@ -2267,7 -2245,7 +2267,7 @@@ static noinline long btrfs_ioctl_clone(
        if (btrfs_root_readonly(root))
                return -EROFS;
  
 -      ret = mnt_want_write(file->f_path.mnt);
 +      ret = mnt_want_write_file(file);
        if (ret)
                return ret;
  
           another, and lock file content */
        while (1) {
                struct btrfs_ordered_extent *ordered;
 -              lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
 +              lock_extent(&BTRFS_I(src)->io_tree, off, off+len);
                ordered = btrfs_lookup_first_ordered_extent(src, off+len);
                if (!ordered &&
                    !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
                                   EXTENT_DELALLOC, 0, NULL))
                        break;
 -              unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
 +              unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
                btrfs_wait_ordered_range(src, off, len);
                                                         new_key.offset,
                                                         new_key.offset + datal,
                                                         &hint_byte, 1);
 -                              BUG_ON(ret);
 +                              if (ret) {
 +                                      btrfs_abort_transaction(trans, root,
 +                                                              ret);
 +                                      btrfs_end_transaction(trans, root);
 +                                      goto out;
 +                              }
  
                                ret = btrfs_insert_empty_item(trans, root, path,
                                                              &new_key, size);
 -                              BUG_ON(ret);
 +                              if (ret) {
 +                                      btrfs_abort_transaction(trans, root,
 +                                                              ret);
 +                                      btrfs_end_transaction(trans, root);
 +                                      goto out;
 +                              }
  
                                leaf = path->nodes[0];
                                slot = path->slots[0];
                                                        btrfs_ino(inode),
                                                        new_key.offset - datao,
                                                        0);
 -                                      BUG_ON(ret);
 +                                      if (ret) {
 +                                              btrfs_abort_transaction(trans,
 +                                                                      root,
 +                                                                      ret);
 +                                              btrfs_end_transaction(trans,
 +                                                                    root);
 +                                              goto out;
 +                                      }
                                }
                        } else if (type == BTRFS_FILE_EXTENT_INLINE) {
                                u64 skip = 0;
                                                         new_key.offset,
                                                         new_key.offset + datal,
                                                         &hint_byte, 1);
 -                              BUG_ON(ret);
 +                              if (ret) {
 +                                      btrfs_abort_transaction(trans, root,
 +                                                              ret);
 +                                      btrfs_end_transaction(trans, root);
 +                                      goto out;
 +                              }
  
                                ret = btrfs_insert_empty_item(trans, root, path,
                                                              &new_key, size);
 -                              BUG_ON(ret);
 +                              if (ret) {
 +                                      btrfs_abort_transaction(trans, root,
 +                                                              ret);
 +                                      btrfs_end_transaction(trans, root);
 +                                      goto out;
 +                              }
  
                                if (skip) {
                                        u32 start =
                                btrfs_i_size_write(inode, endoff);
  
                        ret = btrfs_update_inode(trans, root, inode);
 -                      BUG_ON(ret);
 -                      btrfs_end_transaction(trans, root);
 +                      if (ret) {
 +                              btrfs_abort_transaction(trans, root, ret);
 +                              btrfs_end_transaction(trans, root);
 +                              goto out;
 +                      }
 +                      ret = btrfs_end_transaction(trans, root);
                }
  next:
                btrfs_release_path(path);
        ret = 0;
  out:
        btrfs_release_path(path);
 -      unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
 +      unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
  out_unlock:
        mutex_unlock(&src->i_mutex);
        mutex_unlock(&inode->i_mutex);
  out_fput:
        fput(src_file);
  out_drop_write:
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
        return ret;
  }
  
@@@ -2654,7 -2600,7 +2654,7 @@@ static long btrfs_ioctl_trans_start(str
        if (btrfs_root_readonly(root))
                goto out;
  
 -      ret = mnt_want_write(file->f_path.mnt);
 +      ret = mnt_want_write_file(file);
        if (ret)
                goto out;
  
  
  out_drop:
        atomic_dec(&root->fs_info->open_ioctl_trans);
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
  out:
        return ret;
  }
@@@ -2905,7 -2851,7 +2905,7 @@@ long btrfs_ioctl_trans_end(struct file 
  
        atomic_dec(&root->fs_info->open_ioctl_trans);
  
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
        return 0;
  }
  
@@@ -3121,8 -3067,8 +3121,8 @@@ static long btrfs_ioctl_logical_to_ino(
                goto out;
  
        extent_item_pos = loi->logical - key.objectid;
-       ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
-                                       extent_item_pos, build_ino_list,
+       ret = iterate_extent_inodes(root->fs_info, key.objectid,
+                                       extent_item_pos, 0, build_ino_list,
                                        inodes);
  
        if (ret < 0)
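Throughout fs/btrfs/ioctl.c the mnt_want_write(file->f_path.mnt) /
mnt_drop_write(file->f_path.mnt) pairs are switched to the file-based VFS
helpers. The resulting pattern, sketched:

	ret = mnt_want_write_file(file);	/* take write access to the mount */
	if (ret)
		return ret;
	/* ... perform the write side of the ioctl ... */
	mnt_drop_write_file(file);		/* always paired with the want call */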
diff --combined fs/btrfs/scrub.c
index 07e59d97551a28bcfb477a98152a8f2ee89e56cf,b9b84cdfc3591c0b43f07f89a232eaa51d9acd0d..c9a2c1aef4bd0d21dd412c6be7ab639487dab57e
   * Future enhancements:
   *  - In case an unrepairable extent is encountered, track which files are
   *    affected and report them
 - *  - In case of a read error on files with nodatasum, map the file and read
 - *    the extent to trigger a writeback of the good copy
   *  - track and record media errors, throw out bad devices
   *  - add a mode to also read unallocated space
   */
  
 -struct scrub_bio;
 -struct scrub_page;
 +struct scrub_block;
  struct scrub_dev;
 -static void scrub_bio_end_io(struct bio *bio, int err);
 -static void scrub_checksum(struct btrfs_work *work);
 -static int scrub_checksum_data(struct scrub_dev *sdev,
 -                             struct scrub_page *spag, void *buffer);
 -static int scrub_checksum_tree_block(struct scrub_dev *sdev,
 -                                   struct scrub_page *spag, u64 logical,
 -                                   void *buffer);
 -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
 -static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
 -static void scrub_fixup_end_io(struct bio *bio, int err);
 -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
 -                        struct page *page);
 -static void scrub_fixup(struct scrub_bio *sbio, int ix);
  
  #define SCRUB_PAGES_PER_BIO   16      /* 64k per bio */
  #define SCRUB_BIOS_PER_DEV    16      /* 1 MB per device in flight */
 +#define SCRUB_MAX_PAGES_PER_BLOCK     16      /* 64k per node/leaf/sector */
  
  struct scrub_page {
 +      struct scrub_block      *sblock;
 +      struct page             *page;
 +      struct block_device     *bdev;
        u64                     flags;  /* extent flags */
        u64                     generation;
 -      int                     mirror_num;
 -      int                     have_csum;
 +      u64                     logical;
 +      u64                     physical;
 +      struct {
 +              unsigned int    mirror_num:8;
 +              unsigned int    have_csum:1;
 +              unsigned int    io_error:1;
 +      };
        u8                      csum[BTRFS_CSUM_SIZE];
  };
  
@@@ -70,25 -77,12 +70,25 @@@ struct scrub_bio 
        int                     err;
        u64                     logical;
        u64                     physical;
 -      struct scrub_page       spag[SCRUB_PAGES_PER_BIO];
 -      u64                     count;
 +      struct scrub_page       *pagev[SCRUB_PAGES_PER_BIO];
 +      int                     page_count;
        int                     next_free;
        struct btrfs_work       work;
  };
  
 +struct scrub_block {
 +      struct scrub_page       pagev[SCRUB_MAX_PAGES_PER_BLOCK];
 +      int                     page_count;
 +      atomic_t                outstanding_pages;
 +      atomic_t                ref_count; /* free mem on transition to zero */
 +      struct scrub_dev        *sdev;
 +      struct {
 +              unsigned int    header_error:1;
 +              unsigned int    checksum_error:1;
 +              unsigned int    no_io_error_seen:1;
 +      };
 +};
 +
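The per-page/per-block split above implies a small ownership hierarchy; a
hedged reading of the new fields:

	/*
	 * scrub_bio:   the I/O unit; pagev[] holds up to SCRUB_PAGES_PER_BIO
	 *              pointers to scrub_pages, possibly from different blocks
	 * scrub_block: the checksum unit (one node/leaf/sector); embeds up to
	 *              SCRUB_MAX_PAGES_PER_BLOCK scrub_pages and is refcounted
	 *              via ref_count, with per-block error state bits
	 * scrub_page:  one page of data; back-pointer to its sblock plus
	 *              logical/physical addresses and per-page io_error state
	 */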
  struct scrub_dev {
        struct scrub_bio        *bios[SCRUB_BIOS_PER_DEV];
        struct btrfs_device     *dev;
        struct list_head        csum_list;
        atomic_t                cancel_req;
        int                     readonly;
 +      int                     pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
 +      u32                     sectorsize;
 +      u32                     nodesize;
 +      u32                     leafsize;
        /*
         * statistics
         */
@@@ -134,43 -124,6 +134,43 @@@ struct scrub_warning 
        int                     scratch_bufsize;
  };
  
 +
 +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 +static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 +                                   struct btrfs_mapping_tree *map_tree,
 +                                   u64 length, u64 logical,
 +                                   struct scrub_block *sblock);
 +static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 +                             struct scrub_block *sblock, int is_metadata,
 +                             int have_csum, u8 *csum, u64 generation,
 +                             u16 csum_size);
 +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 +                                       struct scrub_block *sblock,
 +                                       int is_metadata, int have_csum,
 +                                       const u8 *csum, u64 generation,
 +                                       u16 csum_size);
 +static void scrub_complete_bio_end_io(struct bio *bio, int err);
 +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 +                                           struct scrub_block *sblock_good,
 +                                           int force_write);
 +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 +                                          struct scrub_block *sblock_good,
 +                                          int page_num, int force_write);
 +static int scrub_checksum_data(struct scrub_block *sblock);
 +static int scrub_checksum_tree_block(struct scrub_block *sblock);
 +static int scrub_checksum_super(struct scrub_block *sblock);
 +static void scrub_block_get(struct scrub_block *sblock);
 +static void scrub_block_put(struct scrub_block *sblock);
 +static int scrub_add_page_to_bio(struct scrub_dev *sdev,
 +                               struct scrub_page *spage);
 +static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
 +                     u64 physical, u64 flags, u64 gen, int mirror_num,
 +                     u8 *csum, int force);
 +static void scrub_bio_end_io(struct bio *bio, int err);
 +static void scrub_bio_end_io_worker(struct btrfs_work *work);
 +static void scrub_block_complete(struct scrub_block *sblock);
 +
 +
  static void scrub_free_csums(struct scrub_dev *sdev)
  {
        while (!list_empty(&sdev->csum_list)) {
        }
  }
  
 -static void scrub_free_bio(struct bio *bio)
 -{
 -      int i;
 -      struct page *last_page = NULL;
 -
 -      if (!bio)
 -              return;
 -
 -      for (i = 0; i < bio->bi_vcnt; ++i) {
 -              if (bio->bi_io_vec[i].bv_page == last_page)
 -                      continue;
 -              last_page = bio->bi_io_vec[i].bv_page;
 -              __free_page(last_page);
 -      }
 -      bio_put(bio);
 -}
 -
  static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
  {
        int i;
        if (!sdev)
                return;
  
 +      /* this can happen when scrub is cancelled */
 +      if (sdev->curr != -1) {
 +              struct scrub_bio *sbio = sdev->bios[sdev->curr];
 +
 +              for (i = 0; i < sbio->page_count; i++) {
 +                      BUG_ON(!sbio->pagev[i]);
 +                      BUG_ON(!sbio->pagev[i]->page);
 +                      scrub_block_put(sbio->pagev[i]->sblock);
 +              }
 +              bio_put(sbio->bio);
 +      }
 +
        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
                struct scrub_bio *sbio = sdev->bios[i];
  
                if (!sbio)
                        break;
 -
 -              scrub_free_bio(sbio->bio);
                kfree(sbio);
        }
  
@@@ -219,16 -179,11 +219,16 @@@ struct scrub_dev *scrub_setup_dev(struc
        struct scrub_dev *sdev;
        int             i;
        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
 +      int pages_per_bio;
  
 +      pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
 +                            bio_get_nr_vecs(dev->bdev));
        sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
        if (!sdev)
                goto nomem;
        sdev->dev = dev;
 +      sdev->pages_per_bio = pages_per_bio;
 +      sdev->curr = -1;
        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
                struct scrub_bio *sbio;
  
  
                sbio->index = i;
                sbio->sdev = sdev;
 -              sbio->count = 0;
 -              sbio->work.func = scrub_checksum;
 +              sbio->page_count = 0;
 +              sbio->work.func = scrub_bio_end_io_worker;
  
                if (i != SCRUB_BIOS_PER_DEV-1)
                        sdev->bios[i]->next_free = i + 1;
                        sdev->bios[i]->next_free = -1;
        }
        sdev->first_free = 0;
 -      sdev->curr = -1;
 +      sdev->nodesize = dev->dev_root->nodesize;
 +      sdev->leafsize = dev->dev_root->leafsize;
 +      sdev->sectorsize = dev->dev_root->sectorsize;
        atomic_set(&sdev->in_flight, 0);
        atomic_set(&sdev->fixup_cnt, 0);
        atomic_set(&sdev->cancel_req, 0);
@@@ -341,9 -294,10 +341,9 @@@ err
        return 0;
  }
  
 -static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
 -                              int ix)
 +static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
  {
 -      struct btrfs_device *dev = sbio->sdev->dev;
 +      struct btrfs_device *dev = sblock->sdev->dev;
        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
        struct btrfs_path *path;
        struct btrfs_key found_key;
  
        swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
        swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
 -      swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
 -      swarn.logical = sbio->logical + ix * PAGE_SIZE;
 +      BUG_ON(sblock->page_count < 1);
 +      swarn.sector = (sblock->pagev[0].physical) >> 9;
 +      swarn.logical = sblock->pagev[0].logical;
        swarn.errstr = errstr;
        swarn.dev = dev;
        swarn.msg_bufsize = bufsize;
                do {
                        ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
                                                        &ref_root, &ref_level);
 -                      printk(KERN_WARNING "%s at logical %llu on dev %s, "
 +                      printk(KERN_WARNING
 +                              "btrfs: %s at logical %llu on dev %s, "
                                "sector %llu: metadata %s (level %d) in tree "
                                "%llu\n", errstr, swarn.logical, dev->name,
                                (unsigned long long)swarn.sector,
                } while (ret != 1);
        } else {
                swarn.path = path;
-               iterate_extent_inodes(fs_info, path, found_key.objectid,
-                                       extent_item_pos,
+               iterate_extent_inodes(fs_info, found_key.objectid,
+                                       extent_item_pos, 1,
                                        scrub_print_warning_inode, &swarn);
        }
  
@@@ -579,9 -531,9 +579,9 @@@ out
                spin_lock(&sdev->stat_lock);
                ++sdev->stat.uncorrectable_errors;
                spin_unlock(&sdev->stat_lock);
 -              printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
 -                                      "(nodatasum) error at logical %llu\n",
 -                                      fixup->logical);
 +              printk_ratelimited(KERN_ERR
 +                      "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 +                      (unsigned long long)fixup->logical, sdev->dev->name);
        }
  
        btrfs_free_path(path);
  }
  
  /*
 - * scrub_recheck_error gets called when either verification of the page
 - * failed or the bio failed to read, e.g. with EIO. In the latter case,
 - * recheck_error gets called for every page in the bio, even though only
 - * one may be bad
 + * scrub_handle_errored_block gets called when either verification of the
 + * pages failed or the bio failed to read, e.g. with EIO. In the latter
 + * case, this function handles all pages in the bio, even though only one
 + * may be bad.
 + * The goal of this function is to repair the errored block by using the
 + * contents of one of the mirrors.
   */
 -static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
 +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
  {
 -      struct scrub_dev *sdev = sbio->sdev;
 -      u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
 +      struct scrub_dev *sdev = sblock_to_check->sdev;
 +      struct btrfs_fs_info *fs_info;
 +      u64 length;
 +      u64 logical;
 +      u64 generation;
 +      unsigned int failed_mirror_index;
 +      unsigned int is_metadata;
 +      unsigned int have_csum;
 +      u8 *csum;
 +      struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 +      struct scrub_block *sblock_bad;
 +      int ret;
 +      int mirror_index;
 +      int page_num;
 +      int success;
        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 -                                      DEFAULT_RATELIMIT_BURST);
 +                                    DEFAULT_RATELIMIT_BURST);
 +
 +      BUG_ON(sblock_to_check->page_count < 1);
 +      fs_info = sdev->dev->dev_root->fs_info;
 +      length = sblock_to_check->page_count * PAGE_SIZE;
 +      logical = sblock_to_check->pagev[0].logical;
 +      generation = sblock_to_check->pagev[0].generation;
 +      BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
 +      failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
 +      is_metadata = !(sblock_to_check->pagev[0].flags &
 +                      BTRFS_EXTENT_FLAG_DATA);
 +      have_csum = sblock_to_check->pagev[0].have_csum;
 +      csum = sblock_to_check->pagev[0].csum;
  
 -      if (sbio->err) {
 -              if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
 -                                 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
 -                      if (scrub_fixup_check(sbio, ix) == 0)
 -                              return 0;
 -              }
 -              if (__ratelimit(&_rs))
 -                      scrub_print_warning("i/o error", sbio, ix);
 -      } else {
 -              if (__ratelimit(&_rs))
 -                      scrub_print_warning("checksum error", sbio, ix);
 +      /*
 +       * read all mirrors one after the other. This includes re-reading
 +       * the extent or metadata block that failed (the reason this
 +       * fixup code was called), this time page by page, in order to
 +       * know which pages caused I/O errors and which ones are good
 +       * (for all mirrors).
 +       * The goal is to handle the situation where more than one
 +       * mirror contains I/O errors, but the errors do not
 +       * overlap, i.e. the data can be repaired by selecting the
 +       * pages from those mirrors without I/O error on the
 +       * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
 +       * would be that mirror #1 has an I/O error on the first page,
 +       * the second page is good, and mirror #2 has an I/O error on
 +       * the second page, but the first page is good.
 +       * Then the first page of the first mirror can be repaired by
 +       * taking the first page of the second mirror, and the
 +       * second page of the second mirror can be repaired by
 +       * copying the contents of the 2nd page of the 1st mirror.
 +       * One more note: if the pages of one mirror contain I/O
 +       * errors, the checksum cannot be verified. In order to get
 +       * the best data for repairing, the first attempt is to find
 +       * a mirror without I/O errors and with a validated checksum.
 +       * Only if this is not possible are the pages picked from
 +       * mirrors with I/O errors, without considering the checksum.
 +       * In the latter case, the checksum of the repaired area is
 +       * verified at the end in order to correctly maintain the
 +       * statistics.
 +       */
 +
 +      sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
 +                                   sizeof(*sblocks_for_recheck),
 +                                   GFP_NOFS);
 +      if (!sblocks_for_recheck) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.malloc_errors++;
 +              sdev->stat.read_errors++;
 +              sdev->stat.uncorrectable_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              goto out;
        }
  
 -      spin_lock(&sdev->stat_lock);
 -      ++sdev->stat.read_errors;
 -      spin_unlock(&sdev->stat_lock);
 +      /* setup the context, map the logical blocks and alloc the pages */
 +      ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
 +                                      logical, sblocks_for_recheck);
 +      if (ret) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.read_errors++;
 +              sdev->stat.uncorrectable_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              goto out;
 +      }
 +      BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
 +      sblock_bad = sblocks_for_recheck + failed_mirror_index;
  
 -      scrub_fixup(sbio, ix);
 -      return 1;
 -}
 +      /* build and submit the bios for the failed mirror, check checksums */
 +      ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
 +                                csum, generation, sdev->csum_size);
 +      if (ret) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.read_errors++;
 +              sdev->stat.uncorrectable_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              goto out;
 +      }
  
 -static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
 -{
 -      int ret = 1;
 -      struct page *page;
 -      void *buffer;
 -      u64 flags = sbio->spag[ix].flags;
 +      if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 +          sblock_bad->no_io_error_seen) {
 +              /*
 +               * the error disappeared after reading page by page, or
 +               * the area was part of a huge bio and other parts of the
 +               * bio caused I/O errors, or the block layer merged several
 +               * read requests into one and the error is caused by a
 +               * different bio (usually one of the two latter cases is
 +               * the cause)
 +               */
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.unverified_errors++;
 +              spin_unlock(&sdev->stat_lock);
  
 -      page = sbio->bio->bi_io_vec[ix].bv_page;
 -      buffer = kmap_atomic(page, KM_USER0);
 -      if (flags & BTRFS_EXTENT_FLAG_DATA) {
 -              ret = scrub_checksum_data(sbio->sdev,
 -                                        sbio->spag + ix, buffer);
 -      } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 -              ret = scrub_checksum_tree_block(sbio->sdev,
 -                                              sbio->spag + ix,
 -                                              sbio->logical + ix * PAGE_SIZE,
 -                                              buffer);
 -      } else {
 -              WARN_ON(1);
 +              goto out;
        }
 -      kunmap_atomic(buffer, KM_USER0);
  
 -      return ret;
 -}
 +      if (!sblock_bad->no_io_error_seen) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.read_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              if (__ratelimit(&_rs))
 +                      scrub_print_warning("i/o error", sblock_to_check);
 +      } else if (sblock_bad->checksum_error) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.csum_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              if (__ratelimit(&_rs))
 +                      scrub_print_warning("checksum error", sblock_to_check);
 +      } else if (sblock_bad->header_error) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.verify_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              if (__ratelimit(&_rs))
 +                      scrub_print_warning("checksum/header error",
 +                                          sblock_to_check);
 +      }
  
 -static void scrub_fixup_end_io(struct bio *bio, int err)
 -{
 -      complete((struct completion *)bio->bi_private);
 -}
 +      if (sdev->readonly)
 +              goto did_not_correct_error;
  
 -static void scrub_fixup(struct scrub_bio *sbio, int ix)
 -{
 -      struct scrub_dev *sdev = sbio->sdev;
 -      struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 -      struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 -      struct btrfs_bio *bbio = NULL;
 -      struct scrub_fixup_nodatasum *fixup;
 -      u64 logical = sbio->logical + ix * PAGE_SIZE;
 -      u64 length;
 -      int i;
 -      int ret;
 -      DECLARE_COMPLETION_ONSTACK(complete);
 -
 -      if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
 -          (sbio->spag[ix].have_csum == 0)) {
 -              fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
 -              if (!fixup)
 -                      goto uncorrectable;
 -              fixup->sdev = sdev;
 -              fixup->logical = logical;
 -              fixup->root = fs_info->extent_root;
 -              fixup->mirror_num = sbio->spag[ix].mirror_num;
 +      if (!is_metadata && !have_csum) {
 +              struct scrub_fixup_nodatasum *fixup_nodatasum;
 +
 +              /*
 +               * !is_metadata and !have_csum: this means that the data
 +               * might not be COW'ed, i.e. it might be modified
 +               * concurrently. The general strategy of working on the
 +               * commit root does not help when COW is not used.
 +               */
 +              fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
 +              if (!fixup_nodatasum)
 +                      goto did_not_correct_error;
 +              fixup_nodatasum->sdev = sdev;
 +              fixup_nodatasum->logical = logical;
 +              fixup_nodatasum->root = fs_info->extent_root;
 +              fixup_nodatasum->mirror_num = failed_mirror_index + 1;
                /*
                 * increment scrubs_running to prevent cancel requests from
                 * completing as long as a fixup worker is running. we must also
                atomic_inc(&fs_info->scrubs_paused);
                mutex_unlock(&fs_info->scrub_lock);
                atomic_inc(&sdev->fixup_cnt);
 -              fixup->work.func = scrub_fixup_nodatasum;
 -              btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
 -              return;
 +              fixup_nodatasum->work.func = scrub_fixup_nodatasum;
 +              btrfs_queue_worker(&fs_info->scrub_workers,
 +                                 &fixup_nodatasum->work);
 +              goto out;
        }
  
 -      length = PAGE_SIZE;
 -      ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
 -                            &bbio, 0);
 -      if (ret || !bbio || length < PAGE_SIZE) {
 -              printk(KERN_ERR
 -                     "scrub_fixup: btrfs_map_block failed us for %llu\n",
 -                     (unsigned long long)logical);
 -              WARN_ON(1);
 -              kfree(bbio);
 -              return;
 +      /*
 +       * now build and submit the bios for the other mirrors, check
 +       * checksums
 +       */
 +      for (mirror_index = 0;
 +           mirror_index < BTRFS_MAX_MIRRORS &&
 +           sblocks_for_recheck[mirror_index].page_count > 0;
 +           mirror_index++) {
 +              if (mirror_index == failed_mirror_index)
 +                      continue;
 +
 +              /* build and submit the bios, check checksums */
 +              ret = scrub_recheck_block(fs_info,
 +                                        sblocks_for_recheck + mirror_index,
 +                                        is_metadata, have_csum, csum,
 +                                        generation, sdev->csum_size);
 +              if (ret)
 +                      goto did_not_correct_error;
        }
  
 -      if (bbio->num_stripes == 1)
 -              /* there aren't any replicas */
 -              goto uncorrectable;
 +      /*
 +       * first try to pick the mirror which is completely without I/O
 +       * errors and also does not have a checksum error.
 +       * If one is found, and if a checksum is present, the full block
 +       * that is known to contain an error is rewritten. Afterwards
 +       * the block is known to be corrected.
 +       * If a mirror is found which is completely correct, and no
 +       * checksum is present, only those pages are rewritten that had
 +       * an I/O error in the block to be repaired, since it cannot be
 +       * determined which copy of the other pages is better (and it
 +       * could happen otherwise that a correct page would be
 +       * overwritten by a bad one).
 +       */
 +      for (mirror_index = 0;
 +           mirror_index < BTRFS_MAX_MIRRORS &&
 +           sblocks_for_recheck[mirror_index].page_count > 0;
 +           mirror_index++) {
 +              struct scrub_block *sblock_other = sblocks_for_recheck +
 +                                                 mirror_index;
 +
 +              if (!sblock_other->header_error &&
 +                  !sblock_other->checksum_error &&
 +                  sblock_other->no_io_error_seen) {
 +                      int force_write = is_metadata || have_csum;
 +
 +                      ret = scrub_repair_block_from_good_copy(sblock_bad,
 +                                                              sblock_other,
 +                                                              force_write);
 +                      if (0 == ret)
 +                              goto corrected_error;
 +              }
 +      }
  
        /*
 -       * first find a good copy
 +       * in case of I/O errors in the area that is supposed to be
 +       * repaired, continue by picking good copies of those pages.
 +       * Select the good pages from mirrors to rewrite bad pages from
 +       * the area to fix. Afterwards verify the checksum of the block
 +       * that is supposed to be repaired. This verification step is
 +       * only done for the purpose of statistic counting and for the
 +       * final scrub report, whether errors remain.
 +       * A perfect algorithm could make use of the checksum and try
 +       * all possible combinations of pages from the different mirrors
 +       * until the checksum verification succeeds. For example, when
 +       * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
 +       * of mirror #2 is readable but the final checksum test fails,
 +       * then the 2nd page of mirror #3 could be tried, to see whether
 +       * the final checksum then succeeds. But this would be a rare
 +       * exception and is therefore not implemented. At least
 +       * overwriting the good copy is avoided.
 +       * A more useful improvement would be to pick the sectors
 +       * without I/O error based on sector sizes (512 bytes on legacy
 +       * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
 +       * mirror could be repaired by taking 512 bytes of a different
 +       * mirror, even if other 512 byte sectors in the same PAGE_SIZE
 +       * area are unreadable.
         */
 -      for (i = 0; i < bbio->num_stripes; ++i) {
 -              if (i + 1 == sbio->spag[ix].mirror_num)
 -                      continue;
  
 -              if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
 -                                 bbio->stripes[i].physical >> 9,
 -                                 sbio->bio->bi_io_vec[ix].bv_page)) {
 -                      /* I/O-error, this is not a good copy */
 +      /* can only fix I/O errors from here on */
 +      if (sblock_bad->no_io_error_seen)
 +              goto did_not_correct_error;
 +
 +      success = 1;
 +      for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
 +              struct scrub_page *page_bad = sblock_bad->pagev + page_num;
 +
 +              if (!page_bad->io_error)
                        continue;
 +
 +              for (mirror_index = 0;
 +                   mirror_index < BTRFS_MAX_MIRRORS &&
 +                   sblocks_for_recheck[mirror_index].page_count > 0;
 +                   mirror_index++) {
 +                      struct scrub_block *sblock_other = sblocks_for_recheck +
 +                                                         mirror_index;
 +                      struct scrub_page *page_other = sblock_other->pagev +
 +                                                      page_num;
 +
 +                      if (!page_other->io_error) {
 +                              ret = scrub_repair_page_from_good_copy(
 +                                      sblock_bad, sblock_other, page_num, 0);
 +                              if (0 == ret) {
 +                                      page_bad->io_error = 0;
 +                                      break; /* succeeded for this page */
 +                              }
 +                      }
                }
  
 -              if (scrub_fixup_check(sbio, ix) == 0)
 -                      break;
 +              if (page_bad->io_error) {
 +                      /* did not find a mirror to copy the page from */
 +                      success = 0;
 +              }
        }
 -      if (i == bbio->num_stripes)
 -              goto uncorrectable;
  
 -      if (!sdev->readonly) {
 -              /*
 -               * bi_io_vec[ix].bv_page now contains good data, write it back
 -               */
 -              if (scrub_fixup_io(WRITE, sdev->dev->bdev,
 -                                 (sbio->physical + ix * PAGE_SIZE) >> 9,
 -                                 sbio->bio->bi_io_vec[ix].bv_page)) {
 -                      /* I/O-error, writeback failed, give up */
 -                      goto uncorrectable;
 +      if (success) {
 +              if (is_metadata || have_csum) {
 +                      /*
 +                       * need to verify the checksum now that all
 +                       * sectors on disk are repaired (the write
 +                       * request for data to be repaired is on its way).
 +                       * Just be lazy and use scrub_recheck_block()
 +                       * which re-reads the data before the checksum
 +                       * is verified, but most likely the data comes out
 +                       * of the page cache.
 +                       */
 +                      ret = scrub_recheck_block(fs_info, sblock_bad,
 +                                                is_metadata, have_csum, csum,
 +                                                generation, sdev->csum_size);
 +                      if (!ret && !sblock_bad->header_error &&
 +                          !sblock_bad->checksum_error &&
 +                          sblock_bad->no_io_error_seen)
 +                              goto corrected_error;
 +                      else
 +                              goto did_not_correct_error;
 +              } else {
 +corrected_error:
 +                      spin_lock(&sdev->stat_lock);
 +                      sdev->stat.corrected_errors++;
 +                      spin_unlock(&sdev->stat_lock);
 +                      printk_ratelimited(KERN_ERR
 +                              "btrfs: fixed up error at logical %llu on dev %s\n",
 +                              (unsigned long long)logical, sdev->dev->name);
                }
 +      } else {
 +did_not_correct_error:
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.uncorrectable_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              printk_ratelimited(KERN_ERR
 +                      "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
 +                      (unsigned long long)logical, sdev->dev->name);
        }
  
 -      kfree(bbio);
 -      spin_lock(&sdev->stat_lock);
 -      ++sdev->stat.corrected_errors;
 -      spin_unlock(&sdev->stat_lock);
 +out:
 +      if (sblocks_for_recheck) {
 +              for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
 +                   mirror_index++) {
 +                      struct scrub_block *sblock = sblocks_for_recheck +
 +                                                   mirror_index;
 +                      int page_index;
 +
 +                      for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
 +                           page_index++)
 +                              if (sblock->pagev[page_index].page)
 +                                      __free_page(
 +                                              sblock->pagev[page_index].page);
 +              }
 +              kfree(sblocks_for_recheck);
 +      }
  
 -      printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
 -                             (unsigned long long)logical);
 -      return;
 +      return 0;
 +}
  
 -uncorrectable:
 -      kfree(bbio);
 -      spin_lock(&sdev->stat_lock);
 -      ++sdev->stat.uncorrectable_errors;
 -      spin_unlock(&sdev->stat_lock);
 +static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 +                                   struct btrfs_mapping_tree *map_tree,
 +                                   u64 length, u64 logical,
 +                                   struct scrub_block *sblocks_for_recheck)
 +{
 +      int page_index;
 +      int mirror_index;
 +      int ret;
 +
 +      /*
 +       * note: the three members sdev, ref_count and outstanding_pages
 +       * are not used (and not set) in the blocks that are used for
 +       * the recheck procedure
 +       */
 +
 +      page_index = 0;
 +      while (length > 0) {
 +              u64 sublen = min_t(u64, length, PAGE_SIZE);
 +              u64 mapped_length = sublen;
 +              struct btrfs_bio *bbio = NULL;
 +
 +              /*
 +               * with a length of PAGE_SIZE, each returned stripe
 +               * represents one mirror
 +               */
 +              ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
 +                                    &bbio, 0);
 +              if (ret || !bbio || mapped_length < sublen) {
 +                      kfree(bbio);
 +                      return -EIO;
 +              }
  
 -      printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
 -                              "logical %llu\n", (unsigned long long)logical);
 +              BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
 +              for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
 +                   mirror_index++) {
 +                      struct scrub_block *sblock;
 +                      struct scrub_page *page;
 +
 +                      if (mirror_index >= BTRFS_MAX_MIRRORS)
 +                              continue;
 +
 +                      sblock = sblocks_for_recheck + mirror_index;
 +                      page = sblock->pagev + page_index;
 +                      page->logical = logical;
 +                      page->physical = bbio->stripes[mirror_index].physical;
 +                      page->bdev = bbio->stripes[mirror_index].dev->bdev;
 +                      page->mirror_num = mirror_index + 1;
 +                      page->page = alloc_page(GFP_NOFS);
 +                      if (!page->page) {
 +                              spin_lock(&sdev->stat_lock);
 +                              sdev->stat.malloc_errors++;
 +                              spin_unlock(&sdev->stat_lock);
 +                              return -ENOMEM;
 +                      }
 +                      sblock->page_count++;
 +              }
 +              kfree(bbio);
 +              length -= sublen;
 +              logical += sublen;
 +              page_index++;
 +      }
 +
 +      return 0;
  }
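
The array this function fills reads naturally as a two-dimensional grid: one scrub_block per mirror and one scrub_page per PAGE_SIZE slice of the range, so pages that share an index share a logical address but sit at a different physical address on each mirror. A minimal sketch of that indexing, assuming the scrub_block/scrub_page layout from this patch (the helper name is hypothetical, not part of the commit):

    /*
     * sblocks_for_recheck[mirror].pagev[page]:
     *
     *   mirror 0: | page 0 | page 1 | ...   <- first copy of the range
     *   mirror 1: | page 0 | page 1 | ...   <- second copy, if one exists
     */
    static inline struct scrub_page *recheck_page(struct scrub_block *sblocks,
                                                  int mirror, int page)
    {
            return (sblocks + mirror)->pagev + page;
    }
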
  
 -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
 -                       struct page *page)
 +/*
 + * this function will check the on disk data for checksum errors, header
 + * errors and read I/O errors. If any I/O errors happen, the affected
 + * pages are marked as bad. The goal is to enable scrub to take the
 + * good pages from all the mirrors so that the bad pages in the mirror
 + * just handled can be repaired.
 + */
 +static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 +                             struct scrub_block *sblock, int is_metadata,
 +                             int have_csum, u8 *csum, u64 generation,
 +                             u16 csum_size)
  {
 -      struct bio *bio = NULL;
 -      int ret;
 -      DECLARE_COMPLETION_ONSTACK(complete);
 +      int page_num;
 +
 +      sblock->no_io_error_seen = 1;
 +      sblock->header_error = 0;
 +      sblock->checksum_error = 0;
 +
 +      for (page_num = 0; page_num < sblock->page_count; page_num++) {
 +              struct bio *bio;
 +              int ret;
 +              struct scrub_page *page = sblock->pagev + page_num;
 +              DECLARE_COMPLETION_ONSTACK(complete);
 +
 +              BUG_ON(!page->page);
 +              bio = bio_alloc(GFP_NOFS, 1);
 +              bio->bi_bdev = page->bdev;
 +              bio->bi_sector = page->physical >> 9;
 +              bio->bi_end_io = scrub_complete_bio_end_io;
 +              bio->bi_private = &complete;
 +
 +              ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
 +              if (PAGE_SIZE != ret) {
 +                      bio_put(bio);
 +                      return -EIO;
 +              }
 +              btrfsic_submit_bio(READ, bio);
  
 -      bio = bio_alloc(GFP_NOFS, 1);
 -      bio->bi_bdev = bdev;
 -      bio->bi_sector = sector;
 -      bio_add_page(bio, page, PAGE_SIZE, 0);
 -      bio->bi_end_io = scrub_fixup_end_io;
 -      bio->bi_private = &complete;
 -      btrfsic_submit_bio(rw, bio);
 +              /* this will also unplug the queue */
 +              wait_for_completion(&complete);
  
 -      /* this will also unplug the queue */
 -      wait_for_completion(&complete);
 +              page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
 +              if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 +                      sblock->no_io_error_seen = 0;
 +              bio_put(bio);
 +      }
  
 -      ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
 -      bio_put(bio);
 -      return ret;
 +      if (sblock->no_io_error_seen)
 +              scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
 +                                           have_csum, csum, generation,
 +                                           csum_size);
 +
 +      return 0;
  }
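
The read loop above is the synchronous-bio idiom this file now leans on: allocate a single-page bio, point bi_end_io at a trivial completion callback, submit, and sleep on the completion. The deleted scrub_fixup_io() was the same idiom parameterized by direction; a distilled sketch under the same assumptions (hypothetical helper, 3.x-era block layer API, and, like the code above, it relies on bio_alloc(GFP_NOFS, 1) not failing):

    static int scrub_sync_page_io(int rw, struct block_device *bdev,
                                  sector_t sector, struct page *page)
    {
            struct bio *bio = bio_alloc(GFP_NOFS, 1);
            DECLARE_COMPLETION_ONSTACK(complete);
            int ret;

            bio->bi_bdev = bdev;
            bio->bi_sector = sector;
            bio->bi_end_io = scrub_complete_bio_end_io;
            bio->bi_private = &complete;
            if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
                    bio_put(bio);
                    return -EIO;
            }
            btrfsic_submit_bio(rw, bio);

            /* waiting for the completion also unplugs the queue */
            wait_for_completion(&complete);
            ret = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
            bio_put(bio);
            return ret;
    }
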
  
 -static void scrub_bio_end_io(struct bio *bio, int err)
 +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 +                                       struct scrub_block *sblock,
 +                                       int is_metadata, int have_csum,
 +                                       const u8 *csum, u64 generation,
 +                                       u16 csum_size)
  {
 -      struct scrub_bio *sbio = bio->bi_private;
 -      struct scrub_dev *sdev = sbio->sdev;
 -      struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 +      int page_num;
 +      u8 calculated_csum[BTRFS_CSUM_SIZE];
 +      u32 crc = ~(u32)0;
 +      struct btrfs_root *root = fs_info->extent_root;
 +      void *mapped_buffer;
 +
 +      BUG_ON(!sblock->pagev[0].page);
 +      if (is_metadata) {
 +              struct btrfs_header *h;
 +
 +              mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0);
 +              h = (struct btrfs_header *)mapped_buffer;
 +
 +              if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
 +                  generation != le64_to_cpu(h->generation) ||
 +                  memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
 +                  memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
 +                         BTRFS_UUID_SIZE))
 +                      sblock->header_error = 1;
 +              csum = h->csum;
 +      } else {
 +              if (!have_csum)
 +                      return;
  
 -      sbio->err = err;
 -      sbio->bio = bio;
 +              mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0);
 +      }
  
 -      btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
 +      for (page_num = 0;;) {
 +              if (page_num == 0 && is_metadata)
 +                      crc = btrfs_csum_data(root,
 +                              ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
 +                              crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
 +              else
 +                      crc = btrfs_csum_data(root, mapped_buffer, crc,
 +                                            PAGE_SIZE);
 +
 +              kunmap_atomic(mapped_buffer, KM_USER0);
 +              page_num++;
 +              if (page_num >= sblock->page_count)
 +                      break;
 +              BUG_ON(!sblock->pagev[page_num].page);
 +
 +              mapped_buffer = kmap_atomic(sblock->pagev[page_num].page,
 +                                          KM_USER0);
 +      }
 +
 +      btrfs_csum_final(crc, calculated_csum);
 +      if (memcmp(calculated_csum, csum, csum_size))
 +              sblock->checksum_error = 1;
  }
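
The same loop shape recurs in scrub_checksum_data(), scrub_checksum_tree_block() and scrub_checksum_super() below: walk pagev[], kmap one page at a time, and feed btrfs_csum_data() in PAGE_SIZE chunks, skipping the checksum bytes at the start of page 0 for metadata. A distilled sketch of that shared loop, assuming the structures from this patch (the helper itself is hypothetical):

    /* checksum 'len' bytes of a block, starting 'offset' bytes into page 0 */
    static u32 scrub_csum_pages(struct btrfs_root *root,
                                struct scrub_block *sblock, u64 offset, u64 len)
    {
            u32 crc = ~(u32)0;
            int index = 0;

            while (len > 0) {
                    u64 l = min_t(u64, len, PAGE_SIZE - offset);
                    u8 *buf = kmap_atomic(sblock->pagev[index].page, KM_USER0);

                    crc = btrfs_csum_data(root, buf + offset, crc, l);
                    kunmap_atomic(buf, KM_USER0);
                    offset = 0;     /* only page 0 carries a csum header */
                    len -= l;
                    index++;
            }
            return crc;             /* caller finishes with btrfs_csum_final() */
    }
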
  
 -static void scrub_checksum(struct btrfs_work *work)
 +static void scrub_complete_bio_end_io(struct bio *bio, int err)
  {
 -      struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
 -      struct scrub_dev *sdev = sbio->sdev;
 -      struct page *page;
 -      void *buffer;
 -      int i;
 -      u64 flags;
 -      u64 logical;
 -      int ret;
 +      complete((struct completion *)bio->bi_private);
 +}
  
 -      if (sbio->err) {
 -              ret = 0;
 -              for (i = 0; i < sbio->count; ++i)
 -                      ret |= scrub_recheck_error(sbio, i);
 -              if (!ret) {
 -                      spin_lock(&sdev->stat_lock);
 -                      ++sdev->stat.unverified_errors;
 -                      spin_unlock(&sdev->stat_lock);
 -              }
 +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 +                                           struct scrub_block *sblock_good,
 +                                           int force_write)
 +{
 +      int page_num;
 +      int ret = 0;
  
 -              sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
 -              sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
 -              sbio->bio->bi_phys_segments = 0;
 -              sbio->bio->bi_idx = 0;
 +      for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
 +              int ret_sub;
  
 -              for (i = 0; i < sbio->count; i++) {
 -                      struct bio_vec *bi;
 -                      bi = &sbio->bio->bi_io_vec[i];
 -                      bi->bv_offset = 0;
 -                      bi->bv_len = PAGE_SIZE;
 -              }
 -              goto out;
 +              ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
 +                                                         sblock_good,
 +                                                         page_num,
 +                                                         force_write);
 +              if (ret_sub)
 +                      ret = ret_sub;
        }
 -      for (i = 0; i < sbio->count; ++i) {
 -              page = sbio->bio->bi_io_vec[i].bv_page;
 -              buffer = kmap_atomic(page, KM_USER0);
 -              flags = sbio->spag[i].flags;
 -              logical = sbio->logical + i * PAGE_SIZE;
 -              ret = 0;
 -              if (flags & BTRFS_EXTENT_FLAG_DATA) {
 -                      ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
 -              } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 -                      ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
 -                                                      logical, buffer);
 -              } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
 -                      BUG_ON(i);
 -                      (void)scrub_checksum_super(sbio, buffer);
 -              } else {
 -                      WARN_ON(1);
 -              }
 -              kunmap_atomic(buffer, KM_USER0);
 -              if (ret) {
 -                      ret = scrub_recheck_error(sbio, i);
 -                      if (!ret) {
 -                              spin_lock(&sdev->stat_lock);
 -                              ++sdev->stat.unverified_errors;
 -                              spin_unlock(&sdev->stat_lock);
 -                      }
 +
 +      return ret;
 +}
 +
 +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 +                                          struct scrub_block *sblock_good,
 +                                          int page_num, int force_write)
 +{
 +      struct scrub_page *page_bad = sblock_bad->pagev + page_num;
 +      struct scrub_page *page_good = sblock_good->pagev + page_num;
 +
 +      BUG_ON(sblock_bad->pagev[page_num].page == NULL);
 +      BUG_ON(sblock_good->pagev[page_num].page == NULL);
 +      if (force_write || sblock_bad->header_error ||
 +          sblock_bad->checksum_error || page_bad->io_error) {
 +              struct bio *bio;
 +              int ret;
 +              DECLARE_COMPLETION_ONSTACK(complete);
 +
 +              bio = bio_alloc(GFP_NOFS, 1);
 +              bio->bi_bdev = page_bad->bdev;
 +              bio->bi_sector = page_bad->physical >> 9;
 +              bio->bi_end_io = scrub_complete_bio_end_io;
 +              bio->bi_private = &complete;
 +
 +              ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
 +              if (PAGE_SIZE != ret) {
 +                      bio_put(bio);
 +                      return -EIO;
                }
 +              btrfsic_submit_bio(WRITE, bio);
 +
 +              /* this will also unplug the queue */
 +              wait_for_completion(&complete);
 +              bio_put(bio);
        }
  
 -out:
 -      scrub_free_bio(sbio->bio);
 -      sbio->bio = NULL;
 -      spin_lock(&sdev->list_lock);
 -      sbio->next_free = sdev->first_free;
 -      sdev->first_free = sbio->index;
 -      spin_unlock(&sdev->list_lock);
 -      atomic_dec(&sdev->in_flight);
 -      wake_up(&sdev->list_wait);
 +      return 0;
 +}
 +
 +static void scrub_checksum(struct scrub_block *sblock)
 +{
 +      u64 flags;
 +      int ret;
 +
 +      BUG_ON(sblock->page_count < 1);
 +      flags = sblock->pagev[0].flags;
 +      ret = 0;
 +      if (flags & BTRFS_EXTENT_FLAG_DATA)
 +              ret = scrub_checksum_data(sblock);
 +      else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 +              ret = scrub_checksum_tree_block(sblock);
 +      else if (flags & BTRFS_EXTENT_FLAG_SUPER)
 +              (void)scrub_checksum_super(sblock);
 +      else
 +              WARN_ON(1);
 +      if (ret)
 +              scrub_handle_errored_block(sblock);
  }
  
 -static int scrub_checksum_data(struct scrub_dev *sdev,
 -                             struct scrub_page *spag, void *buffer)
 +static int scrub_checksum_data(struct scrub_block *sblock)
  {
 +      struct scrub_dev *sdev = sblock->sdev;
        u8 csum[BTRFS_CSUM_SIZE];
 +      u8 *on_disk_csum;
 +      struct page *page;
 +      void *buffer;
        u32 crc = ~(u32)0;
        int fail = 0;
        struct btrfs_root *root = sdev->dev->dev_root;
 +      u64 len;
 +      int index;
  
 -      if (!spag->have_csum)
 +      BUG_ON(sblock->page_count < 1);
 +      if (!sblock->pagev[0].have_csum)
                return 0;
  
 -      crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
 +      on_disk_csum = sblock->pagev[0].csum;
 +      page = sblock->pagev[0].page;
 +      buffer = kmap_atomic(page, KM_USER0);
 +
 +      len = sdev->sectorsize;
 +      index = 0;
 +      for (;;) {
 +              u64 l = min_t(u64, len, PAGE_SIZE);
 +
 +              crc = btrfs_csum_data(root, buffer, crc, l);
 +              kunmap_atomic(buffer, KM_USER0);
 +              len -= l;
 +              if (len == 0)
 +                      break;
 +              index++;
 +              BUG_ON(index >= sblock->page_count);
 +              BUG_ON(!sblock->pagev[index].page);
 +              page = sblock->pagev[index].page;
 +              buffer = kmap_atomic(page, KM_USER0);
 +      }
 +
        btrfs_csum_final(crc, csum);
 -      if (memcmp(csum, spag->csum, sdev->csum_size))
 +      if (memcmp(csum, on_disk_csum, sdev->csum_size))
                fail = 1;
  
 -      spin_lock(&sdev->stat_lock);
 -      ++sdev->stat.data_extents_scrubbed;
 -      sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
 -      if (fail)
 +      if (fail) {
 +              spin_lock(&sdev->stat_lock);
                ++sdev->stat.csum_errors;
 -      spin_unlock(&sdev->stat_lock);
 +              spin_unlock(&sdev->stat_lock);
 +      }
  
        return fail;
  }
  
 -static int scrub_checksum_tree_block(struct scrub_dev *sdev,
 -                                   struct scrub_page *spag, u64 logical,
 -                                   void *buffer)
 +static int scrub_checksum_tree_block(struct scrub_block *sblock)
  {
 +      struct scrub_dev *sdev = sblock->sdev;
        struct btrfs_header *h;
        struct btrfs_root *root = sdev->dev->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
 -      u8 csum[BTRFS_CSUM_SIZE];
 +      u8 calculated_csum[BTRFS_CSUM_SIZE];
 +      u8 on_disk_csum[BTRFS_CSUM_SIZE];
 +      struct page *page;
 +      void *mapped_buffer;
 +      u64 mapped_size;
 +      void *p;
        u32 crc = ~(u32)0;
        int fail = 0;
        int crc_fail = 0;
 +      u64 len;
 +      int index;
 +
 +      BUG_ON(sblock->page_count < 1);
 +      page = sblock->pagev[0].page;
 +      mapped_buffer = kmap_atomic(page, KM_USER0);
 +      h = (struct btrfs_header *)mapped_buffer;
 +      memcpy(on_disk_csum, h->csum, sdev->csum_size);
  
        /*
         * we don't use the getter functions here, as we
         * a) don't have an extent buffer and
         * b) the page is already kmapped
         */
 -      h = (struct btrfs_header *)buffer;
  
 -      if (logical != le64_to_cpu(h->bytenr))
 +      if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
                ++fail;
  
 -      if (spag->generation != le64_to_cpu(h->generation))
 +      if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
                ++fail;
  
        if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
                ++fail;
 
        if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
                   BTRFS_UUID_SIZE))
                ++fail;
  
 -      crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
 -                            PAGE_SIZE - BTRFS_CSUM_SIZE);
 -      btrfs_csum_final(crc, csum);
 -      if (memcmp(csum, h->csum, sdev->csum_size))
 +      BUG_ON(sdev->nodesize != sdev->leafsize);
 +      len = sdev->nodesize - BTRFS_CSUM_SIZE;
 +      mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
 +      p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
 +      index = 0;
 +      for (;;) {
 +              u64 l = min_t(u64, len, mapped_size);
 +
 +              crc = btrfs_csum_data(root, p, crc, l);
 +              kunmap_atomic(mapped_buffer, KM_USER0);
 +              len -= l;
 +              if (len == 0)
 +                      break;
 +              index++;
 +              BUG_ON(index >= sblock->page_count);
 +              BUG_ON(!sblock->pagev[index].page);
 +              page = sblock->pagev[index].page;
 +              mapped_buffer = kmap_atomic(page, KM_USER0);
 +              mapped_size = PAGE_SIZE;
 +              p = mapped_buffer;
 +      }
 +
 +      btrfs_csum_final(crc, calculated_csum);
 +      if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
                ++crc_fail;
  
 -      spin_lock(&sdev->stat_lock);
 -      ++sdev->stat.tree_extents_scrubbed;
 -      sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
 -      if (crc_fail)
 -              ++sdev->stat.csum_errors;
 -      if (fail)
 -              ++sdev->stat.verify_errors;
 -      spin_unlock(&sdev->stat_lock);
 +      if (crc_fail || fail) {
 +              spin_lock(&sdev->stat_lock);
 +              if (crc_fail)
 +                      ++sdev->stat.csum_errors;
 +              if (fail)
 +                      ++sdev->stat.verify_errors;
 +              spin_unlock(&sdev->stat_lock);
 +      }
  
        return fail || crc_fail;
  }
  
 -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
 +static int scrub_checksum_super(struct scrub_block *sblock)
  {
        struct btrfs_super_block *s;
 -      u64 logical;
 -      struct scrub_dev *sdev = sbio->sdev;
 +      struct scrub_dev *sdev = sblock->sdev;
        struct btrfs_root *root = sdev->dev->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
 -      u8 csum[BTRFS_CSUM_SIZE];
 +      u8 calculated_csum[BTRFS_CSUM_SIZE];
 +      u8 on_disk_csum[BTRFS_CSUM_SIZE];
 +      struct page *page;
 +      void *mapped_buffer;
 +      u64 mapped_size;
 +      void *p;
        u32 crc = ~(u32)0;
        int fail = 0;
 +      u64 len;
 +      int index;
  
 -      s = (struct btrfs_super_block *)buffer;
 -      logical = sbio->logical;
 +      BUG_ON(sblock->page_count < 1);
 +      page = sblock->pagev[0].page;
 +      mapped_buffer = kmap_atomic(page, KM_USER0);
 +      s = (struct btrfs_super_block *)mapped_buffer;
 +      memcpy(on_disk_csum, s->csum, sdev->csum_size);
  
 -      if (logical != le64_to_cpu(s->bytenr))
 +      if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
                ++fail;
  
 -      if (sbio->spag[0].generation != le64_to_cpu(s->generation))
 +      if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
                ++fail;
  
        if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
                ++fail;
  
 -      crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
 -                            PAGE_SIZE - BTRFS_CSUM_SIZE);
 -      btrfs_csum_final(crc, csum);
 -      if (memcmp(csum, s->csum, sbio->sdev->csum_size))
 +      len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
 +      mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
 +      p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
 +      index = 0;
 +      for (;;) {
 +              u64 l = min_t(u64, len, mapped_size);
 +
 +              crc = btrfs_csum_data(root, p, crc, l);
 +              kunmap_atomic(mapped_buffer, KM_USER0);
 +              len -= l;
 +              if (len == 0)
 +                      break;
 +              index++;
 +              BUG_ON(index >= sblock->page_count);
 +              BUG_ON(!sblock->pagev[index].page);
 +              page = sblock->pagev[index].page;
 +              mapped_buffer = kmap_atomic(page, KM_USER0);
 +              mapped_size = PAGE_SIZE;
 +              p = mapped_buffer;
 +      }
 +
 +      btrfs_csum_final(crc, calculated_csum);
 +      if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
                ++fail;
  
        if (fail) {
        return fail;
  }
  
 -static int scrub_submit(struct scrub_dev *sdev)
 +static void scrub_block_get(struct scrub_block *sblock)
 +{
 +      atomic_inc(&sblock->ref_count);
 +}
 +
 +static void scrub_block_put(struct scrub_block *sblock)
 +{
 +      if (atomic_dec_and_test(&sblock->ref_count)) {
 +              int i;
 +
 +              for (i = 0; i < sblock->page_count; i++)
 +                      if (sblock->pagev[i].page)
 +                              __free_page(sblock->pagev[i].page);
 +              kfree(sblock);
 +      }
 +}
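
The reference rules these two helpers implement, as wired up in scrub_pages() below: scrub_pages() takes the initial reference, scrub_add_page_to_bio() takes one more per page handed to a bio, scrub_bio_end_io_worker() drops one per completed page, and scrub_pages() drops its own before returning, so whichever path brings the count to zero frees the pages and the block. The general usage, as a sketch (hypothetical function, for illustration only):

    static void scrub_block_use_async(struct scrub_block *sblock)
    {
            scrub_block_get(sblock);        /* pin across the async window */
            /* ... hand sblock to a bio or a worker here ... */
            scrub_block_put(sblock);        /* last put frees pagev[] + sblock */
    }
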
 +
 +static void scrub_submit(struct scrub_dev *sdev)
  {
        struct scrub_bio *sbio;
  
        if (sdev->curr == -1)
 -              return 0;
 +              return;
  
        sbio = sdev->bios[sdev->curr];
 -      sbio->err = 0;
        sdev->curr = -1;
        atomic_inc(&sdev->in_flight);
  
        btrfsic_submit_bio(READ, sbio->bio);
 -
 -      return 0;
  }
  
 -static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
 -                    u64 physical, u64 flags, u64 gen, int mirror_num,
 -                    u8 *csum, int force)
 +static int scrub_add_page_to_bio(struct scrub_dev *sdev,
 +                               struct scrub_page *spage)
  {
 +      struct scrub_block *sblock = spage->sblock;
        struct scrub_bio *sbio;
 -      struct page *page;
        int ret;
  
  again:
                if (sdev->curr != -1) {
                        sdev->first_free = sdev->bios[sdev->curr]->next_free;
                        sdev->bios[sdev->curr]->next_free = -1;
 -                      sdev->bios[sdev->curr]->count = 0;
 +                      sdev->bios[sdev->curr]->page_count = 0;
                        spin_unlock(&sdev->list_lock);
                } else {
                        spin_unlock(&sdev->list_lock);
                }
        }
        sbio = sdev->bios[sdev->curr];
 -      if (sbio->count == 0) {
 +      if (sbio->page_count == 0) {
                struct bio *bio;
  
 -              sbio->physical = physical;
 -              sbio->logical = logical;
 -              bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
 -              if (!bio)
 -                      return -ENOMEM;
 +              sbio->physical = spage->physical;
 +              sbio->logical = spage->logical;
 +              bio = sbio->bio;
 +              if (!bio) {
 +                      bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio);
 +                      if (!bio)
 +                              return -ENOMEM;
 +                      sbio->bio = bio;
 +              }
  
                bio->bi_private = sbio;
                bio->bi_end_io = scrub_bio_end_io;
                bio->bi_bdev = sdev->dev->bdev;
 -              bio->bi_sector = sbio->physical >> 9;
 +              bio->bi_sector = spage->physical >> 9;
                sbio->err = 0;
 -              sbio->bio = bio;
 -      } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
 -                 sbio->logical + sbio->count * PAGE_SIZE != logical) {
 -              ret = scrub_submit(sdev);
 -              if (ret)
 -                      return ret;
 +      } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
 +                 spage->physical ||
 +                 sbio->logical + sbio->page_count * PAGE_SIZE !=
 +                 spage->logical) {
 +              scrub_submit(sdev);
                goto again;
        }
 -      sbio->spag[sbio->count].flags = flags;
 -      sbio->spag[sbio->count].generation = gen;
 -      sbio->spag[sbio->count].have_csum = 0;
 -      sbio->spag[sbio->count].mirror_num = mirror_num;
 -
 -      page = alloc_page(GFP_NOFS);
 -      if (!page)
 -              return -ENOMEM;
  
 -      ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
 -      if (!ret) {
 -              __free_page(page);
 -              ret = scrub_submit(sdev);
 -              if (ret)
 -                      return ret;
 +      sbio->pagev[sbio->page_count] = spage;
 +      ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
 +      if (ret != PAGE_SIZE) {
 +              if (sbio->page_count < 1) {
 +                      bio_put(sbio->bio);
 +                      sbio->bio = NULL;
 +                      return -EIO;
 +              }
 +              scrub_submit(sdev);
                goto again;
        }
  
 -      if (csum) {
 -              sbio->spag[sbio->count].have_csum = 1;
 -              memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
 +      scrub_block_get(sblock); /* one for the added page */
 +      atomic_inc(&sblock->outstanding_pages);
 +      sbio->page_count++;
 +      if (sbio->page_count == sdev->pages_per_bio)
 +              scrub_submit(sdev);
 +
 +      return 0;
 +}
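
A page only joins the bio under construction while the run stays contiguous in both address spaces; any gap, or a full bio, forces a submit and a retry against a fresh bio. The contiguity test at the heart of it, written as a standalone predicate (hypothetical helper, same structures as this patch):

    /* may 'spage' be appended to the bio currently being built? */
    static int scrub_page_is_contiguous(struct scrub_bio *sbio,
                                        struct scrub_page *spage)
    {
            u64 off = (u64)sbio->page_count * PAGE_SIZE;

            return sbio->physical + off == spage->physical &&
                   sbio->logical + off == spage->logical;
    }
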
 +
 +static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
 +                     u64 physical, u64 flags, u64 gen, int mirror_num,
 +                     u8 *csum, int force)
 +{
 +      struct scrub_block *sblock;
 +      int index;
 +
 +      sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
 +      if (!sblock) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.malloc_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              return -ENOMEM;
        }
 -      ++sbio->count;
 -      if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
 +
 +      /* one ref inside this function, plus one for each page later on */
 +      atomic_set(&sblock->ref_count, 1);
 +      sblock->sdev = sdev;
 +      sblock->no_io_error_seen = 1;
 +
 +      for (index = 0; len > 0; index++) {
 +              struct scrub_page *spage = sblock->pagev + index;
 +              u64 l = min_t(u64, len, PAGE_SIZE);
 +
 +              BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
 +              spage->page = alloc_page(GFP_NOFS);
 +              if (!spage->page) {
 +                      spin_lock(&sdev->stat_lock);
 +                      sdev->stat.malloc_errors++;
 +                      spin_unlock(&sdev->stat_lock);
 +                      while (index > 0) {
 +                              index--;
 +                              __free_page(sblock->pagev[index].page);
 +                      }
 +                      kfree(sblock);
 +                      return -ENOMEM;
 +              }
 +              spage->sblock = sblock;
 +              spage->bdev = sdev->dev->bdev;
 +              spage->flags = flags;
 +              spage->generation = gen;
 +              spage->logical = logical;
 +              spage->physical = physical;
 +              spage->mirror_num = mirror_num;
 +              if (csum) {
 +                      spage->have_csum = 1;
 +                      memcpy(spage->csum, csum, sdev->csum_size);
 +              } else {
 +                      spage->have_csum = 0;
 +              }
 +              sblock->page_count++;
 +              len -= l;
 +              logical += l;
 +              physical += l;
 +      }
 +
 +      BUG_ON(sblock->page_count == 0);
 +      for (index = 0; index < sblock->page_count; index++) {
 +              struct scrub_page *spage = sblock->pagev + index;
                int ret;
  
 -              ret = scrub_submit(sdev);
 -              if (ret)
 +              ret = scrub_add_page_to_bio(sdev, spage);
 +              if (ret) {
 +                      scrub_block_put(sblock);
                        return ret;
 +              }
        }
  
 +      if (force)
 +              scrub_submit(sdev);
 +
 +      /* last one frees, either here or in bio completion for last page */
 +      scrub_block_put(sblock);
        return 0;
  }
  
 +static void scrub_bio_end_io(struct bio *bio, int err)
 +{
 +      struct scrub_bio *sbio = bio->bi_private;
 +      struct scrub_dev *sdev = sbio->sdev;
 +      struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 +
 +      sbio->err = err;
 +      sbio->bio = bio;
 +
 +      btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
 +}
 +
 +static void scrub_bio_end_io_worker(struct btrfs_work *work)
 +{
 +      struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
 +      struct scrub_dev *sdev = sbio->sdev;
 +      int i;
 +
 +      BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
 +      if (sbio->err) {
 +              for (i = 0; i < sbio->page_count; i++) {
 +                      struct scrub_page *spage = sbio->pagev[i];
 +
 +                      spage->io_error = 1;
 +                      spage->sblock->no_io_error_seen = 0;
 +              }
 +      }
 +
 +      /* now complete the scrub_block items that have all pages completed */
 +      for (i = 0; i < sbio->page_count; i++) {
 +              struct scrub_page *spage = sbio->pagev[i];
 +              struct scrub_block *sblock = spage->sblock;
 +
 +              if (atomic_dec_and_test(&sblock->outstanding_pages))
 +                      scrub_block_complete(sblock);
 +              scrub_block_put(sblock);
 +      }
 +
 +      if (sbio->err) {
 +              /* what is this good for??? */
 +              sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
 +              sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
 +              sbio->bio->bi_phys_segments = 0;
 +              sbio->bio->bi_idx = 0;
 +
 +              for (i = 0; i < sbio->page_count; i++) {
 +                      struct bio_vec *bi;
 +                      bi = &sbio->bio->bi_io_vec[i];
 +                      bi->bv_offset = 0;
 +                      bi->bv_len = PAGE_SIZE;
 +              }
 +      }
 +
 +      bio_put(sbio->bio);
 +      sbio->bio = NULL;
 +      spin_lock(&sdev->list_lock);
 +      sbio->next_free = sdev->first_free;
 +      sdev->first_free = sbio->index;
 +      spin_unlock(&sdev->list_lock);
 +      atomic_dec(&sdev->in_flight);
 +      wake_up(&sdev->list_wait);
 +}
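
Completion accounting follows the "last one out" idiom: every page in flight holds one count on its scrub_block, the worker decrements once per page, and whichever decrement reaches zero completes the whole block. (The bi_flags and bi_io_vec reset in the error path restores the bio to a freshly-allocated state; presumably a leftover from the old code, which resubmitted the same bio for rechecking, hence the question in the comment above.) The idiom, distilled into a hypothetical helper:

    static void scrub_retire_one_page(struct scrub_page *spage)
    {
            struct scrub_block *sblock = spage->sblock;

            if (atomic_dec_and_test(&sblock->outstanding_pages))
                    scrub_block_complete(sblock);   /* last page: block done */
            scrub_block_put(sblock);                /* drop the page's ref */
    }
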
 +
 +static void scrub_block_complete(struct scrub_block *sblock)
 +{
 +      if (!sblock->no_io_error_seen)
 +              scrub_handle_errored_block(sblock);
 +      else
 +              scrub_checksum(sblock);
 +}
 +
  static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
                           u8 *csum)
  {
        int ret = 0;
        unsigned long i;
        unsigned long num_sectors;
 -      u32 sectorsize = sdev->dev->dev_root->sectorsize;
  
        while (!list_empty(&sdev->csum_list)) {
                sum = list_first_entry(&sdev->csum_list,
        if (!sum)
                return 0;
  
 -      num_sectors = sum->len / sectorsize;
 +      num_sectors = sum->len / sdev->sectorsize;
        for (i = 0; i < num_sectors; ++i) {
                if (sum->sums[i].bytenr == logical) {
                        memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
@@@ -1710,28 -1093,9 +1710,28 @@@ static int scrub_extent(struct scrub_de
  {
        int ret;
        u8 csum[BTRFS_CSUM_SIZE];
 +      u32 blocksize;
 +
 +      if (flags & BTRFS_EXTENT_FLAG_DATA) {
 +              blocksize = sdev->sectorsize;
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.data_extents_scrubbed++;
 +              sdev->stat.data_bytes_scrubbed += len;
 +              spin_unlock(&sdev->stat_lock);
 +      } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 +              BUG_ON(sdev->nodesize != sdev->leafsize);
 +              blocksize = sdev->nodesize;
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.tree_extents_scrubbed++;
 +              sdev->stat.tree_bytes_scrubbed += len;
 +              spin_unlock(&sdev->stat_lock);
 +      } else {
 +              blocksize = sdev->sectorsize;
 +              BUG_ON(1);
 +      }
  
        while (len) {
 -              u64 l = min_t(u64, len, PAGE_SIZE);
 +              u64 l = min_t(u64, len, blocksize);
                int have_csum = 0;
  
                if (flags & BTRFS_EXTENT_FLAG_DATA) {
                        if (have_csum == 0)
                                ++sdev->stat.no_csum;
                }
 -              ret = scrub_page(sdev, logical, l, physical, flags, gen,
 -                               mirror_num, have_csum ? csum : NULL, 0);
 +              ret = scrub_pages(sdev, logical, l, physical, flags, gen,
 +                                mirror_num, have_csum ? csum : NULL, 0);
                if (ret)
                        return ret;
                len -= l;
@@@ -1806,11 -1170,6 +1806,11 @@@ static noinline_for_stack int scrub_str
        if (!path)
                return -ENOMEM;
  
 +      /*
 +       * work on commit root. The related disk blocks are static as
 +       * long as COW is applied. This means it is safe to rewrite
 +       * them to repair disk errors without any race conditions
 +       */
        path->search_commit_root = 1;
        path->skip_locking = 1;
  
@@@ -2157,18 -1516,15 +2157,18 @@@ static noinline_for_stack int scrub_sup
        struct btrfs_device *device = sdev->dev;
        struct btrfs_root *root = device->dev_root;
  
 +      if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
 +              return -EIO;
 +
        gen = root->fs_info->last_trans_committed;
  
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
 -              if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
 +              if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
                        break;
  
 -              ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
 -                               BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
 +              ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
 +                                   BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
                if (ret)
                        return ret;
        }
@@@ -2227,30 -1583,10 +2227,30 @@@ int btrfs_scrub_dev(struct btrfs_root *
        /*
         * check some assumptions
         */
 -      if (root->sectorsize != PAGE_SIZE ||
 -          root->sectorsize != root->leafsize ||
 -          root->sectorsize != root->nodesize) {
 -              printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
 +      if (root->nodesize != root->leafsize) {
 +              printk(KERN_ERR
 +                     "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
 +                     root->nodesize, root->leafsize);
 +              return -EINVAL;
 +      }
 +
 +      if (root->nodesize > BTRFS_STRIPE_LEN) {
 +              /*
 +               * in this case scrub, as it is implemented, is unable
 +               * to calculate the checksum. Do not handle this
 +               * situation at all because it won't ever happen.
 +               */
 +              printk(KERN_ERR
 +                     "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
 +                     root->nodesize, BTRFS_STRIPE_LEN);
 +              return -EINVAL;
 +      }
 +
 +      if (root->sectorsize != PAGE_SIZE) {
 +              /* not supported for data w/o checksums */
 +              printk(KERN_ERR
 +                     "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
 +                     root->sectorsize, (unsigned long long)PAGE_SIZE);
                return -EINVAL;
        }
  
        return ret;
  }
  
 -int btrfs_scrub_pause(struct btrfs_root *root)
 +void btrfs_scrub_pause(struct btrfs_root *root)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
  
                mutex_lock(&fs_info->scrub_lock);
        }
        mutex_unlock(&fs_info->scrub_lock);
 -
 -      return 0;
  }
  
 -int btrfs_scrub_continue(struct btrfs_root *root)
 +void btrfs_scrub_continue(struct btrfs_root *root)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
  
        atomic_dec(&fs_info->scrub_pause_req);
        wake_up(&fs_info->scrub_pause_wait);
 -      return 0;
  }
  
 -int btrfs_scrub_pause_super(struct btrfs_root *root)
 +void btrfs_scrub_pause_super(struct btrfs_root *root)
  {
        down_write(&root->fs_info->scrub_super_lock);
 -      return 0;
  }
  
 -int btrfs_scrub_continue_super(struct btrfs_root *root)
 +void btrfs_scrub_continue_super(struct btrfs_root *root)
  {
        up_write(&root->fs_info->scrub_super_lock);
 -      return 0;
  }
  
 -int btrfs_scrub_cancel(struct btrfs_root *root)
 +int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
  {
 -      struct btrfs_fs_info *fs_info = root->fs_info;
  
        mutex_lock(&fs_info->scrub_lock);
        if (!atomic_read(&fs_info->scrubs_running)) {
        return 0;
  }
  
 +int btrfs_scrub_cancel(struct btrfs_root *root)
 +{
 +      return __btrfs_scrub_cancel(root->fs_info);
 +}
 +
  int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
  
        return 0;
  }
 +
  int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
diff --combined fs/btrfs/super.c
index 9db64165123a5ca3f11b1740fa462198fe724edb,5239003d453eed85ef82b6c2f9a5412f9607a922..84571d7da12e93d76537c52653891287b6683497
@@@ -40,6 -40,7 +40,6 @@@
  #include <linux/magic.h>
  #include <linux/slab.h>
  #include <linux/cleancache.h>
 -#include <linux/mnt_namespace.h>
  #include <linux/ratelimit.h>
  #include "compat.h"
  #include "delayed-inode.h"
@@@ -76,9 -77,6 +76,9 @@@ static const char *btrfs_decode_error(s
        case -EROFS:
                errstr = "Readonly filesystem";
                break;
 +      case -EEXIST:
 +              errstr = "Object already exists";
 +              break;
        default:
                if (nbuf) {
                        if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
@@@ -119,8 -117,6 +119,8 @@@ static void btrfs_handle_error(struct b
        if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
                sb->s_flags |= MS_RDONLY;
                printk(KERN_INFO "btrfs is forced readonly\n");
 +              __btrfs_scrub_cancel(fs_info);
 +//            WARN_ON(1);
        }
  }
  
   * invokes the appropriate error response.
   */
  void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 -                   unsigned int line, int errno)
 +                     unsigned int line, int errno, const char *fmt, ...)
  {
        struct super_block *sb = fs_info->sb;
        char nbuf[16];
        const char *errstr;
 +      va_list args;
  
        /*
         * Special case: if the error is EROFS, and we're already
         * under MS_RDONLY, then it is safe here.
         */
        if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
 -              return;
 +              return;
  
 -      errstr = btrfs_decode_error(fs_info, errno, nbuf);
 -      printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
 -              sb->s_id, function, line, errstr);
 -      save_error_info(fs_info);
 +      va_start(args, fmt);
 +      errstr = btrfs_decode_error(fs_info, errno, nbuf);
 +      if (fmt) {
 +              struct va_format vaf = {
 +                      .fmt = fmt,
 +                      .va = &args,
 +              };
  
 -      btrfs_handle_error(fs_info);
 +              printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
 +                      sb->s_id, function, line, errstr, &vaf);
 +      } else {
 +              printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
 +                      sb->s_id, function, line, errstr);
 +      }
 +
 +      /* Don't go through full error handling during mount */
 +      if (sb->s_flags & MS_BORN) {
 +              save_error_info(fs_info);
 +              btrfs_handle_error(fs_info);
 +      }
 +      va_end(args);
  }
  
 -static void btrfs_put_super(struct super_block *sb)
 +const char *logtypes[] = {
 +      "emergency",
 +      "alert",
 +      "critical",
 +      "error",
 +      "warning",
 +      "notice",
 +      "info",
 +      "debug",
 +};
 +
 +void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
  {
 -      struct btrfs_root *root = btrfs_sb(sb);
 -      int ret;
 +      struct super_block *sb = fs_info->sb;
 +      char lvl[4];
 +      struct va_format vaf;
 +      va_list args;
 +      const char *type = logtypes[4];
 +
 +      va_start(args, fmt);
 +
 +      if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
 +              strncpy(lvl, fmt, 3);
 +              lvl[3] = '\0';
 +              type = logtypes[fmt[1] - '0'];
 +              fmt += 3;
 +      } else
 +              *lvl = '\0';
 +
 +      vaf.fmt = fmt;
 +      vaf.va = &args;
 +      printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf);
 +}
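
Callers pass an ordinary printk-style format; an optional leading KERN_* level (still the "<N>" form in this kernel generation) selects both the raw console level and the human-readable type string, with "warning" as the default. A usage sketch (caller and message are hypothetical):

    static void example_report(struct btrfs_fs_info *fs_info, u64 devid)
    {
            btrfs_printk(fs_info, KERN_ERR "device %llu gone missing\n",
                         (unsigned long long)devid);
            /* emits: <3>BTRFS error (device sda1): device 7 gone missing */
    }
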
  
 -      ret = close_ctree(root);
 -      sb->s_fs_info = NULL;
 +/*
 + * We only mark the transaction aborted and then set the file system read-only.
 + * This will prevent new transactions from starting or trying to join this
 + * one.
 + *
 + * This means that error recovery at the call site is limited to freeing
 + * any local memory allocations and passing the error code up without
 + * further cleanup. The transaction should complete as it normally would
 + * in the call path but will return -EIO.
 + *
 + * We'll complete the cleanup in btrfs_end_transaction and
 + * btrfs_commit_transaction.
 + */
 +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 +                             struct btrfs_root *root, const char *function,
 +                             unsigned int line, int errno)
 +{
 +      WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted");
 +      trans->aborted = errno;
 +      /* Nothing used. The other threads that have joined this
 +       * transaction may be able to continue. */
 +      if (!trans->blocks_used) {
 +              btrfs_printk(root->fs_info, "Aborting unused transaction.\n");
 +              return;
 +      }
 +      trans->transaction->aborted = errno;
 +      __btrfs_std_error(root->fs_info, function, line, errno, NULL);
 +}
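
Call sites in this series reach the function through a btrfs_abort_transaction() wrapper that supplies __func__ and __LINE__; the macro lives in ctree.h and so does not appear in this diff, but its shape follows from the signature above (a reconstruction, so treat the exact form as an assumption):

    #define btrfs_abort_transaction(trans, root, errno)             \
    do {                                                            \
            __btrfs_abort_transaction((trans), (root), __func__,    \
                                      __LINE__, (errno));           \
    } while (0)

After the call the transaction is poisoned; a typical caller does "btrfs_abort_transaction(trans, root, ret);", frees its local allocations, and lets the error bubble up.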
 +/*
 + * __btrfs_panic decodes unexpected, fatal errors from the caller,
 + * issues an alert, and either panics or BUGs, depending on mount options.
 + */
 +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 +                 unsigned int line, int errno, const char *fmt, ...)
 +{
 +      char nbuf[16];
 +      char *s_id = "<unknown>";
 +      const char *errstr;
 +      struct va_format vaf = { .fmt = fmt };
 +      va_list args;
 +
 +      if (fs_info)
 +              s_id = fs_info->sb->s_id;
  
 -      (void)ret; /* FIXME: need to fix VFS to return error? */
 +      va_start(args, fmt);
 +      vaf.va = &args;
 +
 +      errstr = btrfs_decode_error(fs_info, errno, nbuf);
 +      if (fs_info &&
 +          (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
 +              panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
 +                      s_id, function, line, &vaf, errstr);
 +
 +      printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
 +             s_id, function, line, &vaf, errstr);
 +      va_end(args);
 +      /* Caller calls BUG() */
 +}
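
The "Caller calls BUG()" note refers to the wrapping macro: when fatal_errors=panic is not set, __btrfs_panic() returns and the wrapper BUGs instead, so execution never continues normally either way. A sketch of such a wrapper, reconstructed from the behaviour described here (the exact form is an assumption):

    #define btrfs_panic(fs_info, errno, fmt, args...)                       \
    do {                                                                    \
            struct btrfs_fs_info *_i = (fs_info);                           \
            __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args);      \
            BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR));    \
    } while (0)
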
 +
 +static void btrfs_put_super(struct super_block *sb)
 +{
 +      (void)close_ctree(btrfs_sb(sb)->tree_root);
 +      /* FIXME: need to fix VFS to return error? */
 +      /* AV: return it _where_?  ->put_super() can be triggered by any number
 +       * of async events, up to and including delivery of SIGKILL to the
 +       * last process that kept it busy.  Or segfault in the aforementioned
 +       * process...  Whom would you report that to?
 +       */
  }
  
  enum {
        Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
        Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
        Opt_check_integrity, Opt_check_integrity_including_extent_data,
 -      Opt_check_integrity_print_mask,
 +      Opt_check_integrity_print_mask, Opt_fatal_errors,
        Opt_err,
  };
  
@@@ -318,14 -207,12 +318,14 @@@ static match_table_t tokens = 
        {Opt_check_integrity, "check_int"},
        {Opt_check_integrity_including_extent_data, "check_int_data"},
        {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
 +      {Opt_fatal_errors, "fatal_errors=%s"},
        {Opt_err, NULL},
  };
  
  /*
   * Regular mount options parser.  Everything that is needed only when
   * reading in a new superblock is parsed here.
 + * XXX JDM: This needs to be cleaned up for remount.
   */
  int btrfs_parse_options(struct btrfs_root *root, char *options)
  {
                        ret = -EINVAL;
                        goto out;
  #endif
 +              case Opt_fatal_errors:
 +                      if (strcmp(args[0].from, "panic") == 0)
 +                              btrfs_set_opt(info->mount_opt,
 +                                            PANIC_ON_FATAL_ERROR);
 +                      else if (strcmp(args[0].from, "bug") == 0)
 +                              btrfs_clear_opt(info->mount_opt,
 +                                            PANIC_ON_FATAL_ERROR);
 +                      else {
 +                              ret = -EINVAL;
 +                              goto out;
 +                      }
 +                      break;
                case Opt_err:
                        printk(KERN_INFO "btrfs: unrecognized mount option "
                               "'%s'\n", p);
@@@ -667,8 -542,7 +667,8 @@@ out
  static struct dentry *get_default_root(struct super_block *sb,
                                       u64 subvol_objectid)
  {
 -      struct btrfs_root *root = sb->s_fs_info;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 +      struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_root *new_root;
        struct btrfs_dir_item *di;
        struct btrfs_path *path;
         * will mount by default if we haven't been given a specific subvolume
         * to mount.
         */
 -      dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
 +      dir_id = btrfs_super_root_dir(fs_info->super_copy);
        di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
        if (IS_ERR(di)) {
                btrfs_free_path(path);
                 */
                btrfs_free_path(path);
                dir_id = BTRFS_FIRST_FREE_OBJECTID;
 -              new_root = root->fs_info->fs_root;
 +              new_root = fs_info->fs_root;
                goto setup_root;
        }
  
        btrfs_free_path(path);
  
  find_root:
 -      new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
 +      new_root = btrfs_read_fs_root_no_name(fs_info, &location);
        if (IS_ERR(new_root))
                return ERR_CAST(new_root);
  
@@@ -756,7 -630,7 +756,7 @@@ static int btrfs_fill_super(struct supe
  {
        struct inode *inode;
        struct dentry *root_dentry;
 -      struct btrfs_root *tree_root;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_key key;
        int err;
  
        sb->s_flags |= MS_POSIXACL;
  #endif
  
 -      tree_root = open_ctree(sb, fs_devices, (char *)data);
 -
 -      if (IS_ERR(tree_root)) {
 +      err = open_ctree(sb, fs_devices, (char *)data);
 +      if (err) {
                printk("btrfs: open_ctree failed\n");
 -              return PTR_ERR(tree_root);
 +              return err;
        }
 -      sb->s_fs_info = tree_root;
  
        key.objectid = BTRFS_FIRST_FREE_OBJECTID;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
 -      inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
 +      inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto fail_close;
  
        save_mount_options(sb, data);
        cleancache_init_fs(sb);
 +      sb->s_flags |= MS_ACTIVE;
        return 0;
  
  fail_close:
 -      close_ctree(tree_root);
 +      close_ctree(fs_info->tree_root);
        return err;
  }
  
  int btrfs_sync_fs(struct super_block *sb, int wait)
  {
        struct btrfs_trans_handle *trans;
 -      struct btrfs_root *root = btrfs_sb(sb);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 +      struct btrfs_root *root = fs_info->tree_root;
        int ret;
  
        trace_btrfs_sync_fs(wait);
  
        if (!wait) {
 -              filemap_flush(root->fs_info->btree_inode->i_mapping);
 +              filemap_flush(fs_info->btree_inode->i_mapping);
                return 0;
        }
  
        return ret;
  }
  
 -static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 +static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
  {
 -      struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
 -      struct btrfs_fs_info *info = root->fs_info;
 +      struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
 +      struct btrfs_root *root = info->tree_root;
        char *compress_type;
  
        if (btrfs_test_opt(root, DEGRADED))
                seq_puts(seq, ",inode_cache");
        if (btrfs_test_opt(root, SKIP_BALANCE))
                seq_puts(seq, ",skip_balance");
 +      if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
 +              seq_puts(seq, ",fatal_errors=panic");
        return 0;
  }
  
  static int btrfs_test_super(struct super_block *s, void *data)
  {
 -      struct btrfs_root *test_root = data;
 -      struct btrfs_root *root = btrfs_sb(s);
 +      struct btrfs_fs_info *p = data;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(s);
  
 -      /*
 -       * If this super block is going away, return false as it
 -       * can't match as an existing super block.
 -       */
 -      if (!atomic_read(&s->s_active))
 -              return 0;
 -      return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
 +      return fs_info->fs_devices == p->fs_devices;
  }
  
  static int btrfs_set_super(struct super_block *s, void *data)
  {
 -      s->s_fs_info = data;
 -
 -      return set_anon_super(s, data);
 +      int err = set_anon_super(s, data);
 +      if (!err)
 +              s->s_fs_info = data;
 +      return err;
  }
  
  /*
@@@ -1070,6 -947,12 +1070,6 @@@ static struct dentry *btrfs_mount(struc
        if (!fs_info)
                return ERR_PTR(-ENOMEM);
  
 -      fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
 -      if (!fs_info->tree_root) {
 -              error = -ENOMEM;
 -              goto error_fs_info;
 -      }
 -      fs_info->tree_root->fs_info = fs_info;
        fs_info->fs_devices = fs_devices;
  
        fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
        }
  
        bdev = fs_devices->latest_bdev;
 -      s = sget(fs_type, btrfs_test_super, btrfs_set_super,
 -               fs_info->tree_root);
 +      s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
        if (IS_ERR(s)) {
                error = PTR_ERR(s);
                goto error_close_devices;
        }
  
        if (s->s_root) {
 -              if ((flags ^ s->s_flags) & MS_RDONLY) {
 -                      deactivate_locked_super(s);
 -                      error = -EBUSY;
 -                      goto error_close_devices;
 -              }
 -
                btrfs_close_devices(fs_devices);
                free_fs_info(fs_info);
 +              if ((flags ^ s->s_flags) & MS_RDONLY)
 +                      error = -EBUSY;
        } else {
                char b[BDEVNAME_SIZE];
  
                s->s_flags = flags | MS_NOSEC;
                strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 -              btrfs_sb(s)->fs_info->bdev_holder = fs_type;
 +              btrfs_sb(s)->bdev_holder = fs_type;
                error = btrfs_fill_super(s, fs_devices, data,
                                         flags & MS_SILENT ? 1 : 0);
 -              if (error) {
 -                      deactivate_locked_super(s);
 -                      return ERR_PTR(error);
 -              }
 -
 -              s->s_flags |= MS_ACTIVE;
        }
  
 -      root = get_default_root(s, subvol_objectid);
 -      if (IS_ERR(root)) {
 +      root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
 +      if (IS_ERR(root))
                deactivate_locked_super(s);
 -              return root;
 -      }
  
        return root;
  
@@@ -1125,22 -1021,12 +1125,22 @@@ error_fs_info
  
  static int btrfs_remount(struct super_block *sb, int *flags, char *data)
  {
 -      struct btrfs_root *root = btrfs_sb(sb);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 +      struct btrfs_root *root = fs_info->tree_root;
 +      unsigned old_flags = sb->s_flags;
 +      unsigned long old_opts = fs_info->mount_opt;
 +      unsigned long old_compress_type = fs_info->compress_type;
 +      u64 old_max_inline = fs_info->max_inline;
 +      u64 old_alloc_start = fs_info->alloc_start;
 +      int old_thread_pool_size = fs_info->thread_pool_size;
 +      unsigned int old_metadata_ratio = fs_info->metadata_ratio;
        int ret;
  
        ret = btrfs_parse_options(root, data);
 -      if (ret)
 -              return -EINVAL;
 +      if (ret) {
 +              ret = -EINVAL;
 +              goto restore;
 +      }
  
        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                return 0;
        if (*flags & MS_RDONLY) {
                sb->s_flags |= MS_RDONLY;
  
 -              ret =  btrfs_commit_super(root);
 -              WARN_ON(ret);
 +              ret = btrfs_commit_super(root);
 +              if (ret)
 +                      goto restore;
        } else {
 -              if (root->fs_info->fs_devices->rw_devices == 0)
 -                      return -EACCES;
 +              if (fs_info->fs_devices->rw_devices == 0) {
 +                      ret = -EACCES;
 +                      goto restore;
 +              }
  
 -              if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
 -                      return -EINVAL;
 +              if (btrfs_super_log_root(fs_info->super_copy) != 0) {
 +                      ret = -EINVAL;
 +                      goto restore;
 +              }
  
 -              ret = btrfs_cleanup_fs_roots(root->fs_info);
 -              WARN_ON(ret);
 +              ret = btrfs_cleanup_fs_roots(fs_info);
 +              if (ret)
 +                      goto restore;
  
                /* recover relocation */
                ret = btrfs_recover_relocation(root);
 -              WARN_ON(ret);
 +              if (ret)
 +                      goto restore;
  
                sb->s_flags &= ~MS_RDONLY;
        }
  
        return 0;
 +
 +restore:
 +      /* We've hit an error - don't reset MS_RDONLY */
 +      if (sb->s_flags & MS_RDONLY)
 +              old_flags |= MS_RDONLY;
 +      sb->s_flags = old_flags;
 +      fs_info->mount_opt = old_opts;
 +      fs_info->compress_type = old_compress_type;
 +      fs_info->max_inline = old_max_inline;
 +      fs_info->alloc_start = old_alloc_start;
 +      fs_info->thread_pool_size = old_thread_pool_size;
 +      fs_info->metadata_ratio = old_metadata_ratio;
 +      return ret;
  }
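
The reworked btrfs_remount() above drops the old WARN_ON()-and-continue
style: every tunable is snapshotted up front, and the restore: label rolls
all of them back if any step fails. The same shape in miniature; struct
opts and apply_new_opts() are hypothetical stand-ins, not btrfs code:

	#include <linux/types.h>

	struct opts {
		unsigned long mount_opt;
		u64 max_inline;
	};

	static int apply_new_opts(struct opts *cur);	/* hypothetical; may fail midway */

	static int remount_sketch(struct opts *cur)
	{
		struct opts old = *cur;	/* snapshot before changing anything */
		int ret;

		ret = apply_new_opts(cur);
		if (ret)
			*cur = old;	/* failure: roll back to the snapshot */
		return ret;
	}

Restoring from a local copy keeps the error path trivially correct as new
mount options are added, which is the point of the new restore: label.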
  
  /* Used to sort the devices by max_avail (descending sort) */
@@@ -1344,18 -1212,18 +1344,18 @@@ static int btrfs_calc_avail_data_space(
  
  static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
  {
 -      struct btrfs_root *root = btrfs_sb(dentry->d_sb);
 -      struct btrfs_super_block *disk_super = root->fs_info->super_copy;
 -      struct list_head *head = &root->fs_info->space_info;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
 +      struct btrfs_super_block *disk_super = fs_info->super_copy;
 +      struct list_head *head = &fs_info->space_info;
        struct btrfs_space_info *found;
        u64 total_used = 0;
        u64 total_free_data = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
 -      __be32 *fsid = (__be32 *)root->fs_info->fsid;
 +      __be32 *fsid = (__be32 *)fs_info->fsid;
        int ret;
  
        /* holding chunk_mutex to avoid allocating new chunks */
 -      mutex_lock(&root->fs_info->chunk_mutex);
 +      mutex_lock(&fs_info->chunk_mutex);
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_type = BTRFS_SUPER_MAGIC;
        buf->f_bavail = total_free_data;
 -      ret = btrfs_calc_avail_data_space(root, &total_free_data);
 +      ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
        if (ret) {
 -              mutex_unlock(&root->fs_info->chunk_mutex);
 +              mutex_unlock(&fs_info->chunk_mutex);
                return ret;
        }
        buf->f_bavail += total_free_data;
        buf->f_bavail = buf->f_bavail >> bits;
 -      mutex_unlock(&root->fs_info->chunk_mutex);
 +      mutex_unlock(&fs_info->chunk_mutex);
  
        /* We treat it as constant endianness (it doesn't matter _which_)
           because we want the fsid to come out the same whether mounted
           under a big-endian or little-endian host */
        return 0;
  }
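
The conversion above reports byte counters shifted down by
s_blocksize_bits; because the block size is a power of two, the right
shift is an exact division into f_bsize units. A small worked example
(the 4 KiB block size is an assumption for illustration):

	u64 bytes = 8ULL << 30;		/* 8 GiB of free data */
	int bits = 12;			/* s_blocksize_bits for 4 KiB blocks */
	u64 blocks = bytes >> bits;	/* 8 GiB / 4 KiB = 2097152 blocks */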
  
 +static void btrfs_kill_super(struct super_block *sb)
 +{
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 +      kill_anon_super(sb);
 +      free_fs_info(fs_info);
 +}
 +
  static struct file_system_type btrfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "btrfs",
        .mount          = btrfs_mount,
 -      .kill_sb        = kill_anon_super,
 +      .kill_sb        = btrfs_kill_super,
        .fs_flags       = FS_REQUIRES_DEV,
  };
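
Since fs_info is now allocated before the superblock itself,
kill_anon_super() alone would leak it; btrfs_kill_super() therefore grabs
the private pointer first, tears down the VFS superblock, and frees the
info last. A sketch of the same wrapper shape for a hypothetical
filesystem (all myfs_* names are invented):

	#include <linux/fs.h>
	#include <linux/slab.h>

	struct myfs_info {
		unsigned long flags;	/* stand-in for per-mount state */
	};

	static void myfs_kill_super(struct super_block *sb)
	{
		struct myfs_info *info = sb->s_fs_info;

		kill_anon_super(sb);	/* superblock is gone after this */
		kfree(info);		/* private data outlives it; free last */
	}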
  
@@@ -1440,17 -1301,17 +1440,17 @@@ static long btrfs_control_ioctl(struct 
  
  static int btrfs_freeze(struct super_block *sb)
  {
 -      struct btrfs_root *root = btrfs_sb(sb);
 -      mutex_lock(&root->fs_info->transaction_kthread_mutex);
 -      mutex_lock(&root->fs_info->cleaner_mutex);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 +      mutex_lock(&fs_info->transaction_kthread_mutex);
 +      mutex_lock(&fs_info->cleaner_mutex);
        return 0;
  }
  
  static int btrfs_unfreeze(struct super_block *sb)
  {
 -      struct btrfs_root *root = btrfs_sb(sb);
 -      mutex_unlock(&root->fs_info->cleaner_mutex);
 -      mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 +      mutex_unlock(&fs_info->cleaner_mutex);
 +      mutex_unlock(&fs_info->transaction_kthread_mutex);
        return 0;
  }
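
btrfs_freeze() parks the transaction and cleaner kthreads by taking their
mutexes and deliberately returns with both held; btrfs_unfreeze() releases
them in reverse acquisition order. The hold-across-calls shape in
isolation (the two mutexes here are hypothetical placeholders):

	#include <linux/mutex.h>

	static DEFINE_MUTEX(kthread_a);
	static DEFINE_MUTEX(kthread_b);

	static int freeze_sketch(void)
	{
		mutex_lock(&kthread_a);		/* both stay held on return */
		mutex_lock(&kthread_b);
		return 0;
	}

	static int unfreeze_sketch(void)
	{
		mutex_unlock(&kthread_b);	/* reverse acquisition order */
		mutex_unlock(&kthread_a);
		return 0;
	}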
  
@@@ -1515,7 -1376,9 +1515,7 @@@ static int __init init_btrfs_fs(void
        if (err)
                return err;
  
 -      err = btrfs_init_compress();
 -      if (err)
 -              goto free_sysfs;
 +      btrfs_init_compress();
  
        err = btrfs_init_cachep();
        if (err)
        if (err)
                goto unregister_ioctl;
  
+       btrfs_init_lockdep();
        printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
        return 0;
  
@@@ -1556,6 -1421,7 +1558,6 @@@ free_cachep
        btrfs_destroy_cachep();
  free_compress:
        btrfs_exit_compress();
 -free_sysfs:
        btrfs_exit_sysfs();
        return err;
  }
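
With btrfs_init_compress() no longer able to fail, its free_sysfs unwind
label goes away and the error ladder loses one rung. The general pattern:
each init step that succeeds earns exactly one matching teardown, taken in
reverse order when a later step fails. In miniature (init_a, init_b and
exit_a are hypothetical):

	static int init_a(void), init_b(void);	/* hypothetical init steps */
	static void exit_a(void);		/* matching teardown for init_a */

	static int __init init_sketch(void)
	{
		int err;

		err = init_a();
		if (err)
			return err;	/* nothing succeeded, nothing to undo */

		err = init_b();
		if (err)
			goto undo_a;	/* undo only what already succeeded */

		return 0;

	undo_a:
		exit_a();
		return err;
	}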