Merge git://git.jan-o-sch.net/btrfs-unstable into for-linus
author Chris Mason <chris.mason@oracle.com>
Thu, 29 Mar 2012 00:33:40 +0000 (20:33 -0400)
committer Chris Mason <chris.mason@oracle.com>
Thu, 29 Mar 2012 00:33:40 +0000 (20:33 -0400)
Conflicts:
fs/btrfs/transaction.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/backref.c
fs/btrfs/ioctl.c
fs/btrfs/scrub.c
fs/btrfs/super.c

diff --combined fs/btrfs/backref.c
index 4c79547f4a0c6db8eb65179ae7221c714304a9df,56136d9046f099b702d77e95cdf160ce8af85e2c..f4e90748940abf6c1f36f3186177e4640bd24546
@@@ -116,6 -116,7 +116,7 @@@ add_parent
   * to a logical address
   */
  static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+                                       int search_commit_root,
                                        struct __prelim_ref *ref,
                                        struct ulist *parents)
  {
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+       path->search_commit_root = !!search_commit_root;
  
        root_key.objectid = ref->root_id;
        root_key.type = BTRFS_ROOT_ITEM_KEY;
@@@ -188,6 -190,7 +190,7 @@@ out
   * resolve all indirect backrefs from the list
   */
  static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+                                  int search_commit_root,
                                   struct list_head *head)
  {
        int err;
                        continue;
                if (ref->count == 0)
                        continue;
-               err = __resolve_indirect_ref(fs_info, ref, parents);
+               err = __resolve_indirect_ref(fs_info, search_commit_root,
+                                            ref, parents);
                if (err) {
                        if (ret == 0)
                                ret = err;
@@@ -586,6 -590,7 +590,7 @@@ static int find_parent_nodes(struct btr
        struct btrfs_delayed_ref_head *head;
        int info_level = 0;
        int ret;
+       int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
        struct list_head prefs_delayed;
        struct list_head prefs;
        struct __prelim_ref *ref;
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+       path->search_commit_root = !!search_commit_root;
  
        /*
         * grab both a lock on the path and a lock on the delayed ref head.
@@@ -614,35 -620,39 +620,39 @@@ again
                goto out;
        BUG_ON(ret == 0);
  
-       /*
-        * look if there are updates for this ref queued and lock the head
-        */
-       delayed_refs = &trans->transaction->delayed_refs;
-       spin_lock(&delayed_refs->lock);
-       head = btrfs_find_delayed_ref_head(trans, bytenr);
-       if (head) {
-               if (!mutex_trylock(&head->mutex)) {
-                       atomic_inc(&head->node.refs);
-                       spin_unlock(&delayed_refs->lock);
-                       btrfs_release_path(path);
-                       /*
-                        * Mutex was contended, block until it's
-                        * released and try again
-                        */
-                       mutex_lock(&head->mutex);
-                       mutex_unlock(&head->mutex);
-                       btrfs_put_delayed_ref(&head->node);
-                       goto again;
-               }
-               ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
-               if (ret) {
-                       spin_unlock(&delayed_refs->lock);
-                       goto out;
+       if (trans != BTRFS_BACKREF_SEARCH_COMMIT_ROOT) {
+               /*
+                * look if there are updates for this ref queued and lock the
+                * head
+                */
+               delayed_refs = &trans->transaction->delayed_refs;
+               spin_lock(&delayed_refs->lock);
+               head = btrfs_find_delayed_ref_head(trans, bytenr);
+               if (head) {
+                       if (!mutex_trylock(&head->mutex)) {
+                               atomic_inc(&head->node.refs);
+                               spin_unlock(&delayed_refs->lock);
+                               btrfs_release_path(path);
+                               /*
+                                * Mutex was contended, block until it's
+                                * released and try again
+                                */
+                               mutex_lock(&head->mutex);
+                               mutex_unlock(&head->mutex);
+                               btrfs_put_delayed_ref(&head->node);
+                               goto again;
+                       }
+                       ret = __add_delayed_refs(head, seq, &info_key,
+                                                &prefs_delayed);
+                       if (ret) {
+                               spin_unlock(&delayed_refs->lock);
+                               goto out;
+                       }
                }
+               spin_unlock(&delayed_refs->lock);
        }
-       spin_unlock(&delayed_refs->lock);
  
        if (path->slots[0]) {
                struct extent_buffer *leaf;
        if (ret)
                goto out;
  
-       ret = __resolve_indirect_refs(fs_info, &prefs);
+       ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs);
        if (ret)
                goto out;
  
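The hunks above make find_parent_nodes() derive search_commit_root from the
transaction handle itself: callers that want to search the commit root pass a
sentinel value instead of a real handle. A minimal sketch of the mechanism,
assuming the sentinel is a cast null pointer defined in backref.h (that
definition is not part of this diff):

	/* assumed definition, consistent with the pointer comparisons above */
	#define BTRFS_BACKREF_SEARCH_COMMIT_ROOT ((struct btrfs_trans_handle *)0)

	/* with no real transaction there are no delayed refs to merge in,
	 * so the delayed-ref head lookup is skipped and the btrfs_path is
	 * pointed at the commit root instead */
	int search_commit_root = (trans == BTRFS_BACKREF_SEARCH_COMMIT_ROOT);
	path->search_commit_root = !!search_commit_root;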
@@@ -1074,8 -1084,7 +1084,7 @@@ int tree_backref_for_extent(unsigned lo
        return 0;
  }
  
- static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
-                               struct btrfs_path *path, u64 logical,
+ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical,
                                u64 orig_extent_item_objectid,
                                u64 extent_item_pos, u64 root,
                                iterate_extent_inodes_t *iterate, void *ctx)
   * calls iterate() for every inode that references the extent identified by
   * the given parameters.
   * when the iterator function returns a non-zero value, iteration stops.
-  * path is guaranteed to be in released state when iterate() is called.
   */
  int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
-                               struct btrfs_path *path,
                                u64 extent_item_objectid, u64 extent_item_pos,
+                               int search_commit_root,
                                iterate_extent_inodes_t *iterate, void *ctx)
  {
        int ret;
        struct list_head data_refs = LIST_HEAD_INIT(data_refs);
        struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
        struct btrfs_trans_handle *trans;
-       struct ulist *refs;
-       struct ulist *roots;
+       struct ulist *refs = NULL;
+       struct ulist *roots = NULL;
        struct ulist_node *ref_node = NULL;
        struct ulist_node *root_node = NULL;
        struct seq_list seq_elem;
-       struct btrfs_delayed_ref_root *delayed_refs;
-       trans = btrfs_join_transaction(fs_info->extent_root);
-       if (IS_ERR(trans))
-               return PTR_ERR(trans);
+       struct btrfs_delayed_ref_root *delayed_refs = NULL;
  
        pr_debug("resolving all inodes for extent %llu\n",
                        extent_item_objectid);
  
-       delayed_refs = &trans->transaction->delayed_refs;
-       spin_lock(&delayed_refs->lock);
-       btrfs_get_delayed_seq(delayed_refs, &seq_elem);
-       spin_unlock(&delayed_refs->lock);
+       if (search_commit_root) {
+               trans = BTRFS_BACKREF_SEARCH_COMMIT_ROOT;
+       } else {
+               trans = btrfs_join_transaction(fs_info->extent_root);
+               if (IS_ERR(trans))
+                       return PTR_ERR(trans);
+               delayed_refs = &trans->transaction->delayed_refs;
+               spin_lock(&delayed_refs->lock);
+               btrfs_get_delayed_seq(delayed_refs, &seq_elem);
+               spin_unlock(&delayed_refs->lock);
+       }
  
        ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
                                   extent_item_pos, seq_elem.seq,
                while (!ret && (root_node = ulist_next(roots, root_node))) {
                        pr_debug("root %llu references leaf %llu\n",
                                        root_node->val, ref_node->val);
-                       ret = iterate_leaf_refs(fs_info, path, ref_node->val,
+                       ret = iterate_leaf_refs(fs_info, ref_node->val,
                                                extent_item_objectid,
                                                extent_item_pos, root_node->val,
                                                iterate, ctx);
        ulist_free(refs);
        ulist_free(roots);
  out:
-       btrfs_put_delayed_seq(delayed_refs, &seq_elem);
-       btrfs_end_transaction(trans, fs_info->extent_root);
+       if (!search_commit_root) {
+               btrfs_put_delayed_seq(delayed_refs, &seq_elem);
+               btrfs_end_transaction(trans, fs_info->extent_root);
+       }
        return ret;
  }
  
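For illustration, a caller-side sketch of the reworked iterate_extent_inodes()
interface. The callback type follows iterate_extent_inodes_t as used by
build_ino_list() and scrub_print_warning_inode() elsewhere in this merge; the
helper below is hypothetical:

	/* hypothetical callback: count every inode referencing the extent;
	 * returning non-zero would stop the iteration early */
	static int count_inode_refs(u64 inum, u64 offset, u64 root, void *ctx)
	{
		u64 *count = ctx;

		(*count)++;
		return 0;
	}

	/* search_commit_root=1 resolves against the commit root without
	 * joining a transaction, as the scrub caller below does */
	u64 count = 0;
	ret = iterate_extent_inodes(fs_info, extent_item_objectid,
				    extent_item_pos, 1 /* search_commit_root */,
				    count_inode_refs, &count);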
@@@ -1210,6 -1225,7 +1225,7 @@@ int iterate_inodes_from_logical(u64 log
        int ret;
        u64 extent_item_pos;
        struct btrfs_key found_key;
+       int search_commit_root = path->search_commit_root;
  
        ret = extent_from_logical(fs_info, logical, path,
                                        &found_key);
                return ret;
  
        extent_item_pos = logical - found_key.objectid;
-       ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
-                                       extent_item_pos, iterate, ctx);
+       ret = iterate_extent_inodes(fs_info, found_key.objectid,
+                                       extent_item_pos, search_commit_root,
+                                       iterate, ctx);
  
        return ret;
  }
@@@ -1342,6 -1359,12 +1359,6 @@@ int paths_from_inode(u64 inum, struct i
                                inode_to_path, ipath);
  }
  
 -/*
 - * allocates space to return multiple file system paths for an inode.
 - * total_bytes to allocate are passed, note that space usable for actual path
 - * information will be total_bytes - sizeof(struct inode_fs_paths).
 - * the returned pointer must be freed with free_ipath() in the end.
 - */
  struct btrfs_data_container *init_data_container(u32 total_bytes)
  {
        struct btrfs_data_container *data;
@@@ -1397,6 -1420,5 +1414,6 @@@ struct inode_fs_paths *init_ipath(s32 t
  
  void free_ipath(struct inode_fs_paths *ipath)
  {
 +      kfree(ipath->fspath);
        kfree(ipath);
  }
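The free_ipath() change plugs a small leak: init_ipath() allocates the fspath
data container separately (via init_data_container()), so both allocations
must be released. A usage sketch under that assumption:

	struct inode_fs_paths *ipath;

	/* allocates both the inode_fs_paths struct and ipath->fspath */
	ipath = init_ipath(4096, fs_root, path);
	if (IS_ERR(ipath))
		return PTR_ERR(ipath);
	/* ... resolve paths, e.g. via paths_from_inode(inum, ipath) ... */
	free_ipath(ipath);	/* now frees ipath->fspath as well */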
diff --combined fs/btrfs/ioctl.c
index 20580920071470168a645db08fe60f84166ad5af,013c6371e3e8d707ba2020cd385b2f5067adabaa..a979ab7d396746413348fa7ad3ee72e32d59da0e
@@@ -206,7 -206,7 +206,7 @@@ static int btrfs_ioctl_setflags(struct 
                }
        }
  
 -      ret = mnt_want_write(file->f_path.mnt);
 +      ret = mnt_want_write_file(file);
        if (ret)
                goto out_unlock;
  
                inode->i_flags = i_oldflags;
        }
  
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
   out_unlock:
        mutex_unlock(&inode->i_mutex);
        return ret;
@@@ -286,13 -286,14 +286,13 @@@ static int btrfs_ioctl_getversion(struc
  
  static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
  {
 -      struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
 -      struct btrfs_fs_info *fs_info = root->fs_info;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
        struct btrfs_device *device;
        struct request_queue *q;
        struct fstrim_range range;
        u64 minlen = ULLONG_MAX;
        u64 num_devices = 0;
 -      u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
 +      u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
        int ret;
  
        if (!capable(CAP_SYS_ADMIN))
  
        range.len = min(range.len, total_bytes - range.start);
        range.minlen = max(range.minlen, minlen);
 -      ret = btrfs_trim_fs(root, &range);
 +      ret = btrfs_trim_fs(fs_info->tree_root, &range);
        if (ret < 0)
                return ret;
  
@@@ -425,37 -426,22 +425,37 @@@ static noinline int create_subvol(struc
  
        key.offset = (u64)-1;
        new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
 -      BUG_ON(IS_ERR(new_root));
 +      if (IS_ERR(new_root)) {
 +              btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
 +              ret = PTR_ERR(new_root);
 +              goto fail;
 +      }
  
        btrfs_record_root_in_trans(trans, new_root);
  
        ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
 +      if (ret) {
 +              /* We potentially lose an unused inode item here */
 +              btrfs_abort_transaction(trans, root, ret);
 +              goto fail;
 +      }
 +
        /*
         * insert the directory item
         */
        ret = btrfs_set_inode_index(dir, &index);
 -      BUG_ON(ret);
 +      if (ret) {
 +              btrfs_abort_transaction(trans, root, ret);
 +              goto fail;
 +      }
  
        ret = btrfs_insert_dir_item(trans, root,
                                    name, namelen, dir, &key,
                                    BTRFS_FT_DIR, index);
 -      if (ret)
 +      if (ret) {
 +              btrfs_abort_transaction(trans, root, ret);
                goto fail;
 +      }
  
        btrfs_i_size_write(dir, dir->i_size + namelen * 2);
        ret = btrfs_update_inode(trans, root, dir);
@@@ -812,9 -798,9 +812,9 @@@ static int should_defrag_range(struct i
  
        if (!em) {
                /* get the big lock and read metadata off disk */
 -              lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
 +              lock_extent(io_tree, start, start + len - 1);
                em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 -              unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
 +              unlock_extent(io_tree, start, start + len - 1);
  
                if (IS_ERR(em))
                        return 0;
@@@ -902,10 -888,10 +902,10 @@@ again
                page_start = page_offset(page);
                page_end = page_start + PAGE_CACHE_SIZE - 1;
                while (1) {
 -                      lock_extent(tree, page_start, page_end, GFP_NOFS);
 +                      lock_extent(tree, page_start, page_end);
                        ordered = btrfs_lookup_ordered_extent(inode,
                                                              page_start);
 -                      unlock_extent(tree, page_start, page_end, GFP_NOFS);
 +                      unlock_extent(tree, page_start, page_end);
                        if (!ordered)
                                break;
  
        page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
  
        lock_extent_bits(&BTRFS_I(inode)->io_tree,
 -                       page_start, page_end - 1, 0, &cached_state,
 -                       GFP_NOFS);
 +                       page_start, page_end - 1, 0, &cached_state);
        clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
                          page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
                          EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
@@@ -1896,7 -1883,7 +1896,7 @@@ static noinline int btrfs_ioctl_snap_de
                goto out;
        }
  
 -      err = mnt_want_write(file->f_path.mnt);
 +      err = mnt_want_write_file(file);
        if (err)
                goto out;
  
                                dest->root_key.objectid,
                                dentry->d_name.name,
                                dentry->d_name.len);
 -      BUG_ON(ret);
 +      if (ret) {
 +              err = ret;
 +              btrfs_abort_transaction(trans, root, ret);
 +              goto out_end_trans;
 +      }
  
        btrfs_record_root_in_trans(trans, dest);
  
                ret = btrfs_insert_orphan_item(trans,
                                        root->fs_info->tree_root,
                                        dest->root_key.objectid);
 -              BUG_ON(ret);
 +              if (ret) {
 +                      btrfs_abort_transaction(trans, root, ret);
 +                      err = ret;
 +                      goto out_end_trans;
 +              }
        }
 -
 +out_end_trans:
        ret = btrfs_end_transaction(trans, root);
 -      BUG_ON(ret);
 +      if (ret && !err)
 +              err = ret;
        inode->i_flags |= S_DEAD;
  out_up_write:
        up_write(&root->fs_info->subvol_sem);
@@@ -2021,7 -1999,7 +2021,7 @@@ out_dput
        dput(dentry);
  out_unlock_dir:
        mutex_unlock(&dir->i_mutex);
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
  out:
        kfree(vol_args);
        return err;
@@@ -2037,7 -2015,7 +2037,7 @@@ static int btrfs_ioctl_defrag(struct fi
        if (btrfs_root_readonly(root))
                return -EROFS;
  
 -      ret = mnt_want_write(file->f_path.mnt);
 +      ret = mnt_want_write_file(file);
        if (ret)
                return ret;
  
                ret = -EINVAL;
        }
  out:
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
        return ret;
  }
  
@@@ -2267,7 -2245,7 +2267,7 @@@ static noinline long btrfs_ioctl_clone(
        if (btrfs_root_readonly(root))
                return -EROFS;
  
 -      ret = mnt_want_write(file->f_path.mnt);
 +      ret = mnt_want_write_file(file);
        if (ret)
                return ret;
  
           another, and lock file content */
        while (1) {
                struct btrfs_ordered_extent *ordered;
 -              lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
 +              lock_extent(&BTRFS_I(src)->io_tree, off, off+len);
                ordered = btrfs_lookup_first_ordered_extent(src, off+len);
                if (!ordered &&
                    !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
                                   EXTENT_DELALLOC, 0, NULL))
                        break;
 -              unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
 +              unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
                if (ordered)
                        btrfs_put_ordered_extent(ordered);
                btrfs_wait_ordered_range(src, off, len);
                                                         new_key.offset,
                                                         new_key.offset + datal,
                                                         &hint_byte, 1);
 -                              BUG_ON(ret);
 +                              if (ret) {
 +                                      btrfs_abort_transaction(trans, root,
 +                                                              ret);
 +                                      btrfs_end_transaction(trans, root);
 +                                      goto out;
 +                              }
  
                                ret = btrfs_insert_empty_item(trans, root, path,
                                                              &new_key, size);
 -                              BUG_ON(ret);
 +                              if (ret) {
 +                                      btrfs_abort_transaction(trans, root,
 +                                                              ret);
 +                                      btrfs_end_transaction(trans, root);
 +                                      goto out;
 +                              }
  
                                leaf = path->nodes[0];
                                slot = path->slots[0];
                                                        btrfs_ino(inode),
                                                        new_key.offset - datao,
                                                        0);
 -                                      BUG_ON(ret);
 +                                      if (ret) {
 +                                              btrfs_abort_transaction(trans,
 +                                                                      root,
 +                                                                      ret);
 +                                              btrfs_end_transaction(trans,
 +                                                                    root);
 +                                              goto out;
 +                                      }
                                }
                        } else if (type == BTRFS_FILE_EXTENT_INLINE) {
                                u64 skip = 0;
                                                         new_key.offset,
                                                         new_key.offset + datal,
                                                         &hint_byte, 1);
 -                              BUG_ON(ret);
 +                              if (ret) {
 +                                      btrfs_abort_transaction(trans, root,
 +                                                              ret);
 +                                      btrfs_end_transaction(trans, root);
 +                                      goto out;
 +                              }
  
                                ret = btrfs_insert_empty_item(trans, root, path,
                                                              &new_key, size);
 -                              BUG_ON(ret);
 +                              if (ret) {
 +                                      btrfs_abort_transaction(trans, root,
 +                                                              ret);
 +                                      btrfs_end_transaction(trans, root);
 +                                      goto out;
 +                              }
  
                                if (skip) {
                                        u32 start =
                                btrfs_i_size_write(inode, endoff);
  
                        ret = btrfs_update_inode(trans, root, inode);
 -                      BUG_ON(ret);
 -                      btrfs_end_transaction(trans, root);
 +                      if (ret) {
 +                              btrfs_abort_transaction(trans, root, ret);
 +                              btrfs_end_transaction(trans, root);
 +                              goto out;
 +                      }
 +                      ret = btrfs_end_transaction(trans, root);
                }
  next:
                btrfs_release_path(path);
        ret = 0;
  out:
        btrfs_release_path(path);
 -      unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
 +      unlock_extent(&BTRFS_I(src)->io_tree, off, off+len);
  out_unlock:
        mutex_unlock(&src->i_mutex);
        mutex_unlock(&inode->i_mutex);
  out_fput:
        fput(src_file);
  out_drop_write:
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
        return ret;
  }
  
@@@ -2654,7 -2600,7 +2654,7 @@@ static long btrfs_ioctl_trans_start(str
        if (btrfs_root_readonly(root))
                goto out;
  
 -      ret = mnt_want_write(file->f_path.mnt);
 +      ret = mnt_want_write_file(file);
        if (ret)
                goto out;
  
  
  out_drop:
        atomic_dec(&root->fs_info->open_ioctl_trans);
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
  out:
        return ret;
  }
@@@ -2905,7 -2851,7 +2905,7 @@@ long btrfs_ioctl_trans_end(struct file 
  
        atomic_dec(&root->fs_info->open_ioctl_trans);
  
 -      mnt_drop_write(file->f_path.mnt);
 +      mnt_drop_write_file(file);
        return 0;
  }
  
@@@ -3121,8 -3067,8 +3121,8 @@@ static long btrfs_ioctl_logical_to_ino(
                goto out;
  
        extent_item_pos = loi->logical - key.objectid;
-       ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
-                                       extent_item_pos, build_ino_list,
+       ret = iterate_extent_inodes(root->fs_info, key.objectid,
+                                       extent_item_pos, 0, build_ino_list,
                                        inodes);
  
        if (ret < 0)
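Throughout fs/btrfs/ioctl.c the mnt_want_write(file->f_path.mnt) /
mnt_drop_write(file->f_path.mnt) pairs are switched to the file-based VFS
helpers. The resulting pattern, sketched:

	ret = mnt_want_write_file(file);	/* take write access to the mount */
	if (ret)
		return ret;
	/* ... perform the write side of the ioctl ... */
	mnt_drop_write_file(file);		/* always paired with the want call */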
diff --combined fs/btrfs/scrub.c
index 07e59d97551a28bcfb477a98152a8f2ee89e56cf,b9b84cdfc3591c0b43f07f89a232eaa51d9acd0d..c9a2c1aef4bd0d21dd412c6be7ab639487dab57e
   * Future enhancements:
   *  - In case an unrepairable extent is encountered, track which files are
   *    affected and report them
 - *  - In case of a read error on files with nodatasum, map the file and read
 - *    the extent to trigger a writeback of the good copy
   *  - track and record media errors, throw out bad devices
   *  - add a mode to also read unallocated space
   */
  
 -struct scrub_bio;
 -struct scrub_page;
 +struct scrub_block;
  struct scrub_dev;
 -static void scrub_bio_end_io(struct bio *bio, int err);
 -static void scrub_checksum(struct btrfs_work *work);
 -static int scrub_checksum_data(struct scrub_dev *sdev,
 -                             struct scrub_page *spag, void *buffer);
 -static int scrub_checksum_tree_block(struct scrub_dev *sdev,
 -                                   struct scrub_page *spag, u64 logical,
 -                                   void *buffer);
 -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
 -static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
 -static void scrub_fixup_end_io(struct bio *bio, int err);
 -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
 -                        struct page *page);
 -static void scrub_fixup(struct scrub_bio *sbio, int ix);
  
  #define SCRUB_PAGES_PER_BIO   16      /* 64k per bio */
  #define SCRUB_BIOS_PER_DEV    16      /* 1 MB per device in flight */
 +#define SCRUB_MAX_PAGES_PER_BLOCK     16      /* 64k per node/leaf/sector */
  
  struct scrub_page {
 +      struct scrub_block      *sblock;
 +      struct page             *page;
 +      struct block_device     *bdev;
        u64                     flags;  /* extent flags */
        u64                     generation;
 -      int                     mirror_num;
 -      int                     have_csum;
 +      u64                     logical;
 +      u64                     physical;
 +      struct {
 +              unsigned int    mirror_num:8;
 +              unsigned int    have_csum:1;
 +              unsigned int    io_error:1;
 +      };
        u8                      csum[BTRFS_CSUM_SIZE];
  };
  
@@@ -70,25 -77,12 +70,25 @@@ struct scrub_bio 
        int                     err;
        u64                     logical;
        u64                     physical;
 -      struct scrub_page       spag[SCRUB_PAGES_PER_BIO];
 -      u64                     count;
 +      struct scrub_page       *pagev[SCRUB_PAGES_PER_BIO];
 +      int                     page_count;
        int                     next_free;
        struct btrfs_work       work;
  };
  
 +struct scrub_block {
 +      struct scrub_page       pagev[SCRUB_MAX_PAGES_PER_BLOCK];
 +      int                     page_count;
 +      atomic_t                outstanding_pages;
 +      atomic_t                ref_count; /* free mem on transition to zero */
 +      struct scrub_dev        *sdev;
 +      struct {
 +              unsigned int    header_error:1;
 +              unsigned int    checksum_error:1;
 +              unsigned int    no_io_error_seen:1;
 +      };
 +};
 +
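The per-page/per-block split above implies a small ownership hierarchy; a
hedged reading of the new fields:

	/*
	 * scrub_bio:   the I/O unit; pagev[] holds up to SCRUB_PAGES_PER_BIO
	 *              pointers to scrub_pages, possibly from different blocks
	 * scrub_block: the checksum unit (one node/leaf/sector); embeds up to
	 *              SCRUB_MAX_PAGES_PER_BLOCK scrub_pages and is refcounted
	 *              via ref_count, with per-block error state bits
	 * scrub_page:  one page of data; back-pointer to its sblock plus
	 *              logical/physical addresses and per-page io_error state
	 */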
  struct scrub_dev {
        struct scrub_bio        *bios[SCRUB_BIOS_PER_DEV];
        struct btrfs_device     *dev;
        struct list_head        csum_list;
        atomic_t                cancel_req;
        int                     readonly;
 +      int                     pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
 +      u32                     sectorsize;
 +      u32                     nodesize;
 +      u32                     leafsize;
        /*
         * statistics
         */
@@@ -134,43 -124,6 +134,43 @@@ struct scrub_warning 
        int                     scratch_bufsize;
  };
  
 +
 +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 +static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 +                                   struct btrfs_mapping_tree *map_tree,
 +                                   u64 length, u64 logical,
 +                                   struct scrub_block *sblock);
 +static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 +                             struct scrub_block *sblock, int is_metadata,
 +                             int have_csum, u8 *csum, u64 generation,
 +                             u16 csum_size);
 +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 +                                       struct scrub_block *sblock,
 +                                       int is_metadata, int have_csum,
 +                                       const u8 *csum, u64 generation,
 +                                       u16 csum_size);
 +static void scrub_complete_bio_end_io(struct bio *bio, int err);
 +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 +                                           struct scrub_block *sblock_good,
 +                                           int force_write);
 +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 +                                          struct scrub_block *sblock_good,
 +                                          int page_num, int force_write);
 +static int scrub_checksum_data(struct scrub_block *sblock);
 +static int scrub_checksum_tree_block(struct scrub_block *sblock);
 +static int scrub_checksum_super(struct scrub_block *sblock);
 +static void scrub_block_get(struct scrub_block *sblock);
 +static void scrub_block_put(struct scrub_block *sblock);
 +static int scrub_add_page_to_bio(struct scrub_dev *sdev,
 +                               struct scrub_page *spage);
 +static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
 +                     u64 physical, u64 flags, u64 gen, int mirror_num,
 +                     u8 *csum, int force);
 +static void scrub_bio_end_io(struct bio *bio, int err);
 +static void scrub_bio_end_io_worker(struct btrfs_work *work);
 +static void scrub_block_complete(struct scrub_block *sblock);
 +
 +
  static void scrub_free_csums(struct scrub_dev *sdev)
  {
        while (!list_empty(&sdev->csum_list)) {
        }
  }
  
 -static void scrub_free_bio(struct bio *bio)
 -{
 -      int i;
 -      struct page *last_page = NULL;
 -
 -      if (!bio)
 -              return;
 -
 -      for (i = 0; i < bio->bi_vcnt; ++i) {
 -              if (bio->bi_io_vec[i].bv_page == last_page)
 -                      continue;
 -              last_page = bio->bi_io_vec[i].bv_page;
 -              __free_page(last_page);
 -      }
 -      bio_put(bio);
 -}
 -
  static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
  {
        int i;
        if (!sdev)
                return;
  
 +      /* this can happen when scrub is cancelled */
 +      if (sdev->curr != -1) {
 +              struct scrub_bio *sbio = sdev->bios[sdev->curr];
 +
 +              for (i = 0; i < sbio->page_count; i++) {
 +                      BUG_ON(!sbio->pagev[i]);
 +                      BUG_ON(!sbio->pagev[i]->page);
 +                      scrub_block_put(sbio->pagev[i]->sblock);
 +              }
 +              bio_put(sbio->bio);
 +      }
 +
        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
                struct scrub_bio *sbio = sdev->bios[i];
  
                if (!sbio)
                        break;
 -
 -              scrub_free_bio(sbio->bio);
                kfree(sbio);
        }
  
@@@ -219,16 -179,11 +219,16 @@@ struct scrub_dev *scrub_setup_dev(struc
        struct scrub_dev *sdev;
        int             i;
        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
 +      int pages_per_bio;
  
 +      pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
 +                            bio_get_nr_vecs(dev->bdev));
        sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
        if (!sdev)
                goto nomem;
        sdev->dev = dev;
 +      sdev->pages_per_bio = pages_per_bio;
 +      sdev->curr = -1;
        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
                struct scrub_bio *sbio;
  
  
                sbio->index = i;
                sbio->sdev = sdev;
 -              sbio->count = 0;
 -              sbio->work.func = scrub_checksum;
 +              sbio->page_count = 0;
 +              sbio->work.func = scrub_bio_end_io_worker;
  
                if (i != SCRUB_BIOS_PER_DEV-1)
                        sdev->bios[i]->next_free = i + 1;
                        sdev->bios[i]->next_free = -1;
        }
        sdev->first_free = 0;
 -      sdev->curr = -1;
 +      sdev->nodesize = dev->dev_root->nodesize;
 +      sdev->leafsize = dev->dev_root->leafsize;
 +      sdev->sectorsize = dev->dev_root->sectorsize;
        atomic_set(&sdev->in_flight, 0);
        atomic_set(&sdev->fixup_cnt, 0);
        atomic_set(&sdev->cancel_req, 0);
@@@ -341,9 -294,10 +341,9 @@@ err
        return 0;
  }
  
 -static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
 -                              int ix)
 +static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
  {
 -      struct btrfs_device *dev = sbio->sdev->dev;
 +      struct btrfs_device *dev = sblock->sdev->dev;
        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
        struct btrfs_path *path;
        struct btrfs_key found_key;
  
        swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
        swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
 -      swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
 -      swarn.logical = sbio->logical + ix * PAGE_SIZE;
 +      BUG_ON(sblock->page_count < 1);
 +      swarn.sector = (sblock->pagev[0].physical) >> 9;
 +      swarn.logical = sblock->pagev[0].logical;
        swarn.errstr = errstr;
        swarn.dev = dev;
        swarn.msg_bufsize = bufsize;
                do {
                        ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
                                                        &ref_root, &ref_level);
 -                      printk(KERN_WARNING "%s at logical %llu on dev %s, "
 +                      printk(KERN_WARNING
 +                              "btrfs: %s at logical %llu on dev %s, "
                                "sector %llu: metadata %s (level %d) in tree "
                                "%llu\n", errstr, swarn.logical, dev->name,
                                (unsigned long long)swarn.sector,
                } while (ret != 1);
        } else {
                swarn.path = path;
-               iterate_extent_inodes(fs_info, path, found_key.objectid,
-                                       extent_item_pos,
+               iterate_extent_inodes(fs_info, found_key.objectid,
+                                       extent_item_pos, 1,
                                        scrub_print_warning_inode, &swarn);
        }
  
@@@ -579,9 -531,9 +579,9 @@@ out
                spin_lock(&sdev->stat_lock);
                ++sdev->stat.uncorrectable_errors;
                spin_unlock(&sdev->stat_lock);
 -              printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
 -                                      "(nodatasum) error at logical %llu\n",
 -                                      fixup->logical);
 +              printk_ratelimited(KERN_ERR
 +                      "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 +                      (unsigned long long)fixup->logical, sdev->dev->name);
        }
  
        btrfs_free_path(path);
  }
  
  /*
 - * scrub_recheck_error gets called when either verification of the page
 - * failed or the bio failed to read, e.g. with EIO. In the latter case,
 - * recheck_error gets called for every page in the bio, even though only
 - * one may be bad
 + * scrub_handle_errored_block gets called when either verification of the
 + * pages failed or the bio failed to read, e.g. with EIO. In the latter
 + * case, this function handles all pages in the bio, even though only one
 + * may be bad.
 + * The goal of this function is to repair the errored block by using the
 + * contents of one of the mirrors.
   */
 -static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
 +static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
  {
 -      struct scrub_dev *sdev = sbio->sdev;
 -      u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
 +      struct scrub_dev *sdev = sblock_to_check->sdev;
 +      struct btrfs_fs_info *fs_info;
 +      u64 length;
 +      u64 logical;
 +      u64 generation;
 +      unsigned int failed_mirror_index;
 +      unsigned int is_metadata;
 +      unsigned int have_csum;
 +      u8 *csum;
 +      struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 +      struct scrub_block *sblock_bad;
 +      int ret;
 +      int mirror_index;
 +      int page_num;
 +      int success;
        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 -                                      DEFAULT_RATELIMIT_BURST);
 +                                    DEFAULT_RATELIMIT_BURST);
 +
 +      BUG_ON(sblock_to_check->page_count < 1);
 +      fs_info = sdev->dev->dev_root->fs_info;
 +      length = sblock_to_check->page_count * PAGE_SIZE;
 +      logical = sblock_to_check->pagev[0].logical;
 +      generation = sblock_to_check->pagev[0].generation;
 +      BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
 +      failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
 +      is_metadata = !(sblock_to_check->pagev[0].flags &
 +                      BTRFS_EXTENT_FLAG_DATA);
 +      have_csum = sblock_to_check->pagev[0].have_csum;
 +      csum = sblock_to_check->pagev[0].csum;
  
 -      if (sbio->err) {
 -              if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
 -                                 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
 -                      if (scrub_fixup_check(sbio, ix) == 0)
 -                              return 0;
 -              }
 -              if (__ratelimit(&_rs))
 -                      scrub_print_warning("i/o error", sbio, ix);
 -      } else {
 -              if (__ratelimit(&_rs))
 -                      scrub_print_warning("checksum error", sbio, ix);
 +      /*
 +       * read all mirrors one after the other. This includes re-reading
 +       * the extent or metadata block that failed (the reason this
 +       * fixup code was called), this time page by page, in order to
 +       * know which pages caused I/O errors and which ones are good
 +       * (for all mirrors).
 +       * The goal is to handle the situation where more than one
 +       * mirror contains I/O errors, but the errors do not
 +       * overlap, i.e. the data can be repaired by selecting the
 +       * pages from those mirrors without I/O error on the
 +       * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
 +       * would be that mirror #1 has an I/O error on the first page,
 +       * the second page is good, and mirror #2 has an I/O error on
 +       * the second page, but the first page is good.
 +       * Then the first page of the first mirror can be repaired by
 +       * taking the first page of the second mirror, and the
 +       * second page of the second mirror can be repaired by
 +       * copying the contents of the 2nd page of the 1st mirror.
 +       * One more note: if the pages of one mirror contain I/O
 +       * errors, the checksum cannot be verified. In order to get
 +       * the best data for repairing, the first attempt is to find
 +       * a mirror without I/O errors and with a validated checksum.
 +       * Only if this is not possible are the pages picked from
 +       * mirrors with I/O errors, without considering the checksum.
 +       * In the latter case, the checksum of the repaired area is
 +       * verified at the end in order to correctly maintain the
 +       * statistics.
 +       */
 +
 +      sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
 +                                   sizeof(*sblocks_for_recheck),
 +                                   GFP_NOFS);
 +      if (!sblocks_for_recheck) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.malloc_errors++;
 +              sdev->stat.read_errors++;
 +              sdev->stat.uncorrectable_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              goto out;
        }
  
 -      spin_lock(&sdev->stat_lock);
 -      ++sdev->stat.read_errors;
 -      spin_unlock(&sdev->stat_lock);
 +      /* setup the context, map the logical blocks and alloc the pages */
 +      ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
 +                                      logical, sblocks_for_recheck);
 +      if (ret) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.read_errors++;
 +              sdev->stat.uncorrectable_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              goto out;
 +      }
 +      BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
 +      sblock_bad = sblocks_for_recheck + failed_mirror_index;
  
 -      scrub_fixup(sbio, ix);
 -      return 1;
 -}
 +      /* build and submit the bios for the failed mirror, check checksums */
 +      ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
 +                                csum, generation, sdev->csum_size);
 +      if (ret) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.read_errors++;
 +              sdev->stat.uncorrectable_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              goto out;
 +      }
  
 -static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
 -{
 -      int ret = 1;
 -      struct page *page;
 -      void *buffer;
 -      u64 flags = sbio->spag[ix].flags;
 +      if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 +          sblock_bad->no_io_error_seen) {
 +              /*
 +               * the error disappeared after reading page by page, or
 +               * the area was part of a huge bio and other parts of the
 +               * bio caused I/O errors, or the block layer merged several
 +               * read requests into one and the error is caused by a
 +               * different bio (usually one of the two latter cases is
 +               * the cause)
 +               */
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.unverified_errors++;
 +              spin_unlock(&sdev->stat_lock);
  
 -      page = sbio->bio->bi_io_vec[ix].bv_page;
 -      buffer = kmap_atomic(page, KM_USER0);
 -      if (flags & BTRFS_EXTENT_FLAG_DATA) {
 -              ret = scrub_checksum_data(sbio->sdev,
 -                                        sbio->spag + ix, buffer);
 -      } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 -              ret = scrub_checksum_tree_block(sbio->sdev,
 -                                              sbio->spag + ix,
 -                                              sbio->logical + ix * PAGE_SIZE,
 -                                              buffer);
 -      } else {
 -              WARN_ON(1);
 +              goto out;
        }
 -      kunmap_atomic(buffer, KM_USER0);
  
 -      return ret;
 -}
 +      if (!sblock_bad->no_io_error_seen) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.read_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              if (__ratelimit(&_rs))
 +                      scrub_print_warning("i/o error", sblock_to_check);
 +      } else if (sblock_bad->checksum_error) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.csum_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              if (__ratelimit(&_rs))
 +                      scrub_print_warning("checksum error", sblock_to_check);
 +      } else if (sblock_bad->header_error) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.verify_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              if (__ratelimit(&_rs))
 +                      scrub_print_warning("checksum/header error",
 +                                          sblock_to_check);
 +      }
  
 -static void scrub_fixup_end_io(struct bio *bio, int err)
 -{
 -      complete((struct completion *)bio->bi_private);
 -}
 +      if (sdev->readonly)
 +              goto did_not_correct_error;
  
 -static void scrub_fixup(struct scrub_bio *sbio, int ix)
 -{
 -      struct scrub_dev *sdev = sbio->sdev;
 -      struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 -      struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 -      struct btrfs_bio *bbio = NULL;
 -      struct scrub_fixup_nodatasum *fixup;
 -      u64 logical = sbio->logical + ix * PAGE_SIZE;
 -      u64 length;
 -      int i;
 -      int ret;
 -      DECLARE_COMPLETION_ONSTACK(complete);
 -
 -      if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
 -          (sbio->spag[ix].have_csum == 0)) {
 -              fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
 -              if (!fixup)
 -                      goto uncorrectable;
 -              fixup->sdev = sdev;
 -              fixup->logical = logical;
 -              fixup->root = fs_info->extent_root;
 -              fixup->mirror_num = sbio->spag[ix].mirror_num;
 +      if (!is_metadata && !have_csum) {
 +              struct scrub_fixup_nodatasum *fixup_nodatasum;
 +
 +              /*
 +               * !is_metadata and !have_csum: this means that the data
 +               * might not be COW'ed, i.e. it might be modified
 +               * concurrently. The general strategy of working on the
 +               * commit root does not help when COW is not used.
 +               */
 +              fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
 +              if (!fixup_nodatasum)
 +                      goto did_not_correct_error;
 +              fixup_nodatasum->sdev = sdev;
 +              fixup_nodatasum->logical = logical;
 +              fixup_nodatasum->root = fs_info->extent_root;
 +              fixup_nodatasum->mirror_num = failed_mirror_index + 1;
                /*
                 * increment scrubs_running to prevent cancel requests from
                 * completing as long as a fixup worker is running. we must also
                atomic_inc(&fs_info->scrubs_paused);
                mutex_unlock(&fs_info->scrub_lock);
                atomic_inc(&sdev->fixup_cnt);
 -              fixup->work.func = scrub_fixup_nodatasum;
 -              btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
 -              return;
 +              fixup_nodatasum->work.func = scrub_fixup_nodatasum;
 +              btrfs_queue_worker(&fs_info->scrub_workers,
 +                                 &fixup_nodatasum->work);
 +              goto out;
        }
  
 -      length = PAGE_SIZE;
 -      ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
 -                            &bbio, 0);
 -      if (ret || !bbio || length < PAGE_SIZE) {
 -              printk(KERN_ERR
 -                     "scrub_fixup: btrfs_map_block failed us for %llu\n",
 -                     (unsigned long long)logical);
 -              WARN_ON(1);
 -              kfree(bbio);
 -              return;
 +      /*
 +       * now build and submit the bios for the other mirrors, check
 +       * checksums
 +       */
 +      for (mirror_index = 0;
 +           mirror_index < BTRFS_MAX_MIRRORS &&
 +           sblocks_for_recheck[mirror_index].page_count > 0;
 +           mirror_index++) {
 +              if (mirror_index == failed_mirror_index)
 +                      continue;
 +
 +              /* build and submit the bios, check checksums */
 +              ret = scrub_recheck_block(fs_info,
 +                                        sblocks_for_recheck + mirror_index,
 +                                        is_metadata, have_csum, csum,
 +                                        generation, sdev->csum_size);
 +              if (ret)
 +                      goto did_not_correct_error;
        }
  
 -      if (bbio->num_stripes == 1)
 -              /* there aren't any replicas */
 -              goto uncorrectable;
 +      /*
 +       * first try to pick the mirror which is completely without I/O
 +       * errors and also does not have a checksum error.
 +       * If one is found, and if a checksum is present, the full block
 +       * that is known to contain an error is rewritten. Afterwards
 +       * the block is known to be corrected.
 +       * If a mirror is found which is completely correct, and no
 +       * checksum is present, only those pages are rewritten that had
 +       * an I/O error in the block to be repaired, since it cannot be
 +       * determined which copy of the other pages is better (and it
 +       * could happen otherwise that a correct page would be
 +       * overwritten by a bad one).
 +       */
 +      for (mirror_index = 0;
 +           mirror_index < BTRFS_MAX_MIRRORS &&
 +           sblocks_for_recheck[mirror_index].page_count > 0;
 +           mirror_index++) {
 +              struct scrub_block *sblock_other = sblocks_for_recheck +
 +                                                 mirror_index;
 +
 +              if (!sblock_other->header_error &&
 +                  !sblock_other->checksum_error &&
 +                  sblock_other->no_io_error_seen) {
 +                      int force_write = is_metadata || have_csum;
 +
 +                      ret = scrub_repair_block_from_good_copy(sblock_bad,
 +                                                              sblock_other,
 +                                                              force_write);
 +                      if (0 == ret)
 +                              goto corrected_error;
 +              }
 +      }
  
        /*
 -       * first find a good copy
 +       * in case of I/O errors in the area that is supposed to be
 +       * repaired, continue by picking good copies of those pages.
 +       * Select the good pages from mirrors to rewrite bad pages from
 +       * the area to fix. Afterwards verify the checksum of the block
 +       * that is supposed to be repaired. This verification step is
 +       * only done for the purpose of statistic counting and for the
 +       * final scrub report, whether errors remain.
 +       * A perfect algorithm could make use of the checksum and try
 +       * all possible combinations of pages from the different mirrors
 +       * until the checksum verification succeeds. For example, when
 +       * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
 +       * of mirror #2 is readable but the final checksum test fails,
 +       * then the 2nd page of mirror #3 could be tried, to see whether
 +       * the final checksum then succeeds. But this would be a rare
 +       * exception and is therefore not implemented. At least
 +       * overwriting the good copy is avoided.
 +       * A more useful improvement would be to pick the sectors
 +       * without I/O error based on sector sizes (512 bytes on legacy
 +       * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
 +       * mirror could be repaired by taking 512 bytes of a different
 +       * mirror, even if other 512 byte sectors in the same PAGE_SIZE
 +       * area are unreadable.
         */
 -      for (i = 0; i < bbio->num_stripes; ++i) {
 -              if (i + 1 == sbio->spag[ix].mirror_num)
 -                      continue;
  
 -              if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
 -                                 bbio->stripes[i].physical >> 9,
 -                                 sbio->bio->bi_io_vec[ix].bv_page)) {
 -                      /* I/O-error, this is not a good copy */
 +      /* can only fix I/O errors from here on */
 +      if (sblock_bad->no_io_error_seen)
 +              goto did_not_correct_error;
 +
 +      success = 1;
 +      for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
 +              struct scrub_page *page_bad = sblock_bad->pagev + page_num;
 +
 +              if (!page_bad->io_error)
                        continue;
 +
 +              for (mirror_index = 0;
 +                   mirror_index < BTRFS_MAX_MIRRORS &&
 +                   sblocks_for_recheck[mirror_index].page_count > 0;
 +                   mirror_index++) {
 +                      struct scrub_block *sblock_other = sblocks_for_recheck +
 +                                                         mirror_index;
 +                      struct scrub_page *page_other = sblock_other->pagev +
 +                                                      page_num;
 +
 +                      if (!page_other->io_error) {
 +                              ret = scrub_repair_page_from_good_copy(
 +                                      sblock_bad, sblock_other, page_num, 0);
 +                              if (0 == ret) {
 +                                      page_bad->io_error = 0;
 +                                      break; /* succeeded for this page */
 +                              }
 +                      }
                }
  
 -              if (scrub_fixup_check(sbio, ix) == 0)
 -                      break;
 +              if (page_bad->io_error) {
 +                      /* did not find a mirror to copy the page from */
 +                      success = 0;
 +              }
        }
 -      if (i == bbio->num_stripes)
 -              goto uncorrectable;
  
 -      if (!sdev->readonly) {
 -              /*
 -               * bi_io_vec[ix].bv_page now contains good data, write it back
 -               */
 -              if (scrub_fixup_io(WRITE, sdev->dev->bdev,
 -                                 (sbio->physical + ix * PAGE_SIZE) >> 9,
 -                                 sbio->bio->bi_io_vec[ix].bv_page)) {
 -                      /* I/O-error, writeback failed, give up */
 -                      goto uncorrectable;
 +      if (success) {
 +              if (is_metadata || have_csum) {
 +                      /*
 +                       * need to verify the checksum now that all
 +                       * sectors on disk are repaired (the write
 +                       * request for data to be repaired is on its way).
 +                       * Just be lazy and use scrub_recheck_block()
 +                       * which re-reads the data before the checksum
 +                       * is verified, but most likely the data comes out
 +                       * of the page cache.
 +                       */
 +                      ret = scrub_recheck_block(fs_info, sblock_bad,
 +                                                is_metadata, have_csum, csum,
 +                                                generation, sdev->csum_size);
 +                      if (!ret && !sblock_bad->header_error &&
 +                          !sblock_bad->checksum_error &&
 +                          sblock_bad->no_io_error_seen)
 +                              goto corrected_error;
 +                      else
 +                              goto did_not_correct_error;
 +              } else {
 +corrected_error:
 +                      spin_lock(&sdev->stat_lock);
 +                      sdev->stat.corrected_errors++;
 +                      spin_unlock(&sdev->stat_lock);
 +                      printk_ratelimited(KERN_ERR
 +                              "btrfs: fixed up error at logical %llu on dev %s\n",
 +                              (unsigned long long)logical, sdev->dev->name);
                }
 +      } else {
 +did_not_correct_error:
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.uncorrectable_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              printk_ratelimited(KERN_ERR
 +                      "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
 +                      (unsigned long long)logical, sdev->dev->name);
        }
  
 -      kfree(bbio);
 -      spin_lock(&sdev->stat_lock);
 -      ++sdev->stat.corrected_errors;
 -      spin_unlock(&sdev->stat_lock);
 +out:
 +      if (sblocks_for_recheck) {
 +              for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
 +                   mirror_index++) {
 +                      struct scrub_block *sblock = sblocks_for_recheck +
 +                                                   mirror_index;
 +                      int page_index;
 +
 +                      for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
 +                           page_index++)
 +                              if (sblock->pagev[page_index].page)
 +                                      __free_page(
 +                                              sblock->pagev[page_index].page);
 +              }
 +              kfree(sblocks_for_recheck);
 +      }
  
 -      printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
 -                             (unsigned long long)logical);
 -      return;
 +      return 0;
 +}
  
 -uncorrectable:
 -      kfree(bbio);
 -      spin_lock(&sdev->stat_lock);
 -      ++sdev->stat.uncorrectable_errors;
 -      spin_unlock(&sdev->stat_lock);
 +static int scrub_setup_recheck_block(struct scrub_dev *sdev,
 +                                   struct btrfs_mapping_tree *map_tree,
 +                                   u64 length, u64 logical,
 +                                   struct scrub_block *sblocks_for_recheck)
 +{
 +      int page_index;
 +      int mirror_index;
 +      int ret;
 +
 +      /*
 +       * note: the three members sdev, ref_count and outstanding_pages
 +       * are not used (and not set) in the blocks that are used for
 +       * the recheck procedure
 +       */
 +
 +      page_index = 0;
 +      while (length > 0) {
 +              u64 sublen = min_t(u64, length, PAGE_SIZE);
 +              u64 mapped_length = sublen;
 +              struct btrfs_bio *bbio = NULL;
 +
 +              /*
 +               * with a length of PAGE_SIZE, each returned stripe
 +               * represents one mirror
 +               */
 +              ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
 +                                    &bbio, 0);
 +              if (ret || !bbio || mapped_length < sublen) {
 +                      kfree(bbio);
 +                      return -EIO;
 +              }
  
 -      printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
 -                              "logical %llu\n", (unsigned long long)logical);
 +              BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
 +              for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
 +                   mirror_index++) {
 +                      struct scrub_block *sblock;
 +                      struct scrub_page *page;
 +
 +                      if (mirror_index >= BTRFS_MAX_MIRRORS)
 +                              continue;
 +
 +                      sblock = sblocks_for_recheck + mirror_index;
 +                      page = sblock->pagev + page_index;
 +                      page->logical = logical;
 +                      page->physical = bbio->stripes[mirror_index].physical;
 +                      page->bdev = bbio->stripes[mirror_index].dev->bdev;
 +                      page->mirror_num = mirror_index + 1;
 +                      page->page = alloc_page(GFP_NOFS);
 +                      if (!page->page) {
 +                              spin_lock(&sdev->stat_lock);
 +                              sdev->stat.malloc_errors++;
 +                              spin_unlock(&sdev->stat_lock);
 +                              return -ENOMEM;
 +                      }
 +                      sblock->page_count++;
 +              }
 +              kfree(bbio);
 +              length -= sublen;
 +              logical += sublen;
 +              page_index++;
 +      }
 +
 +      return 0;
  }
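
The array this function fills reads naturally as a two-dimensional grid: one scrub_block per mirror and one scrub_page per PAGE_SIZE slice of the range, so pages that share an index share a logical address but sit at a different physical address on each mirror. A minimal sketch of that indexing, assuming the scrub_block/scrub_page layout from this patch (the helper name is hypothetical, not part of the commit):

    /*
     * sblocks_for_recheck[mirror].pagev[page]:
     *
     *   mirror 0: | page 0 | page 1 | ...   <- first copy of the range
     *   mirror 1: | page 0 | page 1 | ...   <- second copy, if one exists
     */
    static inline struct scrub_page *recheck_page(struct scrub_block *sblocks,
                                                  int mirror, int page)
    {
            return (sblocks + mirror)->pagev + page;
    }
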
  
 -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
 -                       struct page *page)
 +/*
 + * this function will check the on disk data for checksum errors, header
 + * errors and read I/O errors. If any I/O errors happen, the affected
 + * pages are marked as bad. The goal is to enable scrub to take the
 + * good pages from all the mirrors so that the bad pages in the mirror
 + * just handled can be repaired.
 + */
 +static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 +                             struct scrub_block *sblock, int is_metadata,
 +                             int have_csum, u8 *csum, u64 generation,
 +                             u16 csum_size)
  {
 -      struct bio *bio = NULL;
 -      int ret;
 -      DECLARE_COMPLETION_ONSTACK(complete);
 +      int page_num;
 +
 +      sblock->no_io_error_seen = 1;
 +      sblock->header_error = 0;
 +      sblock->checksum_error = 0;
 +
 +      for (page_num = 0; page_num < sblock->page_count; page_num++) {
 +              struct bio *bio;
 +              int ret;
 +              struct scrub_page *page = sblock->pagev + page_num;
 +              DECLARE_COMPLETION_ONSTACK(complete);
 +
 +              BUG_ON(!page->page);
 +              bio = bio_alloc(GFP_NOFS, 1);
 +              bio->bi_bdev = page->bdev;
 +              bio->bi_sector = page->physical >> 9;
 +              bio->bi_end_io = scrub_complete_bio_end_io;
 +              bio->bi_private = &complete;
 +
 +              ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
 +              if (PAGE_SIZE != ret) {
 +                      bio_put(bio);
 +                      return -EIO;
 +              }
 +              btrfsic_submit_bio(READ, bio);
  
 -      bio = bio_alloc(GFP_NOFS, 1);
 -      bio->bi_bdev = bdev;
 -      bio->bi_sector = sector;
 -      bio_add_page(bio, page, PAGE_SIZE, 0);
 -      bio->bi_end_io = scrub_fixup_end_io;
 -      bio->bi_private = &complete;
 -      btrfsic_submit_bio(rw, bio);
 +              /* this will also unplug the queue */
 +              wait_for_completion(&complete);
  
 -      /* this will also unplug the queue */
 -      wait_for_completion(&complete);
 +              page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
 +              if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 +                      sblock->no_io_error_seen = 0;
 +              bio_put(bio);
 +      }
  
 -      ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
 -      bio_put(bio);
 -      return ret;
 +      if (sblock->no_io_error_seen)
 +              scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
 +                                           have_csum, csum, generation,
 +                                           csum_size);
 +
 +      return 0;
  }
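
The read loop above is the synchronous-bio idiom this file now leans on: allocate a single-page bio, point bi_end_io at a trivial completion callback, submit, and sleep on the completion. The deleted scrub_fixup_io() was the same idiom parameterized by direction; a distilled sketch under the same assumptions (hypothetical helper, 3.x-era block layer API, and, like the code above, it relies on bio_alloc(GFP_NOFS, 1) not failing):

    static int scrub_sync_page_io(int rw, struct block_device *bdev,
                                  sector_t sector, struct page *page)
    {
            struct bio *bio = bio_alloc(GFP_NOFS, 1);
            DECLARE_COMPLETION_ONSTACK(complete);
            int ret;

            bio->bi_bdev = bdev;
            bio->bi_sector = sector;
            bio->bi_end_io = scrub_complete_bio_end_io;
            bio->bi_private = &complete;
            if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
                    bio_put(bio);
                    return -EIO;
            }
            btrfsic_submit_bio(rw, bio);

            /* waiting for the completion also unplugs the queue */
            wait_for_completion(&complete);
            ret = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
            bio_put(bio);
            return ret;
    }
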
  
 -static void scrub_bio_end_io(struct bio *bio, int err)
 +static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 +                                       struct scrub_block *sblock,
 +                                       int is_metadata, int have_csum,
 +                                       const u8 *csum, u64 generation,
 +                                       u16 csum_size)
  {
 -      struct scrub_bio *sbio = bio->bi_private;
 -      struct scrub_dev *sdev = sbio->sdev;
 -      struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 +      int page_num;
 +      u8 calculated_csum[BTRFS_CSUM_SIZE];
 +      u32 crc = ~(u32)0;
 +      struct btrfs_root *root = fs_info->extent_root;
 +      void *mapped_buffer;
 +
 +      BUG_ON(!sblock->pagev[0].page);
 +      if (is_metadata) {
 +              struct btrfs_header *h;
 +
 +              mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0);
 +              h = (struct btrfs_header *)mapped_buffer;
 +
 +              if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
 +                  generation != le64_to_cpu(h->generation) ||
 +                  memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
 +                  memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
 +                         BTRFS_UUID_SIZE))
 +                      sblock->header_error = 1;
 +              csum = h->csum;
 +      } else {
 +              if (!have_csum)
 +                      return;
  
 -      sbio->err = err;
 -      sbio->bio = bio;
 +              mapped_buffer = kmap_atomic(sblock->pagev[0].page, KM_USER0);
 +      }
  
 -      btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
 +      for (page_num = 0;;) {
 +              if (page_num == 0 && is_metadata)
 +                      crc = btrfs_csum_data(root,
 +                              ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
 +                              crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
 +              else
 +                      crc = btrfs_csum_data(root, mapped_buffer, crc,
 +                                            PAGE_SIZE);
 +
 +              kunmap_atomic(mapped_buffer, KM_USER0);
 +              page_num++;
 +              if (page_num >= sblock->page_count)
 +                      break;
 +              BUG_ON(!sblock->pagev[page_num].page);
 +
 +              mapped_buffer = kmap_atomic(sblock->pagev[page_num].page,
 +                                          KM_USER0);
 +      }
 +
 +      btrfs_csum_final(crc, calculated_csum);
 +      if (memcmp(calculated_csum, csum, csum_size))
 +              sblock->checksum_error = 1;
  }
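
The same loop shape recurs in scrub_checksum_data(), scrub_checksum_tree_block() and scrub_checksum_super() below: walk pagev[], kmap one page at a time, and feed btrfs_csum_data() in PAGE_SIZE chunks, skipping the checksum bytes at the start of page 0 for metadata. A distilled sketch of that shared loop, assuming the structures from this patch (the helper itself is hypothetical):

    /* checksum 'len' bytes of a block, starting 'offset' bytes into page 0 */
    static u32 scrub_csum_pages(struct btrfs_root *root,
                                struct scrub_block *sblock, u64 offset, u64 len)
    {
            u32 crc = ~(u32)0;
            int index = 0;

            while (len > 0) {
                    u64 l = min_t(u64, len, PAGE_SIZE - offset);
                    u8 *buf = kmap_atomic(sblock->pagev[index].page, KM_USER0);

                    crc = btrfs_csum_data(root, buf + offset, crc, l);
                    kunmap_atomic(buf, KM_USER0);
                    offset = 0;     /* only page 0 carries a csum header */
                    len -= l;
                    index++;
            }
            return crc;             /* caller finishes with btrfs_csum_final() */
    }
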
  
 -static void scrub_checksum(struct btrfs_work *work)
 +static void scrub_complete_bio_end_io(struct bio *bio, int err)
  {
 -      struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
 -      struct scrub_dev *sdev = sbio->sdev;
 -      struct page *page;
 -      void *buffer;
 -      int i;
 -      u64 flags;
 -      u64 logical;
 -      int ret;
 +      complete((struct completion *)bio->bi_private);
 +}
  
 -      if (sbio->err) {
 -              ret = 0;
 -              for (i = 0; i < sbio->count; ++i)
 -                      ret |= scrub_recheck_error(sbio, i);
 -              if (!ret) {
 -                      spin_lock(&sdev->stat_lock);
 -                      ++sdev->stat.unverified_errors;
 -                      spin_unlock(&sdev->stat_lock);
 -              }
 +static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 +                                           struct scrub_block *sblock_good,
 +                                           int force_write)
 +{
 +      int page_num;
 +      int ret = 0;
  
 -              sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
 -              sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
 -              sbio->bio->bi_phys_segments = 0;
 -              sbio->bio->bi_idx = 0;
 +      for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
 +              int ret_sub;
  
 -              for (i = 0; i < sbio->count; i++) {
 -                      struct bio_vec *bi;
 -                      bi = &sbio->bio->bi_io_vec[i];
 -                      bi->bv_offset = 0;
 -                      bi->bv_len = PAGE_SIZE;
 -              }
 -              goto out;
 +              ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
 +                                                         sblock_good,
 +                                                         page_num,
 +                                                         force_write);
 +              if (ret_sub)
 +                      ret = ret_sub;
        }
 -      for (i = 0; i < sbio->count; ++i) {
 -              page = sbio->bio->bi_io_vec[i].bv_page;
 -              buffer = kmap_atomic(page, KM_USER0);
 -              flags = sbio->spag[i].flags;
 -              logical = sbio->logical + i * PAGE_SIZE;
 -              ret = 0;
 -              if (flags & BTRFS_EXTENT_FLAG_DATA) {
 -                      ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
 -              } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 -                      ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
 -                                                      logical, buffer);
 -              } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
 -                      BUG_ON(i);
 -                      (void)scrub_checksum_super(sbio, buffer);
 -              } else {
 -                      WARN_ON(1);
 -              }
 -              kunmap_atomic(buffer, KM_USER0);
 -              if (ret) {
 -                      ret = scrub_recheck_error(sbio, i);
 -                      if (!ret) {
 -                              spin_lock(&sdev->stat_lock);
 -                              ++sdev->stat.unverified_errors;
 -                              spin_unlock(&sdev->stat_lock);
 -                      }
 +
 +      return ret;
 +}
 +
 +static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 +                                          struct scrub_block *sblock_good,
 +                                          int page_num, int force_write)
 +{
 +      struct scrub_page *page_bad = sblock_bad->pagev + page_num;
 +      struct scrub_page *page_good = sblock_good->pagev + page_num;
 +
 +      BUG_ON(sblock_bad->pagev[page_num].page == NULL);
 +      BUG_ON(sblock_good->pagev[page_num].page == NULL);
 +      if (force_write || sblock_bad->header_error ||
 +          sblock_bad->checksum_error || page_bad->io_error) {
 +              struct bio *bio;
 +              int ret;
 +              DECLARE_COMPLETION_ONSTACK(complete);
 +
 +              bio = bio_alloc(GFP_NOFS, 1);
 +              bio->bi_bdev = page_bad->bdev;
 +              bio->bi_sector = page_bad->physical >> 9;
 +              bio->bi_end_io = scrub_complete_bio_end_io;
 +              bio->bi_private = &complete;
 +
 +              ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
 +              if (PAGE_SIZE != ret) {
 +                      bio_put(bio);
 +                      return -EIO;
                }
 +              btrfsic_submit_bio(WRITE, bio);
 +
 +              /* this will also unplug the queue */
 +              wait_for_completion(&complete);
 +              bio_put(bio);
        }
  
 -out:
 -      scrub_free_bio(sbio->bio);
 -      sbio->bio = NULL;
 -      spin_lock(&sdev->list_lock);
 -      sbio->next_free = sdev->first_free;
 -      sdev->first_free = sbio->index;
 -      spin_unlock(&sdev->list_lock);
 -      atomic_dec(&sdev->in_flight);
 -      wake_up(&sdev->list_wait);
 +      return 0;
 +}
 +
 +static void scrub_checksum(struct scrub_block *sblock)
 +{
 +      u64 flags;
 +      int ret;
 +
 +      BUG_ON(sblock->page_count < 1);
 +      flags = sblock->pagev[0].flags;
 +      ret = 0;
 +      if (flags & BTRFS_EXTENT_FLAG_DATA)
 +              ret = scrub_checksum_data(sblock);
 +      else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 +              ret = scrub_checksum_tree_block(sblock);
 +      else if (flags & BTRFS_EXTENT_FLAG_SUPER)
 +              (void)scrub_checksum_super(sblock);
 +      else
 +              WARN_ON(1);
 +      if (ret)
 +              scrub_handle_errored_block(sblock);
  }
  
 -static int scrub_checksum_data(struct scrub_dev *sdev,
 -                             struct scrub_page *spag, void *buffer)
 +static int scrub_checksum_data(struct scrub_block *sblock)
  {
 +      struct scrub_dev *sdev = sblock->sdev;
        u8 csum[BTRFS_CSUM_SIZE];
 +      u8 *on_disk_csum;
 +      struct page *page;
 +      void *buffer;
        u32 crc = ~(u32)0;
        int fail = 0;
        struct btrfs_root *root = sdev->dev->dev_root;
 +      u64 len;
 +      int index;
  
 -      if (!spag->have_csum)
 +      BUG_ON(sblock->page_count < 1);
 +      if (!sblock->pagev[0].have_csum)
                return 0;
  
 -      crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
 +      on_disk_csum = sblock->pagev[0].csum;
 +      page = sblock->pagev[0].page;
 +      buffer = kmap_atomic(page, KM_USER0);
 +
 +      len = sdev->sectorsize;
 +      index = 0;
 +      for (;;) {
 +              u64 l = min_t(u64, len, PAGE_SIZE);
 +
 +              crc = btrfs_csum_data(root, buffer, crc, l);
 +              kunmap_atomic(buffer, KM_USER0);
 +              len -= l;
 +              if (len == 0)
 +                      break;
 +              index++;
 +              BUG_ON(index >= sblock->page_count);
 +              BUG_ON(!sblock->pagev[index].page);
 +              page = sblock->pagev[index].page;
 +              buffer = kmap_atomic(page, KM_USER0);
 +      }
 +
        btrfs_csum_final(crc, csum);
 -      if (memcmp(csum, spag->csum, sdev->csum_size))
 +      if (memcmp(csum, on_disk_csum, sdev->csum_size))
                fail = 1;
  
 -      spin_lock(&sdev->stat_lock);
 -      ++sdev->stat.data_extents_scrubbed;
 -      sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
 -      if (fail)
 +      if (fail) {
 +              spin_lock(&sdev->stat_lock);
                ++sdev->stat.csum_errors;
 -      spin_unlock(&sdev->stat_lock);
 +              spin_unlock(&sdev->stat_lock);
 +      }
  
        return fail;
  }
  
 -static int scrub_checksum_tree_block(struct scrub_dev *sdev,
 -                                   struct scrub_page *spag, u64 logical,
 -                                   void *buffer)
 +static int scrub_checksum_tree_block(struct scrub_block *sblock)
  {
 +      struct scrub_dev *sdev = sblock->sdev;
        struct btrfs_header *h;
        struct btrfs_root *root = sdev->dev->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
 -      u8 csum[BTRFS_CSUM_SIZE];
 +      u8 calculated_csum[BTRFS_CSUM_SIZE];
 +      u8 on_disk_csum[BTRFS_CSUM_SIZE];
 +      struct page *page;
 +      void *mapped_buffer;
 +      u64 mapped_size;
 +      void *p;
        u32 crc = ~(u32)0;
        int fail = 0;
        int crc_fail = 0;
 +      u64 len;
 +      int index;
 +
 +      BUG_ON(sblock->page_count < 1);
 +      page = sblock->pagev[0].page;
 +      mapped_buffer = kmap_atomic(page, KM_USER0);
 +      h = (struct btrfs_header *)mapped_buffer;
 +      memcpy(on_disk_csum, h->csum, sdev->csum_size);
  
        /*
         * we don't use the getter functions here, as we
         * a) don't have an extent buffer and
         * b) the page is already kmapped
         */
 -      h = (struct btrfs_header *)buffer;
  
 -      if (logical != le64_to_cpu(h->bytenr))
 +      if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
                ++fail;
  
 -      if (spag->generation != le64_to_cpu(h->generation))
 +      if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
                ++fail;
  
        if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
                ++fail;
 
        if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
                   BTRFS_UUID_SIZE))
                ++fail;
  
 -      crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
 -                            PAGE_SIZE - BTRFS_CSUM_SIZE);
 -      btrfs_csum_final(crc, csum);
 -      if (memcmp(csum, h->csum, sdev->csum_size))
 +      BUG_ON(sdev->nodesize != sdev->leafsize);
 +      len = sdev->nodesize - BTRFS_CSUM_SIZE;
 +      mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
 +      p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
 +      index = 0;
 +      for (;;) {
 +              u64 l = min_t(u64, len, mapped_size);
 +
 +              crc = btrfs_csum_data(root, p, crc, l);
 +              kunmap_atomic(mapped_buffer, KM_USER0);
 +              len -= l;
 +              if (len == 0)
 +                      break;
 +              index++;
 +              BUG_ON(index >= sblock->page_count);
 +              BUG_ON(!sblock->pagev[index].page);
 +              page = sblock->pagev[index].page;
 +              mapped_buffer = kmap_atomic(page, KM_USER0);
 +              mapped_size = PAGE_SIZE;
 +              p = mapped_buffer;
 +      }
 +
 +      btrfs_csum_final(crc, calculated_csum);
 +      if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
                ++crc_fail;
  
 -      spin_lock(&sdev->stat_lock);
 -      ++sdev->stat.tree_extents_scrubbed;
 -      sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
 -      if (crc_fail)
 -              ++sdev->stat.csum_errors;
 -      if (fail)
 -              ++sdev->stat.verify_errors;
 -      spin_unlock(&sdev->stat_lock);
 +      if (crc_fail || fail) {
 +              spin_lock(&sdev->stat_lock);
 +              if (crc_fail)
 +                      ++sdev->stat.csum_errors;
 +              if (fail)
 +                      ++sdev->stat.verify_errors;
 +              spin_unlock(&sdev->stat_lock);
 +      }
  
        return fail || crc_fail;
  }
  
 -static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
 +static int scrub_checksum_super(struct scrub_block *sblock)
  {
        struct btrfs_super_block *s;
 -      u64 logical;
 -      struct scrub_dev *sdev = sbio->sdev;
 +      struct scrub_dev *sdev = sblock->sdev;
        struct btrfs_root *root = sdev->dev->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
 -      u8 csum[BTRFS_CSUM_SIZE];
 +      u8 calculated_csum[BTRFS_CSUM_SIZE];
 +      u8 on_disk_csum[BTRFS_CSUM_SIZE];
 +      struct page *page;
 +      void *mapped_buffer;
 +      u64 mapped_size;
 +      void *p;
        u32 crc = ~(u32)0;
        int fail = 0;
 +      u64 len;
 +      int index;
  
 -      s = (struct btrfs_super_block *)buffer;
 -      logical = sbio->logical;
 +      BUG_ON(sblock->page_count < 1);
 +      page = sblock->pagev[0].page;
 +      mapped_buffer = kmap_atomic(page, KM_USER0);
 +      s = (struct btrfs_super_block *)mapped_buffer;
 +      memcpy(on_disk_csum, s->csum, sdev->csum_size);
  
 -      if (logical != le64_to_cpu(s->bytenr))
 +      if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
                ++fail;
  
 -      if (sbio->spag[0].generation != le64_to_cpu(s->generation))
 +      if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
                ++fail;
  
        if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
                ++fail;
  
 -      crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
 -                            PAGE_SIZE - BTRFS_CSUM_SIZE);
 -      btrfs_csum_final(crc, csum);
 -      if (memcmp(csum, s->csum, sbio->sdev->csum_size))
 +      len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
 +      mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
 +      p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
 +      index = 0;
 +      for (;;) {
 +              u64 l = min_t(u64, len, mapped_size);
 +
 +              crc = btrfs_csum_data(root, p, crc, l);
 +              kunmap_atomic(mapped_buffer, KM_USER0);
 +              len -= l;
 +              if (len == 0)
 +                      break;
 +              index++;
 +              BUG_ON(index >= sblock->page_count);
 +              BUG_ON(!sblock->pagev[index].page);
 +              page = sblock->pagev[index].page;
 +              mapped_buffer = kmap_atomic(page, KM_USER0);
 +              mapped_size = PAGE_SIZE;
 +              p = mapped_buffer;
 +      }
 +
 +      btrfs_csum_final(crc, calculated_csum);
 +      if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
                ++fail;
  
        if (fail) {
        return fail;
  }
  
 -static int scrub_submit(struct scrub_dev *sdev)
 +static void scrub_block_get(struct scrub_block *sblock)
 +{
 +      atomic_inc(&sblock->ref_count);
 +}
 +
 +static void scrub_block_put(struct scrub_block *sblock)
 +{
 +      if (atomic_dec_and_test(&sblock->ref_count)) {
 +              int i;
 +
 +              for (i = 0; i < sblock->page_count; i++)
 +                      if (sblock->pagev[i].page)
 +                              __free_page(sblock->pagev[i].page);
 +              kfree(sblock);
 +      }
 +}
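
The reference rules these two helpers implement, as wired up in scrub_pages() below: scrub_pages() takes the initial reference, scrub_add_page_to_bio() takes one more per page handed to a bio, scrub_bio_end_io_worker() drops one per completed page, and scrub_pages() drops its own before returning, so whichever path brings the count to zero frees the pages and the block. The general usage, as a sketch (hypothetical function, for illustration only):

    static void scrub_block_use_async(struct scrub_block *sblock)
    {
            scrub_block_get(sblock);        /* pin across the async window */
            /* ... hand sblock to a bio or a worker here ... */
            scrub_block_put(sblock);        /* last put frees pagev[] + sblock */
    }
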
 +
 +static void scrub_submit(struct scrub_dev *sdev)
  {
        struct scrub_bio *sbio;
  
        if (sdev->curr == -1)
 -              return 0;
 +              return;
  
        sbio = sdev->bios[sdev->curr];
 -      sbio->err = 0;
        sdev->curr = -1;
        atomic_inc(&sdev->in_flight);
  
        btrfsic_submit_bio(READ, sbio->bio);
 -
 -      return 0;
  }
  
 -static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
 -                    u64 physical, u64 flags, u64 gen, int mirror_num,
 -                    u8 *csum, int force)
 +static int scrub_add_page_to_bio(struct scrub_dev *sdev,
 +                               struct scrub_page *spage)
  {
 +      struct scrub_block *sblock = spage->sblock;
        struct scrub_bio *sbio;
 -      struct page *page;
        int ret;
  
  again:
                if (sdev->curr != -1) {
                        sdev->first_free = sdev->bios[sdev->curr]->next_free;
                        sdev->bios[sdev->curr]->next_free = -1;
 -                      sdev->bios[sdev->curr]->count = 0;
 +                      sdev->bios[sdev->curr]->page_count = 0;
                        spin_unlock(&sdev->list_lock);
                } else {
                        spin_unlock(&sdev->list_lock);
                }
        }
        sbio = sdev->bios[sdev->curr];
 -      if (sbio->count == 0) {
 +      if (sbio->page_count == 0) {
                struct bio *bio;
  
 -              sbio->physical = physical;
 -              sbio->logical = logical;
 -              bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
 -              if (!bio)
 -                      return -ENOMEM;
 +              sbio->physical = spage->physical;
 +              sbio->logical = spage->logical;
 +              bio = sbio->bio;
 +              if (!bio) {
 +                      bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio);
 +                      if (!bio)
 +                              return -ENOMEM;
 +                      sbio->bio = bio;
 +              }
  
                bio->bi_private = sbio;
                bio->bi_end_io = scrub_bio_end_io;
                bio->bi_bdev = sdev->dev->bdev;
 -              bio->bi_sector = sbio->physical >> 9;
 +              bio->bi_sector = spage->physical >> 9;
                sbio->err = 0;
 -              sbio->bio = bio;
 -      } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
 -                 sbio->logical + sbio->count * PAGE_SIZE != logical) {
 -              ret = scrub_submit(sdev);
 -              if (ret)
 -                      return ret;
 +      } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
 +                 spage->physical ||
 +                 sbio->logical + sbio->page_count * PAGE_SIZE !=
 +                 spage->logical) {
 +              scrub_submit(sdev);
                goto again;
        }
 -      sbio->spag[sbio->count].flags = flags;
 -      sbio->spag[sbio->count].generation = gen;
 -      sbio->spag[sbio->count].have_csum = 0;
 -      sbio->spag[sbio->count].mirror_num = mirror_num;
 -
 -      page = alloc_page(GFP_NOFS);
 -      if (!page)
 -              return -ENOMEM;
  
 -      ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
 -      if (!ret) {
 -              __free_page(page);
 -              ret = scrub_submit(sdev);
 -              if (ret)
 -                      return ret;
 +      sbio->pagev[sbio->page_count] = spage;
 +      ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
 +      if (ret != PAGE_SIZE) {
 +              if (sbio->page_count < 1) {
 +                      bio_put(sbio->bio);
 +                      sbio->bio = NULL;
 +                      return -EIO;
 +              }
 +              scrub_submit(sdev);
                goto again;
        }
  
 -      if (csum) {
 -              sbio->spag[sbio->count].have_csum = 1;
 -              memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
 +      scrub_block_get(sblock); /* one for the added page */
 +      atomic_inc(&sblock->outstanding_pages);
 +      sbio->page_count++;
 +      if (sbio->page_count == sdev->pages_per_bio)
 +              scrub_submit(sdev);
 +
 +      return 0;
 +}
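
A page only joins the bio under construction while the run stays contiguous in both address spaces; any gap, or a full bio, forces a submit and a retry against a fresh bio. The contiguity test at the heart of it, written as a standalone predicate (hypothetical helper, same structures as this patch):

    /* may 'spage' be appended to the bio currently being built? */
    static int scrub_page_is_contiguous(struct scrub_bio *sbio,
                                        struct scrub_page *spage)
    {
            u64 off = (u64)sbio->page_count * PAGE_SIZE;

            return sbio->physical + off == spage->physical &&
                   sbio->logical + off == spage->logical;
    }
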
 +
 +static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
 +                     u64 physical, u64 flags, u64 gen, int mirror_num,
 +                     u8 *csum, int force)
 +{
 +      struct scrub_block *sblock;
 +      int index;
 +
 +      sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
 +      if (!sblock) {
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.malloc_errors++;
 +              spin_unlock(&sdev->stat_lock);
 +              return -ENOMEM;
        }
 -      ++sbio->count;
 -      if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
 +
 +      /* one ref inside this function, plus one for each page later on */
 +      atomic_set(&sblock->ref_count, 1);
 +      sblock->sdev = sdev;
 +      sblock->no_io_error_seen = 1;
 +
 +      for (index = 0; len > 0; index++) {
 +              struct scrub_page *spage = sblock->pagev + index;
 +              u64 l = min_t(u64, len, PAGE_SIZE);
 +
 +              BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
 +              spage->page = alloc_page(GFP_NOFS);
 +              if (!spage->page) {
 +                      spin_lock(&sdev->stat_lock);
 +                      sdev->stat.malloc_errors++;
 +                      spin_unlock(&sdev->stat_lock);
 +                      while (index > 0) {
 +                              index--;
 +                              __free_page(sblock->pagev[index].page);
 +                      }
 +                      kfree(sblock);
 +                      return -ENOMEM;
 +              }
 +              spage->sblock = sblock;
 +              spage->bdev = sdev->dev->bdev;
 +              spage->flags = flags;
 +              spage->generation = gen;
 +              spage->logical = logical;
 +              spage->physical = physical;
 +              spage->mirror_num = mirror_num;
 +              if (csum) {
 +                      spage->have_csum = 1;
 +                      memcpy(spage->csum, csum, sdev->csum_size);
 +              } else {
 +                      spage->have_csum = 0;
 +              }
 +              sblock->page_count++;
 +              len -= l;
 +              logical += l;
 +              physical += l;
 +      }
 +
 +      BUG_ON(sblock->page_count == 0);
 +      for (index = 0; index < sblock->page_count; index++) {
 +              struct scrub_page *spage = sblock->pagev + index;
                int ret;
  
 -              ret = scrub_submit(sdev);
 -              if (ret)
 +              ret = scrub_add_page_to_bio(sdev, spage);
 +              if (ret) {
 +                      scrub_block_put(sblock);
                        return ret;
 +              }
        }
  
 +      if (force)
 +              scrub_submit(sdev);
 +
 +      /* last one frees, either here or in bio completion for last page */
 +      scrub_block_put(sblock);
        return 0;
  }
  
 +static void scrub_bio_end_io(struct bio *bio, int err)
 +{
 +      struct scrub_bio *sbio = bio->bi_private;
 +      struct scrub_dev *sdev = sbio->sdev;
 +      struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 +
 +      sbio->err = err;
 +      sbio->bio = bio;
 +
 +      btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
 +}
 +
 +static void scrub_bio_end_io_worker(struct btrfs_work *work)
 +{
 +      struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
 +      struct scrub_dev *sdev = sbio->sdev;
 +      int i;
 +
 +      BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
 +      if (sbio->err) {
 +              for (i = 0; i < sbio->page_count; i++) {
 +                      struct scrub_page *spage = sbio->pagev[i];
 +
 +                      spage->io_error = 1;
 +                      spage->sblock->no_io_error_seen = 0;
 +              }
 +      }
 +
 +      /* now complete the scrub_block items that have all pages completed */
 +      for (i = 0; i < sbio->page_count; i++) {
 +              struct scrub_page *spage = sbio->pagev[i];
 +              struct scrub_block *sblock = spage->sblock;
 +
 +              if (atomic_dec_and_test(&sblock->outstanding_pages))
 +                      scrub_block_complete(sblock);
 +              scrub_block_put(sblock);
 +      }
 +
 +      if (sbio->err) {
 +              /* what is this good for??? */
 +              sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
 +              sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
 +              sbio->bio->bi_phys_segments = 0;
 +              sbio->bio->bi_idx = 0;
 +
 +              for (i = 0; i < sbio->page_count; i++) {
 +                      struct bio_vec *bi;
 +                      bi = &sbio->bio->bi_io_vec[i];
 +                      bi->bv_offset = 0;
 +                      bi->bv_len = PAGE_SIZE;
 +              }
 +      }
 +
 +      bio_put(sbio->bio);
 +      sbio->bio = NULL;
 +      spin_lock(&sdev->list_lock);
 +      sbio->next_free = sdev->first_free;
 +      sdev->first_free = sbio->index;
 +      spin_unlock(&sdev->list_lock);
 +      atomic_dec(&sdev->in_flight);
 +      wake_up(&sdev->list_wait);
 +}
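
Completion accounting follows the "last one out" idiom: every page in flight holds one count on its scrub_block, the worker decrements once per page, and whichever decrement reaches zero completes the whole block. (The bi_flags and bi_io_vec reset in the error path restores the bio to a freshly-allocated state; presumably a leftover from the old code, which resubmitted the same bio for rechecking, hence the question in the comment above.) The idiom, distilled into a hypothetical helper:

    static void scrub_retire_one_page(struct scrub_page *spage)
    {
            struct scrub_block *sblock = spage->sblock;

            if (atomic_dec_and_test(&sblock->outstanding_pages))
                    scrub_block_complete(sblock);   /* last page: block done */
            scrub_block_put(sblock);                /* drop the page's ref */
    }
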
 +
 +static void scrub_block_complete(struct scrub_block *sblock)
 +{
 +      if (!sblock->no_io_error_seen)
 +              scrub_handle_errored_block(sblock);
 +      else
 +              scrub_checksum(sblock);
 +}
 +
  static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
                           u8 *csum)
  {
        int ret = 0;
        unsigned long i;
        unsigned long num_sectors;
 -      u32 sectorsize = sdev->dev->dev_root->sectorsize;
  
        while (!list_empty(&sdev->csum_list)) {
                sum = list_first_entry(&sdev->csum_list,
        if (!sum)
                return 0;
  
 -      num_sectors = sum->len / sectorsize;
 +      num_sectors = sum->len / sdev->sectorsize;
        for (i = 0; i < num_sectors; ++i) {
                if (sum->sums[i].bytenr == logical) {
                        memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
@@@ -1710,28 -1093,9 +1710,28 @@@ static int scrub_extent(struct scrub_de
  {
        int ret;
        u8 csum[BTRFS_CSUM_SIZE];
 +      u32 blocksize;
 +
 +      if (flags & BTRFS_EXTENT_FLAG_DATA) {
 +              blocksize = sdev->sectorsize;
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.data_extents_scrubbed++;
 +              sdev->stat.data_bytes_scrubbed += len;
 +              spin_unlock(&sdev->stat_lock);
 +      } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 +              BUG_ON(sdev->nodesize != sdev->leafsize);
 +              blocksize = sdev->nodesize;
 +              spin_lock(&sdev->stat_lock);
 +              sdev->stat.tree_extents_scrubbed++;
 +              sdev->stat.tree_bytes_scrubbed += len;
 +              spin_unlock(&sdev->stat_lock);
 +      } else {
 +              blocksize = sdev->sectorsize;
 +              BUG_ON(1);
 +      }
  
        while (len) {
 -              u64 l = min_t(u64, len, PAGE_SIZE);
 +              u64 l = min_t(u64, len, blocksize);
                int have_csum = 0;
  
                if (flags & BTRFS_EXTENT_FLAG_DATA) {
                        if (have_csum == 0)
                                ++sdev->stat.no_csum;
                }
 -              ret = scrub_page(sdev, logical, l, physical, flags, gen,
 -                               mirror_num, have_csum ? csum : NULL, 0);
 +              ret = scrub_pages(sdev, logical, l, physical, flags, gen,
 +                                mirror_num, have_csum ? csum : NULL, 0);
                if (ret)
                        return ret;
                len -= l;
@@@ -1806,11 -1170,6 +1806,11 @@@ static noinline_for_stack int scrub_str
        if (!path)
                return -ENOMEM;
  
 +      /*
 +       * work on commit root. The related disk blocks are static as
 +       * long as COW is applied. This means it is safe to rewrite
 +       * them to repair disk errors without any race conditions
 +       */
        path->search_commit_root = 1;
        path->skip_locking = 1;
  
@@@ -2157,18 -1516,15 +2157,18 @@@ static noinline_for_stack int scrub_sup
        struct btrfs_device *device = sdev->dev;
        struct btrfs_root *root = device->dev_root;
  
 +      if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
 +              return -EIO;
 +
        gen = root->fs_info->last_trans_committed;
  
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
 -              if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
 +              if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
                        break;
  
 -              ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
 -                               BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
 +              ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
 +                                   BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
                if (ret)
                        return ret;
        }
@@@ -2227,30 -1583,10 +2227,30 @@@ int btrfs_scrub_dev(struct btrfs_root *
        /*
         * check some assumptions
         */
 -      if (root->sectorsize != PAGE_SIZE ||
 -          root->sectorsize != root->leafsize ||
 -          root->sectorsize != root->nodesize) {
 -              printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
 +      if (root->nodesize != root->leafsize) {
 +              printk(KERN_ERR
 +                     "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
 +                     root->nodesize, root->leafsize);
 +              return -EINVAL;
 +      }
 +
 +      if (root->nodesize > BTRFS_STRIPE_LEN) {
 +              /*
 +               * in this case scrub, as it is implemented, is unable
 +               * to calculate the checksum. Do not handle this
 +               * situation at all because it won't ever happen.
 +               */
 +              printk(KERN_ERR
 +                     "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
 +                     root->nodesize, BTRFS_STRIPE_LEN);
 +              return -EINVAL;
 +      }
 +
 +      if (root->sectorsize != PAGE_SIZE) {
 +              /* not supported for data w/o checksums */
 +              printk(KERN_ERR
 +                     "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
 +                     root->sectorsize, (unsigned long long)PAGE_SIZE);
                return -EINVAL;
        }
  
        return ret;
  }
  
 -int btrfs_scrub_pause(struct btrfs_root *root)
 +void btrfs_scrub_pause(struct btrfs_root *root)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
  
                mutex_lock(&fs_info->scrub_lock);
        }
        mutex_unlock(&fs_info->scrub_lock);
 -
 -      return 0;
  }
  
 -int btrfs_scrub_continue(struct btrfs_root *root)
 +void btrfs_scrub_continue(struct btrfs_root *root)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
  
        atomic_dec(&fs_info->scrub_pause_req);
        wake_up(&fs_info->scrub_pause_wait);
 -      return 0;
  }
  
 -int btrfs_scrub_pause_super(struct btrfs_root *root)
 +void btrfs_scrub_pause_super(struct btrfs_root *root)
  {
        down_write(&root->fs_info->scrub_super_lock);
 -      return 0;
  }
  
 -int btrfs_scrub_continue_super(struct btrfs_root *root)
 +void btrfs_scrub_continue_super(struct btrfs_root *root)
  {
        up_write(&root->fs_info->scrub_super_lock);
 -      return 0;
  }
  
 -int btrfs_scrub_cancel(struct btrfs_root *root)
 +int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
  {
 -      struct btrfs_fs_info *fs_info = root->fs_info;
  
        mutex_lock(&fs_info->scrub_lock);
        if (!atomic_read(&fs_info->scrubs_running)) {
        return 0;
  }
  
 +int btrfs_scrub_cancel(struct btrfs_root *root)
 +{
 +      return __btrfs_scrub_cancel(root->fs_info);
 +}
 +
  int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
  
        return 0;
  }
 +
  int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
  {
        struct btrfs_fs_info *fs_info = root->fs_info;
diff --combined fs/btrfs/super.c
index 9db64165123a5ca3f11b1740fa462198fe724edb,5239003d453eed85ef82b6c2f9a5412f9607a922..84571d7da12e93d76537c52653891287b6683497
@@@ -40,6 -40,7 +40,6 @@@
  #include <linux/magic.h>
  #include <linux/slab.h>
  #include <linux/cleancache.h>
 -#include <linux/mnt_namespace.h>
  #include <linux/ratelimit.h>
  #include "compat.h"
  #include "delayed-inode.h"
@@@ -76,9 -77,6 +76,9 @@@ static const char *btrfs_decode_error(s
        case -EROFS:
                errstr = "Readonly filesystem";
                break;
 +      case -EEXIST:
 +              errstr = "Object already exists";
 +              break;
        default:
                if (nbuf) {
                        if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
@@@ -119,8 -117,6 +119,8 @@@ static void btrfs_handle_error(struct b
        if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
                sb->s_flags |= MS_RDONLY;
                printk(KERN_INFO "btrfs is forced readonly\n");
 +              __btrfs_scrub_cancel(fs_info);
 +//            WARN_ON(1);
        }
  }
  
   * invokes the appropriate error response.
   */
  void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 -                   unsigned int line, int errno)
 +                     unsigned int line, int errno, const char *fmt, ...)
  {
        struct super_block *sb = fs_info->sb;
        char nbuf[16];
        const char *errstr;
 +      va_list args;
  
        /*
         * Special case: if the error is EROFS, and we're already
         * under MS_RDONLY, then it is safe here.
         */
        if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
 -              return;
 +              return;
  
 -      errstr = btrfs_decode_error(fs_info, errno, nbuf);
 -      printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
 -              sb->s_id, function, line, errstr);
 -      save_error_info(fs_info);
 +      va_start(args, fmt);
 +      errstr = btrfs_decode_error(fs_info, errno, nbuf);
 +      if (fmt) {
 +              struct va_format vaf = {
 +                      .fmt = fmt,
 +                      .va = &args,
 +              };
  
 -      btrfs_handle_error(fs_info);
 +              printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n",
 +                      sb->s_id, function, line, errstr, &vaf);
 +      } else {
 +              printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
 +                      sb->s_id, function, line, errstr);
 +      }
 +
 +      /* Don't go through full error handling during mount */
 +      if (sb->s_flags & MS_BORN) {
 +              save_error_info(fs_info);
 +              btrfs_handle_error(fs_info);
 +      }
 +      va_end(args);
  }
  
 -static void btrfs_put_super(struct super_block *sb)
 +const char *logtypes[] = {
 +      "emergency",
 +      "alert",
 +      "critical",
 +      "error",
 +      "warning",
 +      "notice",
 +      "info",
 +      "debug",
 +};
 +
 +void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
  {
 -      struct btrfs_root *root = btrfs_sb(sb);
 -      int ret;
 +      struct super_block *sb = fs_info->sb;
 +      char lvl[4];
 +      struct va_format vaf;
 +      va_list args;
 +      const char *type = logtypes[4];
 +
 +      va_start(args, fmt);
 +
 +      if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
 +              strncpy(lvl, fmt, 3);
 +              lvl[3] = '\0';
 +              type = logtypes[fmt[1] - '0'];
 +              fmt += 3;
 +      } else
 +              *lvl = '\0';
 +
 +      vaf.fmt = fmt;
 +      vaf.va = &args;
 +      printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf);
 +}
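
Callers pass an ordinary printk-style format; an optional leading KERN_* level (still the "<N>" form in this kernel generation) selects both the raw console level and the human-readable type string, with "warning" as the default. A usage sketch (caller and message are hypothetical):

    static void example_report(struct btrfs_fs_info *fs_info, u64 devid)
    {
            btrfs_printk(fs_info, KERN_ERR "device %llu gone missing\n",
                         (unsigned long long)devid);
            /* emits: <3>BTRFS error (device sda1): device 7 gone missing */
    }
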
  
 -      ret = close_ctree(root);
 -      sb->s_fs_info = NULL;
 +/*
 + * We only mark the transaction aborted and then set the file system read-only.
 + * This will prevent new transactions from starting or trying to join this
 + * one.
 + *
 + * This means that error recovery at the call site is limited to freeing
 + * any local memory allocations and passing the error code up without
 + * further cleanup. The transaction should complete as it normally would
 + * in the call path but will return -EIO.
 + *
 + * We'll complete the cleanup in btrfs_end_transaction and
 + * btrfs_commit_transaction.
 + */
 +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 +                             struct btrfs_root *root, const char *function,
 +                             unsigned int line, int errno)
 +{
 +      WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted");
 +      trans->aborted = errno;
 +      /* Nothing used. The other threads that have joined this
 +       * transaction may be able to continue. */
 +      if (!trans->blocks_used) {
 +              btrfs_printk(root->fs_info, "Aborting unused transaction.\n");
 +              return;
 +      }
 +      trans->transaction->aborted = errno;
 +      __btrfs_std_error(root->fs_info, function, line, errno, NULL);
 +}
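
Call sites in this series reach the function through a btrfs_abort_transaction() wrapper that supplies __func__ and __LINE__; the macro lives in ctree.h and so does not appear in this diff, but its shape follows from the signature above (a reconstruction, so treat the exact form as an assumption):

    #define btrfs_abort_transaction(trans, root, errno)             \
    do {                                                            \
            __btrfs_abort_transaction((trans), (root), __func__,    \
                                      __LINE__, (errno));           \
    } while (0)

After the call the transaction is poisoned; a typical caller does "btrfs_abort_transaction(trans, root, ret);", frees its local allocations, and lets the error bubble up.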
 +/*
 + * __btrfs_panic decodes unexpected, fatal errors from the caller,
 + * issues an alert, and either panics or BUGs, depending on mount options.
 + */
 +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 +                 unsigned int line, int errno, const char *fmt, ...)
 +{
 +      char nbuf[16];
 +      char *s_id = "<unknown>";
 +      const char *errstr;
 +      struct va_format vaf = { .fmt = fmt };
 +      va_list args;
 +
 +      if (fs_info)
 +              s_id = fs_info->sb->s_id;
  
 -      (void)ret; /* FIXME: need to fix VFS to return error? */
 +      va_start(args, fmt);
 +      vaf.va = &args;
 +
 +      errstr = btrfs_decode_error(fs_info, errno, nbuf);
 +      if (fs_info &&
 +          (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
 +              panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
 +                      s_id, function, line, &vaf, errstr);
 +
 +      printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n",
 +             s_id, function, line, &vaf, errstr);
 +      va_end(args);
 +      /* Caller calls BUG() */
 +}
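
The "Caller calls BUG()" note refers to the wrapping macro: when fatal_errors=panic is not set, __btrfs_panic() returns and the wrapper BUGs instead, so execution never continues normally either way. A sketch of such a wrapper, reconstructed from the behaviour described here (the exact form is an assumption):

    #define btrfs_panic(fs_info, errno, fmt, args...)                       \
    do {                                                                    \
            struct btrfs_fs_info *_i = (fs_info);                           \
            __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args);      \
            BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR));    \
    } while (0)
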
 +
 +static void btrfs_put_super(struct super_block *sb)
 +{
 +      (void)close_ctree(btrfs_sb(sb)->tree_root);
 +      /* FIXME: need to fix VFS to return error? */
 +      /* AV: return it _where_?  ->put_super() can be triggered by any number
 +       * of async events, up to and including delivery of SIGKILL to the
 +       * last process that kept it busy.  Or segfault in the aforementioned
 +       * process...  Whom would you report that to?
 +       */
  }
  
  enum {
        Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
        Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
        Opt_check_integrity, Opt_check_integrity_including_extent_data,
 -      Opt_check_integrity_print_mask,
 +      Opt_check_integrity_print_mask, Opt_fatal_errors,
        Opt_err,
  };
  
@@@ -318,14 -207,12 +318,14 @@@ static match_table_t tokens = 
        {Opt_check_integrity, "check_int"},
        {Opt_check_integrity_including_extent_data, "check_int_data"},
        {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
 +      {Opt_fatal_errors, "fatal_errors=%s"},
        {Opt_err, NULL},
  };
  
  /*
   * Regular mount options parser.  Everything that is needed only when
   * reading in a new superblock is parsed here.
 + * XXX JDM: This needs to be cleaned up for remount.
   */
  int btrfs_parse_options(struct btrfs_root *root, char *options)
  {
                        ret = -EINVAL;
                        goto out;
  #endif
 +              case Opt_fatal_errors:
 +                      if (strcmp(args[0].from, "panic") == 0)
 +                              btrfs_set_opt(info->mount_opt,
 +                                            PANIC_ON_FATAL_ERROR);
 +                      else if (strcmp(args[0].from, "bug") == 0)
 +                              btrfs_clear_opt(info->mount_opt,
 +                                            PANIC_ON_FATAL_ERROR);
 +                      else {
 +                              ret = -EINVAL;
 +                              goto out;
 +                      }
 +                      break;
                case Opt_err:
                        printk(KERN_INFO "btrfs: unrecognized mount option "
                               "'%s'\n", p);
@@@ -667,8 -542,7 +667,8 @@@ out
  static struct dentry *get_default_root(struct super_block *sb,
                                       u64 subvol_objectid)
  {
 -      struct btrfs_root *root = sb->s_fs_info;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 +      struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_root *new_root;
        struct btrfs_dir_item *di;
        struct btrfs_path *path;
         * will mount by default if we haven't been given a specific subvolume
         * to mount.
         */
 -      dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
 +      dir_id = btrfs_super_root_dir(fs_info->super_copy);
        di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
        if (IS_ERR(di)) {
                btrfs_free_path(path);
                 */
                btrfs_free_path(path);
                dir_id = BTRFS_FIRST_FREE_OBJECTID;
 -              new_root = root->fs_info->fs_root;
 +              new_root = fs_info->fs_root;
                goto setup_root;
        }
  
        btrfs_free_path(path);
  
  find_root:
 -      new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
 +      new_root = btrfs_read_fs_root_no_name(fs_info, &location);
        if (IS_ERR(new_root))
                return ERR_CAST(new_root);
  
@@@ -756,7 -630,7 +756,7 @@@ static int btrfs_fill_super(struct supe
  {
        struct inode *inode;
        struct dentry *root_dentry;
 -      struct btrfs_root *tree_root;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_key key;
        int err;
  
        sb->s_flags |= MS_POSIXACL;
  #endif
  
 -      tree_root = open_ctree(sb, fs_devices, (char *)data);
 -
 -      if (IS_ERR(tree_root)) {
 +      err = open_ctree(sb, fs_devices, (char *)data);
 +      if (err) {
                printk("btrfs: open_ctree failed\n");
 -              return PTR_ERR(tree_root);
 +              return err;
        }
 -      sb->s_fs_info = tree_root;
  
        key.objectid = BTRFS_FIRST_FREE_OBJECTID;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
 -      inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
 +      inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto fail_close;
  
        save_mount_options(sb, data);
        cleancache_init_fs(sb);
 +      sb->s_flags |= MS_ACTIVE;
        return 0;
  
  fail_close:
 -      close_ctree(tree_root);
 +      close_ctree(fs_info->tree_root);
        return err;
  }
  
  int btrfs_sync_fs(struct super_block *sb, int wait)
  {
        struct btrfs_trans_handle *trans;
 -      struct btrfs_root *root = btrfs_sb(sb);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 +      struct btrfs_root *root = fs_info->tree_root;
        int ret;
  
        trace_btrfs_sync_fs(wait);
  
        if (!wait) {
 -              filemap_flush(root->fs_info->btree_inode->i_mapping);
 +              filemap_flush(fs_info->btree_inode->i_mapping);
                return 0;
        }
  
        return ret;
  }
  
 -static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 +static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
  {
 -      struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
 -      struct btrfs_fs_info *info = root->fs_info;
 +      struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
 +      struct btrfs_root *root = info->tree_root;
        char *compress_type;
  
        if (btrfs_test_opt(root, DEGRADED))
                seq_puts(seq, ",inode_cache");
        if (btrfs_test_opt(root, SKIP_BALANCE))
                seq_puts(seq, ",skip_balance");
 +      if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
 +              seq_puts(seq, ",fatal_errors=panic");
        return 0;
  }
  
  static int btrfs_test_super(struct super_block *s, void *data)
  {
 -      struct btrfs_root *test_root = data;
 -      struct btrfs_root *root = btrfs_sb(s);
 +      struct btrfs_fs_info *p = data;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(s);
  
 -      /*
 -       * If this super block is going away, return false as it
 -       * can't match as an existing super block.
 -       */
 -      if (!atomic_read(&s->s_active))
 -              return 0;
 -      return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
 +      return fs_info->fs_devices == p->fs_devices;
  }
  
  static int btrfs_set_super(struct super_block *s, void *data)
  {
 -      s->s_fs_info = data;
 -
 -      return set_anon_super(s, data);
 +      int err = set_anon_super(s, data);
 +      if (!err)
 +              s->s_fs_info = data;
 +      return err;
  }
  
  /*
@@@ -1070,6 -947,12 +1070,6 @@@ static struct dentry *btrfs_mount(struc
        if (!fs_info)
                return ERR_PTR(-ENOMEM);
  
 -      fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
 -      if (!fs_info->tree_root) {
 -              error = -ENOMEM;
 -              goto error_fs_info;
 -      }
 -      fs_info->tree_root->fs_info = fs_info;
        fs_info->fs_devices = fs_devices;
  
        fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
        }
  
        bdev = fs_devices->latest_bdev;
 -      s = sget(fs_type, btrfs_test_super, btrfs_set_super,
 -               fs_info->tree_root);
 +      s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
        if (IS_ERR(s)) {
                error = PTR_ERR(s);
                goto error_close_devices;
        }
  
        if (s->s_root) {
 -              if ((flags ^ s->s_flags) & MS_RDONLY) {
 -                      deactivate_locked_super(s);
 -                      error = -EBUSY;
 -                      goto error_close_devices;
 -              }
 -
                btrfs_close_devices(fs_devices);
                free_fs_info(fs_info);
 +              if ((flags ^ s->s_flags) & MS_RDONLY)
 +                      error = -EBUSY;
        } else {
                char b[BDEVNAME_SIZE];
  
                s->s_flags = flags | MS_NOSEC;
                strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 -              btrfs_sb(s)->fs_info->bdev_holder = fs_type;
 +              btrfs_sb(s)->bdev_holder = fs_type;
                error = btrfs_fill_super(s, fs_devices, data,
                                         flags & MS_SILENT ? 1 : 0);
 -              if (error) {
 -                      deactivate_locked_super(s);
 -                      return ERR_PTR(error);
 -              }
 -
 -              s->s_flags |= MS_ACTIVE;
        }
  
 -      root = get_default_root(s, subvol_objectid);
 -      if (IS_ERR(root)) {
 +      root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
 +      if (IS_ERR(root))
                deactivate_locked_super(s);
 -              return root;
 -      }
  
        return root;
  
@@@ -1125,22 -1021,12 +1125,22 @@@ error_fs_info
  
  static int btrfs_remount(struct super_block *sb, int *flags, char *data)
  {
 -      struct btrfs_root *root = btrfs_sb(sb);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 +      struct btrfs_root *root = fs_info->tree_root;
 +      unsigned old_flags = sb->s_flags;
 +      unsigned long old_opts = fs_info->mount_opt;
 +      unsigned long old_compress_type = fs_info->compress_type;
 +      u64 old_max_inline = fs_info->max_inline;
 +      u64 old_alloc_start = fs_info->alloc_start;
 +      int old_thread_pool_size = fs_info->thread_pool_size;
 +      unsigned int old_metadata_ratio = fs_info->metadata_ratio;
        int ret;
  
        ret = btrfs_parse_options(root, data);
 -      if (ret)
 -              return -EINVAL;
 +      if (ret) {
 +              ret = -EINVAL;
 +              goto restore;
 +      }
  
        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                return 0;
        if (*flags & MS_RDONLY) {
                sb->s_flags |= MS_RDONLY;
  
 -              ret =  btrfs_commit_super(root);
 -              WARN_ON(ret);
 +              ret = btrfs_commit_super(root);
 +              if (ret)
 +                      goto restore;
        } else {
 -              if (root->fs_info->fs_devices->rw_devices == 0)
 -                      return -EACCES;
 +              if (fs_info->fs_devices->rw_devices == 0) {
 +                      ret = -EACCES;
 +                      goto restore;
 +              }
  
 -              if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
 -                      return -EINVAL;
 +              if (btrfs_super_log_root(fs_info->super_copy) != 0) {
 +                      ret = -EINVAL;
 +                      goto restore;
 +              }
  
 -              ret = btrfs_cleanup_fs_roots(root->fs_info);
 -              WARN_ON(ret);
 +              ret = btrfs_cleanup_fs_roots(fs_info);
 +              if (ret)
 +                      goto restore;
  
                /* recover relocation */
                ret = btrfs_recover_relocation(root);
 -              WARN_ON(ret);
 +              if (ret)
 +                      goto restore;
  
                sb->s_flags &= ~MS_RDONLY;
        }
  
        return 0;
 +
 +restore:
 +      /* We've hit an error - don't reset MS_RDONLY */
 +      if (sb->s_flags & MS_RDONLY)
 +              old_flags |= MS_RDONLY;
 +      sb->s_flags = old_flags;
 +      fs_info->mount_opt = old_opts;
 +      fs_info->compress_type = old_compress_type;
 +      fs_info->max_inline = old_max_inline;
 +      fs_info->alloc_start = old_alloc_start;
 +      fs_info->thread_pool_size = old_thread_pool_size;
 +      fs_info->metadata_ratio = old_metadata_ratio;
 +      return ret;
  }
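
The reworked btrfs_remount() above drops the old WARN_ON()-and-continue
style: every tunable is snapshotted up front, and the restore: label rolls
all of them back if any step fails. The same shape in miniature; struct
opts and apply_new_opts() are hypothetical stand-ins, not btrfs code:

	#include <linux/types.h>

	struct opts {
		unsigned long mount_opt;
		u64 max_inline;
	};

	static int apply_new_opts(struct opts *cur);	/* hypothetical; may fail midway */

	static int remount_sketch(struct opts *cur)
	{
		struct opts old = *cur;	/* snapshot before changing anything */
		int ret;

		ret = apply_new_opts(cur);
		if (ret)
			*cur = old;	/* failure: roll back to the snapshot */
		return ret;
	}

Restoring from a local copy keeps the error path trivially correct as new
mount options are added, which is the point of the new restore: label.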
  
  /* Used to sort the devices by max_avail (descending sort) */
@@@ -1344,18 -1212,18 +1344,18 @@@ static int btrfs_calc_avail_data_space(
  
  static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
  {
 -      struct btrfs_root *root = btrfs_sb(dentry->d_sb);
 -      struct btrfs_super_block *disk_super = root->fs_info->super_copy;
 -      struct list_head *head = &root->fs_info->space_info;
 +      struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
 +      struct btrfs_super_block *disk_super = fs_info->super_copy;
 +      struct list_head *head = &fs_info->space_info;
        struct btrfs_space_info *found;
        u64 total_used = 0;
        u64 total_free_data = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
 -      __be32 *fsid = (__be32 *)root->fs_info->fsid;
 +      __be32 *fsid = (__be32 *)fs_info->fsid;
        int ret;
  
        /* holding chunk_mutex to avoid allocating new chunks */
 -      mutex_lock(&root->fs_info->chunk_mutex);
 +      mutex_lock(&fs_info->chunk_mutex);
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_type = BTRFS_SUPER_MAGIC;
        buf->f_bavail = total_free_data;
 -      ret = btrfs_calc_avail_data_space(root, &total_free_data);
 +      ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
        if (ret) {
 -              mutex_unlock(&root->fs_info->chunk_mutex);
 +              mutex_unlock(&fs_info->chunk_mutex);
                return ret;
        }
        buf->f_bavail += total_free_data;
        buf->f_bavail = buf->f_bavail >> bits;
 -      mutex_unlock(&root->fs_info->chunk_mutex);
 +      mutex_unlock(&fs_info->chunk_mutex);
  
        /* We treat it as constant endianness (it doesn't matter _which_)
           because we want the fsid to come out the same whether mounted
           under a big-endian or little-endian host */
        return 0;
  }
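
The conversion above reports byte counters shifted down by
s_blocksize_bits; because the block size is a power of two, the right
shift is an exact division into f_bsize units. A small worked example
(the 4 KiB block size is an assumption for illustration):

	u64 bytes = 8ULL << 30;		/* 8 GiB of free data */
	int bits = 12;			/* s_blocksize_bits for 4 KiB blocks */
	u64 blocks = bytes >> bits;	/* 8 GiB / 4 KiB = 2097152 blocks */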
  
 +static void btrfs_kill_super(struct super_block *sb)
 +{
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 +      kill_anon_super(sb);
 +      free_fs_info(fs_info);
 +}
 +
  static struct file_system_type btrfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "btrfs",
        .mount          = btrfs_mount,
 -      .kill_sb        = kill_anon_super,
 +      .kill_sb        = btrfs_kill_super,
        .fs_flags       = FS_REQUIRES_DEV,
  };
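
Since fs_info is now allocated before the superblock itself,
kill_anon_super() alone would leak it; btrfs_kill_super() therefore grabs
the private pointer first, tears down the VFS superblock, and frees the
info last. A sketch of the same wrapper shape for a hypothetical
filesystem (all myfs_* names are invented):

	#include <linux/fs.h>
	#include <linux/slab.h>

	struct myfs_info {
		unsigned long flags;	/* stand-in for per-mount state */
	};

	static void myfs_kill_super(struct super_block *sb)
	{
		struct myfs_info *info = sb->s_fs_info;

		kill_anon_super(sb);	/* superblock is gone after this */
		kfree(info);		/* private data outlives it; free last */
	}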
  
@@@ -1440,17 -1301,17 +1440,17 @@@ static long btrfs_control_ioctl(struct 
  
  static int btrfs_freeze(struct super_block *sb)
  {
 -      struct btrfs_root *root = btrfs_sb(sb);
 -      mutex_lock(&root->fs_info->transaction_kthread_mutex);
 -      mutex_lock(&root->fs_info->cleaner_mutex);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 +      mutex_lock(&fs_info->transaction_kthread_mutex);
 +      mutex_lock(&fs_info->cleaner_mutex);
        return 0;
  }
  
  static int btrfs_unfreeze(struct super_block *sb)
  {
 -      struct btrfs_root *root = btrfs_sb(sb);
 -      mutex_unlock(&root->fs_info->cleaner_mutex);
 -      mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 +      struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 +      mutex_unlock(&fs_info->cleaner_mutex);
 +      mutex_unlock(&fs_info->transaction_kthread_mutex);
        return 0;
  }
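
btrfs_freeze() parks the transaction and cleaner kthreads by taking their
mutexes and deliberately returns with both held; btrfs_unfreeze() releases
them in reverse acquisition order. The hold-across-calls shape in
isolation (the two mutexes here are hypothetical placeholders):

	#include <linux/mutex.h>

	static DEFINE_MUTEX(kthread_a);
	static DEFINE_MUTEX(kthread_b);

	static int freeze_sketch(void)
	{
		mutex_lock(&kthread_a);		/* both stay held on return */
		mutex_lock(&kthread_b);
		return 0;
	}

	static int unfreeze_sketch(void)
	{
		mutex_unlock(&kthread_b);	/* reverse acquisition order */
		mutex_unlock(&kthread_a);
		return 0;
	}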
  
@@@ -1515,7 -1376,9 +1515,7 @@@ static int __init init_btrfs_fs(void
        if (err)
                return err;
  
 -      err = btrfs_init_compress();
 -      if (err)
 -              goto free_sysfs;
 +      btrfs_init_compress();
  
        err = btrfs_init_cachep();
        if (err)
        if (err)
                goto unregister_ioctl;
  
+       btrfs_init_lockdep();
        printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
        return 0;
  
@@@ -1556,6 -1421,7 +1558,6 @@@ free_cachep
        btrfs_destroy_cachep();
  free_compress:
        btrfs_exit_compress();
 -free_sysfs:
        btrfs_exit_sysfs();
        return err;
  }
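
With btrfs_init_compress() no longer able to fail, its free_sysfs unwind
label goes away and the error ladder loses one rung. The general pattern:
each init step that succeeds earns exactly one matching teardown, taken in
reverse order when a later step fails. In miniature (init_a, init_b and
exit_a are hypothetical):

	static int init_a(void), init_b(void);	/* hypothetical init steps */
	static void exit_a(void);		/* matching teardown for init_a */

	static int __init init_sketch(void)
	{
		int err;

		err = init_a();
		if (err)
			return err;	/* nothing succeeded, nothing to undo */

		err = init_b();
		if (err)
			goto undo_a;	/* undo only what already succeeded */

		return 0;

	undo_a:
		exit_a();
		return err;
	}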