btrfs: Take trans lock before accessing running trans in check_delayed_ref
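
The running transaction can presumably be freed or replaced by a
concurrent transaction commit while check_delayed_ref() dereferences
fs_info->running_transaction, so the bare pointer read was unsafe.  The
hunks below read the pointer under fs_info->trans_lock, pin the
transaction by bumping its use_count, and drop that reference with
btrfs_put_transaction() on every return path (head not found, -EAGAIN,
and normal exit).

A minimal sketch of the resulting pattern (illustrative only; the helper
name is hypothetical and not part of this patch):

    static struct btrfs_transaction *
    pin_running_transaction(struct btrfs_fs_info *fs_info)
    {
            struct btrfs_transaction *cur_trans;

            /* trans_lock keeps running_transaction stable while we take a ref */
            spin_lock(&fs_info->trans_lock);
            cur_trans = fs_info->running_transaction;
            if (cur_trans)
                    refcount_inc(&cur_trans->use_count);
            spin_unlock(&fs_info->trans_lock);

            /* caller must call btrfs_put_transaction(cur_trans) when done */
            return cur_trans;
    }
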
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c1618ab9fecfb06a50861f26e4d445fe9d63190e..3871658b6ab1de7b5a87200fb252a87f54fdef9a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
+
 #include <linux/sched.h>
 #include <linux/sched/signal.h>
 #include <linux/pagemap.h>
@@ -27,7 +15,7 @@
 #include <linux/ratelimit.h>
 #include <linux/percpu_counter.h>
 #include <linux/lockdep.h>
-#include "hash.h"
+#include <linux/crc32c.h>
 #include "tree-log.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -535,13 +523,11 @@ static noinline void caching_thread(struct btrfs_work *work)
        struct btrfs_block_group_cache *block_group;
        struct btrfs_fs_info *fs_info;
        struct btrfs_caching_control *caching_ctl;
-       struct btrfs_root *extent_root;
        int ret;
 
        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;
-       extent_root = fs_info->extent_root;
 
        mutex_lock(&caching_ctl->mutex);
        down_read(&fs_info->commit_root_sem);
@@ -1203,11 +1189,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
        __le64 lenum;
 
        lenum = cpu_to_le64(root_objectid);
-       high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+       high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
-       low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+       low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
-       low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+       low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 
        return ((u64)high_crc << 31) ^ (u64)low_crc;
 }
@@ -2615,13 +2601,19 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
        trace_run_delayed_ref_head(fs_info, head, 0);
 
        if (head->total_ref_mod < 0) {
-               struct btrfs_block_group_cache *cache;
+               struct btrfs_space_info *space_info;
+               u64 flags;
 
-               cache = btrfs_lookup_block_group(fs_info, head->bytenr);
-               ASSERT(cache);
-               percpu_counter_add(&cache->space_info->total_bytes_pinned,
+               if (head->is_data)
+                       flags = BTRFS_BLOCK_GROUP_DATA;
+               else if (head->is_system)
+                       flags = BTRFS_BLOCK_GROUP_SYSTEM;
+               else
+                       flags = BTRFS_BLOCK_GROUP_METADATA;
+               space_info = __find_space_info(fs_info, flags);
+               ASSERT(space_info);
+               percpu_counter_add(&space_info->total_bytes_pinned,
                                   -head->num_bytes);
-               btrfs_put_block_group(cache);
 
                if (head->is_data) {
                        spin_lock(&delayed_refs->lock);
@@ -2652,9 +2644,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
  */
 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                                            struct btrfs_fs_info *fs_info,
                                             unsigned long nr)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
@@ -2994,7 +2986,7 @@ static void delayed_ref_async_start(struct btrfs_work *work)
        if (trans->transid > async->transid)
                goto end;
 
-       ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
+       ret = btrfs_run_delayed_refs(trans, async->count);
        if (ret)
                async->error = ret;
 end:
@@ -3053,8 +3045,9 @@ int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
  * Returns <0 on error and aborts the transaction
  */
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                          struct btrfs_fs_info *fs_info, unsigned long count)
+                          unsigned long count)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct rb_node *node;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_head *head;
@@ -3078,7 +3071,7 @@ again:
        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
        trans->can_flush_pending_bgs = false;
-       ret = __btrfs_run_delayed_refs(trans, fs_info, count);
+       ret = __btrfs_run_delayed_refs(trans, count);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                return ret;
@@ -3086,7 +3079,7 @@ again:
 
        if (run_all) {
                if (!list_empty(&trans->new_bgs))
-                       btrfs_create_pending_block_groups(trans, fs_info);
+                       btrfs_create_pending_block_groups(trans);
 
                spin_lock(&delayed_refs->lock);
                node = rb_first(&delayed_refs->href_root);
@@ -3149,7 +3142,11 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
        struct rb_node *node;
        int ret = 0;
 
+       spin_lock(&root->fs_info->trans_lock);
        cur_trans = root->fs_info->running_transaction;
+       if (cur_trans)
+               refcount_inc(&cur_trans->use_count);
+       spin_unlock(&root->fs_info->trans_lock);
        if (!cur_trans)
                return 0;
 
@@ -3158,6 +3155,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
        if (!head) {
                spin_unlock(&delayed_refs->lock);
+               btrfs_put_transaction(cur_trans);
                return 0;
        }
 
@@ -3174,6 +3172,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
                mutex_lock(&head->mutex);
                mutex_unlock(&head->mutex);
                btrfs_put_delayed_ref_head(head);
+               btrfs_put_transaction(cur_trans);
                return -EAGAIN;
        }
        spin_unlock(&delayed_refs->lock);
@@ -3206,6 +3205,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
        }
        spin_unlock(&head->lock);
        mutex_unlock(&head->mutex);
+       btrfs_put_transaction(cur_trans);
        return ret;
 }
 
@@ -3660,9 +3660,9 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
  * the commit latency by getting rid of the easy block groups while
  * we're still allowing others to join the commit.
  */
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
-                                  struct btrfs_fs_info *fs_info)
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
@@ -3686,7 +3686,7 @@ again:
         * make sure all the block groups on our dirty list actually
         * exist
         */
-       btrfs_create_pending_block_groups(trans, fs_info);
+       btrfs_create_pending_block_groups(trans);
 
        if (!path) {
                path = btrfs_alloc_path();
@@ -3741,8 +3741,9 @@ again:
                                should_put = 0;
 
                                /*
-                                * the cache_write_mutex is protecting
-                                * the io_list
+                                * The cache_write_mutex is protecting the
+                                * io_list, also refer to the definition of
+                                * btrfs_transaction::io_bgs for more details
                                 */
                                list_add_tail(&cache->io_list, io);
                        } else {
@@ -3800,7 +3801,7 @@ again:
         * go through delayed refs for all the stuff we've just kicked off
         * and then loop back (just once)
         */
-       ret = btrfs_run_delayed_refs(trans, fs_info, 0);
+       ret = btrfs_run_delayed_refs(trans, 0);
        if (!ret && loops == 0) {
                loops++;
                spin_lock(&cur_trans->dirty_bgs_lock);
@@ -3882,7 +3883,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                cache_save_setup(cache, trans, path);
 
                if (!ret)
-                       ret = btrfs_run_delayed_refs(trans, fs_info,
+                       ret = btrfs_run_delayed_refs(trans,
                                                     (unsigned long) -1);
 
                if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
@@ -3934,6 +3935,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);
 
+       /*
+        * Refer to the definition of the io_bgs member for details on why
+        * it's safe to use it without any locking.
+        */
        while (!list_empty(io)) {
                cache = list_first_entry(io, struct btrfs_block_group_cache,
                                         io_list);
@@ -4333,8 +4338,7 @@ again:
 
                /* commit the current transaction and try again */
 commit_trans:
-               if (need_commit &&
-                   !atomic_read(&fs_info->open_ioctl_trans)) {
+               if (need_commit) {
                        need_commit--;
 
                        if (need_commit > 0) {
@@ -4542,7 +4546,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
         * Needed because we can end up allocating a system chunk and for an
         * atomic and race free space reservation in the chunk block reserve.
         */
-       ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+       lockdep_assert_held(&fs_info->chunk_mutex);
 
        info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
        spin_lock(&info->lock);
@@ -4603,11 +4607,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                return -ENOSPC;
 
        space_info = __find_space_info(fs_info, flags);
-       if (!space_info) {
-               ret = create_space_info(fs_info, flags, &space_info);
-               if (ret)
-                       return ret;
-       }
+       ASSERT(space_info);
 
 again:
        spin_lock(&space_info->lock);
@@ -4644,6 +4644,7 @@ again:
        if (wait_for_alloc) {
                mutex_unlock(&fs_info->chunk_mutex);
                wait_for_alloc = 0;
+               cond_resched();
                goto again;
        }
 
@@ -4706,7 +4707,7 @@ out:
         */
        if (trans->can_flush_pending_bgs &&
            trans->chunk_bytes_reserved >= (u64)SZ_2M) {
-               btrfs_create_pending_block_groups(trans, fs_info);
+               btrfs_create_pending_block_groups(trans);
                btrfs_trans_release_chunk_metadata(trans);
        }
        return ret;
@@ -4827,7 +4828,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
        long time_left;
        unsigned long nr_pages;
        int loops;
-       enum btrfs_reserve_flush_enum flush;
 
        /* Calc the number of the pages we need flush for space reservation */
        items = calc_reclaim_items_nr(fs_info, to_reclaim);
@@ -4868,10 +4868,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
                           atomic_read(&fs_info->async_delalloc_pages) <=
                           (int)max_reclaim);
 skip_async:
-               if (!trans)
-                       flush = BTRFS_RESERVE_FLUSH_ALL;
-               else
-                       flush = BTRFS_RESERVE_NO_FLUSH;
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets) &&
                    list_empty(&space_info->priority_tickets)) {
@@ -4994,7 +4990,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                        ret = PTR_ERR(trans);
                        break;
                }
-               ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
+               ret = btrfs_run_delayed_items_nr(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case FLUSH_DELALLOC:
@@ -5389,10 +5385,15 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
                    !block_rsv_use_bytes(global_rsv, orig_bytes))
                        ret = 0;
        }
-       if (ret == -ENOSPC)
+       if (ret == -ENOSPC) {
                trace_btrfs_space_reservation(fs_info, "space_info:enospc",
                                              block_rsv->space_info->flags,
                                              orig_bytes, 1);
+
+               if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+                       dump_space_info(fs_info, block_rsv->space_info,
+                                       orig_bytes, 0);
+       }
        return ret;
 }
 
@@ -5572,14 +5573,18 @@ again:
 
 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_block_rsv *block_rsv,
-                                   struct btrfs_block_rsv *dest, u64 num_bytes)
+                                   struct btrfs_block_rsv *dest, u64 num_bytes,
+                                   u64 *qgroup_to_release_ret)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
+       u64 qgroup_to_release = 0;
        u64 ret;
 
        spin_lock(&block_rsv->lock);
-       if (num_bytes == (u64)-1)
+       if (num_bytes == (u64)-1) {
                num_bytes = block_rsv->size;
+               qgroup_to_release = block_rsv->qgroup_rsv_size;
+       }
        block_rsv->size -= num_bytes;
        if (block_rsv->reserved >= block_rsv->size) {
                num_bytes = block_rsv->reserved - block_rsv->size;
@@ -5588,6 +5593,13 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
        } else {
                num_bytes = 0;
        }
+       if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
+               qgroup_to_release = block_rsv->qgroup_rsv_reserved -
+                                   block_rsv->qgroup_rsv_size;
+               block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
+       } else {
+               qgroup_to_release = 0;
+       }
        spin_unlock(&block_rsv->lock);
 
        ret = num_bytes;
@@ -5610,6 +5622,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
                        space_info_add_old_bytes(fs_info, space_info,
                                                 num_bytes);
        }
+       if (qgroup_to_release_ret)
+               *qgroup_to_release_ret = qgroup_to_release;
        return ret;
 }
 
@@ -5751,48 +5765,72 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
        struct btrfs_root *root = inode->root;
        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
        u64 num_bytes = 0;
+       u64 qgroup_num_bytes = 0;
        int ret = -ENOSPC;
 
        spin_lock(&block_rsv->lock);
        if (block_rsv->reserved < block_rsv->size)
                num_bytes = block_rsv->size - block_rsv->reserved;
+       if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
+               qgroup_num_bytes = block_rsv->qgroup_rsv_size -
+                                  block_rsv->qgroup_rsv_reserved;
        spin_unlock(&block_rsv->lock);
 
        if (num_bytes == 0)
                return 0;
 
+       ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
+       if (ret)
+               return ret;
        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
        if (!ret) {
                block_rsv_add_bytes(block_rsv, num_bytes, 0);
                trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                              btrfs_ino(inode), num_bytes, 1);
-       }
+
+               /* Don't forget to increase qgroup_rsv_reserved */
+               spin_lock(&block_rsv->lock);
+               block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
+               spin_unlock(&block_rsv->lock);
+       } else
+               btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
        return ret;
 }
 
 /**
  * btrfs_inode_rsv_release - release any excessive reservation.
  * @inode - the inode we need to release from.
+ * @qgroup_free - free or convert qgroup meta.
+ *   Unlike normal operation, qgroup meta reservation needs to know if we are
+ *   freeing qgroup reservation or just converting it into per-trans.  Normally
+ *   @qgroup_free is true for error handling, and false for normal release.
  *
  * This is the same as btrfs_block_rsv_release, except that it handles the
  * tracepoint for the reservation.
  */
-static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
        u64 released = 0;
+       u64 qgroup_to_release = 0;
 
        /*
         * Since we statically set the block_rsv->size we just want to say we
         * are releasing 0 bytes, and then we'll just get the reservation over
         * the size free'd.
         */
-       released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0);
+       released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
+                                          &qgroup_to_release);
        if (released > 0)
                trace_btrfs_space_reservation(fs_info, "delalloc",
                                              btrfs_ino(inode), released, 0);
+       if (qgroup_free)
+               btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
+       else
+               btrfs_qgroup_convert_reserved_meta(inode->root,
+                                                  qgroup_to_release);
 }
 
 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
@@ -5804,7 +5842,7 @@ void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
        if (global_rsv == block_rsv ||
            block_rsv->space_info != global_rsv->space_info)
                global_rsv = NULL;
-       block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
+       block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
 }
 
 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -5884,7 +5922,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
        block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
-                               (u64)-1);
+                               (u64)-1, NULL);
        WARN_ON(fs_info->trans_block_rsv.size > 0);
        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
        WARN_ON(fs_info->chunk_block_rsv.size > 0);
@@ -5893,24 +5931,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
 }
 
-void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info)
-{
-       if (!trans->block_rsv) {
-               ASSERT(!trans->bytes_reserved);
-               return;
-       }
-
-       if (!trans->bytes_reserved)
-               return;
-
-       ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
-       trace_btrfs_space_reservation(fs_info, "transaction",
-                                     trans->transid, trans->bytes_reserved, 0);
-       btrfs_block_rsv_release(fs_info, trans->block_rsv,
-                               trans->bytes_reserved);
-       trans->bytes_reserved = 0;
-}
 
 /*
  * To be called after all the new block groups attached to the transaction
@@ -5926,7 +5946,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
        WARN_ON_ONCE(!list_empty(&trans->new_bgs));
 
        block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
-                               trans->chunk_bytes_reserved);
+                               trans->chunk_bytes_reserved, NULL);
        trans->chunk_bytes_reserved = 0;
 }
 
@@ -5952,7 +5972,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
         */
        u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 
-       trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), 
+       trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
                        num_bytes, 1);
        return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
 }
@@ -5996,7 +6016,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
                /* One for parent inode, two for dir entries */
                num_bytes = 3 * fs_info->nodesize;
-               ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
+               ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
                if (ret)
                        return ret;
        } else {
@@ -6015,7 +6035,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
 
        if (ret && *qgroup_reserved)
-               btrfs_qgroup_free_meta(root, *qgroup_reserved);
+               btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
 
        return ret;
 }
@@ -6031,6 +6051,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 {
        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
        u64 reserve_size = 0;
+       u64 qgroup_rsv_size = 0;
        u64 csum_leaves;
        unsigned outstanding_extents;
 
@@ -6043,16 +6064,23 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
                                                 inode->csum_bytes);
        reserve_size += btrfs_calc_trans_metadata_size(fs_info,
                                                       csum_leaves);
+       /*
+        * For qgroup rsv, the calculation is very simple:
+        * account one nodesize for each outstanding extent
+        *
+        * This overestimates in most cases.
+        */
+       qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
 
        spin_lock(&block_rsv->lock);
        block_rsv->size = reserve_size;
+       block_rsv->qgroup_rsv_size = qgroup_rsv_size;
        spin_unlock(&block_rsv->lock);
 }
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-       struct btrfs_root *root = inode->root;
        unsigned nr_extents;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
        int ret = 0;
@@ -6069,13 +6097,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
        if (btrfs_is_free_space_inode(inode)) {
                flush = BTRFS_RESERVE_NO_FLUSH;
                delalloc_lock = false;
-       } else if (current->journal_info) {
-               flush = BTRFS_RESERVE_FLUSH_LIMIT;
-       }
+       } else {
+               if (current->journal_info)
+                       flush = BTRFS_RESERVE_FLUSH_LIMIT;
 
-       if (flush != BTRFS_RESERVE_NO_FLUSH &&
-           btrfs_transaction_in_commit(fs_info))
-               schedule_timeout(1);
+               if (btrfs_transaction_in_commit(fs_info))
+                       schedule_timeout(1);
+       }
 
        if (delalloc_lock)
                mutex_lock(&inode->delalloc_mutex);
@@ -6090,19 +6118,9 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 
-       if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
-               ret = btrfs_qgroup_reserve_meta(root,
-                               nr_extents * fs_info->nodesize, true);
-               if (ret)
-                       goto out_fail;
-       }
-
        ret = btrfs_inode_rsv_refill(inode, flush);
-       if (unlikely(ret)) {
-               btrfs_qgroup_free_meta(root,
-                                      nr_extents * fs_info->nodesize);
+       if (unlikely(ret))
                goto out_fail;
-       }
 
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
@@ -6116,7 +6134,7 @@ out_fail:
        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 
-       btrfs_inode_rsv_release(inode);
+       btrfs_inode_rsv_release(inode, true);
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
        return ret;
@@ -6126,12 +6144,14 @@ out_fail:
  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
  * @inode: the inode to release the reservation for.
  * @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
  *
  * This will release the metadata reservation for an inode.  This can be called
  * once we complete IO for a given set of bytes to release their metadata
  * reservations, or on error for the same reason.
  */
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+                                    bool qgroup_free)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 
@@ -6144,13 +6164,14 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
        if (btrfs_is_testing(fs_info))
                return;
 
-       btrfs_inode_rsv_release(inode);
+       btrfs_inode_rsv_release(inode, qgroup_free);
 }
 
 /**
  * btrfs_delalloc_release_extents - release our outstanding_extents
  * @inode: the inode to balance the reservation for.
  * @num_bytes: the number of bytes we originally reserved with
+ * @qgroup_free: do we need to free the qgroup meta reservation or convert it.
  *
  * When we reserve space we increase outstanding_extents for the extents we may
  * add.  Once we've set the range as delalloc or created our ordered extents we
@@ -6158,7 +6179,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
  * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
  * with btrfs_delalloc_reserve_metadata.
  */
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
+                                   bool qgroup_free)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
        unsigned num_extents;
@@ -6172,7 +6194,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
        if (btrfs_is_testing(fs_info))
                return;
 
-       btrfs_inode_rsv_release(inode);
+       btrfs_inode_rsv_release(inode, qgroup_free);
 }
 
 /**
@@ -6228,9 +6250,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
  */
 void btrfs_delalloc_release_space(struct inode *inode,
                                  struct extent_changeset *reserved,
-                                 u64 start, u64 len)
+                                 u64 start, u64 len, bool qgroup_free)
 {
-       btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
+       btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
        btrfs_free_reserved_data_space(inode, reserved, start, len);
 }
 
@@ -6785,9 +6807,9 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
        return 0;
 }
 
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
-                              struct btrfs_fs_info *fs_info)
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *block_group, *tmp;
        struct list_head *deleted_bgs;
        struct extent_io_tree *unpin;
@@ -7353,29 +7375,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
        return ret;
 }
 
-int __get_raid_index(u64 flags)
-{
-       if (flags & BTRFS_BLOCK_GROUP_RAID10)
-               return BTRFS_RAID_RAID10;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1)
-               return BTRFS_RAID_RAID1;
-       else if (flags & BTRFS_BLOCK_GROUP_DUP)
-               return BTRFS_RAID_DUP;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID0)
-               return BTRFS_RAID_RAID0;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID5)
-               return BTRFS_RAID_RAID5;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID6)
-               return BTRFS_RAID_RAID6;
-
-       return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
-int get_block_group_index(struct btrfs_block_group_cache *cache)
-{
-       return __get_raid_index(cache->flags);
-}
-
 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10]     = "raid10",
        [BTRFS_RAID_RAID1]      = "raid1",
@@ -7490,7 +7489,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
        u64 empty_cluster = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
-       int index = __get_raid_index(flags);
+       int index = btrfs_bg_flags_to_raid_index(flags);
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
        bool use_cluster = true;
@@ -7576,7 +7575,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
                                btrfs_put_block_group(block_group);
                                up_read(&space_info->groups_sem);
                        } else {
-                               index = get_block_group_index(block_group);
+                               index = btrfs_bg_flags_to_raid_index(
+                                               block_group->flags);
                                btrfs_lock_block_group(block_group, delalloc);
                                goto have_block_group;
                        }
@@ -7586,7 +7586,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
        }
 search:
        have_caching_bg = false;
-       if (index == 0 || index == __get_raid_index(flags))
+       if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
                full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups[index],
@@ -7844,7 +7844,8 @@ checks:
 loop:
                failed_cluster_refill = false;
                failed_alloc = false;
-               BUG_ON(index != get_block_group_index(block_group));
+               BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+                      index);
                btrfs_release_block_group(block_group, delalloc);
                cond_resched();
        }
@@ -7998,6 +7999,51 @@ again:
        up_read(&info->groups_sem);
 }
 
+/*
+ * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
+ *                       hole that is at least as big as @num_bytes.
+ *
+ * @root           -   The root that will contain this extent
+ *
+ * @ram_bytes      -   The amount of space in ram that @num_bytes take. This
+ *                     is used for accounting purposes. This value differs
+ *                     from @num_bytes only in the case of compressed extents.
+ *
+ * @num_bytes      -   Number of bytes to allocate on-disk.
+ *
+ * @min_alloc_size -   Indicates the minimum amount of space that the
+ *                     allocator should try to satisfy. In some cases
+ *                     @num_bytes may be larger than what is required and if
+ *                     the filesystem is fragmented then allocation fails.
+ *                     However, the presence of @min_alloc_size gives a
+ *                     chance to try and satisfy the smaller allocation.
+ *
+ * @empty_size     -   A hint that you plan on doing more COW. This is the
+ *                     size in bytes the allocator should try to find free
+ *                     next to the block it returns.  This is just a hint and
+ *                     may be ignored by the allocator.
+ *
+ * @hint_byte      -   Hint to the allocator to start searching above the byte
+ *                     address passed. It might be ignored.
+ *
+ * @ins            -   This key is modified to record the found hole. It will
+ *                     have the following values:
+ *                     ins->objectid == start position
+ *                     ins->flags = BTRFS_EXTENT_ITEM_KEY
+ *                     ins->offset == the size of the hole.
+ *
+ * @is_data        -   Boolean flag indicating whether an extent is
+ *                     allocated for data (true) or metadata (false)
+ *
+ * @delalloc       -   Boolean flag indicating whether this allocation is for
+ *                     delalloc or not. If 'true' data_rwsem of block groups
+ *                     is going to be acquired.
+ *
+ *
+ * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
+ * case -ENOSPC is returned then @ins->offset will contain the size of the
+ * largest available hole the allocator managed to find.
+ */
 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
                         u64 num_bytes, u64 min_alloc_size,
                         u64 empty_size, u64 hint_byte,
@@ -8407,7 +8453,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
                            struct btrfs_block_rsv *block_rsv, u32 blocksize)
 {
        block_rsv_add_bytes(block_rsv, blocksize, 0);
-       block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
+       block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
 }
 
 /*
@@ -8701,6 +8747,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        u64 parent;
        u32 blocksize;
        struct btrfs_key key;
+       struct btrfs_key first_key;
        struct extent_buffer *next;
        int level = wc->level;
        int reada = 0;
@@ -8721,6 +8768,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        }
 
        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+       btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+                             path->slots[level]);
        blocksize = fs_info->nodesize;
 
        next = find_extent_buffer(fs_info, bytenr);
@@ -8785,7 +8834,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        if (!next) {
                if (reada && level == 1)
                        reada_walk_down(trans, root, wc, path);
-               next = read_tree_block(fs_info, bytenr, generation);
+               next = read_tree_block(fs_info, bytenr, generation, level - 1,
+                                      &first_key);
                if (IS_ERR(next)) {
                        return PTR_ERR(next);
                } else if (!extent_buffer_uptodate(next)) {
@@ -9650,7 +9700,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
         */
        target = get_restripe_target(fs_info, block_group->flags);
        if (target) {
-               index = __get_raid_index(extended_to_chunk(target));
+               index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
        } else {
                /*
                 * this is just a balance, so if we were marked as full
@@ -9664,7 +9714,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
                        goto out;
                }
 
-               index = get_block_group_index(block_group);
+               index = btrfs_bg_flags_to_raid_index(block_group->flags);
        }
 
        if (index == BTRFS_RAID_RAID10) {
@@ -9913,10 +9963,40 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
        return 0;
 }
 
+/* link_block_group will queue up kobjects to add when we're reclaim-safe */
+void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_space_info *space_info;
+       struct raid_kobject *rkobj;
+       LIST_HEAD(list);
+       int index;
+       int ret = 0;
+
+       spin_lock(&fs_info->pending_raid_kobjs_lock);
+       list_splice_init(&fs_info->pending_raid_kobjs, &list);
+       spin_unlock(&fs_info->pending_raid_kobjs_lock);
+
+       list_for_each_entry(rkobj, &list, list) {
+               space_info = __find_space_info(fs_info, rkobj->flags);
+               index = btrfs_bg_flags_to_raid_index(rkobj->flags);
+
+               ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+                                 "%s", get_raid_name(index));
+               if (ret) {
+                       kobject_put(&rkobj->kobj);
+                       break;
+               }
+       }
+       if (ret)
+               btrfs_warn(fs_info,
+                          "failed to add kobject for block cache, ignoring");
+}
+
 static void link_block_group(struct btrfs_block_group_cache *cache)
 {
        struct btrfs_space_info *space_info = cache->space_info;
-       int index = get_block_group_index(cache);
+       struct btrfs_fs_info *fs_info = cache->fs_info;
+       int index = btrfs_bg_flags_to_raid_index(cache->flags);
        bool first = false;
 
        down_write(&space_info->groups_sem);
@@ -9926,27 +10006,20 @@ static void link_block_group(struct btrfs_block_group_cache *cache)
        up_write(&space_info->groups_sem);
 
        if (first) {
-               struct raid_kobject *rkobj;
-               int ret;
-
-               rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
-               if (!rkobj)
-                       goto out_err;
-               rkobj->raid_type = index;
-               kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
-               ret = kobject_add(&rkobj->kobj, &space_info->kobj,
-                                 "%s", get_raid_name(index));
-               if (ret) {
-                       kobject_put(&rkobj->kobj);
-                       goto out_err;
+               struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+               if (!rkobj) {
+                       btrfs_warn(cache->fs_info,
+                               "couldn't alloc memory for raid level kobject");
+                       return;
                }
+               rkobj->flags = cache->flags;
+               kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+               spin_lock(&fs_info->pending_raid_kobjs_lock);
+               list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
+               spin_unlock(&fs_info->pending_raid_kobjs_lock);
                space_info->block_group_kobjs[index] = &rkobj->kobj;
        }
-
-       return;
-out_err:
-       btrfs_warn(cache->fs_info,
-                  "failed to add kobject for block cache, ignoring");
 }
 
 static struct btrfs_block_group_cache *
@@ -10162,6 +10235,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
                        inc_block_group_ro(cache, 1);
        }
 
+       btrfs_add_raid_kobjects(info);
        init_global_block_rsv(info);
        ret = 0;
 error:
@@ -10169,9 +10243,9 @@ error:
        return ret;
 }
 
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
-                                      struct btrfs_fs_info *fs_info)
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *block_group, *tmp;
        struct btrfs_root *extent_root = fs_info->extent_root;
        struct btrfs_block_group_item item;
@@ -10256,15 +10330,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
         * with its ->space_info set.
         */
        cache->space_info = __find_space_info(fs_info, cache->flags);
-       if (!cache->space_info) {
-               ret = create_space_info(fs_info, cache->flags,
-                                      &cache->space_info);
-               if (ret) {
-                       btrfs_remove_free_space_cache(cache);
-                       btrfs_put_block_group(cache);
-                       return ret;
-               }
-       }
+       ASSERT(cache->space_info);
 
        ret = btrfs_add_block_group_cache(fs_info, cache);
        if (ret) {
@@ -10336,7 +10402,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                                  block_group->key.offset);
 
        memcpy(&key, &block_group->key, sizeof(key));
-       index = get_block_group_index(block_group);
+       index = btrfs_bg_flags_to_raid_index(block_group->flags);
        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
                                  BTRFS_BLOCK_GROUP_RAID1 |
                                  BTRFS_BLOCK_GROUP_RAID10))