btrfs: Take trans lock before accessing running trans in check_delayed_ref
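
The running transaction can presumably be freed or replaced by a
concurrent transaction commit while check_delayed_ref() dereferences
fs_info->running_transaction, so the bare pointer read was unsafe.  The
hunks below read the pointer under fs_info->trans_lock, pin the
transaction by bumping its use_count, and drop that reference with
btrfs_put_transaction() on every return path (head not found, -EAGAIN,
and normal exit).

A minimal sketch of the resulting pattern (illustrative only; the helper
name is hypothetical and not part of this patch):

    static struct btrfs_transaction *
    pin_running_transaction(struct btrfs_fs_info *fs_info)
    {
            struct btrfs_transaction *cur_trans;

            /* trans_lock keeps running_transaction stable while we take a ref */
            spin_lock(&fs_info->trans_lock);
            cur_trans = fs_info->running_transaction;
            if (cur_trans)
                    refcount_inc(&cur_trans->use_count);
            spin_unlock(&fs_info->trans_lock);

            /* caller must call btrfs_put_transaction(cur_trans) when done */
            return cur_trans;
    }
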
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c1618ab9fecfb06a50861f26e4d445fe9d63190e..3871658b6ab1de7b5a87200fb252a87f54fdef9a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
  */
+
 #include <linux/sched.h>
 #include <linux/sched/signal.h>
 #include <linux/pagemap.h>
@@ -27,7 +15,7 @@
 #include <linux/ratelimit.h>
 #include <linux/percpu_counter.h>
 #include <linux/lockdep.h>
-#include "hash.h"
+#include <linux/crc32c.h>
 #include "tree-log.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -535,13 +523,11 @@ static noinline void caching_thread(struct btrfs_work *work)
        struct btrfs_block_group_cache *block_group;
        struct btrfs_fs_info *fs_info;
        struct btrfs_caching_control *caching_ctl;
-       struct btrfs_root *extent_root;
        int ret;
 
        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;
-       extent_root = fs_info->extent_root;
 
        mutex_lock(&caching_ctl->mutex);
        down_read(&fs_info->commit_root_sem);
@@ -1203,11 +1189,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
        __le64 lenum;
 
        lenum = cpu_to_le64(root_objectid);
-       high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+       high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
-       low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+       low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
-       low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+       low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 
        return ((u64)high_crc << 31) ^ (u64)low_crc;
 }
@@ -2615,13 +2601,19 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
        trace_run_delayed_ref_head(fs_info, head, 0);
 
        if (head->total_ref_mod < 0) {
-               struct btrfs_block_group_cache *cache;
+               struct btrfs_space_info *space_info;
+               u64 flags;
 
-               cache = btrfs_lookup_block_group(fs_info, head->bytenr);
-               ASSERT(cache);
-               percpu_counter_add(&cache->space_info->total_bytes_pinned,
+               if (head->is_data)
+                       flags = BTRFS_BLOCK_GROUP_DATA;
+               else if (head->is_system)
+                       flags = BTRFS_BLOCK_GROUP_SYSTEM;
+               else
+                       flags = BTRFS_BLOCK_GROUP_METADATA;
+               space_info = __find_space_info(fs_info, flags);
+               ASSERT(space_info);
+               percpu_counter_add(&space_info->total_bytes_pinned,
                                   -head->num_bytes);
-               btrfs_put_block_group(cache);
 
                if (head->is_data) {
                        spin_lock(&delayed_refs->lock);
@@ -2652,9 +2644,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
  */
 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                                            struct btrfs_fs_info *fs_info,
                                             unsigned long nr)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_ref_head *locked_ref = NULL;
@@ -2994,7 +2986,7 @@ static void delayed_ref_async_start(struct btrfs_work *work)
        if (trans->transid > async->transid)
                goto end;
 
-       ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
+       ret = btrfs_run_delayed_refs(trans, async->count);
        if (ret)
                async->error = ret;
 end:
@@ -3053,8 +3045,9 @@ int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
  * Returns <0 on error and aborts the transaction
  */
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-                          struct btrfs_fs_info *fs_info, unsigned long count)
+                          unsigned long count)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct rb_node *node;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_head *head;
@@ -3078,7 +3071,7 @@ again:
        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
        trans->can_flush_pending_bgs = false;
-       ret = __btrfs_run_delayed_refs(trans, fs_info, count);
+       ret = __btrfs_run_delayed_refs(trans, count);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                return ret;
@@ -3086,7 +3079,7 @@ again:
 
        if (run_all) {
                if (!list_empty(&trans->new_bgs))
-                       btrfs_create_pending_block_groups(trans, fs_info);
+                       btrfs_create_pending_block_groups(trans);
 
                spin_lock(&delayed_refs->lock);
                node = rb_first(&delayed_refs->href_root);
@@ -3149,7 +3142,11 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
        struct rb_node *node;
        int ret = 0;
 
+       spin_lock(&root->fs_info->trans_lock);
        cur_trans = root->fs_info->running_transaction;
+       if (cur_trans)
+               refcount_inc(&cur_trans->use_count);
+       spin_unlock(&root->fs_info->trans_lock);
        if (!cur_trans)
                return 0;
 
@@ -3158,6 +3155,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
        if (!head) {
                spin_unlock(&delayed_refs->lock);
+               btrfs_put_transaction(cur_trans);
                return 0;
        }
 
@@ -3174,6 +3172,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
                mutex_lock(&head->mutex);
                mutex_unlock(&head->mutex);
                btrfs_put_delayed_ref_head(head);
+               btrfs_put_transaction(cur_trans);
                return -EAGAIN;
        }
        spin_unlock(&delayed_refs->lock);
@@ -3206,6 +3205,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
        }
        spin_unlock(&head->lock);
        mutex_unlock(&head->mutex);
+       btrfs_put_transaction(cur_trans);
        return ret;
 }
 
@@ -3660,9 +3660,9 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
  * the commit latency by getting rid of the easy block groups while
  * we're still allowing others to join the commit.
  */
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
-                                  struct btrfs_fs_info *fs_info)
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
@@ -3686,7 +3686,7 @@ again:
         * make sure all the block groups on our dirty list actually
         * exist
         */
-       btrfs_create_pending_block_groups(trans, fs_info);
+       btrfs_create_pending_block_groups(trans);
 
        if (!path) {
                path = btrfs_alloc_path();
@@ -3741,8 +3741,9 @@ again:
                                should_put = 0;
 
                                /*
-                                * the cache_write_mutex is protecting
-                                * the io_list
+                                * The cache_write_mutex is protecting the
+                                * io_list, also refer to the definition of
+                                * btrfs_transaction::io_bgs for more details
                                 */
                                list_add_tail(&cache->io_list, io);
                        } else {
@@ -3800,7 +3801,7 @@ again:
         * go through delayed refs for all the stuff we've just kicked off
         * and then loop back (just once)
         */
-       ret = btrfs_run_delayed_refs(trans, fs_info, 0);
+       ret = btrfs_run_delayed_refs(trans, 0);
        if (!ret && loops == 0) {
                loops++;
                spin_lock(&cur_trans->dirty_bgs_lock);
@@ -3882,7 +3883,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                cache_save_setup(cache, trans, path);
 
                if (!ret)
-                       ret = btrfs_run_delayed_refs(trans, fs_info,
+                       ret = btrfs_run_delayed_refs(trans,
                                                     (unsigned long) -1);
 
                if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
@@ -3934,6 +3935,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
        }
        spin_unlock(&cur_trans->dirty_bgs_lock);
 
+       /*
+        * Refer to the definition of the io_bgs member for details on why
+        * it's safe to use it without any locking.
+        */
        while (!list_empty(io)) {
                cache = list_first_entry(io, struct btrfs_block_group_cache,
                                         io_list);
@@ -4333,8 +4338,7 @@ again:
 
                /* commit the current transaction and try again */
 commit_trans:
-               if (need_commit &&
-                   !atomic_read(&fs_info->open_ioctl_trans)) {
+               if (need_commit) {
                        need_commit--;
 
                        if (need_commit > 0) {
@@ -4542,7 +4546,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
         * Needed because we can end up allocating a system chunk and for an
         * atomic and race free space reservation in the chunk block reserve.
         */
-       ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+       lockdep_assert_held(&fs_info->chunk_mutex);
 
        info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
        spin_lock(&info->lock);
@@ -4603,11 +4607,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                return -ENOSPC;
 
        space_info = __find_space_info(fs_info, flags);
-       if (!space_info) {
-               ret = create_space_info(fs_info, flags, &space_info);
-               if (ret)
-                       return ret;
-       }
+       ASSERT(space_info);
 
 again:
        spin_lock(&space_info->lock);
@@ -4644,6 +4644,7 @@ again:
        if (wait_for_alloc) {
                mutex_unlock(&fs_info->chunk_mutex);
                wait_for_alloc = 0;
+               cond_resched();
                goto again;
        }
 
@@ -4706,7 +4707,7 @@ out:
         */
        if (trans->can_flush_pending_bgs &&
            trans->chunk_bytes_reserved >= (u64)SZ_2M) {
-               btrfs_create_pending_block_groups(trans, fs_info);
+               btrfs_create_pending_block_groups(trans);
                btrfs_trans_release_chunk_metadata(trans);
        }
        return ret;
@@ -4827,7 +4828,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
        long time_left;
        unsigned long nr_pages;
        int loops;
-       enum btrfs_reserve_flush_enum flush;
 
        /* Calc the number of the pages we need flush for space reservation */
        items = calc_reclaim_items_nr(fs_info, to_reclaim);
@@ -4868,10 +4868,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
                           atomic_read(&fs_info->async_delalloc_pages) <=
                           (int)max_reclaim);
 skip_async:
-               if (!trans)
-                       flush = BTRFS_RESERVE_FLUSH_ALL;
-               else
-                       flush = BTRFS_RESERVE_NO_FLUSH;
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets) &&
                    list_empty(&space_info->priority_tickets)) {
@@ -4994,7 +4990,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
                        ret = PTR_ERR(trans);
                        break;
                }
-               ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
+               ret = btrfs_run_delayed_items_nr(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case FLUSH_DELALLOC:
@@ -5389,10 +5385,15 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
                    !block_rsv_use_bytes(global_rsv, orig_bytes))
                        ret = 0;
        }
-       if (ret == -ENOSPC)
+       if (ret == -ENOSPC) {
                trace_btrfs_space_reservation(fs_info, "space_info:enospc",
                                              block_rsv->space_info->flags,
                                              orig_bytes, 1);
+
+               if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+                       dump_space_info(fs_info, block_rsv->space_info,
+                                       orig_bytes, 0);
+       }
        return ret;
 }
 
@@ -5572,14 +5573,18 @@ again:
 
 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_block_rsv *block_rsv,
-                                   struct btrfs_block_rsv *dest, u64 num_bytes)
+                                   struct btrfs_block_rsv *dest, u64 num_bytes,
+                                   u64 *qgroup_to_release_ret)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
+       u64 qgroup_to_release = 0;
        u64 ret;
 
        spin_lock(&block_rsv->lock);
-       if (num_bytes == (u64)-1)
+       if (num_bytes == (u64)-1) {
                num_bytes = block_rsv->size;
+               qgroup_to_release = block_rsv->qgroup_rsv_size;
+       }
        block_rsv->size -= num_bytes;
        if (block_rsv->reserved >= block_rsv->size) {
                num_bytes = block_rsv->reserved - block_rsv->size;
@@ -5588,6 +5593,13 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
        } else {
                num_bytes = 0;
        }
+       if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
+               qgroup_to_release = block_rsv->qgroup_rsv_reserved -
+                                   block_rsv->qgroup_rsv_size;
+               block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
+       } else {
+               qgroup_to_release = 0;
+       }
        spin_unlock(&block_rsv->lock);
 
        ret = num_bytes;
@@ -5610,6 +5622,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
                        space_info_add_old_bytes(fs_info, space_info,
                                                 num_bytes);
        }
+       if (qgroup_to_release_ret)
+               *qgroup_to_release_ret = qgroup_to_release;
        return ret;
 }
 
@@ -5751,48 +5765,72 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
        struct btrfs_root *root = inode->root;
        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
        u64 num_bytes = 0;
+       u64 qgroup_num_bytes = 0;
        int ret = -ENOSPC;
 
        spin_lock(&block_rsv->lock);
        if (block_rsv->reserved < block_rsv->size)
                num_bytes = block_rsv->size - block_rsv->reserved;
+       if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
+               qgroup_num_bytes = block_rsv->qgroup_rsv_size -
+                                  block_rsv->qgroup_rsv_reserved;
        spin_unlock(&block_rsv->lock);
 
        if (num_bytes == 0)
                return 0;
 
+       ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
+       if (ret)
+               return ret;
        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
        if (!ret) {
                block_rsv_add_bytes(block_rsv, num_bytes, 0);
                trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                              btrfs_ino(inode), num_bytes, 1);
-       }
+
+               /* Don't forget to increase qgroup_rsv_reserved */
+               spin_lock(&block_rsv->lock);
+               block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
+               spin_unlock(&block_rsv->lock);
+       } else
+               btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
        return ret;
 }
 
 /**
  * btrfs_inode_rsv_release - release any excessive reservation.
  * @inode - the inode we need to release from.
+ * @qgroup_free - free or convert qgroup meta.
+ *   Unlike normal operation, qgroup meta reservation needs to know if we are
+ *   freeing qgroup reservation or just converting it into per-trans.  Normally
+ *   @qgroup_free is true for error handling, and false for normal release.
  *
  * This is the same as btrfs_block_rsv_release, except that it handles the
  * tracepoint for the reservation.
  */
-static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
        u64 released = 0;
+       u64 qgroup_to_release = 0;
 
        /*
         * Since we statically set the block_rsv->size we just want to say we
         * are releasing 0 bytes, and then we'll just get the reservation over
         * the size free'd.
         */
-       released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0);
+       released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
+                                          &qgroup_to_release);
        if (released > 0)
                trace_btrfs_space_reservation(fs_info, "delalloc",
                                              btrfs_ino(inode), released, 0);
+       if (qgroup_free)
+               btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
+       else
+               btrfs_qgroup_convert_reserved_meta(inode->root,
+                                                  qgroup_to_release);
 }
 
 void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
@@ -5804,7 +5842,7 @@ void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
        if (global_rsv == block_rsv ||
            block_rsv->space_info != global_rsv->space_info)
                global_rsv = NULL;
-       block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
+       block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
 }
 
 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -5884,7 +5922,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
        block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
-                               (u64)-1);
+                               (u64)-1, NULL);
        WARN_ON(fs_info->trans_block_rsv.size > 0);
        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
        WARN_ON(fs_info->chunk_block_rsv.size > 0);
@@ -5893,24 +5931,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
 }
 
-void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
-                                 struct btrfs_fs_info *fs_info)
-{
-       if (!trans->block_rsv) {
-               ASSERT(!trans->bytes_reserved);
-               return;
-       }
-
-       if (!trans->bytes_reserved)
-               return;
-
-       ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
-       trace_btrfs_space_reservation(fs_info, "transaction",
-                                     trans->transid, trans->bytes_reserved, 0);
-       btrfs_block_rsv_release(fs_info, trans->block_rsv,
-                               trans->bytes_reserved);
-       trans->bytes_reserved = 0;
-}
 
 /*
  * To be called after all the new block groups attached to the transaction
@@ -5926,7 +5946,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
        WARN_ON_ONCE(!list_empty(&trans->new_bgs));
 
        block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
-                               trans->chunk_bytes_reserved);
+                               trans->chunk_bytes_reserved, NULL);
        trans->chunk_bytes_reserved = 0;
 }
 
@@ -5952,7 +5972,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
         */
        u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 
-       trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), 
+       trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
                        num_bytes, 1);
        return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
 }
@@ -5996,7 +6016,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
                /* One for parent inode, two for dir entries */
                num_bytes = 3 * fs_info->nodesize;
-               ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
+               ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
                if (ret)
                        return ret;
        } else {
@@ -6015,7 +6035,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
 
        if (ret && *qgroup_reserved)
-               btrfs_qgroup_free_meta(root, *qgroup_reserved);
+               btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
 
        return ret;
 }
@@ -6031,6 +6051,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 {
        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
        u64 reserve_size = 0;
+       u64 qgroup_rsv_size = 0;
        u64 csum_leaves;
        unsigned outstanding_extents;
 
@@ -6043,16 +6064,23 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
                                                 inode->csum_bytes);
        reserve_size += btrfs_calc_trans_metadata_size(fs_info,
                                                       csum_leaves);
+       /*
+        * For qgroup rsv, the calculation is very simple:
+        * account one nodesize for each outstanding extent
+        *
+        * This overestimates in most cases.
+        */
+       qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
 
        spin_lock(&block_rsv->lock);
        block_rsv->size = reserve_size;
+       block_rsv->qgroup_rsv_size = qgroup_rsv_size;
        spin_unlock(&block_rsv->lock);
 }
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
-       struct btrfs_root *root = inode->root;
        unsigned nr_extents;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
        int ret = 0;
@@ -6069,13 +6097,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
        if (btrfs_is_free_space_inode(inode)) {
                flush = BTRFS_RESERVE_NO_FLUSH;
                delalloc_lock = false;
-       } else if (current->journal_info) {
-               flush = BTRFS_RESERVE_FLUSH_LIMIT;
-       }
+       } else {
+               if (current->journal_info)
+                       flush = BTRFS_RESERVE_FLUSH_LIMIT;
 
-       if (flush != BTRFS_RESERVE_NO_FLUSH &&
-           btrfs_transaction_in_commit(fs_info))
-               schedule_timeout(1);
+               if (btrfs_transaction_in_commit(fs_info))
+                       schedule_timeout(1);
+       }
 
        if (delalloc_lock)
                mutex_lock(&inode->delalloc_mutex);
@@ -6090,19 +6118,9 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 
-       if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
-               ret = btrfs_qgroup_reserve_meta(root,
-                               nr_extents * fs_info->nodesize, true);
-               if (ret)
-                       goto out_fail;
-       }
-
        ret = btrfs_inode_rsv_refill(inode, flush);
-       if (unlikely(ret)) {
-               btrfs_qgroup_free_meta(root,
-                                      nr_extents * fs_info->nodesize);
+       if (unlikely(ret))
                goto out_fail;
-       }
 
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
@@ -6116,7 +6134,7 @@ out_fail:
        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 
-       btrfs_inode_rsv_release(inode);
+       btrfs_inode_rsv_release(inode, true);
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
        return ret;
@@ -6126,12 +6144,14 @@ out_fail:
  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
  * @inode: the inode to release the reservation for.
  * @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
  *
  * This will release the metadata reservation for an inode.  This can be called
  * once we complete IO for a given set of bytes to release their metadata
  * reservations, or on error for the same reason.
  */
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+                                    bool qgroup_free)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
 
@@ -6144,13 +6164,14 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
        if (btrfs_is_testing(fs_info))
                return;
 
-       btrfs_inode_rsv_release(inode);
+       btrfs_inode_rsv_release(inode, qgroup_free);
 }
 
 /**
  * btrfs_delalloc_release_extents - release our outstanding_extents
  * @inode: the inode to balance the reservation for.
  * @num_bytes: the number of bytes we originally reserved with
+ * @qgroup_free: do we need to free the qgroup meta reservation or convert it.
  *
  * When we reserve space we increase outstanding_extents for the extents we may
  * add.  Once we've set the range as delalloc or created our ordered extents we
@@ -6158,7 +6179,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
  * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
  * with btrfs_delalloc_reserve_metadata.
  */
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
+                                   bool qgroup_free)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
        unsigned num_extents;
@@ -6172,7 +6194,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
        if (btrfs_is_testing(fs_info))
                return;
 
-       btrfs_inode_rsv_release(inode);
+       btrfs_inode_rsv_release(inode, qgroup_free);
 }
 
 /**
@@ -6228,9 +6250,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
  */
 void btrfs_delalloc_release_space(struct inode *inode,
                                  struct extent_changeset *reserved,
-                                 u64 start, u64 len)
+                                 u64 start, u64 len, bool qgroup_free)
 {
-       btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
+       btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
        btrfs_free_reserved_data_space(inode, reserved, start, len);
 }
 
@@ -6785,9 +6807,9 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
        return 0;
 }
 
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
-                              struct btrfs_fs_info *fs_info)
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *block_group, *tmp;
        struct list_head *deleted_bgs;
        struct extent_io_tree *unpin;
@@ -7353,29 +7375,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
        return ret;
 }
 
-int __get_raid_index(u64 flags)
-{
-       if (flags & BTRFS_BLOCK_GROUP_RAID10)
-               return BTRFS_RAID_RAID10;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID1)
-               return BTRFS_RAID_RAID1;
-       else if (flags & BTRFS_BLOCK_GROUP_DUP)
-               return BTRFS_RAID_DUP;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID0)
-               return BTRFS_RAID_RAID0;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID5)
-               return BTRFS_RAID_RAID5;
-       else if (flags & BTRFS_BLOCK_GROUP_RAID6)
-               return BTRFS_RAID_RAID6;
-
-       return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
-int get_block_group_index(struct btrfs_block_group_cache *cache)
-{
-       return __get_raid_index(cache->flags);
-}
-
 static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10]     = "raid10",
        [BTRFS_RAID_RAID1]      = "raid1",
@@ -7490,7 +7489,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
        u64 empty_cluster = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
-       int index = __get_raid_index(flags);
+       int index = btrfs_bg_flags_to_raid_index(flags);
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
        bool use_cluster = true;
@@ -7576,7 +7575,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
                                btrfs_put_block_group(block_group);
                                up_read(&space_info->groups_sem);
                        } else {
-                               index = get_block_group_index(block_group);
+                               index = btrfs_bg_flags_to_raid_index(
+                                               block_group->flags);
                                btrfs_lock_block_group(block_group, delalloc);
                                goto have_block_group;
                        }
@@ -7586,7 +7586,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
        }
 search:
        have_caching_bg = false;
-       if (index == 0 || index == __get_raid_index(flags))
+       if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
                full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups[index],
@@ -7844,7 +7844,8 @@ checks:
 loop:
                failed_cluster_refill = false;
                failed_alloc = false;
-               BUG_ON(index != get_block_group_index(block_group));
+               BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+                      index);
                btrfs_release_block_group(block_group, delalloc);
                cond_resched();
        }
@@ -7998,6 +7999,51 @@ again:
        up_read(&info->groups_sem);
 }
 
+/*
+ * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
+ *                       hole that is at least as big as @num_bytes.
+ *
+ * @root           -   The root that will contain this extent
+ *
+ * @ram_bytes      -   The amount of space in ram that @num_bytes take. This
+ *                     is used for accounting purposes. This value differs
+ *                     from @num_bytes only in the case of compressed extents.
+ *
+ * @num_bytes      -   Number of bytes to allocate on-disk.
+ *
+ * @min_alloc_size -   Indicates the minimum amount of space that the
+ *                     allocator should try to satisfy. In some cases
+ *                     @num_bytes may be larger than what is required and if
+ *                     the filesystem is fragmented then allocation fails.
+ *                     However, the presence of @min_alloc_size gives a
+ *                     chance to try and satisfy the smaller allocation.
+ *
+ * @empty_size     -   A hint that you plan on doing more COW. This is the
+ *                     size in bytes the allocator should try to find free
+ *                     next to the block it returns.  This is just a hint and
+ *                     may be ignored by the allocator.
+ *
+ * @hint_byte      -   Hint to the allocator to start searching above the byte
+ *                     address passed. It might be ignored.
+ *
+ * @ins            -   This key is modified to record the found hole. It will
+ *                     have the following values:
+ *                     ins->objectid == start position
+ *                     ins->flags = BTRFS_EXTENT_ITEM_KEY
+ *                     ins->offset == the size of the hole.
+ *
+ * @is_data        -   Boolean flag indicating whether an extent is
+ *                     allocated for data (true) or metadata (false)
+ *
+ * @delalloc       -   Boolean flag indicating whether this allocation is for
+ *                     delalloc or not. If 'true' data_rwsem of block groups
+ *                     is going to be acquired.
+ *
+ *
+ * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
+ * case -ENOSPC is returned then @ins->offset will contain the size of the
+ * largest available hole the allocator managed to find.
+ */
 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
                         u64 num_bytes, u64 min_alloc_size,
                         u64 empty_size, u64 hint_byte,
@@ -8407,7 +8453,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
                            struct btrfs_block_rsv *block_rsv, u32 blocksize)
 {
        block_rsv_add_bytes(block_rsv, blocksize, 0);
-       block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
+       block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
 }
 
 /*
@@ -8701,6 +8747,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        u64 parent;
        u32 blocksize;
        struct btrfs_key key;
+       struct btrfs_key first_key;
        struct extent_buffer *next;
        int level = wc->level;
        int reada = 0;
@@ -8721,6 +8768,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        }
 
        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+       btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+                             path->slots[level]);
        blocksize = fs_info->nodesize;
 
        next = find_extent_buffer(fs_info, bytenr);
@@ -8785,7 +8834,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        if (!next) {
                if (reada && level == 1)
                        reada_walk_down(trans, root, wc, path);
-               next = read_tree_block(fs_info, bytenr, generation);
+               next = read_tree_block(fs_info, bytenr, generation, level - 1,
+                                      &first_key);
                if (IS_ERR(next)) {
                        return PTR_ERR(next);
                } else if (!extent_buffer_uptodate(next)) {
@@ -9650,7 +9700,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
         */
        target = get_restripe_target(fs_info, block_group->flags);
        if (target) {
-               index = __get_raid_index(extended_to_chunk(target));
+               index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
        } else {
                /*
                 * this is just a balance, so if we were marked as full
@@ -9664,7 +9714,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
                        goto out;
                }
 
-               index = get_block_group_index(block_group);
+               index = btrfs_bg_flags_to_raid_index(block_group->flags);
        }
 
        if (index == BTRFS_RAID_RAID10) {
@@ -9913,10 +9963,40 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
        return 0;
 }
 
+/* link_block_group will queue up kobjects to add when we're reclaim-safe */
+void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_space_info *space_info;
+       struct raid_kobject *rkobj;
+       LIST_HEAD(list);
+       int index;
+       int ret = 0;
+
+       spin_lock(&fs_info->pending_raid_kobjs_lock);
+       list_splice_init(&fs_info->pending_raid_kobjs, &list);
+       spin_unlock(&fs_info->pending_raid_kobjs_lock);
+
+       list_for_each_entry(rkobj, &list, list) {
+               space_info = __find_space_info(fs_info, rkobj->flags);
+               index = btrfs_bg_flags_to_raid_index(rkobj->flags);
+
+               ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+                                 "%s", get_raid_name(index));
+               if (ret) {
+                       kobject_put(&rkobj->kobj);
+                       break;
+               }
+       }
+       if (ret)
+               btrfs_warn(fs_info,
+                          "failed to add kobject for block cache, ignoring");
+}
+
 static void link_block_group(struct btrfs_block_group_cache *cache)
 {
        struct btrfs_space_info *space_info = cache->space_info;
-       int index = get_block_group_index(cache);
+       struct btrfs_fs_info *fs_info = cache->fs_info;
+       int index = btrfs_bg_flags_to_raid_index(cache->flags);
        bool first = false;
 
        down_write(&space_info->groups_sem);
@@ -9926,27 +10006,20 @@ static void link_block_group(struct btrfs_block_group_cache *cache)
        up_write(&space_info->groups_sem);
 
        if (first) {
-               struct raid_kobject *rkobj;
-               int ret;
-
-               rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
-               if (!rkobj)
-                       goto out_err;
-               rkobj->raid_type = index;
-               kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
-               ret = kobject_add(&rkobj->kobj, &space_info->kobj,
-                                 "%s", get_raid_name(index));
-               if (ret) {
-                       kobject_put(&rkobj->kobj);
-                       goto out_err;
+               struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+               if (!rkobj) {
+                       btrfs_warn(cache->fs_info,
+                               "couldn't alloc memory for raid level kobject");
+                       return;
                }
+               rkobj->flags = cache->flags;
+               kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+               spin_lock(&fs_info->pending_raid_kobjs_lock);
+               list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
+               spin_unlock(&fs_info->pending_raid_kobjs_lock);
                space_info->block_group_kobjs[index] = &rkobj->kobj;
        }
-
-       return;
-out_err:
-       btrfs_warn(cache->fs_info,
-                  "failed to add kobject for block cache, ignoring");
 }
 
 static struct btrfs_block_group_cache *
@@ -10162,6 +10235,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
                        inc_block_group_ro(cache, 1);
        }
 
+       btrfs_add_raid_kobjects(info);
        init_global_block_rsv(info);
        ret = 0;
 error:
@@ -10169,9 +10243,9 @@ error:
        return ret;
 }
 
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
-                                      struct btrfs_fs_info *fs_info)
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *block_group, *tmp;
        struct btrfs_root *extent_root = fs_info->extent_root;
        struct btrfs_block_group_item item;
@@ -10256,15 +10330,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
         * with its ->space_info set.
         */
        cache->space_info = __find_space_info(fs_info, cache->flags);
-       if (!cache->space_info) {
-               ret = create_space_info(fs_info, cache->flags,
-                                      &cache->space_info);
-               if (ret) {
-                       btrfs_remove_free_space_cache(cache);
-                       btrfs_put_block_group(cache);
-                       return ret;
-               }
-       }
+       ASSERT(cache->space_info);
 
        ret = btrfs_add_block_group_cache(fs_info, cache);
        if (ret) {
@@ -10336,7 +10402,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                                  block_group->key.offset);
 
        memcpy(&key, &block_group->key, sizeof(key));
-       index = get_block_group_index(block_group);
+       index = btrfs_bg_flags_to_raid_index(block_group->flags);
        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
                                  BTRFS_BLOCK_GROUP_RAID1 |
                                  BTRFS_BLOCK_GROUP_RAID10))