Btrfs: be more polite in the async caching threads
[linux-2.6-block.git] / fs / btrfs / extent-tree.c
index edc7d208c5ce6032fc7de520f5f4a00739755fec..dc84daee6bc472dbc480efd7ad179933277c5eb4 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/blkdev.h>
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
+#include <linux/kthread.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force);
 
+static noinline int
+block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+       smp_mb();
+       return cache->cached == BTRFS_CACHE_FINISHED;
+}
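
The smp_mb() here pairs with the stores the caching thread makes before it
sets BTRFS_CACHE_FINISHED, so a reader that sees FINISHED also sees the free
space the thread added.  The states themselves come from the matching ctree.h
change; presumably the enum looks like this:

	enum btrfs_caching_type {
		BTRFS_CACHE_NO		= 0,
		BTRFS_CACHE_STARTED	= 1,
		BTRFS_CACHE_FINISHED	= 2,
	};
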
+
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
        return (cache->flags & bits) == bits;
@@ -145,21 +153,71 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
        return ret;
 }
 
+/*
+ * We always set EXTENT_LOCKED for the super mirror extents so we don't
+ * overwrite them, so those bits need to be unset.  Also, if we are unmounting
+ * with pinned extents still sitting there because a block group was still
+ * caching, we need to clear those now, since we are done.
+ */
+void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
+{
+       u64 start, end, last = 0;
+       int ret;
+
+       while (1) {
+               ret = find_first_extent_bit(&info->pinned_extents, last,
+                                           &start, &end,
+                                           EXTENT_LOCKED|EXTENT_DIRTY);
+               if (ret)
+                       break;
+
+               clear_extent_bits(&info->pinned_extents, start, end,
+                                 EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
+               last = end + 1;
+       }
+}
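
This export exists for unmount: the caching kthread below bails out once
fs_info->closing > 1, which can leave LOCKED/DIRTY bits behind in
pinned_extents.  The matching disk-io.c change presumably calls this from
close_ctree() once the caching threads have stopped, along the lines of:

	/* assumed call site in close_ctree() */
	btrfs_free_pinned_extents(root->fs_info);
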
+
+static int remove_sb_from_cache(struct btrfs_root *root,
+                               struct btrfs_block_group_cache *cache)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       u64 bytenr;
+       u64 *logical;
+       int stripe_len;
+       int i, nr, ret;
+
+       for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+               bytenr = btrfs_sb_offset(i);
+               ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+                                      cache->key.objectid, bytenr,
+                                      0, &logical, &nr, &stripe_len);
+               BUG_ON(ret);
+               while (nr--) {
+                       try_lock_extent(&fs_info->pinned_extents,
+                                       logical[nr],
+                                       logical[nr] + stripe_len - 1, GFP_NOFS);
+               }
+               kfree(logical);
+       }
+
+       return 0;
+}
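
Note the change in strategy: instead of deleting the super stripes from the
free space cache (which may not be populated yet), the mirrors are marked
EXTENT_LOCKED in pinned_extents up front, and the caching thread's
EXTENT_DIRTY|EXTENT_LOCKED scan simply never hands that space out.  As a
userspace model of where those stripes live, assuming the usual
btrfs_sb_offset() definition (primary super at 64K, mirrors at 64M and 256G):

	#include <stdio.h>

	#define BTRFS_SUPER_INFO_OFFSET		(64 * 1024)
	#define BTRFS_SUPER_MIRROR_MAX		3
	#define BTRFS_SUPER_MIRROR_SHIFT	12

	static unsigned long long sb_offset(int mirror)
	{
		unsigned long long start = 16 * 1024;

		if (mirror)
			return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
		return BTRFS_SUPER_INFO_OFFSET;
	}

	int main(void)
	{
		int i;

		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++)
			printf("super mirror %d at %llu\n", i, sb_offset(i));
		return 0;
	}
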
+
 /*
  * this is only called by cache_block_group, since we could have freed extents
  * we need to check the pinned_extents for any extents that can't be used yet
  * since their free space will be released as soon as the transaction commits.
  */
-static int add_new_free_space(struct btrfs_block_group_cache *block_group,
+static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_fs_info *info, u64 start, u64 end)
 {
-       u64 extent_start, extent_end, size;
+       u64 extent_start, extent_end, size, total_added = 0;
        int ret;
 
        while (start < end) {
                ret = find_first_extent_bit(&info->pinned_extents, start,
                                            &extent_start, &extent_end,
-                                           EXTENT_DIRTY);
+                                           EXTENT_DIRTY|EXTENT_LOCKED);
                if (ret)
                        break;
 
@@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
+                       total_added += size;
                        ret = btrfs_add_free_space(block_group, start,
                                                   size);
                        BUG_ON(ret);
@@ -178,84 +237,80 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 
        if (start < end) {
                size = end - start;
+               total_added += size;
                ret = btrfs_add_free_space(block_group, start, size);
                BUG_ON(ret);
        }
 
-       return 0;
-}
-
-static int remove_sb_from_cache(struct btrfs_root *root,
-                               struct btrfs_block_group_cache *cache)
-{
-       u64 bytenr;
-       u64 *logical;
-       int stripe_len;
-       int i, nr, ret;
-
-       for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
-               bytenr = btrfs_sb_offset(i);
-               ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
-                                      cache->key.objectid, bytenr, 0,
-                                      &logical, &nr, &stripe_len);
-               BUG_ON(ret);
-               while (nr--) {
-                       btrfs_remove_free_space(cache, logical[nr],
-                                               stripe_len);
-               }
-               kfree(logical);
-       }
-       return 0;
+       return total_added;
 }
 
-static int cache_block_group(struct btrfs_root *root,
-                            struct btrfs_block_group_cache *block_group)
+static int caching_kthread(void *data)
 {
+       struct btrfs_block_group_cache *block_group = data;
+       struct btrfs_fs_info *fs_info = block_group->fs_info;
+       u64 last = 0;
        struct btrfs_path *path;
        int ret = 0;
        struct btrfs_key key;
        struct extent_buffer *leaf;
        int slot;
-       u64 last;
-
-       if (!block_group)
-               return 0;
+       u64 total_found = 0;
 
-       root = root->fs_info->extent_root;
-
-       if (block_group->cached)
-               return 0;
+       BUG_ON(!fs_info);
 
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 
-       path->reada = 2;
+       atomic_inc(&block_group->space_info->caching_threads);
+       last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+again:
+       /* need to make sure the commit_root doesn't disappear */
+       down_read(&fs_info->extent_commit_sem);
+
        /*
-        * we get into deadlocks with paths held by callers of this function.
-        * since the alloc_mutex is protecting things right now, just
-        * skip the locking here
+        * We don't want to deadlock with somebody trying to allocate a new
+        * extent for the extent root while also trying to search the extent
+        * root to add free space.  So we skip locking and search the commit
+        * root, since it's read-only
         */
        path->skip_locking = 1;
-       last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+       path->search_commit_root = 1;
+       path->reada = 2;
+
        key.objectid = last;
        key.offset = 0;
        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+       ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;
 
        while (1) {
+               smp_mb();
+               if (block_group->fs_info->closing > 1) {
+                       last = (u64)-1;
+                       break;
+               }
+
                leaf = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(leaf)) {
-                       ret = btrfs_next_leaf(root, path);
+                       ret = btrfs_next_leaf(fs_info->extent_root, path);
                        if (ret < 0)
                                goto err;
-                       if (ret == 0)
-                               continue;
-                       else
+                       else if (ret)
                                break;
+
+                       if (need_resched() ||
+                           btrfs_transaction_in_commit(fs_info)) {
+                               btrfs_release_path(fs_info->extent_root, path);
+                               up_read(&fs_info->extent_commit_sem);
+                               schedule_timeout(1);
+                               goto again;
+                       }
+
+                       continue;
                }
                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.objectid < block_group->key.objectid)
@@ -266,24 +321,59 @@ static int cache_block_group(struct btrfs_root *root,
                        break;
 
                if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
-                       add_new_free_space(block_group, root->fs_info, last,
-                                          key.objectid);
-
+                       total_found += add_new_free_space(block_group,
+                                                         fs_info, last,
+                                                         key.objectid);
                        last = key.objectid + key.offset;
                }
+
+               if (total_found > (1024 * 1024 * 2)) {
+                       total_found = 0;
+                       wake_up(&block_group->caching_q);
+               }
 next:
                path->slots[0]++;
        }
+       ret = 0;
 
-       add_new_free_space(block_group, root->fs_info, last,
-                          block_group->key.objectid +
-                          block_group->key.offset);
+       total_found += add_new_free_space(block_group, fs_info, last,
+                                         block_group->key.objectid +
+                                         block_group->key.offset);
+
+       spin_lock(&block_group->lock);
+       block_group->cached = BTRFS_CACHE_FINISHED;
+       spin_unlock(&block_group->lock);
 
-       block_group->cached = 1;
-       remove_sb_from_cache(root, block_group);
-       ret = 0;
 err:
        btrfs_free_path(path);
+       up_read(&fs_info->extent_commit_sem);
+       atomic_dec(&block_group->space_info->caching_threads);
+       wake_up(&block_group->caching_q);
+
+       return 0;
+}
+
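
This retry loop is where the patch earns its title: before crossing into the
next leaf, the kthread drops its path and extent_commit_sem whenever a commit
is in flight or a reschedule is due, then restarts from 'last'.  One subtlety:
schedule_timeout(1) with the task still in TASK_RUNNING returns as soon as the
thread is scheduled again, so it behaves as a yield rather than a one-tick
sleep.  If an actual sleep were intended, the usual idiom would be:

	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule_timeout(1);	/* really sleep for one tick */
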
+static int cache_block_group(struct btrfs_block_group_cache *cache)
+{
+       struct task_struct *tsk;
+       int ret = 0;
+
+       spin_lock(&cache->lock);
+       if (cache->cached != BTRFS_CACHE_NO) {
+               spin_unlock(&cache->lock);
+               return ret;
+       }
+       cache->cached = BTRFS_CACHE_STARTED;
+       spin_unlock(&cache->lock);
+
+       tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu",
+                         cache->key.objectid);
+       if (IS_ERR(tsk)) {
+               ret = PTR_ERR(tsk);
+               printk(KERN_ERR "error running thread %d\n", ret);
+               BUG();
+       }
+
        return ret;
 }
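
cache_block_group() is now fire-and-forget: it flips the state to STARTED and
spawns the kthread.  Callers that still need the fully cached view pair it
with the wait queue, exactly as btrfs_alloc_logged_file_extent() does later in
this patch:

	cache_block_group(block_group);
	wait_event(block_group->caching_q,
		   block_group_cache_done(block_group));
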
 
@@ -990,15 +1080,13 @@ static inline int extent_ref_type(u64 parent, u64 owner)
        return type;
 }
 
-static int find_next_key(struct btrfs_path *path, struct btrfs_key *key)
+static int find_next_key(struct btrfs_path *path, int level,
+                        struct btrfs_key *key)
 
 {
-       int level;
-       BUG_ON(!path->keep_locks);
-       for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+       for (; level < BTRFS_MAX_LEVEL; level++) {
                if (!path->nodes[level])
                        break;
-               btrfs_assert_tree_locked(path->nodes[level]);
                if (path->slots[level] + 1 >=
                    btrfs_header_nritems(path->nodes[level]))
                        continue;
@@ -1158,7 +1246,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
                 * For simplicity, we just do not add new inline back
                 * ref if there is any kind of item for this block
                 */
-               if (find_next_key(path, &key) == 0 && key.objectid == bytenr &&
+               if (find_next_key(path, 0, &key) == 0 &&
+                   key.objectid == bytenr &&
                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
                        err = -EAGAIN;
                        goto out;
@@ -2388,13 +2477,29 @@ fail:
 
 }
 
+static struct btrfs_block_group_cache *
+next_block_group(struct btrfs_root *root,
+                struct btrfs_block_group_cache *cache)
+{
+       struct rb_node *node;
+       spin_lock(&root->fs_info->block_group_cache_lock);
+       node = rb_next(&cache->cache_node);
+       btrfs_put_block_group(cache);
+       if (node) {
+               cache = rb_entry(node, struct btrfs_block_group_cache,
+                                cache_node);
+               atomic_inc(&cache->count);
+       } else
+               cache = NULL;
+       spin_unlock(&root->fs_info->block_group_cache_lock);
+       return cache;
+}
+
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
-       struct btrfs_block_group_cache *cache, *entry;
-       struct rb_node *n;
+       struct btrfs_block_group_cache *cache;
        int err = 0;
-       int werr = 0;
        struct btrfs_path *path;
        u64 last = 0;
 
@@ -2403,39 +2508,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                return -ENOMEM;
 
        while (1) {
-               cache = NULL;
-               spin_lock(&root->fs_info->block_group_cache_lock);
-               for (n = rb_first(&root->fs_info->block_group_cache_tree);
-                    n; n = rb_next(n)) {
-                       entry = rb_entry(n, struct btrfs_block_group_cache,
-                                        cache_node);
-                       if (entry->dirty) {
-                               cache = entry;
-                               break;
-                       }
+               if (last == 0) {
+                       err = btrfs_run_delayed_refs(trans, root,
+                                                    (unsigned long)-1);
+                       BUG_ON(err);
                }
-               spin_unlock(&root->fs_info->block_group_cache_lock);
 
-               if (!cache)
-                       break;
+               cache = btrfs_lookup_first_block_group(root->fs_info, last);
+               while (cache) {
+                       if (cache->dirty)
+                               break;
+                       cache = next_block_group(root, cache);
+               }
+               if (!cache) {
+                       if (last == 0)
+                               break;
+                       last = 0;
+                       continue;
+               }
 
                cache->dirty = 0;
-               last += cache->key.offset;
+               last = cache->key.objectid + cache->key.offset;
 
-               err = write_one_cache_group(trans, root,
-                                           path, cache);
-               /*
-                * if we fail to write the cache group, we want
-                * to keep it marked dirty in hopes that a later
-                * write will work
-                */
-               if (err) {
-                       werr = err;
-                       continue;
-               }
+               err = write_one_cache_group(trans, root, path, cache);
+               BUG_ON(err);
+               btrfs_put_block_group(cache);
        }
+
        btrfs_free_path(path);
-       return werr;
+       return 0;
 }
 
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2485,6 +2586,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->force_alloc = 0;
        *space_info = found;
        list_add_rcu(&found->list, &info->space_info);
+       atomic_set(&found->caching_threads, 0);
        return 0;
 }
 
@@ -2697,7 +2799,7 @@ again:
 
                printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
                       ", %llu bytes_used, %llu bytes_reserved, "
-                      "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
+                      "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
                       "%llu total\n", (unsigned long long)bytes,
                       (unsigned long long)data_sinfo->bytes_delalloc,
                       (unsigned long long)data_sinfo->bytes_used,
@@ -2948,13 +3050,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
        struct btrfs_block_group_cache *cache;
        struct btrfs_fs_info *fs_info = root->fs_info;
 
-       if (pin) {
+       if (pin)
                set_extent_dirty(&fs_info->pinned_extents,
                                bytenr, bytenr + num - 1, GFP_NOFS);
-       } else {
-               clear_extent_dirty(&fs_info->pinned_extents,
-                               bytenr, bytenr + num - 1, GFP_NOFS);
-       }
 
        while (num > 0) {
                cache = btrfs_lookup_block_group(fs_info, bytenr);
@@ -2970,14 +3068,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
                        spin_unlock(&cache->space_info->lock);
                        fs_info->total_pinned += len;
                } else {
+                       int unpin = 0;
+
+                       /*
+                        * in order to not race with the block group caching, we
+                        * only want to unpin the extent if we are cached.  If
+                        * we aren't cached, we want to start async caching this
+                        * block group so we can free the extent the next time
+                        * around.
+                        */
                        spin_lock(&cache->space_info->lock);
                        spin_lock(&cache->lock);
-                       cache->pinned -= len;
-                       cache->space_info->bytes_pinned -= len;
+                       unpin = (cache->cached == BTRFS_CACHE_FINISHED);
+                       if (likely(unpin)) {
+                               cache->pinned -= len;
+                               cache->space_info->bytes_pinned -= len;
+                               fs_info->total_pinned -= len;
+                       }
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);
-                       fs_info->total_pinned -= len;
-                       if (cache->cached)
+
+                       if (likely(unpin))
+                               clear_extent_dirty(&fs_info->pinned_extents,
+                                                  bytenr, bytenr + len - 1,
+                                                  GFP_NOFS);
+                       else
+                               cache_block_group(cache);
+
+                       if (unpin)
                                btrfs_add_free_space(cache, bytenr, len);
                }
                btrfs_put_block_group(cache);
@@ -3031,6 +3149,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
                                            &start, &end, EXTENT_DIRTY);
                if (ret)
                        break;
+
                set_extent_dirty(copy, start, end, GFP_NOFS);
                last = end + 1;
        }
@@ -3059,6 +3178,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
                cond_resched();
        }
+
        return ret;
 }
 
@@ -3436,6 +3556,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
        return ret;
 }
 
+/*
+ * when we wait for progress in the block group caching, it's because
+ * our allocation attempt failed at least once.  So, we must sleep
+ * and let some progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to
+ * show up, and then it will check the block group free space numbers
+ * for our min num_bytes.  Another option is to have it go ahead
+ * and look in the rbtree for a free extent of a given size, but this
+ * is a good start.
+ */
+static noinline int
+wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+                               u64 num_bytes)
+{
+       DEFINE_WAIT(wait);
+
+       prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
+
+       if (block_group_cache_done(cache)) {
+               finish_wait(&cache->caching_q, &wait);
+               return 0;
+       }
+       schedule();
+       finish_wait(&cache->caching_q, &wait);
+
+       wait_event(cache->caching_q, block_group_cache_done(cache) ||
+                  (cache->free_space >= num_bytes));
+       return 0;
+}
+
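
find_free_extent() below uses this after a failed attempt against a
still-caching group, then jumps back and retries the same group:

	wait_block_group_cache_progress(block_group,
			num_bytes + empty_cluster + empty_size);
	goto have_block_group;
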
+enum btrfs_loop_type {
+       LOOP_CACHED_ONLY = 0,
+       LOOP_CACHING_NOWAIT = 1,
+       LOOP_CACHING_WAIT = 2,
+       LOOP_ALLOC_CHUNK = 3,
+       LOOP_NO_EMPTY_SIZE = 4,
+};
+
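
Each failed pass through find_free_extent() climbs one rung of this ladder and
searches again under looser rules.  Condensed from the escalation at the
bottom of the function (chunk allocation elided), the control flow is roughly:

	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
	    (found_uncached_bg || empty_size || empty_cluster ||
	     allowed_chunk_alloc)) {
		if (loop == LOOP_ALLOC_CHUNK) {
			/* last rung: retry without any padding */
			empty_size = 0;
			empty_cluster = 0;
		}
		loop++;
		goto search;
	}
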
 /*
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
@@ -3461,6 +3620,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        struct btrfs_space_info *space_info;
        int last_ptr_loop = 0;
        int loop = 0;
+       bool found_uncached_bg = false;
 
        WARN_ON(num_bytes < root->sectorsize);
        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3492,15 +3652,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        search_start = max(search_start, first_logical_byte(root, 0));
        search_start = max(search_start, hint_byte);
 
-       if (!last_ptr) {
+       if (!last_ptr)
                empty_cluster = 0;
-               loop = 1;
-       }
 
        if (search_start == hint_byte) {
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
-               if (block_group && block_group_bits(block_group, data)) {
+               /*
+                * we don't want to use the block group if it doesn't match our
+                * allocation bits, or if it's not cached.
+                */
+               if (block_group && block_group_bits(block_group, data) &&
+                   block_group_cache_done(block_group)) {
                        down_read(&space_info->groups_sem);
                        if (list_empty(&block_group->list) ||
                            block_group->ro) {
@@ -3523,21 +3686,35 @@ search:
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups, list) {
                u64 offset;
+               int cached;
 
                atomic_inc(&block_group->count);
                search_start = block_group->key.objectid;
 
 have_block_group:
-               if (unlikely(!block_group->cached)) {
-                       mutex_lock(&block_group->cache_mutex);
-                       ret = cache_block_group(root, block_group);
-                       mutex_unlock(&block_group->cache_mutex);
-                       if (ret) {
-                               btrfs_put_block_group(block_group);
-                               break;
+               if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+                       /*
+                        * we want to start caching kthreads, but not too many
+                        * right off the bat so we don't overwhelm the system,
+                        * so only start them if there are less than 2 and we're
+                        * so only start them if there are fewer than 2 and we're
+                        */
+                       if (loop > LOOP_CACHING_NOWAIT ||
+                           atomic_read(&space_info->caching_threads) < 2) {
+                               ret = cache_block_group(block_group);
+                               BUG_ON(ret);
                        }
                }
 
+               cached = block_group_cache_done(block_group);
+               if (unlikely(!cached)) {
+                       found_uncached_bg = true;
+
+                       /* if we only want cached bgs, loop */
+                       if (loop == LOOP_CACHED_ONLY)
+                               goto loop;
+               }
+
                if (unlikely(block_group->ro))
                        goto loop;
 
@@ -3616,14 +3793,21 @@ refill_cluster:
                                        spin_unlock(&last_ptr->refill_lock);
                                        goto checks;
                                }
+                       } else if (!cached && loop > LOOP_CACHING_NOWAIT) {
+                               spin_unlock(&last_ptr->refill_lock);
+
+                               wait_block_group_cache_progress(block_group,
+                                      num_bytes + empty_cluster + empty_size);
+                               goto have_block_group;
                        }
+
                        /*
                         * at this point we either didn't find a cluster
                         * or we weren't able to allocate a block from our
                         * cluster.  Free the cluster we've been trying
                         * to use, and go to the next block group
                         */
-                       if (loop < 2) {
+                       if (loop < LOOP_NO_EMPTY_SIZE) {
                                btrfs_return_cluster_to_free_space(NULL,
                                                                   last_ptr);
                                spin_unlock(&last_ptr->refill_lock);
@@ -3634,11 +3818,17 @@ refill_cluster:
 
                offset = btrfs_find_space_for_alloc(block_group, search_start,
                                                    num_bytes, empty_size);
-               if (!offset)
+               if (!offset && (cached || (!cached &&
+                                          loop == LOOP_CACHING_NOWAIT))) {
                        goto loop;
+               } else if (!offset && (!cached &&
+                                      loop > LOOP_CACHING_NOWAIT)) {
+                       wait_block_group_cache_progress(block_group,
+                                       num_bytes + empty_size);
+                       goto have_block_group;
+               }
 checks:
                search_start = stripe_align(root, offset);
-
                /* move on to the next group */
                if (search_start + num_bytes >= search_end) {
                        btrfs_add_free_space(block_group, offset, num_bytes);
@@ -3684,13 +3874,26 @@ loop:
        }
        up_read(&space_info->groups_sem);
 
-       /* loop == 0, try to find a clustered alloc in every block group
-        * loop == 1, try again after forcing a chunk allocation
-        * loop == 2, set empty_size and empty_cluster to 0 and try again
+       /* LOOP_CACHED_ONLY, only search fully cached block groups
+        * LOOP_CACHING_NOWAIT, search partially cached block groups, but
+        *                      don't wait for them to finish caching
+        * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+        * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+        * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+        *                      again
         */
-       if (!ins->objectid && loop < 3 &&
-           (empty_size || empty_cluster || allowed_chunk_alloc)) {
-               if (loop >= 2) {
+       if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
+           (found_uncached_bg || empty_size || empty_cluster ||
+            allowed_chunk_alloc)) {
+               if (found_uncached_bg) {
+                       found_uncached_bg = false;
+                       if (loop < LOOP_CACHING_WAIT) {
+                               loop++;
+                               goto search;
+                       }
+               }
+
+               if (loop == LOOP_ALLOC_CHUNK) {
                        empty_size = 0;
                        empty_cluster = 0;
                }
@@ -3703,7 +3906,7 @@ loop:
                        space_info->force_alloc = 1;
                }
 
-               if (loop < 3) {
+               if (loop < LOOP_NO_EMPTY_SIZE) {
                        loop++;
                        goto search;
                }
@@ -3799,7 +4002,7 @@ again:
                               num_bytes, data, 1);
                goto again;
        }
-       if (ret) {
+       if (ret == -ENOSPC) {
                struct btrfs_space_info *sinfo;
 
                sinfo = __find_space_info(root->fs_info, data);
@@ -3807,7 +4010,6 @@ again:
                       "wanted %llu\n", (unsigned long long)data,
                       (unsigned long long)num_bytes);
                dump_space_info(sinfo, num_bytes);
-               BUG();
        }
 
        return ret;
@@ -3845,7 +4047,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
        ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
                                     empty_size, hint_byte, search_end, ins,
                                     data);
-       update_reserved_extents(root, ins->objectid, ins->offset, 1);
+       if (!ret)
+               update_reserved_extents(root, ins->objectid, ins->offset, 1);
+
        return ret;
 }
 
@@ -4007,9 +4211,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_cache *block_group;
 
        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-       mutex_lock(&block_group->cache_mutex);
-       cache_block_group(root, block_group);
-       mutex_unlock(&block_group->cache_mutex);
+       cache_block_group(block_group);
+       wait_event(block_group->caching_q,
+                  block_group_cache_done(block_group));
 
        ret = btrfs_remove_free_space(block_group, ins->objectid,
                                      ins->offset);
@@ -4040,7 +4244,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
        ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
                                     empty_size, hint_byte, search_end,
                                     ins, 0);
-       BUG_ON(ret);
+       if (ret)
+               return ret;
 
        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
                if (parent == 0)
@@ -4128,6 +4333,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
        return buf;
 }
 
+#if 0
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct extent_buffer *leaf)
 {
@@ -4171,8 +4377,6 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
        return 0;
 }
 
-#if 0
-
 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root,
                                        struct btrfs_leaf_ref *ref)
@@ -4553,262 +4757,471 @@ out:
 }
 #endif
 
+struct walk_control {
+       u64 refs[BTRFS_MAX_LEVEL];
+       u64 flags[BTRFS_MAX_LEVEL];
+       struct btrfs_key update_progress;
+       int stage;
+       int level;
+       int shared_level;
+       int update_ref;
+       int keep_locks;
+};
+
+#define DROP_REFERENCE 1
+#define UPDATE_BACKREF 2
+
 /*
- * helper function for drop_subtree, this function is similar to
- * walk_down_tree. The main difference is that it checks reference
- * counts while tree blocks are locked.
+ * helper to process a tree block while walking down the tree.
+ *
+ * when wc->stage == DROP_REFERENCE, this function checks
+ * reference count of the block. if the block is shared and
+ * we need update back refs for the subtree rooted at the
+ * block, this function changes wc->stage to UPDATE_BACKREF
+ *
+ * when wc->stage == UPDATE_BACKREF, this function updates
+ * back refs for pointers in the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
  */
-static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
-                                  struct btrfs_path *path, int *level)
+                                  struct btrfs_path *path,
+                                  struct walk_control *wc)
 {
-       struct extent_buffer *next;
-       struct extent_buffer *cur;
-       struct extent_buffer *parent;
-       u64 bytenr;
-       u64 ptr_gen;
-       u64 refs;
-       u64 flags;
-       u32 blocksize;
+       int level = wc->level;
+       struct extent_buffer *eb = path->nodes[level];
+       struct btrfs_key key;
+       u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
        int ret;
 
-       cur = path->nodes[*level];
-       ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len,
-                                      &refs, &flags);
-       BUG_ON(ret);
-       if (refs > 1)
-               goto out;
+       if (wc->stage == UPDATE_BACKREF &&
+           btrfs_header_owner(eb) != root->root_key.objectid)
+               return 1;
 
-       BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+       /*
+        * when the reference count of a tree block is 1, it won't increase
+        * again. once the full backref flag is set, we never clear it.
+        */
+       if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+           (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
+               BUG_ON(!path->locks[level]);
+               ret = btrfs_lookup_extent_info(trans, root,
+                                              eb->start, eb->len,
+                                              &wc->refs[level],
+                                              &wc->flags[level]);
+               BUG_ON(ret);
+               BUG_ON(wc->refs[level] == 0);
+       }
 
-       while (*level >= 0) {
-               cur = path->nodes[*level];
-               if (*level == 0) {
-                       ret = btrfs_drop_leaf_ref(trans, root, cur);
-                       BUG_ON(ret);
-                       clean_tree_block(trans, root, cur);
-                       break;
-               }
-               if (path->slots[*level] >= btrfs_header_nritems(cur)) {
-                       clean_tree_block(trans, root, cur);
-                       break;
+       if (wc->stage == DROP_REFERENCE &&
+           wc->update_ref && wc->refs[level] > 1) {
+               BUG_ON(eb == root->node);
+               BUG_ON(path->slots[level] > 0);
+               if (level == 0)
+                       btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
+               else
+                       btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
+               if (btrfs_header_owner(eb) == root->root_key.objectid &&
+                   btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
+                       wc->stage = UPDATE_BACKREF;
+                       wc->shared_level = level;
                }
+       }
 
-               bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
-               blocksize = btrfs_level_size(root, *level - 1);
-               ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+       if (wc->stage == DROP_REFERENCE) {
+               if (wc->refs[level] > 1)
+                       return 1;
 
-               next = read_tree_block(root, bytenr, blocksize, ptr_gen);
-               btrfs_tree_lock(next);
-               btrfs_set_lock_blocking(next);
+               if (path->locks[level] && !wc->keep_locks) {
+                       btrfs_tree_unlock(eb);
+                       path->locks[level] = 0;
+               }
+               return 0;
+       }
 
-               ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
-                                              &refs, &flags);
+       /* wc->stage == UPDATE_BACKREF */
+       if (!(wc->flags[level] & flag)) {
+               BUG_ON(!path->locks[level]);
+               ret = btrfs_inc_ref(trans, root, eb, 1);
                BUG_ON(ret);
-               if (refs > 1) {
-                       parent = path->nodes[*level];
-                       ret = btrfs_free_extent(trans, root, bytenr,
-                                               blocksize, parent->start,
-                                               btrfs_header_owner(parent),
-                                               *level - 1, 0);
+               ret = btrfs_dec_ref(trans, root, eb, 0);
+               BUG_ON(ret);
+               ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
+                                                 eb->len, flag, 0);
+               BUG_ON(ret);
+               wc->flags[level] |= flag;
+       }
+
+       /*
+        * the block is shared by multiple trees, so it's not good to
+        * keep the tree lock
+        */
+       if (path->locks[level] && level > 0) {
+               btrfs_tree_unlock(eb);
+               path->locks[level] = 0;
+       }
+       return 0;
+}
+
+/*
+ * helper to process a tree block while walking up the tree.
+ *
+ * when wc->stage == DROP_REFERENCE, this function drops
+ * reference count on the block.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function changes
+ * wc->stage back to DROP_REFERENCE if we changed wc->stage
+ * to UPDATE_BACKREF previously while processing the block.
+ *
+ * NOTE: return value 1 means we should stop walking up.
+ */
+static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct btrfs_path *path,
+                                struct walk_control *wc)
+{
+       int ret = 0;
+       int level = wc->level;
+       struct extent_buffer *eb = path->nodes[level];
+       u64 parent = 0;
+
+       if (wc->stage == UPDATE_BACKREF) {
+               BUG_ON(wc->shared_level < level);
+               if (level < wc->shared_level)
+                       goto out;
+
+               BUG_ON(wc->refs[level] <= 1);
+               ret = find_next_key(path, level + 1, &wc->update_progress);
+               if (ret > 0)
+                       wc->update_ref = 0;
+
+               wc->stage = DROP_REFERENCE;
+               wc->shared_level = -1;
+               path->slots[level] = 0;
+
+               /*
+                * check reference count again if the block isn't locked.
+                * we should start walking down the tree again if reference
+                * count is one.
+                */
+               if (!path->locks[level]) {
+                       BUG_ON(level == 0);
+                       btrfs_tree_lock(eb);
+                       btrfs_set_lock_blocking(eb);
+                       path->locks[level] = 1;
+
+                       ret = btrfs_lookup_extent_info(trans, root,
+                                                      eb->start, eb->len,
+                                                      &wc->refs[level],
+                                                      &wc->flags[level]);
                        BUG_ON(ret);
-                       path->slots[*level]++;
-                       btrfs_tree_unlock(next);
-                       free_extent_buffer(next);
-                       continue;
+                       BUG_ON(wc->refs[level] == 0);
+                       if (wc->refs[level] == 1) {
+                               btrfs_tree_unlock(eb);
+                               path->locks[level] = 0;
+                               return 1;
+                       }
+               } else {
+                       BUG_ON(level != 0);
                }
+       }
 
-               BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+       /* wc->stage == DROP_REFERENCE */
+       BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
 
-               *level = btrfs_header_level(next);
-               path->nodes[*level] = next;
-               path->slots[*level] = 0;
-               path->locks[*level] = 1;
-               cond_resched();
+       if (wc->refs[level] == 1) {
+               if (level == 0) {
+                       if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+                               ret = btrfs_dec_ref(trans, root, eb, 1);
+                       else
+                               ret = btrfs_dec_ref(trans, root, eb, 0);
+                       BUG_ON(ret);
+               }
+               /* make block locked assertion in clean_tree_block happy */
+               if (!path->locks[level] &&
+                   btrfs_header_generation(eb) == trans->transid) {
+                       btrfs_tree_lock(eb);
+                       btrfs_set_lock_blocking(eb);
+                       path->locks[level] = 1;
+               }
+               clean_tree_block(trans, root, eb);
+       }
+
+       if (eb == root->node) {
+               if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+                       parent = eb->start;
+               else
+                       BUG_ON(root->root_key.objectid !=
+                              btrfs_header_owner(eb));
+       } else {
+               if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+                       parent = path->nodes[level + 1]->start;
+               else
+                       BUG_ON(root->root_key.objectid !=
+                              btrfs_header_owner(path->nodes[level + 1]));
        }
-out:
-       if (path->nodes[*level] == root->node)
-               parent = path->nodes[*level];
-       else
-               parent = path->nodes[*level + 1];
-       bytenr = path->nodes[*level]->start;
-       blocksize = path->nodes[*level]->len;
 
-       ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start,
-                               btrfs_header_owner(parent), *level, 0);
+       ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
+                               root->root_key.objectid, level, 0);
        BUG_ON(ret);
+out:
+       wc->refs[level] = 0;
+       wc->flags[level] = 0;
+       return ret;
+}
 
-       if (path->locks[*level]) {
-               btrfs_tree_unlock(path->nodes[*level]);
-               path->locks[*level] = 0;
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root,
+                                  struct btrfs_path *path,
+                                  struct walk_control *wc)
+{
+       struct extent_buffer *next;
+       struct extent_buffer *cur;
+       u64 bytenr;
+       u64 ptr_gen;
+       u32 blocksize;
+       int level = wc->level;
+       int ret;
+
+       while (level >= 0) {
+               cur = path->nodes[level];
+               BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
+
+               ret = walk_down_proc(trans, root, path, wc);
+               if (ret > 0)
+                       break;
+
+               if (level == 0)
+                       break;
+
+               bytenr = btrfs_node_blockptr(cur, path->slots[level]);
+               blocksize = btrfs_level_size(root, level - 1);
+               ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
+
+               next = read_tree_block(root, bytenr, blocksize, ptr_gen);
+               btrfs_tree_lock(next);
+               btrfs_set_lock_blocking(next);
+
+               level--;
+               BUG_ON(level != btrfs_header_level(next));
+               path->nodes[level] = next;
+               path->slots[level] = 0;
+               path->locks[level] = 1;
+               wc->level = level;
        }
-       free_extent_buffer(path->nodes[*level]);
-       path->nodes[*level] = NULL;
-       *level += 1;
-       cond_resched();
        return 0;
 }
 
-/*
- * helper for dropping snapshots.  This walks back up the tree in the path
- * to find the first node higher up where we haven't yet gone through
- * all the slots
- */
 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path,
-                                int *level, int max_level)
+                                struct walk_control *wc, int max_level)
 {
-       struct btrfs_root_item *root_item = &root->root_item;
-       int i;
-       int slot;
+       int level = wc->level;
        int ret;
 
-       for (i = *level; i < max_level && path->nodes[i]; i++) {
-               slot = path->slots[i];
-               if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
-                       /*
-                        * there is more work to do in this level.
-                        * Update the drop_progress marker to reflect
-                        * the work we've done so far, and then bump
-                        * the slot number
-                        */
-                       path->slots[i]++;
-                       WARN_ON(*level == 0);
-                       if (max_level == BTRFS_MAX_LEVEL) {
-                               btrfs_node_key(path->nodes[i],
-                                              &root_item->drop_progress,
-                                              path->slots[i]);
-                               root_item->drop_level = i;
-                       }
-                       *level = i;
+       path->slots[level] = btrfs_header_nritems(path->nodes[level]);
+       while (level < max_level && path->nodes[level]) {
+               wc->level = level;
+               if (path->slots[level] + 1 <
+                   btrfs_header_nritems(path->nodes[level])) {
+                       path->slots[level]++;
                        return 0;
                } else {
-                       struct extent_buffer *parent;
-
-                       /*
-                        * this whole node is done, free our reference
-                        * on it and go up one level
-                        */
-                       if (path->nodes[*level] == root->node)
-                               parent = path->nodes[*level];
-                       else
-                               parent = path->nodes[*level + 1];
+                       ret = walk_up_proc(trans, root, path, wc);
+                       if (ret > 0)
+                               return 0;
 
-                       clean_tree_block(trans, root, path->nodes[i]);
-                       ret = btrfs_free_extent(trans, root,
-                                               path->nodes[i]->start,
-                                               path->nodes[i]->len,
-                                               parent->start,
-                                               btrfs_header_owner(parent),
-                                               *level, 0);
-                       BUG_ON(ret);
-                       if (path->locks[*level]) {
-                               btrfs_tree_unlock(path->nodes[i]);
-                               path->locks[i] = 0;
+                       if (path->locks[level]) {
+                               btrfs_tree_unlock(path->nodes[level]);
+                               path->locks[level] = 0;
                        }
-                       free_extent_buffer(path->nodes[i]);
-                       path->nodes[i] = NULL;
-                       *level = i + 1;
+                       free_extent_buffer(path->nodes[level]);
+                       path->nodes[level] = NULL;
+                       level++;
                }
        }
        return 1;
 }
 
 /*
- * drop the reference count on the tree rooted at 'snap'.  This traverses
- * the tree freeing any blocks that have a ref count of zero after being
- * decremented.
+ * drop a subvolume tree.
+ *
+ * this function traverses the tree freeing any blocks that are only
+ * referenced by the tree.
+ *
+ * when a shared tree block is found, this function decreases its
+ * reference count by one. if update_ref is true, this function
+ * also makes sure backrefs for the shared block and all lower level
+ * blocks are properly updated.
  */
-int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
-                       *root)
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
 {
-       int ret = 0;
-       int wret;
-       int level;
        struct btrfs_path *path;
-       int update_count;
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *tree_root = root->fs_info->tree_root;
        struct btrfs_root_item *root_item = &root->root_item;
+       struct walk_control *wc;
+       struct btrfs_key key;
+       int err = 0;
+       int ret;
+       int level;
 
        path = btrfs_alloc_path();
        BUG_ON(!path);
 
-       level = btrfs_header_level(root->node);
+       wc = kzalloc(sizeof(*wc), GFP_NOFS);
+       BUG_ON(!wc);
+
+       trans = btrfs_start_transaction(tree_root, 1);
+
        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+               level = btrfs_header_level(root->node);
                path->nodes[level] = btrfs_lock_root_node(root);
                btrfs_set_lock_blocking(path->nodes[level]);
                path->slots[level] = 0;
                path->locks[level] = 1;
+               memset(&wc->update_progress, 0,
+                      sizeof(wc->update_progress));
        } else {
-               struct btrfs_key key;
-               struct btrfs_disk_key found_key;
-               struct extent_buffer *node;
-
                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+               memcpy(&wc->update_progress, &key,
+                      sizeof(wc->update_progress));
+
                level = root_item->drop_level;
+               BUG_ON(level == 0);
                path->lowest_level = level;
-               wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-               if (wret < 0) {
-                       ret = wret;
+               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               path->lowest_level = 0;
+               if (ret < 0) {
+                       err = ret;
                        goto out;
                }
-               node = path->nodes[level];
-               btrfs_node_key(node, &found_key, path->slots[level]);
-               WARN_ON(memcmp(&found_key, &root_item->drop_progress,
-                              sizeof(found_key)));
+               btrfs_node_key_to_cpu(path->nodes[level], &key,
+                                     path->slots[level]);
+               WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
+
                /*
                 * unlock our path, this is safe because only this
                 * function is allowed to delete this snapshot
                 */
                btrfs_unlock_up_safe(path, 0);
+
+               level = btrfs_header_level(root->node);
+               while (1) {
+                       btrfs_tree_lock(path->nodes[level]);
+                       btrfs_set_lock_blocking(path->nodes[level]);
+
+                       ret = btrfs_lookup_extent_info(trans, root,
+                                               path->nodes[level]->start,
+                                               path->nodes[level]->len,
+                                               &wc->refs[level],
+                                               &wc->flags[level]);
+                       BUG_ON(ret);
+                       BUG_ON(wc->refs[level] == 0);
+
+                       if (level == root_item->drop_level)
+                               break;
+
+                       btrfs_tree_unlock(path->nodes[level]);
+                       WARN_ON(wc->refs[level] != 1);
+                       level--;
+               }
        }
+
+       wc->level = level;
+       wc->shared_level = -1;
+       wc->stage = DROP_REFERENCE;
+       wc->update_ref = update_ref;
+       wc->keep_locks = 0;
+
        while (1) {
-               unsigned long update;
-               wret = walk_down_tree(trans, root, path, &level);
-               if (wret > 0)
+               ret = walk_down_tree(trans, root, path, wc);
+               if (ret < 0) {
+                       err = ret;
                        break;
-               if (wret < 0)
-                       ret = wret;
+               }
 
-               wret = walk_up_tree(trans, root, path, &level,
-                                   BTRFS_MAX_LEVEL);
-               if (wret > 0)
+               ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
+               if (ret < 0) {
+                       err = ret;
                        break;
-               if (wret < 0)
-                       ret = wret;
-               if (trans->transaction->in_commit ||
-                   trans->transaction->delayed_refs.flushing) {
-                       ret = -EAGAIN;
+               }
+
+               if (ret > 0) {
+                       BUG_ON(wc->stage != DROP_REFERENCE);
                        break;
                }
-               for (update_count = 0; update_count < 16; update_count++) {
+
+               if (wc->stage == DROP_REFERENCE) {
+                       level = wc->level;
+                       btrfs_node_key(path->nodes[level],
+                                      &root_item->drop_progress,
+                                      path->slots[level]);
+                       root_item->drop_level = level;
+               }
+
+               BUG_ON(wc->level == 0);
+               if (trans->transaction->in_commit ||
+                   trans->transaction->delayed_refs.flushing) {
+                       ret = btrfs_update_root(trans, tree_root,
+                                               &root->root_key,
+                                               root_item);
+                       BUG_ON(ret);
+
+                       btrfs_end_transaction(trans, tree_root);
+                       trans = btrfs_start_transaction(tree_root, 1);
+               } else {
+                       unsigned long update;
                        update = trans->delayed_ref_updates;
                        trans->delayed_ref_updates = 0;
                        if (update)
-                               btrfs_run_delayed_refs(trans, root, update);
-                       else
-                               break;
+                               btrfs_run_delayed_refs(trans, tree_root,
+                                                      update);
                }
        }
+       btrfs_release_path(root, path);
+       BUG_ON(err);
+
+       ret = btrfs_del_root(trans, tree_root, &root->root_key);
+       BUG_ON(ret);
+
+       free_extent_buffer(root->node);
+       free_extent_buffer(root->commit_root);
+       kfree(root);
 out:
+       btrfs_end_transaction(trans, tree_root);
+       kfree(wc);
        btrfs_free_path(path);
-       return ret;
+       return err;
 }
 
+/*
+ * drop subtree rooted at tree block 'node'.
+ *
+ * NOTE: this function will unlock and release tree block 'node'
+ */
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct extent_buffer *node,
                        struct extent_buffer *parent)
 {
        struct btrfs_path *path;
+       struct walk_control *wc;
        int level;
        int parent_level;
        int ret = 0;
        int wret;
 
+       BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
        path = btrfs_alloc_path();
        BUG_ON(!path);
 
+       wc = kzalloc(sizeof(*wc), GFP_NOFS);
+       BUG_ON(!wc);
+
        btrfs_assert_tree_locked(parent);
        parent_level = btrfs_header_level(parent);
        extent_buffer_get(parent);
@@ -4817,24 +5230,33 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 
        btrfs_assert_tree_locked(node);
        level = btrfs_header_level(node);
-       extent_buffer_get(node);
        path->nodes[level] = node;
        path->slots[level] = 0;
+       path->locks[level] = 1;
+
+       wc->refs[parent_level] = 1;
+       wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+       wc->level = level;
+       wc->shared_level = -1;
+       wc->stage = DROP_REFERENCE;
+       wc->update_ref = 0;
+       wc->keep_locks = 1;
 
        while (1) {
-               wret = walk_down_tree(trans, root, path, &level);
-               if (wret < 0)
+               wret = walk_down_tree(trans, root, path, wc);
+               if (wret < 0) {
                        ret = wret;
-               if (wret != 0)
                        break;
+               }
 
-               wret = walk_up_tree(trans, root, path, &level, parent_level);
+               wret = walk_up_tree(trans, root, path, wc, parent_level);
                if (wret < 0)
                        ret = wret;
                if (wret != 0)
                        break;
        }
 
+       kfree(wc);
        btrfs_free_path(path);
        return ret;
 }
@@ -6739,11 +7161,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                         &info->block_group_cache_tree);
                spin_unlock(&info->block_group_cache_lock);
 
-               btrfs_remove_free_space_cache(block_group);
                down_write(&block_group->space_info->groups_sem);
                list_del(&block_group->list);
                up_write(&block_group->space_info->groups_sem);
 
+               if (block_group->cached == BTRFS_CACHE_STARTED)
+                       wait_event(block_group->caching_q,
+                                  block_group_cache_done(block_group));
+
+               btrfs_remove_free_space_cache(block_group);
+
                WARN_ON(atomic_read(&block_group->count) != 1);
                kfree(block_group);
 
@@ -6809,9 +7236,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                atomic_set(&cache->count, 1);
                spin_lock_init(&cache->lock);
                spin_lock_init(&cache->tree_lock);
-               mutex_init(&cache->cache_mutex);
+               cache->fs_info = info;
+               init_waitqueue_head(&cache->caching_q);
                INIT_LIST_HEAD(&cache->list);
                INIT_LIST_HEAD(&cache->cluster_list);
+
+               /*
+                * we only want to have 32k of ram per block group for keeping
+                * track of free space, and if we pass 1/2 of that we want to
+                * start converting things over to using bitmaps
+                */
+               cache->extents_thresh = ((1024 * 32) / 2) /
+                       sizeof(struct btrfs_free_space);
+
                read_extent_buffer(leaf, &cache->item,
                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
                                   sizeof(cache->item));
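
The threshold above caps free space tracking at 32k of ram per block group and
switches to bitmaps once half of that is used.  Assuming a 48-byte struct
btrfs_free_space on 64-bit (the real size depends on the free space rework
this pairs with), that works out to roughly 341 extent entries:

	#include <stdio.h>

	int main(void)
	{
		unsigned long size = 48;	/* assumed sizeof(struct btrfs_free_space) */
		unsigned long thresh = ((1024 * 32) / 2) / size;

		printf("extents_thresh = %lu entries\n", thresh);	/* 341 */
		return 0;
	}
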
@@ -6820,6 +7257,26 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                key.objectid = found_key.objectid + found_key.offset;
                btrfs_release_path(root, path);
                cache->flags = btrfs_block_group_flags(&cache->item);
+               cache->sectorsize = root->sectorsize;
+
+               remove_sb_from_cache(root, cache);
+
+               /*
+                * check for two cases, either we are full, and therefore
+                * don't need to bother with the caching work since we won't
+                * find any space, or we are empty, and we can just add all
+                * the space in and be done with it.  This saves us _a lot_ of
+                * time, particularly in the full case.
+                */
+               if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+                       cache->cached = BTRFS_CACHE_FINISHED;
+               } else if (btrfs_block_group_used(&cache->item) == 0) {
+                       cache->cached = BTRFS_CACHE_FINISHED;
+                       add_new_free_space(cache, root->fs_info,
+                                          found_key.objectid,
+                                          found_key.objectid +
+                                          found_key.offset);
+               }
 
                ret = update_space_info(info, cache->flags, found_key.offset,
                                        btrfs_block_group_used(&cache->item),
@@ -6863,10 +7320,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->key.objectid = chunk_offset;
        cache->key.offset = size;
        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+       cache->sectorsize = root->sectorsize;
+
+       /*
+        * we only want to have 32k of ram per block group for keeping track
+        * of free space, and if we pass 1/2 of that we want to start
+        * converting things over to using bitmaps
+        */
+       cache->extents_thresh = ((1024 * 32) / 2) /
+               sizeof(struct btrfs_free_space);
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
        spin_lock_init(&cache->tree_lock);
-       mutex_init(&cache->cache_mutex);
+       init_waitqueue_head(&cache->caching_q);
        INIT_LIST_HEAD(&cache->list);
        INIT_LIST_HEAD(&cache->cluster_list);
 
@@ -6875,6 +7341,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->flags = type;
        btrfs_set_block_group_flags(&cache->item, type);
 
+       cache->cached = BTRFS_CACHE_FINISHED;
+       remove_sb_from_cache(root, cache);
+
+       add_new_free_space(cache, root->fs_info, chunk_offset,
+                          chunk_offset + size);
+
        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
                                &cache->space_info);
        BUG_ON(ret);
@@ -6933,7 +7405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        rb_erase(&block_group->cache_node,
                 &root->fs_info->block_group_cache_tree);
        spin_unlock(&root->fs_info->block_group_cache_lock);
-       btrfs_remove_free_space_cache(block_group);
+
        down_write(&block_group->space_info->groups_sem);
        /*
         * we must use list_del_init so people can check to see if they
@@ -6942,11 +7414,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        list_del_init(&block_group->list);
        up_write(&block_group->space_info->groups_sem);
 
+       if (block_group->cached == BTRFS_CACHE_STARTED)
+               wait_event(block_group->caching_q,
+                          block_group_cache_done(block_group));
+
+       btrfs_remove_free_space_cache(block_group);
+
        spin_lock(&block_group->space_info->lock);
        block_group->space_info->total_bytes -= block_group->key.offset;
        block_group->space_info->bytes_readonly -= block_group->key.offset;
        spin_unlock(&block_group->space_info->lock);
-       block_group->space_info->full = 0;
+
+       btrfs_clear_space_info_full(root->fs_info);
 
        btrfs_put_block_group(block_group);
        btrfs_put_block_group(block_group);