+ ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
+ if (!ref) {
+ btrfs_free_leaf_ref(root, orig_ref);
+ return -ENOMEM;
+ }
+
+ ref->nritems = orig_ref->nritems;
+ memcpy(ref->extents, orig_ref->extents,
+ sizeof(ref->extents[0]) * ref->nritems);
+
+ btrfs_free_leaf_ref(root, orig_ref);
+
+ ref->root_gen = trans->transid;
+ ref->bytenr = buf->start;
+ ref->owner = btrfs_header_owner(buf);
+ ref->generation = btrfs_header_generation(buf);
+ ret = btrfs_add_leaf_ref(root, ref, 0);
+ WARN_ON(ret);
+ btrfs_free_leaf_ref(root, ref);
+ }
+ return 0;
+}
+
+/*
+ * Walk every item in @leaf and, for each regular (non-inline, non-hole)
+ * file extent item, drop the cached extent mappings covering that file
+ * range in the matching inode of @target_root.
+ *
+ * Inodes are looked up with btrfs_ilookup(); when the lookup yields NULL,
+ * all following items with the same objectid are skipped.  @root and
+ * @group are not referenced by the body.
+ *
+ * Always returns 0.
+ */
+static int noinline invalidate_extent_cache(struct btrfs_root *root,
+					    struct extent_buffer *leaf,
+					    struct btrfs_block_group_cache *group,
+					    struct btrfs_root *target_root)
+{
+	struct btrfs_file_extent_item *item;
+	struct inode *cur_inode = NULL;
+	struct btrfs_key found;
+	u64 missing_ino = 0;
+	u64 range_start;
+	u64 range_end;
+	u32 slot;
+	u32 count = btrfs_header_nritems(leaf);
+
+	for (slot = 0; slot < count; slot++) {
+		btrfs_item_key_to_cpu(leaf, &found, slot);
+
+		/* only file extent items of inodes we have not given up on */
+		if (found.type != BTRFS_EXTENT_DATA_KEY ||
+		    found.objectid == missing_ino)
+			continue;
+
+		/* inline extents and holes (disk_bytenr == 0) are ignored */
+		item = btrfs_item_ptr(leaf, slot,
+				      struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, item) ==
+		    BTRFS_FILE_EXTENT_INLINE ||
+		    btrfs_file_extent_disk_bytenr(leaf, item) == 0)
+			continue;
+
+		/* switch inodes only when the objectid changes */
+		if (cur_inode == NULL || cur_inode->i_ino != found.objectid) {
+			iput(cur_inode);
+			cur_inode = btrfs_ilookup(target_root->fs_info->sb,
+						  found.objectid,
+						  target_root, 1);
+			if (cur_inode == NULL) {
+				missing_ino = found.objectid;
+				continue;
+			}
+		}
+
+		range_start = found.offset;
+		range_end = found.offset +
+			    btrfs_file_extent_num_bytes(leaf, item) - 1;
+
+		/* drop the cached mappings while holding the range lock */
+		lock_extent(&BTRFS_I(cur_inode)->io_tree, range_start,
+			    range_end, GFP_NOFS);
+		btrfs_drop_extent_cache(cur_inode, range_start, range_end, 1);
+		unlock_extent(&BTRFS_I(cur_inode)->io_tree, range_start,
+			      range_end, GFP_NOFS);
+		cond_resched();
+	}
+
+	/* release the reference on the last inode used, if any */
+	iput(cur_inode);
+	return 0;
+}
+
+/*
+ * Rewrite the file extent items in @leaf whose on-disk extent overlaps
+ * block group @group so that they point at the new location reported by
+ * get_new_locations() for @reloc_inode.
+ *
+ * For every rewritten item the cached leaf ref entry is updated, the leaf
+ * is marked dirty, a reference is added on the new extent and the
+ * reference on the old extent is dropped.  Items for which
+ * get_new_locations() returns a positive value are left untouched.
+ *
+ * Allocation failure, a missing leaf ref, any helper error, and any
+ * mismatch between the leaf and its cached leaf ref trigger BUG_ON().
+ * Always returns 0.
+ */
+static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root,
+					    struct extent_buffer *leaf,
+					    struct btrfs_block_group_cache *group,
+					    struct inode *reloc_inode)
+{
+	struct btrfs_file_extent_item *item;
+	struct btrfs_leaf_ref *leaf_ref;
+	struct disk_extent *dest;
+	struct btrfs_key item_key;
+	struct btrfs_key lookup_key;
+	u64 old_start;
+	u64 old_len;
+	u32 slot;
+	u32 count;
+	int ref_slot = -1;
+	int wanted;
+	int err;
+
+	dest = kmalloc(sizeof(*dest), GFP_NOFS);
+	BUG_ON(!dest);
+
+	leaf_ref = btrfs_lookup_leaf_ref(root, leaf->start);
+	BUG_ON(!leaf_ref);
+
+	count = btrfs_header_nritems(leaf);
+	for (slot = 0; slot < count; slot++) {
+		btrfs_item_key_to_cpu(leaf, &item_key, slot);
+		if (btrfs_key_type(&item_key) != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		item = btrfs_item_ptr(leaf, slot,
+				      struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, item) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			continue;
+
+		old_start = btrfs_file_extent_disk_bytenr(leaf, item);
+		old_len = btrfs_file_extent_disk_num_bytes(leaf, item);
+		if (old_start == 0)
+			continue;
+
+		/*
+		 * Count every non-inline, non-hole extent, including those
+		 * outside the group: ref_slot indexes leaf_ref->extents[].
+		 */
+		ref_slot++;
+
+		/* leave extents that do not overlap the block group alone */
+		if (!(old_start < group->key.objectid + group->key.offset &&
+		      old_start + old_len > group->key.objectid))
+			continue;
+
+		lookup_key.objectid = old_start;
+		lookup_key.type = BTRFS_EXTENT_ITEM_KEY;
+		lookup_key.offset = old_len;
+
+		wanted = 1;
+		err = get_new_locations(reloc_inode, &lookup_key,
+					group->key.objectid, 1,
+					&dest, &wanted);
+		if (err > 0)
+			continue;
+		BUG_ON(err < 0);
+
+		/* the cached leaf ref must agree with the leaf contents */
+		BUG_ON(leaf_ref->extents[ref_slot].bytenr != old_start);
+		BUG_ON(leaf_ref->extents[ref_slot].num_bytes != old_len);
+		leaf_ref->extents[ref_slot].bytenr = dest->disk_bytenr;
+		leaf_ref->extents[ref_slot].num_bytes = dest->disk_num_bytes;
+
+		btrfs_set_file_extent_disk_bytenr(leaf, item,
+						  dest->disk_bytenr);
+		btrfs_set_file_extent_disk_num_bytes(leaf, item,
+						     dest->disk_num_bytes);
+		btrfs_mark_buffer_dirty(leaf);
+
+		/* take the new reference before dropping the old one */
+		err = btrfs_inc_extent_ref(trans, root,
+					   dest->disk_bytenr,
+					   dest->disk_num_bytes,
+					   leaf->start,
+					   root->root_key.objectid,
+					   trans->transid, item_key.objectid);
+		BUG_ON(err);
+
+		err = btrfs_free_extent(trans, root,
+					old_start, old_len, leaf->start,
+					btrfs_header_owner(leaf),
+					btrfs_header_generation(leaf),
+					item_key.objectid, 0);
+		BUG_ON(err);
+		cond_resched();
+	}
+
+	kfree(dest);
+	/* every entry of the leaf ref must have been visited */
+	BUG_ON(ref_slot + 1 != leaf_ref->nritems);
+	btrfs_free_leaf_ref(root, leaf_ref);
+	return 0;
+}
+
+/*
+ * Detach the reloc tree (if any) from @root and queue it on
+ * fs_info->dead_reloc_roots.
+ *
+ * The reloc root's root item is refreshed with the current node's start
+ * and level, its drop progress is reset, and the item is written to the
+ * tree root with btrfs_update_root().  A failure of that update triggers
+ * BUG_ON().
+ *
+ * Returns 0, also when @root has no reloc tree attached.
+ */
+int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root = root->reloc_root;
+	int ret;
+
+	if (!reloc_root)
+		return 0;
+
+	root->reloc_root = NULL;
+	list_add(&reloc_root->dead_list,
+		 &root->fs_info->dead_reloc_roots);
+
+	btrfs_set_root_bytenr(&reloc_root->root_item,
+			      reloc_root->node->start);
+	/*
+	 * The level belongs in the reloc root's own item: that is the item
+	 * written back below.  Storing it in root->root_item would leave
+	 * the reloc item stale and clobber the subvolume's item.
+	 */
+	btrfs_set_root_level(&reloc_root->root_item,
+			     btrfs_header_level(reloc_root->node));
+	memset(&reloc_root->root_item.drop_progress, 0,
+	       sizeof(struct btrfs_disk_key));
+	reloc_root->root_item.drop_level = 0;
+
+	ret = btrfs_update_root(trans, root->fs_info->tree_root,
+				&reloc_root->root_key,
+				&reloc_root->root_item);
+	BUG_ON(ret);
+	return 0;
+}
+
+/*
+ * Drop every reloc root queued on fs_info->dead_reloc_roots.
+ *
+ * The list is spliced onto a private head and processed from its tail.
+ * For each root, btrfs_drop_snapshot() is retried in a fresh joined
+ * transaction for as long as it returns -EAGAIN; afterwards the root
+ * item is deleted from the tree root.  fs_info->drop_mutex is held
+ * across each btrfs_drop_snapshot() call and across the final
+ * btrfs_del_root().
+ *
+ * Freeing of a root structure is deferred by one iteration; leaf refs
+ * are removed only through the last root processed.
+ *
+ * Always returns 0; helper failures trigger BUG_ON().
+ */
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *victim;
+	struct btrfs_root *deferred = NULL;
+	struct list_head pending;
+	unsigned long dirtied;
+	int again;
+	int ret;
+
+	INIT_LIST_HEAD(&pending);
+	list_splice_init(&root->fs_info->dead_reloc_roots, &pending);
+
+	while (!list_empty(&pending)) {
+		victim = list_entry(pending.prev, struct btrfs_root,
+				    dead_list);
+		list_del_init(&victim->dead_list);
+
+		BUG_ON(victim->commit_root != NULL);
+
+		do {
+			trans = btrfs_join_transaction(root, 1);
+			BUG_ON(!trans);
+
+			mutex_lock(&root->fs_info->drop_mutex);
+			again = (btrfs_drop_snapshot(trans, victim) ==
+				 -EAGAIN);
+			if (again) {
+				/* back off: release everything and retry */
+				mutex_unlock(&root->fs_info->drop_mutex);
+
+				dirtied = trans->blocks_used;
+				ret = btrfs_end_transaction(trans, root);
+				BUG_ON(ret);
+				btrfs_btree_balance_dirty(root, dirtied);
+			}
+		} while (again);
+
+		/* drop_mutex and the transaction are still held here */
+		free_extent_buffer(victim->node);
+
+		ret = btrfs_del_root(trans, root->fs_info->tree_root,
+				     &victim->root_key);
+		BUG_ON(ret);
+		mutex_unlock(&root->fs_info->drop_mutex);
+
+		dirtied = trans->blocks_used;
+		ret = btrfs_end_transaction(trans, root);
+		BUG_ON(ret);
+		btrfs_btree_balance_dirty(root, dirtied);
+
+		/* free the previous root now, keep this one for later */
+		kfree(deferred);
+		deferred = victim;
+	}
+
+	if (deferred != NULL) {
+		btrfs_remove_leaf_refs(deferred, (u64)-1, 0);
+		kfree(deferred);
+	}
+	return 0;
+}
+
+/*
+ * Queue @root on its filesystem's dead_reloc_roots list.
+ * Always returns 0.
+ */
+int btrfs_add_dead_reloc_root(struct btrfs_root *root)
+{
+	struct list_head *dead_reloc_roots;
+
+	dead_reloc_roots = &root->fs_info->dead_reloc_roots;
+	list_add(&root->dead_list, dead_reloc_roots);
+	return 0;
+}
+
+/*
+ * Look for dead reloc roots and run orphan cleanup on the data reloc
+ * tree.
+ *
+ * btrfs_find_dead_roots() is called for BTRFS_TREE_RELOC_OBJECTID under
+ * fs_info->tree_reloc_mutex.  If fs_info->dead_reloc_roots is non-empty
+ * afterwards, a transaction is started and committed right away.  The
+ * data reloc tree root is then read and passed to
+ * btrfs_orphan_cleanup().
+ *
+ * Always returns 0; helper failures trigger BUG_ON().
+ */
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *data_reloc_root;
+	struct btrfs_key location;
+	int have_dead_roots;
+	int ret;
+
+	mutex_lock(&root->fs_info->tree_reloc_mutex);
+	ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
+	BUG_ON(ret);
+	have_dead_roots = !list_empty(&root->fs_info->dead_reloc_roots);
+	mutex_unlock(&root->fs_info->tree_reloc_mutex);
+
+	if (have_dead_roots) {
+		trans = btrfs_start_transaction(root, 1);
+		BUG_ON(!trans);
+
+		ret = btrfs_commit_transaction(trans, root);
+		BUG_ON(ret);
+	}
+
+	/* key used to look up the data reloc tree root */
+	location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+	location.offset = (u64)-1;
+
+	data_reloc_root = btrfs_read_fs_root_no_name(root->fs_info,
+						     &location);
+	BUG_ON(!data_reloc_root);
+
+	btrfs_orphan_cleanup(data_reloc_root);
+	return 0;
+}
+
+/*
+ * Create the reloc tree for @root unless one is already attached.
+ *
+ * The new tree starts as a copy of root->commit_root made by
+ * btrfs_copy_root() with owner BTRFS_TREE_RELOC_OBJECTID.  Its root item
+ * is a copy of @root's root item with refs set to 0 and bytenr, level
+ * and generation updated for the copy.  The item is inserted into the
+ * tree root under the key
+ * (BTRFS_TREE_RELOC_OBJECTID, BTRFS_ROOT_ITEM_KEY, @root's objectid).
+ * The root is then read back and stored in root->reloc_root.
+ *
+ * @root must have ref_cows set (BUG_ON otherwise).  Allocation and
+ * helper failures trigger BUG_ON().  Always returns 0.
+ */
+static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+	struct extent_buffer *eb;
+	struct btrfs_root_item *root_item;
+	struct btrfs_key root_key;
+	int ret;
+
+	BUG_ON(!root->ref_cows);
+	if (root->reloc_root)
+		return 0;
+
+	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+	BUG_ON(!root_item);
+
+	ret = btrfs_copy_root(trans, root, root->commit_root,
+			      &eb, BTRFS_TREE_RELOC_OBJECTID);
+	BUG_ON(ret);
+
+	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+	root_key.offset = root->root_key.objectid;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+
+	/*
+	 * Copy the whole root item.  sizeof(root_item) would be the size
+	 * of the pointer and would leave most of the kmalloc'ed buffer
+	 * uninitialized before it is inserted below.
+	 */
+	memcpy(root_item, &root->root_item, sizeof(*root_item));
+	btrfs_set_root_refs(root_item, 0);
+	btrfs_set_root_bytenr(root_item, eb->start);
+	btrfs_set_root_level(root_item, btrfs_header_level(eb));
+	btrfs_set_root_generation(root_item, trans->transid);
+
+	/* the copied root buffer is no longer needed once its item is set */
+	btrfs_tree_unlock(eb);
+	free_extent_buffer(eb);
+
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+				&root_key, root_item);
+	BUG_ON(ret);
+	kfree(root_item);
+
+	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+						 &root_key);
+	BUG_ON(!reloc_root);
+	reloc_root->last_trans = trans->transid;
+	reloc_root->commit_root = NULL;
+	reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
+
+	root->reloc_root = reloc_root;
+	return 0;
+}
+
+/*
+ * Core function of space balance.
+ *
+ * The idea is to use reloc trees to relocate tree blocks in reference
+ * counted roots. There is one reloc tree for each subvol, and all
+ * reloc trees share the same root key objectid. Reloc trees are snapshots
+ * of the latest committed roots of subvols (root->commit_root).
+ *
+ * To relocate a tree block referenced by a subvol, there are two steps:
+ * COW the block through the subvol's reloc tree, then update the block
+ * pointer in the subvol to point to the new block. Since all reloc trees
+ * share the same root key objectid, special handling for tree blocks
+ * owned by them is easy. Once a tree block has been COWed in one reloc
+ * tree, we can use the resulting new block directly when the same block
+ * needs to be COWed again through other reloc trees. This way, relocated
+ * tree blocks are shared between reloc trees, so they are also shared
+ * between subvols.
+ */
+static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *first_key,
+ struct btrfs_ref_path *ref_path,
+ struct btrfs_block_group_cache *group,
+ struct inode *reloc_inode)
+{
+ struct btrfs_root *reloc_root;
+ struct extent_buffer *eb = NULL;
+ struct btrfs_key *keys;
+ u64 *nodes;
+ int level;
+ int shared_level;
+ int lowest_level = 0;
+ int ret;
+
+ if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+ lowest_level = ref_path->owner_objectid;
+
+ if (!root->ref_cows) {
+ path->lowest_level = lowest_level;
+ ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
+ BUG_ON(ret < 0);
+ path->lowest_level = 0;
+ btrfs_release_path(root, path);
+ return 0;
+ }
+
+ mutex_lock(&root->fs_info->tree_reloc_mutex);
+ ret = init_reloc_tree(trans, root);
+ BUG_ON(ret);
+ reloc_root = root->reloc_root;
+
+ shared_level = ref_path->shared_level;
+ ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
+
+ keys = ref_path->node_keys;
+ nodes = ref_path->new_nodes;
+ memset(&keys[shared_level + 1], 0,
+ sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
+ memset(&nodes[shared_level + 1], 0,
+ sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
+
+ if (nodes[lowest_level] == 0) {
+ path->lowest_level = lowest_level;
+ ret = btrfs_search_slot(trans, reloc_root, first_key, path,
+ 0, 1);
+ BUG_ON(ret);
+ for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
+ eb = path->nodes[level];
+ if (!eb || eb == reloc_root->node)
+ break;
+ nodes[level] = eb->start;
+ if (level == 0)
+ btrfs_item_key_to_cpu(eb, &keys[level], 0);
+ else
+ btrfs_node_key_to_cpu(eb, &keys[level], 0);
+ }
+ if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+ eb = path->nodes[0];
+ ret = replace_extents_in_leaf(trans, reloc_root, eb,
+ group, reloc_inode);
+ BUG_ON(ret);
+ }
+ btrfs_release_path(reloc_root, path);
+ } else {
+ ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
+ lowest_level);
+ BUG_ON(ret);
+ }