From 3f157a2fd2ad731e1ed9964fecdc5f459f04a4a4 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Jun 2008 16:01:31 -0400 Subject: [PATCH] Btrfs: Online btree defragmentation fixes The btree defragger wasn't making forward progress because the new key wasn't being saved by the btrfs_search_forward function. This also disables the automatic btree defrag, it wasn't scaling well to huge filesystems. The auto-defrag needs to be done differently. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 170 ++++++++++++++++++++++++++++++++++++++--- fs/btrfs/ctree.h | 7 +- fs/btrfs/disk-io.c | 61 +-------------- fs/btrfs/disk-io.h | 6 -- fs/btrfs/extent-tree.c | 2 - fs/btrfs/super.c | 1 - fs/btrfs/transaction.c | 35 +-------- fs/btrfs/transaction.h | 1 - fs/btrfs/tree-defrag.c | 36 ++++----- 9 files changed, 190 insertions(+), 129 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7f4cc2b88d09..0cb80f32a9c7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -63,10 +63,9 @@ void btrfs_free_path(struct btrfs_path *p) void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) { int i; - int keep = p->keep_locks; - int skip = p->skip_locking; for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + p->slots[i] = 0; if (!p->nodes[i]) continue; if (p->locks[i]) { @@ -74,10 +73,8 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) p->locks[i] = 0; } free_extent_buffer(p->nodes[i]); + p->nodes[i] = NULL; } - memset(p, 0, sizeof(*p)); - p->keep_locks = keep; - p->skip_locking = skip; } struct extent_buffer *btrfs_root_node(struct btrfs_root *root) @@ -463,8 +460,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, search_start = cur->start; last_block = cur->start; *last_ret = search_start; - if (parent_level == 1) - btrfs_clear_buffer_defrag(cur); btrfs_tree_unlock(cur); free_extent_buffer(cur); } @@ -2969,8 +2964,138 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) return 1; } +/* + * A helper function to walk down the tree starting at min_key, and looking + * for nodes or leaves that are either in cache or have a minimum + * transaction id. This is used by the btree defrag code, but could + * also be used to search for blocks that have changed since a given + * transaction id. + * + * This does not cow, but it does stuff the starting key it finds back + * into min_key, so you can call btrfs_search_slot with cow=1 on the + * key and get a writable path. + * + * This does lock as it descends, and path->keep_locks should be set + * to 1 by the caller. + * + * This honors path->lowest_level to prevent descent past a given level + * of the tree. + * + * returns zero if something useful was found, < 0 on error and 1 if there + * was nothing in the tree that matched the search criteria. + */ +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_path *path, int cache_only, + u64 min_trans) +{ + struct extent_buffer *cur; + struct btrfs_key found_key; + int slot; + u32 nritems; + int level; + int ret = 1; + +again: + cur = btrfs_lock_root_node(root); + level = btrfs_header_level(cur); + path->nodes[level] = cur; + path->locks[level] = 1; + + if (btrfs_header_generation(cur) < min_trans) { + ret = 1; + goto out; + } + while(1) { + nritems = btrfs_header_nritems(cur); + level = btrfs_header_level(cur); + bin_search(cur, min_key, level, &slot); + + /* at level = 0, we're done, setup the path and exit */ + if (level == 0) { + ret = 0; + path->slots[level] = slot; + btrfs_item_key_to_cpu(cur, &found_key, slot); + goto out; + } + /* + * check this node pointer against the cache_only and + * min_trans parameters. If it isn't in cache or is too + * old, skip to the next one. + */ + while(slot < nritems) { + u64 blockptr; + u64 gen; + struct extent_buffer *tmp; + blockptr = btrfs_node_blockptr(cur, slot); + gen = btrfs_node_ptr_generation(cur, slot); + if (gen < min_trans) { + slot++; + continue; + } + if (!cache_only) + break; + + tmp = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + free_extent_buffer(tmp); + break; + } + if (tmp) + free_extent_buffer(tmp); + slot++; + } + /* + * we didn't find a candidate key in this node, walk forward + * and find another one + */ + if (slot >= nritems) { + ret = btrfs_find_next_key(root, path, min_key, level, + cache_only, min_trans); + if (ret == 0) { + btrfs_release_path(root, path); + goto again; + } else { + goto out; + } + } + /* save our key for returning back */ + btrfs_node_key_to_cpu(cur, &found_key, slot); + path->slots[level] = slot; + if (level == path->lowest_level) { + ret = 0; + unlock_up(path, level, 1); + goto out; + } + cur = read_node_slot(root, cur, slot); + + btrfs_tree_lock(cur); + path->locks[level - 1] = 1; + path->nodes[level - 1] = cur; + unlock_up(path, level, 1); + } +out: + if (ret == 0) + memcpy(min_key, &found_key, sizeof(found_key)); + return ret; +} + +/* + * this is similar to btrfs_next_leaf, but does not try to preserve + * and fixup the path. It looks for and returns the next key in the + * tree based on the current path and the cache_only and min_trans + * parameters. + * + * 0 is returned if another key is found, < 0 if there are any errors + * and 1 is returned if there are no higher keys in the tree + * + * path->keep_locks should be set to 1 on the search made before + * calling this function. + */ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, int lowest_level) + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans) { int level = lowest_level; int slot; @@ -2982,6 +3107,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, slot = path->slots[level] + 1; c = path->nodes[level]; +next: if (slot >= btrfs_header_nritems(c)) { level++; if (level == BTRFS_MAX_LEVEL) { @@ -2991,8 +3117,28 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, } if (level == 0) btrfs_item_key_to_cpu(c, key, slot); - else + else { + u64 blockptr = btrfs_node_blockptr(c, slot); + u64 gen = btrfs_node_ptr_generation(c, slot); + + if (cache_only) { + struct extent_buffer *cur; + cur = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + if (!cur || !btrfs_buffer_uptodate(cur, gen)) { + slot++; + if (cur) + free_extent_buffer(cur); + goto next; + } + free_extent_buffer(cur); + } + if (gen < min_trans) { + slot++; + goto next; + } btrfs_node_key_to_cpu(c, key, slot); + } return 0; } return 1; @@ -3095,6 +3241,12 @@ done: return 0; } +/* + * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps + * searching until it gets past min_objectid or finds an item of 'type' + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ca8e6f15859e..a28796482b4a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -609,6 +609,7 @@ struct btrfs_root { u64 last_inode_alloc; int ref_cows; int track_dirty; + u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; int defrag_running; @@ -1412,7 +1413,11 @@ int btrfs_previous_item(struct btrfs_root *root, struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, int lowest_level); + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans); +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_path *path, int cache_only, + u64 min_trans); int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 31ca9f89388d..4cdc0b6a2672 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -295,7 +295,6 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page) ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, btrfs_header_generation(eb)); BUG_ON(ret); - btrfs_clear_buffer_defrag(eb); found_start = btrfs_header_bytenr(eb); if (found_start != start) { printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n", @@ -355,7 +354,6 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, } eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); - btrfs_clear_buffer_defrag(eb); found_start = btrfs_header_bytenr(eb); if (found_start != start) { ret = -EIO; @@ -736,6 +734,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); memset(&root->root_kobj, 0, sizeof(root->root_kobj)); + root->defrag_trans_start = fs_info->generation; init_completion(&root->kobj_unregister); root->defrag_running = 0; root->defrag_level = 0; @@ -1168,7 +1167,6 @@ static int transaction_kthread(void *arg) goto sleep; } mutex_unlock(&root->fs_info->trans_mutex); - btrfs_defrag_dirty_roots(root->fs_info); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); sleep: @@ -1434,12 +1432,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, tree_root, "btrfs-transaction"); if (!fs_info->transaction_kthread) - goto fail_trans_kthread; + goto fail_cleaner; return tree_root; -fail_trans_kthread: +fail_cleaner: kthread_stop(fs_info->cleaner_kthread); fail_extent_root: free_extent_buffer(extent_root->node); @@ -1662,7 +1660,6 @@ int close_ctree(struct btrfs_root *root) kthread_stop(root->fs_info->transaction_kthread); kthread_stop(root->fs_info->cleaner_kthread); - btrfs_defrag_dirty_roots(root->fs_info); btrfs_clean_old_snapshots(root); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); @@ -1794,58 +1791,6 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) return; } -void btrfs_set_buffer_defrag(struct extent_buffer *buf) -{ - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - struct inode *btree_inode = root->fs_info->btree_inode; - set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start, - buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS); -} - -void btrfs_set_buffer_defrag_done(struct extent_buffer *buf) -{ - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - struct inode *btree_inode = root->fs_info->btree_inode; - set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start, - buf->start + buf->len - 1, EXTENT_DEFRAG_DONE, - GFP_NOFS); -} - -int btrfs_buffer_defrag(struct extent_buffer *buf) -{ - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - struct inode *btree_inode = root->fs_info->btree_inode; - return test_range_bit(&BTRFS_I(btree_inode)->io_tree, - buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0); -} - -int btrfs_buffer_defrag_done(struct extent_buffer *buf) -{ - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - struct inode *btree_inode = root->fs_info->btree_inode; - return test_range_bit(&BTRFS_I(btree_inode)->io_tree, - buf->start, buf->start + buf->len - 1, - EXTENT_DEFRAG_DONE, 0); -} - -int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf) -{ - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - struct inode *btree_inode = root->fs_info->btree_inode; - return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree, - buf->start, buf->start + buf->len - 1, - EXTENT_DEFRAG_DONE, GFP_NOFS); -} - -int btrfs_clear_buffer_defrag(struct extent_buffer *buf) -{ - struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; - struct inode *btree_inode = root->fs_info->btree_inode; - return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree, - buf->start, buf->start + buf->len - 1, - EXTENT_DEFRAG, GFP_NOFS); -} - int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) { struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index deff6b4815a7..353c3c50c957 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -61,12 +61,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int wait_on_tree_block_writeback(struct btrfs_root *root, struct extent_buffer *buf); -void btrfs_set_buffer_defrag(struct extent_buffer *buf); -void btrfs_set_buffer_defrag_done(struct extent_buffer *buf); -int btrfs_buffer_defrag(struct extent_buffer *buf); -int btrfs_buffer_defrag_done(struct extent_buffer *buf); -int btrfs_clear_buffer_defrag(struct extent_buffer *buf); -int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dc3c03c6612d..5e0857ffbc35 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2095,8 +2095,6 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); - if (!btrfs_test_opt(root, SSD)) - btrfs_set_buffer_defrag(buf); trans->blocks_used++; return buf; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 726d6871fa13..5e28cf5c2e85 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -365,7 +365,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } btrfs_clean_old_snapshots(root); - btrfs_defrag_dirty_roots(root->fs_info); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); sb->s_dirt = 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8e909cb97c6d..98f422d9ab07 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -30,7 +30,6 @@ extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_transaction_cachep; #define BTRFS_ROOT_TRANS_TAG 0 -#define BTRFS_ROOT_DEFRAG_TAG 1 static noinline void put_transaction(struct btrfs_transaction *transaction) { @@ -92,9 +91,6 @@ static noinline int record_root_in_trans(struct btrfs_root *root) radix_tree_tag_set(&root->fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); - radix_tree_tag_set(&root->fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_DEFRAG_TAG); root->commit_root = btrfs_root_node(root); } else { WARN_ON(1); @@ -403,44 +399,15 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) cond_resched(); trans = btrfs_start_transaction(root, 1); - if (ret != -EAGAIN) + if (root->fs_info->closing || ret != -EAGAIN) break; } root->defrag_running = 0; smp_mb(); - radix_tree_tag_clear(&info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_DEFRAG_TAG); btrfs_end_transaction(trans, root); return 0; } -int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info) -{ - struct btrfs_root *gang[1]; - struct btrfs_root *root; - int i; - int ret; - int err = 0; - u64 last = 0; - - while(1) { - ret = radix_tree_gang_lookup_tag(&info->fs_roots_radix, - (void **)gang, last, - ARRAY_SIZE(gang), - BTRFS_ROOT_DEFRAG_TAG); - if (ret == 0) - break; - for (i = 0; i < ret; i++) { - root = gang[i]; - last = root->root_key.objectid + 1; - btrfs_defrag_root(root, 1); - } - } - btrfs_defrag_root(info->extent_root, 1); - return err; -} - static noinline int drop_dirty_roots(struct btrfs_root *tree_root, struct list_head *list) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index e1e5a06b65f4..9ccd5a5b170f 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -84,7 +84,6 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest, struct list_head *dead_list); -int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info); int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); int btrfs_clean_old_snapshots(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index b17693f61fbc..cc2650b06952 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -32,10 +32,13 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int wret; int level; int orig_level; - int i; int is_extent = 0; int next_key_ret = 0; u64 last_ret = 0; + u64 min_trans = 0; + + if (cache_only) + goto out; if (root->fs_info->extent_root == root) { /* @@ -43,10 +46,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, * we can't defrag the extent root without deadlock */ goto out; -#if 0 - mutex_lock(&root->fs_info->alloc_mutex); - is_extent = 1; -#endif } if (root->ref_cows == 0 && !is_extent) @@ -84,6 +83,17 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, path->lowest_level = 1; path->keep_locks = 1; + if (cache_only) + min_trans = root->defrag_trans_start; + + ret = btrfs_search_forward(root, &key, path, cache_only, min_trans); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(root, path); wret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (wret < 0) { @@ -95,7 +105,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, + min_trans); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, cache_only, &last_ret, @@ -106,19 +117,9 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, ret = -EAGAIN; } - for (i = 1; i < BTRFS_MAX_LEVEL; i++) { - if (path->locks[i]) { - btrfs_tree_unlock(path->nodes[i]); - path->locks[i] = 0; - } - if (path->nodes[i]) { - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - } + btrfs_release_path(root, path); if (is_extent) btrfs_extent_post_op(trans, root); - out: if (is_extent) mutex_unlock(&root->fs_info->alloc_mutex); @@ -138,6 +139,7 @@ done: if (ret != -EAGAIN) { memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); + root->defrag_trans_start = trans->transid; } return ret; } -- 2.25.1