Btrfs: Add balance ioctl to restripe the chunks
author Chris Mason <chris.mason@oracle.com>
Mon, 28 Apr 2008 19:29:52 +0000 (15:29 -0400)
committer Chris Mason <chris.mason@oracle.com>
Thu, 25 Sep 2008 15:04:02 +0000 (11:04 -0400)
Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/ctree.h
fs/btrfs/extent-tree.c
fs/btrfs/inode.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h

index b9257b37bb9635a74d486af56aec84f7b0fa5aeb..73b92dd150ff656f045fdc3c946a3f9544ef2806 100644 (file)
@@ -1364,7 +1364,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                       u64 root_objectid, u64 ref_generation,
                       u64 owner, u64 owner_offset,
                       u64 empty_size, u64 hint_byte,
-                      u64 search_end, struct btrfs_key *ins, int data);
+                      u64 search_end, struct btrfs_key *ins, u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                  struct extent_buffer *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
index fe4fe709c312fc60732cf93a0afac9f1961a5b98..95aee5a29375f24a193ac2cf05916eec18dc4324 100644 (file)
@@ -17,6 +17,7 @@
  */
 #include <linux/sched.h>
 #include <linux/pagemap.h>
+#include <linux/writeback.h>
 #include "hash.h"
 #include "crc32c.h"
 #include "ctree.h"
@@ -1058,6 +1059,26 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
        }
 }
 
+/* drop conflicting profile bits, keeping raid10 > raid1 > dup > raid0 */
+static u64 reduce_alloc_profile(u64 flags)
+{
+       const u64 mirror = BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
+       /* dup is dropped whenever raid1 or raid10 is also set */
+       if ((flags & mirror) && (flags & BTRFS_BLOCK_GROUP_DUP))
+               flags &= ~BTRFS_BLOCK_GROUP_DUP;
+       /* raid1 is dropped when raid10 is also set */
+       if ((flags & BTRFS_BLOCK_GROUP_RAID10) &&
+           (flags & BTRFS_BLOCK_GROUP_RAID1))
+               flags &= ~BTRFS_BLOCK_GROUP_RAID1;
+       /* raid0 is dropped if raid1, raid10 or dup is still set */
+       if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
+           (flags & (mirror | BTRFS_BLOCK_GROUP_DUP)))
+               flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+       return flags;
+}
+
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags)
@@ -1068,6 +1089,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
        u64 num_bytes;
        int ret;
 
+       flags = reduce_alloc_profile(flags);
+
        space_info = __find_space_info(extent_root->fs_info, flags);
        if (!space_info) {
                ret = update_space_info(extent_root->fs_info, flags,
@@ -1684,6 +1707,7 @@ enospc:
 error:
        return ret;
 }
+
 /*
  * finds a free extent and does all the dirty work required for allocation
  * returns the key for the extent through ins, and a tree buffer for
@@ -1697,7 +1721,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                       u64 root_objectid, u64 ref_generation,
                       u64 owner, u64 owner_offset,
                       u64 empty_size, u64 hint_byte,
-                      u64 search_end, struct btrfs_key *ins, int data)
+                      u64 search_end, struct btrfs_key *ins, u64 data)
 {
        int ret;
        int pending_ret;
@@ -1727,6 +1751,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
        }
 again:
+       data = reduce_alloc_profile(data);
        if (root->ref_cows) {
                if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
@@ -1752,6 +1777,9 @@ again:
                num_bytes = max(num_bytes, min_alloc_size);
                goto again;
        }
+       if (ret) {
+               printk("allocation failed flags %Lu\n", data);
+       }
        BUG_ON(ret);
        if (ret)
                return ret;
@@ -2274,8 +2302,6 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 {
        u64 page_start;
        u64 page_end;
-       u64 delalloc_start;
-       u64 existing_delalloc;
        unsigned long last_index;
        unsigned long i;
        struct page *page;
@@ -2293,7 +2319,6 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
        ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
 
        file_ra_state_init(ra, inode->i_mapping);
-       kfree(ra);
 
        for (; i <= last_index; i++) {
                if (total_read % ra_pages == 0) {
@@ -2313,26 +2338,30 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
                                goto out_unlock;
                        }
                }
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+               ClearPageDirty(page);
+#else
+               cancel_dirty_page(page, PAGE_CACHE_SIZE);
+#endif
+               wait_on_page_writeback(page);
+               set_page_extent_mapped(page);
                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
                page_end = page_start + PAGE_CACHE_SIZE - 1;
 
                lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
-               delalloc_start = page_start;
-               existing_delalloc = count_range_bits(io_tree,
-                                            &delalloc_start, page_end,
-                                            PAGE_CACHE_SIZE, EXTENT_DELALLOC);
-
+               set_page_dirty(page);
                set_extent_delalloc(io_tree, page_start,
                                    page_end, GFP_NOFS);
 
                unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-               set_page_dirty(page);
                unlock_page(page);
                page_cache_release(page);
+               balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
        }
 
 out_unlock:
+       kfree(ra);
        mutex_unlock(&inode->i_mutex);
        return 0;
 }
@@ -2397,8 +2426,6 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
                        goto out;
                }
                relocate_inode_pages(inode, ref_offset, extent_key->offset);
-               /* FIXME, data=ordered will help get rid of this */
-               filemap_fdatawrite(inode->i_mapping);
                iput(inode);
                mutex_lock(&extent_root->fs_info->fs_mutex);
        } else {
@@ -2486,6 +2513,47 @@ out:
        return ret;
 }
 
+/*
+ * work out the flags for new chunk allocations that replace a block
+ * group with 'flags', based on the super block's device count
+ */
+static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
+{
+       u64 num_devices;
+       u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
+               BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
+       num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
+       if (num_devices == 1) {
+               stripped |= BTRFS_BLOCK_GROUP_DUP;
+               stripped = flags & ~stripped;
+
+               /* turn raid0 into single device chunks */
+               if (flags & BTRFS_BLOCK_GROUP_RAID0)
+                       return stripped;
+
+               /* turn mirroring into duplication */
+               if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+                            BTRFS_BLOCK_GROUP_RAID10))
+                       return stripped | BTRFS_BLOCK_GROUP_DUP;
+               return flags;
+       }
+
+       /* they already had raid on here, just return */
+       if (flags & stripped)
+               return flags;
+
+       stripped |= BTRFS_BLOCK_GROUP_DUP;
+       stripped = flags & ~stripped;
+
+       /* switch duplicated blocks with raid1 */
+       if (flags & BTRFS_BLOCK_GROUP_DUP)
+               return stripped | BTRFS_BLOCK_GROUP_RAID1;
+
+       /* turn single device chunks into raid0 */
+       return stripped | BTRFS_BLOCK_GROUP_RAID0;
+}
+
 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 {
        struct btrfs_trans_handle *trans;
@@ -2494,6 +2562,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
        u64 cur_byte;
        u64 total_found;
        u64 shrink_last_byte;
+       u64 new_alloc_flags;
        struct btrfs_block_group_cache *shrink_block_group;
        struct btrfs_fs_info *info = root->fs_info;
        struct btrfs_key key;
@@ -2511,17 +2580,20 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 
        shrink_block_group->space_info->total_bytes -=
                shrink_block_group->key.offset;
-printk("shrink_extent_tree %Lu -> %Lu type %Lu\n", shrink_start, shrink_last_byte, shrink_block_group->flags);
        path = btrfs_alloc_path();
        root = root->fs_info->extent_root;
        path->reada = 2;
 
 again:
-       trans = btrfs_start_transaction(root, 1);
-       do_chunk_alloc(trans, root->fs_info->extent_root,
+       if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
+               trans = btrfs_start_transaction(root, 1);
+               new_alloc_flags = update_block_group_flags(root,
+                                                  shrink_block_group->flags);
+               do_chunk_alloc(trans, root->fs_info->extent_root,
                        btrfs_block_group_used(&shrink_block_group->item) +
-                       2 * 1024 * 1024, shrink_block_group->flags);
-       btrfs_end_transaction(trans, root);
+                       2 * 1024 * 1024, new_alloc_flags);
+               btrfs_end_transaction(trans, root);
+       }
        shrink_block_group->ro = 1;
 
        total_found = 0;
index 1a74b5018699d5c5d181afe5e75e84e739ac6e82..994834474590c92e12988f7cbffe64a7ab05feca 100644 (file)
@@ -2864,6 +2864,15 @@ int btrfs_defrag_file(struct file *file) {
                                goto out_unlock;
                        }
                }
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+               ClearPageDirty(page);
+#else
+               cancel_dirty_page(page, PAGE_CACHE_SIZE);
+#endif
+               wait_on_page_writeback(page);
+               set_page_extent_mapped(page);
+
                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
                page_end = page_start + PAGE_CACHE_SIZE - 1;
 
@@ -3105,6 +3114,8 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_resize(root, (void __user *)arg);
        case BTRFS_IOC_ADD_DEV:
                return btrfs_ioctl_add_dev(root, (void __user *)arg);
+       case BTRFS_IOC_BALANCE:
+               return btrfs_balance(root->fs_info->dev_root);
        }
 
        return -ENOTTY;
index b93c15aa17db054814e3012bf12e15d2b7dc5f6e..6476ecbf132e8e2ac095385d1fde33221b7efbc2 100644 (file)
@@ -869,6 +869,107 @@ out:
        return 0;
 }
 
+/* returns num * factor / 10; a factor of 10 hands num back untouched */
+static u64 div_factor(u64 num, int factor)
+{
+       if (factor != 10) {
+               num *= factor;
+               do_div(num, 10);
+       }
+       return num;
+}
+
+/*
+ * walk every device to make sure some space is free, then relocate all
+ * of the chunks (except chunk zero), walking back from the largest offset.
+ * returns 0, -ENOMEM, or the error from searching the chunk tree.
+ */
+int btrfs_balance(struct btrfs_root *dev_root)
+{
+       int ret;
+       struct list_head *cur;
+       struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+       struct btrfs_device *device;
+       u64 old_size;
+       u64 size_to_free;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
+       struct btrfs_trans_handle *trans;
+       struct btrfs_key found_key;
+
+       dev_root = dev_root->fs_info->dev_root;
+
+       mutex_lock(&dev_root->fs_info->fs_mutex);
+       /* step one make some room on all the devices */
+       list_for_each(cur, devices) {
+               device = list_entry(cur, struct btrfs_device, dev_list);
+               old_size = device->total_bytes;
+               size_to_free = div_factor(old_size, 1);
+               size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+               if (device->total_bytes - device->bytes_used > size_to_free)
+                       continue;
+
+               ret = btrfs_shrink_device(device, old_size - size_to_free);
+               BUG_ON(ret);
+
+               trans = btrfs_start_transaction(dev_root, 1);
+               BUG_ON(!trans);
+
+               ret = btrfs_grow_device(trans, device, old_size);
+               BUG_ON(ret);
+
+               btrfs_end_transaction(trans, dev_root);
+       }
+
+       /* step two, relocate all the chunks */
+       path = btrfs_alloc_path();
+       if (!path) {
+               /* nothing to free yet, just drop the lock and bail */
+               mutex_unlock(&dev_root->fs_info->fs_mutex);
+               return -ENOMEM;
+       }
+
+       key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+       key.offset = (u64)-1;
+       key.type = BTRFS_CHUNK_ITEM_KEY;
+
+       while(1) {
+               ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+               if (ret < 0)
+                       goto error;
+
+               /* an exact match shouldn't happen, the last relocate failed */
+               if (ret == 0)
+                       break;
+
+               ret = btrfs_previous_item(chunk_root, path, 0,
+                                         BTRFS_CHUNK_ITEM_KEY);
+               if (ret)
+                       break;
+               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+                                     path->slots[0]);
+               if (found_key.objectid != key.objectid)
+                       break;
+               key.offset = found_key.offset;
+               /* chunk zero is special */
+               if (key.offset == 0)
+                       break;
+
+               ret = btrfs_relocate_chunk(chunk_root,
+                                          chunk_root->root_key.objectid,
+                                          found_key.objectid,
+                                          found_key.offset);
+               BUG_ON(ret);
+               btrfs_release_path(chunk_root, path);
+       }
+       ret = 0;
+error:
+       btrfs_free_path(path);
+       mutex_unlock(&dev_root->fs_info->fs_mutex);
+       return ret;
+}
+
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
@@ -985,15 +1086,6 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
        return 0;
 }
 
-static u64 div_factor(u64 num, int factor)
-{
-       if (factor == 10)
-               return num;
-       num *= factor;
-       do_div(num, 10);
-       return num;
-}
-
 static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
                               int sub_stripes)
 {
@@ -1040,6 +1132,11 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        int stripe_len = 64 * 1024;
        struct btrfs_key key;
 
+       if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+           (type & BTRFS_BLOCK_GROUP_DUP)) {
+               WARN_ON(1);
+               type &= ~BTRFS_BLOCK_GROUP_DUP;
+       }
        dev_list = &extent_root->fs_info->fs_devices->alloc_list;
        if (list_empty(dev_list))
                return -ENOSPC;
index 6f173450378b121e6fbc20cc1eb6a7ca3f648ee6..3f9a17f2e41b24b112c162ee85ea094df7bb9389 100644 (file)
@@ -134,4 +134,5 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
                                       u8 *uuid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
+int btrfs_balance(struct btrfs_root *dev_root);
 #endif