Btrfs: Wait for IO on the block device inodes of newly added devices

[linux-2.6-block.git] / fs / btrfs / volumes.c
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c

index ba396857102453a60d93b2c43f036e953457ecf1..f63cf7621a01a3b46665fa87100dd31939e6c459 100644 (file)
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -56,6 +56,18 @@ void btrfs_unlock_volumes(void)
         mutex_unlock(&uuid_mutex);
  }
  
+static void lock_chunks(struct btrfs_root *root) /* take both mutexes in root->fs_info that this file pairs for chunk work */
+{
+       mutex_lock(&root->fs_info->alloc_mutex); /* acquired first; unlock_chunks() releases it last */
+       mutex_lock(&root->fs_info->chunk_mutex); /* acquired second, while alloc_mutex is already held */
+}
+
+static void unlock_chunks(struct btrfs_root *root) /* release the two mutexes taken by lock_chunks() */
+{
+       mutex_unlock(&root->fs_info->chunk_mutex); /* released first: reverse of the order lock_chunks() took them */
+       mutex_unlock(&root->fs_info->alloc_mutex); /* released last */
+}
+
  int btrfs_cleanup_fs_uuids(void)
  {
         struct btrfs_fs_devices *fs_devices;
@@ -82,8 +94,8 @@ int btrfs_cleanup_fs_uuids(void)
         return 0;
  }
  
-static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
-                                         u8 *uuid)
+static noinline struct btrfs_device *__find_device(struct list_head *head,
+                                                  u64 devid, u8 *uuid)
  {
         struct btrfs_device *dev;
         struct list_head *cur;
@@ -98,7 +110,7 @@ static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
         return NULL;
  }
  
-static struct btrfs_fs_devices *find_fsid(u8 *fsid)
+static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
  {
         struct list_head *cur;
         struct btrfs_fs_devices *fs_devices;
@@ -122,16 +134,22 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid)
   * the list if the block device is congested.  This way, multiple devices
   * can make progress from a single worker thread.
   */
-int run_scheduled_bios(struct btrfs_device *device)
+static int noinline run_scheduled_bios(struct btrfs_device *device)
  {
         struct bio *pending;
         struct backing_dev_info *bdi;
+       struct btrfs_fs_info *fs_info;
         struct bio *tail;
         struct bio *cur;
         int again = 0;
         unsigned long num_run = 0;
+       unsigned long limit;
  
         bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+       fs_info = device->dev_root->fs_info;
+       limit = btrfs_async_submit_limit(fs_info);
+       limit = limit * 2 / 3;
+
  loop:
         spin_lock(&device->io_lock);
  
@@ -167,8 +185,16 @@ loop:
                 cur = pending;
                 pending = pending->bi_next;
                 cur->bi_next = NULL;
-               atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+               atomic_dec(&fs_info->nr_async_bios);
+
+               if (atomic_read(&fs_info->nr_async_bios) < limit &&
+                   waitqueue_active(&fs_info->async_submit_wait))
+                       wake_up(&fs_info->async_submit_wait);
+
+               BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+               bio_get(cur);
                 submit_bio(cur->bi_rw, cur);
+               bio_put(cur);
                 num_run++;
  
                 /*
@@ -176,10 +202,11 @@ loop:
                  * is now congested.  Back off and let other work structs
                  * run instead
                  */
-               if (pending && num_run && bdi_write_congested(bdi)) {
+               if (pending && bdi_write_congested(bdi)) {
                         struct bio *old_head;
  
                         spin_lock(&device->io_lock);
+
                         old_head = device->pending_bios;
                         device->pending_bios = pending;
                         if (device->pending_bio_tail)
@@ -206,7 +233,7 @@ void pending_bios_fn(struct btrfs_work *work)
         run_scheduled_bios(device);
  }
  
-static int device_list_add(const char *path,
+static noinline int device_list_add(const char *path,
                            struct btrfs_super_block *disk_super,
                            u64 devid, struct btrfs_fs_devices **fs_devices_ret)
  {
@@ -271,13 +298,17 @@ again:
         list_for_each(cur, head) {
                 device = list_entry(cur, struct btrfs_device, dev_list);
                 if (!device->in_fs_metadata) {
-                       if (device->bdev) {
-                               close_bdev_excl(device->bdev);
-                               fs_devices->open_devices--;
-                       }
+                       struct block_device *bdev;
                         list_del(&device->dev_list);
                         list_del(&device->dev_alloc_list);
                         fs_devices->num_devices--;
+                       if (device->bdev) {
+                               bdev = device->bdev;
+                               fs_devices->open_devices--;
+                               mutex_unlock(&uuid_mutex);
+                               close_bdev_excl(bdev);
+                               mutex_lock(&uuid_mutex);
+                       }
                         kfree(device->name);
                         kfree(device);
                         goto again;
@@ -449,10 +480,10 @@ error:
   * called very infrequently and that a given device has a small number
   * of extents
   */
-static int find_free_dev_extent(struct btrfs_trans_handle *trans,
-                               struct btrfs_device *device,
-                               struct btrfs_path *path,
-                               u64 num_bytes, u64 *start)
+static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
+                                        struct btrfs_device *device,
+                                        struct btrfs_path *path,
+                                        u64 num_bytes, u64 *start)
  {
         struct btrfs_key key;
         struct btrfs_root *root = device->dev_root;
@@ -614,7 +645,7 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
         return ret;
  }
  
-int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                            struct btrfs_device *device,
                            u64 chunk_tree, u64 chunk_objectid,
                            u64 chunk_offset,
@@ -662,7 +693,8 @@ err:
         return ret;
  }
  
-static int find_next_chunk(struct btrfs_root *root, u64 objectid, u64 *offset)
+static noinline int find_next_chunk(struct btrfs_root *root,
+                                   u64 objectid, u64 *offset)
  {
         struct btrfs_path *path;
         int ret;
@@ -704,8 +736,8 @@ error:
         return ret;
  }
  
-static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path,
-                          u64 *objectid)
+static noinline int find_next_devid(struct btrfs_root *root,
+                                   struct btrfs_path *path, u64 *objectid)
  {
         int ret;
         struct btrfs_key key;
@@ -818,6 +850,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
         key.type = BTRFS_DEV_ITEM_KEY;
         key.offset = device->devid;
+       lock_chunks(root);
  
         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
         if (ret < 0)
@@ -852,6 +885,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
                                     total_bytes - 1);
  out:
         btrfs_free_path(path);
+       unlock_chunks(root);
         btrfs_commit_transaction(trans, root);
         return ret;
  }
@@ -866,8 +900,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
         u64 devid;
         int ret = 0;
  
-       mutex_lock(&root->fs_info->fs_mutex);
         mutex_lock(&uuid_mutex);
+       mutex_lock(&root->fs_info->volume_mutex);
  
         all_avail = root->fs_info->avail_data_alloc_bits |
                 root->fs_info->avail_system_alloc_bits |
@@ -983,8 +1017,8 @@ error_close:
         if (bdev)
                 close_bdev_excl(bdev);
  out:
+       mutex_unlock(&root->fs_info->volume_mutex);
         mutex_unlock(&uuid_mutex);
-       mutex_unlock(&root->fs_info->fs_mutex);
         return ret;
  }
  
@@ -1003,8 +1037,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
         if (!bdev) {
                 return -EIO;
         }
-       mutex_lock(&root->fs_info->fs_mutex);
+
+       filemap_write_and_wait(bdev->bd_inode->i_mapping);
+       mutex_lock(&root->fs_info->volume_mutex);
+
         trans = btrfs_start_transaction(root, 1);
+       lock_chunks(root);
         devices = &root->fs_info->fs_devices->devices;
         list_for_each(cur, devices) {
                 device = list_entry(cur, struct btrfs_device, dev_list);
@@ -1042,6 +1080,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
         if (ret)
                 goto out_close_bdev;
  
+       set_blocksize(device->bdev, 4096);
+
         total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
         btrfs_set_super_total_bytes(&root->fs_info->super_copy,
                                     total_bytes + device->total_bytes);
@@ -1056,8 +1096,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
         root->fs_info->fs_devices->num_devices++;
         root->fs_info->fs_devices->open_devices++;
  out:
+       unlock_chunks(root);
         btrfs_end_transaction(trans, root);
-       mutex_unlock(&root->fs_info->fs_mutex);
+       mutex_unlock(&root->fs_info->volume_mutex);
+
         return ret;
  
  out_close_bdev:
@@ -1065,8 +1107,8 @@ out_close_bdev:
         goto out;
  }
  
-int btrfs_update_device(struct btrfs_trans_handle *trans,
-                       struct btrfs_device *device)
+int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
+                                struct btrfs_device *device)
  {
         int ret;
         struct btrfs_path *path;
@@ -1111,7 +1153,7 @@ out:
         return ret;
  }
  
-int btrfs_grow_device(struct btrfs_trans_handle *trans,
+static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
                       struct btrfs_device *device, u64 new_size)
  {
         struct btrfs_super_block *super_copy =
@@ -1123,6 +1165,16 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
         return btrfs_update_device(trans, device);
  }
  
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+                     struct btrfs_device *device, u64 new_size) /* locked wrapper around __btrfs_grow_device() */
+{
+       int ret;
+       lock_chunks(device->dev_root); /* hold alloc_mutex + chunk_mutex of the device's root across the resize */
+       ret = __btrfs_grow_device(trans, device, new_size);
+       unlock_chunks(device->dev_root); /* dropped on every path; there is no early return above */
+       return ret; /* result of __btrfs_grow_device(), passed through unchanged */
+}
+
  static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             u64 chunk_tree, u64 chunk_objectid,
@@ -1217,12 +1269,14 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
         em_tree = &root->fs_info->mapping_tree.map_tree;
  
         /* step one, relocate all the extents inside this chunk */
-       ret = btrfs_shrink_extent_tree(extent_root, chunk_offset);
+       ret = btrfs_relocate_block_group(extent_root, chunk_offset);
         BUG_ON(ret);
  
         trans = btrfs_start_transaction(root, 1);
         BUG_ON(!trans);
  
+       lock_chunks(root);
+
         /*
          * step two, delete the device extents and the
          * chunk tree entries
@@ -1255,18 +1309,22 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
                 BUG_ON(ret);
         }
  
+       ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+       BUG_ON(ret);
+
         spin_lock(&em_tree->lock);
         remove_extent_mapping(em_tree, em);
+       spin_unlock(&em_tree->lock);
+
         kfree(map);
         em->bdev = NULL;
  
         /* once for the tree */
         free_extent_map(em);
-       spin_unlock(&em_tree->lock);
-
         /* once for us */
         free_extent_map(em);
  
+       unlock_chunks(root);
         btrfs_end_transaction(trans, root);
         return 0;
  }
@@ -1297,9 +1355,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
         struct btrfs_key found_key;
  
  
+       mutex_lock(&dev_root->fs_info->volume_mutex);
         dev_root = dev_root->fs_info->dev_root;
  
-       mutex_lock(&dev_root->fs_info->fs_mutex);
         /* step one make some room on all the devices */
         list_for_each(cur, devices) {
                 device = list_entry(cur, struct btrfs_device, dev_list);
@@ -1343,13 +1401,14 @@ int btrfs_balance(struct btrfs_root *dev_root)
  
                 ret = btrfs_previous_item(chunk_root, path, 0,
                                           BTRFS_CHUNK_ITEM_KEY);
-               if (ret) {
+               if (ret)
                         break;
-               }
+
                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                       path->slots[0]);
                 if (found_key.objectid != key.objectid)
                         break;
+
                 chunk = btrfs_item_ptr(path->nodes[0],
                                        path->slots[0],
                                        struct btrfs_chunk);
@@ -1358,17 +1417,17 @@ int btrfs_balance(struct btrfs_root *dev_root)
                 if (key.offset == 0)
                         break;
  
+               btrfs_release_path(chunk_root, path);
                 ret = btrfs_relocate_chunk(chunk_root,
                                            chunk_root->root_key.objectid,
                                            found_key.objectid,
                                            found_key.offset);
                 BUG_ON(ret);
-               btrfs_release_path(chunk_root, path);
         }
         ret = 0;
  error:
         btrfs_free_path(path);
-       mutex_unlock(&dev_root->fs_info->fs_mutex);
+       mutex_unlock(&dev_root->fs_info->volume_mutex);
         return ret;
  }
  
@@ -1408,14 +1467,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
  
         path->reada = 2;
  
+       lock_chunks(root);
+
         device->total_bytes = new_size;
         ret = btrfs_update_device(trans, device);
         if (ret) {
+               unlock_chunks(root);
                 btrfs_end_transaction(trans, root);
                 goto done;
         }
         WARN_ON(diff > old_total);
         btrfs_set_super_total_bytes(super_copy, old_total - diff);
+       unlock_chunks(root);
         btrfs_end_transaction(trans, root);
  
         key.objectid = device->devid;
@@ -1488,8 +1551,8 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
         return 0;
  }
  
-static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
-                              int sub_stripes)
+static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
+                                       int num_stripes, int sub_stripes)
  {
         if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
                 return calc_size;
@@ -1609,8 +1672,13 @@ again:
         else
                 min_free = calc_size;
  
-       /* we add 1MB because we never use the first 1MB of the device */
-       min_free += 1024 * 1024;
+       /*
+        * we add 1MB because we never use the first 1MB of the device, unless
+        * we've looped, then we are likely allocating the maximum amount of
+        * space left already
+        */
+       if (!looped)
+               min_free += 1024 * 1024;
  
         /* build a private list of devices we will allocate from */
         while(index < num_stripes) {
@@ -2016,23 +2084,22 @@ int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
  }
  
  
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
  static void end_bio_multi_stripe(struct bio *bio, int err)
-#else
-static int end_bio_multi_stripe(struct bio *bio,
-                                  unsigned int bytes_done, int err)
-#endif
  {
         struct btrfs_multi_bio *multi = bio->bi_private;
+       int is_orig_bio = 0;
  
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-       if (bio->bi_size)
-               return 1;
-#endif
         if (err)
                 atomic_inc(&multi->error);
  
+       if (bio == multi->orig_bio)
+               is_orig_bio = 1;
+
         if (atomic_dec_and_test(&multi->stripes_pending)) {
+               if (!is_orig_bio) {
+                       bio_put(bio);
+                       bio = multi->orig_bio;
+               }
                 bio->bi_private = multi->private;
                 bio->bi_end_io = multi->end_io;
                 /* only send an error to the higher layers if it is
@@ -2050,17 +2117,10 @@ static int end_bio_multi_stripe(struct bio *bio,
                 }
                 kfree(multi);
  
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-               bio_endio(bio, bio->bi_size, err);
-#else
                 bio_endio(bio, err);
-#endif
-       } else {
+       } else if (!is_orig_bio) {
                 bio_put(bio);
         }
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-       return 0;
-#endif
  }
  
  struct async_sched {
@@ -2077,24 +2137,28 @@ struct async_sched {
   * This will add one bio to the pending list for a device and make sure
   * the work struct is scheduled.
   */
-int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
-                int rw, struct bio *bio)
+static int noinline schedule_bio(struct btrfs_root *root,
+                                struct btrfs_device *device,
+                                int rw, struct bio *bio)
  {
         int should_queue = 1;
  
         /* don't bother with additional async steps for reads, right now */
         if (!(rw & (1 << BIO_RW))) {
+               bio_get(bio);
                 submit_bio(rw, bio);
+               bio_put(bio);
                 return 0;
         }
  
         /*
-        * nr_async_sumbits allows us to reliably return congestion to the
+        * nr_async_bios allows us to reliably return congestion to the
          * higher layers.  Otherwise, the async bio makes it appear we have
          * made progress against dirty pages when we've really just put it
          * on a queue for later
          */
-       atomic_inc(&root->fs_info->nr_async_submits);
+       atomic_inc(&root->fs_info->nr_async_bios);
+       WARN_ON(bio->bi_next);
         bio->bi_next = NULL;
         bio->bi_rw |= rw;
  
@@ -2147,6 +2211,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
         }
         multi->end_io = first_bio->bi_end_io;
         multi->private = first_bio->bi_private;
+       multi->orig_bio = first_bio;
         atomic_set(&multi->stripes_pending, multi->num_stripes);
  
         while(dev_nr < total_devs) {
@@ -2171,11 +2236,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                 } else {
                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
                         bio->bi_sector = logical >> 9;
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-                       bio_endio(bio, bio->bi_size, -EIO);
-#else
                         bio_endio(bio, -EIO);
-#endif
                 }
                 dev_nr++;
         }
@@ -2486,4 +2547,3 @@ again:
  error:
         return ret;
  }
-