Btrfs: fix outstanding_extents accounting in DIO

[linux-2.6-block.git] / fs / btrfs / transaction.c
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c

index d89c6d3542cab4c55372954ae97e9e32ec1ba443..932709af5163191714c671a7fa541ef5a03303a7 100644 (file)
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
         }
  }
  
+/*
+ * Empty a btree io tree: every extent state still linked in tree->state is
+ * unlinked from the rbtree and handed to free_extent_state().
+ *
+ * The walk runs under tree->lock.  After each entry the lock is dropped and
+ * retaken around cond_resched() if a reschedule is needed.
+ */
+static void clear_btree_io_tree(struct extent_io_tree *tree)
+{
+       struct rb_node *first;
+       struct extent_state *es;
+
+       spin_lock(&tree->lock);
+       for (;;) {
+               if (RB_EMPTY_ROOT(&tree->state))
+                       break;
+
+               /* Always take the leftmost entry until the tree is empty. */
+               first = rb_first(&tree->state);
+               es = rb_entry(first, struct extent_state, rb_node);
+               rb_erase(&es->rb_node, &tree->state);
+               RB_CLEAR_NODE(&es->rb_node);
+
+               /*
+                * No task is ever expected to wait for flag changes on the
+                * extent states of a btree io tree.
+                */
+               ASSERT(!waitqueue_active(&es->wq));
+               free_extent_state(es);
+
+               if (!need_resched())
+                       continue;
+
+               /* Give up the spinlock before yielding the CPU. */
+               spin_unlock(&tree->lock);
+               cond_resched();
+               spin_lock(&tree->lock);
+       }
+       spin_unlock(&tree->lock);
+}
+
+
  static noinline void switch_commit_roots(struct btrfs_transaction *trans,
                                          struct btrfs_fs_info *fs_info)
  {
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
                 root->commit_root = btrfs_root_node(root);
                 if (is_fstree(root->objectid))
                         btrfs_unpin_free_ino(root);
+               clear_btree_io_tree(&root->dirty_log_pages);
         }
         up_write(&fs_info->commit_root_sem);
  }
@@ -193,6 +220,7 @@ loop:
          * commit the transaction.
          */
         atomic_set(&cur_trans->use_count, 2);
+       cur_trans->have_free_bgs = 0;
         cur_trans->start_time = get_seconds();
  
         cur_trans->delayed_refs.href_root = RB_ROOT;
@@ -220,6 +248,9 @@ loop:
         INIT_LIST_HEAD(&cur_trans->pending_snapshots);
         INIT_LIST_HEAD(&cur_trans->pending_chunks);
         INIT_LIST_HEAD(&cur_trans->switch_commits);
+       INIT_LIST_HEAD(&cur_trans->pending_ordered);
+       INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+       spin_lock_init(&cur_trans->dirty_bgs_lock);
         list_add_tail(&cur_trans->list, &fs_info->trans_list);
         extent_io_tree_init(&cur_trans->dirty_pages,
                              fs_info->btree_inode->i_mapping);
@@ -386,7 +417,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
         int ret;
  
         /* Send isn't supposed to start transactions. */
-       ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
+       ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
  
         if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
                 return ERR_PTR(-EROFS);
@@ -408,7 +439,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
         if (num_items > 0 && root != root->fs_info->chunk_root) {
                 if (root->fs_info->quota_enabled &&
                     is_fstree(root->root_key.objectid)) {
-                       qgroup_reserved = num_items * root->leafsize;
+                       qgroup_reserved = num_items * root->nodesize;
                         ret = btrfs_qgroup_reserve(root, qgroup_reserved);
                         if (ret)
                                 return ERR_PTR(ret);
@@ -418,7 +449,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
                 /*
                  * Do the reservation for the relocation root creation
                  */
-               if (unlikely(need_reserve_reloc_root(root))) {
+               if (need_reserve_reloc_root(root)) {
                         num_bytes += root->nodesize;
                         reloc_reserved = true;
                 }
@@ -488,6 +519,7 @@ again:
         h->sync = false;
         INIT_LIST_HEAD(&h->qgroup_ref_list);
         INIT_LIST_HEAD(&h->new_bgs);
+       INIT_LIST_HEAD(&h->ordered);
  
         smp_mb();
         if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -609,7 +641,6 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
                 if (transid <= root->fs_info->last_trans_committed)
                         goto out;
  
-               ret = -EINVAL;
                 /* find specified transaction */
                 spin_lock(&root->fs_info->trans_lock);
                 list_for_each_entry(t, &root->fs_info->trans_list, list) {
@@ -625,9 +656,16 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
                         }
                 }
                 spin_unlock(&root->fs_info->trans_lock);
-               /* The specified transaction doesn't exist */
-               if (!cur_trans)
+
+               /*
+                * The specified transaction doesn't exist, or we
+                * raced with btrfs_commit_transaction
+                */
+               if (!cur_trans) {
+                       if (transid > root->fs_info->last_trans_committed)
+                               ret = -EINVAL;
                         goto out;
+               }
         } else {
                 /* find newest transaction that is committing | committed */
                 spin_lock(&root->fs_info->trans_lock);
@@ -713,6 +751,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
         if (!list_empty(&trans->new_bgs))
                 btrfs_create_pending_block_groups(trans, root);
  
+       if (!list_empty(&trans->ordered)) {
+               spin_lock(&info->trans_lock);
+               list_splice(&trans->ordered, &cur_trans->pending_ordered);
+               spin_unlock(&info->trans_lock);
+       }
+
         trans->delayed_ref_updates = 0;
         if (!trans->sync) {
                 must_run_delayed_refs =
@@ -822,17 +866,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
  
         while (!find_first_extent_bit(dirty_pages, start, &start, &end,
                                       mark, &cached_state)) {
-               convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
-                                  mark, &cached_state, GFP_NOFS);
-               cached_state = NULL;
-               err = filemap_fdatawrite_range(mapping, start, end);
+               bool wait_writeback = false;
+
+               err = convert_extent_bit(dirty_pages, start, end,
+                                        EXTENT_NEED_WAIT,
+                                        mark, &cached_state, GFP_NOFS);
+               /*
+                * convert_extent_bit can return -ENOMEM, which is most of the
+                * time a temporary error. So when it happens, ignore the error
+                * and wait for writeback of this range to finish - because we
+                * failed to set the bit EXTENT_NEED_WAIT for the range, a call
+                * to btrfs_wait_marked_extents() would not know that writeback
+                * for this range started and therefore wouldn't wait for it to
+                * finish - we don't want to commit a superblock that points to
+                * btree nodes/leafs for which writeback hasn't finished yet
+                * (and without errors).
+                * We cleanup any entries left in the io tree when committing
+                * the transaction (through clear_btree_io_tree()).
+                */
+               if (err == -ENOMEM) {
+                       err = 0;
+                       wait_writeback = true;
+               }
+               if (!err)
+                       err = filemap_fdatawrite_range(mapping, start, end);
                 if (err)
                         werr = err;
+               else if (wait_writeback)
+                       werr = filemap_fdatawait_range(mapping, start, end);
+               free_extent_state(cached_state);
+               cached_state = NULL;
                 cond_resched();
                 start = end + 1;
         }
-       if (err)
-               werr = err;
         return werr;
  }
  
@@ -851,19 +917,55 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
         struct extent_state *cached_state = NULL;
         u64 start = 0;
         u64 end;
+       struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
+       bool errors = false;
  
         while (!find_first_extent_bit(dirty_pages, start, &start, &end,
                                       EXTENT_NEED_WAIT, &cached_state)) {
-               clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
-                                0, 0, &cached_state, GFP_NOFS);
-               err = filemap_fdatawait_range(mapping, start, end);
+               /*
+                * Ignore -ENOMEM errors returned by clear_extent_bit().
+                * When committing the transaction, we'll remove any entries
+                * left in the io tree. For a log commit, we don't remove them
+                * after committing the log because the tree can be accessed
+                * concurrently - we do it only at transaction commit time when
+                * it's safe to do it (through clear_btree_io_tree()).
+                */
+               err = clear_extent_bit(dirty_pages, start, end,
+                                      EXTENT_NEED_WAIT,
+                                      0, 0, &cached_state, GFP_NOFS);
+               if (err == -ENOMEM)
+                       err = 0;
+               if (!err)
+                       err = filemap_fdatawait_range(mapping, start, end);
                 if (err)
                         werr = err;
+               free_extent_state(cached_state);
+               cached_state = NULL;
                 cond_resched();
                 start = end + 1;
         }
         if (err)
                 werr = err;
+
+       if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+               if ((mark & EXTENT_DIRTY) &&
+                   test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR,
+                                      &btree_ino->runtime_flags))
+                       errors = true;
+
+               if ((mark & EXTENT_NEW) &&
+                   test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR,
+                                      &btree_ino->runtime_flags))
+                       errors = true;
+       } else {
+               if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR,
+                                      &btree_ino->runtime_flags))
+                       errors = true;
+       }
+
+       if (errors && !werr)
+               werr = -EIO;
+
         return werr;
  }
  
@@ -891,17 +993,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
         return 0;
  }
  
-int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root)
  {
-       if (!trans || !trans->transaction) {
-               struct inode *btree_inode;
-               btree_inode = root->fs_info->btree_inode;
-               return filemap_write_and_wait(btree_inode->i_mapping);
-       }
-       return btrfs_write_and_wait_marked_extents(root,
+       int ret;
+
+       ret = btrfs_write_and_wait_marked_extents(root,
                                            &trans->transaction->dirty_pages,
                                            EXTENT_DIRTY);
+       clear_btree_io_tree(&trans->transaction->dirty_pages);
+
+       return ret;
  }
  
  /*
@@ -923,7 +1025,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
         struct btrfs_root *tree_root = root->fs_info->tree_root;
  
         old_root_used = btrfs_root_used(&root->root_item);
-       btrfs_write_dirty_block_groups(trans, root);
  
         while (1) {
                 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -939,9 +1040,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
                         return ret;
  
                 old_root_used = btrfs_root_used(&root->root_item);
-               ret = btrfs_write_dirty_block_groups(trans, root);
-               if (ret)
-                       return ret;
         }
  
         return 0;
@@ -958,14 +1056,11 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root)
  {
         struct btrfs_fs_info *fs_info = root->fs_info;
+       struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
         struct list_head *next;
         struct extent_buffer *eb;
         int ret;
  
-       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-       if (ret)
-               return ret;
-
         eb = btrfs_lock_root_node(fs_info->tree_root);
         ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
                               0, &eb);
@@ -989,15 +1084,20 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
         if (ret)
                 return ret;
  
+       ret = btrfs_setup_space_cache(trans, root);
+       if (ret)
+               return ret;
+
         /* run_qgroups might have added some more refs */
         ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
         if (ret)
                 return ret;
-
+again:
         while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                 next = fs_info->dirty_cowonly_roots.next;
                 list_del_init(next);
                 root = list_entry(next, struct btrfs_root, dirty_list);
+               clear_bit(BTRFS_ROOT_DIRTY, &root->state);
  
                 if (root != fs_info->extent_root)
                         list_add_tail(&root->dirty_list,
@@ -1005,8 +1105,23 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                 ret = update_cowonly_root(trans, root);
                 if (ret)
                         return ret;
+               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+               if (ret)
+                       return ret;
         }
  
+       while (!list_empty(dirty_bgs)) {
+               ret = btrfs_write_dirty_block_groups(trans, root);
+               if (ret)
+                       return ret;
+               ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+               if (ret)
+                       return ret;
+       }
+
+       if (!list_empty(&fs_info->dirty_cowonly_roots))
+               goto again;
+
         list_add_tail(&fs_info->extent_root->dirty_list,
                       &trans->transaction->switch_commits);
         btrfs_after_dev_replace_commit(fs_info);
@@ -1624,11 +1739,34 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
                 btrfs_wait_ordered_roots(fs_info, -1);
  }
  
+/*
+ * Drain cur_trans->pending_ordered: each ordered extent is taken off the
+ * list, waited on until its BTRFS_ORDERED_COMPLETE flag is set, and then
+ * passed to btrfs_put_ordered_extent().
+ *
+ * The list is only touched with fs_info->trans_lock held; the lock is
+ * released while waiting on an individual extent.
+ */
+static inline void
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
+                          struct btrfs_fs_info *fs_info)
+{
+       struct list_head *pending = &cur_trans->pending_ordered;
+       struct btrfs_ordered_extent *oe;
+
+       spin_lock(&fs_info->trans_lock);
+       for (;;) {
+               if (list_empty(pending))
+                       break;
+
+               oe = list_first_entry(pending, struct btrfs_ordered_extent,
+                                     trans_list);
+               list_del_init(&oe->trans_list);
+
+               /* The wait may sleep, so it must happen without the lock. */
+               spin_unlock(&fs_info->trans_lock);
+               wait_event(oe->wait,
+                          test_bit(BTRFS_ORDERED_COMPLETE, &oe->flags));
+               btrfs_put_ordered_extent(oe);
+               spin_lock(&fs_info->trans_lock);
+       }
+       spin_unlock(&fs_info->trans_lock);
+}
+
+
  int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root)
  {
         struct btrfs_transaction *cur_trans = trans->transaction;
         struct btrfs_transaction *prev_trans = NULL;
+       struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
         int ret;
  
         /* Stop the commit early if ->aborted is set */
@@ -1673,6 +1811,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         }
  
         spin_lock(&root->fs_info->trans_lock);
+       list_splice(&trans->ordered, &cur_trans->pending_ordered);
         if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
                 spin_unlock(&root->fs_info->trans_lock);
                 atomic_inc(&cur_trans->use_count);
@@ -1680,6 +1819,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
  
                 wait_for_commit(root, cur_trans);
  
+               if (unlikely(cur_trans->aborted))
+                       ret = cur_trans->aborted;
+
                 btrfs_put_transaction(cur_trans);
  
                 return ret;
@@ -1725,6 +1867,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
  
         btrfs_wait_delalloc_flush(root->fs_info);
  
+       btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+
         btrfs_scrub_pause(root);
         /*
          * Ok now we need to make sure to block out any other joins while we
@@ -1813,13 +1957,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         }
  
         /*
-        * Since the transaction is done, we should set the inode map cache flag
-        * before any other comming transaction.
+        * Since the transaction is done, we can apply the pending changes
+        * before the next transaction.
          */
-       if (btrfs_test_opt(root, CHANGE_INODE_CACHE))
-               btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
-       else
-               btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
+       btrfs_apply_pending_changes(root->fs_info);
  
         /* commit_fs_roots gets rid of all the tree log roots, it is now
          * safe to free the root of tree log roots
@@ -1861,6 +2002,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         switch_commit_roots(cur_trans, root->fs_info);
  
         assert_qgroups_uptodate(trans);
+       ASSERT(list_empty(&cur_trans->dirty_bgs));
         update_super_roots(root);
  
         btrfs_set_super_log_root(root->fs_info->super_copy, 0);
@@ -1868,6 +2010,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
                sizeof(*root->fs_info->super_copy));
  
+       btrfs_update_commit_device_size(root->fs_info);
+       btrfs_update_commit_device_bytes_used(root, cur_trans);
+
+       clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+       clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+
         spin_lock(&root->fs_info->trans_lock);
         cur_trans->state = TRANS_STATE_UNBLOCKED;
         root->fs_info->running_transaction = NULL;
@@ -1898,6 +2046,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
  
         btrfs_finish_extent_commit(trans, root);
  
+       if (cur_trans->have_free_bgs)
+               btrfs_clear_space_info_full(root->fs_info);
+
         root->fs_info->last_trans_committed = cur_trans->transid;
         /*
          * We needn't acquire the lock here because there is no other task
@@ -1981,9 +2132,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
                 ret = btrfs_drop_snapshot(root, NULL, 0, 0);
         else
                 ret = btrfs_drop_snapshot(root, NULL, 1, 0);
-       /*
-        * If we encounter a transaction abort during snapshot cleaning, we
-        * don't want to crash here
-        */
+
         return (ret < 0) ? 0 : 1;
  }
+
+/*
+ * Apply the changes queued in fs_info->pending_changes.
+ *
+ * The whole bitmask is fetched and reset to zero in one atomic step, so each
+ * queued change is acted upon once; bits set after that point are left for a
+ * later call.  Known bits are handled one by one, and any bit that remains
+ * afterwards is reported with a warning and otherwise ignored.
+ */
+void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
+{
+       unsigned long prev;
+       unsigned long bit;
+
+       /*
+        * xchg() returns the old mask and stores 0 unconditionally.
+        * cmpxchg(ptr, 0, 0) would only store when the value is already 0,
+        * leaving the pending bits set for every subsequent call.
+        */
+       prev = xchg(&fs_info->pending_changes, 0);
+       if (!prev)
+               return;
+
+       /* Set is handled before clear, so clear wins if both are pending. */
+       bit = 1UL << BTRFS_PENDING_SET_INODE_MAP_CACHE;
+       if (prev & bit)
+               btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+       prev &= ~bit;
+
+       bit = 1UL << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
+       if (prev & bit)
+               btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
+       prev &= ~bit;
+
+       bit = 1UL << BTRFS_PENDING_COMMIT;
+       if (prev & bit)
+               btrfs_debug(fs_info, "pending commit done");
+       prev &= ~bit;
+
+       /* Whatever is left corresponds to no action known here. */
+       if (prev)
+               btrfs_warn(fs_info,
+                       "unknown pending changes left 0x%lx, ignoring", prev);
+}