Btrfs: fix race between fs trimming and block group remove/allocation
fs/btrfs/extent-tree.c (linux-2.6-block.git)
index a5e64dda2db9b8c877d7c45eac2ec67cb7faab83..dbc115a25798c68a688ae0725c6e37090282374c 100644
@@ -9005,6 +9005,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
        INIT_LIST_HEAD(&cache->bg_list);
        INIT_LIST_HEAD(&cache->ro_list);
        btrfs_init_free_space_ctl(cache);
+       atomic_set(&cache->trimming, 0);
 
        return cache;
 }
@@ -9306,7 +9307,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 }
 
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root, u64 group_start)
+                            struct btrfs_root *root, u64 group_start,
+                            struct extent_map *em)
 {
        struct btrfs_path *path;
        struct btrfs_block_group_cache *block_group;
@@ -9319,6 +9321,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        int index;
        int factor;
        struct btrfs_caching_control *caching_ctl = NULL;
+       bool remove_em;
 
        root = root->fs_info->extent_root;
 
@@ -9464,6 +9467,61 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
        memcpy(&key, &block_group->key, sizeof(key));
 
+       lock_chunks(root);
+       spin_lock(&block_group->lock);
+       block_group->removed = 1;
+       /*
+        * At this point trimming can't start on this block group, because we
+        * removed the block group from the tree fs_info->block_group_cache_tree
+        * so no one can find it anymore and even if someone already got this
+        * block group before we removed it from the rbtree, they have already
+        * incremented block_group->trimming - if they didn't, they won't find
+        * any free space entries because we already removed them all when we
+        * called btrfs_remove_free_space_cache().
+        *
+        * And we must not remove the extent map from the fs_info->mapping_tree
+        * to prevent the same logical address range and physical device space
+        * ranges from being reused for a new block group. This is because our
+        * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
+        * completely transactionless, so while it is trimming a range the
+        * currently running transaction might finish and a new one start,
+        * allowing for new block groups to be created that can reuse the same
+        * physical device locations unless we take this special care.
+        */
+       remove_em = (atomic_read(&block_group->trimming) == 0);
+       /*
+        * Make sure a trimmer task always sees the em in the pinned_chunks list
+        * if it sees block_group->removed == 1 (needs to lock block_group->lock
+        * before checking block_group->removed).
+        */
+       if (!remove_em) {
+               /*
+                * Our em might be in trans->transaction->pending_chunks which
+                * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
+                * and so is the fs_info->pinned_chunks list.
+                *
+                * So at this point we must be holding the chunk_mutex to avoid
+                * any races with chunk allocation (more specifically at
+                * volumes.c:contains_pending_extent()), to ensure it always
+                * sees the em, either in the pending_chunks list or in the
+                * pinned_chunks list.
+                */
+               list_move_tail(&em->list, &root->fs_info->pinned_chunks);
+       }
+       spin_unlock(&block_group->lock);
+       unlock_chunks(root);
+
+       if (remove_em) {
+               struct extent_map_tree *em_tree;
+
+               em_tree = &root->fs_info->mapping_tree.map_tree;
+               write_lock(&em_tree->lock);
+               remove_extent_mapping(em_tree, em);
+               write_unlock(&em_tree->lock);
+               /* once for the tree */
+               free_extent_map(em);
+       }
+
        btrfs_put_block_group(block_group);
        btrfs_put_block_group(block_group);
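
For readers tracing the locking, here is a minimal userspace sketch of the handshake this hunk sets up between btrfs_remove_block_group() and the trim path (btrfs_trim_fs(), which is expected to increment block_group->trimming before touching a group and decrement it when done). Everything below is an illustrative model, not kernel code: the struct names, the trim_start()/trim_end() helpers and the single mutex standing in for both block_group->lock and fs_info->chunk_mutex are assumptions made only for the sketch.

/*
 * Simplified userspace model of the remove/trim handshake added by this
 * patch.  All names are illustrative; the real code lives in
 * fs/btrfs/extent-tree.c and fs/btrfs/free-space-cache.c and uses
 * block_group->lock plus fs_info->chunk_mutex where this model uses a
 * single pthread mutex.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct em {                        /* stands in for struct extent_map     */
        bool pinned;               /* moved to the "pinned_chunks" list   */
};

struct block_group {
        pthread_mutex_t lock;      /* models block_group->lock            */
        bool removed;              /* models block_group->removed         */
        atomic_int trimming;       /* models block_group->trimming        */
        struct em *em;
};

/* Remover side: decide who is responsible for dropping the extent map. */
static void remove_block_group(struct block_group *bg)
{
        bool remove_em;

        pthread_mutex_lock(&bg->lock);
        bg->removed = true;
        /* No trimmer holds a reference: we drop the em ourselves ...    */
        remove_em = (atomic_load(&bg->trimming) == 0);
        /* ... otherwise leave it pinned for the last trimmer to drop.   */
        if (!remove_em)
                bg->em->pinned = true;
        pthread_mutex_unlock(&bg->lock);

        if (remove_em) {
                free(bg->em);      /* models remove_extent_mapping() +
                                      free_extent_map()                  */
                bg->em = NULL;
        }
}

/* Trimmer side: take a reference before trimming, drop it afterwards. */
static bool trim_start(struct block_group *bg)
{
        bool ok;

        pthread_mutex_lock(&bg->lock);
        ok = !bg->removed;         /* removed groups are never trimmed    */
        if (ok)
                atomic_fetch_add(&bg->trimming, 1);
        pthread_mutex_unlock(&bg->lock);
        return ok;
}

static void trim_end(struct block_group *bg)
{
        pthread_mutex_lock(&bg->lock);
        /* Last trimmer out drops the em the remover left pinned.        */
        if (atomic_fetch_sub(&bg->trimming, 1) == 1 && bg->removed) {
                free(bg->em);
                bg->em = NULL;
        }
        pthread_mutex_unlock(&bg->lock);
}

int main(void)
{
        struct block_group bg = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .em = calloc(1, sizeof(struct em)),
        };

        /* Removal racing with an in-flight trim: the trimmer frees the em. */
        if (trim_start(&bg)) {
                remove_block_group(&bg);   /* sees trimming > 0, pins em  */
                trim_end(&bg);             /* last reference, frees em    */
        }
        printf("extent map still present: %s\n", bg.em ? "yes" : "no");
        return 0;
}

The point of the scheme is that exactly one side, whichever finishes last, removes and frees the extent map, so a transactionless trim can never observe the same logical/physical range being reused by a freshly allocated block group.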