Btrfs: fix hang during inode eviction due to concurrent readahead

author Filipe Manana <fdmanana@suse.com>

Mon, 25 May 2015 23:55:42 +0000 (00:55 +0100)

committer Chris Mason <clm@fb.com>

Wed, 3 Jun 2015 11:03:09 +0000 (04:03 -0700)
author Filipe Manana <fdmanana@suse.com>
Mon, 25 May 2015 23:55:42 +0000 (00:55 +0100)
committer Chris Mason <clm@fb.com>
Wed, 3 Jun 2015 11:03:09 +0000 (04:03 -0700)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index 8bb013672aee061e81eb03fcba6d51db9cd169af..855935f6671ae59b1b025c3916d553bc689191ef 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4986,24 +4986,40 @@ static void evict_inode_truncate_pages(struct inode *inode)
         }
         write_unlock(&map_tree->lock);
  
+       /*
+        * Keep looping until we have no more ranges in the io tree.
+        * We can have ongoing bios started by readpages (called from readahead)
+        * that didn't get their end io callbacks called yet or they are still
+        * in progress (extent_io.c:end_bio_extent_readpage()). This means some
+        * ranges can still be locked and eviction started because before
+        * submitting those bios, which are executed by a separate task (work
+        * queue kthread), inode references (inode->i_count) were not taken
+        * (which would be dropped in the end io callback of each bio).
+        * Therefore here we effectively end up waiting for those bios and
+        * anyone else holding locked ranges without having bumped the inode's
+        * reference count - if we don't do it, when they access the inode's
+        * io_tree to unlock a range it may be too late, leading to a
+        * use-after-free issue.
+        */
         spin_lock(&io_tree->lock);
         while (!RB_EMPTY_ROOT(&io_tree->state)) {
                 struct extent_state *state;
                 struct extent_state *cached_state = NULL;
+               u64 start;
+               u64 end;
  
                 node = rb_first(&io_tree->state);
                 state = rb_entry(node, struct extent_state, rb_node);
-               atomic_inc(&state->refs);
+               start = state->start;
+               end = state->end;
                 spin_unlock(&io_tree->lock);
  
-               lock_extent_bits(io_tree, state->start, state->end,
-                                0, &cached_state);
-               clear_extent_bit(io_tree, state->start, state->end,
+               lock_extent_bits(io_tree, start, end, 0, &cached_state);
+               clear_extent_bit(io_tree, start, end,
                                  EXTENT_LOCKED | EXTENT_DIRTY |
                                  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
                                  EXTENT_DEFRAG, 1, 1,
                                  &cached_state, GFP_NOFS);
-               free_extent_state(state);
  
                 cond_resched();
                 spin_lock(&io_tree->lock);
author	Filipe Manana <fdmanana@suse.com>
	Mon, 25 May 2015 23:55:42 +0000 (00:55 +0100)
committer	Chris Mason <clm@fb.com>
	Wed, 3 Jun 2015 11:03:09 +0000 (04:03 -0700)