btrfs: convert the io_failure_tree to a plain rb_tree
author Josef Bacik <josef@toxicpanda.com>
Fri, 9 Sep 2022 21:53:16 +0000 (17:53 -0400)
committer David Sterba <dsterba@suse.com>
Mon, 26 Sep 2022 10:28:02 +0000 (12:28 +0200)
We still have this oddity of stashing the io_failure_record in the
extent state for the io_failure_tree, which is leftover from when we
used to stuff private pointers in extent_io_trees.

However, this doesn't make a lot of sense for the io failure records; we
can simply use a normal rb_tree for this.  This will allow us to further
simplify the extent_io_tree code by removing the io_failure_rec pointer
from the extent state.

Convert the io_failure_tree to an rb tree + spinlock in the inode, and
then use our rb tree simple helpers to insert and find failed records.
This greatly cleans up this code and makes it easier to separate out the
extent_io_tree code.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/btrfs_inode.h
fs/btrfs/extent-io-tree.h
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/inode.c
fs/btrfs/misc.h
include/trace/events/btrfs.h

index b160b8e124e01152ae345297dfa1083d1c0d45f6..108af52ba870b9a07bd92328c9ed9e7386336de3 100644 (file)
@@ -94,7 +94,8 @@ struct btrfs_inode {
        /* special utility tree used to record which mirrors have already been
         * tried when checksums fail for a given block
         */
-       struct extent_io_tree io_failure_tree;
+       struct rb_root io_failure_tree;
+       spinlock_t io_failure_lock;
 
        /*
         * Keep track of where the inode has extent items mapped in order to
index 5584968643eb2d892767f464df1f29a29d71e2f9..ee2ba4b6e4a158e9d665116e93f94451500f4008 100644 (file)
@@ -56,7 +56,6 @@ enum {
        IO_TREE_FS_EXCLUDED_EXTENTS,
        IO_TREE_BTREE_INODE_IO,
        IO_TREE_INODE_IO,
-       IO_TREE_INODE_IO_FAILURE,
        IO_TREE_RELOC_BLOCKS,
        IO_TREE_TRANS_DIRTY_PAGES,
        IO_TREE_ROOT_DIRTY_LOG_PAGES,
@@ -89,8 +88,6 @@ struct extent_state {
        refcount_t refs;
        u32 state;
 
-       struct io_failure_record *failrec;
-
 #ifdef CONFIG_BTRFS_DEBUG
        struct list_head leak_list;
 #endif
index 156ab8cc1accd9b8662df841f0acb59af5aa7d9c..1ab14fa7f837507792a65e331a9907197779eae8 100644 (file)
@@ -326,7 +326,6 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
        if (!state)
                return state;
        state->state = 0;
-       state->failrec = NULL;
        RB_CLEAR_NODE(&state->rb_node);
        btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
        refcount_set(&state->refs, 1);
@@ -2159,64 +2158,29 @@ out:
        return total_bytes;
 }
 
-/*
- * set the private field for a given byte offset in the tree.  If there isn't
- * an extent_state there already, this does nothing.
- */
-static int set_state_failrec(struct extent_io_tree *tree, u64 start,
-                            struct io_failure_record *failrec)
+static int insert_failrec(struct btrfs_inode *inode,
+                         struct io_failure_record *failrec)
 {
-       struct rb_node *node;
-       struct extent_state *state;
-       int ret = 0;
+       struct rb_node *exist;
 
-       spin_lock(&tree->lock);
-       /*
-        * this search will find all the extents that end after
-        * our range starts.
-        */
-       node = tree_search(tree, start);
-       if (!node) {
-               ret = -ENOENT;
-               goto out;
-       }
-       state = rb_entry(node, struct extent_state, rb_node);
-       if (state->start != start) {
-               ret = -ENOENT;
-               goto out;
-       }
-       state->failrec = failrec;
-out:
-       spin_unlock(&tree->lock);
-       return ret;
+       spin_lock(&inode->io_failure_lock);
+       exist = rb_simple_insert(&inode->io_failure_tree, failrec->bytenr,
+                                &failrec->rb_node);
+       spin_unlock(&inode->io_failure_lock);
+
+       return (exist == NULL) ? 0 : -EEXIST;
 }
 
-static struct io_failure_record *get_state_failrec(struct extent_io_tree *tree,
-                                                  u64 start)
+static struct io_failure_record *get_failrec(struct btrfs_inode *inode, u64 start)
 {
        struct rb_node *node;
-       struct extent_state *state;
-       struct io_failure_record *failrec;
+       struct io_failure_record *failrec = ERR_PTR(-ENOENT);
 
-       spin_lock(&tree->lock);
-       /*
-        * this search will find all the extents that end after
-        * our range starts.
-        */
-       node = tree_search(tree, start);
-       if (!node) {
-               failrec = ERR_PTR(-ENOENT);
-               goto out;
-       }
-       state = rb_entry(node, struct extent_state, rb_node);
-       if (state->start != start) {
-               failrec = ERR_PTR(-ENOENT);
-               goto out;
-       }
-
-       failrec = state->failrec;
-out:
-       spin_unlock(&tree->lock);
+       spin_lock(&inode->io_failure_lock);
+       node = rb_simple_search(&inode->io_failure_tree, start);
+       if (node)
+               failrec = rb_entry(node, struct io_failure_record, rb_node);
+       spin_unlock(&inode->io_failure_lock);
        return failrec;
 }
 
@@ -2276,28 +2240,20 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
        return bitset;
 }
 
-static int free_io_failure(struct extent_io_tree *failure_tree,
-                          struct extent_io_tree *io_tree,
+static int free_io_failure(struct btrfs_inode *inode,
                           struct io_failure_record *rec)
 {
        int ret;
-       int err = 0;
 
-       set_state_failrec(failure_tree, rec->start, NULL);
-       ret = clear_extent_bits(failure_tree, rec->start,
-                               rec->start + rec->len - 1,
-                               EXTENT_LOCKED | EXTENT_DIRTY);
-       if (ret)
-               err = ret;
+       spin_lock(&inode->io_failure_lock);
+       rb_erase(&rec->rb_node, &inode->io_failure_tree);
+       spin_unlock(&inode->io_failure_lock);
 
-       ret = clear_extent_bits(io_tree, rec->start,
-                               rec->start + rec->len - 1,
+       ret = clear_extent_bits(&inode->io_tree, rec->bytenr,
+                               rec->bytenr + rec->len - 1,
                                EXTENT_DAMAGED);
-       if (ret && !err)
-               err = ret;
-
        kfree(rec);
-       return err;
+       return ret;
 }
 
 /*
@@ -2436,22 +2392,13 @@ int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
                           struct page *page, unsigned int pg_offset)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
-       struct extent_io_tree *failure_tree = &inode->io_failure_tree;
        struct extent_io_tree *io_tree = &inode->io_tree;
        u64 ino = btrfs_ino(inode);
-       u64 private;
        struct io_failure_record *failrec;
        struct extent_state *state;
        int mirror;
-       int ret;
-
-       private = 0;
-       ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
-                              EXTENT_DIRTY, 0);
-       if (!ret)
-               return 0;
 
-       failrec = get_state_failrec(failure_tree, start);
+       failrec = get_failrec(inode, start);
        if (IS_ERR(failrec))
                return 0;
 
@@ -2462,12 +2409,12 @@ int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
 
        spin_lock(&io_tree->lock);
        state = find_first_extent_bit_state(io_tree,
-                                           failrec->start,
+                                           failrec->bytenr,
                                            EXTENT_LOCKED);
        spin_unlock(&io_tree->lock);
 
-       if (!state || state->start > failrec->start ||
-           state->end < failrec->start + failrec->len - 1)
+       if (!state || state->start > failrec->bytenr ||
+           state->end < failrec->bytenr + failrec->len - 1)
                goto out;
 
        mirror = failrec->this_mirror;
@@ -2478,7 +2425,7 @@ int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
        } while (mirror != failrec->failed_mirror);
 
 out:
-       free_io_failure(failure_tree, io_tree, failrec);
+       free_io_failure(inode, failrec);
        return 0;
 }
 
@@ -2490,30 +2437,26 @@ out:
  */
 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
 {
-       struct extent_io_tree *failure_tree = &inode->io_failure_tree;
        struct io_failure_record *failrec;
-       struct extent_state *state, *next;
+       struct rb_node *node, *next;
 
-       if (RB_EMPTY_ROOT(&failure_tree->state))
+       if (RB_EMPTY_ROOT(&inode->io_failure_tree))
                return;
 
-       spin_lock(&failure_tree->lock);
-       state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
-       while (state) {
-               if (state->start > end)
+       spin_lock(&inode->io_failure_lock);
+       node = rb_simple_search_first(&inode->io_failure_tree, start);
+       while (node) {
+               failrec = rb_entry(node, struct io_failure_record, rb_node);
+               if (failrec->bytenr > end)
                        break;
 
-               ASSERT(state->end <= end);
-
-               next = next_state(state);
-
-               failrec = state->failrec;
-               free_extent_state(state);
+               next = rb_next(node);
+               rb_erase(&failrec->rb_node, &inode->io_failure_tree);
                kfree(failrec);
 
-               state = next;
+               node = next;
        }
-       spin_unlock(&failure_tree->lock);
+       spin_unlock(&inode->io_failure_lock);
 }
 
 static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
@@ -2523,16 +2466,15 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 start = bbio->file_offset + bio_offset;
        struct io_failure_record *failrec;
-       struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
        const u32 sectorsize = fs_info->sectorsize;
        int ret;
 
-       failrec = get_state_failrec(failure_tree, start);
+       failrec = get_failrec(BTRFS_I(inode), start);
        if (!IS_ERR(failrec)) {
                btrfs_debug(fs_info,
        "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
-                       failrec->logical, failrec->start, failrec->len);
+                       failrec->logical, failrec->bytenr, failrec->len);
                /*
                 * when data can be on disk more than twice, add to failrec here
                 * (e.g. with a list for failed_mirror) to make
@@ -2547,7 +2489,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
        if (!failrec)
                return ERR_PTR(-ENOMEM);
 
-       failrec->start = start;
+       RB_CLEAR_NODE(&failrec->rb_node);
+       failrec->bytenr = start;
        failrec->len = sectorsize;
        failrec->failed_mirror = bbio->mirror_num;
        failrec->this_mirror = bbio->mirror_num;
@@ -2572,17 +2515,17 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode
        }
 
        /* Set the bits in the private failure tree */
-       ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
-                             EXTENT_LOCKED | EXTENT_DIRTY);
-       if (ret >= 0) {
-               ret = set_state_failrec(failure_tree, start, failrec);
-               /* Set the bits in the inode's tree */
-               ret = set_extent_bits(tree, start, start + sectorsize - 1,
-                                     EXTENT_DAMAGED);
-       } else if (ret < 0) {
+       ret = insert_failrec(BTRFS_I(inode), failrec);
+       if (ret) {
                kfree(failrec);
                return ERR_PTR(ret);
        }
+       ret = set_extent_bits(tree, start, start + sectorsize - 1,
+                             EXTENT_DAMAGED);
+       if (ret) {
+               free_io_failure(BTRFS_I(inode), failrec);
+               return ERR_PTR(ret);
+       }
 
        return failrec;
 }
@@ -2594,8 +2537,6 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
        u64 start = failed_bbio->file_offset + bio_offset;
        struct io_failure_record *failrec;
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
-       struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
        struct bio *failed_bio = &failed_bbio->bio;
        const int icsum = bio_offset >> fs_info->sectorsize_bits;
        struct bio *repair_bio;
@@ -2624,7 +2565,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
                btrfs_debug(fs_info,
                        "failed to repair num_copies %d this_mirror %d failed_mirror %d",
                        failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
-               free_io_failure(failure_tree, tree, failrec);
+               free_io_failure(BTRFS_I(inode), failrec);
                return -EIO;
        }
 
index 69a86ae6fd5087a4ec2c70dc54976604f6296efb..1c47176690736c1a533f324f0a1f213ef6846c36 100644 (file)
@@ -254,8 +254,12 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
  * bio end_io callback is called to indicate things have failed.
  */
 struct io_failure_record {
+       /* Use rb_simple_node for search/insert */
+       struct {
+               struct rb_node rb_node;
+               u64 bytenr;
+       };
        struct page *page;
-       u64 start;
        u64 len;
        u64 logical;
        int this_mirror;
index 548665299e57dc26ddcde75a125940f841dc5a75..8def3a67adb797fcec2705150d4a0cedeece6a64 100644 (file)
@@ -8790,6 +8790,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->last_log_commit = 0;
 
        spin_lock_init(&ei->lock);
+       spin_lock_init(&ei->io_failure_lock);
        ei->outstanding_extents = 0;
        if (sb->s_magic != BTRFS_TEST_MAGIC)
                btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
@@ -8806,12 +8807,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        inode = &ei->vfs_inode;
        extent_map_tree_init(&ei->extent_tree);
        extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
-       extent_io_tree_init(fs_info, &ei->io_failure_tree,
-                           IO_TREE_INODE_IO_FAILURE, inode);
        extent_io_tree_init(fs_info, &ei->file_extent_tree,
                            IO_TREE_INODE_FILE_EXTENT, inode);
+       ei->io_failure_tree = RB_ROOT;
        ei->io_tree.track_uptodate = true;
-       ei->io_failure_tree.track_uptodate = true;
        atomic_set(&ei->sync_writers, 0);
        mutex_init(&ei->log_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
index 340f995652f2dac699fcaadaa4b2c0c7346327b3..f9850edfd726793344f2bafb557338858de0ba75 100644 (file)
@@ -88,6 +88,41 @@ static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr)
        return NULL;
 }
 
+/*
+ * Search @root for the first entry that starts at or after @bytenr.
+ *
+ * @root:      the root to search.
+ * @bytenr:    bytenr to search from.
+ *
+ * Return the rb_node that starts at or after @bytenr.  If there is no entry
+ * at or after @bytenr, return NULL.
+ */
+static inline struct rb_node *rb_simple_search_first(struct rb_root *root,
+                                                    u64 bytenr)
+{
+       struct rb_node *node = root->rb_node, *ret = NULL;
+       struct rb_simple_node *entry, *ret_entry = NULL;
+
+       while (node) {
+               entry = rb_entry(node, struct rb_simple_node, rb_node);
+
+               if (bytenr < entry->bytenr) {
+                       if (!ret || entry->bytenr < ret_entry->bytenr) {
+                               ret = node;
+                               ret_entry = entry;
+                       }
+
+                       node = node->rb_left;
+               } else if (bytenr > entry->bytenr) {
+                       node = node->rb_right;
+               } else {
+                       return node;
+               }
+       }
+
+       return ret;
+}
+
 static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
                                               struct rb_node *node)
 {
index 73df80d462dc83ef760270988d71553931418da2..4db905311d679e2ff4ad27b9034e334a095ab681 100644 (file)
@@ -84,7 +84,6 @@ struct raid56_bio_trace_info;
        EM( IO_TREE_FS_EXCLUDED_EXTENTS,  "EXCLUDED_EXTENTS")       \
        EM( IO_TREE_BTREE_INODE_IO,       "BTREE_INODE_IO")         \
        EM( IO_TREE_INODE_IO,             "INODE_IO")               \
-       EM( IO_TREE_INODE_IO_FAILURE,     "INODE_IO_FAILURE")       \
        EM( IO_TREE_RELOC_BLOCKS,         "RELOC_BLOCKS")           \
        EM( IO_TREE_TRANS_DIRTY_PAGES,    "TRANS_DIRTY_PAGES")      \
        EM( IO_TREE_ROOT_DIRTY_LOG_PAGES, "ROOT_DIRTY_LOG_PAGES")   \