btrfs: convert the buffer_radix to an xarray
author     Josef Bacik <josef@toxicpanda.com>
           Mon, 28 Apr 2025 14:52:55 +0000 (10:52 -0400)
committer  David Sterba <dsterba@suse.com>
           Thu, 15 May 2025 12:30:50 +0000 (14:30 +0200)
In order to fully utilize xarray tagging to improve writeback we need to
convert the buffer_radix to a proper xarray.  The conversion is
relatively straightforward, as the radix tree code already uses an
xarray underneath.  Using the xarray directly lets us delete quite a lot
of code.
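
The bulk of the patch boils down to two patterns: lookups move from
radix_tree_lookup() under RCU to xa_load(), and inserts collapse
radix_tree_preload() + buffer_lock + radix_tree_insert() into a single
__xa_cmpxchg() under xa_lock_irq().  Below is a minimal stand-alone sketch
of both patterns; the demo_* names are illustrative only and are not part
of the patch (the real code is in find_extent_buffer_nolock() and
alloc_extent_buffer()).

/*
 * Minimal sketch of the lookup/insert conversion pattern used in this patch.
 * The demo_* names are illustrative placeholders, not upstream code.
 */
#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>
#include <linux/gfp.h>
#include <linux/types.h>

struct demo_eb {
	u64 start;
	atomic_t refs;
};

struct demo_fs {
	struct xarray buffer_tree;	/* replaces buffer_radix + buffer_lock */
	u32 sectorsize_bits;
};

static void demo_init(struct demo_fs *fs)
{
	/* Same flags as mapping->i_pages; marks are updated in IRQ context. */
	xa_init_flags(&fs->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
}

/* Lookup: xa_load() under RCU replaces radix_tree_lookup(). */
static struct demo_eb *demo_lookup(struct demo_fs *fs, u64 start)
{
	unsigned long index = start >> fs->sectorsize_bits;
	struct demo_eb *eb;

	rcu_read_lock();
	eb = xa_load(&fs->buffer_tree, index);
	if (eb && !atomic_inc_not_zero(&eb->refs))
		eb = NULL;	/* being freed, treat as not found */
	rcu_read_unlock();
	return eb;
}

/*
 * Insert: a single __xa_cmpxchg() under xa_lock_irq() replaces
 * radix_tree_preload() + spin_lock() + radix_tree_insert().  The return
 * value (NULL, an existing entry, or an xa_err()) replaces the old
 * -EEXIST handling.
 */
static void *demo_insert(struct demo_fs *fs, struct demo_eb *eb)
{
	unsigned long index = eb->start >> fs->sectorsize_bits;
	void *old;

	xa_lock_irq(&fs->buffer_tree);
	old = __xa_cmpxchg(&fs->buffer_tree, index, NULL, eb, GFP_NOFS);
	xa_unlock_irq(&fs->buffer_tree);
	return old;
}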

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/fs.h
fs/btrfs/tests/btrfs-tests.c
fs/btrfs/zoned.c

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1cdc1436fe4aae2c1a30e694e9a8dd47eaae25f0..1288b9bac47c46627e45b1bca4d0b0c024d92c8d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2761,10 +2761,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
        return ret;
 }
 
+/*
+ * Lockdep gets confused between our buffer_tree which requires IRQ locking because
+ * we modify marks in the IRQ context, and our delayed inode xarray which doesn't
+ * have these requirements. Use a class key so lockdep doesn't get them mixed up.
+ */
+static struct lock_class_key buffer_xa_class;
+
 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 {
        INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
-       INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+
+       /* Use the same flags as mapping->i_pages. */
+       xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+       lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class);
+
        INIT_LIST_HEAD(&fs_info->trans_list);
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2776,7 +2787,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
        spin_lock_init(&fs_info->delayed_iput_lock);
        spin_lock_init(&fs_info->defrag_inodes_lock);
        spin_lock_init(&fs_info->super_lock);
-       spin_lock_init(&fs_info->buffer_lock);
        spin_lock_init(&fs_info->unused_bgs_lock);
        spin_lock_init(&fs_info->treelog_bg_lock);
        spin_lock_init(&fs_info->zone_active_bgs_lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dc8db26c821c2dd762f4c85c3835017e240861ae..cf3c0318898a194c0618d013d48818232a500915 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1866,19 +1866,17 @@ static void set_btree_ioerr(struct extent_buffer *eb)
  * context.
  */
 static struct extent_buffer *find_extent_buffer_nolock(
-               const struct btrfs_fs_info *fs_info, u64 start)
+               struct btrfs_fs_info *fs_info, u64 start)
 {
        struct extent_buffer *eb;
+       unsigned long index = (start >> fs_info->sectorsize_bits);
 
        rcu_read_lock();
-       eb = radix_tree_lookup(&fs_info->buffer_radix,
-                              start >> fs_info->sectorsize_bits);
-       if (eb && atomic_inc_not_zero(&eb->refs)) {
-               rcu_read_unlock();
-               return eb;
-       }
+       eb = xa_load(&fs_info->buffer_tree, index);
+       if (eb && !atomic_inc_not_zero(&eb->refs))
+               eb = NULL;
        rcu_read_unlock();
-       return NULL;
+       return eb;
 }
 
 static void end_bbio_meta_write(struct btrfs_bio *bbio)
@@ -2742,11 +2740,10 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
 
        if (!btrfs_meta_is_subpage(fs_info)) {
                /*
-                * We do this since we'll remove the pages after we've
-                * removed the eb from the radix tree, so we could race
-                * and have this page now attached to the new eb.  So
-                * only clear folio if it's still connected to
-                * this eb.
+                * We do this since we'll remove the pages after we've removed
+                * the eb from the xarray, so we could race and have this page
+                * now attached to the new eb.  So only clear folio if it's
+                * still connected to this eb.
                 */
                if (folio_test_private(folio) && folio_get_private(folio) == eb) {
                        BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
@@ -2911,9 +2908,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
        int refs;
        /*
-        * The TREE_REF bit is first set when the extent_buffer is added
-        * to the radix tree. It is also reset, if unset, when a new reference
-        * is created by find_extent_buffer.
+        * The TREE_REF bit is first set when the extent_buffer is added to the
+        * xarray. It is also reset, if unset, when a new reference is created
+        * by find_extent_buffer.
         *
         * It is only cleared in two cases: freeing the last non-tree
         * reference to the extent_buffer when its STALE bit is set or
@@ -2925,13 +2922,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
         * conditions between the calls to check_buffer_tree_ref in those
         * codepaths and clearing TREE_REF in try_release_extent_buffer.
         *
-        * The actual lifetime of the extent_buffer in the radix tree is
-        * adequately protected by the refcount, but the TREE_REF bit and
-        * its corresponding reference are not. To protect against this
-        * class of races, we call check_buffer_tree_ref from the codepaths
-        * which trigger io. Note that once io is initiated, TREE_REF can no
-        * longer be cleared, so that is the moment at which any such race is
-        * best fixed.
+        * The actual lifetime of the extent_buffer in the xarray is adequately
+        * protected by the refcount, but the TREE_REF bit and its corresponding
+        * reference are not. To protect against this class of races, we call
+        * check_buffer_tree_ref() from the code paths which trigger io. Note that
+        * once io is initiated, TREE_REF can no longer be cleared, so that is
+        * the moment at which any such race is best fixed.
         */
        refs = atomic_read(&eb->refs);
        if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -2995,23 +2991,25 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
                return ERR_PTR(-ENOMEM);
        eb->fs_info = fs_info;
 again:
-       ret = radix_tree_preload(GFP_NOFS);
-       if (ret) {
-               exists = ERR_PTR(ret);
-               goto free_eb;
+       xa_lock_irq(&fs_info->buffer_tree);
+       exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits,
+                             NULL, eb, GFP_NOFS);
+       if (xa_is_err(exists)) {
+               ret = xa_err(exists);
+               xa_unlock_irq(&fs_info->buffer_tree);
+               btrfs_release_extent_buffer(eb);
+               return ERR_PTR(ret);
        }
-       spin_lock(&fs_info->buffer_lock);
-       ret = radix_tree_insert(&fs_info->buffer_radix,
-                               start >> fs_info->sectorsize_bits, eb);
-       spin_unlock(&fs_info->buffer_lock);
-       radix_tree_preload_end();
-       if (ret == -EEXIST) {
-               exists = find_extent_buffer(fs_info, start);
-               if (exists)
-                       goto free_eb;
-               else
+       if (exists) {
+               if (!atomic_inc_not_zero(&exists->refs)) {
+                       /* The extent buffer is being freed, retry. */
+                       xa_unlock_irq(&fs_info->buffer_tree);
                        goto again;
+               }
+               xa_unlock_irq(&fs_info->buffer_tree);
+               goto free_eb;
        }
+       xa_unlock_irq(&fs_info->buffer_tree);
        check_buffer_tree_ref(eb);
 
        return eb;
@@ -3032,9 +3030,9 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
        lockdep_assert_held(&folio->mapping->i_private_lock);
 
        /*
-        * For subpage case, we completely rely on radix tree to ensure we
-        * don't try to insert two ebs for the same bytenr.  So here we always
-        * return NULL and just continue.
+        * For subpage case, we completely rely on xarray to ensure we don't try
+        * to insert two ebs for the same bytenr.  So here we always return NULL
+        * and just continue.
         */
        if (btrfs_meta_is_subpage(fs_info))
                return NULL;
@@ -3165,7 +3163,7 @@ finish:
        /*
         * To inform we have an extra eb under allocation, so that
         * detach_extent_buffer_page() won't release the folio private when the
-        * eb hasn't been inserted into radix tree yet.
+        * eb hasn't been inserted into the xarray yet.
         *
         * The ref will be decreased when the eb releases the page, in
         * detach_extent_buffer_page().  Thus needs no special handling in the
@@ -3299,10 +3297,9 @@ reallocate:
 
                /*
                 * We can't unlock the pages just yet since the extent buffer
-                * hasn't been properly inserted in the radix tree, this
-                * opens a race with btree_release_folio which can free a page
-                * while we are still filling in all pages for the buffer and
-                * we could crash.
+                * hasn't been properly inserted into the xarray, this opens a
+                * race with btree_release_folio() which can free a page while we
+                * are still filling in all pages for the buffer and we could crash.
                 */
        }
        if (uptodate)
@@ -3311,23 +3308,25 @@ reallocate:
        if (page_contig)
                eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
 again:
-       ret = radix_tree_preload(GFP_NOFS);
-       if (ret)
+       xa_lock_irq(&fs_info->buffer_tree);
+       existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
+                                  start >> fs_info->sectorsize_bits, NULL, eb,
+                                  GFP_NOFS);
+       if (xa_is_err(existing_eb)) {
+               ret = xa_err(existing_eb);
+               xa_unlock_irq(&fs_info->buffer_tree);
                goto out;
-
-       spin_lock(&fs_info->buffer_lock);
-       ret = radix_tree_insert(&fs_info->buffer_radix,
-                               start >> fs_info->sectorsize_bits, eb);
-       spin_unlock(&fs_info->buffer_lock);
-       radix_tree_preload_end();
-       if (ret == -EEXIST) {
-               ret = 0;
-               existing_eb = find_extent_buffer(fs_info, start);
-               if (existing_eb)
-                       goto out;
-               else
+       }
+       if (existing_eb) {
+               if (!atomic_inc_not_zero(&existing_eb->refs)) {
+                       xa_unlock_irq(&fs_info->buffer_tree);
                        goto again;
+               }
+               xa_unlock_irq(&fs_info->buffer_tree);
+               goto out;
        }
+       xa_unlock_irq(&fs_info->buffer_tree);
+
        /* add one reference for the tree */
        check_buffer_tree_ref(eb);
 
@@ -3397,10 +3396,23 @@ static int release_extent_buffer(struct extent_buffer *eb)
 
                spin_unlock(&eb->refs_lock);
 
-               spin_lock(&fs_info->buffer_lock);
-               radix_tree_delete_item(&fs_info->buffer_radix,
-                                      eb->start >> fs_info->sectorsize_bits, eb);
-               spin_unlock(&fs_info->buffer_lock);
+               /*
+                * We're erasing, theoretically there will be no allocations, so
+                * just use GFP_ATOMIC.
+                *
+                * We use cmpxchg instead of erase because we do not know if
+                * this eb is actually in the tree or not, we could be cleaning
+                * up an eb that we allocated but never inserted into the tree.
+                * Thus use cmpxchg to remove it from the tree if it is there,
+                * or leave the other entry if this isn't in the tree.
+                *
+                * The documentation says that putting a NULL value is the same
+                * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't
+                * in this case.
+                */
+               xa_cmpxchg_irq(&fs_info->buffer_tree,
+                              eb->start >> fs_info->sectorsize_bits, eb, NULL,
+                              GFP_ATOMIC);
 
                btrfs_leak_debug_del_eb(eb);
                /* Should be safe to release folios at this point. */
@@ -4231,71 +4243,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
        }
 }
 
-#define GANG_LOOKUP_SIZE       16
-static struct extent_buffer *get_next_extent_buffer(
-               const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr)
-{
-       struct extent_buffer *gang[GANG_LOOKUP_SIZE];
-       struct extent_buffer *found = NULL;
-       u64 folio_start = folio_pos(folio);
-       u64 cur = folio_start;
-
-       ASSERT(in_range(bytenr, folio_start, PAGE_SIZE));
-       lockdep_assert_held(&fs_info->buffer_lock);
-
-       while (cur < folio_start + PAGE_SIZE) {
-               int ret;
-               int i;
-
-               ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
-                               (void **)gang, cur >> fs_info->sectorsize_bits,
-                               min_t(unsigned int, GANG_LOOKUP_SIZE,
-                                     PAGE_SIZE / fs_info->nodesize));
-               if (ret == 0)
-                       goto out;
-               for (i = 0; i < ret; i++) {
-                       /* Already beyond page end */
-                       if (gang[i]->start >= folio_start + PAGE_SIZE)
-                               goto out;
-                       /* Found one */
-                       if (gang[i]->start >= bytenr) {
-                               found = gang[i];
-                               goto out;
-                       }
-               }
-               cur = gang[ret - 1]->start + gang[ret - 1]->len;
-       }
-out:
-       return found;
-}
-
 static int try_release_subpage_extent_buffer(struct folio *folio)
 {
        struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
-       u64 cur = folio_pos(folio);
-       const u64 end = cur + PAGE_SIZE;
+       struct extent_buffer *eb;
+       unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits);
+       unsigned long index = start;
+       unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1;
        int ret;
 
-       while (cur < end) {
-               struct extent_buffer *eb = NULL;
-
-               /*
-                * Unlike try_release_extent_buffer() which uses folio private
-                * to grab buffer, for subpage case we rely on radix tree, thus
-                * we need to ensure radix tree consistency.
-                *
-                * We also want an atomic snapshot of the radix tree, thus go
-                * with spinlock rather than RCU.
-                */
-               spin_lock(&fs_info->buffer_lock);
-               eb = get_next_extent_buffer(fs_info, folio, cur);
-               if (!eb) {
-                       /* No more eb in the page range after or at cur */
-                       spin_unlock(&fs_info->buffer_lock);
-                       break;
-               }
-               cur = eb->start + eb->len;
-
+       xa_lock_irq(&fs_info->buffer_tree);
+       xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
                /*
                 * The same as try_release_extent_buffer(), to ensure the eb
                 * won't disappear out from under us.
@@ -4303,10 +4261,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
                spin_lock(&eb->refs_lock);
                if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
                        spin_unlock(&eb->refs_lock);
-                       spin_unlock(&fs_info->buffer_lock);
-                       break;
+                       continue;
                }
-               spin_unlock(&fs_info->buffer_lock);
+               xa_unlock_irq(&fs_info->buffer_tree);
 
                /*
                 * If tree ref isn't set then we know the ref on this eb is a
@@ -4324,7 +4281,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
                 * release_extent_buffer() will release the refs_lock.
                 */
                release_extent_buffer(eb);
+               xa_lock_irq(&fs_info->buffer_tree);
        }
+       xa_unlock_irq(&fs_info->buffer_tree);
+
        /*
         * Finally to check if we have cleared folio private, as if we have
         * released all ebs in the page, the folio private should be cleared now.
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 7baa2ed45198f92c43c56b17e0909538926dac69..007b7c9b39091903022501c4d964ab61130f3cad 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -777,10 +777,8 @@ struct btrfs_fs_info {
 
        struct btrfs_delayed_root *delayed_root;
 
-       /* Extent buffer radix tree */
-       spinlock_t buffer_lock;
        /* Entries are eb->start / sectorsize */
-       struct radix_tree_root buffer_radix;
+       struct xarray buffer_tree;
 
        /* Next backup root to be overwritten */
        int backup_root_index;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 02a915eb51fb316b4a34d5dbc73da88f0f7914ea..b576897d71cc42561e909a21500fc0b2de663395 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -157,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
 
 void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 {
-       struct radix_tree_iter iter;
-       void **slot;
        struct btrfs_device *dev, *tmp;
+       struct extent_buffer *eb;
+       unsigned long index;
 
        if (!fs_info)
                return;
@@ -169,25 +169,13 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
 
        test_mnt->mnt_sb->s_fs_info = NULL;
 
-       spin_lock(&fs_info->buffer_lock);
-       radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
-               struct extent_buffer *eb;
-
-               eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
-               if (!eb)
-                       continue;
-               /* Shouldn't happen but that kind of thinking creates CVE's */
-               if (radix_tree_exception(eb)) {
-                       if (radix_tree_deref_retry(eb))
-                               slot = radix_tree_iter_retry(&iter);
-                       continue;
-               }
-               slot = radix_tree_iter_resume(slot, &iter);
-               spin_unlock(&fs_info->buffer_lock);
-               free_extent_buffer_stale(eb);
-               spin_lock(&fs_info->buffer_lock);
+       xa_lock_irq(&fs_info->buffer_tree);
+       xa_for_each(&fs_info->buffer_tree, index, eb) {
+               xa_unlock_irq(&fs_info->buffer_tree);
+               free_extent_buffer_stale(eb);
+               xa_lock_irq(&fs_info->buffer_tree);
        }
-       spin_unlock(&fs_info->buffer_lock);
+       xa_unlock_irq(&fs_info->buffer_tree);
 
        btrfs_mapping_tree_free(fs_info);
        list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 7fc2f73dc8d8a6260b793bb99ad93a7df86b08b2..b5b0156d5b95a8c8e166e85bf2d51da437d9ce2a 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2171,27 +2171,15 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
 {
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        const u64 end = block_group->start + block_group->length;
-       struct radix_tree_iter iter;
        struct extent_buffer *eb;
-       void __rcu **slot;
+       unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits);
 
        rcu_read_lock();
-       radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
-                                block_group->start >> fs_info->sectorsize_bits) {
-               eb = radix_tree_deref_slot(slot);
-               if (!eb)
-                       continue;
-               if (radix_tree_deref_retry(eb)) {
-                       slot = radix_tree_iter_retry(&iter);
-                       continue;
-               }
-
+       xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
                if (eb->start < block_group->start)
                        continue;
                if (eb->start >= end)
                        break;
-
-               slot = radix_tree_iter_resume(slot, &iter);
                rcu_read_unlock();
                wait_on_extent_buffer_writeback(eb);
                rcu_read_lock();
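
The remaining conversions above are iteration and removal, as in
try_release_subpage_extent_buffer(), btrfs_free_dummy_fs_info() and
release_extent_buffer().  The sketch below shows both patterns; it reuses
the demo_fs/demo_eb types from the sketch in the commit message, and
demo_release_one() is a hypothetical placeholder, not upstream code.

/* Placeholder for per-eb work that may sleep or re-enter the xarray. */
static void demo_release_one(struct demo_eb *eb)
{
}

static void demo_release_range(struct demo_fs *fs, unsigned long start,
			       unsigned long end)
{
	struct demo_eb *eb;
	unsigned long index;

	xa_lock_irq(&fs->buffer_tree);
	xa_for_each_range(&fs->buffer_tree, index, eb, start, end) {
		/*
		 * Drop the lock around the real work; xa_for_each_range()
		 * continues from the saved 'index' once the lock is re-taken,
		 * so entries changing in the meantime are tolerated.
		 */
		xa_unlock_irq(&fs->buffer_tree);
		demo_release_one(eb);
		xa_lock_irq(&fs->buffer_tree);
	}
	xa_unlock_irq(&fs->buffer_tree);
}

/*
 * Removal: only erase the slot if it still points at this eb, so an eb that
 * was allocated but never inserted leaves the tree untouched.  Storing NULL
 * is equivalent to erasing because XA_FLAGS_ALLOC is not set.
 */
static void demo_remove(struct demo_fs *fs, struct demo_eb *eb)
{
	xa_cmpxchg_irq(&fs->buffer_tree, eb->start >> fs->sectorsize_bits,
		       eb, NULL, GFP_ATOMIC);
}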