From 19d7f65f032f898df8c164d378be502901772eb7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 28 Apr 2025 10:52:55 -0400 Subject: [PATCH] btrfs: convert the buffer_radix to an xarray In order to fully utilize xarray tagging to improve writeback we need to convert the buffer_radix to a proper xarray. This conversion is relatively straightforward as the radix code uses the xarray underneath. Using xarray directly allows for quite a lot less code. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 ++- fs/btrfs/extent_io.c | 212 ++++++++++++++--------------------- fs/btrfs/fs.h | 4 +- fs/btrfs/tests/btrfs-tests.c | 28 ++--- fs/btrfs/zoned.c | 16 +-- 5 files changed, 109 insertions(+), 165 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1cdc1436fe4a..1288b9bac47c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2761,10 +2761,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } +/* + * Lockdep gets confused between our buffer_tree which requires IRQ locking because + * we modify marks in the IRQ context, and our delayed inode xarray which doesn't + * have these requirements. Use a class key so lockdep doesn't get them mixed up. + */ +static struct lock_class_key buffer_xa_class; + void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + + /* Use the same flags as mapping->i_pages. */ + xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); + lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class); + INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -2776,7 +2787,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); - spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); spin_lock_init(&fs_info->zone_active_bgs_lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dc8db26c821c..cf3c0318898a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1866,19 +1866,17 @@ static void set_btree_ioerr(struct extent_buffer *eb) * context. */ static struct extent_buffer *find_extent_buffer_nolock( - const struct btrfs_fs_info *fs_info, u64 start) + struct btrfs_fs_info *fs_info, u64 start) { struct extent_buffer *eb; + unsigned long index = (start >> fs_info->sectorsize_bits); rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - return eb; - } + eb = xa_load(&fs_info->buffer_tree, index); + if (eb && !atomic_inc_not_zero(&eb->refs)) + eb = NULL; rcu_read_unlock(); - return NULL; + return eb; } static void end_bbio_meta_write(struct btrfs_bio *bbio) @@ -2742,11 +2740,10 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo if (!btrfs_meta_is_subpage(fs_info)) { /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear folio if it's still connected to - * this eb. + * We do this since we'll remove the pages after we've removed + * the eb from the xarray, so we could race and have this page + * now attached to the new eb. So only clear folio if it's + * still connected to this eb. */ if (folio_test_private(folio) && folio_get_private(folio) == eb) { BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); @@ -2911,9 +2908,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; /* - * The TREE_REF bit is first set when the extent_buffer is added - * to the radix tree. It is also reset, if unset, when a new reference - * is created by find_extent_buffer. + * The TREE_REF bit is first set when the extent_buffer is added to the + * xarray. It is also reset, if unset, when a new reference is created + * by find_extent_buffer. * * It is only cleared in two cases: freeing the last non-tree * reference to the extent_buffer when its STALE bit is set or @@ -2925,13 +2922,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * conditions between the calls to check_buffer_tree_ref in those * codepaths and clearing TREE_REF in try_release_extent_buffer. * - * The actual lifetime of the extent_buffer in the radix tree is - * adequately protected by the refcount, but the TREE_REF bit and - * its corresponding reference are not. To protect against this - * class of races, we call check_buffer_tree_ref from the codepaths - * which trigger io. Note that once io is initiated, TREE_REF can no - * longer be cleared, so that is the moment at which any such race is - * best fixed. + * The actual lifetime of the extent_buffer in the xarray is adequately + * protected by the refcount, but the TREE_REF bit and its corresponding + * reference are not. To protect against this class of races, we call + * check_buffer_tree_ref() from the code paths which trigger io. Note that + * once io is initiated, TREE_REF can no longer be cleared, so that is + * the moment at which any such race is best fixed. */ refs = atomic_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -2995,23 +2991,25 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; + xa_lock_irq(&fs_info->buffer_tree); + exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits, + NULL, eb, GFP_NOFS); + if (xa_is_err(exists)) { + ret = xa_err(exists); + xa_unlock_irq(&fs_info->buffer_tree); + btrfs_release_extent_buffer(eb); + return ERR_PTR(ret); } - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - else + if (exists) { + if (!atomic_inc_not_zero(&exists->refs)) { + /* The extent buffer is being freed, retry. */ + xa_unlock_irq(&fs_info->buffer_tree); goto again; + } + xa_unlock_irq(&fs_info->buffer_tree); + goto free_eb; } + xa_unlock_irq(&fs_info->buffer_tree); check_buffer_tree_ref(eb); return eb; @@ -3032,9 +3030,9 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, lockdep_assert_held(&folio->mapping->i_private_lock); /* - * For subpage case, we completely rely on radix tree to ensure we - * don't try to insert two ebs for the same bytenr. So here we always - * return NULL and just continue. + * For subpage case, we completely rely on xarray to ensure we don't try + * to insert two ebs for the same bytenr. So here we always return NULL + * and just continue. */ if (btrfs_meta_is_subpage(fs_info)) return NULL; @@ -3165,7 +3163,7 @@ finish: /* * To inform we have an extra eb under allocation, so that * detach_extent_buffer_page() won't release the folio private when the - * eb hasn't been inserted into radix tree yet. + * eb hasn't been inserted into the xarray yet. * * The ref will be decreased when the eb releases the page, in * detach_extent_buffer_page(). Thus needs no special handling in the @@ -3299,10 +3297,9 @@ reallocate: /* * We can't unlock the pages just yet since the extent buffer - * hasn't been properly inserted in the radix tree, this - * opens a race with btree_release_folio which can free a page - * while we are still filling in all pages for the buffer and - * we could crash. + * hasn't been properly inserted into the xarray, this opens a + * race with btree_release_folio() which can free a page while we + * are still filling in all pages for the buffer and we could crash. */ } if (uptodate) @@ -3311,23 +3308,25 @@ reallocate: if (page_contig) eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) + xa_lock_irq(&fs_info->buffer_tree); + existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, + start >> fs_info->sectorsize_bits, NULL, eb, + GFP_NOFS); + if (xa_is_err(existing_eb)) { + ret = xa_err(existing_eb); + xa_unlock_irq(&fs_info->buffer_tree); goto out; - - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - ret = 0; - existing_eb = find_extent_buffer(fs_info, start); - if (existing_eb) - goto out; - else + } + if (existing_eb) { + if (!atomic_inc_not_zero(&existing_eb->refs)) { + xa_unlock_irq(&fs_info->buffer_tree); goto again; + } + xa_unlock_irq(&fs_info->buffer_tree); + goto out; } + xa_unlock_irq(&fs_info->buffer_tree); + /* add one reference for the tree */ check_buffer_tree_ref(eb); @@ -3397,10 +3396,23 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); - spin_lock(&fs_info->buffer_lock); - radix_tree_delete_item(&fs_info->buffer_radix, - eb->start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); + /* + * We're erasing, theoretically there will be no allocations, so + * just use GFP_ATOMIC. + * + * We use cmpxchg instead of erase because we do not know if + * this eb is actually in the tree or not, we could be cleaning + * up an eb that we allocated but never inserted into the tree. + * Thus use cmpxchg to remove it from the tree if it is there, + * or leave the other entry if this isn't in the tree. + * + * The documentation says that putting a NULL value is the same + * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't + * in this case. + */ + xa_cmpxchg_irq(&fs_info->buffer_tree, + eb->start >> fs_info->sectorsize_bits, eb, NULL, + GFP_ATOMIC); btrfs_leak_debug_del_eb(eb); /* Should be safe to release folios at this point. */ @@ -4231,71 +4243,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } -#define GANG_LOOKUP_SIZE 16 -static struct extent_buffer *get_next_extent_buffer( - const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) -{ - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; - struct extent_buffer *found = NULL; - u64 folio_start = folio_pos(folio); - u64 cur = folio_start; - - ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); - lockdep_assert_held(&fs_info->buffer_lock); - - while (cur < folio_start + PAGE_SIZE) { - int ret; - int i; - - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, - (void **)gang, cur >> fs_info->sectorsize_bits, - min_t(unsigned int, GANG_LOOKUP_SIZE, - PAGE_SIZE / fs_info->nodesize)); - if (ret == 0) - goto out; - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= folio_start + PAGE_SIZE) - goto out; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - goto out; - } - } - cur = gang[ret - 1]->start + gang[ret - 1]->len; - } -out: - return found; -} - static int try_release_subpage_extent_buffer(struct folio *folio) { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - u64 cur = folio_pos(folio); - const u64 end = cur + PAGE_SIZE; + struct extent_buffer *eb; + unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits); + unsigned long index = start; + unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1; int ret; - while (cur < end) { - struct extent_buffer *eb = NULL; - - /* - * Unlike try_release_extent_buffer() which uses folio private - * to grab buffer, for subpage case we rely on radix tree, thus - * we need to ensure radix tree consistency. - * - * We also want an atomic snapshot of the radix tree, thus go - * with spinlock rather than RCU. - */ - spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, folio, cur); - if (!eb) { - /* No more eb in the page range after or at cur */ - spin_unlock(&fs_info->buffer_lock); - break; - } - cur = eb->start + eb->len; - + xa_lock_irq(&fs_info->buffer_tree); + xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { /* * The same as try_release_extent_buffer(), to ensure the eb * won't disappear out from under us. @@ -4303,10 +4261,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&fs_info->buffer_lock); - break; + continue; } - spin_unlock(&fs_info->buffer_lock); + xa_unlock_irq(&fs_info->buffer_tree); /* * If tree ref isn't set then we know the ref on this eb is a @@ -4324,7 +4281,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * release_extent_buffer() will release the refs_lock. */ release_extent_buffer(eb); + xa_lock_irq(&fs_info->buffer_tree); } + xa_unlock_irq(&fs_info->buffer_tree); + /* * Finally to check if we have cleared folio private, as if we have * released all ebs in the page, the folio private should be cleared now. diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 7baa2ed45198..007b7c9b3909 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -777,10 +777,8 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer radix tree */ - spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; + struct xarray buffer_tree; /* Next backup root to be overwritten */ int backup_root_index; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 02a915eb51fb..b576897d71cc 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -157,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { - struct radix_tree_iter iter; - void **slot; struct btrfs_device *dev, *tmp; + struct extent_buffer *eb; + unsigned long index; if (!fs_info) return; @@ -169,25 +169,13 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) test_mnt->mnt_sb->s_fs_info = NULL; - spin_lock(&fs_info->buffer_lock); - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { - struct extent_buffer *eb; - - eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); - if (!eb) - continue; - /* Shouldn't happen but that kind of thinking creates CVE's */ - if (radix_tree_exception(eb)) { - if (radix_tree_deref_retry(eb)) - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - spin_unlock(&fs_info->buffer_lock); - free_extent_buffer_stale(eb); - spin_lock(&fs_info->buffer_lock); + xa_lock_irq(&fs_info->buffer_tree); + xa_for_each(&fs_info->buffer_tree, index, eb) { + xa_unlock_irq(&fs_info->buffer_tree); + free_extent_buffer(eb); + xa_lock_irq(&fs_info->buffer_tree); } - spin_unlock(&fs_info->buffer_lock); + xa_unlock_irq(&fs_info->buffer_tree); btrfs_mapping_tree_free(fs_info); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 7fc2f73dc8d8..b5b0156d5b95 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2171,27 +2171,15 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; const u64 end = block_group->start + block_group->length; - struct radix_tree_iter iter; struct extent_buffer *eb; - void __rcu **slot; + unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits); rcu_read_lock(); - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, - block_group->start >> fs_info->sectorsize_bits) { - eb = radix_tree_deref_slot(slot); - if (!eb) - continue; - if (radix_tree_deref_retry(eb)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - + xa_for_each_start(&fs_info->buffer_tree, index, eb, start) { if (eb->start < block_group->start) continue; if (eb->start >= end) break; - - slot = radix_tree_iter_resume(slot, &iter); rcu_read_unlock(); wait_on_extent_buffer_writeback(eb); rcu_read_lock(); -- 2.25.1