btrfs: refactor how we handle reserved space inside copy_one_range()
authorQu Wenruo <wqu@suse.com>
Thu, 27 Mar 2025 02:14:58 +0000 (12:44 +1030)
committerDavid Sterba <dsterba@suse.com>
Thu, 15 May 2025 12:30:40 +0000 (14:30 +0200)
There are several things not ideal in copy_one_range():

- Unnecessary temporary variables
  * block_offset
  * reserve_bytes
  * dirty_blocks
  * num_blocks
  * release_bytes
  These are utilized to handle short-copy cases.

- Inconsistent handling of btrfs_delalloc_release_extents()
  There is a hidden behavior that, after reserving metadata for X bytes
  of data write, we have to call btrfs_delalloc_release_extents() with X
  once and only once.

  Calling btrfs_delalloc_release_extents(X - 4K) and
  btrfs_delalloc_release_extents(4K) will cause outstanding extents
  accounting to go wrong.

  This is because the outstanding extents mechanism is not designed to
  handle shrinking of reserved space.

Improve the above situations by:

- Use a single @reserved_start and @reserved_len pair
  Now we reserve space for the initial range, and if a short copy
  happens and we need to shrink the reserved space, we can easily
  calculate the new length, and update @reserved_len.

- Introduce helpers to shrink reserved data and metadata space
  This is done by two new helpers, shrink_reserved_space() and
  btrfs_delalloc_shrink_extents().

  The latter will do a better calculation if we need to modify the
  outstanding extents, and the former will be utilized inside
  copy_one_range().

- Manually unlock, release reserved space and return if no byte is
  copied

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/delalloc-space.c
fs/btrfs/delalloc-space.h
fs/btrfs/file.c

index 88e900e5a43d436654432fafa4fdb92f65737c0b..1479be2427cb44dc15321c6c4a22f31cabe493ce 100644 (file)
@@ -439,6 +439,29 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
        btrfs_inode_rsv_release(inode, true);
 }
 
+/* Shrink a previously reserved extent to a new length. */
+void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len)
+{
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       const u32 reserved_num_extents = count_max_extents(fs_info, reserved_len);
+       const u32 new_num_extents = count_max_extents(fs_info, new_len);
+       const int diff_num_extents = new_num_extents - reserved_num_extents;
+
+       ASSERT(new_len <= reserved_len);
+       if (new_num_extents == reserved_num_extents)
+               return;
+
+       spin_lock(&inode->lock);
+       btrfs_mod_outstanding_extents(inode, diff_num_extents);
+       btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+       spin_unlock(&inode->lock);
+
+       if (btrfs_is_testing(fs_info))
+               return;
+
+       btrfs_inode_rsv_release(inode, true);
+}
+
 /*
  * Reserve data and metadata space for delalloc
  *
index 3f32953c0a80b2e47bf7118764c62eb784acbfa9..069005959479d6e891a80f36286128338b68641f 100644 (file)
@@ -27,5 +27,6 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
                                    u64 disk_num_bytes, bool noflush);
 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
+void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len);
 
 #endif /* BTRFS_DELALLOC_SPACE_H */
index 023b8abe50a4872964ee954d8fde45f32baed1bc..35e64b231dde5fc522f606eb847a3ae7a9e7bca9 100644 (file)
@@ -1149,6 +1149,23 @@ static ssize_t reserve_space(struct btrfs_inode *inode,
        return reserve_bytes;
 }
 
+/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */
+static void shrink_reserved_space(struct btrfs_inode *inode,
+                                 struct extent_changeset *data_reserved,
+                                 u64 reserved_start, u64 reserved_len,
+                                 u64 new_len, bool only_release_metadata)
+{
+       const u64 diff = reserved_len - new_len;
+
+       ASSERT(new_len <= reserved_len);
+       btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
+       if (only_release_metadata)
+               btrfs_delalloc_release_metadata(inode, diff, true);
+       else
+               btrfs_delalloc_release_space(inode, data_reserved,
+                                            reserved_start + new_len, diff, true);
+}
+
 /*
  * Do the heavy-lifting work to copy one range into one folio of the page cache.
  *
@@ -1162,14 +1179,11 @@ static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct extent_state *cached_state = NULL;
-       const size_t block_offset = (start & (fs_info->sectorsize - 1));
        size_t write_bytes = min(iov_iter_count(iter), PAGE_SIZE - offset_in_page(start));
-       size_t reserve_bytes;
        size_t copied;
-       size_t dirty_blocks;
-       size_t num_blocks;
+       const u64 reserved_start = round_down(start, fs_info->sectorsize);
+       u64 reserved_len;
        struct folio *folio = NULL;
-       u64 release_bytes;
        int extents_locked;
        u64 lockstart;
        u64 lockend;
@@ -1188,23 +1202,25 @@ static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
                            &only_release_metadata);
        if (ret < 0)
                return ret;
-       reserve_bytes = ret;
-       release_bytes = reserve_bytes;
+       reserved_len = ret;
+       /* Write range must be inside the reserved range. */
+       ASSERT(reserved_start <= start);
+       ASSERT(start + write_bytes <= reserved_start + reserved_len);
 
 again:
        ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
                                                    bdp_flags);
        if (ret) {
-               btrfs_delalloc_release_extents(inode, reserve_bytes);
-               release_space(inode, *data_reserved, start, release_bytes,
+               btrfs_delalloc_release_extents(inode, reserved_len);
+               release_space(inode, *data_reserved, reserved_start, reserved_len,
                              only_release_metadata);
                return ret;
        }
 
        ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false);
        if (ret) {
-               btrfs_delalloc_release_extents(inode, reserve_bytes);
-               release_space(inode, *data_reserved, start, release_bytes,
+               btrfs_delalloc_release_extents(inode, reserved_len);
+               release_space(inode, *data_reserved, reserved_start, reserved_len,
                              only_release_metadata);
                return ret;
        }
@@ -1215,8 +1231,8 @@ again:
                if (!nowait && extents_locked == -EAGAIN)
                        goto again;
 
-               btrfs_delalloc_release_extents(inode, reserve_bytes);
-               release_space(inode, *data_reserved, start, release_bytes,
+               btrfs_delalloc_release_extents(inode, reserved_len);
+               release_space(inode, *data_reserved, reserved_start, reserved_len,
                              only_release_metadata);
                ret = extents_locked;
                return ret;
@@ -1226,41 +1242,43 @@ again:
                                             write_bytes, iter);
        flush_dcache_folio(folio);
 
-       /*
-        * If we get a partial write, we can end up with partially uptodate
-        * page. Although if sector size < page size we can handle it, but if
-        * it's not sector aligned it can cause a lot of complexity, so make
-        * sure they don't happen by forcing retry this copy.
-        */
        if (unlikely(copied < write_bytes)) {
+               u64 last_block;
+
+               /*
+                * The original write range doesn't need an uptodate folio as
+                * the range is block aligned. But now a short copy happened.
+                * We cannot handle it without an uptodate folio.
+                *
+                * So just revert the range and we will retry.
+                */
                if (!folio_test_uptodate(folio)) {
                        iov_iter_revert(iter, copied);
                        copied = 0;
                }
-       }
-
-       num_blocks = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
-       dirty_blocks = round_up(copied + block_offset, fs_info->sectorsize);
-       dirty_blocks = BTRFS_BYTES_TO_BLKS(fs_info, dirty_blocks);
 
-       if (copied == 0)
-               dirty_blocks = 0;
+               /* No copied bytes, unlock, release reserved space and exit. */
+               if (copied == 0) {
+                       if (extents_locked)
+                               unlock_extent(&inode->io_tree, lockstart, lockend,
+                                             &cached_state);
+                       else
+                               free_extent_state(cached_state);
+                       btrfs_delalloc_release_extents(inode, reserved_len);
+                       release_space(inode, *data_reserved, reserved_start, reserved_len,
+                                     only_release_metadata);
+                       btrfs_drop_folio(fs_info, folio, start, copied);
+                       return 0;
+               }
 
-       if (num_blocks > dirty_blocks) {
-               /* Release everything except the sectors we dirtied. */
-               release_bytes -= dirty_blocks << fs_info->sectorsize_bits;
-               if (only_release_metadata) {
-                       btrfs_delalloc_release_metadata(inode, release_bytes, true);
-               } else {
-                       const u64 release_start = round_up(start + copied,
-                                                          fs_info->sectorsize);
+               /* Release the reserved space beyond the last block. */
+               last_block = round_up(start + copied, fs_info->sectorsize);
 
-                       btrfs_delalloc_release_space(inode, *data_reserved,
-                                                    release_start, release_bytes,
-                                                    true);
-               }
+               shrink_reserved_space(inode, *data_reserved, reserved_start,
+                                     reserved_len, last_block - reserved_start,
+                                     only_release_metadata);
+               reserved_len = last_block - reserved_start;
        }
-       release_bytes = round_up(copied + block_offset, fs_info->sectorsize);
 
        ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
                                only_release_metadata);
@@ -1276,10 +1294,10 @@ again:
        else
                free_extent_state(cached_state);
 
-       btrfs_delalloc_release_extents(inode, reserve_bytes);
+       btrfs_delalloc_release_extents(inode, reserved_len);
        if (ret) {
                btrfs_drop_folio(fs_info, folio, start, copied);
-               release_space(inode, *data_reserved, start, release_bytes,
+               release_space(inode, *data_reserved, reserved_start, reserved_len,
                              only_release_metadata);
                return ret;
        }