btrfs: optimize the logical to physical mapping for zoned writes
author Christoph Hellwig <hch@lst.de>
Wed, 24 May 2023 15:03:08 +0000 (17:03 +0200)
committer David Sterba <dsterba@suse.com>
Mon, 19 Jun 2023 11:59:32 +0000 (13:59 +0200)
The current code to store the final logical to physical mapping for a
zone append write in the extent tree is rather inefficient.  It first has
to split the ordered extent so that there is one ordered extent per bio,
so that it can look up the ordered extent on I/O completion in
btrfs_record_physical_zoned and store the physical LBA returned by the
block driver in the ordered extent.

btrfs_rewrite_logical_zoned then has to do a lookup in the chunk tree to
see what physical address the logical address for this bio / ordered
extent is mapped to, and then rewrite it in the extent tree.

To optimize this process, we can store the physical address assigned in
the chunk tree to the original logical address and a pointer to the
btrfs_ordered_sum structure in the btrfs_bio structure, and then use
this information to rewrite the logical address in the btrfs_ordered_sum
structure directly at I/O completion time in btrfs_record_physical_zoned.
btrfs_rewrite_logical_zoned then simply updates the logical address in
the extent tree and the ordered_extent itself.

The code in btrfs_rewrite_logical_zoned now runs for all data I/O
completions in zoned file systems, which is fine as there is no remapping
to do for non-append writes to conventional zones or for relocation, and
the overhead for quickly breaking out of the loop is very low.

Because zoned file systems now need the ordered_sums structure to
record the actual write location returned by zone append, allocate dummy
structures without the csum array for them when the I/O doesn't use
checksums, and free them when completing the ordered_extent.

Note that the btrfs_bio doesn't grow as the new fields are placed into
a union that is so far not used for data writes and has plenty of space
left in it.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/bio.c
fs/btrfs/bio.h
fs/btrfs/file-item.c
fs/btrfs/file-item.h
fs/btrfs/inode.c
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/zoned.c

index ea6c81f9d1a36eeb1d32670ad9dd427eb4391df3..54cfab7394d377206198a4294a46f392010e81ec 100644 (file)
@@ -431,6 +431,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
                u64 zone_start = round_down(physical, dev->fs_info->zone_size);
 
                ASSERT(btrfs_dev_is_sequential(dev, physical));
+               btrfs_bio(bio)->orig_physical = physical;
                bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
        }
        btrfs_debug_in_rcu(dev->fs_info,
@@ -685,6 +686,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
                        ret = btrfs_bio_csum(bbio);
                        if (ret)
                                goto fail_put_bio;
+               } else if (use_append) {
+                       ret = btrfs_alloc_dummy_sum(bbio);
+                       if (ret)
+                               goto fail_put_bio;
                }
        }
 
index a8eca3a6567320c68103eb1288bfa8d4e64d87ee..8a29980159b404e05ea636e874aa4d68abb64d31 100644 (file)
@@ -39,8 +39,8 @@ struct btrfs_bio {
 
        union {
                /*
-                * Data checksumming and original I/O information for internal
-                * use in the btrfs_submit_bio machinery.
+                * For data reads: checksumming and original I/O information.
+                * (for internal use in the btrfs_submit_bio machinery only)
                 */
                struct {
                        u8 *csum;
@@ -48,7 +48,18 @@ struct btrfs_bio {
                        struct bvec_iter saved_iter;
                };
 
-               /* For metadata parentness verification. */
+               /*
+                * For data writes:
+                * - pointer to the checksums for this bio
+                * - original physical address from the allocator
+                *   (for zone append only)
+                */
+               struct {
+                       struct btrfs_ordered_sum *sums;
+                       u64 orig_physical;
+               };
+
+               /* For metadata reads: parentness verification. */
                struct btrfs_tree_parent_check parent_check;
        };
 
index 415e50904db311e31e9e282d880363d3cfa58b7b..0cb4a9921d21eddb43868ca70c718d951909b4fd 100644 (file)
@@ -818,11 +818,41 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
 
        }
        this_sum_bytes = 0;
+
+       /*
+        * The ->sums assignment is for zoned writes, where a bio never spans
+        * ordered extents and is only done unconditionally because that's cheaper
+        * than a branch.
+        */
+       bbio->sums = sums;
        btrfs_add_ordered_sum(ordered, sums);
        btrfs_put_ordered_extent(ordered);
        return 0;
 }
 
+/*
+ * Nodatasum I/O on zoned file systems still requires a btrfs_ordered_sum to
+ * record the updated logical address on Zone Append completion.
+ * Allocate just the structure with an empty sums array here for that case.
+ *
+ * On success the new structure is stored in bbio->sums and added to the
+ * ordered extent found for bbio->inode / bbio->file_offset.
+ *
+ * Returns 0 on success, BLK_STS_IOERR if no ordered extent is found, or
+ * BLK_STS_RESOURCE if the allocation fails.
+ */
+blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
+{
+       struct btrfs_ordered_extent *ordered =
+               btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
+
+       if (WARN_ON_ONCE(!ordered))
+               return BLK_STS_IOERR;
+
+       bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS);
+       if (!bbio->sums) {
+               /* Drop the reference taken by the lookup above. */
+               btrfs_put_ordered_extent(ordered);
+               return BLK_STS_RESOURCE;
+       }
+
+       /* Only the length and starting logical address are recorded. */
+       bbio->sums->len = bbio->bio.bi_iter.bi_size;
+       bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
+       btrfs_add_ordered_sum(ordered, bbio->sums);
+       btrfs_put_ordered_extent(ordered);
+       return 0;
+}
+
 /*
  * Remove one checksum overlapping a range.
  *
index 6be8725cd574748ac6c64dbe91b5e9eccc64dd89..4ec669b690080a85db3625b3d478b3e348db8693 100644 (file)
@@ -50,6 +50,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_ordered_sum *sums);
 blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio);
+blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio);
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
                             struct list_head *list, int search_commit,
                             bool nowait);
index ad8196f31cdb3a87bcf24c9bba2bc4567cfd5b4a..31c5b7c176d36637f692837f054ad119d75af3f7 100644 (file)
@@ -3301,14 +3301,10 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                goto out;
        }
 
-       /* A valid ->physical implies a write on a sequential zone. */
-       if (ordered_extent->physical != (u64)-1) {
+       if (btrfs_is_zoned(fs_info)) {
                btrfs_rewrite_logical_zoned(ordered_extent);
                btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
                                        ordered_extent->disk_num_bytes);
-       } else if (btrfs_is_data_reloc_root(inode->root)) {
-               btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
-                                       ordered_extent->disk_num_bytes);
        }
 
        if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
index a9778a91511e19b231431f592b2b859c940a2d61..324a5a8c844a72673605c4a32d4acf13c307d9b3 100644 (file)
@@ -209,7 +209,6 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
        entry->compress_type = compress_type;
        entry->truncated_len = (u64)-1;
        entry->qgroup_rsv = ret;
-       entry->physical = (u64)-1;
 
        ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
        entry->flags = flags;
index ebc980ac967ad4798f99dc046d437ae05b46e551..dc700aa515b58bba0bf2fe57f12939a570d4f485 100644 (file)
@@ -151,12 +151,6 @@ struct btrfs_ordered_extent {
        struct completion completion;
        struct btrfs_work flush_work;
        struct list_head work_list;
-
-       /*
-        * Used to reverse-map physical address returned from ZONE_APPEND write
-        * command in a workqueue context
-        */
-       u64 physical;
 };
 
 static inline void
index eca49e6e0e5f5099f0453a8aad5d14a3c5364e6c..b55b0d4ee86f85f2ba795ceb583b9c51bf011f80 100644 (file)
@@ -1657,51 +1657,28 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
 void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
 {
        const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
-       struct btrfs_ordered_extent *ordered;
+       struct btrfs_ordered_sum *sum = bbio->sums;
 
-       ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
-       if (WARN_ON(!ordered))
-               return;
-
-       ordered->physical = physical;
-       btrfs_put_ordered_extent(ordered);
+       if (physical < bbio->orig_physical)
+               sum->logical -= bbio->orig_physical - physical;
+       else
+               sum->logical += physical - bbio->orig_physical;
 }
 
 void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
 {
        struct btrfs_inode *inode = BTRFS_I(ordered->inode);
-       struct btrfs_fs_info *fs_info = inode->root->fs_info;
-       struct extent_map_tree *em_tree;
+       struct extent_map_tree *em_tree = &inode->extent_tree;
        struct extent_map *em;
-       struct btrfs_ordered_sum *sum;
-       u64 orig_logical = ordered->disk_bytenr;
-       struct map_lookup *map;
-       u64 physical = ordered->physical;
-       u64 chunk_start_phys;
-       u64 logical;
+       struct btrfs_ordered_sum *sum =
+               list_first_entry(&ordered->list, typeof(*sum), list);
+       u64 logical = sum->logical;
 
-       em = btrfs_get_chunk_map(fs_info, orig_logical, 1);
-       if (IS_ERR(em))
-               return;
-       map = em->map_lookup;
-       chunk_start_phys = map->stripes[0].physical;
-
-       if (WARN_ON_ONCE(map->num_stripes > 1) ||
-           WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) ||
-           WARN_ON_ONCE(physical < chunk_start_phys) ||
-           WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) {
-               free_extent_map(em);
-               return;
-       }
-       logical = em->start + (physical - map->stripes[0].physical);
-       free_extent_map(em);
-
-       if (orig_logical == logical)
-               return;
+       if (ordered->disk_bytenr == logical)
+               goto out;
 
        ordered->disk_bytenr = logical;
 
-       em_tree = &inode->extent_tree;
        write_lock(&em_tree->lock);
        em = search_extent_mapping(em_tree, ordered->file_offset,
                                   ordered->num_bytes);
@@ -1709,11 +1686,17 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
        free_extent_map(em);
        write_unlock(&em_tree->lock);
 
-       list_for_each_entry(sum, &ordered->list, list) {
-               if (logical < orig_logical)
-                       sum->logical -= orig_logical - logical;
-               else
-                       sum->logical += logical - orig_logical;
+out:
+       /*
+        * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
+        * were allocated by btrfs_alloc_dummy_sum only to record the logical
+        * addresses and don't contain actual checksums.  We thus must free them
+        * here so that we don't attempt to log the csums later.
+        */
+       if ((inode->flags & BTRFS_INODE_NODATASUM) ||
+           test_bit(BTRFS_FS_STATE_NO_CSUMS, &inode->root->fs_info->fs_state)) {
+               list_del(&sum->list);
+               kfree(sum);
        }
 }