ext4: Add multi-fsblock atomic write support with bigalloc

author Ritesh Harjani (IBM) <ritesh.list@gmail.com>

Thu, 15 May 2025 19:50:53 +0000 (01:20 +0530)

committer Theodore Ts'o <tytso@mit.edu>

Tue, 20 May 2025 14:31:12 +0000 (10:31 -0400)
author Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Thu, 15 May 2025 19:50:53 +0000 (01:20 +0530)
committer Theodore Ts'o <tytso@mit.edu>
Tue, 20 May 2025 14:31:12 +0000 (10:31 -0400)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

index 635a95a7f7151740a81433e1dfd297f72b264ebc..201afaaa508a9f16ff51381b1ae8a0591ce775c1 100644 (file)
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3751,6 +3751,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
                           loff_t len);
  extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                                           loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_extents_atomic(handle_t *handle,
+                       struct inode *inode, loff_t offset, ssize_t len);
  extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
                                              ext4_io_end_t *io_end);
  extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c

index ba91ad3dfcbc258c82eeeca49989d9a0c89f4eea..74c16736ae34ec33ea80e3e549bea2cab0551da9 100644 (file)
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4796,6 +4796,93 @@ out_inode_lock:
         return ret;
  }
  
+/*
+ * This function converts a range of blocks to written extents. The caller of
+ * this function will pass the start offset and the size. All unwritten extents
+ * within this range will be converted to written extents.
+ *
+ * This function is called from the direct IO end io call back function for
+ * atomic writes, to convert the unwritten extents after IO is completed.
+ *
+ * Note that the requirement for atomic writes is that all conversion should
+ * happen atomically in a single fs journal transaction. We mainly only allocate
+ * unwritten extents either on a hole or on a pre-existing unwritten extent range in
+ * ext4_map_blocks_atomic_write(). The only case where we can have multiple
+ * unwritten extents in a range [offset, offset+len) is when there is a split
+ * unwritten extent between two leaf nodes which was cached in extent status
+ * cache during ext4_iomap_alloc() time. That will allow
+ * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going
+ * into the slow path. That means we might need a loop for conversion of this
+ * unwritten extent split across leaf block within a single journal transaction.
+ * Split extents across leaf nodes is a rare case, but let's still handle that
+ * to meet the requirements of multi-fsblock atomic writes.
+ *
+ * Returns 0 on success. If @handle is NULL, a journal handle is started here.
+ */
+int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode,
+                                         loff_t offset, ssize_t len)
+{
+       unsigned int max_blocks;
+       int ret = 0, ret2 = 0, ret3 = 0;
+       struct ext4_map_blocks map;
+       unsigned int blkbits = inode->i_blkbits;
+       unsigned int credits = 0;       /* stays 0 when the caller supplied @handle */
+       int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT;
+
+       map.m_lblk = offset >> blkbits; /* first logical block of the range */
+       max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
+
+       if (!handle) {
+               /*
+                * TODO: An optimization can be added later by having an extent
+                * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that
+                * it can tell if the extent in the cache is a split extent.
+                * But for now let's assume pextents as 2 always.
+                */
+               credits = ext4_meta_trans_blocks(inode, max_blocks, 2);
+       }
+
+       if (credits) {
+               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       return ret;
+               }
+       }
+
+       while (ret >= 0 && ret < max_blocks) {
+               map.m_lblk += ret;      /* skip blocks handled by the previous call */
+               map.m_len = (max_blocks -= ret);        /* blocks still to convert */
+               ret = ext4_map_blocks(handle, inode, &map, flags);
+               if (ret != max_blocks)  /* also taken when ret is an error or 0 */
+                       ext4_msg(inode->i_sb, KERN_INFO,
+                                    "inode #%lu: block %u: len %u: "
+                                    "split block mapping found for atomic write, "
+                                    "ret = %d",
+                                    inode->i_ino, map.m_lblk,
+                                    map.m_len, ret);
+               if (ret <= 0)
+                       break;
+       }
+
+       ret2 = ext4_mark_inode_dirty(handle, inode);
+
+       if (credits) {  /* only stop a handle that was started above */
+               ret3 = ext4_journal_stop(handle);
+               if (unlikely(ret3))
+                       ret2 = ret3;
+       }
+
+       if (ret <= 0 || ret2)
+               ext4_warning(inode->i_sb,
+                            "inode #%lu: block %u: len %u: "
+                            "returned %d or %d",
+                            inode->i_ino, map.m_lblk,
+                            map.m_len, ret, ret2);
+
+       return ret > 0 ? ret2 : ret;    /* ret <= 0 takes precedence over ret2 */
+}
+
  /*
   * This function convert a range of blocks to written extents
   * The caller of this function will pass the start offset and the size.
diff --git a/fs/ext4/file.c b/fs/ext4/file.c

index b845a25f7932c6b49606cc7e2c1b8872b9781bb8..21df81347147cc7ab3648590b6dbd40e31e2c20b 100644 (file)
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -377,7 +377,12 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
         loff_t pos = iocb->ki_pos;
         struct inode *inode = file_inode(iocb->ki_filp);
  
-       if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
+
+       if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) &&
+                       (iocb->ki_flags & IOCB_ATOMIC))
+               error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos,
+                                                             size);
+       else if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
                 error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
         if (error)
                 return error;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index 14c19a38cd4fb93d325eb2b460ddb4c21d7af9d1..2f59036df8009da41c5901cd19d9728993891069 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3467,12 +3467,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
         }
  }
  
+static int ext4_map_blocks_atomic_write_slow(handle_t *handle,
+                       struct inode *inode, struct ext4_map_blocks *map)
+{
+       ext4_lblk_t m_lblk = map->m_lblk;       /* saved to restore @map later */
+       unsigned int m_len = map->m_len;        /* saved to restore @map later */
+       unsigned int mapped_len = 0, m_flags = 0;
+       ext4_fsblk_t next_pblk = 0;     /* only meaningful once check_next_pblk is set */
+       bool check_next_pblk = false;
+       int ret = 0;
+
+       WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb));
+
+       /*
+        * This is a slow path in case of mixed mapping. We use
+        * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single
+        * contiguous mapped mapping. This will ensure any unwritten or hole
+        * regions within the requested range are zeroed out and we return
+        * a single contiguous mapped extent.
+        */
+       m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+
+       do {
+               ret = ext4_map_blocks(handle, inode, map, m_flags);
+               if (ret < 0 && ret != -ENOSPC)  /* -ENOSPC is reported below */
+                       goto out_err;
+               /*
+                * This should never happen, but let's return an error code to
+                * avoid an infinite loop in here.
+                */
+               if (ret == 0) {
+                       ret = -EFSCORRUPTED;
+                       ext4_warning_inode(inode,
+                               "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d",
+                               m_flags, ret);
+                       goto out_err;
+               }
+               /*
+                * With bigalloc we should never get ENOSPC nor discontiguous
+                * physical extents.
+                */
+               if ((check_next_pblk && next_pblk != map->m_pblk) ||
+                               ret == -ENOSPC) {
+                       ext4_warning_inode(inode,
+                               "Non-contiguous allocation detected: expected %llu, got %llu, "
+                               "or ext4_map_blocks() returned out of space ret: %d",
+                               next_pblk, map->m_pblk, ret);
+                       ret = -EFSCORRUPTED;
+                       goto out_err;
+               }
+               next_pblk = map->m_pblk + map->m_len;   /* where the next extent must start */
+               check_next_pblk = true;
+
+               mapped_len += map->m_len;
+               map->m_lblk += map->m_len;      /* continue right after this extent */
+               map->m_len = m_len - mapped_len;
+       } while (mapped_len < m_len);
+
+       /*
+        * We might have done some work in above loop, so we need to query the
+        * start of the physical extent, based on the original m_lblk and m_len.
+        * Let's also ensure we were able to allocate the required range for
+        * mixed mapping case.
+        */
+       map->m_lblk = m_lblk;
+       map->m_len = m_len;
+       map->m_flags = 0;
+
+       ret = ext4_map_blocks(handle, inode, map,
+                             EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF);
+       if (ret != m_len) {     /* anything but the full length is a failure */
+               ext4_warning_inode(inode,
+                       "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n",
+                       m_lblk, m_len, ret);
+               ret = -EINVAL;
+       }
+       return ret;
+
+out_err:
+       /* reset map before returning an error */
+       map->m_lblk = m_lblk;
+       map->m_len = m_len;
+       map->m_flags = 0;
+       return ret;
+}
+
+/*
+ * ext4_map_blocks_atomic_write: Helper routine to ensure the entire requested
+ * range in @map [lblk, lblk + len) is one single contiguous extent with no
+ * mixed mappings.
+ *
+ * We first use m_flags passed to us by our caller (ext4_iomap_alloc()).
+ * We only use EXT4_GET_BLOCKS_CREATE_ZERO in the slow path, when the underlying
+ * physical extent for the requested range does not have a single contiguous
+ * mapping type i.e. (Hole, Mapped, or Unwritten) throughout.
+ * In that case we will loop over the requested range to allocate and zero out
+ * the unwritten / holes in between, to get a single mapped extent from
+ * [m_lblk, m_lblk +  m_len). Note that this is only possible because we know
+ * this can be called only with bigalloc enabled filesystem where the underlying
+ * cluster is already allocated. This avoids allocating discontiguous extents
+ * in the slow path due to multiple calls to ext4_map_blocks().
+ * The slow path is mostly non-performance critical path, so it should be ok to
+ * loop using ext4_map_blocks() with appropriate flags to allocate & zero the
+ * underlying short holes/unwritten extents within the requested range.
+ */
+static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode,
+                               struct ext4_map_blocks *map, int m_flags,
+                               bool *force_commit)
+{
+       ext4_lblk_t m_lblk = map->m_lblk;       /* saved to reset @map for the slow path */
+       unsigned int m_len = map->m_len;        /* saved to reset @map for the slow path */
+       int ret = 0;
+
+       WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb));
+
+       ret = ext4_map_blocks(handle, inode, map, m_flags);
+       if (ret < 0 || ret == m_len)    /* error, or full length mapped in one call */
+               goto out;
+       /*
+        * This is a mixed mapping case where we were not able to allocate
+        * a single contiguous extent. In that case let's reset requested
+        * mapping and call the slow path.
+        */
+       map->m_lblk = m_lblk;
+       map->m_len = m_len;
+       map->m_flags = 0;
+
+       /*
+        * slow path means we have mixed mapping, that means we will need
+        * to force txn commit.
+        */
+       *force_commit = true;   /* set before the slow path runs, whatever it returns */
+       return ext4_map_blocks_atomic_write_slow(handle, inode, map);
+out:
+       return ret;
+}
+
  static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
                             unsigned int flags)
  {
         handle_t *handle;
         u8 blkbits = inode->i_blkbits;
         int ret, dio_credits, m_flags = 0, retries = 0;
+       bool force_commit = false;
  
         /*
          * Trim the mapping request to the maximum value that we can map at
@@ -3480,7 +3617,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
          */
         if (map->m_len > DIO_MAX_BLOCKS)
                 map->m_len = DIO_MAX_BLOCKS;
-       dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+
+       /*
+        * journal credits estimation for atomic writes. We call
+        * ext4_map_blocks(), to find if there could be a mixed mapping. If yes,
+        * then let's assume the no. of pextents required can be m_len i.e.
+        * every alternate block can be unwritten and hole.
+        */
+       if (flags & IOMAP_ATOMIC) {
+               unsigned int orig_mlen = map->m_len;
+
+               ret = ext4_map_blocks(NULL, inode, map, 0);
+               if (ret < 0)
+                       return ret;
+               if (map->m_len < orig_mlen) {
+                       map->m_len = orig_mlen;
+                       dio_credits = ext4_meta_trans_blocks(inode, orig_mlen,
+                                                            map->m_len);
+               } else {
+                       dio_credits = ext4_chunk_trans_blocks(inode,
+                                                             map->m_len);
+               }
+       } else {
+               dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+       }
  
  retry:
         /*
@@ -3511,7 +3671,11 @@ retry:
         else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                 m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
  
-       ret = ext4_map_blocks(handle, inode, map, m_flags);
+       if (flags & IOMAP_ATOMIC)
+               ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags,
+                                                  &force_commit);
+       else
+               ret = ext4_map_blocks(handle, inode, map, m_flags);
  
         /*
          * We cannot fill holes in indirect tree based inodes as that could
@@ -3525,6 +3689,22 @@ retry:
         if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                 goto retry;
  
+       /*
+        * Force commit the current transaction if the allocation spans a mixed
+        * mapping range. This ensures any pending metadata updates (like
+        * unwritten to written extents conversion) in this range are in
+        * consistent state with the file data blocks, before performing the
+        * actual write I/O. If the commit fails, the whole I/O must be aborted
+        * to prevent any possible torn writes.
+        */
+       if (ret > 0 && force_commit) {
+               int ret2;
+
+               ret2 = ext4_force_commit(inode->i_sb);
+               if (ret2)
+                       return ret2;
+       }
+
         return ret;
  }
  
@@ -3535,6 +3715,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
         int ret;
         struct ext4_map_blocks map;
         u8 blkbits = inode->i_blkbits;
+       unsigned int orig_mlen;
  
         if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
                 return -EINVAL;
@@ -3548,6 +3729,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
         map.m_lblk = offset >> blkbits;
         map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
                           EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+       orig_mlen = map.m_len;
  
         if (flags & IOMAP_WRITE) {
                 /*
@@ -3558,8 +3740,16 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
                  */
                 if (offset + length <= i_size_read(inode)) {
                         ret = ext4_map_blocks(NULL, inode, &map, 0);
-                       if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
-                               goto out;
+                       /*
+                        * For atomic writes the entire requested length should
+                        * be mapped.
+                        */
+                       if (map.m_flags & EXT4_MAP_MAPPED) {
+                               if ((!(flags & IOMAP_ATOMIC) && ret > 0) ||
+                                  (flags & IOMAP_ATOMIC && ret >= orig_mlen))
+                                       goto out;
+                       }
+                       map.m_len = orig_mlen;
                 }
                 ret = ext4_iomap_alloc(inode, &map, flags);
         } else {
@@ -3580,6 +3770,16 @@ out:
          */
         map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
  
+       /*
+        * Before returning to iomap, let's ensure the allocated mapping
+        * covers the entire requested length for atomic writes.
+        */
+       if (flags & IOMAP_ATOMIC) {
+               if (map.m_len < (length >> blkbits)) {
+                       WARN_ON_ONCE(1);
+                       return -EINVAL;
+               }
+       }
         ext4_set_iomap(inode, iomap, &map, offset, length, flags);
  
         return 0;
author	Ritesh Harjani (IBM) <ritesh.list@gmail.com>
	Thu, 15 May 2025 19:50:53 +0000 (01:20 +0530)
committer	Theodore Ts'o <tytso@mit.edu>
	Tue, 20 May 2025 14:31:12 +0000 (10:31 -0400)
fs/ext4/ext4.h		patch \| blob \| blame \| history
fs/ext4/extents.c		patch \| blob \| blame \| history
fs/ext4/file.c		patch \| blob \| blame \| history
fs/ext4/inode.c		patch \| blob \| blame \| history