From 402e38e6b71f5739119ca3107f375e112d63c7c5 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Wed, 23 Apr 2025 16:52:51 +0800
Subject: [PATCH] ext4: prevent stale extent cache entries caused by concurrent
 I/O writeback

Currently, in the I/O writeback path, ext4_map_blocks() may attempt to
cache additional unrelated extents in the extent status tree without
holding the inode's i_rwsem and the mapping's invalidate_lock. This can
lead to stale extent status entries remaining in certain scenarios,
potentially causing data corruption.

For example, when performing a collapse range in ext4_collapse_range(),
it clears the extent cache and dirty pages before removing blocks and
shifting extents. It also holds the i_data_sem during these two
operations. However, both ext4_ext_remove_space() and
ext4_ext_shift_extents() may briefly release the i_data_sem if journal
credits are insufficient (ext4_datasem_ensure_credits()). If another
writeback process writes dirty pages from other regions during this
interval, it may cache extents that are about to be modified. Unless
ext4_collapse_range() explicitly clears the extent cache again, these
cached entries can become stale and inconsistent with the actual
extents.

     0 a  n       b      c         m
     | |  |       |      |         |
    [www][wwwwww][wwwwwwww]...[wwwww][wwww]...
          |                           |
          N                           M

Assume that block a is dirty. The collapse range operation is removing
data from n to m and drops i_data_sem immediately after removing the
extent from b to c. At the same time, a concurrent writeback begins to
write back block a; it will reloads the extent from [n, b) into the
extent status tree since it does not hold the i_rwsem or the
invalidate_lock. After the collapse range operation, it left the stale
extent [n, b), which points logical block n to N, but the actual
physical block of n should be M.

Similarly, both ext4_insert_range() and ext4_truncate() have the same
problem. ext4_punch_hole() survived since it re-add a hole extent entry
after removing space since commit 9f1118223aa0 ("ext4: add a hole extent
entry in cache after punch").

In most cases, during dirty page writeback, the block mapping
information is likely to be found in the extent cache, making it less
necessary to search for physical extents. Consequently, loading
unrelated extent caches during writeback appears to be ineffective.
Therefore, fix this by adds EXT4_EX_NOCACHE in the writeback path to
prevent caching of unrelated extents, eliminating this potential source
of corruption.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Link: https://patch.msgid.link/20250423085257.122685-4-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h        |  1 +
 fs/ext4/extents.c     | 12 +++++++++---
 fs/ext4/fast_commit.c |  3 ++-
 fs/ext4/inode.c       | 28 ++++++++++++++++++++--------
 4 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 84cbe8024f1e..3e724def1b06 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -741,6 +741,7 @@ enum {
 #define EXT4_EX_NOCACHE				0x40000000
 #define EXT4_EX_FORCE_CACHE			0x20000000
 #define EXT4_EX_NOFAIL				0x10000000
+#define EXT4_EX_FILTER				0x70000000
 
 /*
  * Flags used by ext4_free_blocks
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d8eac736cc9a..8a5724b2dc51 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4202,7 +4202,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
 	/* find extent for this block */
-	path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
+	path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
 	if (IS_ERR(path)) {
 		err = PTR_ERR(path);
 		goto out;
@@ -4315,7 +4315,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		goto out;
 	ar.lright = map->m_lblk;
 	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright,
-				    &ex2, 0);
+				    &ex2, flags);
 	if (err < 0)
 		goto out;
 
@@ -4820,8 +4820,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
 				break;
 			}
 		}
+		/*
+		 * Do not cache any unrelated extents, as it does not hold the
+		 * i_rwsem or invalidate_lock, which could corrupt the extent
+		 * status tree.
+		 */
 		ret = ext4_map_blocks(handle, inode, &map,
-				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+				      EXT4_GET_BLOCKS_IO_CONVERT_EXT |
+				      EXT4_EX_NOCACHE);
 		if (ret <= 0)
 			ext4_warning(inode->i_sb,
 				     "inode #%lu: block %u: len %u: "
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index bfe5b3c40078..1392241de5e6 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -918,7 +918,8 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 		map.m_lblk = cur_lblk_off;
 		map.m_len = new_blk_size - cur_lblk_off + 1;
 		ret = ext4_map_blocks(NULL, inode, &map,
-				      EXT4_GET_BLOCKS_IO_SUBMIT);
+				      EXT4_GET_BLOCKS_IO_SUBMIT |
+				      EXT4_EX_NOCACHE);
 		if (ret < 0)
 			return -ECANCELED;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3005053e92a7..8c0d6fa58f26 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -463,15 +463,16 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
 #endif /* ES_AGGRESSIVE_TEST */
 
 static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
-				 struct ext4_map_blocks *map)
+				 struct ext4_map_blocks *map, int flags)
 {
 	unsigned int status;
 	int retval;
 
+	flags &= EXT4_EX_FILTER;
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		retval = ext4_ext_map_blocks(handle, inode, map, 0);
+		retval = ext4_ext_map_blocks(handle, inode, map, flags);
 	else
-		retval = ext4_ind_map_blocks(handle, inode, map, 0);
+		retval = ext4_ind_map_blocks(handle, inode, map, flags);
 
 	if (retval <= 0)
 		return retval;
@@ -622,6 +623,13 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
 		return -EFSCORRUPTED;
 
+	/*
+	 * Do not allow caching of unrelated ranges of extents during I/O
+	 * submission.
+	 */
+	if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+		WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE));
+
 	/* Lookup extent status tree firstly */
 	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
 	    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
@@ -667,7 +675,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	 * file system block.
 	 */
 	down_read(&EXT4_I(inode)->i_data_sem);
-	retval = ext4_map_query_blocks(handle, inode, map);
+	retval = ext4_map_query_blocks(handle, inode, map, flags);
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 found:
@@ -1807,7 +1815,7 @@ found:
 	if (ext4_has_inline_data(inode))
 		retval = 0;
 	else
-		retval = ext4_map_query_blocks(NULL, inode, map);
+		retval = ext4_map_query_blocks(NULL, inode, map, 0);
 	up_read(&EXT4_I(inode)->i_data_sem);
 	if (retval)
 		return retval < 0 ? retval : 0;
@@ -1830,7 +1838,7 @@ add_delayed:
 			goto found;
 		}
 	} else if (!ext4_has_inline_data(inode)) {
-		retval = ext4_map_query_blocks(NULL, inode, map);
+		retval = ext4_map_query_blocks(NULL, inode, map, 0);
 		if (retval) {
 			up_write(&EXT4_I(inode)->i_data_sem);
 			return retval < 0 ? retval : 0;
@@ -2214,11 +2222,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
 	 * previously reserved. However we must not fail because we're in
 	 * writeback and there is nothing we can do about it so it might result
 	 * in data loss.  So use reserved blocks to allocate metadata if
-	 * possible.
+	 * possible. In addition, do not cache any unrelated extents, as it
+	 * only holds the folio lock but does not hold the i_rwsem or
+	 * invalidate_lock, which could corrupt the extent status tree.
 	 */
 	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
 			   EXT4_GET_BLOCKS_METADATA_NOFAIL |
-			   EXT4_GET_BLOCKS_IO_SUBMIT;
+			   EXT4_GET_BLOCKS_IO_SUBMIT |
+			   EXT4_EX_NOCACHE;
+
 	dioread_nolock = ext4_should_dioread_nolock(inode);
 	if (dioread_nolock)
 		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
-- 
2.25.1