ext4: fix data corruption with EXT4_GET_BLOCKS_ZERO
authorJan Kara <jack@suse.cz>
Fri, 26 May 2017 21:40:52 +0000 (17:40 -0400)
committerTheodore Ts'o <tytso@mit.edu>
Fri, 26 May 2017 21:40:52 +0000 (17:40 -0400)
When ext4_map_blocks() is called with EXT4_GET_BLOCKS_ZERO to zero-out
allocated blocks and these blocks are actually converted from unwritten
extent the following race can happen:

CPU0 CPU1

page fault page fault
... ...
ext4_map_blocks()
  ext4_ext_map_blocks()
    ext4_ext_handle_unwritten_extents()
      ext4_ext_convert_to_initialized()
- zero out converted extent
ext4_zeroout_es()
  - inserts extent as initialized in status tree

ext4_map_blocks()
  ext4_es_lookup_extent()
    - finds initialized extent
write data
  ext4_issue_zeroout()
    - zeroes out new extent overwriting data

This problem can be reproduced by generic/340 for the fallocated case
for the last block in the file.

Fix the problem by avoiding zeroing out the area we are mapping with
ext4_map_blocks() in ext4_ext_convert_to_initialized(). It is pointless
to zero out this area in the first place as the caller asked us to
convert the area to initialized because he is just going to write data
there before the transaction finishes. To achieve this we delete the
special case of zeroing out full extent as that will be handled by the
cases below zeroing only the part of the extent that needs it. We also
instruct ext4_split_extent() that the middle of extent being split
contains data so that ext4_split_extent_at() cannot zero out full extent
in case of ENOSPC.

CC: stable@vger.kernel.org
Fixes: 12735f881952c32b31bc4e433768f18489f79ec9
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
fs/ext4/extents.c

index 2a97dff87b961771383c33e97dc57997eb6586fb..83a513efc824a39dbf876f0225f1d0d6ff78f385 100644 (file)
@@ -3413,13 +3413,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        struct ext4_sb_info *sbi;
        struct ext4_extent_header *eh;
        struct ext4_map_blocks split_map;
-       struct ext4_extent zero_ex;
+       struct ext4_extent zero_ex1, zero_ex2;
        struct ext4_extent *ex, *abut_ex;
        ext4_lblk_t ee_block, eof_block;
        unsigned int ee_len, depth, map_len = map->m_len;
        int allocated = 0, max_zeroout = 0;
        int err = 0;
-       int split_flag = 0;
+       int split_flag = EXT4_EXT_DATA_VALID2;
 
        ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
                "block %llu, max_blocks %u\n", inode->i_ino,
@@ -3436,7 +3436,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
-       zero_ex.ee_len = 0;
+       zero_ex1.ee_len = 0;
+       zero_ex2.ee_len = 0;
 
        trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
 
@@ -3576,62 +3577,52 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        if (ext4_encrypted_inode(inode))
                max_zeroout = 0;
 
-       /* If extent is less than s_max_zeroout_kb, zeroout directly */
-       if (max_zeroout && (ee_len <= max_zeroout)) {
-               err = ext4_ext_zeroout(inode, ex);
-               if (err)
-                       goto out;
-               zero_ex.ee_block = ex->ee_block;
-               zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex));
-               ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex));
-
-               err = ext4_ext_get_access(handle, inode, path + depth);
-               if (err)
-                       goto out;
-               ext4_ext_mark_initialized(ex);
-               ext4_ext_try_to_merge(handle, inode, path, ex);
-               err = ext4_ext_dirty(handle, inode, path + path->p_depth);
-               goto out;
-       }
-
        /*
-        * four cases:
+        * five cases:
         * 1. split the extent into three extents.
-        * 2. split the extent into two extents, zeroout the first half.
-        * 3. split the extent into two extents, zeroout the second half.
+        * 2. split the extent into two extents, zeroout the head of the first
+        *    extent.
+        * 3. split the extent into two extents, zeroout the tail of the second
+        *    extent.
         * 4. split the extent into two extents with out zeroout.
+        * 5. no splitting needed, just possibly zeroout the head and / or the
+        *    tail of the extent.
         */
        split_map.m_lblk = map->m_lblk;
        split_map.m_len = map->m_len;
 
-       if (max_zeroout && (allocated > map->m_len)) {
+       if (max_zeroout && (allocated > split_map.m_len)) {
                if (allocated <= max_zeroout) {
-                       /* case 3 */
-                       zero_ex.ee_block =
-                                        cpu_to_le32(map->m_lblk);
-                       zero_ex.ee_len = cpu_to_le16(allocated);
-                       ext4_ext_store_pblock(&zero_ex,
-                               ext4_ext_pblock(ex) + map->m_lblk - ee_block);
-                       err = ext4_ext_zeroout(inode, &zero_ex);
+                       /* case 3 or 5 */
+                       zero_ex1.ee_block =
+                                cpu_to_le32(split_map.m_lblk +
+                                            split_map.m_len);
+                       zero_ex1.ee_len =
+                               cpu_to_le16(allocated - split_map.m_len);
+                       ext4_ext_store_pblock(&zero_ex1,
+                               ext4_ext_pblock(ex) + split_map.m_lblk +
+                               split_map.m_len - ee_block);
+                       err = ext4_ext_zeroout(inode, &zero_ex1);
                        if (err)
                                goto out;
-                       split_map.m_lblk = map->m_lblk;
                        split_map.m_len = allocated;
-               } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
-                       /* case 2 */
-                       if (map->m_lblk != ee_block) {
-                               zero_ex.ee_block = ex->ee_block;
-                               zero_ex.ee_len = cpu_to_le16(map->m_lblk -
+               }
+               if (split_map.m_lblk - ee_block + split_map.m_len <
+                                                               max_zeroout) {
+                       /* case 2 or 5 */
+                       if (split_map.m_lblk != ee_block) {
+                               zero_ex2.ee_block = ex->ee_block;
+                               zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
                                                        ee_block);
-                               ext4_ext_store_pblock(&zero_ex,
+                               ext4_ext_store_pblock(&zero_ex2,
                                                      ext4_ext_pblock(ex));
-                               err = ext4_ext_zeroout(inode, &zero_ex);
+                               err = ext4_ext_zeroout(inode, &zero_ex2);
                                if (err)
                                        goto out;
                        }
 
+                       split_map.m_len += split_map.m_lblk - ee_block;
                        split_map.m_lblk = ee_block;
-                       split_map.m_len = map->m_lblk - ee_block + map->m_len;
                        allocated = map->m_len;
                }
        }
@@ -3642,8 +3633,11 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                err = 0;
 out:
        /* If we have gotten a failure, don't zero out status tree */
-       if (!err)
-               err = ext4_zeroout_es(inode, &zero_ex);
+       if (!err) {
+               err = ext4_zeroout_es(inode, &zero_ex1);
+               if (!err)
+                       err = ext4_zeroout_es(inode, &zero_ex2);
+       }
        return err ? err : allocated;
 }