Merge tag 'for-5.3-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave...
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Aug 2019 21:19:41 +0000 (14:19 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Aug 2019 21:19:41 +0000 (14:19 -0700)
Pull btrfs fixes from David Sterba:

 - tiny race window during 2 transactions aborting at the same time can
   accidentally lead to a commit

 - regression fix, possible deadlock during fiemap

 - fix for an old bug when incremental send can fail on a file that has
   been deduplicated in a special way

* tag 'for-5.3-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  Btrfs: fix deadlock between fiemap and transaction commits
  Btrfs: fix race leading to fs corruption after transaction abort
  Btrfs: fix incremental send failure after deduplication

fs/btrfs/backref.c
fs/btrfs/send.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h

index 89116afda7a20ff48f41e95a5f6e0ba8ff3da8d6..e5d85311d5d5d4eadd857fda99a99bd881074b38 100644 (file)
@@ -1483,7 +1483,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
        ulist_init(roots);
        ulist_init(tmp);
 
-       trans = btrfs_attach_transaction(root);
+       trans = btrfs_join_transaction_nostart(root);
        if (IS_ERR(trans)) {
                if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) {
                        ret = PTR_ERR(trans);
index 69b59bf75882eff54415e03ef08943db946ef087..c3c0c064c25dadd317841e1d2761e2629b6ec9f3 100644 (file)
@@ -6322,68 +6322,21 @@ static int changed_extent(struct send_ctx *sctx,
 {
        int ret = 0;
 
-       if (sctx->cur_ino != sctx->cmp_key->objectid) {
-
-               if (result == BTRFS_COMPARE_TREE_CHANGED) {
-                       struct extent_buffer *leaf_l;
-                       struct extent_buffer *leaf_r;
-                       struct btrfs_file_extent_item *ei_l;
-                       struct btrfs_file_extent_item *ei_r;
-
-                       leaf_l = sctx->left_path->nodes[0];
-                       leaf_r = sctx->right_path->nodes[0];
-                       ei_l = btrfs_item_ptr(leaf_l,
-                                             sctx->left_path->slots[0],
-                                             struct btrfs_file_extent_item);
-                       ei_r = btrfs_item_ptr(leaf_r,
-                                             sctx->right_path->slots[0],
-                                             struct btrfs_file_extent_item);
-
-                       /*
-                        * We may have found an extent item that has changed
-                        * only its disk_bytenr field and the corresponding
-                        * inode item was not updated. This case happens due to
-                        * very specific timings during relocation when a leaf
-                        * that contains file extent items is COWed while
-                        * relocation is ongoing and its in the stage where it
-                        * updates data pointers. So when this happens we can
-                        * safely ignore it since we know it's the same extent,
-                        * but just at different logical and physical locations
-                        * (when an extent is fully replaced with a new one, we
-                        * know the generation number must have changed too,
-                        * since snapshot creation implies committing the current
-                        * transaction, and the inode item must have been updated
-                        * as well).
-                        * This replacement of the disk_bytenr happens at
-                        * relocation.c:replace_file_extents() through
-                        * relocation.c:btrfs_reloc_cow_block().
-                        */
-                       if (btrfs_file_extent_generation(leaf_l, ei_l) ==
-                           btrfs_file_extent_generation(leaf_r, ei_r) &&
-                           btrfs_file_extent_ram_bytes(leaf_l, ei_l) ==
-                           btrfs_file_extent_ram_bytes(leaf_r, ei_r) &&
-                           btrfs_file_extent_compression(leaf_l, ei_l) ==
-                           btrfs_file_extent_compression(leaf_r, ei_r) &&
-                           btrfs_file_extent_encryption(leaf_l, ei_l) ==
-                           btrfs_file_extent_encryption(leaf_r, ei_r) &&
-                           btrfs_file_extent_other_encoding(leaf_l, ei_l) ==
-                           btrfs_file_extent_other_encoding(leaf_r, ei_r) &&
-                           btrfs_file_extent_type(leaf_l, ei_l) ==
-                           btrfs_file_extent_type(leaf_r, ei_r) &&
-                           btrfs_file_extent_disk_bytenr(leaf_l, ei_l) !=
-                           btrfs_file_extent_disk_bytenr(leaf_r, ei_r) &&
-                           btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) ==
-                           btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) &&
-                           btrfs_file_extent_offset(leaf_l, ei_l) ==
-                           btrfs_file_extent_offset(leaf_r, ei_r) &&
-                           btrfs_file_extent_num_bytes(leaf_l, ei_l) ==
-                           btrfs_file_extent_num_bytes(leaf_r, ei_r))
-                               return 0;
-               }
-
-               inconsistent_snapshot_error(sctx, result, "extent");
-               return -EIO;
-       }
+       /*
+        * We have found an extent item that changed without the inode item
+        * having changed. This can happen either after relocation (where the
+        * disk_bytenr of an extent item is replaced at
+        * relocation.c:replace_file_extents()) or after deduplication into a
+        * file in both the parent and send snapshots (where an extent item can
+        * get modified or replaced with a new one). Note that deduplication
+        * updates the inode item, but it only changes the iversion (sequence
+        * field in the inode item) of the inode, so if a file is deduplicated
+        * the same amount of times in both the parent and send snapshots, its
+        * iversion becames the same in both snapshots, whence the inode item is
+        * the same on both snapshots.
+        */
+       if (sctx->cur_ino != sctx->cmp_key->objectid)
+               return 0;
 
        if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
                if (result != BTRFS_COMPARE_TREE_DELETED)
index 3b8ae1a8f02df77769542a3c0a0782f452c8c791..e3adb714c04b364869e9da5c36e2f9fa0157b7fb 100644 (file)
@@ -28,15 +28,18 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
        [TRANS_STATE_COMMIT_START]      = (__TRANS_START | __TRANS_ATTACH),
        [TRANS_STATE_COMMIT_DOING]      = (__TRANS_START |
                                           __TRANS_ATTACH |
-                                          __TRANS_JOIN),
+                                          __TRANS_JOIN |
+                                          __TRANS_JOIN_NOSTART),
        [TRANS_STATE_UNBLOCKED]         = (__TRANS_START |
                                           __TRANS_ATTACH |
                                           __TRANS_JOIN |
-                                          __TRANS_JOIN_NOLOCK),
+                                          __TRANS_JOIN_NOLOCK |
+                                          __TRANS_JOIN_NOSTART),
        [TRANS_STATE_COMPLETED]         = (__TRANS_START |
                                           __TRANS_ATTACH |
                                           __TRANS_JOIN |
-                                          __TRANS_JOIN_NOLOCK),
+                                          __TRANS_JOIN_NOLOCK |
+                                          __TRANS_JOIN_NOSTART),
 };
 
 void btrfs_put_transaction(struct btrfs_transaction *transaction)
@@ -543,7 +546,8 @@ again:
                ret = join_transaction(fs_info, type);
                if (ret == -EBUSY) {
                        wait_current_trans(fs_info);
-                       if (unlikely(type == TRANS_ATTACH))
+                       if (unlikely(type == TRANS_ATTACH ||
+                                    type == TRANS_JOIN_NOSTART))
                                ret = -ENOENT;
                }
        } while (ret == -EBUSY);
@@ -659,6 +663,16 @@ struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root
                                 BTRFS_RESERVE_NO_FLUSH, true);
 }
 
+/*
+ * Similar to regular join but it never starts a transaction when none is
+ * running or after waiting for the current one to finish.
+ */
+struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root)
+{
+       return start_transaction(root, 0, TRANS_JOIN_NOSTART,
+                                BTRFS_RESERVE_NO_FLUSH, true);
+}
+
 /*
  * btrfs_attach_transaction() - catch the running transaction
  *
@@ -2037,6 +2051,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
                }
        } else {
                spin_unlock(&fs_info->trans_lock);
+               /*
+                * The previous transaction was aborted and was already removed
+                * from the list of transactions at fs_info->trans_list. So we
+                * abort to prevent writing a new superblock that reflects a
+                * corrupt state (pointing to trees with unwritten nodes/leafs).
+                */
+               if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+                       ret = -EROFS;
+                       goto cleanup_transaction;
+               }
        }
 
        extwriter_counter_dec(cur_trans, trans->type);
index 527ea94b57d91a6c289cb4b4b67f6a07ecb85934..2c5a6f6e5bb0904eecef07941f9ad76f3ba2f036 100644 (file)
@@ -94,11 +94,13 @@ struct btrfs_transaction {
 #define __TRANS_JOIN           (1U << 11)
 #define __TRANS_JOIN_NOLOCK    (1U << 12)
 #define __TRANS_DUMMY          (1U << 13)
+#define __TRANS_JOIN_NOSTART   (1U << 14)
 
 #define TRANS_START            (__TRANS_START | __TRANS_FREEZABLE)
 #define TRANS_ATTACH           (__TRANS_ATTACH)
 #define TRANS_JOIN             (__TRANS_JOIN | __TRANS_FREEZABLE)
 #define TRANS_JOIN_NOLOCK      (__TRANS_JOIN_NOLOCK)
+#define TRANS_JOIN_NOSTART     (__TRANS_JOIN_NOSTART)
 
 #define TRANS_EXTWRITERS       (__TRANS_START | __TRANS_ATTACH)
 
@@ -183,6 +185,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
                                        int min_factor);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
                                        struct btrfs_root *root);