btrfs: qgroup: Use delayed subtree rescan for balance
author     Qu Wenruo <wqu@suse.com>
           Wed, 23 Jan 2019 07:15:17 +0000 (15:15 +0800)
committer  David Sterba <dsterba@suse.com>
           Mon, 25 Feb 2019 13:13:26 +0000 (14:13 +0100)
Before this patch, the qgroup code traces the whole subtrees of both
the subvolume and reloc trees unconditionally.

This keeps the qgroup numbers consistent, but it can cause tons of
unnecessary extent tracing, which in turn causes a lot of overhead.

However, for the subtree swap done by balance, we can simply swap both
subtrees: they contain the same contents and tree structure, so the
qgroup numbers won't change.

It is only the race window between the subtree swap and the transaction
commit that can cause the qgroup numbers to change.

This patch delays the qgroup subtree scan until CoW happens on the
subtree root.

So if there are no other operations on the fs, balance won't cause any
extra qgroup overhead (best case scenario).
Depending on the workload, most of the subtree scans can still be
avoided.

Only in the worst case scenario does it fall back to the old subtree
swap overhead (scanning all the swapped subtrees).

[[Benchmark]]
Hardware:
VM with 4G vRAM and 8 vCPUs,
disk using 'unsafe' cache mode,
backing device is a SAMSUNG 850 EVO SSD.
Host has 16G of RAM.

Mkfs parameter:
--nodesize 4K (to bump up the tree size)

Initial subvolume contents:
4G data copied from /usr and /lib.
(with plenty of regular small files)

Snapshots:
16 snapshots of the original subvolume.
Each snapshot has 3 random files modified.

Balance parameter:
-m

So the content should be pretty similar to a real-world root fs layout.

After the file system is populated there is no other activity, so this
should be the best case scenario.

                     | v4.20-rc1            | w/ patchset    | diff
-----------------------------------------------------------------------
relocated extents    | 22615                | 22457          | -0.7%
qgroup dirty extents | 163457               | 121606         | -25.6%
time (sys)           | 22.884s              | 18.842s        | -17.6%
time (real)          | 27.724s              | 22.884s        | -17.5%
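
(The diff column is computed as (w/ patchset - v4.20-rc1) / v4.20-rc1;
e.g. for relocated extents: (22457 - 22615) / 22615 ≈ -0.7%.)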

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/ctree.c
fs/btrfs/qgroup.c
fs/btrfs/qgroup.h
fs/btrfs/relocation.c

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5a6c39b44c84f4c2f4e39e09797d213ea0f564e7..0f2c20e0b108937a464077cf7e4e23cd18f71bdb 100644 (file)
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -13,6 +13,7 @@
 #include "print-tree.h"
 #include "locking.h"
 #include "volumes.h"
+#include "qgroup.h"
 
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_path *path, int level);
@@ -1489,6 +1490,13 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
                btrfs_set_lock_blocking(parent);
        btrfs_set_lock_blocking(buf);
 
+       /*
+        * Before CoWing this block for later modification, check if it's
+        * the subtree root and do the delayed subtree trace if needed.
+        *
+        * Also, we don't care about the error, as it's handled internally.
+        */
+       btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
        ret = __btrfs_cow_block(trans, root, buf, parent,
                                 parent_slot, cow_ret, search_start, 0);
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 7166d202b26a8f4dc4b51aa4201879c319f52748..f20a09aedbc03f87ace09f10ea42624e4ae3ecbe 100644 (file)
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3968,3 +3968,91 @@ out:
                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        return ret;
 }
+
+/*
+ * Check if the tree block is a subtree root, and if so do the needed
+ * delayed subtree trace for qgroup.
+ *
+ * This is called during btrfs_cow_block().
+ */
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct extent_buffer *subvol_eb)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
+       struct btrfs_qgroup_swapped_block *block;
+       struct extent_buffer *reloc_eb = NULL;
+       struct rb_node *node;
+       bool found = false;
+       bool swapped = false;
+       int level = btrfs_header_level(subvol_eb);
+       int ret = 0;
+       int i;
+
+       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+               return 0;
+       if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
+               return 0;
+
+       spin_lock(&blocks->lock);
+       if (!blocks->swapped) {
+               spin_unlock(&blocks->lock);
+               return 0;
+       }
+       node = blocks->blocks[level].rb_node;
+
+       while (node) {
+               block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+               if (block->subvol_bytenr < subvol_eb->start) {
+                       node = node->rb_left;
+               } else if (block->subvol_bytenr > subvol_eb->start) {
+                       node = node->rb_right;
+               } else {
+                       found = true;
+                       break;
+               }
+       }
+       if (!found) {
+               spin_unlock(&blocks->lock);
+               goto out;
+       }
+       /* Found one, remove it from @blocks first and update blocks->swapped */
+       rb_erase(&block->node, &blocks->blocks[level]);
+       for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+               if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
+                       swapped = true;
+                       break;
+               }
+       }
+       blocks->swapped = swapped;
+       spin_unlock(&blocks->lock);
+
+       /* Read out reloc subtree root */
+       reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
+                                  block->reloc_generation, block->level,
+                                  &block->first_key);
+       if (IS_ERR(reloc_eb)) {
+               ret = PTR_ERR(reloc_eb);
+               reloc_eb = NULL;
+               goto free_out;
+       }
+       if (!extent_buffer_uptodate(reloc_eb)) {
+               ret = -EIO;
+               goto free_out;
+       }
+
+       ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
+                       block->last_snapshot, block->trace_leaf);
+free_out:
+       kfree(block);
+       free_extent_buffer(reloc_eb);
+out:
+       if (ret < 0) {
+               btrfs_err_rl(fs_info,
+                            "failed to account subtree at bytenr %llu: %d",
+                            subvol_eb->start, ret);
+               fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+       }
+       return ret;
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 8dc17020e5be38086568ce3786745286c9c3066e..3f5e0b413312082a52214b4fae5f69225f51e330 100644 (file)
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -416,5 +416,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
                struct extent_buffer *subvol_parent, int subvol_slot,
                struct extent_buffer *reloc_parent, int reloc_slot,
                u64 last_snapshot);
+int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
+               struct btrfs_root *root, struct extent_buffer *eb);
 
 #endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0c528918c844efbf02e58d9323c4325f094fe815..e04cf11059b51819aa31ad9e5e9186849b95d232 100644 (file)
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1889,16 +1889,12 @@ again:
                 *    If not traced, we will leak data numbers
                 * 2) Fs subtree
                 *    If not traced, we will double count old data
-                *    and tree block numbers, if current trans doesn't free
-                *    data reloc tree inode.
+                *
+                * We don't scan the subtree right now, but only record
+                * the swapped tree blocks.
+                * The real subtree rescan is delayed until we have new
+                * CoW on the subtree root node before transaction commit.
                 */
-               ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group,
-                               parent, slot, path->nodes[level],
-                               path->slots[level], last_snapshot);
-               if (ret < 0)
-                       break;
-
-               btrfs_node_key_to_cpu(parent, &first_key, slot);
                ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
                                rc->block_group, parent, slot,
                                path->nodes[level], path->slots[level],