bcachefs: Pin btree cache in ram for random access in fsck
authorKent Overstreet <kent.overstreet@linux.dev>
Tue, 23 Jan 2024 05:01:07 +0000 (00:01 -0500)
committerKent Overstreet <kent.overstreet@linux.dev>
Thu, 14 Mar 2024 01:22:24 +0000 (21:22 -0400)
Various phases of fsck involve checking references from one btree to
another: this means doing a sequential scan of one btree, and then
mostly random access into the second.

This is particularly painful for checking extents <-> backpointers; we
can prefetch btree node access on the sequential scan, but not on the
random access portion, and this especially hurts on spinning rust,
where we'd like to keep the pipeline fairly full of btree node reads so
that the elevator can reduce seeking.

This patch implements prefetching and pinning of the portion of the
btree that we'll be doing random access to. We already calculate how
much of the random access btree will fit in memory so it's a fairly
straightforward change.

This will put more pressure on system memory usage, so we introduce a
new option, fsck_memory_usage_percent, which is the percentage of total
system RAM that fsck is allowed to pin.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/backpointers.c
fs/bcachefs/bbpos_types.h
fs/bcachefs/btree_cache.c
fs/bcachefs/btree_types.h
fs/bcachefs/opts.h

index f9ccefc74c49d1390f6becfa077346b72284a4e8..f2b33fe40bd8e01d0656bd833a91d048028ce21c 100644 (file)
@@ -554,60 +554,61 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
        };
 }
 
-static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+static u64 mem_may_pin_bytes(struct bch_fs *c)
 {
        struct sysinfo i;
-       u64 mem_bytes;
-
        si_meminfo(&i);
-       mem_bytes = i.totalram * i.mem_unit;
-       return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
+
+       u64 mem_bytes = i.totalram * i.mem_unit;
+       return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
+}
+
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+       return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
 }
 
 static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
-                                       unsigned btree_leaf_mask,
-                                       unsigned btree_interior_mask,
+                                       u64 btree_leaf_mask,
+                                       u64 btree_interior_mask,
                                        struct bbpos start, struct bbpos *end)
 {
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
-       enum btree_id btree;
+       struct bch_fs *c = trans->c;
+       s64 mem_may_pin = mem_may_pin_bytes(c);
        int ret = 0;
 
-       for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
-               unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
+       btree_interior_mask |= btree_leaf_mask;
+
+       c->btree_cache.pinned_nodes_leaf_mask           = btree_leaf_mask;
+       c->btree_cache.pinned_nodes_interior_mask       = btree_interior_mask;
+       c->btree_cache.pinned_nodes_start               = start;
+       c->btree_cache.pinned_nodes_end                 = *end = BBPOS_MAX;
+
+       for (enum btree_id btree = start.btree;
+            btree < BTREE_ID_NR && !ret;
+            btree++) {
+               unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1;
+               struct btree_iter iter;
+               struct btree *b;
 
                if (!((1U << btree) & btree_leaf_mask) &&
                    !((1U << btree) & btree_interior_mask))
                        continue;
 
-               bch2_trans_node_iter_init(trans, &iter, btree,
-                                         btree == start.btree ? start.pos : POS_MIN,
-                                         0, depth, 0);
-               /*
-                * for_each_btree_key_contineu() doesn't check the return value
-                * from bch2_btree_iter_advance(), which is needed when
-                * iterating over interior nodes where we'll see keys at
-                * SPOS_MAX:
-                */
-               do {
-                       k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
-                       ret = bkey_err(k);
-                       if (!k.k || ret)
-                               break;
-
-                       --btree_nodes;
-                       if (!btree_nodes) {
-                               *end = BBPOS(btree, k.k->p);
+               __for_each_btree_node(trans, iter, btree,
+                                     btree == start.btree ? start.pos : POS_MIN,
+                                     0, depth, BTREE_ITER_PREFETCH, b, ret) {
+                       mem_may_pin -= btree_buf_bytes(b);
+                       if (mem_may_pin <= 0) {
+                               c->btree_cache.pinned_nodes_end = *end =
+                                       BBPOS(btree, b->key.k.p);
                                bch2_trans_iter_exit(trans, &iter);
                                return 0;
                        }
-               } while (bch2_btree_iter_advance(&iter));
+               }
                bch2_trans_iter_exit(trans, &iter);
        }
 
-       *end = BBPOS_MAX;
        return ret;
 }
 
@@ -665,62 +666,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
        return 0;
 }
 
-static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
-                                        struct bpos bucket)
-{
-       return bch2_dev_exists2(c, bucket.inode)
-               ? bucket_pos_to_bp(c, bucket, 0)
-               : bucket;
-}
-
-static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
-                                       struct bpos start, struct bpos *end)
-{
-       struct btree_iter alloc_iter;
-       struct btree_iter bp_iter;
-       struct bkey_s_c alloc_k, bp_k;
-       size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
-       bool alloc_end = false, bp_end = false;
-       int ret = 0;
-
-       bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
-                                 start, 0, 1, 0);
-       bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
-                                 bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0);
-       while (1) {
-               alloc_k = !alloc_end
-                       ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
-                       : bkey_s_c_null;
-               bp_k = !bp_end
-                       ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
-                       : bkey_s_c_null;
-
-               ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
-               if ((!alloc_k.k && !bp_k.k) || ret) {
-                       *end = SPOS_MAX;
-                       break;
-               }
-
-               --btree_nodes;
-               if (!btree_nodes) {
-                       *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
-                       break;
-               }
-
-               if (bpos_lt(alloc_iter.pos, SPOS_MAX) &&
-                   bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) {
-                       if (!bch2_btree_iter_advance(&alloc_iter))
-                               alloc_end = true;
-               } else {
-                       if (!bch2_btree_iter_advance(&bp_iter))
-                               bp_end = true;
-               }
-       }
-       bch2_trans_iter_exit(trans, &bp_iter);
-       bch2_trans_iter_exit(trans, &alloc_iter);
-       return ret;
-}
-
 int bch2_check_extents_to_backpointers(struct bch_fs *c)
 {
        struct btree_trans *trans = bch2_trans_get(c);
@@ -731,10 +676,16 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
        bkey_init(&s.last_flushed.k->k);
 
        while (1) {
-               ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
+               struct bbpos end;
+               ret = bch2_get_btree_in_memory_pos(trans,
+                               BIT_ULL(BTREE_ID_backpointers),
+                               BIT_ULL(BTREE_ID_backpointers),
+                               BBPOS(BTREE_ID_backpointers, s.bucket_start), &end);
                if (ret)
                        break;
 
+               s.bucket_end = end.pos;
+
                if ( bpos_eq(s.bucket_start, POS_MIN) &&
                    !bpos_eq(s.bucket_end, SPOS_MAX))
                        bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
@@ -762,6 +713,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
        bch2_trans_put(trans);
        bch2_bkey_buf_exit(&s.last_flushed, c);
 
+       c->btree_cache.pinned_nodes_leaf_mask = 0;
+       c->btree_cache.pinned_nodes_interior_mask = 0;
+
        bch_err_fn(c, ret);
        return ret;
 }
@@ -867,6 +821,9 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
        }
        bch2_trans_put(trans);
 
+       c->btree_cache.pinned_nodes_leaf_mask = 0;
+       c->btree_cache.pinned_nodes_interior_mask = 0;
+
        bch_err_fn(c, ret);
        return ret;
 }
index 5198e94cf3b89c09f88c1304bc1aa2ff5f7cc35a..f63893344f80aa721554ba3f95124cfc824edbee 100644 (file)
@@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
 }
 
 #define BBPOS_MIN      BBPOS(0, POS_MIN)
-#define BBPOS_MAX      BBPOS(BTREE_ID_NR - 1, POS_MAX)
+#define BBPOS_MAX      BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
 
 #endif /* _BCACHEFS_BBPOS_TYPES_H */
index e8665258360e7c716452b9dba68cee129fd0fbc9..562561a9a510e8ce55cdee26a9b064d4c07cf02d 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "bbpos.h"
 #include "bkey_buf.h"
 #include "btree_cache.h"
 #include "btree_io.h"
@@ -208,6 +209,18 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
        int ret = 0;
 
        lockdep_assert_held(&bc->lock);
+
+       struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+       u64 mask = b->c.level
+               ? bc->pinned_nodes_interior_mask
+               : bc->pinned_nodes_leaf_mask;
+
+       if ((mask & BIT_ULL(b->c.btree_id)) &&
+           bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+           bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
+               return -BCH_ERR_ENOMEM_btree_node_reclaim;
+
 wait_on_io:
        if (b->flags & ((1U << BTREE_NODE_dirty)|
                        (1U << BTREE_NODE_read_in_flight)|
index d4f4e52ee6ede2ce119eecc549a0485efdec38c6..9404d96c38f3b368726a6603b601b241b5106100 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/list.h>
 #include <linux/rhashtable.h>
 
+#include "bbpos_types.h"
 #include "btree_key_cache_types.h"
 #include "buckets_types.h"
 #include "darray.h"
@@ -173,6 +174,11 @@ struct btree_cache {
         */
        struct task_struct      *alloc_lock;
        struct closure_waitlist alloc_wait;
+
+       struct bbpos            pinned_nodes_start;
+       struct bbpos            pinned_nodes_end;
+       u64                     pinned_nodes_leaf_mask;
+       u64                     pinned_nodes_interior_mask;
 };
 
 struct btree_node_iter {
index 9a83aca31b4b2a994767d1afd16e58baead0be58..136083c11f3a3aecc575501c33c0b3868f38113f 100644 (file)
@@ -337,6 +337,11 @@ enum fsck_err_opts {
          OPT_BOOL(),                                                   \
          BCH2_NO_SB_OPT,               false,                          \
          NULL,         "Run fsck on mount")                            \
+       x(fsck_memory_usage_percent,    u8,                             \
+         OPT_FS|OPT_MOUNT,                                             \
+         OPT_UINT(20, 70),                                             \
+         BCH2_NO_SB_OPT,               50,                             \
+         NULL,         "Maximum percentage of system ram fsck is allowed to pin")\
        x(fix_errors,                   u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
          OPT_FN(bch2_opt_fix_errors),                                  \