bcachefs: Make check_key_has_snapshot safer
authorKent Overstreet <kent.overstreet@linux.dev>
Sat, 31 May 2025 17:10:43 +0000 (13:10 -0400)
committerKent Overstreet <kent.overstreet@linux.dev>
Mon, 2 Jun 2025 16:16:36 +0000 (12:16 -0400)
Snapshot deletion v2 added sentinal values for deleted snapshots, so
"key for deleted snapshot" - i.e. snapshot deletion missed something -
is safe to repair automatically.

But if we find a key for a missing snapshot we have no idea what
happened, and we shouldn't delete it unless we're very sure that
everything else is consistent.

So hook it up to the new bch2_require_recovery_pass(), we'll now only
delete if snapshots and subvolumes have recenlty been checked.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/data_update.c
fs/bcachefs/snapshot.c

index 92035859e5883d7d1963a0953d0cac78a7f4f18a..5f11743489743198895a57cbc73ab621d254da76 100644 (file)
@@ -822,16 +822,11 @@ int bch2_data_update_init(struct btree_trans *trans,
        int ret = 0;
 
        if (k.k->p.snapshot) {
-               /*
-                * We'll go ERO if we see a key for a missing snapshot, and if
-                * we're still in recovery we want to give that a chance to
-                * repair:
-                */
-               if (unlikely(test_bit(BCH_FS_in_recovery, &c->flags) &&
-                            bch2_snapshot_id_state(c, k.k->p.snapshot) == SNAPSHOT_ID_empty))
-                       return bch_err_throw(c, data_update_done_no_snapshot);
-
                ret = bch2_check_key_has_snapshot(trans, iter, k);
+               if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
+                       /* Can't repair yet, waiting on other recovery passes */
+                       return bch_err_throw(c, data_update_done_no_snapshot);
+               }
                if (ret < 0)
                        return ret;
                if (ret) /* key was deleted */
index 0649bdd5e4d23529d9e7ca66e6e33f29b3d22fb6..35dff323bfdb4f55ed09b2f67ebb46e2344931ea 100644 (file)
@@ -1045,19 +1045,39 @@ int __bch2_check_key_has_snapshot(struct btree_trans *trans,
                ret = bch2_btree_delete_at(trans, iter,
                                           BTREE_UPDATE_internal_snapshot_node) ?: 1;
 
-       /*
-        * Snapshot missing: we should have caught this with btree_lost_data and
-        * kicked off reconstruct_snapshots, so if we end up here we have no
-        * idea what happened:
-        */
-       if (fsck_err_on(state == SNAPSHOT_ID_empty,
-                       trans, bkey_in_missing_snapshot,
-                       "key in missing snapshot %s, delete?",
-                       (bch2_btree_id_to_text(&buf, iter->btree_id),
-                        prt_char(&buf, ' '),
-                        bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
-               ret = bch2_btree_delete_at(trans, iter,
-                                          BTREE_UPDATE_internal_snapshot_node) ?: 1;
+       if (state == SNAPSHOT_ID_empty) {
+               /*
+                * Snapshot missing: we should have caught this with btree_lost_data and
+                * kicked off reconstruct_snapshots, so if we end up here we have no
+                * idea what happened.
+                *
+                * Do not delete unless we know that subvolumes and snapshots
+                * are consistent:
+                *
+                * XXX:
+                *
+                * We could be smarter here, and instead of using the generic
+                * recovery pass ratelimiting, track if there have been any
+                * changes to the snapshots or inodes btrees since those passes
+                * last ran.
+                */
+               ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
+               ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;
+
+               if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
+                       ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
+
+               unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);
+
+               if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
+                            "key in missing snapshot %s, delete?",
+                            (bch2_btree_id_to_text(&buf, iter->btree_id),
+                             prt_char(&buf, ' '),
+                             bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+                       ret = bch2_btree_delete_at(trans, iter,
+                                                  BTREE_UPDATE_internal_snapshot_node) ?: 1;
+               }
+       }
 fsck_err:
        printbuf_exit(&buf);
        return ret;