bcachefs: Pass inode bkey to check_path()
[linux-block.git] / fs / bcachefs / fsck.c
CommitLineData
1c6fdbd8
KO
1// SPDX-License-Identifier: GPL-2.0
2
3#include "bcachefs.h"
07a1006a 4#include "bkey_buf.h"
88dfe193 5#include "btree_cache.h"
1c6fdbd8 6#include "btree_update.h"
e3dc75eb 7#include "buckets.h"
91d961ba 8#include "darray.h"
1c6fdbd8
KO
9#include "dirent.h"
10#include "error.h"
96385742 11#include "fs-common.h"
1c6fdbd8
KO
12#include "fsck.h"
13#include "inode.h"
14#include "keylist.h"
ad52bac2 15#include "recovery.h"
8e877caa 16#include "snapshot.h"
1c6fdbd8
KO
17#include "super.h"
18#include "xattr.h"
19
fc51b041 20#include <linux/bsearch.h>
1c6fdbd8 21#include <linux/dcache.h> /* struct qstr */
1c6fdbd8 22
42590b53
KO
23/*
24 * XXX: this is handling transaction restarts without returning
25 * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
26 */
ef1669ff
KO
27static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
28 u32 snapshot)
424eb881 29{
424eb881
KO
30 u64 sectors = 0;
31
44ddd8ad
KO
32 int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
33 SPOS(inum, 0, snapshot),
34 POS(inum, U64_MAX),
35 0, k, ({
424eb881
KO
36 if (bkey_extent_is_allocation(k.k))
37 sectors += k.k->size;
44ddd8ad
KO
38 0;
39 }));
94f651e2
KO
40
41 return ret ?: sectors;
424eb881
KO
42}
43
ef1669ff
KO
44static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45 u32 snapshot)
46{
ef1669ff 47 u64 subdirs = 0;
ef1669ff 48
44ddd8ad 49 int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
3a860b5a
KO
50 SPOS(inum, 0, snapshot),
51 POS(inum, U64_MAX),
44ddd8ad
KO
52 0, k, ({
53 if (k.k->type == KEY_TYPE_dirent &&
54 bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
ef1669ff 55 subdirs++;
44ddd8ad
KO
56 0;
57 }));
ef1669ff
KO
58
59 return ret ?: subdirs;
60}
61
c98d132e
KO
62static int subvol_lookup(struct btree_trans *trans, u32 subvol,
63 u32 *snapshot, u64 *inum)
81ed9ce3 64{
97996ddf 65 struct bch_subvolume s;
81ed9ce3
KO
66 int ret;
67
97996ddf 68 ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
81ed9ce3 69
97996ddf
KO
70 *snapshot = le32_to_cpu(s.snapshot);
71 *inum = le64_to_cpu(s.inode);
81ed9ce3 72 return ret;
81ed9ce3
KO
73}
74
c27314b4
KO
75static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
76 struct bch_inode_unpacked *inode)
77{
78 struct btree_iter iter;
79 struct bkey_s_c k;
80 int ret;
81
82 bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
83 POS(0, inode_nr),
84 BTREE_ITER_ALL_SNAPSHOTS);
85 k = bch2_btree_iter_peek(&iter);
86 ret = bkey_err(k);
87 if (ret)
88 goto err;
89
e88a75eb 90 if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
e47a390a 91 ret = -BCH_ERR_ENOENT_inode;
c27314b4
KO
92 goto err;
93 }
94
3e52c222 95 ret = bch2_inode_unpack(k, inode);
c27314b4 96err:
d2a990d1 97 bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
c27314b4
KO
98 bch2_trans_iter_exit(trans, &iter);
99 return ret;
100}
101
c98d132e 102static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
23f25223
KO
103 struct bch_inode_unpacked *inode,
104 u32 *snapshot)
8a85b20c 105{
67e0dd8f 106 struct btree_iter iter;
8a85b20c
KO
107 struct bkey_s_c k;
108 int ret;
109
bcb79a51
KO
110 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
111 SPOS(0, inode_nr, *snapshot), 0);
8a85b20c
KO
112 ret = bkey_err(k);
113 if (ret)
114 goto err;
115
3e52c222
KO
116 ret = bkey_is_inode(k.k)
117 ? bch2_inode_unpack(k, inode)
e47a390a 118 : -BCH_ERR_ENOENT_inode;
4db65027
KO
119 if (!ret)
120 *snapshot = iter.pos.snapshot;
8a85b20c 121err:
67e0dd8f 122 bch2_trans_iter_exit(trans, &iter);
8a85b20c
KO
123 return ret;
124}
125
d2fda304 126static int lookup_dirent_in_snapshot(struct btree_trans *trans,
ef1669ff
KO
127 struct bch_hash_info hash_info,
128 subvol_inum dir, struct qstr *name,
d2fda304 129 u64 *target, unsigned *type, u32 snapshot)
ef1669ff
KO
130{
131 struct btree_iter iter;
132 struct bkey_s_c_dirent d;
d2fda304
KO
133 int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
134 &hash_info, dir, name, 0, snapshot);
ef1669ff
KO
135 if (ret)
136 return ret;
137
138 d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
139 *target = le64_to_cpu(d.v->d_inum);
140 *type = d.v->d_type;
141 bch2_trans_iter_exit(trans, &iter);
142 return 0;
143}
144
ae8bbb9f 145static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
1c6fdbd8 146{
0f238367 147 struct bch_fs *c = trans->c;
67e0dd8f 148 struct btree_iter iter;
1c6fdbd8
KO
149 struct bch_inode_unpacked dir_inode;
150 struct bch_hash_info dir_hash_info;
1c6fdbd8 151 int ret;
1c6fdbd8 152
c27314b4 153 ret = lookup_first_inode(trans, pos.inode, &dir_inode);
b1fd23df 154 if (ret)
e492e7b6 155 goto err;
1c6fdbd8
KO
156
157 dir_hash_info = bch2_hash_info_init(c, &dir_inode);
158
67e0dd8f 159 bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
b1fd23df 160
ae8bbb9f 161 ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
41fc8622
KO
162 &dir_hash_info, &iter,
163 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
67e0dd8f 164 bch2_trans_iter_exit(trans, &iter);
e492e7b6 165err:
d2a990d1 166 bch_err_fn(c, ret);
ae8bbb9f 167 return ret;
b1fd23df
KO
168}
169
58686a25 170/* Get lost+found, create if it doesn't exist: */
d296e7b1 171static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
58686a25 172 struct bch_inode_unpacked *lostfound)
1c6fdbd8 173{
58686a25 174 struct bch_fs *c = trans->c;
58686a25 175 struct qstr lostfound_str = QSTR("lost+found");
ef1669ff
KO
176 u64 inum = 0;
177 unsigned d_type = 0;
58686a25
KO
178 int ret;
179
d296e7b1
KO
180 struct bch_snapshot_tree st;
181 ret = bch2_snapshot_tree_lookup(trans,
182 bch2_snapshot_tree(c, snapshot), &st);
ef1669ff
KO
183 if (ret)
184 return ret;
81ed9ce3 185
d296e7b1
KO
186 subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
187 u32 subvol_snapshot;
188
c98d132e
KO
189 ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
190 &subvol_snapshot, &root_inum.inum);
d296e7b1 191 bch_err_msg(c, ret, "looking up root subvol");
285b181a 192 if (ret)
58686a25
KO
193 return ret;
194
d296e7b1
KO
195 struct bch_inode_unpacked root_inode;
196 struct bch_hash_info root_hash_info;
d2fda304
KO
197 u32 root_inode_snapshot = snapshot;
198 ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
c98d132e 199 bch_err_msg(c, ret, "looking up root inode");
d296e7b1
KO
200 if (ret)
201 return ret;
202
203 root_hash_info = bch2_hash_info_init(c, &root_inode);
ef1669ff 204
d2fda304
KO
205 ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
206 &lostfound_str, &inum, &d_type, snapshot);
d296e7b1 207 if (bch2_err_matches(ret, ENOENT))
58686a25 208 goto create_lostfound;
58686a25 209
d2a990d1 210 bch_err_fn(c, ret);
285b181a 211 if (ret)
ef1669ff 212 return ret;
ef1669ff
KO
213
214 if (d_type != DT_DIR) {
215 bch_err(c, "error looking up lost+found: not a directory");
40a53b92 216 return -BCH_ERR_ENOENT_not_directory;
ef1669ff
KO
217 }
218
285b181a 219 /*
067d228b 220 * The bch2_check_dirents pass has already run, dangling dirents
285b181a
KO
221 * shouldn't exist here:
222 */
d2fda304
KO
223 ret = lookup_inode(trans, inum, lostfound, &snapshot);
224 bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
225 inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
226 return ret;
58686a25 227
58686a25 228create_lostfound:
d296e7b1
KO
229 /*
230 * XXX: we could have a nicer log message here if we had a nice way to
231 * walk backpointers to print a path
232 */
233 bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
234
235 u64 now = bch2_current_time(c);
236 struct btree_iter lostfound_iter = { NULL };
237 u64 cpu = raw_smp_processor_id();
238
285b181a 239 bch2_inode_init_early(c, lostfound);
d296e7b1
KO
240 bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
241 lostfound->bi_dir = root_inode.bi_inum;
242
243 root_inode.bi_nlink++;
244
245 ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
246 if (ret)
247 goto err;
285b181a 248
d296e7b1
KO
249 bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
250 ret = bch2_btree_iter_traverse(&lostfound_iter);
251 if (ret)
252 goto err;
253
254 ret = bch2_dirent_create_snapshot(trans,
255 root_inode.bi_inum, snapshot, &root_hash_info,
256 mode_to_type(lostfound->bi_mode),
257 &lostfound_str,
258 lostfound->bi_inum,
259 &lostfound->bi_dir_offset,
260 BCH_HASH_SET_MUST_CREATE) ?:
261 bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
262 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
263err:
d2a990d1 264 bch_err_msg(c, ret, "creating lost+found");
d296e7b1 265 bch2_trans_iter_exit(trans, &lostfound_iter);
285b181a 266 return ret;
58686a25
KO
267}
268
c98d132e 269static int reattach_inode(struct btree_trans *trans,
81ed9ce3 270 struct bch_inode_unpacked *inode,
ef1669ff 271 u32 inode_snapshot)
58686a25
KO
272{
273 struct bch_hash_info dir_hash;
274 struct bch_inode_unpacked lostfound;
1c6fdbd8
KO
275 char name_buf[20];
276 struct qstr name;
176cf4bf 277 u64 dir_offset = 0;
1c6fdbd8
KO
278 int ret;
279
d296e7b1 280 ret = lookup_lostfound(trans, inode_snapshot, &lostfound);
176cf4bf 281 if (ret)
d3ff7fec 282 return ret;
176cf4bf 283
58686a25
KO
284 if (S_ISDIR(inode->bi_mode)) {
285 lostfound.bi_nlink++;
176cf4bf 286
69c8e6ce 287 ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
176cf4bf 288 if (ret)
d3ff7fec 289 return ret;
176cf4bf
KO
290 }
291
58686a25 292 dir_hash = bch2_hash_info_init(trans->c, &lostfound);
176cf4bf 293
58686a25
KO
294 snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
295 name = (struct qstr) QSTR(name_buf);
176cf4bf 296
d296e7b1
KO
297 ret = bch2_dirent_create_snapshot(trans,
298 lostfound.bi_inum, inode_snapshot,
299 &dir_hash,
300 inode_d_type(inode),
301 &name, inode->bi_inum, &dir_offset,
302 BCH_HASH_SET_MUST_CREATE);
285b181a
KO
303 if (ret)
304 return ret;
305
306 inode->bi_dir = lostfound.bi_inum;
307 inode->bi_dir_offset = dir_offset;
308
69c8e6ce 309 return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
285b181a
KO
310}
311
d3ff7fec
KO
312static int remove_backpointer(struct btree_trans *trans,
313 struct bch_inode_unpacked *inode)
314{
67e0dd8f 315 struct btree_iter iter;
bcb79a51 316 struct bkey_s_c_dirent d;
d3ff7fec
KO
317 int ret;
318
bcb79a51
KO
319 d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
320 POS(inode->bi_dir, inode->bi_dir_offset), 0,
321 dirent);
322 ret = bkey_err(d) ?:
323 __remove_dirent(trans, d.k->p);
67e0dd8f 324 bch2_trans_iter_exit(trans, &iter);
d3ff7fec
KO
325 return ret;
326}
327
49124d8a
KO
328struct snapshots_seen_entry {
329 u32 id;
330 u32 equiv;
331};
332
333struct snapshots_seen {
334 struct bpos pos;
335 DARRAY(struct snapshots_seen_entry) ids;
336};
337
338static inline void snapshots_seen_exit(struct snapshots_seen *s)
339{
340 darray_exit(&s->ids);
341}
342
343static inline void snapshots_seen_init(struct snapshots_seen *s)
344{
345 memset(s, 0, sizeof(*s));
346}
347
e2bd0617
KO
348static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
349{
350 struct snapshots_seen_entry *i, n = {
351 .id = id,
352 .equiv = bch2_snapshot_equiv(c, id),
353 };
354 int ret = 0;
355
defd9e39 356 __darray_for_each(s->ids, i) {
e2bd0617
KO
357 if (i->id == id)
358 return 0;
359 if (i->id > id)
360 break;
361 }
362
363 ret = darray_insert_item(&s->ids, i - s->ids.data, n);
364 if (ret)
365 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
366 s->ids.size);
367 return ret;
368}
369
49124d8a
KO
370static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
371 enum btree_id btree_id, struct bpos pos)
ef1669ff 372{
defd9e39 373 struct snapshots_seen_entry n = {
49124d8a
KO
374 .id = pos.snapshot,
375 .equiv = bch2_snapshot_equiv(c, pos.snapshot),
376 };
c7be3cb5 377 int ret = 0;
ef1669ff 378
e88a75eb 379 if (!bkey_eq(s->pos, pos))
91d961ba 380 s->ids.nr = 0;
49124d8a 381
ef1669ff 382 s->pos = pos;
6b20d746 383 s->pos.snapshot = n.equiv;
ef1669ff 384
6b20d746
KO
385 darray_for_each(s->ids, i) {
386 if (i->id == n.id)
49124d8a 387 return 0;
6b20d746
KO
388
389 /*
390 * We currently don't rigorously track for snapshot cleanup
391 * needing to be run, so it shouldn't be a fsck error yet:
392 */
393 if (i->equiv == n.equiv) {
394 bch_err(c, "snapshot deletion did not finish:\n"
395 " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
88dfe193 396 bch2_btree_id_str(btree_id),
6b20d746
KO
397 pos.inode, pos.offset,
398 i->id, n.id, n.equiv);
3c471b65 399 set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
ae2e13d7 400 return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
49124d8a 401 }
6b20d746 402 }
49124d8a
KO
403
404 ret = darray_push(&s->ids, n);
405 if (ret)
406 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
407 s->ids.size);
408 return ret;
ef1669ff
KO
409}
410
411/**
412 * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
413 * and @ancestor hasn't been overwritten in @seen
414 *
96dea3d5
KO
415 * @c: filesystem handle
416 * @seen: list of snapshot ids already seen at current position
417 * @id: descendent snapshot id
418 * @ancestor: ancestor snapshot id
419 *
420 * Returns: whether key in @ancestor snapshot is visible in @id snapshot
ef1669ff
KO
421 */
422static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
423 u32 id, u32 ancestor)
424{
425 ssize_t i;
426
464ee192
KO
427 EBUG_ON(id > ancestor);
428 EBUG_ON(!bch2_snapshot_is_equiv(c, id));
429 EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
ef1669ff
KO
430
431 /* @ancestor should be the snapshot most recently added to @seen */
464ee192
KO
432 EBUG_ON(ancestor != seen->pos.snapshot);
433 EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
ef1669ff
KO
434
435 if (id == ancestor)
436 return true;
437
438 if (!bch2_snapshot_is_ancestor(c, id, ancestor))
439 return false;
440
464ee192
KO
441 /*
442 * We know that @id is a descendant of @ancestor, we're checking if
443 * we've seen a key that overwrote @ancestor - i.e. also a descendent of
444 * @ascestor and with @id as a descendent.
445 *
446 * But we already know that we're scanning IDs between @id and @ancestor
447 * numerically, since snapshot ID lists are kept sorted, so if we find
448 * an id that's an ancestor of @id we're done:
449 */
450
91d961ba 451 for (i = seen->ids.nr - 2;
49124d8a 452 i >= 0 && seen->ids.data[i].equiv >= id;
ef1669ff 453 --i)
464ee192 454 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
ef1669ff
KO
455 return false;
456
457 return true;
458}
459
460/**
461 * ref_visible - given a key with snapshot id @src that points to a key with
462 * snapshot id @dst, test whether there is some snapshot in which @dst is
463 * visible.
464 *
96dea3d5
KO
465 * @c: filesystem handle
466 * @s: list of snapshot IDs already seen at @src
467 * @src: snapshot ID of src key
468 * @dst: snapshot ID of dst key
469 * Returns: true if there is some snapshot in which @dst is visible
ef1669ff 470 *
96dea3d5 471 * Assumes we're visiting @src keys in natural key order
ef1669ff 472 */
96dea3d5
KO
473static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
474 u32 src, u32 dst)
ef1669ff
KO
475{
476 return dst <= src
477 ? key_visible_in_snapshot(c, s, dst, src)
478 : bch2_snapshot_is_ancestor(c, src, dst);
479}
480
c58029ec
DH
481static int ref_visible2(struct bch_fs *c,
482 u32 src, struct snapshots_seen *src_seen,
483 u32 dst, struct snapshots_seen *dst_seen)
484{
485 src = bch2_snapshot_equiv(c, src);
486 dst = bch2_snapshot_equiv(c, dst);
487
488 if (dst > src) {
489 swap(dst, src);
490 swap(dst_seen, src_seen);
491 }
492 return key_visible_in_snapshot(c, src_seen, dst, src);
493}
494
49124d8a
KO
495#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
496 for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
497 (_i)->snapshot <= (_snapshot); _i++) \
ef1669ff
KO
498 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
499
91d961ba
KO
500struct inode_walker_entry {
501 struct bch_inode_unpacked inode;
502 u32 snapshot;
f9f52bc4 503 bool seen_this_pos;
91d961ba
KO
504 u64 count;
505};
506
1c6fdbd8 507struct inode_walker {
ef1669ff 508 bool first_this_inode;
43b81a4e 509 bool recalculate_sums;
f9f52bc4 510 struct bpos last_pos;
ef1669ff 511
91d961ba 512 DARRAY(struct inode_walker_entry) inodes;
1c6fdbd8
KO
513};
514
ef1669ff
KO
515static void inode_walker_exit(struct inode_walker *w)
516{
91d961ba 517 darray_exit(&w->inodes);
ef1669ff
KO
518}
519
1c6fdbd8
KO
520static struct inode_walker inode_walker_init(void)
521{
ef1669ff
KO
522 return (struct inode_walker) { 0, };
523}
524
ef1669ff 525static int add_inode(struct bch_fs *c, struct inode_walker *w,
3e52c222 526 struct bkey_s_c inode)
ef1669ff
KO
527{
528 struct bch_inode_unpacked u;
ef1669ff
KO
529
530 BUG_ON(bch2_inode_unpack(inode, &u));
531
91d961ba 532 return darray_push(&w->inodes, ((struct inode_walker_entry) {
ef1669ff 533 .inode = u,
49124d8a 534 .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot),
91d961ba 535 }));
1c6fdbd8
KO
536}
537
06dcca51
KO
538static int get_inodes_all_snapshots(struct btree_trans *trans,
539 struct inode_walker *w, u64 inum)
1c6fdbd8 540{
ef1669ff
KO
541 struct bch_fs *c = trans->c;
542 struct btree_iter iter;
543 struct bkey_s_c k;
ef1669ff 544 int ret;
1c6fdbd8 545
43b81a4e 546 w->recalculate_sums = false;
91d961ba 547 w->inodes.nr = 0;
ef1669ff 548
27b2df98
KO
549 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
550 BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
06dcca51 551 if (k.k->p.offset != inum)
ef1669ff
KO
552 break;
553
3e52c222
KO
554 if (bkey_is_inode(k.k))
555 add_inode(c, w, k);
ef1669ff
KO
556 }
557 bch2_trans_iter_exit(trans, &iter);
558
559 if (ret)
560 return ret;
561
f9f52bc4 562 w->first_this_inode = true;
27b2df98 563 return 0;
06dcca51
KO
564}
565
566static struct inode_walker_entry *
971a1503 567lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
06dcca51 568{
971a1503
KO
569 bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
570 u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
06dcca51 571
971a1503 572 struct inode_walker_entry *i;
defd9e39 573 __darray_for_each(w->inodes, i)
06dcca51 574 if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
ef1669ff 575 goto found;
06dcca51
KO
576
577 return NULL;
ef1669ff 578found:
06dcca51 579 BUG_ON(snapshot > i->snapshot);
ef1669ff 580
a57f4d61 581 if (snapshot != i->snapshot && !is_whiteout) {
06dcca51 582 struct inode_walker_entry new = *i;
49124d8a 583
06dcca51
KO
584 new.snapshot = snapshot;
585 new.count = 0;
49124d8a 586
971a1503
KO
587 struct printbuf buf = PRINTBUF;
588 bch2_bkey_val_to_text(&buf, c, k);
589
590 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
591 "unexpected because we should always update the inode when we update a key in that inode\n"
592 "%s",
593 w->last_pos.inode, snapshot, i->snapshot, buf.buf);
594 printbuf_exit(&buf);
ef1669ff 595
06dcca51 596 while (i > w->inodes.data && i[-1].snapshot > snapshot)
ef1669ff
KO
597 --i;
598
971a1503
KO
599 size_t pos = i - w->inodes.data;
600 int ret = darray_insert_item(&w->inodes, pos, new);
ef1669ff 601 if (ret)
06dcca51 602 return ERR_PTR(ret);
20e6d9a8
KO
603
604 i = w->inodes.data + pos;
ef1669ff
KO
605 }
606
607 return i;
1c6fdbd8
KO
608}
609
06dcca51 610static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
971a1503
KO
611 struct inode_walker *w,
612 struct bkey_s_c k)
06dcca51 613{
971a1503
KO
614 if (w->last_pos.inode != k.k->p.inode) {
615 int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
f9f52bc4
KO
616 if (ret)
617 return ERR_PTR(ret);
971a1503 618 } else if (bkey_cmp(w->last_pos, k.k->p)) {
f9f52bc4
KO
619 darray_for_each(w->inodes, i)
620 i->seen_this_pos = false;
f9f52bc4
KO
621 }
622
971a1503 623 w->last_pos = k.k->p;
06dcca51 624
971a1503 625 return lookup_inode_for_snapshot(trans->c, w, k);
06dcca51
KO
626}
627
ef1669ff
KO
628static int __get_visible_inodes(struct btree_trans *trans,
629 struct inode_walker *w,
630 struct snapshots_seen *s,
631 u64 inum)
632{
633 struct bch_fs *c = trans->c;
634 struct btree_iter iter;
635 struct bkey_s_c k;
636 int ret;
637
91d961ba 638 w->inodes.nr = 0;
ef1669ff 639
12043cf1 640 for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
ef1669ff 641 BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
49124d8a
KO
642 u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
643
ef1669ff
KO
644 if (k.k->p.offset != inum)
645 break;
646
49124d8a 647 if (!ref_visible(c, s, s->pos.snapshot, equiv))
ef1669ff
KO
648 continue;
649
49124d8a 650 if (bkey_is_inode(k.k))
3e52c222 651 add_inode(c, w, k);
49124d8a
KO
652
653 if (equiv >= s->pos.snapshot)
654 break;
ef1669ff
KO
655 }
656 bch2_trans_iter_exit(trans, &iter);
657
658 return ret;
659}
660
661static int check_key_has_snapshot(struct btree_trans *trans,
662 struct btree_iter *iter,
663 struct bkey_s_c k)
664{
665 struct bch_fs *c = trans->c;
fa8e94fa 666 struct printbuf buf = PRINTBUF;
ef1669ff
KO
667 int ret = 0;
668
49124d8a 669 if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
b65db750
KO
670 bkey_in_missing_snapshot,
671 "key in missing snapshot: %s",
672 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
fa8e94fa 673 ret = bch2_btree_delete_at(trans, iter,
285b181a 674 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
ef1669ff 675fsck_err:
fa8e94fa 676 printbuf_exit(&buf);
ef1669ff 677 return ret;
914f2786
KO
678}
679
7ac2c55e
KO
680static int hash_redo_key(struct btree_trans *trans,
681 const struct bch_hash_desc desc,
682 struct bch_hash_info *hash_info,
683 struct btree_iter *k_iter, struct bkey_s_c k)
1c6fdbd8 684{
e3b4b48c 685 struct bkey_i *delete;
1c6fdbd8 686 struct bkey_i *tmp;
1c6fdbd8 687
e3b4b48c
KO
688 delete = bch2_trans_kmalloc(trans, sizeof(*delete));
689 if (IS_ERR(delete))
690 return PTR_ERR(delete);
691
dbda63bb 692 tmp = bch2_bkey_make_mut_noupdate(trans, k);
b1fd23df
KO
693 if (IS_ERR(tmp))
694 return PTR_ERR(tmp);
1c6fdbd8 695
e3b4b48c
KO
696 bkey_init(&delete->k);
697 delete->k.p = k_iter->pos;
8c3f6da9
KO
698 return bch2_btree_iter_traverse(k_iter) ?:
699 bch2_trans_update(trans, k_iter, delete, 0) ?:
23f25223 700 bch2_hash_set_in_snapshot(trans, desc, hash_info,
5877d887
KO
701 (subvol_inum) { 0, k.k->p.inode },
702 k.k->p.snapshot, tmp,
703 BCH_HASH_SET_MUST_CREATE,
704 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
3f0e297d 705 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
1c6fdbd8
KO
706}
707
7ac2c55e
KO
708static int hash_check_key(struct btree_trans *trans,
709 const struct bch_hash_desc desc,
710 struct bch_hash_info *hash_info,
711 struct btree_iter *k_iter, struct bkey_s_c hash_k)
d69f41d6 712{
424eb881 713 struct bch_fs *c = trans->c;
67e0dd8f 714 struct btree_iter iter = { NULL };
fa8e94fa 715 struct printbuf buf = PRINTBUF;
7ac2c55e
KO
716 struct bkey_s_c k;
717 u64 hash;
d69f41d6
KO
718 int ret = 0;
719
7ac2c55e
KO
720 if (hash_k.k->type != desc.key_type)
721 return 0;
722
723 hash = desc.hash_bkey(hash_info, hash_k);
724
725 if (likely(hash == hash_k.k->p.offset))
d69f41d6
KO
726 return 0;
727
7ac2c55e
KO
728 if (hash_k.k->p.offset < hash)
729 goto bad_hash;
d69f41d6 730
d8f31407 731 for_each_btree_key_norestart(trans, iter, desc.btree_id,
419fc65f 732 SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
d8f31407 733 BTREE_ITER_SLOTS, k, ret) {
e88a75eb 734 if (bkey_eq(k.k->p, hash_k.k->p))
d69f41d6
KO
735 break;
736
7ac2c55e
KO
737 if (fsck_err_on(k.k->type == desc.key_type &&
738 !desc.cmp_bkey(k, hash_k), c,
b65db750 739 hash_table_key_duplicate,
d69f41d6 740 "duplicate hash table keys:\n%s",
fa8e94fa
KO
741 (printbuf_reset(&buf),
742 bch2_bkey_val_to_text(&buf, c, hash_k),
743 buf.buf))) {
285b181a 744 ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
d69f41d6
KO
745 break;
746 }
d69f41d6 747
7ac2c55e 748 if (bkey_deleted(k.k)) {
67e0dd8f 749 bch2_trans_iter_exit(trans, &iter);
7ac2c55e 750 goto bad_hash;
1c6fdbd8 751 }
7ac2c55e 752 }
fa8e94fa 753out:
67e0dd8f 754 bch2_trans_iter_exit(trans, &iter);
fa8e94fa 755 printbuf_exit(&buf);
1c6fdbd8 756 return ret;
7ac2c55e 757bad_hash:
b65db750
KO
758 if (fsck_err(c, hash_table_key_wrong_offset,
759 "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
88dfe193 760 bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
fa8e94fa 761 (printbuf_reset(&buf),
1ed0a5d2
KO
762 bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
763 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
d2a990d1 764 bch_err_fn(c, ret);
c8d5b714 765 if (ret)
1ed0a5d2 766 return ret;
1ed0a5d2 767 ret = -BCH_ERR_transaction_restart_nested;
741daa5b 768 }
741daa5b 769fsck_err:
fa8e94fa 770 goto out;
741daa5b
KO
771}
772
0b17618f
KO
773static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
774 struct btree_iter *iter,
775 struct bpos pos)
776{
777 return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
778}
779
780static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
781 struct btree_iter *iter,
782 struct bch_inode_unpacked *inode,
783 u32 *snapshot)
784{
785 if (inode->bi_subvol) {
786 u64 inum;
787 int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
788 if (ret)
789 return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
790 }
791
792 return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
793}
794
795static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
796 struct bkey_s_c_dirent d)
797{
798 return inode->bi_dir == d.k->p.inode &&
799 inode->bi_dir_offset == d.k->p.offset;
800}
801
802static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
803 struct bch_inode_unpacked *inode)
804{
805 return d.v->d_type == DT_SUBVOL
806 ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
807 : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
808}
809
359d1bad
KO
810static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
811{
812 struct btree_iter iter;
813 struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
52f3a72f 814 int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
359d1bad 815 bch2_trans_iter_exit(trans, &iter);
52f3a72f 816 return ret;
359d1bad
KO
817}
818
0b17618f
KO
819static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
820 struct bch_inode_unpacked *inode,
821 u32 inode_snapshot, bool *write_inode)
822{
823 struct bch_fs *c = trans->c;
824 struct printbuf buf = PRINTBUF;
825
826 struct btree_iter dirent_iter = {};
827 struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
828 int ret = bkey_err(d);
829 if (ret && !bch2_err_matches(ret, ENOENT))
830 return ret;
831
832 if (fsck_err_on(ret,
833 c, inode_points_to_missing_dirent,
834 "inode points to missing dirent\n%s",
835 (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
836 fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
837 c, inode_points_to_wrong_dirent,
838 "inode points to dirent that does not point back:\n%s",
839 (bch2_bkey_val_to_text(&buf, c, inode_k),
840 prt_newline(&buf),
841 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
842 /*
843 * We just clear the backpointer fields for now. If we find a
844 * dirent that points to this inode in check_dirents(), we'll
845 * update it then; then when we get to check_path() if the
846 * backpointer is still 0 we'll reattach it.
847 */
848 inode->bi_dir = 0;
849 inode->bi_dir_offset = 0;
850 inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
851 *write_inode = true;
852 }
853
854 ret = 0;
855fsck_err:
856 bch2_trans_iter_exit(trans, &dirent_iter);
857 printbuf_exit(&buf);
858 bch_err_fn(c, ret);
859 return ret;
860}
861
5c16add5
KO
862static int check_inode(struct btree_trans *trans,
863 struct btree_iter *iter,
a1783320 864 struct bkey_s_c k,
ef1669ff 865 struct bch_inode_unpacked *prev,
49124d8a 866 struct snapshots_seen *s,
285b181a 867 bool full)
5c16add5
KO
868{
869 struct bch_fs *c = trans->c;
285b181a 870 struct bch_inode_unpacked u;
5c16add5 871 bool do_update = false;
285b181a
KO
872 int ret;
873
285b181a 874 ret = check_key_has_snapshot(trans, iter, k);
e492e7b6
KO
875 if (ret < 0)
876 goto err;
285b181a 877 if (ret)
e492e7b6 878 return 0;
285b181a 879
49124d8a
KO
880 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
881 if (ret)
882 goto err;
883
3e52c222 884 if (!bkey_is_inode(k.k))
285b181a
KO
885 return 0;
886
3e52c222 887 BUG_ON(bch2_inode_unpack(k, &u));
285b181a
KO
888
889 if (!full &&
103ffe9a
KO
890 !(u.bi_flags & (BCH_INODE_i_size_dirty|
891 BCH_INODE_i_sectors_dirty|
892 BCH_INODE_unlinked)))
285b181a
KO
893 return 0;
894
285b181a
KO
895 if (prev->bi_inum != u.bi_inum)
896 *prev = u;
897
898 if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
b65db750
KO
899 inode_d_type(prev) != inode_d_type(&u),
900 c, inode_snapshot_mismatch,
ef1669ff
KO
901 "inodes in different snapshots don't match")) {
902 bch_err(c, "repair not implemented yet");
359d1bad 903 return -BCH_ERR_fsck_repair_unimplemented;
ef1669ff 904 }
5c16add5 905
103ffe9a 906 if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
a111901f
KO
907 bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
908 struct bpos new_min_pos;
909
910 ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
911 if (ret)
912 goto err;
913
103ffe9a 914 u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
a111901f 915
69c8e6ce
KO
916 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
917
d2a990d1
KO
918 bch_err_msg(c, ret, "in fsck updating inode");
919 if (ret)
a111901f 920 return ret;
a111901f
KO
921
922 if (!bpos_eq(new_min_pos, POS_MIN))
923 bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
924 return 0;
925 }
926
074cbcda 927 if (u.bi_flags & BCH_INODE_unlinked) {
359d1bad
KO
928 ret = check_inode_deleted_list(trans, k.k->p);
929 if (ret < 0)
930 return ret;
931
52f3a72f 932 fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
359d1bad
KO
933 "inode %llu:%u unlinked, but not on deleted list",
934 u.bi_inum, k.k->p.snapshot);
935 ret = 0;
936 }
937
103ffe9a 938 if (u.bi_flags & BCH_INODE_unlinked &&
5c16add5 939 (!c->sb.clean ||
b65db750
KO
940 fsck_err(c, inode_unlinked_but_clean,
941 "filesystem marked clean, but inode %llu unlinked",
5c16add5 942 u.bi_inum))) {
7904c82c 943 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
d2a990d1 944 bch_err_msg(c, ret, "in fsck deleting inode");
5c16add5
KO
945 return ret;
946 }
947
103ffe9a 948 if (u.bi_flags & BCH_INODE_i_size_dirty &&
5c16add5 949 (!c->sb.clean ||
b65db750
KO
950 fsck_err(c, inode_i_size_dirty_but_clean,
951 "filesystem marked clean, but inode %llu has i_size dirty",
5c16add5
KO
952 u.bi_inum))) {
953 bch_verbose(c, "truncating inode %llu", u.bi_inum);
954
5c16add5
KO
955 /*
956 * XXX: need to truncate partial blocks too here - or ideally
957 * just switch units to bytes and that issue goes away
958 */
959 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
ef1669ff
KO
960 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
961 iter->pos.snapshot),
5c16add5 962 POS(u.bi_inum, U64_MAX),
ef1669ff 963 0, NULL);
d2a990d1 964 bch_err_msg(c, ret, "in fsck truncating inode");
51fe0332 965 if (ret)
5c16add5 966 return ret;
5c16add5
KO
967
968 /*
969 * We truncated without our normal sector accounting hook, just
970 * make sure we recalculate it:
971 */
103ffe9a 972 u.bi_flags |= BCH_INODE_i_sectors_dirty;
5c16add5 973
103ffe9a 974 u.bi_flags &= ~BCH_INODE_i_size_dirty;
5c16add5
KO
975 do_update = true;
976 }
977
103ffe9a 978 if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
5c16add5 979 (!c->sb.clean ||
b65db750
KO
980 fsck_err(c, inode_i_sectors_dirty_but_clean,
981 "filesystem marked clean, but inode %llu has i_sectors dirty",
5c16add5
KO
982 u.bi_inum))) {
983 s64 sectors;
984
985 bch_verbose(c, "recounting sectors for inode %llu",
986 u.bi_inum);
987
ef1669ff 988 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
5c16add5 989 if (sectors < 0) {
d2a990d1 990 bch_err_msg(c, sectors, "in fsck recounting inode sectors");
5c16add5
KO
991 return sectors;
992 }
993
994 u.bi_sectors = sectors;
103ffe9a 995 u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
5c16add5
KO
996 do_update = true;
997 }
998
103ffe9a 999 if (u.bi_flags & BCH_INODE_backptr_untrusted) {
d3ff7fec
KO
1000 u.bi_dir = 0;
1001 u.bi_dir_offset = 0;
103ffe9a 1002 u.bi_flags &= ~BCH_INODE_backptr_untrusted;
5c16add5
KO
1003 do_update = true;
1004 }
1005
0b17618f
KO
1006 if (u.bi_dir || u.bi_dir_offset) {
1007 ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1008 if (ret)
1009 goto err;
1010 }
1011
0b498a5a
KO
1012 if (fsck_err_on(u.bi_parent_subvol &&
1013 (u.bi_subvol == 0 ||
1014 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
f4e68c85 1015 c, inode_bi_parent_nonzero,
0b498a5a
KO
1016 "inode %llu:%u has subvol %u but nonzero parent subvol %u",
1017 u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1018 u.bi_parent_subvol = 0;
1019 do_update = true;
1020 }
1021
f2b02d09
KO
1022 if (u.bi_subvol) {
1023 struct bch_subvolume s;
1024
1025 ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1026 if (ret && !bch2_err_matches(ret, ENOENT))
1027 goto err;
1028
1029 if (fsck_err_on(ret,
1030 c, inode_bi_subvol_missing,
1031 "inode %llu:%u bi_subvol points to missing subvolume %u",
1032 u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1033 fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1034 !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1035 k.k->p.snapshot),
1036 c, inode_bi_subvol_wrong,
1037 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1038 u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1039 le64_to_cpu(s.inode),
1040 le32_to_cpu(s.snapshot))) {
1041 u.bi_subvol = 0;
1042 u.bi_parent_subvol = 0;
1043 do_update = true;
1044 }
1045 }
1046
5c16add5 1047 if (do_update) {
69c8e6ce 1048 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
d2a990d1 1049 bch_err_msg(c, ret, "in fsck updating inode");
a190cbcf 1050 if (ret)
a111901f 1051 return ret;
5c16add5 1052 }
e492e7b6 1053err:
5c16add5 1054fsck_err:
d2a990d1 1055 bch_err_fn(c, ret);
5c16add5
KO
1056 return ret;
1057}
1058
067d228b 1059int bch2_check_inodes(struct bch_fs *c)
5c16add5 1060{
067d228b 1061 bool full = c->opts.fsck;
285b181a 1062 struct bch_inode_unpacked prev = { 0 };
49124d8a 1063 struct snapshots_seen s;
5c16add5 1064
49124d8a 1065 snapshots_seen_init(&s);
5c16add5 1066
4eb3877e
KO
1067 int ret = bch2_trans_run(c,
1068 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1069 POS_MIN,
1070 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1071 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1072 check_inode(trans, &iter, k, &prev, &s, full)));
ef1669ff 1073
49124d8a 1074 snapshots_seen_exit(&s);
d2a990d1 1075 bch_err_fn(c, ret);
285b181a
KO
1076 return ret;
1077}
ef1669ff 1078
ef1669ff
KO
1079static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1080{
1081 struct bch_fs *c = trans->c;
0763c552
KO
1082 u32 restart_count = trans->restart_count;
1083 int ret = 0;
ef1669ff
KO
1084 s64 count2;
1085
91d961ba 1086 darray_for_each(w->inodes, i) {
ef1669ff
KO
1087 if (i->inode.bi_sectors == i->count)
1088 continue;
1089
f9f52bc4 1090 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
ef1669ff 1091
43b81a4e 1092 if (w->recalculate_sums)
ef1669ff 1093 i->count = count2;
43b81a4e
KO
1094
1095 if (i->count != count2) {
1096 bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1097 w->last_pos.inode, i->snapshot, i->count, count2);
1098 return -BCH_ERR_internal_fsck_err;
ef1669ff
KO
1099 }
1100
103ffe9a 1101 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
b65db750
KO
1102 c, inode_i_sectors_wrong,
1103 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1104 w->last_pos.inode, i->snapshot,
1105 i->inode.bi_sectors, i->count)) {
1ed0a5d2 1106 i->inode.bi_sectors = i->count;
69c8e6ce 1107 ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1ed0a5d2
KO
1108 if (ret)
1109 break;
1ed0a5d2 1110 }
ef1669ff
KO
1111 }
1112fsck_err:
d2a990d1 1113 bch_err_fn(c, ret);
c872afa2 1114 return ret ?: trans_was_restarted(trans, restart_count);
ef1669ff
KO
1115}
1116
c58029ec
DH
1117struct extent_end {
1118 u32 snapshot;
1119 u64 offset;
1120 struct snapshots_seen seen;
1121};
1122
a397b8df
KO
1123struct extent_ends {
1124 struct bpos last_pos;
1125 DARRAY(struct extent_end) e;
1126};
1127
1128static void extent_ends_reset(struct extent_ends *extent_ends)
1129{
a397b8df
KO
1130 darray_for_each(extent_ends->e, i)
1131 snapshots_seen_exit(&i->seen);
a397b8df
KO
1132 extent_ends->e.nr = 0;
1133}
1134
1135static void extent_ends_exit(struct extent_ends *extent_ends)
1136{
1137 extent_ends_reset(extent_ends);
1138 darray_exit(&extent_ends->e);
1139}
1140
1141static void extent_ends_init(struct extent_ends *extent_ends)
1142{
1143 memset(extent_ends, 0, sizeof(*extent_ends));
1144}
1145
1146static int extent_ends_at(struct bch_fs *c,
1147 struct extent_ends *extent_ends,
1148 struct snapshots_seen *seen,
1149 struct bkey_s_c k)
1150{
1151 struct extent_end *i, n = (struct extent_end) {
1152 .offset = k.k->p.offset,
1153 .snapshot = k.k->p.snapshot,
1154 .seen = *seen,
1155 };
1156
1157 n.seen.ids.data = kmemdup(seen->ids.data,
1158 sizeof(seen->ids.data[0]) * seen->ids.size,
1159 GFP_KERNEL);
1160 if (!n.seen.ids.data)
1161 return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1162
defd9e39 1163 __darray_for_each(extent_ends->e, i) {
a397b8df
KO
1164 if (i->snapshot == k.k->p.snapshot) {
1165 snapshots_seen_exit(&i->seen);
1166 *i = n;
1167 return 0;
1168 }
1169
1170 if (i->snapshot >= k.k->p.snapshot)
1171 break;
1172 }
1173
1174 return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1175}
c58029ec 1176
43b81a4e
KO
1177static int overlapping_extents_found(struct btree_trans *trans,
1178 enum btree_id btree,
e2bd0617
KO
1179 struct bpos pos1, struct snapshots_seen *pos1_seen,
1180 struct bkey pos2,
1181 bool *fixed,
1182 struct extent_end *extent_end)
454377d8 1183{
43b81a4e
KO
1184 struct bch_fs *c = trans->c;
1185 struct printbuf buf = PRINTBUF;
e2bd0617
KO
1186 struct btree_iter iter1, iter2 = { NULL };
1187 struct bkey_s_c k1, k2;
454377d8
KO
1188 int ret;
1189
43b81a4e
KO
1190 BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1191
e2bd0617
KO
1192 bch2_trans_iter_init(trans, &iter1, btree, pos1,
1193 BTREE_ITER_ALL_SNAPSHOTS|
1194 BTREE_ITER_NOT_EXTENTS);
1195 k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1196 ret = bkey_err(k1);
454377d8 1197 if (ret)
43b81a4e
KO
1198 goto err;
1199
1200 prt_str(&buf, "\n ");
e2bd0617 1201 bch2_bkey_val_to_text(&buf, c, k1);
43b81a4e 1202
e2bd0617
KO
1203 if (!bpos_eq(pos1, k1.k->p)) {
1204 prt_str(&buf, "\n wanted\n ");
1205 bch2_bpos_to_text(&buf, pos1);
1206 prt_str(&buf, "\n ");
1207 bch2_bkey_to_text(&buf, &pos2);
1208
1209 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
43b81a4e
KO
1210 __func__, buf.buf);
1211 ret = -BCH_ERR_internal_fsck_err;
1212 goto err;
1213 }
1214
e2bd0617
KO
1215 bch2_trans_copy_iter(&iter2, &iter1);
1216
43b81a4e 1217 while (1) {
e2bd0617 1218 bch2_btree_iter_advance(&iter2);
43b81a4e 1219
e2bd0617
KO
1220 k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1221 ret = bkey_err(k2);
43b81a4e
KO
1222 if (ret)
1223 goto err;
1224
e2bd0617 1225 if (bpos_ge(k2.k->p, pos2.p))
43b81a4e 1226 break;
43b81a4e
KO
1227 }
1228
1229 prt_str(&buf, "\n ");
e2bd0617 1230 bch2_bkey_val_to_text(&buf, c, k2);
43b81a4e 1231
e2bd0617
KO
1232 if (bpos_gt(k2.k->p, pos2.p) ||
1233 pos2.size != k2.k->size) {
43b81a4e
KO
1234 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1235 __func__, buf.buf);
1236 ret = -BCH_ERR_internal_fsck_err;
1237 goto err;
1238 }
1239
e2bd0617
KO
1240 prt_printf(&buf, "\n overwriting %s extent",
1241 pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1242
b65db750
KO
1243 if (fsck_err(c, extent_overlapping,
1244 "overlapping extents%s", buf.buf)) {
e2bd0617
KO
1245 struct btree_iter *old_iter = &iter1;
1246 struct disk_reservation res = { 0 };
454377d8 1247
e2bd0617
KO
1248 if (pos1.snapshot < pos2.p.snapshot) {
1249 old_iter = &iter2;
1250 swap(k1, k2);
1251 }
1252
6474b706 1253 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
e2bd0617
KO
1254
1255 ret = bch2_trans_update_extent_overwrite(trans, old_iter,
1256 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
1257 k1, k2) ?:
3f0e297d 1258 bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
e2bd0617
KO
1259 bch2_disk_reservation_put(c, &res);
1260
1261 if (ret)
43b81a4e
KO
1262 goto err;
1263
1264 *fixed = true;
e2bd0617
KO
1265
1266 if (pos1.snapshot == pos2.p.snapshot) {
1267 /*
1268 * We overwrote the first extent, and did the overwrite
1269 * in the same snapshot:
1270 */
1271 extent_end->offset = bkey_start_offset(&pos2);
1272 } else if (pos1.snapshot > pos2.p.snapshot) {
1273 /*
1274 * We overwrote the first extent in pos2's snapshot:
1275 */
1276 ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1277 } else {
1278 /*
1279 * We overwrote the second extent - restart
1280 * check_extent() from the top:
1281 */
1282 ret = -BCH_ERR_transaction_restart_nested;
1283 }
43b81a4e
KO
1284 }
1285fsck_err:
1286err:
e2bd0617
KO
1287 bch2_trans_iter_exit(trans, &iter2);
1288 bch2_trans_iter_exit(trans, &iter1);
43b81a4e
KO
1289 printbuf_exit(&buf);
1290 return ret;
454377d8
KO
1291}
1292
c58029ec
DH
1293static int check_overlapping_extents(struct btree_trans *trans,
1294 struct snapshots_seen *seen,
a397b8df 1295 struct extent_ends *extent_ends,
c58029ec 1296 struct bkey_s_c k,
a397b8df 1297 u32 equiv,
e2bd0617
KO
1298 struct btree_iter *iter,
1299 bool *fixed)
c58029ec
DH
1300{
1301 struct bch_fs *c = trans->c;
c58029ec
DH
1302 int ret = 0;
1303
a397b8df
KO
1304 /* transaction restart, running again */
1305 if (bpos_eq(extent_ends->last_pos, k.k->p))
1306 return 0;
1307
1308 if (extent_ends->last_pos.inode != k.k->p.inode)
1309 extent_ends_reset(extent_ends);
1310
1311 darray_for_each(extent_ends->e, i) {
1312 if (i->offset <= bkey_start_offset(k.k))
c58029ec
DH
1313 continue;
1314
1315 if (!ref_visible2(c,
1316 k.k->p.snapshot, seen,
1317 i->snapshot, &i->seen))
1318 continue;
1319
a397b8df
KO
1320 ret = overlapping_extents_found(trans, iter->btree_id,
1321 SPOS(iter->pos.inode,
1322 i->offset,
1323 i->snapshot),
e2bd0617
KO
1324 &i->seen,
1325 *k.k, fixed, i);
a397b8df
KO
1326 if (ret)
1327 goto err;
c58029ec
DH
1328 }
1329
a397b8df
KO
1330 ret = extent_ends_at(c, extent_ends, seen, k);
1331 if (ret)
1332 goto err;
c58029ec 1333
a397b8df
KO
1334 extent_ends->last_pos = k.k->p;
1335err:
e2bd0617 1336 return ret;
c58029ec
DH
1337}
1338
9db2f860
KO
1339static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1340 struct bkey_s_c k)
1341{
1342 struct bch_fs *c = trans->c;
1343 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1344 struct bch_extent_crc_unpacked crc;
1345 const union bch_extent_entry *i;
1346 unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1347
1348 bkey_for_each_crc(k.k, ptrs, crc, i)
1349 if (crc_is_encoded(crc) &&
1350 crc.uncompressed_size > encoded_extent_max_sectors) {
1351 struct printbuf buf = PRINTBUF;
1352
1353 bch2_bkey_val_to_text(&buf, c, k);
1354 bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
1355 printbuf_exit(&buf);
1356 }
1357
1358 return 0;
1359}
1360
ef1669ff 1361static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
a1783320 1362 struct bkey_s_c k,
ef1669ff 1363 struct inode_walker *inode,
c58029ec 1364 struct snapshots_seen *s,
a397b8df 1365 struct extent_ends *extent_ends)
ef1669ff
KO
1366{
1367 struct bch_fs *c = trans->c;
ef1669ff 1368 struct inode_walker_entry *i;
fa8e94fa 1369 struct printbuf buf = PRINTBUF;
0d8f320d 1370 struct bpos equiv = k.k->p;
ef1669ff 1371 int ret = 0;
ef1669ff 1372
0d8f320d
KO
1373 equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1374
ef1669ff 1375 ret = check_key_has_snapshot(trans, iter, k);
fa8e94fa
KO
1376 if (ret) {
1377 ret = ret < 0 ? ret : 0;
1378 goto out;
1379 }
ef1669ff 1380
f9f52bc4 1381 if (inode->last_pos.inode != k.k->p.inode) {
ef1669ff
KO
1382 ret = check_i_sectors(trans, inode);
1383 if (ret)
fa8e94fa 1384 goto err;
ef1669ff 1385 }
292dea86 1386
971a1503 1387 i = walk_inode(trans, inode, k);
a57f4d61
KO
1388 ret = PTR_ERR_OR_ZERO(i);
1389 if (ret)
1390 goto err;
1391
650eb16b 1392 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
c58029ec
DH
1393 if (ret)
1394 goto err;
ef1669ff 1395
650eb16b 1396 if (k.k->type != KEY_TYPE_whiteout) {
b65db750 1397 if (fsck_err_on(!i, c, extent_in_missing_inode,
650eb16b
KO
1398 "extent in missing inode:\n %s",
1399 (printbuf_reset(&buf),
1400 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1401 goto delete;
1402
1403 if (fsck_err_on(i &&
1404 !S_ISREG(i->inode.bi_mode) &&
b65db750
KO
1405 !S_ISLNK(i->inode.bi_mode),
1406 c, extent_in_non_reg_inode,
650eb16b
KO
1407 "extent in non regular inode mode %o:\n %s",
1408 i->inode.bi_mode,
1409 (printbuf_reset(&buf),
1410 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1411 goto delete;
ef1669ff 1412
a397b8df 1413 ret = check_overlapping_extents(trans, s, extent_ends, k,
e2bd0617
KO
1414 equiv.snapshot, iter,
1415 &inode->recalculate_sums);
650eb16b 1416 if (ret)
e2bd0617 1417 goto err;
650eb16b 1418 }
ef1669ff 1419
0d06b4ec 1420 /*
a0076086
KO
1421 * Check inodes in reverse order, from oldest snapshots to newest,
1422 * starting from the inode that matches this extent's snapshot. If we
1423 * didn't have one, iterate over all inodes:
0d06b4ec 1424 */
a0076086
KO
1425 if (!i)
1426 i = inode->inodes.data + inode->inodes.nr - 1;
1427
1428 for (;
650eb16b 1429 inode->inodes.data && i >= inode->inodes.data;
0d06b4ec
KO
1430 --i) {
1431 if (i->snapshot > equiv.snapshot ||
1432 !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
1433 continue;
1434
650eb16b 1435 if (k.k->type != KEY_TYPE_whiteout) {
103ffe9a 1436 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
650eb16b 1437 k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
b65db750
KO
1438 !bkey_extent_is_reservation(k),
1439 c, extent_past_end_of_inode,
650eb16b
KO
1440 "extent type past end of inode %llu:%u, i_size %llu\n %s",
1441 i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1442 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1443 struct btree_iter iter2;
1444
1445 bch2_trans_copy_iter(&iter2, iter);
1446 bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1447 ret = bch2_btree_iter_traverse(&iter2) ?:
1448 bch2_btree_delete_at(trans, &iter2,
1449 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1450 bch2_trans_iter_exit(trans, &iter2);
0d06b4ec
KO
1451 if (ret)
1452 goto err;
650eb16b 1453
650eb16b 1454 iter->k.type = KEY_TYPE_whiteout;
ef1669ff 1455 }
a0076086
KO
1456
1457 if (bkey_extent_is_allocation(k.k))
1458 i->count += k.k->size;
ef1669ff 1459 }
8a85b20c 1460
a0076086
KO
1461 i->seen_this_pos = true;
1462 }
fa8e94fa
KO
1463out:
1464err:
ef1669ff 1465fsck_err:
fa8e94fa 1466 printbuf_exit(&buf);
d2a990d1 1467 bch_err_fn(c, ret);
8a85b20c 1468 return ret;
0d8f320d
KO
1469delete:
1470 ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1471 goto out;
8a85b20c
KO
1472}
1473
1c6fdbd8
KO
1474/*
1475 * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1476 * that i_size an i_sectors are consistent
1477 */
067d228b 1478int bch2_check_extents(struct bch_fs *c)
1c6fdbd8
KO
1479{
1480 struct inode_walker w = inode_walker_init();
ef1669ff 1481 struct snapshots_seen s;
a397b8df 1482 struct extent_ends extent_ends;
e3dc75eb 1483 struct disk_reservation res = { 0 };
1c6fdbd8 1484
ef1669ff 1485 snapshots_seen_init(&s);
a397b8df 1486 extent_ends_init(&extent_ends);
424eb881 1487
4eb3877e
KO
1488 int ret = bch2_trans_run(c,
1489 for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1490 POS(BCACHEFS_ROOT_INO, 0),
1491 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1492 &res, NULL,
1493 BCH_TRANS_COMMIT_no_enospc, ({
1494 bch2_disk_reservation_put(c, &res);
1495 check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1496 check_extent_overbig(trans, &iter, k);
1497 })) ?:
1498 check_i_sectors(trans, &w));
c58029ec 1499
e3dc75eb 1500 bch2_disk_reservation_put(c, &res);
a397b8df 1501 extent_ends_exit(&extent_ends);
ef1669ff 1502 inode_walker_exit(&w);
ef1669ff
KO
1503 snapshots_seen_exit(&s);
1504
d2a990d1 1505 bch_err_fn(c, ret);
ef1669ff
KO
1506 return ret;
1507}
1508
9db2f860
KO
1509int bch2_check_indirect_extents(struct bch_fs *c)
1510{
9db2f860 1511 struct disk_reservation res = { 0 };
9db2f860 1512
4eb3877e
KO
1513 int ret = bch2_trans_run(c,
1514 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1515 POS_MIN,
1516 BTREE_ITER_PREFETCH, k,
1517 &res, NULL,
1518 BCH_TRANS_COMMIT_no_enospc, ({
1519 bch2_disk_reservation_put(c, &res);
1520 check_extent_overbig(trans, &iter, k);
1521 })));
9db2f860
KO
1522
1523 bch2_disk_reservation_put(c, &res);
9db2f860
KO
1524 bch_err_fn(c, ret);
1525 return ret;
1526}
1527
ef1669ff
KO
1528static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1529{
1530 struct bch_fs *c = trans->c;
0763c552
KO
1531 u32 restart_count = trans->restart_count;
1532 int ret = 0;
ef1669ff
KO
1533 s64 count2;
1534
91d961ba 1535 darray_for_each(w->inodes, i) {
ef1669ff
KO
1536 if (i->inode.bi_nlink == i->count)
1537 continue;
1538
f9f52bc4 1539 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
9e343161
KO
1540 if (count2 < 0)
1541 return count2;
ef1669ff
KO
1542
1543 if (i->count != count2) {
1544 bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1545 i->count, count2);
1546 i->count = count2;
1547 if (i->inode.bi_nlink == i->count)
1548 continue;
1549 }
1550
b65db750
KO
1551 if (fsck_err_on(i->inode.bi_nlink != i->count,
1552 c, inode_dir_wrong_nlink,
ef1669ff 1553 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
f9f52bc4 1554 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
ef1669ff 1555 i->inode.bi_nlink = i->count;
69c8e6ce 1556 ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
abcecb49
KO
1557 if (ret)
1558 break;
1559 }
ef1669ff
KO
1560 }
1561fsck_err:
d2a990d1 1562 bch_err_fn(c, ret);
c872afa2 1563 return ret ?: trans_was_restarted(trans, restart_count);
ef1669ff 1564}
abcecb49 1565
0b17618f 1566static int check_dirent_inode_dirent(struct btree_trans *trans,
3d4998c2
KO
1567 struct btree_iter *iter,
1568 struct bkey_s_c_dirent d,
1569 struct bch_inode_unpacked *target,
1570 u32 target_snapshot)
ef1669ff
KO
1571{
1572 struct bch_fs *c = trans->c;
3d4998c2 1573 struct printbuf buf = PRINTBUF;
ef1669ff
KO
1574 int ret = 0;
1575
e539ebb8
KO
1576 if (inode_points_to_dirent(target, d))
1577 return 0;
1578
ef1669ff
KO
1579 if (!target->bi_dir &&
1580 !target->bi_dir_offset) {
1581 target->bi_dir = d.k->p.inode;
1582 target->bi_dir_offset = d.k->p.offset;
e539ebb8 1583 return __bch2_fsck_write_inode(trans, target, target_snapshot);
ef1669ff
KO
1584 }
1585
e539ebb8
KO
1586 struct btree_iter bp_iter = { NULL };
1587 struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1588 SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1589 ret = bkey_err(bp_dirent);
1590 if (ret && !bch2_err_matches(ret, ENOENT))
1591 goto err;
e3e464ac 1592
e539ebb8
KO
1593 bool backpointer_exists = !ret;
1594 ret = 0;
e3e464ac 1595
e539ebb8
KO
1596 if (fsck_err_on(!backpointer_exists,
1597 c, inode_wrong_backpointer,
1598 "inode %llu:%u has wrong backpointer:\n"
1599 "got %llu:%llu\n"
1600 "should be %llu:%llu",
1601 target->bi_inum, target_snapshot,
1602 target->bi_dir,
1603 target->bi_dir_offset,
1604 d.k->p.inode,
1605 d.k->p.offset)) {
1606 target->bi_dir = d.k->p.inode;
1607 target->bi_dir_offset = d.k->p.offset;
1608 ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1609 goto out;
1610 }
e3e464ac 1611
e539ebb8
KO
1612 bch2_bkey_val_to_text(&buf, c, d.s_c);
1613 prt_newline(&buf);
1614 if (backpointer_exists)
1615 bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1616
1617 if (fsck_err_on(backpointer_exists &&
1618 (S_ISDIR(target->bi_mode) ||
1619 target->bi_subvol),
1620 c, inode_dir_multiple_links,
1621 "%s %llu:%u with multiple links\n%s",
1622 S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
1623 target->bi_inum, target_snapshot, buf.buf)) {
1624 ret = __remove_dirent(trans, d.k->p);
1625 goto out;
1626 }
1c6fdbd8 1627
e539ebb8
KO
1628 /*
1629 * hardlinked file with nlink 0:
1630 * We're just adjusting nlink here so check_nlinks() will pick
1631 * it up, it ignores inodes with nlink 0
1632 */
1633 if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1634 c, inode_multiple_links_but_nlink_0,
1635 "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1636 target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1637 target->bi_nlink++;
1638 target->bi_flags &= ~BCH_INODE_unlinked;
1639 ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1640 if (ret)
1641 goto err;
ef1669ff 1642 }
3d4998c2
KO
1643out:
1644err:
1645fsck_err:
1646 bch2_trans_iter_exit(trans, &bp_iter);
1647 printbuf_exit(&buf);
1648 bch_err_fn(c, ret);
1649 return ret;
1650}
1651
1652static int check_dirent_target(struct btree_trans *trans,
1653 struct btree_iter *iter,
1654 struct bkey_s_c_dirent d,
1655 struct bch_inode_unpacked *target,
1656 u32 target_snapshot)
1657{
1658 struct bch_fs *c = trans->c;
1659 struct bkey_i_dirent *n;
1660 struct printbuf buf = PRINTBUF;
1661 int ret = 0;
1662
0b17618f 1663 ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
3d4998c2
KO
1664 if (ret)
1665 goto err;
1c6fdbd8 1666
b65db750
KO
1667 if (fsck_err_on(d.v->d_type != inode_d_type(target),
1668 c, dirent_d_type_wrong,
285b181a
KO
1669 "incorrect d_type: got %s, should be %s:\n%s",
1670 bch2_d_type_str(d.v->d_type),
1671 bch2_d_type_str(inode_d_type(target)),
fa8e94fa
KO
1672 (printbuf_reset(&buf),
1673 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
285b181a
KO
1674 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1675 ret = PTR_ERR_OR_ZERO(n);
1676 if (ret)
fa8e94fa 1677 goto err;
ef1669ff
KO
1678
1679 bkey_reassemble(&n->k_i, d.s_c);
285b181a 1680 n->v.d_type = inode_d_type(target);
45b4ed52
KO
1681 if (n->v.d_type == DT_SUBVOL) {
1682 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1683 n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
1684 } else {
1685 n->v.d_inum = cpu_to_le64(target->bi_inum);
1686 }
ef1669ff 1687
285b181a
KO
1688 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1689 if (ret)
fa8e94fa 1690 goto err;
4db65027 1691
285b181a 1692 d = dirent_i_to_s_c(n);
4db65027 1693 }
ef1669ff 1694err:
1c6fdbd8 1695fsck_err:
fa8e94fa 1696 printbuf_exit(&buf);
d2a990d1 1697 bch_err_fn(c, ret);
ef1669ff 1698 return ret;
1c6fdbd8
KO
1699}
1700
c60b7f80
KO
1701/* find a subvolume that's a descendent of @snapshot: */
1702static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1703{
1704 struct btree_iter iter;
1705 struct bkey_s_c k;
1706 int ret;
1707
1708 for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1709 if (k.k->type != KEY_TYPE_subvolume)
1710 continue;
1711
1712 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1713 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1714 bch2_trans_iter_exit(trans, &iter);
1715 *subvolid = k.k->p.offset;
1716 goto found;
1717 }
1718 }
1719 if (!ret)
1720 ret = -ENOENT;
1721found:
1722 bch2_trans_iter_exit(trans, &iter);
1723 return ret;
1724}
1725
ea27001e
KO
1726static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1727 struct bkey_s_c_dirent d)
11def188
KO
1728{
1729 struct bch_fs *c = trans->c;
1730 struct bch_inode_unpacked subvol_root;
f4e68c85 1731 u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
11def188 1732 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
c60b7f80
KO
1733 u32 target_snapshot, parent_snapshot;
1734 u64 target_inum, parent_inum;
1735 struct printbuf buf = PRINTBUF;
11def188
KO
1736 int ret = 0;
1737
c60b7f80
KO
1738 ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1739 if (ret && !bch2_err_matches(ret, ENOENT))
1740 return ret;
1741
1742 if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
1743 "dirent parent_subvol points to missing subvolume\n%s",
1744 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1745 fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1746 c, dirent_not_visible_in_parent_subvol,
1747 "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1748 parent_snapshot,
1749 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1750 u32 new_parent_subvol;
1751 ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
1752 if (ret)
1753 goto err;
1754
1755 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1756 ret = PTR_ERR_OR_ZERO(new_dirent);
1757 if (ret)
1758 goto err;
1759
1760 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1761 }
1762
11def188 1763 ret = subvol_lookup(trans, target_subvol,
ea27001e 1764 &target_snapshot, &target_inum);
11def188
KO
1765 if (ret && !bch2_err_matches(ret, ENOENT))
1766 return ret;
1767
f5d58d0c
KO
1768 if (ret) {
1769 if (fsck_err(c, dirent_to_missing_subvol,
1770 "dirent points to missing subvolume\n%s",
1771 (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
1772 return __remove_dirent(trans, d.k->p);
1773 ret = 0;
1774 goto out;
1775 }
11def188
KO
1776
1777 ret = lookup_inode(trans, target_inum,
1778 &subvol_root, &target_snapshot);
1779 if (ret && !bch2_err_matches(ret, ENOENT))
1780 return ret;
1781
f4e68c85
KO
1782 if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol,
1783 c, inode_bi_parent_wrong,
1784 "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
1785 target_inum,
1786 subvol_root.bi_parent_subvol, parent_subvol)) {
1787 subvol_root.bi_parent_subvol = parent_subvol;
1788 ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
1789 if (ret)
1790 return ret;
1791 }
1792
11def188
KO
1793 ret = check_dirent_target(trans, iter, d, &subvol_root,
1794 target_snapshot);
1795 if (ret)
1796 return ret;
f5d58d0c 1797out:
c60b7f80 1798err:
11def188 1799fsck_err:
c60b7f80 1800 printbuf_exit(&buf);
11def188
KO
1801 return ret;
1802}
1803
914f2786 1804static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
a1783320 1805 struct bkey_s_c k,
914f2786 1806 struct bch_hash_info *hash_info,
ef1669ff
KO
1807 struct inode_walker *dir,
1808 struct inode_walker *target,
1809 struct snapshots_seen *s)
1c6fdbd8 1810{
914f2786 1811 struct bch_fs *c = trans->c;
914f2786 1812 struct bkey_s_c_dirent d;
ef1669ff 1813 struct inode_walker_entry *i;
fa8e94fa 1814 struct printbuf buf = PRINTBUF;
49124d8a 1815 struct bpos equiv;
fa8e94fa 1816 int ret = 0;
d69f41d6 1817
ef1669ff 1818 ret = check_key_has_snapshot(trans, iter, k);
fa8e94fa
KO
1819 if (ret) {
1820 ret = ret < 0 ? ret : 0;
1821 goto out;
1822 }
1c6fdbd8 1823
49124d8a
KO
1824 equiv = k.k->p;
1825 equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1826
1827 ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
914f2786 1828 if (ret)
fa8e94fa 1829 goto err;
8a85b20c 1830
ef1669ff 1831 if (k.k->type == KEY_TYPE_whiteout)
fa8e94fa 1832 goto out;
ef1669ff 1833
f9f52bc4 1834 if (dir->last_pos.inode != k.k->p.inode) {
ef1669ff
KO
1835 ret = check_subdir_count(trans, dir);
1836 if (ret)
fa8e94fa 1837 goto err;
ef1669ff
KO
1838 }
1839
07f383c7 1840 BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
292dea86 1841
971a1503 1842 i = walk_inode(trans, dir, k);
06dcca51 1843 ret = PTR_ERR_OR_ZERO(i);
ef1669ff 1844 if (ret < 0)
fa8e94fa 1845 goto err;
1c6fdbd8 1846
2a89a3e9 1847 if (dir->first_this_inode && dir->inodes.nr)
c8d5b714
KO
1848 *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
1849 dir->first_this_inode = false;
1850
b65db750 1851 if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
914f2786 1852 "dirent in nonexisting directory:\n%s",
fa8e94fa
KO
1853 (printbuf_reset(&buf),
1854 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1855 ret = bch2_btree_delete_at(trans, iter,
285b181a 1856 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
fa8e94fa
KO
1857 goto out;
1858 }
ef1669ff 1859
06dcca51 1860 if (!i)
fa8e94fa 1861 goto out;
ef1669ff 1862
b65db750
KO
1863 if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
1864 c, dirent_in_non_dir_inode,
285b181a
KO
1865 "dirent in non directory inode type %s:\n%s",
1866 bch2_d_type_str(inode_d_type(&i->inode)),
fa8e94fa
KO
1867 (printbuf_reset(&buf),
1868 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1869 ret = bch2_btree_delete_at(trans, iter, 0);
1870 goto out;
1871 }
8a85b20c 1872
c8d5b714 1873 ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
914f2786 1874 if (ret < 0)
fa8e94fa
KO
1875 goto err;
1876 if (ret) {
1877 /* dirent has been deleted */
1878 ret = 0;
1879 goto out;
1880 }
7ac2c55e 1881
914f2786 1882 if (k.k->type != KEY_TYPE_dirent)
fa8e94fa 1883 goto out;
1c6fdbd8 1884
914f2786 1885 d = bkey_s_c_to_dirent(k);
1c6fdbd8 1886
4db65027 1887 if (d.v->d_type == DT_SUBVOL) {
ea27001e 1888 ret = check_dirent_to_subvol(trans, iter, d);
914f2786 1889 if (ret)
fa8e94fa 1890 goto err;
ef1669ff 1891 } else {
4db65027 1892 ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
ef1669ff 1893 if (ret)
fa8e94fa 1894 goto err;
1c6fdbd8 1895
b65db750
KO
1896 if (fsck_err_on(!target->inodes.nr,
1897 c, dirent_to_missing_inode,
49124d8a
KO
1898 "dirent points to missing inode: (equiv %u)\n%s",
1899 equiv.snapshot,
fa8e94fa
KO
1900 (printbuf_reset(&buf),
1901 bch2_bkey_val_to_text(&buf, c, k),
1902 buf.buf))) {
285b181a 1903 ret = __remove_dirent(trans, d.k->p);
ef1669ff 1904 if (ret)
fa8e94fa 1905 goto err;
1c6fdbd8
KO
1906 }
1907
91d961ba 1908 darray_for_each(target->inodes, i) {
ef1669ff
KO
1909 ret = check_dirent_target(trans, iter, d,
1910 &i->inode, i->snapshot);
1911 if (ret)
fa8e94fa 1912 goto err;
d3ff7fec
KO
1913 }
1914
11def188
KO
1915 if (d.v->d_type == DT_DIR)
1916 for_each_visible_inode(c, s, dir, equiv.snapshot, i)
1917 i->count++;
1918 }
fa8e94fa
KO
1919out:
1920err:
914f2786 1921fsck_err:
fa8e94fa 1922 printbuf_exit(&buf);
d2a990d1 1923 bch_err_fn(c, ret);
914f2786
KO
1924 return ret;
1925}
1c6fdbd8 1926
914f2786
KO
1927/*
1928 * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
1929 * validate d_type
1930 */
067d228b 1931int bch2_check_dirents(struct bch_fs *c)
914f2786 1932{
ef1669ff
KO
1933 struct inode_walker dir = inode_walker_init();
1934 struct inode_walker target = inode_walker_init();
1935 struct snapshots_seen s;
914f2786 1936 struct bch_hash_info hash_info;
1c6fdbd8 1937
ef1669ff 1938 snapshots_seen_init(&s);
1c6fdbd8 1939
4eb3877e
KO
1940 int ret = bch2_trans_run(c,
1941 for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
1942 POS(BCACHEFS_ROOT_INO, 0),
1943 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
1944 k,
1945 NULL, NULL,
1946 BCH_TRANS_COMMIT_no_enospc,
1947 check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
914f2786 1948
ef1669ff
KO
1949 snapshots_seen_exit(&s);
1950 inode_walker_exit(&dir);
1951 inode_walker_exit(&target);
d2a990d1 1952 bch_err_fn(c, ret);
ef1669ff 1953 return ret;
1c6fdbd8
KO
1954}
1955
285b181a 1956static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
a1783320 1957 struct bkey_s_c k,
285b181a
KO
1958 struct bch_hash_info *hash_info,
1959 struct inode_walker *inode)
1960{
1961 struct bch_fs *c = trans->c;
06dcca51 1962 struct inode_walker_entry *i;
285b181a
KO
1963 int ret;
1964
285b181a
KO
1965 ret = check_key_has_snapshot(trans, iter, k);
1966 if (ret)
1967 return ret;
1968
971a1503 1969 i = walk_inode(trans, inode, k);
06dcca51
KO
1970 ret = PTR_ERR_OR_ZERO(i);
1971 if (ret)
285b181a
KO
1972 return ret;
1973
2a89a3e9 1974 if (inode->first_this_inode && inode->inodes.nr)
c8d5b714
KO
1975 *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
1976 inode->first_this_inode = false;
1977
b65db750 1978 if (fsck_err_on(!i, c, xattr_in_missing_inode,
285b181a
KO
1979 "xattr for missing inode %llu",
1980 k.k->p.inode))
1981 return bch2_btree_delete_at(trans, iter, 0);
1982
06dcca51 1983 if (!i)
285b181a
KO
1984 return 0;
1985
285b181a
KO
1986 ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
1987fsck_err:
d2a990d1 1988 bch_err_fn(c, ret);
285b181a
KO
1989 return ret;
1990}
1991
1c6fdbd8
KO
1992/*
1993 * Walk xattrs: verify that they all have a corresponding inode
1994 */
067d228b 1995int bch2_check_xattrs(struct bch_fs *c)
1c6fdbd8 1996{
285b181a 1997 struct inode_walker inode = inode_walker_init();
7ac2c55e 1998 struct bch_hash_info hash_info;
1c6fdbd8
KO
1999 int ret = 0;
2000
6bd68ec2
KO
2001 ret = bch2_trans_run(c,
2002 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
a1783320
KO
2003 POS(BCACHEFS_ROOT_INO, 0),
2004 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
2005 k,
2006 NULL, NULL,
3f0e297d 2007 BCH_TRANS_COMMIT_no_enospc,
6bd68ec2 2008 check_xattr(trans, &iter, k, &hash_info, &inode)));
d2a990d1 2009 bch_err_fn(c, ret);
9a796fdb 2010 return ret;
1c6fdbd8
KO
2011}
2012
285b181a 2013static int check_root_trans(struct btree_trans *trans)
1c6fdbd8 2014{
285b181a 2015 struct bch_fs *c = trans->c;
ef1669ff 2016 struct bch_inode_unpacked root_inode;
d3ff7fec 2017 u32 snapshot;
ef1669ff 2018 u64 inum;
1c6fdbd8
KO
2019 int ret;
2020
c98d132e 2021 ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
e47a390a 2022 if (ret && !bch2_err_matches(ret, ENOENT))
1c6fdbd8
KO
2023 return ret;
2024
b65db750
KO
2025 if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
2026 "root subvol missing")) {
ef1669ff 2027 struct bkey_i_subvolume root_subvol;
1c6fdbd8 2028
ef1669ff
KO
2029 snapshot = U32_MAX;
2030 inum = BCACHEFS_ROOT_INO;
1c6fdbd8 2031
ef1669ff
KO
2032 bkey_subvolume_init(&root_subvol.k_i);
2033 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
2034 root_subvol.v.flags = 0;
2035 root_subvol.v.snapshot = cpu_to_le32(snapshot);
2036 root_subvol.v.inode = cpu_to_le64(inum);
002c76dc 2037 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
d2a990d1
KO
2038 bch_err_msg(c, ret, "writing root subvol");
2039 if (ret)
ef1669ff 2040 goto err;
ef1669ff
KO
2041 }
2042
c98d132e 2043 ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
e47a390a 2044 if (ret && !bch2_err_matches(ret, ENOENT))
ef1669ff 2045 return ret;
1c6fdbd8 2046
b65db750
KO
2047 if (mustfix_fsck_err_on(ret, c, root_dir_missing,
2048 "root directory missing") ||
2049 mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2050 c, root_inode_not_dir,
ef1669ff
KO
2051 "root inode not a directory")) {
2052 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2053 0, NULL);
2054 root_inode.bi_inum = inum;
1c6fdbd8 2055
69c8e6ce 2056 ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
d2a990d1 2057 bch_err_msg(c, ret, "writing root inode");
ef1669ff
KO
2058 }
2059err:
2060fsck_err:
ef1669ff 2061 return ret;
1c6fdbd8
KO
2062}
2063
285b181a 2064/* Get root directory, create if it doesn't exist: */
067d228b 2065int bch2_check_root(struct bch_fs *c)
285b181a 2066{
002c76dc 2067 int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
6bd68ec2 2068 check_root_trans(trans));
d2a990d1 2069 bch_err_fn(c, ret);
1bb3c2a9 2070 return ret;
285b181a
KO
2071}
2072
91d961ba
KO
2073struct pathbuf_entry {
2074 u64 inum;
2075 u32 snapshot;
1c6fdbd8
KO
2076};
2077
91d961ba
KO
2078typedef DARRAY(struct pathbuf_entry) pathbuf;
2079
2080static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
6e0c886d 2081{
91d961ba 2082 darray_for_each(*p, i)
6e0c886d
KO
2083 if (i->inum == inum &&
2084 i->snapshot == snapshot)
2085 return true;
6e0c886d
KO
2086 return false;
2087}
2088
91d961ba 2089static int path_down(struct bch_fs *c, pathbuf *p,
f0f41a6d 2090 u64 inum, u32 snapshot)
1c6fdbd8 2091{
91d961ba 2092 int ret = darray_push(p, ((struct pathbuf_entry) {
6e0c886d
KO
2093 .inum = inum,
2094 .snapshot = snapshot,
91d961ba
KO
2095 }));
2096
2097 if (ret)
2098 bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
2099 p->size);
2100 return ret;
1c6fdbd8
KO
2101}
2102
6e0c886d
KO
2103/*
2104 * Check that a given inode is reachable from the root:
2105 *
2106 * XXX: we should also be verifying that inodes are in the right subvolumes
2107 */
688a7694 2108static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
1c6fdbd8 2109{
d3ff7fec 2110 struct bch_fs *c = trans->c;
688a7694
KO
2111 struct bch_inode_unpacked inode;
2112 u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot);
1c6fdbd8
KO
2113 int ret = 0;
2114
d3ff7fec 2115 p->nr = 0;
424eb881 2116
688a7694
KO
2117 BUG_ON(bch2_inode_unpack(inode_k, &inode));
2118
2119 while (!(inode.bi_inum == BCACHEFS_ROOT_INO &&
2120 inode.bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
4db65027
KO
2121 struct btree_iter dirent_iter;
2122 struct bkey_s_c_dirent d;
6e0c886d
KO
2123 u32 parent_snapshot = snapshot;
2124
688a7694 2125 d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
c98d132e 2126 ret = bkey_err(d.s_c);
e47a390a 2127 if (ret && !bch2_err_matches(ret, ENOENT))
d3ff7fec 2128 break;
1c6fdbd8 2129
688a7694 2130 if (!ret && !dirent_points_to_inode(d, &inode)) {
4db65027 2131 bch2_trans_iter_exit(trans, &dirent_iter);
e47a390a 2132 ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
4db65027
KO
2133 }
2134
e47a390a 2135 if (bch2_err_matches(ret, ENOENT)) {
688a7694 2136 if (fsck_err(c, inode_unreachable,
b65db750 2137 "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
688a7694
KO
2138 inode.bi_inum, snapshot,
2139 bch2_d_type_str(inode_d_type(&inode)),
2140 inode.bi_nlink,
2141 inode.bi_dir,
2142 inode.bi_dir_offset))
2143 ret = reattach_inode(trans, &inode, snapshot);
d3ff7fec
KO
2144 break;
2145 }
4db65027
KO
2146
2147 bch2_trans_iter_exit(trans, &dirent_iter);
1c6fdbd8 2148
688a7694 2149 if (!S_ISDIR(inode.bi_mode))
d3ff7fec 2150 break;
1c6fdbd8 2151
688a7694 2152 ret = path_down(c, p, inode.bi_inum, snapshot);
d3ff7fec
KO
2153 if (ret) {
2154 bch_err(c, "memory allocation failure");
2155 return ret;
2156 }
1c6fdbd8 2157
6e0c886d
KO
2158 snapshot = parent_snapshot;
2159
688a7694 2160 ret = lookup_inode(trans, inode.bi_dir, &inode, &snapshot);
6e0c886d
KO
2161 if (ret) {
2162 /* Should have been caught in dirents pass */
c98d132e
KO
2163 if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
2164 bch_err(c, "error looking up parent directory: %i", ret);
6e0c886d
KO
2165 break;
2166 }
2167
688a7694 2168 if (path_is_dup(p, inode.bi_inum, snapshot)) {
d3ff7fec 2169 /* XXX print path */
6e0c886d
KO
2170 bch_err(c, "directory structure loop");
2171
91d961ba 2172 darray_for_each(*p, i)
6e0c886d 2173 pr_err("%llu:%u", i->inum, i->snapshot);
688a7694 2174 pr_err("%llu:%u", inode.bi_inum, snapshot);
6e0c886d 2175
c98d132e 2176 if (!fsck_err(c, dir_loop, "directory structure loop"))
d3ff7fec 2177 return 0;
1c6fdbd8 2178
688a7694 2179 ret = remove_backpointer(trans, &inode);
c98d132e
KO
2180 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
2181 bch_err_msg(c, ret, "removing dirent");
2182 if (ret)
d3ff7fec 2183 break;
1c6fdbd8 2184
688a7694 2185 ret = reattach_inode(trans, &inode, snapshot);
c98d132e 2186 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
688a7694 2187 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
c98d132e 2188 break;
1c6fdbd8 2189 }
1c6fdbd8 2190 }
d3ff7fec 2191fsck_err:
d2a990d1 2192 bch_err_fn(c, ret);
d3ff7fec
KO
2193 return ret;
2194}
1c6fdbd8 2195
d3ff7fec
KO
2196/*
2197 * Check for unreachable inodes, as well as loops in the directory structure:
067d228b 2198 * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
d3ff7fec
KO
2199 * unreachable:
2200 */
067d228b 2201int bch2_check_directory_structure(struct bch_fs *c)
d3ff7fec 2202{
91d961ba 2203 pathbuf path = { 0, };
d3ff7fec 2204 int ret;
1c6fdbd8 2205
c98d132e
KO
2206 ret = bch2_trans_run(c,
2207 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2208 BTREE_ITER_INTENT|
2209 BTREE_ITER_PREFETCH|
2210 BTREE_ITER_ALL_SNAPSHOTS, k,
2211 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2212 if (!bkey_is_inode(k.k))
2213 continue;
1c6fdbd8 2214
688a7694 2215 if (bch2_inode_flags(k) & BCH_INODE_unlinked)
c98d132e 2216 continue;
ef1669ff 2217
688a7694 2218 check_path(trans, &path, k);
c98d132e 2219 })));
91d961ba 2220 darray_exit(&path);
27b2df98 2221
d2a990d1 2222 bch_err_fn(c, ret);
9a796fdb 2223 return ret;
1c6fdbd8
KO
2224}
2225
fc51b041
KO
2226struct nlink_table {
2227 size_t nr;
2228 size_t size;
1c6fdbd8 2229
fc51b041
KO
2230 struct nlink {
2231 u64 inum;
2232 u32 snapshot;
2233 u32 count;
2234 } *d;
2235};
1c6fdbd8 2236
f0f41a6d
KO
2237static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2238 u64 inum, u32 snapshot)
1c6fdbd8 2239{
fc51b041
KO
2240 if (t->nr == t->size) {
2241 size_t new_size = max_t(size_t, 128UL, t->size * 2);
3e3e02e6
KO
2242 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2243
fc51b041 2244 if (!d) {
f0f41a6d
KO
2245 bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2246 new_size);
65d48e35 2247 return -BCH_ERR_ENOMEM_fsck_add_nlink;
fc51b041 2248 }
1c6fdbd8 2249
82355e28
KO
2250 if (t->d)
2251 memcpy(d, t->d, t->size * sizeof(t->d[0]));
fc51b041 2252 kvfree(t->d);
1c6fdbd8 2253
fc51b041
KO
2254 t->d = d;
2255 t->size = new_size;
2bb748a6
KO
2256 }
2257
fc51b041
KO
2258
2259 t->d[t->nr++] = (struct nlink) {
2260 .inum = inum,
2261 .snapshot = snapshot,
2262 };
2263
2264 return 0;
2265}
2266
2267static int nlink_cmp(const void *_l, const void *_r)
2268{
2269 const struct nlink *l = _l;
2270 const struct nlink *r = _r;
2271
db18ef1a 2272 return cmp_int(l->inum, r->inum);
fc51b041
KO
2273}
2274
ef1669ff
KO
2275static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2276 struct nlink_table *links,
2277 u64 range_start, u64 range_end, u64 inum, u32 snapshot)
fc51b041
KO
2278{
2279 struct nlink *link, key = {
2280 .inum = inum, .snapshot = U32_MAX,
2281 };
2282
2283 if (inum < range_start || inum >= range_end)
1c6fdbd8 2284 return;
fc51b041
KO
2285
2286 link = __inline_bsearch(&key, links->d, links->nr,
2287 sizeof(links->d[0]), nlink_cmp);
ef1669ff
KO
2288 if (!link)
2289 return;
2290
2291 while (link > links->d && link[0].inum == link[-1].inum)
2292 --link;
2293
2294 for (; link < links->d + links->nr && link->inum == inum; link++)
2295 if (ref_visible(c, s, snapshot, link->snapshot)) {
2296 link->count++;
2297 if (link->snapshot >= snapshot)
2298 break;
2299 }
fc51b041
KO
2300}
2301
2302noinline_for_stack
2303static int check_nlinks_find_hardlinks(struct bch_fs *c,
2304 struct nlink_table *t,
2305 u64 start, u64 *end)
2306{
27b2df98 2307 int ret = bch2_trans_run(c,
5028b907
KO
2308 for_each_btree_key(trans, iter, BTREE_ID_inodes,
2309 POS(0, start),
2310 BTREE_ITER_INTENT|
2311 BTREE_ITER_PREFETCH|
2312 BTREE_ITER_ALL_SNAPSHOTS, k, ({
27b2df98
KO
2313 if (!bkey_is_inode(k.k))
2314 continue;
fc51b041 2315
27b2df98 2316 /* Should never fail, checked by bch2_inode_invalid: */
80eab7a7 2317 struct bch_inode_unpacked u;
27b2df98 2318 BUG_ON(bch2_inode_unpack(k, &u));
fc51b041 2319
27b2df98
KO
2320 /*
2321 * Backpointer and directory structure checks are sufficient for
2322 * directories, since they can't have hardlinks:
2323 */
2324 if (S_ISDIR(u.bi_mode))
2325 continue;
fc51b041 2326
27b2df98
KO
2327 if (!u.bi_nlink)
2328 continue;
fc51b041 2329
27b2df98
KO
2330 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2331 if (ret) {
2332 *end = k.k->p.offset;
2333 ret = 0;
2334 break;
2335 }
2336 0;
2337 })));
1c6fdbd8 2338
27b2df98 2339 bch_err_fn(c, ret);
fc51b041 2340 return ret;
1c6fdbd8
KO
2341}
2342
2343noinline_for_stack
fc51b041
KO
2344static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2345 u64 range_start, u64 range_end)
1c6fdbd8 2346{
ef1669ff 2347 struct snapshots_seen s;
1c6fdbd8 2348
ef1669ff
KO
2349 snapshots_seen_init(&s);
2350
27b2df98 2351 int ret = bch2_trans_run(c,
5028b907
KO
2352 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2353 BTREE_ITER_INTENT|
2354 BTREE_ITER_PREFETCH|
2355 BTREE_ITER_ALL_SNAPSHOTS, k, ({
27b2df98
KO
2356 ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2357 if (ret)
2358 break;
1c6fdbd8 2359
80eab7a7
KO
2360 if (k.k->type == KEY_TYPE_dirent) {
2361 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
abcecb49 2362
27b2df98
KO
2363 if (d.v->d_type != DT_DIR &&
2364 d.v->d_type != DT_SUBVOL)
2365 inc_link(c, &s, links, range_start, range_end,
2366 le64_to_cpu(d.v->d_inum),
2367 bch2_snapshot_equiv(c, d.k->p.snapshot));
27b2df98
KO
2368 }
2369 0;
2370 })));
1c6fdbd8 2371
ef1669ff 2372 snapshots_seen_exit(&s);
27b2df98
KO
2373
2374 bch_err_fn(c, ret);
1c6fdbd8
KO
2375 return ret;
2376}
2377
eace11a7
KO
2378static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2379 struct bkey_s_c k,
2380 struct nlink_table *links,
2381 size_t *idx, u64 range_end)
2382{
2383 struct bch_fs *c = trans->c;
2384 struct bch_inode_unpacked u;
2385 struct nlink *link = &links->d[*idx];
2386 int ret = 0;
2387
2388 if (k.k->p.offset >= range_end)
2389 return 1;
2390
2391 if (!bkey_is_inode(k.k))
2392 return 0;
2393
2394 BUG_ON(bch2_inode_unpack(k, &u));
2395
73bd774d 2396 if (S_ISDIR(u.bi_mode))
eace11a7
KO
2397 return 0;
2398
2399 if (!u.bi_nlink)
2400 return 0;
2401
2402 while ((cmp_int(link->inum, k.k->p.offset) ?:
2403 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2404 BUG_ON(*idx == links->nr);
2405 link = &links->d[++*idx];
2406 }
2407
b65db750
KO
2408 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2409 c, inode_wrong_nlink,
eace11a7
KO
2410 "inode %llu type %s has wrong i_nlink (%u, should be %u)",
2411 u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2412 bch2_inode_nlink_get(&u), link->count)) {
2413 bch2_inode_nlink_set(&u, link->count);
69c8e6ce 2414 ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
eace11a7
KO
2415 }
2416fsck_err:
2417 return ret;
2418}
2419
1c6fdbd8 2420noinline_for_stack
fc51b041
KO
2421static int check_nlinks_update_hardlinks(struct bch_fs *c,
2422 struct nlink_table *links,
1c6fdbd8
KO
2423 u64 range_start, u64 range_end)
2424{
eace11a7 2425 size_t idx = 0;
1c6fdbd8 2426
80eab7a7 2427 int ret = bch2_trans_run(c,
6bd68ec2
KO
2428 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2429 POS(0, range_start),
2430 BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
3f0e297d 2431 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
6bd68ec2 2432 check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
eace11a7 2433 if (ret < 0) {
8b58623f 2434 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
eace11a7
KO
2435 return ret;
2436 }
1c6fdbd8 2437
eace11a7 2438 return 0;
1c6fdbd8
KO
2439}
2440
067d228b 2441int bch2_check_nlinks(struct bch_fs *c)
1c6fdbd8 2442{
fc51b041 2443 struct nlink_table links = { 0 };
1c6fdbd8
KO
2444 u64 this_iter_range_start, next_iter_range_start = 0;
2445 int ret = 0;
2446
1c6fdbd8
KO
2447 do {
2448 this_iter_range_start = next_iter_range_start;
2449 next_iter_range_start = U64_MAX;
2450
fc51b041
KO
2451 ret = check_nlinks_find_hardlinks(c, &links,
2452 this_iter_range_start,
2453 &next_iter_range_start);
2454
2455 ret = check_nlinks_walk_dirents(c, &links,
1c6fdbd8 2456 this_iter_range_start,
fc51b041 2457 next_iter_range_start);
1c6fdbd8
KO
2458 if (ret)
2459 break;
2460
fc51b041 2461 ret = check_nlinks_update_hardlinks(c, &links,
1c6fdbd8
KO
2462 this_iter_range_start,
2463 next_iter_range_start);
2464 if (ret)
2465 break;
2466
fc51b041 2467 links.nr = 0;
1c6fdbd8
KO
2468 } while (next_iter_range_start != U64_MAX);
2469
fc51b041 2470 kvfree(links.d);
d2a990d1 2471 bch_err_fn(c, ret);
1c6fdbd8
KO
2472 return ret;
2473}
2474
eace11a7
KO
2475static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2476 struct bkey_s_c k)
bfe88863 2477{
bfe88863
KO
2478 struct bkey_s_c_reflink_p p;
2479 struct bkey_i_reflink_p *u;
bfe88863 2480
bfe88863
KO
2481 if (k.k->type != KEY_TYPE_reflink_p)
2482 return 0;
2483
2484 p = bkey_s_c_to_reflink_p(k);
2485
6d76aefe 2486 if (!p.v->front_pad && !p.v->back_pad)
bfe88863
KO
2487 return 0;
2488
2489 u = bch2_trans_kmalloc(trans, sizeof(*u));
cf904c8d 2490 int ret = PTR_ERR_OR_ZERO(u);
bfe88863
KO
2491 if (ret)
2492 return ret;
2493
2494 bkey_reassemble(&u->k_i, k);
6d76aefe
KO
2495 u->v.front_pad = 0;
2496 u->v.back_pad = 0;
bfe88863 2497
6b3d8b89 2498 return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
bfe88863
KO
2499}
2500
067d228b 2501int bch2_fix_reflink_p(struct bch_fs *c)
bfe88863 2502{
bfe88863
KO
2503 if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2504 return 0;
2505
80eab7a7 2506 int ret = bch2_trans_run(c,
6bd68ec2 2507 for_each_btree_key_commit(trans, iter,
1bb3c2a9
KO
2508 BTREE_ID_extents, POS_MIN,
2509 BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
2510 BTREE_ITER_ALL_SNAPSHOTS, k,
3f0e297d 2511 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
6bd68ec2 2512 fix_reflink_p_key(trans, &iter, k)));
d2a990d1 2513 bch_err_fn(c, ret);
bfe88863
KO
2514 return ret;
2515}