// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2_priv.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/health.h"

/* Common code for the metadata scrubbers. */

/*
 * Handling operational errors.
 *
 * The *_process_error() family of functions are used to process error return
 * codes from functions called as part of a scrub operation.
 *
 * If there's no error, we return true to tell the caller that it's ok
 * to move on to the next check in its list.
 *
 * For non-verifier errors (e.g. ENOMEM) we return false to tell the
 * caller that something bad happened, and we preserve *error so that
 * the caller can return the *error up the stack to userspace.
 *
 * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
 * OFLAG_CORRUPT in sm_flags and the *error is cleared.  In other words,
 * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
 * not via return codes.  We return false to tell the caller that
 * something bad happened.  Since the error has been cleared, the caller
 * will (presumably) return that zero and scrubbing will move on to
 * whatever's next.
 *
 * ftrace can be used to record the precise metadata location and the
 * approximate code location of the failed operation.
 */
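
/*
 * A sketch of typical usage (assumed caller, not part of this file):
 * filter each libxfs call's return code through xchk_process_error()
 * and bail out when it returns false.
 *
 *	error = xfs_alloc_lookup_le(sc->sa.bno_cur, agbno, len, &has);
 *	if (!xchk_process_error(sc, agno, agbno, &error))
 *		return error;
 *
 * On a verifier error, OFLAG_CORRUPT has already been set and error is
 * now zero, so the early return does not abort the rest of the scrub.
 */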
66 | ||
67 | /* Check for operational errors. */ | |
64b12563 | 68 | static bool |
c517b3aa | 69 | __xchk_process_error( |
1d8a748a | 70 | struct xfs_scrub *sc, |
032d91f9 DW |
71 | xfs_agnumber_t agno, |
72 | xfs_agblock_t bno, | |
73 | int *error, | |
74 | __u32 errflag, | |
75 | void *ret_ip) | |
4700d229 DW |
76 | { |
77 | switch (*error) { | |
78 | case 0: | |
79 | return true; | |
80 | case -EDEADLOCK: | |
88accf17 | 81 | case -ECHRNG: |
4700d229 | 82 | /* Used to restart an op with deadlock avoidance. */ |
16c9de54 DW |
83 | trace_xchk_deadlock_retry( |
84 | sc->ip ? sc->ip : XFS_I(file_inode(sc->file)), | |
85 | sc->sm, *error); | |
4700d229 | 86 | break; |
48dd9117 DW |
87 | case -ECANCELED: |
88 | /* | |
89 | * ECANCELED here means that the caller set one of the scrub | |
90 | * outcome flags (corrupt, xfail, xcorrupt) and wants to exit | |
91 | * quickly. Set error to zero and do not continue. | |
92 | */ | |
93 | trace_xchk_op_error(sc, agno, bno, *error, ret_ip); | |
94 | *error = 0; | |
95 | break; | |
4700d229 DW |
96 | case -EFSBADCRC: |
97 | case -EFSCORRUPTED: | |
98 | /* Note the badness but don't abort. */ | |
64b12563 | 99 | sc->sm->sm_flags |= errflag; |
4700d229 | 100 | *error = 0; |
53004ee7 | 101 | fallthrough; |
4700d229 | 102 | default: |
48dd9117 | 103 | trace_xchk_op_error(sc, agno, bno, *error, ret_ip); |
4700d229 DW |
104 | break; |
105 | } | |
106 | return false; | |
107 | } | |
108 | ||
4700d229 | 109 | bool |
c517b3aa | 110 | xchk_process_error( |
1d8a748a | 111 | struct xfs_scrub *sc, |
032d91f9 DW |
112 | xfs_agnumber_t agno, |
113 | xfs_agblock_t bno, | |
114 | int *error) | |
64b12563 | 115 | { |
c517b3aa | 116 | return __xchk_process_error(sc, agno, bno, error, |
64b12563 DW |
117 | XFS_SCRUB_OFLAG_CORRUPT, __return_address); |
118 | } | |
119 | ||
120 | bool | |
c517b3aa | 121 | xchk_xref_process_error( |
1d8a748a | 122 | struct xfs_scrub *sc, |
032d91f9 DW |
123 | xfs_agnumber_t agno, |
124 | xfs_agblock_t bno, | |
125 | int *error) | |
64b12563 | 126 | { |
c517b3aa | 127 | return __xchk_process_error(sc, agno, bno, error, |
64b12563 DW |
128 | XFS_SCRUB_OFLAG_XFAIL, __return_address); |
129 | } | |
130 | ||

/* Check for operational errors for a file offset. */
static bool
__xchk_fblock_process_error(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset,
	int			*error,
	__u32			errflag,
	void			*ret_ip)
{
	switch (*error) {
	case 0:
		return true;
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
		break;
	case -ECANCELED:
		/*
		 * ECANCELED here means that the caller set one of the scrub
		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
		 * quickly.  Set error to zero and do not continue.
		 */
		trace_xchk_file_op_error(sc, whichfork, offset, *error,
				ret_ip);
		*error = 0;
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
		/* Note the badness but don't abort. */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		fallthrough;
	default:
		trace_xchk_file_op_error(sc, whichfork, offset, *error,
				ret_ip);
		break;
	}
	return false;
}
172 | ||
64b12563 | 173 | bool |
c517b3aa | 174 | xchk_fblock_process_error( |
1d8a748a | 175 | struct xfs_scrub *sc, |
032d91f9 DW |
176 | int whichfork, |
177 | xfs_fileoff_t offset, | |
178 | int *error) | |
64b12563 | 179 | { |
c517b3aa | 180 | return __xchk_fblock_process_error(sc, whichfork, offset, error, |
64b12563 DW |
181 | XFS_SCRUB_OFLAG_CORRUPT, __return_address); |
182 | } | |
183 | ||
184 | bool | |
c517b3aa | 185 | xchk_fblock_xref_process_error( |
1d8a748a | 186 | struct xfs_scrub *sc, |
032d91f9 DW |
187 | int whichfork, |
188 | xfs_fileoff_t offset, | |
189 | int *error) | |
64b12563 | 190 | { |
c517b3aa | 191 | return __xchk_fblock_process_error(sc, whichfork, offset, error, |
64b12563 DW |
192 | XFS_SCRUB_OFLAG_XFAIL, __return_address); |
193 | } | |
194 | ||

/*
 * Handling scrub corruption/optimization/warning checks.
 *
 * The *_set_{corrupt,preen,warning}() family of functions are used to
 * record the presence of metadata that is incorrect (corrupt), could be
 * optimized somehow (preen), or should be flagged for administrative
 * review but is not incorrect (warn).
 *
 * ftrace can be used to record the precise metadata location and
 * approximate code location of the failed check.
 */

/* Record a block which could be optimized. */
void
xchk_block_set_preen(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
	trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
}

/* Record an inode which could be optimized. */
void
xchk_ino_set_preen(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
	trace_xchk_ino_preen(sc, ino, __return_address);
}

/* Record something being wrong with the filesystem primary superblock. */
void
xchk_set_corrupt(
	struct xfs_scrub	*sc)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_fs_error(sc, 0, __return_address);
}

/* Record a corrupt block. */
void
xchk_block_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}

#ifdef CONFIG_XFS_QUOTA
/* Record a corrupt quota counter. */
void
xchk_qcheck_set_corrupt(
	struct xfs_scrub	*sc,
	unsigned int		dqtype,
	xfs_dqid_t		id)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
}
#endif

/* Record a corruption while cross-referencing. */
void
xchk_block_xref_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
}

/* Record a corrupt inode. */
void
xchk_ino_set_corrupt(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_ino_error(sc, ino, __return_address);
}

/* Record a corruption while cross-referencing with an inode. */
void
xchk_ino_xref_set_corrupt(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_ino_error(sc, ino, __return_address);
}

/* Record corruption in a block indexed by a file fork. */
void
xchk_fblock_set_corrupt(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
}

/* Record a corruption while cross-referencing a fork block. */
void
xchk_fblock_xref_set_corrupt(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
}

/*
 * Warn about inodes that need administrative review but are not
 * incorrect.
 */
void
xchk_ino_set_warning(
	struct xfs_scrub	*sc,
	xfs_ino_t		ino)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
	trace_xchk_ino_warning(sc, ino, __return_address);
}

/* Warn about a block indexed by a file fork that needs review. */
void
xchk_fblock_set_warning(
	struct xfs_scrub	*sc,
	int			whichfork,
	xfs_fileoff_t		offset)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
	trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
}

/* Signal an incomplete scrub. */
void
xchk_set_incomplete(
	struct xfs_scrub	*sc)
{
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
	trace_xchk_incomplete(sc, __return_address);
}

/*
 * rmap scrubbing -- compute the number of blocks with a given owner,
 * at least according to the reverse mapping data.
 */

struct xchk_rmap_ownedby_info {
	const struct xfs_owner_info	*oinfo;
	xfs_filblks_t			*blocks;
};

STATIC int
xchk_count_rmap_ownedby_irec(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xchk_rmap_ownedby_info	*sroi = priv;
	bool				irec_attr;
	bool				oinfo_attr;

	irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
	oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;

	if (rec->rm_owner != sroi->oinfo->oi_owner)
		return 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
		(*sroi->blocks) += rec->rm_blockcount;

	return 0;
}

/*
 * Calculate the number of blocks the rmap thinks are owned by something.
 * The caller should pass us an rmapbt cursor.
 */
int
xchk_count_rmap_ownedby_ag(
	struct xfs_scrub		*sc,
	struct xfs_btree_cur		*cur,
	const struct xfs_owner_info	*oinfo,
	xfs_filblks_t			*blocks)
{
	struct xchk_rmap_ownedby_info	sroi = {
		.oinfo			= oinfo,
		.blocks			= blocks,
	};

	*blocks = 0;
	return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
			&sroi);
}
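
/*
 * Sketch of a cross-referencing caller (assumed, not part of this
 * file): compare the rmapbt's notion of AG-owned block counts against
 * another record of the same quantity.
 *
 *	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
 *			&XFS_RMAP_OINFO_AG, &blocks);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 *		return;
 */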
404 | ||
b6c1beb9 DW |
405 | /* |
406 | * AG scrubbing | |
407 | * | |
408 | * These helpers facilitate locking an allocation group's header | |
409 | * buffers, setting up cursors for all btrees that are present, and | |
410 | * cleaning everything up once we're through. | |
411 | */ | |
412 | ||

/* Decide if we want to return an AG header read failure. */
static inline bool
want_ag_read_header_failure(
	struct xfs_scrub	*sc,
	unsigned int		type)
{
	/* Return all AG header read failures when scanning btrees. */
	if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
		return true;
	/*
	 * If we're scanning a given type of AG header, we only want to
	 * see read failures from that specific header.  We'd like the
	 * other headers to cross-check them, but this isn't required.
	 */
	if (sc->sm->sm_type == type)
		return true;
	return false;
}
433 | ||
b6c1beb9 | 434 | /* |
d5c88131 | 435 | * Grab the AG header buffers for the attached perag structure. |
b6c1beb9 | 436 | * |
48c6615c DW |
437 | * The headers should be released by xchk_ag_free, but as a fail safe we attach |
438 | * all the buffers we grab to the scrub transaction so they'll all be freed | |
d5c88131 | 439 | * when we cancel it. |
b6c1beb9 | 440 | */ |
d5c88131 DW |
441 | static inline int |
442 | xchk_perag_read_headers( | |
1d8a748a | 443 | struct xfs_scrub *sc, |
de9d2a78 | 444 | struct xchk_ag *sa) |
b6c1beb9 | 445 | { |
032d91f9 | 446 | int error; |
b6c1beb9 | 447 | |
99b13c7f | 448 | error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); |
a12890ae | 449 | if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) |
48c6615c | 450 | return error; |
b6c1beb9 | 451 | |
08d3e84f | 452 | error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp); |
ab9d5dc5 | 453 | if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) |
48c6615c | 454 | return error; |
b6c1beb9 | 455 | |
48c6615c | 456 | return 0; |
b6c1beb9 DW |
457 | } |
458 | ||
d5c88131 DW |
459 | /* |
460 | * Grab the AG headers for the attached perag structure and wait for pending | |
461 | * intents to drain. | |
462 | */ | |
32080a9b | 463 | int |
d5c88131 DW |
464 | xchk_perag_drain_and_lock( |
465 | struct xfs_scrub *sc) | |
466 | { | |
467 | struct xchk_ag *sa = &sc->sa; | |
468 | int error = 0; | |
469 | ||
470 | ASSERT(sa->pag != NULL); | |
471 | ASSERT(sa->agi_bp == NULL); | |
472 | ASSERT(sa->agf_bp == NULL); | |
473 | ||
474 | do { | |
475 | if (xchk_should_terminate(sc, &error)) | |
476 | return error; | |
477 | ||
478 | error = xchk_perag_read_headers(sc, sa); | |
479 | if (error) | |
480 | return error; | |
481 | ||
482 | /* | |
483 | * If we've grabbed an inode for scrubbing then we assume that | |
484 | * holding its ILOCK will suffice to coordinate with any intent | |
485 | * chains involving this inode. | |
486 | */ | |
487 | if (sc->ip) | |
488 | return 0; | |
489 | ||
490 | /* | |
491 | * Decide if this AG is quiet enough for all metadata to be | |
492 | * consistent with each other. XFS allows the AG header buffer | |
493 | * locks to cycle across transaction rolls while processing | |
494 | * chains of deferred ops, which means that there could be | |
495 | * other threads in the middle of processing a chain of | |
496 | * deferred ops. For regular operations we are careful about | |
497 | * ordering operations to prevent collisions between threads | |
498 | * (which is why we don't need a per-AG lock), but scrub and | |
499 | * repair have to serialize against chained operations. | |
500 | * | |
501 | * We just locked all the AG headers buffers; now take a look | |
502 | * to see if there are any intents in progress. If there are, | |
503 | * drop the AG headers and wait for the intents to drain. | |
504 | * Since we hold all the AG header locks for the duration of | |
505 | * the scrub, this is the only time we have to sample the | |
506 | * intents counter; any threads increasing it after this point | |
507 | * can't possibly be in the middle of a chain of AG metadata | |
508 | * updates. | |
509 | * | |
510 | * Obviously, this should be slanted against scrub and in favor | |
511 | * of runtime threads. | |
512 | */ | |
513 | if (!xfs_perag_intent_busy(sa->pag)) | |
514 | return 0; | |
515 | ||
516 | if (sa->agf_bp) { | |
517 | xfs_trans_brelse(sc->tp, sa->agf_bp); | |
518 | sa->agf_bp = NULL; | |
519 | } | |
520 | ||
521 | if (sa->agi_bp) { | |
522 | xfs_trans_brelse(sc->tp, sa->agi_bp); | |
523 | sa->agi_bp = NULL; | |
524 | } | |
525 | ||
466c525d | 526 | if (!(sc->flags & XCHK_FSGATES_DRAIN)) |
88accf17 | 527 | return -ECHRNG; |
d5c88131 DW |
528 | error = xfs_perag_intent_drain(sa->pag); |
529 | if (error == -ERESTARTSYS) | |
530 | error = -EINTR; | |
531 | } while (!error); | |
532 | ||
533 | return error; | |
534 | } | |
535 | ||

/*
 * Grab the per-AG structure, grab all AG header buffers, and wait until there
 * aren't any pending intents.  Returns -ENOENT if we can't grab the perag
 * structure.
 */
int
xchk_ag_read_headers(
	struct xfs_scrub	*sc,
	xfs_agnumber_t		agno,
	struct xchk_ag		*sa)
{
	struct xfs_mount	*mp = sc->mp;

	ASSERT(!sa->pag);
	sa->pag = xfs_perag_get(mp, agno);
	if (!sa->pag)
		return -ENOENT;

	return xchk_perag_drain_and_lock(sc);
}
556 | ||
b6c1beb9 DW |
557 | /* Release all the AG btree cursors. */ |
558 | void | |
c517b3aa DW |
559 | xchk_ag_btcur_free( |
560 | struct xchk_ag *sa) | |
b6c1beb9 DW |
561 | { |
562 | if (sa->refc_cur) | |
563 | xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR); | |
564 | if (sa->rmap_cur) | |
565 | xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR); | |
566 | if (sa->fino_cur) | |
567 | xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR); | |
568 | if (sa->ino_cur) | |
569 | xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR); | |
570 | if (sa->cnt_cur) | |
571 | xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR); | |
572 | if (sa->bno_cur) | |
573 | xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR); | |
574 | ||
575 | sa->refc_cur = NULL; | |
576 | sa->rmap_cur = NULL; | |
577 | sa->fino_cur = NULL; | |
578 | sa->ino_cur = NULL; | |
579 | sa->bno_cur = NULL; | |
580 | sa->cnt_cur = NULL; | |
581 | } | |
582 | ||
583 | /* Initialize all the btree cursors for an AG. */ | |
f53acfac | 584 | void |
c517b3aa | 585 | xchk_ag_btcur_init( |
1d8a748a | 586 | struct xfs_scrub *sc, |
c517b3aa | 587 | struct xchk_ag *sa) |
b6c1beb9 | 588 | { |
032d91f9 | 589 | struct xfs_mount *mp = sc->mp; |
b6c1beb9 | 590 | |
48039926 | 591 | if (sa->agf_bp) { |
b6c1beb9 | 592 | /* Set up a bnobt cursor for cross-referencing. */ |
1c8b9fd2 CH |
593 | sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp, |
594 | sa->pag); | |
48039926 CH |
595 | xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur, |
596 | XFS_SCRUB_TYPE_BNOBT); | |
b6c1beb9 DW |
597 | |
598 | /* Set up a cntbt cursor for cross-referencing. */ | |
1c8b9fd2 CH |
599 | sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp, |
600 | sa->pag); | |
48039926 CH |
601 | xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur, |
602 | XFS_SCRUB_TYPE_CNTBT); | |
603 | ||
604 | /* Set up a rmapbt cursor for cross-referencing. */ | |
605 | if (xfs_has_rmapbt(mp)) { | |
606 | sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, | |
607 | sa->agf_bp, sa->pag); | |
608 | xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur, | |
609 | XFS_SCRUB_TYPE_RMAPBT); | |
610 | } | |
611 | ||
612 | /* Set up a refcountbt cursor for cross-referencing. */ | |
613 | if (xfs_has_reflink(mp)) { | |
614 | sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, | |
615 | sa->agf_bp, sa->pag); | |
616 | xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur, | |
617 | XFS_SCRUB_TYPE_REFCNTBT); | |
618 | } | |
b6c1beb9 DW |
619 | } |
620 | ||
48039926 CH |
621 | if (sa->agi_bp) { |
622 | /* Set up a inobt cursor for cross-referencing. */ | |
14dd46cf CH |
623 | sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, |
624 | sa->agi_bp); | |
48039926 CH |
625 | xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur, |
626 | XFS_SCRUB_TYPE_INOBT); | |
627 | ||
628 | /* Set up a finobt cursor for cross-referencing. */ | |
629 | if (xfs_has_finobt(mp)) { | |
14dd46cf CH |
630 | sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp, |
631 | sa->agi_bp); | |
48039926 CH |
632 | xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur, |
633 | XFS_SCRUB_TYPE_FINOBT); | |
634 | } | |
b6c1beb9 | 635 | } |
b6c1beb9 DW |
636 | } |
637 | ||
638 | /* Release the AG header context and btree cursors. */ | |
639 | void | |
c517b3aa | 640 | xchk_ag_free( |
1d8a748a | 641 | struct xfs_scrub *sc, |
c517b3aa | 642 | struct xchk_ag *sa) |
b6c1beb9 | 643 | { |
c517b3aa | 644 | xchk_ag_btcur_free(sa); |
dbfbf3bd | 645 | xrep_reset_perag_resv(sc); |
b6c1beb9 DW |
646 | if (sa->agf_bp) { |
647 | xfs_trans_brelse(sc->tp, sa->agf_bp); | |
648 | sa->agf_bp = NULL; | |
649 | } | |
650 | if (sa->agi_bp) { | |
651 | xfs_trans_brelse(sc->tp, sa->agi_bp); | |
652 | sa->agi_bp = NULL; | |
653 | } | |
51863d7d DW |
654 | if (sa->pag) { |
655 | xfs_perag_put(sa->pag); | |
656 | sa->pag = NULL; | |
657 | } | |
b6c1beb9 DW |
658 | } |
659 | ||
660 | /* | |
48c6615c DW |
661 | * For scrub, grab the perag structure, the AGI, and the AGF headers, in that |
662 | * order. Locking order requires us to get the AGI before the AGF. We use the | |
663 | * transaction to avoid deadlocking on crosslinked metadata buffers; either the | |
664 | * caller passes one in (bmap scrub) or we have to create a transaction | |
665 | * ourselves. Returns ENOENT if the perag struct cannot be grabbed. | |
b6c1beb9 DW |
666 | */ |
667 | int | |
c517b3aa | 668 | xchk_ag_init( |
1d8a748a | 669 | struct xfs_scrub *sc, |
032d91f9 | 670 | xfs_agnumber_t agno, |
c517b3aa | 671 | struct xchk_ag *sa) |
b6c1beb9 | 672 | { |
032d91f9 | 673 | int error; |
b6c1beb9 | 674 | |
de9d2a78 | 675 | error = xchk_ag_read_headers(sc, agno, sa); |
b6c1beb9 DW |
676 | if (error) |
677 | return error; | |
678 | ||
f53acfac DW |
679 | xchk_ag_btcur_init(sc, sa); |
680 | return 0; | |
b6c1beb9 DW |
681 | } |
682 | ||
dcb660f9 DW |
683 | /* Per-scrubber setup functions */ |
684 | ||
302436c2 DW |
685 | void |
686 | xchk_trans_cancel( | |
687 | struct xfs_scrub *sc) | |
688 | { | |
689 | xfs_trans_cancel(sc->tp); | |
690 | sc->tp = NULL; | |
691 | } | |
692 | ||
564fee6d DW |
693 | int |
694 | xchk_trans_alloc_empty( | |
695 | struct xfs_scrub *sc) | |
696 | { | |
697 | return xfs_trans_alloc_empty(sc->mp, &sc->tp); | |
698 | } | |
699 | ||
9d9c9028 DW |
700 | /* |
701 | * Grab an empty transaction so that we can re-grab locked buffers if | |
702 | * one of our btrees turns out to be cyclic. | |
0a9633fa DW |
703 | * |
704 | * If we're going to repair something, we need to ask for the largest possible | |
705 | * log reservation so that we can handle the worst case scenario for metadata | |
706 | * updates while rebuilding a metadata item. We also need to reserve as many | |
707 | * blocks in the head transaction as we think we're going to need to rebuild | |
708 | * the metadata object. | |
9d9c9028 DW |
709 | */ |
710 | int | |
c517b3aa | 711 | xchk_trans_alloc( |
1d8a748a | 712 | struct xfs_scrub *sc, |
032d91f9 | 713 | uint resblks) |
9d9c9028 | 714 | { |
0a9633fa DW |
715 | if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) |
716 | return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, | |
717 | resblks, 0, 0, &sc->tp); | |
718 | ||
564fee6d | 719 | return xchk_trans_alloc_empty(sc); |
9d9c9028 DW |
720 | } |
721 | ||
dcb660f9 DW |
722 | /* Set us up with a transaction and an empty context. */ |
723 | int | |
c517b3aa | 724 | xchk_setup_fs( |
026f57eb | 725 | struct xfs_scrub *sc) |
dcb660f9 | 726 | { |
032d91f9 | 727 | uint resblks; |
0a9633fa | 728 | |
b5e2196e | 729 | resblks = xrep_calc_ag_resblks(sc); |
c517b3aa | 730 | return xchk_trans_alloc(sc, resblks); |
dcb660f9 | 731 | } |

/* Set us up with AG headers and btree cursors. */
int
xchk_setup_ag_btree(
	struct xfs_scrub	*sc,
	bool			force_log)
{
	struct xfs_mount	*mp = sc->mp;
	int			error;

	/*
	 * If the caller asks us to checkpoint the log, do so.  This
	 * expensive operation should be performed infrequently and only
	 * as a last resort.  Any caller that sets force_log should
	 * document why they need to do so.
	 */
	if (force_log) {
		error = xchk_checkpoint_log(mp);
		if (error)
			return error;
	}

	error = xchk_setup_fs(sc);
	if (error)
		return error;

	return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
}

/* Push everything out of the log onto disk. */
int
xchk_checkpoint_log(
	struct xfs_mount	*mp)
{
	int			error;

	error = xfs_log_force(mp, XFS_LOG_SYNC);
	if (error)
		return error;
	xfs_ail_push_all_sync(mp->m_ail);
	return 0;
}

/* Verify that an inode is allocated ondisk, then return its cached inode. */
int
xchk_iget(
	struct xfs_scrub	*sc,
	xfs_ino_t		inum,
	struct xfs_inode	**ipp)
{
	ASSERT(sc->tp != NULL);

	return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
}
786 | ||
302436c2 DW |
787 | /* |
788 | * Try to grab an inode in a manner that avoids races with physical inode | |
789 | * allocation. If we can't, return the locked AGI buffer so that the caller | |
790 | * can single-step the loading process to see where things went wrong. | |
791 | * Callers must have a valid scrub transaction. | |
792 | * | |
793 | * If the iget succeeds, return 0, a NULL AGI, and the inode. | |
794 | * | |
795 | * If the iget fails, return the error, the locked AGI, and a NULL inode. This | |
796 | * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are | |
797 | * no longer allocated; or any other corruption or runtime error. | |
798 | * | |
799 | * If the AGI read fails, return the error, a NULL AGI, and NULL inode. | |
800 | * | |
801 | * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode. | |
802 | */ | |
803 | int | |
804 | xchk_iget_agi( | |
805 | struct xfs_scrub *sc, | |
806 | xfs_ino_t inum, | |
807 | struct xfs_buf **agi_bpp, | |
808 | struct xfs_inode **ipp) | |
809 | { | |
810 | struct xfs_mount *mp = sc->mp; | |
811 | struct xfs_trans *tp = sc->tp; | |
812 | struct xfs_perag *pag; | |
813 | int error; | |
814 | ||
815 | ASSERT(sc->tp != NULL); | |
816 | ||
817 | again: | |
818 | *agi_bpp = NULL; | |
819 | *ipp = NULL; | |
820 | error = 0; | |
821 | ||
822 | if (xchk_should_terminate(sc, &error)) | |
823 | return error; | |
824 | ||
825 | /* | |
826 | * Attach the AGI buffer to the scrub transaction to avoid deadlocks | |
827 | * in the iget cache miss path. | |
828 | */ | |
829 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); | |
830 | error = xfs_ialloc_read_agi(pag, tp, agi_bpp); | |
831 | xfs_perag_put(pag); | |
832 | if (error) | |
833 | return error; | |
834 | ||
835 | error = xfs_iget(mp, tp, inum, | |
836 | XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp); | |
837 | if (error == -EAGAIN) { | |
838 | /* | |
839 | * The inode may be in core but temporarily unavailable and may | |
840 | * require the AGI buffer before it can be returned. Drop the | |
841 | * AGI buffer and retry the lookup. | |
842 | * | |
843 | * Incore lookup will fail with EAGAIN on a cache hit if the | |
844 | * inode is queued to the inactivation list. The inactivation | |
845 | * worker may remove the inode from the unlinked list and hence | |
846 | * needs the AGI. | |
847 | * | |
848 | * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN | |
849 | * to allow inodegc to make progress and move the inode to | |
850 | * IRECLAIMABLE state where xfs_iget will be able to return it | |
851 | * again if it can lock the inode. | |
852 | */ | |
853 | xfs_trans_brelse(tp, *agi_bpp); | |
854 | delay(1); | |
855 | goto again; | |
856 | } | |
857 | if (error) | |
858 | return error; | |
859 | ||
860 | /* We got the inode, so we can release the AGI. */ | |
861 | ASSERT(*ipp != NULL); | |
862 | xfs_trans_brelse(tp, *agi_bpp); | |
863 | *agi_bpp = NULL; | |
864 | return 0; | |
865 | } | |
866 | ||
259ba1d3 DW |
867 | #ifdef CONFIG_XFS_QUOTA |
868 | /* | |
869 | * Try to attach dquots to this inode if we think we might want to repair it. | |
870 | * Callers must not hold any ILOCKs. If the dquots are broken and cannot be | |
871 | * attached, a quotacheck will be scheduled. | |
872 | */ | |
873 | int | |
874 | xchk_ino_dqattach( | |
875 | struct xfs_scrub *sc) | |
876 | { | |
877 | ASSERT(sc->tp != NULL); | |
878 | ASSERT(sc->ip != NULL); | |
879 | ||
880 | if (!xchk_could_repair(sc)) | |
881 | return 0; | |
882 | ||
883 | return xrep_ino_dqattach(sc); | |
884 | } | |
885 | #endif | |
886 | ||

/* Install an inode that we opened by handle for scrubbing. */
int
xchk_install_handle_inode(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
		xchk_irele(sc, ip);
		return -ENOENT;
	}

	sc->ip = ip;
	return 0;
}
901 | ||
17308539 DW |
902 | /* |
903 | * Install an already-referenced inode for scrubbing. Get our own reference to | |
904 | * the inode to make disposal simpler. The inode must not be in I_FREEING or | |
905 | * I_WILL_FREE state! | |
906 | */ | |
907 | int | |
908 | xchk_install_live_inode( | |
909 | struct xfs_scrub *sc, | |
910 | struct xfs_inode *ip) | |
911 | { | |
912 | if (!igrab(VFS_I(ip))) { | |
913 | xchk_ino_set_corrupt(sc, ip->i_ino); | |
914 | return -EFSCORRUPTED; | |
915 | } | |
916 | ||
917 | sc->ip = ip; | |
918 | return 0; | |
919 | } | |
920 | ||
80e4e126 | 921 | /* |
46e0dd89 DW |
922 | * In preparation to scrub metadata structures that hang off of an inode, |
923 | * grab either the inode referenced in the scrub control structure or the | |
924 | * inode passed in. If the inumber does not reference an allocated inode | |
925 | * record, the function returns ENOENT to end the scrub early. The inode | |
926 | * is not locked. | |
80e4e126 DW |
927 | */ |
928 | int | |
46e0dd89 | 929 | xchk_iget_for_scrubbing( |
026f57eb | 930 | struct xfs_scrub *sc) |
80e4e126 | 931 | { |
032d91f9 DW |
932 | struct xfs_imap imap; |
933 | struct xfs_mount *mp = sc->mp; | |
498f0adb | 934 | struct xfs_perag *pag; |
302436c2 | 935 | struct xfs_buf *agi_bp; |
026f57eb | 936 | struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); |
032d91f9 | 937 | struct xfs_inode *ip = NULL; |
302436c2 | 938 | xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino); |
032d91f9 | 939 | int error; |
80e4e126 | 940 | |
302436c2 DW |
941 | ASSERT(sc->tp == NULL); |
942 | ||
80e4e126 | 943 | /* We want to scan the inode we already had opened. */ |
17308539 DW |
944 | if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) |
945 | return xchk_install_live_inode(sc, ip_in); | |
80e4e126 | 946 | |
302436c2 | 947 | /* Reject internal metadata files and obviously bad inode numbers. */ |
80e4e126 DW |
948 | if (xfs_internal_inum(mp, sc->sm->sm_ino)) |
949 | return -ENOENT; | |
302436c2 DW |
950 | if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) |
951 | return -ENOENT; | |
952 | ||
3f113c27 DW |
953 | /* Try a safe untrusted iget. */ |
954 | error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); | |
302436c2 DW |
955 | if (!error) |
956 | return xchk_install_handle_inode(sc, ip); | |
957 | if (error == -ENOENT) | |
80e4e126 | 958 | return error; |
302436c2 DW |
959 | if (error != -EINVAL) |
960 | goto out_error; | |
961 | ||
962 | /* | |
963 | * EINVAL with IGET_UNTRUSTED probably means one of several things: | |
964 | * userspace gave us an inode number that doesn't correspond to fs | |
965 | * space; the inode btree lacks a record for this inode; or there is a | |
966 | * record, and it says this inode is free. | |
967 | * | |
968 | * We want to look up this inode in the inobt to distinguish two | |
969 | * scenarios: (1) the inobt says the inode is free, in which case | |
970 | * there's nothing to do; and (2) the inobt says the inode is | |
971 | * allocated, but loading it failed due to corruption. | |
972 | * | |
973 | * Allocate a transaction and grab the AGI to prevent inobt activity | |
974 | * in this AG. Retry the iget in case someone allocated a new inode | |
975 | * after the first iget failed. | |
976 | */ | |
977 | error = xchk_trans_alloc(sc, 0); | |
978 | if (error) | |
979 | goto out_error; | |
980 | ||
981 | error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip); | |
982 | if (error == 0) { | |
983 | /* Actually got the inode, so install it. */ | |
984 | xchk_trans_cancel(sc); | |
985 | return xchk_install_handle_inode(sc, ip); | |
80e4e126 | 986 | } |
302436c2 DW |
987 | if (error == -ENOENT) |
988 | goto out_gone; | |
989 | if (error != -EINVAL) | |
990 | goto out_cancel; | |
991 | ||
992 | /* Ensure that we have protected against inode allocation/freeing. */ | |
993 | if (agi_bp == NULL) { | |
994 | ASSERT(agi_bp != NULL); | |
995 | error = -ECANCELED; | |
996 | goto out_cancel; | |
80e4e126 DW |
997 | } |
998 | ||
302436c2 DW |
999 | /* |
1000 | * Untrusted iget failed a second time. Let's try an inobt lookup. | |
1001 | * If the inobt thinks this the inode neither can exist inside the | |
1002 | * filesystem nor is allocated, return ENOENT to signal that the check | |
1003 | * can be skipped. | |
1004 | * | |
1005 | * If the lookup returns corruption, we'll mark this inode corrupt and | |
1006 | * exit to userspace. There's little chance of fixing anything until | |
1007 | * the inobt is straightened out, but there's nothing we can do here. | |
1008 | * | |
1009 | * If the lookup encounters any other error, exit to userspace. | |
1010 | * | |
1011 | * If the lookup succeeds, something else must be very wrong in the fs | |
1012 | * such that setting up the incore inode failed in some strange way. | |
1013 | * Treat those as corruptions. | |
1014 | */ | |
1015 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino)); | |
1016 | if (!pag) { | |
1017 | error = -EFSCORRUPTED; | |
1018 | goto out_cancel; | |
1019 | } | |
1020 | ||
1021 | error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap, | |
1022 | XFS_IGET_UNTRUSTED); | |
1023 | xfs_perag_put(pag); | |
1024 | if (error == -EINVAL || error == -ENOENT) | |
1025 | goto out_gone; | |
1026 | if (!error) | |
1027 | error = -EFSCORRUPTED; | |
1028 | ||
1029 | out_cancel: | |
1030 | xchk_trans_cancel(sc); | |
1031 | out_error: | |
1032 | trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), | |
1033 | error, __return_address); | |
1034 | return error; | |
1035 | out_gone: | |
1036 | /* The file is gone, so there's nothing to check. */ | |
1037 | xchk_trans_cancel(sc); | |
1038 | return -ENOENT; | |
80e4e126 | 1039 | } |

/* Release an inode, possibly dropping it in the process. */
void
xchk_irele(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	if (current->journal_info != NULL) {
		ASSERT(current->journal_info == sc->tp);

		/*
		 * If we are in a transaction, we /cannot/ drop the inode
		 * ourselves, because the VFS will trigger writeback, which
		 * can require a transaction.  Clear DONTCACHE to force the
		 * inode to the LRU, where someone else can take care of
		 * dropping it.
		 *
		 * Note that when we grabbed our reference to the inode, it
		 * could have had an active ref and DONTCACHE set if a sysadmin
		 * is trying to coerce a change in file access mode.  icache
		 * hits do not clear DONTCACHE, so we must do it here.
		 */
		spin_lock(&VFS_I(ip)->i_lock);
		VFS_I(ip)->i_state &= ~I_DONTCACHE;
		spin_unlock(&VFS_I(ip)->i_lock);
	} else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
		/*
		 * If this is the last reference to the inode and the caller
		 * permits it, set DONTCACHE to avoid thrashing.
		 */
		d_mark_dontcache(VFS_I(ip));
	}

	xfs_irele(ip);
}

/*
 * Set us up to scrub metadata mapped by a file's fork.  Callers must not use
 * this to operate on user-accessible regular file data because the MMAPLOCK is
 * not taken.
 */
int
xchk_setup_inode_contents(
	struct xfs_scrub	*sc,
	unsigned int		resblks)
{
	int			error;

	error = xchk_iget_for_scrubbing(sc);
	if (error)
		return error;

	/* Lock the inode so the VFS cannot touch this file. */
	xchk_ilock(sc, XFS_IOLOCK_EXCL);

	error = xchk_trans_alloc(sc, resblks);
	if (error)
		goto out;

	error = xchk_ino_dqattach(sc);
	if (error)
		goto out;

	xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
	/* scrub teardown will unlock and release the inode for us */
	return error;
}

void
xchk_ilock(
	struct xfs_scrub	*sc,
	unsigned int		ilock_flags)
{
	xfs_ilock(sc->ip, ilock_flags);
	sc->ilock_flags |= ilock_flags;
}

bool
xchk_ilock_nowait(
	struct xfs_scrub	*sc,
	unsigned int		ilock_flags)
{
	if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
		sc->ilock_flags |= ilock_flags;
		return true;
	}

	return false;
}

void
xchk_iunlock(
	struct xfs_scrub	*sc,
	unsigned int		ilock_flags)
{
	sc->ilock_flags &= ~ilock_flags;
	xfs_iunlock(sc->ip, ilock_flags);
}
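
/*
 * Sketch of the trylock pattern these helpers enable (assumed caller,
 * not part of this file): avoid blocking on a contended ILOCK and let
 * the scrub core retry the whole operation instead.
 *
 *	if (!xchk_ilock_nowait(sc, XFS_ILOCK_EXCL))
 *		return -EDEADLOCK;
 *
 * -EDEADLOCK is the same "restart with deadlock avoidance" signal that
 * __xchk_process_error() recognizes above.
 */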

/*
 * Predicate that decides if we need to evaluate the cross-reference check.
 * If there was an error accessing the cross-reference btree, just delete
 * the cursor and skip the check.
 */
bool
xchk_should_check_xref(
	struct xfs_scrub	*sc,
	int			*error,
	struct xfs_btree_cur	**curpp)
{
	/* No point in xref if we already know we're corrupt. */
	if (xchk_skip_xref(sc->sm))
		return false;

	if (*error == 0)
		return true;

	if (curpp) {
		/* If we've already given up on xref, just bail out. */
		if (!*curpp)
			return false;

		/* xref error, delete cursor and bail out. */
		xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
		*curpp = NULL;
	}

	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
	trace_xchk_xref_error(sc, *error, __return_address);

	/*
	 * Errors encountered during cross-referencing with another
	 * data structure should not cause this scrubber to abort.
	 */
	*error = 0;
	return false;
}
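
/*
 * Sketch of a cross-reference check built on this predicate (assumed
 * caller, not part of this file):
 *
 *	error = xfs_rmap_record_exists(sc->sa.rmap_cur, agbno, len,
 *			oinfo, &has_rmap);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 *		return;
 *	if (!has_rmap)
 *		xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
 *
 * If the rmapbt lookup failed, the predicate deletes the cursor, sets
 * OFLAG_XFAIL, and clears the error so the scrubber can keep going.
 */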

/* Run the structure verifiers on in-memory buffers to detect bad memory. */
void
xchk_buffer_recheck(
	struct xfs_scrub	*sc,
	struct xfs_buf		*bp)
{
	xfs_failaddr_t		fa;

	if (bp->b_ops == NULL) {
		xchk_block_set_corrupt(sc, bp);
		return;
	}
	if (bp->b_ops->verify_struct == NULL) {
		xchk_set_incomplete(sc);
		return;
	}
	fa = bp->b_ops->verify_struct(bp);
	if (!fa)
		return;
	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
	trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
}

static inline int
xchk_metadata_inode_subtype(
	struct xfs_scrub	*sc,
	unsigned int		scrub_type)
{
	__u32			smtype = sc->sm->sm_type;
	unsigned int		sick_mask = sc->sick_mask;
	int			error;

	sc->sm->sm_type = scrub_type;

	switch (scrub_type) {
	case XFS_SCRUB_TYPE_INODE:
		error = xchk_inode(sc);
		break;
	case XFS_SCRUB_TYPE_BMBTD:
		error = xchk_bmap_data(sc);
		break;
	default:
		ASSERT(0);
		error = -EFSCORRUPTED;
		break;
	}

	sc->sick_mask = sick_mask;
	sc->sm->sm_type = smtype;
	return error;
}

/*
 * Scrub the attr/data forks of a metadata inode.  The metadata inode must be
 * pointed to by sc->ip and the ILOCK must be held.
 */
int
xchk_metadata_inode_forks(
	struct xfs_scrub	*sc)
{
	bool			shared;
	int			error;

	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
		return 0;

	/* Check the inode record. */
	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
		return error;

	/* Metadata inodes don't live on the rt device. */
	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* They should never participate in reflink. */
	if (xfs_is_reflink_inode(sc->ip)) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* They also should never have extended attributes. */
	if (xfs_inode_hasattr(sc->ip)) {
		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
		return 0;
	}

	/* Invoke the data fork scrubber. */
	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
		return error;

	/* Look for incorrect shared blocks. */
	if (xfs_has_reflink(sc->mp)) {
		error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
				&shared);
		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
				&error))
			return error;
		if (shared)
			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
	}

	return 0;
}

/*
 * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
 * operation.  Callers must not hold any locks that intersect with the CPU
 * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
 * to change kernel code.
 */
void
xchk_fsgates_enable(
	struct xfs_scrub	*sc,
	unsigned int		scrub_fsgates)
{
	ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
	ASSERT(!(sc->flags & scrub_fsgates));

	trace_xchk_fsgates_enable(sc, scrub_fsgates);

	if (scrub_fsgates & XCHK_FSGATES_DRAIN)
		xfs_drain_wait_enable();

	if (scrub_fsgates & XCHK_FSGATES_QUOTA)
		xfs_dqtrx_hook_enable();

	if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
		xfs_dir_hook_enable();

	if (scrub_fsgates & XCHK_FSGATES_RMAP)
		xfs_rmap_hook_enable();

	sc->flags |= scrub_fsgates;
}
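
/*
 * Sketch of a setup function opting in (assumed caller, not part of
 * this file): a scrubber that must quiesce deferred-op chains enables
 * the drain gate on a retry, which pairs with the -ECHRNG signal in
 * xchk_perag_drain_and_lock() above.
 *
 *	if (sc->flags & XCHK_TRY_HARDER)
 *		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 */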

/*
 * Decide if this is a cached inode that's also allocated.  The caller
 * must hold a reference to an AG and the AGI buffer lock to prevent inodes
 * from being allocated or freed.
 *
 * Look up an inode by number in the given file system.  If the inode number
 * is invalid, return -EINVAL.  If the inode is not in cache, return -ENODATA.
 * If the inode is being reclaimed, return -ENODATA because we know the inode
 * cache cannot be updating the ondisk metadata.
 *
 * Otherwise, the incore inode is the one we want, and it is either live,
 * somewhere in the inactivation machinery, or reclaimable.  The inode is
 * allocated if i_mode is nonzero.  In all three cases, the cached inode will
 * be more up to date than the ondisk inode buffer, so we must use the incore
 * i_mode.
 */
int
xchk_inode_is_allocated(
	struct xfs_scrub	*sc,
	xfs_agino_t		agino,
	bool			*inuse)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag = sc->sa.pag;
	xfs_ino_t		ino;
	struct xfs_inode	*ip;
	int			error;

	/* caller must hold perag reference */
	if (pag == NULL) {
		ASSERT(pag != NULL);
		return -EINVAL;
	}

	/* caller must have AGI buffer */
	if (sc->sa.agi_bp == NULL) {
		ASSERT(sc->sa.agi_bp != NULL);
		return -EINVAL;
	}

	/* reject inode numbers outside existing AGs */
	ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
	if (!xfs_verify_ino(mp, ino))
		return -EINVAL;

	error = -ENODATA;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
	if (!ip) {
		/* cache miss */
		goto out_rcu;
	}

	/*
	 * If the inode number doesn't match, the incore inode got reused
	 * during an RCU grace period and the radix tree hasn't been updated.
	 * This isn't the inode we want.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino)
		goto out_skip;

	trace_xchk_inode_is_allocated(ip);

	/*
	 * We have an incore inode that matches the inode we want, and the
	 * caller holds the perag structure and the AGI buffer.  Let's check
	 * our assumptions below:
	 */

#ifdef DEBUG
	/*
	 * (1) If the incore inode is live (i.e. referenced from the dcache),
	 * it will not be INEW, nor will it be in the inactivation or reclaim
	 * machinery.  The ondisk inode had better be allocated.  This is the
	 * most trivial case.
	 */
	if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
			     XFS_INACTIVATING))) {
		/* live inode */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}

	/*
	 * If the incore inode is INEW, there are several possibilities:
	 *
	 * (2) For a file that is being created, note that we allocate the
	 * ondisk inode before allocating, initializing, and adding the incore
	 * inode to the radix tree.
	 *
	 * (3) If the incore inode is being recycled, the inode has to be
	 * allocated because we don't allow freed inodes to be recycled.
	 * Recycling doesn't touch i_mode.
	 */
	if (ip->i_flags & XFS_INEW) {
		/* created on disk already or recycling */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}

	/*
	 * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
	 * inactivation has not started (!INACTIVATING), it is still allocated.
	 */
	if ((ip->i_flags & XFS_NEED_INACTIVE) &&
	    !(ip->i_flags & XFS_INACTIVATING)) {
		/* definitely before difree */
		ASSERT(VFS_I(ip)->i_mode != 0);
	}
#endif

	/*
	 * If the incore inode is undergoing inactivation (INACTIVATING), there
	 * are two possibilities:
	 *
	 * (5) It is before the point where it would get freed ondisk, in which
	 * case i_mode is still nonzero.
	 *
	 * (6) It has already been freed, in which case i_mode is zero.
	 *
	 * We don't take the ILOCK here, but difree and dialloc update the AGI,
	 * and we've taken the AGI buffer lock, which prevents that from
	 * happening.
	 */

	/*
	 * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
	 * reclaim (IRECLAIMABLE) could be allocated or free.  i_mode still
	 * reflects the ondisk state.
	 */

	/*
	 * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
	 * the flush code uses i_mode to format the ondisk inode.
	 */

	/*
	 * (9) If the inode is in IRECLAIM and was reachable via the radix
	 * tree, it still has the same i_mode as it did before it entered
	 * reclaim.  The inode object is still alive because we hold the RCU
	 * read lock.
	 */

	*inuse = VFS_I(ip)->i_mode != 0;
	error = 0;

out_skip:
	spin_unlock(&ip->i_flags_lock);
out_rcu:
	rcu_read_unlock();
	return error;
}
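
/*
 * Sketch of a caller (assumed, not part of this file): prefer the
 * incore answer, falling back to the ondisk inode cluster buffer when
 * the inode simply isn't cached.
 *
 *	error = xchk_inode_is_allocated(sc, agino, &inuse);
 *	if (error == -ENODATA)
 *		...fall back to reading the ondisk inode cluster...
 *	else if (error)
 *		return error;
 */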