fs/xfs/scrub/common.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <djwong@kernel.org>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_trans_resv.h"
  11 #include "xfs_mount.h"
  12 #include "xfs_btree.h"
  13 #include "xfs_log_format.h"
  14 #include "xfs_trans.h"
  15 #include "xfs_inode.h"
  16 #include "xfs_icache.h"
  17 #include "xfs_alloc.h"
  18 #include "xfs_alloc_btree.h"
  19 #include "xfs_ialloc.h"
  20 #include "xfs_ialloc_btree.h"
  21 #include "xfs_refcount_btree.h"
  22 #include "xfs_rmap.h"
  23 #include "xfs_rmap_btree.h"
  24 #include "xfs_log.h"
  25 #include "xfs_trans_priv.h"
  26 #include "xfs_da_format.h"
  27 #include "xfs_da_btree.h"
  28 #include "xfs_dir2_priv.h"
  29 #include "xfs_attr.h"
  30 #include "xfs_reflink.h"
  31 #include "xfs_ag.h"
  32 #include "xfs_error.h"
  33 #include "xfs_quota.h"
  34 #include "scrub/scrub.h"
  35 #include "scrub/common.h"
  36 #include "scrub/trace.h"
  37 #include "scrub/repair.h"
  38 #include "scrub/health.h"
  39
  40 /* Common code for the metadata scrubbers. */
  41
  42 /*
  43  * Handling operational errors.
  44  *
  45  * The *_process_error() family of functions are used to process error return
  46  * codes from functions called as part of a scrub operation.
  47  *
  48  * If there's no error, we return true to tell the caller that it's ok
  49  * to move on to the next check in its list.
  50  *
  51  * For non-verifier errors (e.g. ENOMEM) we return false to tell the
  52  * caller that something bad happened, and we preserve *error so that
  53  * the caller can return the *error up the stack to userspace.
  54  *
  55  * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
  56  * OFLAG_CORRUPT in sm_flags and the *error is cleared.  In other words,
  57  * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
  58  * not via return codes.  We return false to tell the caller that
  59  * something bad happened.  Since the error has been cleared, the caller
  60  * will (presumably) return that zero and scrubbing will move on to
  61  * whatever's next.
  62  *
  63  * ftrace can be used to record the precise metadata location and the
  64  * approximate code location of the failed operation.
  65  */
  66
  67 /* Check for operational errors. */
  68 static bool
  69 __xchk_process_error(
  70         struct xfs_scrub        *sc,
  71         xfs_agnumber_t          agno,
  72         xfs_agblock_t           bno,
  73         int                     *error,
  74         __u32                   errflag,
  75         void                    *ret_ip)
  76 {
  77         switch (*error) {
  78         case 0:
  79                 return true;
  80         case -EDEADLOCK:
  81         case -ECHRNG:
  82                 /* Used to restart an op with deadlock avoidance. */
  83                 trace_xchk_deadlock_retry(
  84                                 sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
  85                                 sc->sm, *error);
  86                 break;
  87         case -ECANCELED:
  88                 /*
  89                  * ECANCELED here means that the caller set one of the scrub
  90                  * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
  91                  * quickly.  Set error to zero and do not continue.
  92                  */
  93                 trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
  94                 *error = 0;
  95                 break;
  96         case -EFSBADCRC:
  97         case -EFSCORRUPTED:
  98                 /* Note the badness but don't abort. */
  99                 sc->sm->sm_flags |= errflag;
 100                 *error = 0;
 101                 fallthrough;
 102         default:
 103                 trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
 104                 break;
 105         }
 106         return false;
 107 }
 108
 109 bool
 110 xchk_process_error(
 111         struct xfs_scrub        *sc,
 112         xfs_agnumber_t          agno,
 113         xfs_agblock_t           bno,
 114         int                     *error)
 115 {
 116         return __xchk_process_error(sc, agno, bno, error,
 117                         XFS_SCRUB_OFLAG_CORRUPT, __return_address);
 118 }
 119
 120 bool
 121 xchk_xref_process_error(
 122         struct xfs_scrub        *sc,
 123         xfs_agnumber_t          agno,
 124         xfs_agblock_t           bno,
 125         int                     *error)
 126 {
 127         return __xchk_process_error(sc, agno, bno, error,
 128                         XFS_SCRUB_OFLAG_XFAIL, __return_address);
 129 }
 130
 131 /* Check for operational errors for a file offset. */
 132 static bool
 133 __xchk_fblock_process_error(
 134         struct xfs_scrub        *sc,
 135         int                     whichfork,
 136         xfs_fileoff_t           offset,
 137         int                     *error,
 138         __u32                   errflag,
 139         void                    *ret_ip)
 140 {
 141         switch (*error) {
 142         case 0:
 143                 return true;
 144         case -EDEADLOCK:
 145         case -ECHRNG:
 146                 /* Used to restart an op with deadlock avoidance. */
 147                 trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
 148                 break;
 149         case -ECANCELED:
 150                 /*
 151                  * ECANCELED here means that the caller set one of the scrub
 152                  * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
 153                  * quickly.  Set error to zero and do not continue.
 154                  */
 155                 trace_xchk_file_op_error(sc, whichfork, offset, *error,
 156                                 ret_ip);
 157                 *error = 0;
 158                 break;
 159         case -EFSBADCRC:
 160         case -EFSCORRUPTED:
 161                 /* Note the badness but don't abort. */
 162                 sc->sm->sm_flags |= errflag;
 163                 *error = 0;
 164                 fallthrough;
 165         default:
 166                 trace_xchk_file_op_error(sc, whichfork, offset, *error,
 167                                 ret_ip);
 168                 break;
 169         }
 170         return false;
 171 }
 172
 173 bool
 174 xchk_fblock_process_error(
 175         struct xfs_scrub        *sc,
 176         int                     whichfork,
 177         xfs_fileoff_t           offset,
 178         int                     *error)
 179 {
 180         return __xchk_fblock_process_error(sc, whichfork, offset, error,
 181                         XFS_SCRUB_OFLAG_CORRUPT, __return_address);
 182 }
 183
 184 bool
 185 xchk_fblock_xref_process_error(
 186         struct xfs_scrub        *sc,
 187         int                     whichfork,
 188         xfs_fileoff_t           offset,
 189         int                     *error)
 190 {
 191         return __xchk_fblock_process_error(sc, whichfork, offset, error,
 192                         XFS_SCRUB_OFLAG_XFAIL, __return_address);
 193 }
 194
 195 /*
 196  * Handling scrub corruption/optimization/warning checks.
 197  *
 198  * The *_set_{corrupt,preen,warning}() family of functions are used to
 199  * record the presence of metadata that is incorrect (corrupt), could be
 200  * optimized somehow (preen), or should be flagged for administrative
 201  * review but is not incorrect (warn).
 202  *
 203  * ftrace can be used to record the precise metadata location and
 204  * approximate code location of the failed check.
 205  */
 206
 207 /* Record a block which could be optimized. */
 208 void
 209 xchk_block_set_preen(
 210         struct xfs_scrub        *sc,
 211         struct xfs_buf          *bp)
 212 {
 213         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
 214         trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
 215 }
 216
 217 /*
 218  * Record an inode which could be optimized.  The trace data will
 219  * include the block given by bp if bp is given; otherwise it will use
 220  * the block location of the inode record itself.
 221  */
 222 void
 223 xchk_ino_set_preen(
 224         struct xfs_scrub        *sc,
 225         xfs_ino_t               ino)
 226 {
 227         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
 228         trace_xchk_ino_preen(sc, ino, __return_address);
 229 }
 230
 231 /* Record something being wrong with the filesystem primary superblock. */
 232 void
 233 xchk_set_corrupt(
 234         struct xfs_scrub        *sc)
 235 {
 236         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 237         trace_xchk_fs_error(sc, 0, __return_address);
 238 }
 239
 240 /* Record a corrupt block. */
 241 void
 242 xchk_block_set_corrupt(
 243         struct xfs_scrub        *sc,
 244         struct xfs_buf          *bp)
 245 {
 246         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 247         trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
 248 }
 249
 250 #ifdef CONFIG_XFS_QUOTA
 251 /* Record a corrupt quota counter. */
 252 void
 253 xchk_qcheck_set_corrupt(
 254         struct xfs_scrub        *sc,
 255         unsigned int            dqtype,
 256         xfs_dqid_t              id)
 257 {
 258         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 259         trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
 260 }
 261 #endif
 262
 263 /* Record a corruption while cross-referencing. */
 264 void
 265 xchk_block_xref_set_corrupt(
 266         struct xfs_scrub        *sc,
 267         struct xfs_buf          *bp)
 268 {
 269         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
 270         trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
 271 }
 272
 273 /*
 274  * Record a corrupt inode.  The trace data will include the block given
 275  * by bp if bp is given; otherwise it will use the block location of the
 276  * inode record itself.
 277  */
 278 void
 279 xchk_ino_set_corrupt(
 280         struct xfs_scrub        *sc,
 281         xfs_ino_t               ino)
 282 {
 283         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 284         trace_xchk_ino_error(sc, ino, __return_address);
 285 }
 286
 287 /* Record a corruption while cross-referencing with an inode. */
 288 void
 289 xchk_ino_xref_set_corrupt(
 290         struct xfs_scrub        *sc,
 291         xfs_ino_t               ino)
 292 {
 293         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
 294         trace_xchk_ino_error(sc, ino, __return_address);
 295 }
 296
 297 /* Record corruption in a block indexed by a file fork. */
 298 void
 299 xchk_fblock_set_corrupt(
 300         struct xfs_scrub        *sc,
 301         int                     whichfork,
 302         xfs_fileoff_t           offset)
 303 {
 304         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
 305         trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
 306 }
 307
 308 /* Record a corruption while cross-referencing a fork block. */
 309 void
 310 xchk_fblock_xref_set_corrupt(
 311         struct xfs_scrub        *sc,
 312         int                     whichfork,
 313         xfs_fileoff_t           offset)
 314 {
 315         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
 316         trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
 317 }
 318
 319 /*
 320  * Warn about inodes that need administrative review but is not
 321  * incorrect.
 322  */
 323 void
 324 xchk_ino_set_warning(
 325         struct xfs_scrub        *sc,
 326         xfs_ino_t               ino)
 327 {
 328         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
 329         trace_xchk_ino_warning(sc, ino, __return_address);
 330 }
 331
 332 /* Warn about a block indexed by a file fork that needs review. */
 333 void
 334 xchk_fblock_set_warning(
 335         struct xfs_scrub        *sc,
 336         int                     whichfork,
 337         xfs_fileoff_t           offset)
 338 {
 339         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
 340         trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
 341 }
 342
 343 /* Signal an incomplete scrub. */
 344 void
 345 xchk_set_incomplete(
 346         struct xfs_scrub        *sc)
 347 {
 348         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
 349         trace_xchk_incomplete(sc, __return_address);
 350 }
 351
 352 /*
 353  * rmap scrubbing -- compute the number of blocks with a given owner,
 354  * at least according to the reverse mapping data.
 355  */
 356
 357 struct xchk_rmap_ownedby_info {
 358         const struct xfs_owner_info     *oinfo;
 359         xfs_filblks_t                   *blocks;
 360 };
 361
 362 STATIC int
 363 xchk_count_rmap_ownedby_irec(
 364         struct xfs_btree_cur            *cur,
 365         const struct xfs_rmap_irec      *rec,
 366         void                            *priv)
 367 {
 368         struct xchk_rmap_ownedby_info   *sroi = priv;
 369         bool                            irec_attr;
 370         bool                            oinfo_attr;
 371
 372         irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
 373         oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;
 374
 375         if (rec->rm_owner != sroi->oinfo->oi_owner)
 376                 return 0;
 377
 378         if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
 379                 (*sroi->blocks) += rec->rm_blockcount;
 380
 381         return 0;
 382 }
 383
 384 /*
 385  * Calculate the number of blocks the rmap thinks are owned by something.
 386  * The caller should pass us an rmapbt cursor.
 387  */
 388 int
 389 xchk_count_rmap_ownedby_ag(
 390         struct xfs_scrub                *sc,
 391         struct xfs_btree_cur            *cur,
 392         const struct xfs_owner_info     *oinfo,
 393         xfs_filblks_t                   *blocks)
 394 {
 395         struct xchk_rmap_ownedby_info   sroi = {
 396                 .oinfo                  = oinfo,
 397                 .blocks                 = blocks,
 398         };
 399
 400         *blocks = 0;
 401         return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
 402                         &sroi);
 403 }
 404
 405 /*
 406  * AG scrubbing
 407  *
 408  * These helpers facilitate locking an allocation group's header
 409  * buffers, setting up cursors for all btrees that are present, and
 410  * cleaning everything up once we're through.
 411  */
 412
 413 /* Decide if we want to return an AG header read failure. */
 414 static inline bool
 415 want_ag_read_header_failure(
 416         struct xfs_scrub        *sc,
 417         unsigned int            type)
 418 {
 419         /* Return all AG header read failures when scanning btrees. */
 420         if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
 421             sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
 422             sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
 423                 return true;
 424         /*
 425          * If we're scanning a given type of AG header, we only want to
 426          * see read failures from that specific header.  We'd like the
 427          * other headers to cross-check them, but this isn't required.
 428          */
 429         if (sc->sm->sm_type == type)
 430                 return true;
 431         return false;
 432 }
 433
 434 /*
 435  * Grab the AG header buffers for the attached perag structure.
 436  *
 437  * The headers should be released by xchk_ag_free, but as a fail safe we attach
 438  * all the buffers we grab to the scrub transaction so they'll all be freed
 439  * when we cancel it.
 440  */
 441 static inline int
 442 xchk_perag_read_headers(
 443         struct xfs_scrub        *sc,
 444         struct xchk_ag          *sa)
 445 {
 446         int                     error;
 447
 448         error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
 449         if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
 450                 return error;
 451
 452         error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
 453         if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
 454                 return error;
 455
 456         return 0;
 457 }
 458
 459 /*
 460  * Grab the AG headers for the attached perag structure and wait for pending
 461  * intents to drain.
 462  */
 463 int
 464 xchk_perag_drain_and_lock(
 465         struct xfs_scrub        *sc)
 466 {
 467         struct xchk_ag          *sa = &sc->sa;
 468         int                     error = 0;
 469
 470         ASSERT(sa->pag != NULL);
 471         ASSERT(sa->agi_bp == NULL);
 472         ASSERT(sa->agf_bp == NULL);
 473
 474         do {
 475                 if (xchk_should_terminate(sc, &error))
 476                         return error;
 477
 478                 error = xchk_perag_read_headers(sc, sa);
 479                 if (error)
 480                         return error;
 481
 482                 /*
 483                  * If we've grabbed an inode for scrubbing then we assume that
 484                  * holding its ILOCK will suffice to coordinate with any intent
 485                  * chains involving this inode.
 486                  */
 487                 if (sc->ip)
 488                         return 0;
 489
 490                 /*
 491                  * Decide if this AG is quiet enough for all metadata to be
 492                  * consistent with each other.  XFS allows the AG header buffer
 493                  * locks to cycle across transaction rolls while processing
 494                  * chains of deferred ops, which means that there could be
 495                  * other threads in the middle of processing a chain of
 496                  * deferred ops.  For regular operations we are careful about
 497                  * ordering operations to prevent collisions between threads
 498                  * (which is why we don't need a per-AG lock), but scrub and
 499                  * repair have to serialize against chained operations.
 500                  *
 501                  * We just locked all the AG headers buffers; now take a look
 502                  * to see if there are any intents in progress.  If there are,
 503                  * drop the AG headers and wait for the intents to drain.
 504                  * Since we hold all the AG header locks for the duration of
 505                  * the scrub, this is the only time we have to sample the
 506                  * intents counter; any threads increasing it after this point
 507                  * can't possibly be in the middle of a chain of AG metadata
 508                  * updates.
 509                  *
 510                  * Obviously, this should be slanted against scrub and in favor
 511                  * of runtime threads.
 512                  */
 513                 if (!xfs_perag_intent_busy(sa->pag))
 514                         return 0;
 515
 516                 if (sa->agf_bp) {
 517                         xfs_trans_brelse(sc->tp, sa->agf_bp);
 518                         sa->agf_bp = NULL;
 519                 }
 520
 521                 if (sa->agi_bp) {
 522                         xfs_trans_brelse(sc->tp, sa->agi_bp);
 523                         sa->agi_bp = NULL;
 524                 }
 525
 526                 if (!(sc->flags & XCHK_FSGATES_DRAIN))
 527                         return -ECHRNG;
 528                 error = xfs_perag_intent_drain(sa->pag);
 529                 if (error == -ERESTARTSYS)
 530                         error = -EINTR;
 531         } while (!error);
 532
 533         return error;
 534 }
 535
 536 /*
 537  * Grab the per-AG structure, grab all AG header buffers, and wait until there
 538  * aren't any pending intents.  Returns -ENOENT if we can't grab the perag
 539  * structure.
 540  */
 541 int
 542 xchk_ag_read_headers(
 543         struct xfs_scrub        *sc,
 544         xfs_agnumber_t          agno,
 545         struct xchk_ag          *sa)
 546 {
 547         struct xfs_mount        *mp = sc->mp;
 548
 549         ASSERT(!sa->pag);
 550         sa->pag = xfs_perag_get(mp, agno);
 551         if (!sa->pag)
 552                 return -ENOENT;
 553
 554         return xchk_perag_drain_and_lock(sc);
 555 }
 556
 557 /* Release all the AG btree cursors. */
 558 void
 559 xchk_ag_btcur_free(
 560         struct xchk_ag          *sa)
 561 {
 562         if (sa->refc_cur)
 563                 xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
 564         if (sa->rmap_cur)
 565                 xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
 566         if (sa->fino_cur)
 567                 xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
 568         if (sa->ino_cur)
 569                 xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
 570         if (sa->cnt_cur)
 571                 xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
 572         if (sa->bno_cur)
 573                 xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
 574
 575         sa->refc_cur = NULL;
 576         sa->rmap_cur = NULL;
 577         sa->fino_cur = NULL;
 578         sa->ino_cur = NULL;
 579         sa->bno_cur = NULL;
 580         sa->cnt_cur = NULL;
 581 }
 582
 583 /* Initialize all the btree cursors for an AG. */
 584 void
 585 xchk_ag_btcur_init(
 586         struct xfs_scrub        *sc,
 587         struct xchk_ag          *sa)
 588 {
 589         struct xfs_mount        *mp = sc->mp;
 590
 591         if (sa->agf_bp) {
 592                 /* Set up a bnobt cursor for cross-referencing. */
 593                 sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
 594                                 sa->pag);
 595                 xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
 596                                 XFS_SCRUB_TYPE_BNOBT);
 597
 598                 /* Set up a cntbt cursor for cross-referencing. */
 599                 sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
 600                                 sa->pag);
 601                 xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
 602                                 XFS_SCRUB_TYPE_CNTBT);
 603
 604                 /* Set up a rmapbt cursor for cross-referencing. */
 605                 if (xfs_has_rmapbt(mp)) {
 606                         sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
 607                                         sa->agf_bp, sa->pag);
 608                         xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
 609                                         XFS_SCRUB_TYPE_RMAPBT);
 610                 }
 611
 612                 /* Set up a refcountbt cursor for cross-referencing. */
 613                 if (xfs_has_reflink(mp)) {
 614                         sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
 615                                         sa->agf_bp, sa->pag);
 616                         xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
 617                                         XFS_SCRUB_TYPE_REFCNTBT);
 618                 }
 619         }
 620
 621         if (sa->agi_bp) {
 622                 /* Set up a inobt cursor for cross-referencing. */
 623                 sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
 624                                 sa->agi_bp);
 625                 xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
 626                                 XFS_SCRUB_TYPE_INOBT);
 627
 628                 /* Set up a finobt cursor for cross-referencing. */
 629                 if (xfs_has_finobt(mp)) {
 630                         sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
 631                                         sa->agi_bp);
 632                         xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
 633                                         XFS_SCRUB_TYPE_FINOBT);
 634                 }
 635         }
 636 }
 637
 638 /* Release the AG header context and btree cursors. */
 639 void
 640 xchk_ag_free(
 641         struct xfs_scrub        *sc,
 642         struct xchk_ag          *sa)
 643 {
 644         xchk_ag_btcur_free(sa);
 645         xrep_reset_perag_resv(sc);
 646         if (sa->agf_bp) {
 647                 xfs_trans_brelse(sc->tp, sa->agf_bp);
 648                 sa->agf_bp = NULL;
 649         }
 650         if (sa->agi_bp) {
 651                 xfs_trans_brelse(sc->tp, sa->agi_bp);
 652                 sa->agi_bp = NULL;
 653         }
 654         if (sa->pag) {
 655                 xfs_perag_put(sa->pag);
 656                 sa->pag = NULL;
 657         }
 658 }
 659
 660 /*
 661  * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
 662  * order.  Locking order requires us to get the AGI before the AGF.  We use the
 663  * transaction to avoid deadlocking on crosslinked metadata buffers; either the
 664  * caller passes one in (bmap scrub) or we have to create a transaction
 665  * ourselves.  Returns ENOENT if the perag struct cannot be grabbed.
 666  */
 667 int
 668 xchk_ag_init(
 669         struct xfs_scrub        *sc,
 670         xfs_agnumber_t          agno,
 671         struct xchk_ag          *sa)
 672 {
 673         int                     error;
 674
 675         error = xchk_ag_read_headers(sc, agno, sa);
 676         if (error)
 677                 return error;
 678
 679         xchk_ag_btcur_init(sc, sa);
 680         return 0;
 681 }
 682
 683 /* Per-scrubber setup functions */
 684
 685 void
 686 xchk_trans_cancel(
 687         struct xfs_scrub        *sc)
 688 {
 689         xfs_trans_cancel(sc->tp);
 690         sc->tp = NULL;
 691 }
 692
 693 int
 694 xchk_trans_alloc_empty(
 695         struct xfs_scrub        *sc)
 696 {
 697         return xfs_trans_alloc_empty(sc->mp, &sc->tp);
 698 }
 699
 700 /*
 701  * Grab an empty transaction so that we can re-grab locked buffers if
 702  * one of our btrees turns out to be cyclic.
 703  *
 704  * If we're going to repair something, we need to ask for the largest possible
 705  * log reservation so that we can handle the worst case scenario for metadata
 706  * updates while rebuilding a metadata item.  We also need to reserve as many
 707  * blocks in the head transaction as we think we're going to need to rebuild
 708  * the metadata object.
 709  */
 710 int
 711 xchk_trans_alloc(
 712         struct xfs_scrub        *sc,
 713         uint                    resblks)
 714 {
 715         if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
 716                 return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
 717                                 resblks, 0, 0, &sc->tp);
 718
 719         return xchk_trans_alloc_empty(sc);
 720 }
 721
 722 /* Set us up with a transaction and an empty context. */
 723 int
 724 xchk_setup_fs(
 725         struct xfs_scrub        *sc)
 726 {
 727         uint                    resblks;
 728
 729         resblks = xrep_calc_ag_resblks(sc);
 730         return xchk_trans_alloc(sc, resblks);
 731 }
 732
 733 /* Set us up with AG headers and btree cursors. */
 734 int
 735 xchk_setup_ag_btree(
 736         struct xfs_scrub        *sc,
 737         bool                    force_log)
 738 {
 739         struct xfs_mount        *mp = sc->mp;
 740         int                     error;
 741
 742         /*
 743          * If the caller asks us to checkpont the log, do so.  This
 744          * expensive operation should be performed infrequently and only
 745          * as a last resort.  Any caller that sets force_log should
 746          * document why they need to do so.
 747          */
 748         if (force_log) {
 749                 error = xchk_checkpoint_log(mp);
 750                 if (error)
 751                         return error;
 752         }
 753
 754         error = xchk_setup_fs(sc);
 755         if (error)
 756                 return error;
 757
 758         return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
 759 }
 760
 761 /* Push everything out of the log onto disk. */
 762 int
 763 xchk_checkpoint_log(
 764         struct xfs_mount        *mp)
 765 {
 766         int                     error;
 767
 768         error = xfs_log_force(mp, XFS_LOG_SYNC);
 769         if (error)
 770                 return error;
 771         xfs_ail_push_all_sync(mp->m_ail);
 772         return 0;
 773 }
 774
 775 /* Verify that an inode is allocated ondisk, then return its cached inode. */
 776 int
 777 xchk_iget(
 778         struct xfs_scrub        *sc,
 779         xfs_ino_t               inum,
 780         struct xfs_inode        **ipp)
 781 {
 782         ASSERT(sc->tp != NULL);
 783
 784         return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
 785 }
 786
 787 /*
 788  * Try to grab an inode in a manner that avoids races with physical inode
 789  * allocation.  If we can't, return the locked AGI buffer so that the caller
 790  * can single-step the loading process to see where things went wrong.
 791  * Callers must have a valid scrub transaction.
 792  *
 793  * If the iget succeeds, return 0, a NULL AGI, and the inode.
 794  *
 795  * If the iget fails, return the error, the locked AGI, and a NULL inode.  This
 796  * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
 797  * no longer allocated; or any other corruption or runtime error.
 798  *
 799  * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
 800  *
 801  * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
 802  */
 803 int
 804 xchk_iget_agi(
 805         struct xfs_scrub        *sc,
 806         xfs_ino_t               inum,
 807         struct xfs_buf          **agi_bpp,
 808         struct xfs_inode        **ipp)
 809 {
 810         struct xfs_mount        *mp = sc->mp;
 811         struct xfs_trans        *tp = sc->tp;
 812         struct xfs_perag        *pag;
 813         int                     error;
 814
 815         ASSERT(sc->tp != NULL);
 816
 817 again:
 818         *agi_bpp = NULL;
 819         *ipp = NULL;
 820         error = 0;
 821
 822         if (xchk_should_terminate(sc, &error))
 823                 return error;
 824
 825         /*
 826          * Attach the AGI buffer to the scrub transaction to avoid deadlocks
 827          * in the iget cache miss path.
 828          */
 829         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
 830         error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
 831         xfs_perag_put(pag);
 832         if (error)
 833                 return error;
 834
 835         error = xfs_iget(mp, tp, inum,
 836                         XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
 837         if (error == -EAGAIN) {
 838                 /*
 839                  * The inode may be in core but temporarily unavailable and may
 840                  * require the AGI buffer before it can be returned.  Drop the
 841                  * AGI buffer and retry the lookup.
 842                  *
 843                  * Incore lookup will fail with EAGAIN on a cache hit if the
 844                  * inode is queued to the inactivation list.  The inactivation
 845                  * worker may remove the inode from the unlinked list and hence
 846                  * needs the AGI.
 847                  *
 848                  * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
 849                  * to allow inodegc to make progress and move the inode to
 850                  * IRECLAIMABLE state where xfs_iget will be able to return it
 851                  * again if it can lock the inode.
 852                  */
 853                 xfs_trans_brelse(tp, *agi_bpp);
 854                 delay(1);
 855                 goto again;
 856         }
 857         if (error)
 858                 return error;
 859
 860         /* We got the inode, so we can release the AGI. */
 861         ASSERT(*ipp != NULL);
 862         xfs_trans_brelse(tp, *agi_bpp);
 863         *agi_bpp = NULL;
 864         return 0;
 865 }
 866
 867 #ifdef CONFIG_XFS_QUOTA
 868 /*
 869  * Try to attach dquots to this inode if we think we might want to repair it.
 870  * Callers must not hold any ILOCKs.  If the dquots are broken and cannot be
 871  * attached, a quotacheck will be scheduled.
 872  */
 873 int
 874 xchk_ino_dqattach(
 875         struct xfs_scrub        *sc)
 876 {
 877         ASSERT(sc->tp != NULL);
 878         ASSERT(sc->ip != NULL);
 879
 880         if (!xchk_could_repair(sc))
 881                 return 0;
 882
 883         return xrep_ino_dqattach(sc);
 884 }
 885 #endif
 886
 887 /* Install an inode that we opened by handle for scrubbing. */
 888 int
 889 xchk_install_handle_inode(
 890         struct xfs_scrub        *sc,
 891         struct xfs_inode        *ip)
 892 {
 893         if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
 894                 xchk_irele(sc, ip);
 895                 return -ENOENT;
 896         }
 897
 898         sc->ip = ip;
 899         return 0;
 900 }
 901
 902 /*
 903  * Install an already-referenced inode for scrubbing.  Get our own reference to
 904  * the inode to make disposal simpler.  The inode must not be in I_FREEING or
 905  * I_WILL_FREE state!
 906  */
 907 int
 908 xchk_install_live_inode(
 909         struct xfs_scrub        *sc,
 910         struct xfs_inode        *ip)
 911 {
 912         if (!igrab(VFS_I(ip))) {
 913                 xchk_ino_set_corrupt(sc, ip->i_ino);
 914                 return -EFSCORRUPTED;
 915         }
 916
 917         sc->ip = ip;
 918         return 0;
 919 }
 920
 921 /*
 922  * In preparation to scrub metadata structures that hang off of an inode,
 923  * grab either the inode referenced in the scrub control structure or the
 924  * inode passed in.  If the inumber does not reference an allocated inode
 925  * record, the function returns ENOENT to end the scrub early.  The inode
 926  * is not locked.
 927  */
 928 int
 929 xchk_iget_for_scrubbing(
 930         struct xfs_scrub        *sc)
 931 {
 932         struct xfs_imap         imap;
 933         struct xfs_mount        *mp = sc->mp;
 934         struct xfs_perag        *pag;
 935         struct xfs_buf          *agi_bp;
 936         struct xfs_inode        *ip_in = XFS_I(file_inode(sc->file));
 937         struct xfs_inode        *ip = NULL;
 938         xfs_agnumber_t          agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
 939         int                     error;
 940
 941         ASSERT(sc->tp == NULL);
 942
 943         /* We want to scan the inode we already had opened. */
 944         if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
 945                 return xchk_install_live_inode(sc, ip_in);
 946
 947         /* Reject internal metadata files and obviously bad inode numbers. */
 948         if (xfs_internal_inum(mp, sc->sm->sm_ino))
 949                 return -ENOENT;
 950         if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
 951                 return -ENOENT;
 952
 953         /* Try a safe untrusted iget. */
 954         error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
 955         if (!error)
 956                 return xchk_install_handle_inode(sc, ip);
 957         if (error == -ENOENT)
 958                 return error;
 959         if (error != -EINVAL)
 960                 goto out_error;
 961
 962         /*
 963          * EINVAL with IGET_UNTRUSTED probably means one of several things:
 964          * userspace gave us an inode number that doesn't correspond to fs
 965          * space; the inode btree lacks a record for this inode; or there is a
 966          * record, and it says this inode is free.
 967          *
 968          * We want to look up this inode in the inobt to distinguish two
 969          * scenarios: (1) the inobt says the inode is free, in which case
 970          * there's nothing to do; and (2) the inobt says the inode is
 971          * allocated, but loading it failed due to corruption.
 972          *
 973          * Allocate a transaction and grab the AGI to prevent inobt activity
 974          * in this AG.  Retry the iget in case someone allocated a new inode
 975          * after the first iget failed.
 976          */
 977         error = xchk_trans_alloc(sc, 0);
 978         if (error)
 979                 goto out_error;
 980
 981         error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
 982         if (error == 0) {
 983                 /* Actually got the inode, so install it. */
 984                 xchk_trans_cancel(sc);
 985                 return xchk_install_handle_inode(sc, ip);
 986         }
 987         if (error == -ENOENT)
 988                 goto out_gone;
 989         if (error != -EINVAL)
 990                 goto out_cancel;
 991
 992         /* Ensure that we have protected against inode allocation/freeing. */
 993         if (agi_bp == NULL) {
 994                 ASSERT(agi_bp != NULL);
 995                 error = -ECANCELED;
 996                 goto out_cancel;
 997         }
 998
 999         /*
1000          * Untrusted iget failed a second time.  Let's try an inobt lookup.
1001          * If the inobt thinks this the inode neither can exist inside the
1002          * filesystem nor is allocated, return ENOENT to signal that the check
1003          * can be skipped.
1004          *
1005          * If the lookup returns corruption, we'll mark this inode corrupt and
1006          * exit to userspace.  There's little chance of fixing anything until
1007          * the inobt is straightened out, but there's nothing we can do here.
1008          *
1009          * If the lookup encounters any other error, exit to userspace.
1010          *
1011          * If the lookup succeeds, something else must be very wrong in the fs
1012          * such that setting up the incore inode failed in some strange way.
1013          * Treat those as corruptions.
1014          */
1015         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
1016         if (!pag) {
1017                 error = -EFSCORRUPTED;
1018                 goto out_cancel;
1019         }
1020
1021         error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
1022                         XFS_IGET_UNTRUSTED);
1023         xfs_perag_put(pag);
1024         if (error == -EINVAL || error == -ENOENT)
1025                 goto out_gone;
1026         if (!error)
1027                 error = -EFSCORRUPTED;
1028
1029 out_cancel:
1030         xchk_trans_cancel(sc);
1031 out_error:
1032         trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
1033                         error, __return_address);
1034         return error;
1035 out_gone:
1036         /* The file is gone, so there's nothing to check. */
1037         xchk_trans_cancel(sc);
1038         return -ENOENT;
1039 }
1040
1041 /* Release an inode, possibly dropping it in the process. */
1042 void
1043 xchk_irele(
1044         struct xfs_scrub        *sc,
1045         struct xfs_inode        *ip)
1046 {
1047         if (sc->tp) {
1048                 /*
1049                  * If we are in a transaction, we /cannot/ drop the inode
1050                  * ourselves, because the VFS will trigger writeback, which
1051                  * can require a transaction.  Clear DONTCACHE to force the
1052                  * inode to the LRU, where someone else can take care of
1053                  * dropping it.
1054                  *
1055                  * Note that when we grabbed our reference to the inode, it
1056                  * could have had an active ref and DONTCACHE set if a sysadmin
1057                  * is trying to coerce a change in file access mode.  icache
1058                  * hits do not clear DONTCACHE, so we must do it here.
1059                  */
1060                 spin_lock(&VFS_I(ip)->i_lock);
1061                 VFS_I(ip)->i_state &= ~I_DONTCACHE;
1062                 spin_unlock(&VFS_I(ip)->i_lock);
1063         } else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
1064                 /*
1065                  * If this is the last reference to the inode and the caller
1066                  * permits it, set DONTCACHE to avoid thrashing.
1067                  */
1068                 d_mark_dontcache(VFS_I(ip));
1069         }
1070
1071         xfs_irele(ip);
1072 }
1073
1074 /*
1075  * Set us up to scrub metadata mapped by a file's fork.  Callers must not use
1076  * this to operate on user-accessible regular file data because the MMAPLOCK is
1077  * not taken.
1078  */
1079 int
1080 xchk_setup_inode_contents(
1081         struct xfs_scrub        *sc,
1082         unsigned int            resblks)
1083 {
1084         int                     error;
1085
1086         error = xchk_iget_for_scrubbing(sc);
1087         if (error)
1088                 return error;
1089
1090         /* Lock the inode so the VFS cannot touch this file. */
1091         xchk_ilock(sc, XFS_IOLOCK_EXCL);
1092
1093         error = xchk_trans_alloc(sc, resblks);
1094         if (error)
1095                 goto out;
1096
1097         error = xchk_ino_dqattach(sc);
1098         if (error)
1099                 goto out;
1100
1101         xchk_ilock(sc, XFS_ILOCK_EXCL);
1102 out:
1103         /* scrub teardown will unlock and release the inode for us */
1104         return error;
1105 }
1106
1107 void
1108 xchk_ilock(
1109         struct xfs_scrub        *sc,
1110         unsigned int            ilock_flags)
1111 {
1112         xfs_ilock(sc->ip, ilock_flags);
1113         sc->ilock_flags |= ilock_flags;
1114 }
1115
1116 bool
1117 xchk_ilock_nowait(
1118         struct xfs_scrub        *sc,
1119         unsigned int            ilock_flags)
1120 {
1121         if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
1122                 sc->ilock_flags |= ilock_flags;
1123                 return true;
1124         }
1125
1126         return false;
1127 }
1128
1129 void
1130 xchk_iunlock(
1131         struct xfs_scrub        *sc,
1132         unsigned int            ilock_flags)
1133 {
1134         sc->ilock_flags &= ~ilock_flags;
1135         xfs_iunlock(sc->ip, ilock_flags);
1136 }
1137
1138 /*
1139  * Predicate that decides if we need to evaluate the cross-reference check.
1140  * If there was an error accessing the cross-reference btree, just delete
1141  * the cursor and skip the check.
1142  */
1143 bool
1144 xchk_should_check_xref(
1145         struct xfs_scrub        *sc,
1146         int                     *error,
1147         struct xfs_btree_cur    **curpp)
1148 {
1149         /* No point in xref if we already know we're corrupt. */
1150         if (xchk_skip_xref(sc->sm))
1151                 return false;
1152
1153         if (*error == 0)
1154                 return true;
1155
1156         if (curpp) {
1157                 /* If we've already given up on xref, just bail out. */
1158                 if (!*curpp)
1159                         return false;
1160
1161                 /* xref error, delete cursor and bail out. */
1162                 xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
1163                 *curpp = NULL;
1164         }
1165
1166         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
1167         trace_xchk_xref_error(sc, *error, __return_address);
1168
1169         /*
1170          * Errors encountered during cross-referencing with another
1171          * data structure should not cause this scrubber to abort.
1172          */
1173         *error = 0;
1174         return false;
1175 }
1176
1177 /* Run the structure verifiers on in-memory buffers to detect bad memory. */
1178 void
1179 xchk_buffer_recheck(
1180         struct xfs_scrub        *sc,
1181         struct xfs_buf          *bp)
1182 {
1183         xfs_failaddr_t          fa;
1184
1185         if (bp->b_ops == NULL) {
1186                 xchk_block_set_corrupt(sc, bp);
1187                 return;
1188         }
1189         if (bp->b_ops->verify_struct == NULL) {
1190                 xchk_set_incomplete(sc);
1191                 return;
1192         }
1193         fa = bp->b_ops->verify_struct(bp);
1194         if (!fa)
1195                 return;
1196         sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
1197         trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
1198 }
1199
1200 static inline int
1201 xchk_metadata_inode_subtype(
1202         struct xfs_scrub        *sc,
1203         unsigned int            scrub_type)
1204 {
1205         __u32                   smtype = sc->sm->sm_type;
1206         unsigned int            sick_mask = sc->sick_mask;
1207         int                     error;
1208
1209         sc->sm->sm_type = scrub_type;
1210
1211         switch (scrub_type) {
1212         case XFS_SCRUB_TYPE_INODE:
1213                 error = xchk_inode(sc);
1214                 break;
1215         case XFS_SCRUB_TYPE_BMBTD:
1216                 error = xchk_bmap_data(sc);
1217                 break;
1218         default:
1219                 ASSERT(0);
1220                 error = -EFSCORRUPTED;
1221                 break;
1222         }
1223
1224         sc->sick_mask = sick_mask;
1225         sc->sm->sm_type = smtype;
1226         return error;
1227 }
1228
1229 /*
1230  * Scrub the attr/data forks of a metadata inode.  The metadata inode must be
1231  * pointed to by sc->ip and the ILOCK must be held.
1232  */
1233 int
1234 xchk_metadata_inode_forks(
1235         struct xfs_scrub        *sc)
1236 {
1237         bool                    shared;
1238         int                     error;
1239
1240         if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
1241                 return 0;
1242
1243         /* Check the inode record. */
1244         error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
1245         if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1246                 return error;
1247
1248         /* Metadata inodes don't live on the rt device. */
1249         if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
1250                 xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1251                 return 0;
1252         }
1253
1254         /* They should never participate in reflink. */
1255         if (xfs_is_reflink_inode(sc->ip)) {
1256                 xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1257                 return 0;
1258         }
1259
1260         /* They also should never have extended attributes. */
1261         if (xfs_inode_hasattr(sc->ip)) {
1262                 xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1263                 return 0;
1264         }
1265
1266         /* Invoke the data fork scrubber. */
1267         error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
1268         if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1269                 return error;
1270
1271         /* Look for incorrect shared blocks. */
1272         if (xfs_has_reflink(sc->mp)) {
1273                 error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
1274                                 &shared);
1275                 if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
1276                                 &error))
1277                         return error;
1278                 if (shared)
1279                         xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1280         }
1281
1282         return 0;
1283 }
1284
1285 /*
1286  * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
1287  * operation.  Callers must not hold any locks that intersect with the CPU
1288  * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
1289  * to change kernel code.
1290  */
1291 void
1292 xchk_fsgates_enable(
1293         struct xfs_scrub        *sc,
1294         unsigned int            scrub_fsgates)
1295 {
1296         ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
1297         ASSERT(!(sc->flags & scrub_fsgates));
1298
1299         trace_xchk_fsgates_enable(sc, scrub_fsgates);
1300
1301         if (scrub_fsgates & XCHK_FSGATES_DRAIN)
1302                 xfs_drain_wait_enable();
1303
1304         if (scrub_fsgates & XCHK_FSGATES_QUOTA)
1305                 xfs_dqtrx_hook_enable();
1306
1307         if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
1308                 xfs_dir_hook_enable();
1309
1310         if (scrub_fsgates & XCHK_FSGATES_RMAP)
1311                 xfs_rmap_hook_enable();
1312
1313         sc->flags |= scrub_fsgates;
1314 }
1315
/*
 * Decide if this is a cached inode that's also allocated.  The caller
 * must hold a reference to an AG and the AGI buffer lock to prevent inodes
 * from being allocated or freed.
 *
 * Look up an inode by number in the given file system.  If the inode number
 * is invalid, return -EINVAL.  If the inode is not in cache, return -ENODATA.
 * If the inode is being reclaimed, return -ENODATA because we know the inode
 * cache cannot be updating the ondisk metadata.
 *
 * Otherwise, the incore inode is the one we want, and it is either live,
 * somewhere in the inactivation machinery, or reclaimable.  The inode is
 * allocated if i_mode is nonzero.  In all three cases, the cached inode will
 * be more up to date than the ondisk inode buffer, so we must use the incore
 * i_mode.
 *
 * @sc:    scrub context; sc->sa.pag and sc->sa.agi_bp must both be set.
 * @agino: AG-relative inode number to look up in sc->sa.pag.
 * @inuse: set to true if the incore i_mode is nonzero, false otherwise.
 *         Only written when the function returns 0.
 *
 * Returns 0 if *inuse was set; -EINVAL if the perag or AGI buffer is missing
 * or the inode number fails verification; -ENODATA if no matching incore
 * inode was found.
 *
 * NOTE(review): there is no explicit reclaim-state test in this function;
 * presumably a reclaimed inode is caught by the i_ino comparison below --
 * confirm against the inode reclaim code.
 */
int
xchk_inode_is_allocated(
        struct xfs_scrub        *sc,
        xfs_agino_t             agino,
        bool                    *inuse)
{
        struct xfs_mount        *mp = sc->mp;
        struct xfs_perag        *pag = sc->sa.pag;
        xfs_ino_t               ino;
        struct xfs_inode        *ip;
        int                     error;

        /* caller must hold perag reference */
        if (pag == NULL) {
                ASSERT(pag != NULL);
                return -EINVAL;
        }

        /* caller must have AGI buffer */
        if (sc->sa.agi_bp == NULL) {
                ASSERT(sc->sa.agi_bp != NULL);
                return -EINVAL;
        }

        /* reject inode numbers outside existing AGs */
        ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
        if (!xfs_verify_ino(mp, ino))
                return -EINVAL;

        /* Assume "no usable incore inode" until the checks below succeed. */
        error = -ENODATA;
        rcu_read_lock();
        ip = radix_tree_lookup(&pag->pag_ici_root, agino);
        if (!ip) {
                /* cache miss */
                goto out_rcu;
        }

        /*
         * If the inode number doesn't match, the incore inode got reused
         * during an RCU grace period and the radix tree hasn't been updated.
         * This isn't the inode we want.
         */
        spin_lock(&ip->i_flags_lock);
        if (ip->i_ino != ino)
                goto out_skip;

        trace_xchk_inode_is_allocated(ip);

        /*
         * We have an incore inode that matches the inode we want, and the
         * caller holds the perag structure and the AGI buffer.  Let's check
         * our assumptions below:
         */

#ifdef DEBUG
        /*
         * (1) If the incore inode is live (i.e. referenced from the dcache),
         * it will not be INEW, nor will it be in the inactivation or reclaim
         * machinery.  The ondisk inode had better be allocated.  This is the
         * most trivial case.
         */
        if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
                             XFS_INACTIVATING))) {
                /* live inode */
                ASSERT(VFS_I(ip)->i_mode != 0);
        }

        /*
         * If the incore inode is INEW, there are several possibilities:
         *
         * (2) For a file that is being created, note that we allocate the
         * ondisk inode before allocating, initializing, and adding the incore
         * inode to the radix tree.
         *
         * (3) If the incore inode is being recycled, the inode has to be
         * allocated because we don't allow freed inodes to be recycled.
         * Recycling doesn't touch i_mode.
         */
        if (ip->i_flags & XFS_INEW) {
                /* created on disk already or recycling */
                ASSERT(VFS_I(ip)->i_mode != 0);
        }

        /*
         * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
         * inactivation has not started (!INACTIVATING), it is still allocated.
         */
        if ((ip->i_flags & XFS_NEED_INACTIVE) &&
            !(ip->i_flags & XFS_INACTIVATING)) {
                /* definitely before difree */
                ASSERT(VFS_I(ip)->i_mode != 0);
        }
#endif

        /*
         * If the incore inode is undergoing inactivation (INACTIVATING), there
         * are two possibilities:
         *
         * (5) It is before the point where it would get freed ondisk, in which
         * case i_mode is still nonzero.
         *
         * (6) It has already been freed, in which case i_mode is zero.
         *
         * We don't take the ILOCK here, but difree and dialloc update the AGI,
         * and we've taken the AGI buffer lock, which prevents that from
         * happening.
         */

        /*
         * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
         * reclaim (IRECLAIMABLE) could be allocated or free.  i_mode still
         * reflects the ondisk state.
         */

        /*
         * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
         * the flush code uses i_mode to format the ondisk inode.
         */

        /*
         * (9) If the inode is in IRECLAIM and was reachable via the radix
         * tree, it still has the same i_mode as it did before it entered
         * reclaim.  The inode object is still alive because we hold the RCU
         * read lock.
         */

        /* In every case above, a nonzero incore i_mode means "allocated". */
        *inuse = VFS_I(ip)->i_mode != 0;
        error = 0;

out_skip:
        spin_unlock(&ip->i_flags_lock);
out_rcu:
        rcu_read_unlock();
        return error;
}